def testPPOSampleWaste(self):
    ray.init(num_cpus=4, object_store_memory=1000 * 1024 * 1024)

    # Check we at least collect the initial wave of samples
    ppo = PPOTrainer(
        env="CartPole-v0",
        config={
            "sample_batch_size": 200,
            "train_batch_size": 128,
            "num_workers": 3,
        })
    ppo.train()
    self.assertEqual(ppo.optimizer.num_steps_sampled, 600)
    ppo.stop()

    # Check we collect at least the specified amount of samples
    ppo = PPOTrainer(
        env="CartPole-v0",
        config={
            "sample_batch_size": 200,
            "train_batch_size": 900,
            "num_workers": 3,
        })
    ppo.train()
    self.assertEqual(ppo.optimizer.num_steps_sampled, 1000)
    ppo.stop()

    # Check in vectorized mode
    ppo = PPOTrainer(
        env="CartPole-v0",
        config={
            "sample_batch_size": 200,
            "num_envs_per_worker": 2,
            "train_batch_size": 900,
            "num_workers": 3,
        })
    ppo.train()
    self.assertEqual(ppo.optimizer.num_steps_sampled, 1200)
    ppo.stop()
iteration = 22
improved = 0
while True:
    trainer = PPOTrainer(env="fire_mage", config=rnn_config)
    print(dir(trainer))
    # trainer.restore('./checkpoints_flush/checkpoint_379/checkpoint-379')
    step = 0
    best_val = 0.0
    if False:
        save_0 = trainer.save_to_object()
    while True:
        if False:
            save_0 = trainer.save_to_object()
            result = trainer.train()
            while result['episode_reward_mean'] > best_val:
                print('UPENING')
                best_save = deepcopy(save_0)
                best_val = result['episode_reward_mean']
                save_0 = trainer.save_to_object()
                trainer.save('./checkpoints_flush')
                result = trainer.train()
            print('REVERTING')
            trainer.restore_from_object(best_save)
        else:
            result = trainer.train()
            if result['episode_reward_mean'] > best_val:
                improved = step
                best_val = result['episode_reward_mean']
                trainer.save('./checkpoints_iter_' + str(iteration))
        # Track how many training iterations have run so `improved` is meaningful.
        step += 1
# You should see both the printed X and Y approach 200 as this trains:
# info:
#   policy_reward_mean:
#     dqn_policy: X
#     ppo_policy: Y
for i in range(args.stop_iters):
    print("== Iteration", i, "==")

    # Improve the DQN policy.
    print("-- DQN --")
    result_dqn = dqn_trainer.train()
    print(pretty_print(result_dqn))

    # Improve the PPO policy.
    print("-- PPO --")
    result_ppo = ppo_trainer.train()
    print(pretty_print(result_ppo))

    # Test passed gracefully.
    if args.as_test and \
            result_dqn["episode_reward_mean"] > args.stop_reward and \
            result_ppo["episode_reward_mean"] > args.stop_reward:
        print("test passed (both agents above requested reward)")
        quit(0)

    # Swap weights to synchronize the two trainers.
    dqn_trainer.set_weights(ppo_trainer.get_weights(["ppo_policy"]))
    ppo_trainer.set_weights(dqn_trainer.get_weights(["dqn_policy"]))

# Desired reward not reached.
if args.as_test:
    raise ValueError("Desired reward not reached!")
        # Note that we do not immediately return value, but rather save it for `value_function`
        model_out, self._value = self.base_model(input_dict["obs"])
        # l = np.array([last_r])
        # if l.shape == (1,):
        #     l = l.reshape((1, 1))
        return model_out, state

    def value_function(self):
        return self._value


ModelCatalog.register_custom_model("image-ppo", RLLibPPOCritic)

ray.init()
trainer = PPOTrainer(
    env="CartPole-v0",
    config={
        "framework": "torch",
        "model": {
            "custom_model": "image-ppo",
        },
    })

plot = plotter.Plotter('ppo_cartpole')
for epoch in range(10):
    results = trainer.train()
    plot.add_results(results)
plot.plot(title='PPO CartPole-v0')
trainer_config = DEFAULT_CONFIG.copy()
trainer_config['num_workers'] = 1
trainer_config["train_batch_size"] = 400
trainer_config["sgd_minibatch_size"] = 64
trainer_config["num_sgd_iter"] = 10

trainer = PPOTrainer(trainer_config, SIR)
for i in range(200):
    print("Training iteration {}...".format(i))
    trainer.train()

# Roll out one episode with the trained policy.
env = SIR()
state = env.reset()
done = False
# max_state = -1
cumulative_reward = 0
total_states = list()
while not done:
    action = trainer.compute_action(state)
    state, reward, done, results = env.step(action)
    # max_state = max(max_state, state)
    total_states.append(state)
    cumulative_reward += reward
print("Cumulative reward:", cumulative_reward)
def test_local(self):
    cf = DEFAULT_CONFIG.copy()
    for _ in framework_iterator(cf):
        agent = PPOTrainer(cf, "CartPole-v0")
        print(agent.train())
        agent.stop()
def testLocal(self):
    ray.init(local_mode=True)
    cf = DEFAULT_CONFIG.copy()
    agent = PPOTrainer(cf, "CartPole-v0")
    print(agent.train())
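# A minimal teardown sketch (assumption: this method lives in a standard
# unittest.TestCase alongside testLocal above, which calls ray.init()):
# shutting Ray down between tests avoids re-init conflicts on the next test.
def tearDown(self):
    ray.shutdown()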
number_of_stochastic_moves = 5
logdir = str(trainer_obj._logdir)
additional_metrics = {"additional_metrics": {}}
file_writer = tf.summary.create_file_writer(logdir)
file_writer.set_as_default()

for epoch in tqdm(range(best_ckpt + 1, epochs)):
    print("Epoch " + str(epoch))
    # When we call train() we update the weights, but the win_rate still refers
    # to the previous weights that were used to collect the rollouts.
    # If the weights were changed in place, prev_weights would also change,
    # since it is just a reference (tested: prev_weights doesn't change).
    prev_weights = trainer_obj.get_policy("player1").get_weights()
    results = trainer_obj.train()

    # Check whether the weight arrays differ:
    # updated_weights = trainer_obj.get_policy("player1").get_weights()
    # print("there are " + str(len(prev_weights)) + " weights")
    # indx = 0
    # equal_weights = []
    # for w1, w2 in zip(prev_weights, updated_weights):
    #     if np.array_equal(w1, w2):
    #         equal_weights.append(indx)
    #     indx += 1
    # print(equal_weights)
    # input("Press Enter...")

    player1_win_rate = results["custom_metrics"]["player1_win_rate"]
    # Instead of score_diff we use the win_ratio.
def test_basic(self):
    ppo = PPOTrainer(
        env="CartPole-v0",
        config={"lr_schedule": [[0, 1e-5], [1000, 0.0]]})
    for _ in range(10):
        result = ppo.train()
        assert result["episode_reward_mean"] < 100, "should not have learned"
"interaction_hidden_size": 4, }, }, "clip_actions": True, "framework": "torch", "num_sgd_iter": 3, "lr": 1e-4, #"kl_target": 0.03, "no_done_at_end": False, "soft_horizon": True, "train_batch_size": 100, "rollout_fragment_length": 100, "sgd_minibatch_size": 32 } trainer = PPOTrainer(env="negotiate_roborobo", config=config) print(trainer.config.get('no_final_linear')) print('model built') stop_iter = 2000 #%% import numpy as np for i in range(stop_iter): print("== Iteration", i, "==") result_ppo = trainer.train() if (i + 1) % 1 == 0: trainer.save('model_nego') trainer.save('model_nego') del trainerii ray.shutdown()
import ray
from ray.rllib.agents.ppo import PPOTrainer, DEFAULT_CONFIG
from ray.tune.logger import pretty_print

# ray.shutdown()
ray.init(num_cpus=4, ignore_reinit_error=True, log_to_driver=False)

config = DEFAULT_CONFIG.copy()
config['num_workers'] = 1
config['num_sgd_iter'] = 30
config['sgd_minibatch_size'] = 128
config['model']['fcnet_hiddens'] = [100, 100]
# This avoids running out of resources in the notebook environment when this cell is re-executed.
config['num_cpus_per_worker'] = 0

agent1 = PPOTrainer(config, 'CartPole-v0')
for i in range(2):
    result = agent1.train()
    print(pretty_print(result))

config2 = DEFAULT_CONFIG.copy()
config2['num_workers'] = 4
config2['num_sgd_iter'] = 30
config2['sgd_minibatch_size'] = 128
config2['model']['fcnet_hiddens'] = [100, 100]
config2['num_cpus_per_worker'] = 0

agent2 = PPOTrainer(config2, 'CartPole-v0')
for i in range(2):
    result = agent2.train()
    print(pretty_print(result))

checkpoint_path = agent2.save()
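# A follow-on sketch (assumptions: gym is installed and the classic
# Trainer.restore()/compute_action() APIs used elsewhere in these snippets are
# available): rebuild a trainer with the same config, restore the checkpoint
# saved above, and roll out a single CartPole episode.
import gym

restored_agent = PPOTrainer(config2, 'CartPole-v0')
restored_agent.restore(checkpoint_path)

eval_env = gym.make('CartPole-v0')
obs = eval_env.reset()
done = False
episode_reward = 0.0
while not done:
    action = restored_agent.compute_action(obs)
    obs, reward, done, info = eval_env.step(action)
    episode_reward += reward
print('episode reward after restore:', episode_reward)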
def execute(self):
    timesteps = 0
    best_period_value = None

    if self.pr.agent.name() == "A2C":
        trainer = A2CTrainer(config=self.rllib_config,
                             logger_creator=rllib_logger_creator)
    elif self.pr.agent.name() == "PPO":
        trainer = PPOTrainer(config=self.rllib_config,
                             logger_creator=rllib_logger_creator)
        # import pdb; pdb.set_trace()
    else:
        raise ValueError('There is no rllib trainer with name ' +
                         self.pr.agent.name())

    tf_writer = SummaryWriter(
        self.pr.save_logs_to) if self.pr.save_logs_to else None

    reward_metric = Metric(short_name='rews',
                           long_name='trajectory reward',
                           formatting_string='{:5.1f}',
                           higher_is_better=True)
    time_step_metric = Metric(short_name='steps',
                              long_name='total number of steps',
                              formatting_string='{:5.1f}',
                              higher_is_better=True)
    metrics = [reward_metric, time_step_metric]

    if self.pr.train:
        start_time = time.time()
        policy_save_tag = 0
        while timesteps < self.pr.total_steps:
            result = trainer.train()
            timesteps = result["timesteps_total"]
            reward_metric.log(result['evaluation']['episode_reward_mean'])
            time_step_metric.log(result['evaluation']['episode_len_mean'])

            # import pdb; pdb.set_trace()
            # # Get a metric list from each environment.
            # if hasattr(trainer, "evaluation_workers"):
            #     metric_lists = sum(trainer.evaluation_workers.foreach_worker(
            #         lambda w: w.foreach_env(lambda e: e.metrics)), [])
            # else:
            #     metric_lists = sum(trainer.workers.foreach_worker(
            #         lambda w: w.foreach_env(lambda e: e.metrics)), [])
            # metrics = metric_lists[0]
            # # Aggregate metrics from all other environments.
            # for metric_list in metric_lists[1:]:
            #     for i, metric in enumerate(metric_list):
            #         metrics[i]._values.extend(metric._values)

            save_logs_to = self.pr.save_logs_to
            model_save_paths_dict = self.pr.model_save_paths_dict

            # Consider whether to save a model.
            saved = False
            if model_save_paths_dict is not None and metrics[0].currently_optimal:
                # trainer.get_policy().model.save(model_save_paths_dict)
                policy_save_tag += 1
                trainer.get_policy().model.save_model_in_progress(
                    model_save_paths_dict, policy_save_tag)
                saved = True

            # Write the metrics for this reporting period.
            total_seconds = time.time() - start_time
            logger.write_and_condense_metrics(total_seconds, 'iters',
                                              timesteps, saved, metrics,
                                              tf_writer)

            # Clear the metrics, both those maintained by the training workers
            # and by the evaluation ones.
            condense_fn = lambda environment: [
                m.condense_values() for m in environment.metrics
            ]
            trainer.workers.foreach_worker(
                lambda w: w.foreach_env(condense_fn))
            if hasattr(trainer, "evaluation_workers"):
                trainer.evaluation_workers.foreach_worker(
                    lambda w: w.foreach_env(condense_fn))
    else:
        start_time = time.time()
        env = trainer.workers.local_worker().env
        metrics = env.metrics
        worker = trainer.workers.local_worker()
        steps = steps_since_report = 0

        while True:
            batch = worker.sample()
            current_steps = len(batch["obs"])
            steps += current_steps
            steps_since_report += current_steps

            if steps_since_report >= self.pr.reporting_interval:
                total_seconds = time.time() - start_time

                # Write the metrics for this reporting period.
                logger.write_and_condense_metrics(total_seconds, 'iters',
                                                  steps, False, metrics,
                                                  tf_writer)
                steps_since_report = 0

                if steps >= self.pr.total_steps:
                    break

        env.close()

    # Get a summary metric for the entire stage, based on the environment's
    # first metric.
    summary_metric = logger.summarize_stage(metrics[0])

    # Temporary workaround for https://github.com/ray-project/ray/issues/8205
    ray.shutdown()
    _register_all()

    return summary_metric
def main():
    ray.init()
    logging.getLogger().setLevel(logging.INFO)
    date = datetime.now().strftime('%Y%m%d_%H%M%S')

    parser = argparse.ArgumentParser()
    # parser.add_argument('--scenario', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--config', type=str, default='config/global_config.json',
                        help='config file')
    parser.add_argument('--algo', type=str, default='PPO',
                        choices=['PPO', 'DQN', 'DDQN', 'DuelDQN'],
                        help='choose an algorithm')
    parser.add_argument('--inference', action="store_true",
                        help='inference or training')
    parser.add_argument('--ckpt', type=str,
                        help='checkpoint to restore for inference')
    parser.add_argument('--epoch', type=int, default=10,
                        help='number of training epochs')
    parser.add_argument('--num_step', type=int, default=10**3,
                        help='number of timesteps for one episode, and for inference')
    parser.add_argument('--save_freq', type=int, default=100,
                        help='model saving frequency')
    parser.add_argument('--batch_size', type=int, default=128,
                        help='batch size')
    parser.add_argument('--state_time_span', type=int, default=5,
                        help='state interval to receive long term state')
    parser.add_argument('--time_span', type=int, default=30,
                        help='time interval to collect data')

    args = parser.parse_args()

    config_env = env_config(args)
    # ray.tune.register_env('gym_cityflow', lambda env_config: CityflowGymEnv(config_env))
    config_agent = agent_config(config_env)

    # Build the CityFlow environment trainer.
    trainer = PPOTrainer(env=CityflowGymEnv, config=config_agent)

    for i in range(1000):
        # Perform one iteration of training the policy with PPO.
        result = trainer.train()
        print(pretty_print(result))

        if i % 30 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)
class KandboxAgentRLLibPPO(KandboxAgentPlugin):
    title = "Kandbox Plugin - Agent - realtime - by rllib ppo"
    slug = "ri_agent_rl_ppo"
    author = "Kandbox"
    author_url = "https://github.com/qiyangduan"
    description = "RLLibPPO for GYM for RL."
    version = "0.1.0"

    default_config = {
        "nbr_of_actions": 4,
        "n_epochs": 1000,
        "nbr_of_days_planning_window": 6,
        "model_path": "default_model_path",
        "working_dir": "/tmp",
        "checkpoint_path_key": "ppo_checkpoint_path",
    }
    config_form_spec = {
        "type": "object",
        "properties": {},
    }

    def __init__(self, agent_config, kandbox_config):
        self.agent_config = agent_config
        self.current_best_episode_reward_mean = -99
        env_config = agent_config["env_config"]

        if "rules_slug_config_list" not in env_config.keys():
            if "rules" not in env_config.keys():
                log.error("no rules_slug_config_list and no rules ")
            else:
                env_config["rules_slug_config_list"] = [
                    [rule.slug, rule.config] for rule in env_config["rules"]
                ]
                env_config.pop("rules", None)

        # self.env_class = env_class = agent_config["env"]
        self.kandbox_config = self.default_config.copy()
        self.kandbox_config.update(kandbox_config)
        # self.trained_model = trained_model
        self.kandbox_config["create_datetime"] = datetime.now()

        # self.trainer = None
        self.env_config = env_config
        # self.load_model(env_config=self.env_config)

        print(
            f"KandboxAgentRLLibPPO __init__ called, at time {self.kandbox_config['create_datetime']}"
        )
        # import pdb
        # pdb.set_trace()
        if not ray.is_initialized():
            ray.init(ignore_reinit_error=True, log_to_driver=False)
            # ray.init(redis_address="localhost:6379")

    def build_model(self):
        trainer_config = DEFAULT_CONFIG.copy()
        trainer_config["num_workers"] = 0
        # trainer_config["train_batch_size"] = 640
        # trainer_config["sgd_minibatch_size"] = 160
        # trainer_config["num_sgd_iter"] = 100

        trainer_config["exploration_config"] = {
            "type": "Random",
        }  # EpsilonGreedy(Exploration):
        # trainer_config["exploration_config"] = {
        #     "type": "Curiosity",
        #     "eta": 0.2,
        #     "lr": 0.001,
        #     "feature_dim": 128,
        #     "feature_net_config": {
        #         "fcnet_hiddens": [],
        #         "fcnet_activation": "relu",
        #     },
        #     "sub_exploration": {
        #         "type": "StochasticSampling",
        #     }
        # }
        # trainer_config["log_level"] = "DEBUG"
        """
        if env_config is not None:
            for x in env_config.keys():
                trainer_config[x] = env_config[x]
        """
        # trainer_config["env_config"] = copy.deepcopy(env_config)  # {"rules": "qiyang_role"}
        trainer_config.update(self.agent_config)

        self.trainer = PPOTrainer(trainer_config, self.agent_config["env"])
        # self.config["trainer"] = self.trainer
        return self.trainer

    def load_model(self):  # , allow_empty=None
        env_config = self.agent_config["env_config"]
        self.trainer = self.build_model()

        # if (model_path is not None) & (os.path.exists(model_path)):
        if "ppo_checkpoint_path" in env_config.keys():
            # raise FileNotFoundError("can not find model at path: {}".format(model_path))
            if os.path.exists(env_config["ppo_checkpoint_path"]):
                self.trainer.restore(env_config["ppo_checkpoint_path"])
                print("Reloaded model from path: {} ".format(
                    env_config["ppo_checkpoint_path"]))
            else:
                print(
                    "Env_config has ppo_checkpoint_path = {}, but no files found. "
                    "Returning an initial model.".format(
                        env_config["ppo_checkpoint_path"]))
        else:
            print(
                "Env_config has no ppo_checkpoint_path, returning an initial model"
            )

        # self.config["model_path"] = model_path
        # self.config["trainer"] = self.trainer
        # self.config["policy"] = self.trainer.workers.local_worker().get_policy()
        self.policy = self.trainer.workers.local_worker().get_policy()
        return self.trainer

    def train_model(self):
        # self.trainer = self.build_model()
        for i in range(self.kandbox_config["n_epochs"]):
            result = self.trainer.train()
            # print(pretty_print(result))
            print(
                "Finished training iteration {}, Result: episodes_this_iter: {}, "
                "policy_reward_max: {}, episode_reward_max: {}, episode_reward_mean: {}, "
                "info.num_steps_trained: {}...".format(
                    i,
                    result["episodes_this_iter"],
                    result["policy_reward_max"],
                    result["episode_reward_max"],
                    result["episode_reward_mean"],
                    result["info"]["num_steps_trained"],
                ))
            if result["episode_reward_mean"] > self.current_best_episode_reward_mean * 1.1:
                model_path = self.save_model()
                print(
                    "Model is saved after 10 percent increase, episode_reward_mean = {}, file = {}"
                    .format(result["episode_reward_mean"], model_path))
                self.current_best_episode_reward_mean = result["episode_reward_mean"]

        return self.save_model()

    def save_model(self):
        checkpoint_dir = "{}/model_checkpoint_org_{}_team_{}".format(
            self.agent_config["env_config"]["working_dir"],
            self.agent_config["env_config"]["org_code"],
            self.agent_config["env_config"]["team_id"],
        )
        _path = self.trainer.save(checkpoint_dir=checkpoint_dir)
        # exported_model_dir = "{}/exported_ppo_model_org_{}_team_{}".format(
        #     self.agent_config["env_config"]["working_dir"],
        #     self.agent_config["env_config"]["org_code"],
        #     self.agent_config["env_config"]["team_id"],
        # )
        # self.trainer.get_policy().export_model(exported_model_dir + "/1")
        return _path  # self.trainer

    def predict_action(self, observation=None):
        action = self.trainer.compute_action(observation)
        return action

    def predict_action_list(self, env=None, job_code=None, observation=None):
        actions = []
        if env is not None:
            self.env = env
        else:
            env = self.env

        if job_code is None:
            job_i = env.current_job_i
        else:
            job_i = env.jobs_dict[job_code].job_index

        observation = env._get_observation()
        # export_dir = "/Users/qiyangduan/temp/kandbox/exported_ppo_model_org_duan3_team_3/1"
        # loaded_policy = tf.saved_model.load(export_dir)
        # loaded_policy.signatures["serving_default"](observations=observation)
        predicted_action = self.trainer.compute_action(observation)
        # predicted_action = self.policy.compute_action(observation)

        for _ in range(len(env.workers)):  # hist_job_workers_ranked:
            if len(actions) >= self.kandbox_config["nbr_of_actions"]:
                return actions
            actions.append(list(predicted_action).copy())
            max_i = np.argmax(predicted_action[0:len(env.workers)])
            predicted_action[max_i] = 0

        return actions

    def predict_action_dict_list(self, env=None, job_code=None, observation=None):
        if env is not None:
            self.env = env
        else:
            env = self.env

        curr_job = copy.deepcopy(env.jobs_dict[job_code])
        if job_code is None:
            job_i = env.current_job_i
        else:
            job_i = curr_job.job_index
        env.current_job_i = job_i

        observation = env._get_observation()
        action = self.predict_action(observation=observation)
        action_dict = env.decode_action_into_dict_native(action=action)

        action_day = int(action_dict.scheduled_start_minutes / 1440)
        curr_job.requested_start_min_minutes = action_day * 1440
        curr_job.requested_start_max_minutes = (action_day + 1) * 1440

        action_dict_list = self.env.recommendation_server.search_action_dict_on_worker_day(
            a_worker_code_list=action_dict.scheduled_worker_codes,
            curr_job=curr_job,
            max_number_of_matching=3,
        )
        return action_dict_list
def fulltest(total_trials, training_trials, d, m, q, train_check,
             evaluation_trials=5000, lr=0.00005, num_workers=4, num_gpus=0.25,
             SDP=True, LG=False, local_SDP=False, dep=True,
             rngvec=np.ones(1000)):
    quantization = 20
    separable = True
    bigvec = np.zeros((total_trials, int(training_trials / train_check) + 1))
    vec_SDP = []
    vec_local_SDP = []
    vec_LG = []

    for j in range(total_trials):
        print("Starting round", j, "of", total_trials)
        rho, _ = qsdl.generate_initial_state(d, m, rng=rngvec[j], depolarized=dep)

        if local_SDP:
            lg = max_SDP_sim_order(q, rho, len(d), 1250, d)
            vec_local_SDP.append(lg)
            print("local SDP-based")
            print(lg)
        if SDP:
            sdpr = sdp.SDP(rho, q, len(d))
            vec_SDP.append(sdpr)
            print("SDP")
            print(sdpr)
        if LG:
            lg = LG_sim_order(copy.copy(q), copy.copy(rho), len(d), 2500, d)
            vec_LG.append(lg)
            print("LG")
            print(lg)
            print("RLNN: ")
            print(bigvec[-1])

        defaultconfig = {
            "rho": copy.copy(rho),
            "q": copy.copy(q),
            "quantization": quantization,
            "d": d,
            "separable": True
        }
        vec = []

        ray.shutdown()
        ray.init(**ray_init_kwargs)
        config = ppo.DEFAULT_CONFIG.copy()
        if num_gpus > 0:
            config["num_gpus"] = num_gpus
        config["num_workers"] = num_workers
        config["lr"] = lr
        config["train_batch_size"] = 8000
        config["num_sgd_iter"] = 5
        config["env_config"] = defaultconfig
        trainer = ppo.PPOTrainer(config=config, env=qsdl.QSDEnv)

        for i in range(training_trials):
            result = trainer.train()
            print("train iteration", i + 1, "/", training_trials,
                  " avg_reward =", result["episode_reward_mean"],
                  " timesteps =", result["timesteps_total"])
            # if i % check == check - 1:
            #     checkpoint = trainer.save()
            #     print("checkpoint saved at", checkpoint)
            if i == 0 or (i + 1) % train_check == 0:
                rew = 0
                # Use a separate loop variable so the training counter `i` is not shadowed.
                for k in range(evaluation_trials):
                    env = qsdl.QSDEnv(defaultconfig)
                    obs = env.reset()
                    done = False
                    while not done:
                        action = trainer.compute_action(obs)
                        obs, r, done, _ = env.step(action)
                        rew += r
                vec.append(rew / evaluation_trials)

        bigvec[j] = vec

    return bigvec, vec_SDP, vec_local_SDP, vec_LG
"policy_mapping_fn": policy_mapping_fn, }, "framework": "tf", } # Create the Trainer used for Policy serving. trainer = PPOTrainer(env="fake_unity", config=config) # Attempt to restore from checkpoint if possible. checkpoint_path = CHECKPOINT_FILE.format(args.env) if not args.no_restore and os.path.exists(checkpoint_path): checkpoint_path = open(checkpoint_path).read() print("Restoring from checkpoint path", checkpoint_path) trainer.restore(checkpoint_path) # Serving and training loop. count = 0 while True: # Calls to train() will block on the configured `input` in the Trainer # config above (PolicyServerInput). print(trainer.train()) if count % args.checkpoint_freq == 0: print("Saving learning progress to checkpoint file.") checkpoint = trainer.save() # Write the latest checkpoint location to CHECKPOINT_FILE, # so we can pick up from the latest one after a server re-start. with open(checkpoint_path, "w") as f: f.write(checkpoint) count += 1
"sample_batch_size": 20, "sgd_minibatch_size": 500, "num_sgd_iter": 10, "num_workers": 1, # 32 "num_envs_per_worker": 1, #5 "num_gpus": 1, "model": { "dim": 64 } }) def env_creator(env_config): return PodWorldEnv(max_steps=10000, reward_factor=10000.0) register_env("podworld_env", env_creator) agent = PPOTrainer(config=config, env="podworld_env") agent_save_path = None for i in range(50): stats = agent.train() # print(pretty_print(stats)) if i % 5 == 0 and i > 0: path = agent.save() if agent_save_path is None: agent_save_path = path print('Saved agent at', agent_save_path) logger.write((i, stats['episode_reward_min'])) print('episode_reward_mean', stats['episode_reward_min'])
def train_func():
    default_config = {
        'env': 'JSSEnv:jss-v1',
        'seed': 0,
        'framework': 'tf',
        'log_level': 'WARN',
        'num_gpus': 1,
        'instance_path': 'instances/ta41',
        'evaluation_interval': None,
        'metrics_smoothing_episodes': 2000,
        'gamma': 1.0,
        'num_workers': mp.cpu_count(),
        'layer_nb': 2,
        'train_batch_size': mp.cpu_count() * 4 * 704,
        'num_envs_per_worker': 4,
        'rollout_fragment_length': 704,  # TO TUNE
        'sgd_minibatch_size': 33000,
        'layer_size': 319,
        'lr': 0.0006861,  # TO TUNE
        'lr_start': 0.0006861,  # TO TUNE
        'lr_end': 0.00007783,  # TO TUNE
        'clip_param': 0.541,  # TO TUNE
        'vf_clip_param': 26,  # TO TUNE
        'num_sgd_iter': 12,  # TO TUNE
        "vf_loss_coeff": 0.7918,
        "kl_coeff": 0.496,
        'kl_target': 0.05047,  # TO TUNE
        'lambda': 1.0,
        'entropy_coeff': 0.0002458,  # TUNE LATER
        'entropy_start': 0.0002458,
        'entropy_end': 0.002042,
        'entropy_coeff_schedule': None,
        "batch_mode": "truncate_episodes",
        "grad_clip": None,
        "use_critic": True,
        "use_gae": True,
        "shuffle_sequences": True,
        "vf_share_layers": False,
        "observation_filter": "NoFilter",
        "simple_optimizer": False,
        "_fake_gpus": False,
    }

    wandb.init(config=default_config)
    ray.init()

    tf.random.set_seed(0)
    np.random.seed(0)
    random.seed(0)

    config = wandb.config

    ModelCatalog.register_custom_model("fc_masked_model_tf", FCMaskedActionsModelTF)

    config['model'] = {
        "fcnet_activation": "relu",
        "custom_model": "fc_masked_model_tf",
        'fcnet_hiddens': [config['layer_size'] for k in range(config['layer_nb'])],
        "vf_share_layers": False,
    }
    config['env_config'] = {
        'env_config': {
            'instance_path': config['instance_path']
        }
    }

    config = with_common_config(config)
    config['seed'] = 0
    config['callbacks'] = CustomCallbacks
    config['train_batch_size'] = config['sgd_minibatch_size']

    config['lr'] = config['lr_start']
    config['lr_schedule'] = [[0, config['lr_start']],
                             [15000000, config['lr_end']]]

    config['entropy_coeff'] = config['entropy_start']
    config['entropy_coeff_schedule'] = [[0, config['entropy_start']],
                                        [15000000, config['entropy_end']]]

    config.pop('instance_path', None)
    config.pop('layer_size', None)
    config.pop('layer_nb', None)
    config.pop('lr_start', None)
    config.pop('lr_end', None)
    config.pop('entropy_start', None)
    config.pop('entropy_end', None)

    stop = {
        "time_total_s": 10 * 60,
    }

    start_time = time.time()
    trainer = PPOTrainer(config=config)

    while start_time + stop['time_total_s'] > time.time():
        result = trainer.train()
        result = wandb_tune._clean_log(result)
        log, config_update = _handle_result(result)
        wandb.log(log)
        # wandb.config.update(config_update, allow_val_change=True)

    # trainer.export_policy_model("/home/jupyter/JSS/JSS/models/")
    ray.shutdown()
"policies": policies, "policy_mapping_fn": lambda agent_id: "ppo_policy", }, # "num_gpus": 0, # "num_gpus_per_worker": 0, "callbacks": PlayerScoreCallbacks }) if restore_checkpoint: trainer.restore(checkpoint_path) start = time.time() try: for i in range(num_iter): res = trainer.train() print("Iteration {}. policy result: {}".format(i, res)) if i % eval_every == 0: trainer_eval.set_weights(trainer.get_weights(["ppo_policy"])) res = trainer_eval.train() if i % checkpoint_every == 0: trainer.save() except: trainer.save() stop = time.time() train_duration = time.strftime('%H:%M:%S', time.gmtime(stop - start)) print( 'Training finished ({}), check the results in ~/ray_results/<dir>/'.format( train_duration))
}

# analysis = tune.run(
#     "PPO",
#     stop={
#         "episode_reward_mean": 500000
#     },
#     config=trainer_config,
#     loggers=DEFAULT_LOGGERS + (WandbLogger, ),
#     checkpoint_at_end=True
# )

## debug code
ray.init(num_gpus=num_gpus, local_mode=True)
agent = PPOTrainer(env="TradingEnv", config=trainer_config)
agent.train()

# compute final reward
# ray.init(num_gpus=1, local_mode=False)
# env = build_env({
#     "window_size": 25
# })
# episode_reward = 0
# done = False
# obs = env.reset()
#
# while not done:
#     action = agent.compute_action(obs)
#     obs, reward, done, info = env.step(action)
#     episode_reward += reward
# print(f'reward: {episode_reward}')
ten_gig = 10737418240  # 10 GiB in bytes

trainer = PPOTrainer(
    env="ic20env",
    config=merge_dicts(
        DEFAULT_CONFIG,
        {
            # -- Rollout-Worker
            'num_gpus': 1,
            'num_workers': 10,
            "num_envs_per_worker": 1,
            "num_cpus_per_worker": 1,
            "memory_per_worker": ten_gig,
            'gamma': 0.99,
            'lambda': 0.95
        }))

# Attempt to restore from checkpoint if possible.
if os.path.exists(CHECKPOINT_FILE):
    checkpoint_path = open(CHECKPOINT_FILE).read()
    print("Restoring from checkpoint path", checkpoint_path)
    trainer.restore(checkpoint_path)

# Serving and training loop
while True:
    print(pretty_print(trainer.train()))
    checkpoint_path = trainer.save()
    print("Last checkpoint", checkpoint_path)
    with open(CHECKPOINT_FILE, "w") as f:
        f.write(checkpoint_path)
def test_local(self):
    cf = DEFAULT_CONFIG.copy()
    agent = PPOTrainer(cf, "CartPole-v0")
    print(agent.train())
def test_simple_optimizer_sequencing(self):
    ModelCatalog.register_custom_model("rnn", RNNSpyModel)
    register_env("counter", lambda _: DebugCounterEnv())
    ppo = PPOTrainer(
        env="counter",
        config={
            "num_workers": 0,
            "rollout_fragment_length": 10,
            "train_batch_size": 10,
            "sgd_minibatch_size": 10,
            "num_sgd_iter": 1,
            "simple_optimizer": True,
            "model": {
                "custom_model": "rnn",
                "max_seq_len": 4,
                "vf_share_layers": True,
            },
            "framework": "tf",
        },
    )
    ppo.train()
    ppo.train()

    batch0 = pickle.loads(
        ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0"))
    self.assertEqual(
        batch0["sequences"].tolist(),
        [[[0], [1], [2], [3]], [[4], [5], [6], [7]], [[8], [9], [0], [0]]],
    )
    self.assertEqual(batch0[SampleBatch.SEQ_LENS].tolist(), [4, 4, 2])
    self.assertEqual(batch0["state_in"][0][0].tolist(), [0, 0, 0])
    self.assertEqual(batch0["state_in"][1][0].tolist(), [0, 0, 0])
    self.assertGreater(abs(np.sum(batch0["state_in"][0][1])), 0)
    self.assertGreater(abs(np.sum(batch0["state_in"][1][1])), 0)
    self.assertTrue(
        np.allclose(
            batch0["state_in"][0].tolist()[1:],
            batch0["state_out"][0].tolist()[:-1]))
    self.assertTrue(
        np.allclose(
            batch0["state_in"][1].tolist()[1:],
            batch0["state_out"][1].tolist()[:-1]))

    batch1 = pickle.loads(
        ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1"))
    self.assertEqual(
        batch1["sequences"].tolist(),
        [
            [[10], [11], [12], [13]],
            [[14], [0], [0], [0]],
            [[0], [1], [2], [3]],
            [[4], [0], [0], [0]],
        ],
    )
    self.assertEqual(batch1[SampleBatch.SEQ_LENS].tolist(), [4, 1, 4, 1])
    self.assertEqual(batch1["state_in"][0][2].tolist(), [0, 0, 0])
    self.assertEqual(batch1["state_in"][1][2].tolist(), [0, 0, 0])
    self.assertGreater(abs(np.sum(batch1["state_in"][0][0])), 0)
    self.assertGreater(abs(np.sum(batch1["state_in"][1][0])), 0)
    self.assertGreater(abs(np.sum(batch1["state_in"][0][1])), 0)
    self.assertGreater(abs(np.sum(batch1["state_in"][1][1])), 0)
    self.assertGreater(abs(np.sum(batch1["state_in"][0][3])), 0)
    self.assertGreater(abs(np.sum(batch1["state_in"][1][3])), 0)
def main() -> None:
    ray.init()
    np.random.seed(0)

    # instructions = {
    #     0: [Instruction(time=0, x=5, y=5)],
    #     1: [Instruction(time=1, x=5, y=5), Instruction(time=1, x=1, y=5)],
    #     2: [Instruction(time=2, x=5, y=5, rng=np.random.default_rng())],
    # }
    # task = Task(
    #     target_x=1,
    #     target_y=5,
    #     instructions=instructions,
    #     tot_frames=4,
    #     width=42,
    #     height=42,
    # )
    # task = ODR(target_x=1, target_y=5, width=42, height=42)
    # task = Gap(target_x=1, target_y=5, width=42, height=42)
    task = ODRDistract(target_x=1, target_y=5, width=42, height=42)

    def env_creator(env_config):
        return Environment(env_config)  # return an env instance

    register_env("my_env", env_creator)

    # trainer_config = DEFAULT_CONFIG.copy()
    # trainer_config["num_workers"] = 1
    # trainer_config["train_batch_size"] = 20  # 100
    # trainer_config["sgd_minibatch_size"] = 15  # 32
    # trainer_config["num_sgd_iter"] = 50

    trainer = PPOTrainer(
        env="my_env",
        config={
            "env_config": {"task": task},
            "framework": "torch",
            "num_workers": 1,
            "train_batch_size": 10,
            "sgd_minibatch_size": 5,
            "num_sgd_iter": 10,
            # "model": {
            #     # Whether to wrap the model with an LSTM.
            #     "use_lstm": True,
            #     # Max seq len for training the LSTM, defaults to 20.
            #     "max_seq_len": task.tot_frames - 1,
            #     # Size of the LSTM cell.
            #     "lstm_cell_size": task.tot_frames - 1,
            #     # Whether to feed a_{t-1}, r_{t-1} to LSTM.
            #     # "lstm_use_prev_action_reward": False,
            # },
        },
    )
    # NOTE: this second assignment replaces the PPO trainer above, so the
    # training loop below actually runs A2C.
    trainer = A2CTrainer(
        env="my_env",
        config={
            "env_config": {"task": task},
            "framework": "torch",
            "num_workers": 1,
            "train_batch_size": 10,
            # "model": {...},  # same commented-out LSTM options as above
        },
    )
    # trainer = DQNTrainer(
    #     env="my_env",
    #     config={
    #         "env_config": {"task": task},
    #         "framework": "torch",
    #         "num_workers": 1,
    #         "train_batch_size": 10,
    #         # "model": {...},  # same commented-out LSTM options as above
    #     },
    # )

    env = Environment(env_config={"task": task})

    for i in range(200):
        print(f"Training iteration {i}...")
        trainer.train()

    done = False
    cumulative_reward = 0.0
    observation = env.reset()
    while not done:
        action = trainer.compute_action(observation)
        observation, reward, done, results = env.step(action)
        print(f"Time: {env.time}. Action: {action}")
        cumulative_reward += reward

    print(
        f"Last step reward: {reward: .3e}; Cumulative reward: {cumulative_reward:.3e}"
    )
def test_minibatch_sequencing(self):
    ModelCatalog.register_custom_model("rnn", RNNSpyModel)
    register_env("counter", lambda _: DebugCounterEnv())
    ppo = PPOTrainer(
        env="counter",
        config={
            "shuffle_sequences": False,  # for deterministic testing
            "num_workers": 0,
            "rollout_fragment_length": 20,
            "train_batch_size": 20,
            "sgd_minibatch_size": 10,
            "vf_share_layers": True,
            "simple_optimizer": False,
            "num_sgd_iter": 1,
            "model": {
                "custom_model": "rnn",
                "max_seq_len": 4,
                "state_shape": [3, 3],
            },
            "framework": "tf",
        })
    ppo.train()
    ppo.train()

    # First epoch: 20 observations get split into 2 minibatches of 8;
    # four observations are discarded.
    batch0 = pickle.loads(
        ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0"))
    batch1 = pickle.loads(
        ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1"))
    if batch0["sequences"][0][0][0] > batch1["sequences"][0][0][0]:
        batch0, batch1 = batch1, batch0  # sort minibatches
    self.assertEqual(batch0["seq_lens"].tolist(), [4, 4])
    self.assertEqual(batch1["seq_lens"].tolist(), [4, 3])
    self.assertEqual(batch0["sequences"].tolist(), [
        [[0], [1], [2], [3]],
        [[4], [5], [6], [7]],
    ])
    self.assertEqual(batch1["sequences"].tolist(), [
        [[8], [9], [10], [11]],
        [[12], [13], [14], [0]],
    ])

    # Second epoch: 20 observations get split into 2 minibatches of 8;
    # four observations are discarded.
    batch2 = pickle.loads(
        ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_2"))
    batch3 = pickle.loads(
        ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_3"))
    if batch2["sequences"][0][0][0] > batch3["sequences"][0][0][0]:
        batch2, batch3 = batch3, batch2
    self.assertEqual(batch2["seq_lens"].tolist(), [4, 4])
    self.assertEqual(batch3["seq_lens"].tolist(), [2, 4])
    self.assertEqual(batch2["sequences"].tolist(), [
        [[5], [6], [7], [8]],
        [[9], [10], [11], [12]],
    ])
    self.assertEqual(batch3["sequences"].tolist(), [
        [[13], [14], [0], [0]],
        [[0], [1], [2], [3]],
    ])
def train_poker_approx_best_response_nfsp(
        br_player,
        ray_head_address,
        scenario,
        general_trainer_config_overrrides,
        br_policy_config_overrides,
        get_stopping_condition,
        avg_policy_specs_for_players: Dict[int, StrategySpec],
        results_dir: str,
        trainer_class_override=None,
        br_policy_class_override=None,
        print_train_results: bool = True):

    env_class = scenario.env_class
    env_config = scenario.env_config

    other_player = 1 - br_player
    env_config["discrete_actions_for_players"] = [other_player]

    policy_classes: Dict[str, Type[Policy]] = scenario.policy_classes
    if br_policy_class_override is not None:
        policy_classes["best_response"] = br_policy_class_override

    get_trainer_config = scenario.get_trainer_config
    should_log_result_fn = scenario.ray_should_log_result_filter

    init_ray_for_scenario(scenario=scenario,
                          head_address=ray_head_address,
                          logging_level=logging.INFO)

    def log(message, level=logging.INFO):
        logger.log(level, message)

    def select_policy(agent_id):
        if agent_id == br_player:
            return "best_response"
        else:
            return "average_policy"

    tmp_env = env_class(env_config=env_config)

    all_discrete_action_env_config = env_config.copy()
    all_discrete_action_env_config["discrete_actions_for_players"] = [0, 1]
    all_discrete_action_tmp_env = env_class(env_config=all_discrete_action_env_config)
    avg_policy_model_config = get_trainer_config(all_discrete_action_tmp_env)["model"]

    from ray.rllib.agents.ppo import PPOTrainer, PPOTorchPolicy
    from grl.rl_apps.scenarios.trainer_configs.loss_game_configs import loss_game_psro_ppo_params

    br_trainer_config = {
        "log_level": "INFO",
        # "callbacks": None,
        "env": env_class,
        "env_config": env_config,
        "gamma": 1.0,
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        # "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "num_gpus": 0.0,
        "num_workers": 0,
        "num_gpus_per_worker": 0.0,
        "num_envs_per_worker": 1,
        "multiagent": {
            "policies_to_train": ["best_response"],
            "policies": {
                "average_policy": (policy_classes["average_policy"],
                                   tmp_env.observation_space,
                                   tmp_env.discrete_action_space, {
                                       "model": avg_policy_model_config,
                                       "explore": False,
                                   }),
                "best_response": (PPOTorchPolicy,
                                  tmp_env.observation_space,
                                  tmp_env.continuous_action_space, {}),
            },
            "policy_mapping_fn": select_policy,
        },
    }
    # br_trainer_config = merge_dicts(br_trainer_config, get_trainer_config(tmp_env))
    br_trainer_config = merge_dicts(br_trainer_config,
                                    loss_game_psro_ppo_params(tmp_env))

    br_trainer = PPOTrainer(config=br_trainer_config,
                            logger_creator=get_trainer_logger_creator(
                                base_dir=results_dir,
                                scenario_name="approx_br",
                                should_log_result_fn=should_log_result_fn))

    def _set_avg_policy(worker: RolloutWorker):
        avg_policy = worker.policy_map["average_policy"]
        load_pure_strat(
            policy=avg_policy,
            pure_strat_spec=avg_policy_specs_for_players[1 - br_player])

    br_trainer.workers.foreach_worker(_set_avg_policy)

    br_trainer.latest_avg_trainer_result = None
    train_iter_count = 0

    stopping_condition: StoppingCondition = get_stopping_condition()

    max_reward = None
    while True:
        # Do a step (or several) in the main RL loop.
        train_iter_results = br_trainer.train()
        br_reward_this_iter = train_iter_results["policy_reward_mean"]["best_response"]

        if max_reward is None or br_reward_this_iter > max_reward:
            max_reward = br_reward_this_iter

        train_iter_count += 1
        if print_train_results:
            # Delete verbose debugging info before printing.
            if "hist_stats" in train_iter_results:
                del train_iter_results["hist_stats"]
            if "td_error" in train_iter_results["info"]["learner"]["best_response"]:
                del train_iter_results["info"]["learner"]["best_response"]["td_error"]
            print(pretty_dict_str(train_iter_results))
            log(f"Trainer logdir is {br_trainer.logdir}")

        if stopping_condition.should_stop_this_iter(
                latest_trainer_result=train_iter_results):
            print("stopping condition met.")
            break

    return max_reward, None
config['num_workers'] = 1
config['num_gpus'] = 1
config['framework'] = "torch"
config['gamma'] = 0.1
config['monitor'] = False

# PPO config ...
# config['lr'] = 1e-4
# config['train_batch_size']

config['model']['dim'] = 21
config['model']['conv_filters'] = [[8, [4, 4], 2],
                                   [16, [2, 2], 2],
                                   [512, [6, 6], 1]]
                                   # [config['train_batch_size'], 4, 1, 1]

# trainer = PPOTrainer(config=config, env="mars_explorer:explorer-v01")
trainer = PPOTrainer(config=config, env="custom-explorer")
# import pdb; pdb.set_trace()

PATH = "/home/dkoutras/ray_results/290_out_of_400/checkpoint_2991/checkpoint-2991"
trainer.restore(PATH)

import pdb
pdb.set_trace()

for _ in range(10):
    initial_time = time.time()
    result = trainer.train()
    print(
        f"mean:{result['episode_reward_mean']} time:{time.time() - initial_time:.2f}[sec]"
    )
import ray
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.models import ModelCatalog
from ray.tune.registry import register_env
from rlcard.rllib_utils.model import ParametricActionsModel
from rlcard.rllib_utils.examples.envs.rps_env import RockPaperScissors

# Register env and model to be used by rllib
register_env("ParametricRPS", lambda _: RockPaperScissors())
ModelCatalog.register_custom_model("parametric_model_tf", ParametricActionsModel)

# Initialize ray
ray.init(num_cpus=4)

# Train the ParametricActionsModel on RockPaperScissors with PPO
ppo_trainer_config = {
    "env": "ParametricRPS",  # RockPaperScissors
    "model": {
        "custom_model": "parametric_model_tf",  # ParametricActionsModel
    },
}
trainer = PPOTrainer(config=ppo_trainer_config)
for i in range(5):
    res = trainer.train()
    print("Iteration {}. episode_reward_mean: {}".format(
        i, res['episode_reward_mean']))

print('Training finished, check the results in ~/ray_results/<dir>/')
import gym
import ray
from ray.rllib.agents.ppo import PPOTrainer, DEFAULT_CONFIG
from ray.tune.logger import pretty_print

ray.init(num_gpus=1)

config = DEFAULT_CONFIG.copy()
config['num_gpus'] = 1
config['num_workers'] = 1
config['num_sgd_iter'] = 30
config['sgd_minibatch_size'] = 128
config['model']['fcnet_hiddens'] = [100, 100]
# This avoids running out of resources in the notebook environment when this cell is re-executed.
config['num_cpus_per_worker'] = 0

agent = PPOTrainer(config, 'CartPole-v0')

for i in range(5):
    result = agent.train()
    print(pretty_print(result))
"training_iteration", "time_total_s", "timesteps_total", "episode_reward_max", "episode_reward_mean", [ "info", [ "sample_time_ms", "grad_time_ms", "opt_peak_throughput", "sample_peak_throughput" ] ] ] try: result = {"timesteps_total": 0} while result["timesteps_total"] < timesteps_total: # Perform one iteration of training the policy result = train_agent.train() # Print the training status for field in results_fields_filter: if not isinstance(field, list): if field in result.keys(): print(f"{field}: {result[field]}") else: for subfield in field[1]: if subfield in result[field[0]].keys(): print(f"{subfield} : {result[field[0]][subfield]}") print("============================") except KeyboardInterrupt: print("Interrupting training...") finally: checkpoint_path = train_agent.save()