}

ppo_trainer_config = {
    "env": "ParametricScopone",
    "multiagent": {
        "policies_to_train": ["ppo_policy_nico"],
        "policies": policies,
        "policy_mapping_fn": lambda agent_id: "ppo_policy_albi"
        if agent_id in ("player_1", "player_3") else "ppo_policy_nico",
    },
    "observation_filter": "NoFilter",
    "callbacks": PlayerScoreCallbacks
}

trainer = PPOTrainer(config=ppo_trainer_config)
if restore_checkpoint:
    trainer.restore(checkpoint_path)

trainer.get_policy("ppo_policy_albi").model.base_model.summary()
trainer.get_policy("ppo_policy_nico").model.base_model.summary()

for i in range(10000):
    res = trainer.train()
    print("Iteration {}. policy_reward_mean: {}".format(
        i, res['policy_reward_mean']))
    if i % checkpoint_every == 0:
        trainer.save()

print('Training finished, check the results in ~/ray_results/<dir>/')
        return out, []

    @override(ModelV2)
    def value_function(self):
        return tf.reshape(self._value_out, [-1])


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    ModelCatalog.register_custom_model("bn_model", BatchNormModel)
    config = {
        "env": "Pendulum-v0" if args.run == "DDPG" else "CartPole-v0",
        "model": {
            "custom_model": "bn_model",
        },
        "num_workers": 0,
    }

    from ray.rllib.agents.ppo import PPOTrainer
    trainer = PPOTrainer(config=config)
    trainer.train()

    tune.run(
        args.run,
        stop={"training_iteration": args.num_iters},
        config=config,
    )
"my_model", TorchMobileV2PlusRNNModel if args.torch else MobileV2PlusRNNModel) # Configure our Trainer. config = { "framework": "torch" if args.torch else "tf", "model": { "custom_model": "my_model", # Extra config passed to the custom model's c'tor as kwargs. "custom_model_config": { "cnn_shape": cnn_shape_torch if args.torch else cnn_shape, }, "max_seq_len": 20, }, "vf_share_layers": True, "num_workers": 0, # no parallelism "env_config": { "action_space": Discrete(2), # Test a simple Image observation space. "observation_space": Box(0.0, 1.0, shape=cnn_shape_torch if args.torch else cnn_shape, dtype=np.float32) }, } trainer = PPOTrainer(config=config, env=RandomEnv) print(trainer.train())
def my_train_fn(config, reporter):
    iterations = config.pop("train-iterations", 10)

    # Train for n iterations with high LR
    agent1 = PPOTrainer(env="CartPole-v0", config=config)
    for _ in range(iterations):
        result = agent1.train()
        result["phase"] = 1
        reporter(**result)
        phase1_time = result["timesteps_total"]
    state = agent1.save()
    agent1.stop()

    # Train for n iterations with low LR
    config["lr"] = 0.0001
    agent2 = PPOTrainer(env="CartPole-v0", config=config)
    agent2.restore(state)
    for _ in range(iterations):
        result = agent2.train()
        result["phase"] = 2
        result["timesteps_total"] += phase1_time  # keep time moving forward
        reporter(**result)
    agent2.stop()
agent_cfg["grad_clip"] = None  # If not None, clip gradients during optimization at this value

# ================= Configure the learning algorithm ================= #

# agent_cfg = copy.deepcopy(DEFAULT_CONFIG)
# agent_cfg["lr"] = 5.0e-6
# agent_cfg["lr_schedule"] = [
#     [      0, 5.0e-6],
#     [ 100000, 1.0e-6],
#     [ 400000, 1.0e-6],
#     [ 800000, 1.0e-7],
#     [1000000, 1.0e-7],
#     [1200000, 1.0e-9],
# ]

train_agent = Trainer(agent_cfg, env="my_custom_env")

# ================= Run the optimization =================

timesteps_total = 400000
results_fields_filter = [
    "training_iteration",
    "time_total_s",
    "timesteps_total",
    "episode_reward_max",
    "episode_reward_mean",
    ["info", [
        "sample_time_ms",
        "grad_time_ms",
        "opt_peak_throughput",
        "sample_peak_throughput"
    ]]
]
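# How `timesteps_total` and `results_fields_filter` are consumed is not shown in
# this fragment. A plausible sketch (an assumption, not the original loop) is to
# keep calling `train_agent.train()` until the timestep budget is reached and to
# print only the filtered fields from each result dict:
result = {"timesteps_total": 0}
while result["timesteps_total"] < timesteps_total:
    result = train_agent.train()
    for field in results_fields_filter:
        if isinstance(field, str):
            print(field, ":", result.get(field))
        else:
            # Nested spec such as ["info", ["sample_time_ms", ...]].
            parent, children = field
            for child in children:
                print("{}.{}".format(parent, child), ":",
                      result.get(parent, {}).get(child))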
    # Train the "main" policy to play really well using self-play.
    results = None
    if not args.from_checkpoint:
        results = tune.run(
            "PPO",
            config=config,
            stop=stop,
            checkpoint_at_end=True,
            checkpoint_freq=10,
            verbose=1)

    # Restore trained trainer (set to non-explore behavior) and play against
    # human on command line.
    if args.num_episodes_human_play > 0:
        num_episodes = 0
        trainer = PPOTrainer(config=dict(config, **{"explore": False}))
        if args.from_checkpoint:
            trainer.restore(args.from_checkpoint)
        else:
            trainer.restore(results.get_last_checkpoint())

        # Play from the command line against the trained agent
        # in an actual (non-RLlib-wrapped) open-spiel env.
        human_player = 1
        env = Environment("connect_four")

        while num_episodes < args.num_episodes_human_play:
            print("You play as {}".format("o" if human_player else "x"))
            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
    omega_r, omega_l = u_r / k_r_inv, u_l / k_l_inv
    vel = (omega_r + omega_l) * radius / 2
    angle = (omega_r * radius - vel) / (0.5 * baseline)
    angle_backup = (vel - omega_l * radius) / (0.5 * baseline)
    # print("angle is {}, angle backup is {}".format(angle, angle_backup))
    assert math.isclose(angle, angle_backup)
    return np.array([vel, angle])


# Set up env
ray.init(**config["ray_init_config"])
register_env('Duckietown', launch_and_wrap_env)

###########################################################
# Restore agent
trainer = PPOTrainer(config=config["rllib_config"])
trainer.restore(checkpoint_path)
print_config(trainer.config)

# add seed to env config
seed = args.seed
actions = []

###########################################################
###########################################################
# Simple demonstration of closed loop performance
if not (args.analyse_trajectories or args.visualize_salient_obj or
        args.reward_plots or args.visualize_dot_trajectories):
    # env = Monitor(env, "gym_monitor_results", write_upon_reset=True, force=True)
    'num_workers': 0,
    "multiagent": {
        "policies": policies,
        "policy_mapping_fn": select_policy,
    },
    "clip_actions": True,
    "framework": "torch",
    # "num_sgd_iter": 4,
    "lr": 1e-4,
    # "kl_target": 0.03,
    # "train_batch_size": 1024,
    "rollout_fragment_length": 100,
    # "sgd_minibatch_size": 32
}

trainer = PPOTrainer(env="wanderer_roborobo", config=config)
print(trainer.config.get('no_final_linear'))
print('model built')

stop_iter = 2000

#%%
import numpy as np

for i in range(stop_iter):
    print("== Iteration", i, "==")
    result_ppo = trainer.train()
    # pretty_print() only returns a string, so print it to actually show the result.
    print(pretty_print(result_ppo))
    if (i + 1) % 200 == 0:
        trainer.save('model')

trainer.save('model')
del trainer
ray.shutdown()
ray.init(address='auto')  # address = None when running locally. address = 'auto' when running on aws.

obs_state_processor = SimpleObsStateProcessor(
    pathogen_sorting_strategy=infected_population_sorting_per_city)
act_state_processor = SimpleActStateProcessor(
    sort_pathogens=obs_state_processor.sort_pathogens)

# Notice that trial_max will only work for stochastic policies
register_env("ic20env",
             lambda _: SimplifiedIC20Environment(obs_state_processor,
                                                 act_state_processor,
                                                 UnstableReward(),
                                                 trial_max=10))

ten_gig = 10737418240

trainer = PPOTrainer(
    env="ic20env",
    config=merge_dicts(DEFAULT_CONFIG, {
        # -- Rollout-Worker
        'num_gpus': 0,
        'num_workers': 5,
        "num_envs_per_worker": 1,
        "num_cpus_per_worker": 1,
        "memory_per_worker": ten_gig,
        'gamma': 0.99,
    }))

# Attempt to restore from checkpoint if possible.
if os.path.exists(CHECKPOINT_FILE):
    checkpoint_path = open(CHECKPOINT_FILE).read()
    print("Restoring from checkpoint path", checkpoint_path)
    trainer.restore(checkpoint_path)

# Serving and training loop
while True:
    print(pretty_print(trainer.train()))
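# The loop above only trains; it never writes anything back to CHECKPOINT_FILE,
# so the restore branch has nothing to read on the next run. A hedged sketch of
# how the loop could be extended (the save frequency and file handling are
# assumptions, not part of the original code):
#
#     iteration = 0
#     while True:
#         print(pretty_print(trainer.train()))
#         iteration += 1
#         if iteration % 10 == 0:
#             path = trainer.save()
#             with open(CHECKPOINT_FILE, "w") as f:
#                 f.write(path)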
    # Train the "main" policy to play really well using self-play.
    results = None
    if not args.from_checkpoint:
        results = tune.run(
            "PPO",
            config=config,
            stop=stop,
            checkpoint_at_end=True,
            checkpoint_freq=10,
            verbose=3)

    # Restore trained trainer (set to non-explore behavior) and play against
    # human on command line.
    if args.num_episodes_human_play > 0:
        num_episodes = 0
        trainer = PPOTrainer(config=dict(config, **{"explore": False}))
        if args.from_checkpoint:
            trainer.restore(args.from_checkpoint)
        else:
            checkpoint = results.get_last_checkpoint()
            if not checkpoint:
                raise ValueError("No last checkpoint found in results!")
            trainer.restore(checkpoint)

        # Play from the command line against the trained agent
        # in an actual (non-RLlib-wrapped) open-spiel env.
        human_player = 1
        env = Environment(args.env)

        while num_episodes < args.num_episodes_human_play:
            print("You play as {}".format("o" if human_player else "x"))
    def testLocal(self):
        ray.init(local_mode=True)
        cf = DEFAULT_CONFIG.copy()
        agent = PPOTrainer(cf, "CartPole-v0")
        print(agent.train())
"type": "EpsilonGreedy", "initial_epsilon": 1.0, "final_epsilon": 0.02, "epsilon_timesteps": 1000, }, "learning_starts": 100, "timesteps_per_iteration": 200, "log_level": "INFO", "framework": args.framework, })) elif args.run == "PPO": # Example of using PPO (does NOT support off-policy actions). trainer = PPOTrainer(env=env, config=dict( connector_config, **{ "sample_batch_size": 1000, "train_batch_size": 4000, "framework": args.framework, })) else: raise ValueError("--run must be DQN or PPO") checkpoint_path = CHECKPOINT_FILE.format(args.run) # Attempt to restore from checkpoint if possible. if os.path.exists(checkpoint_path): checkpoint_path = open(checkpoint_path).read() print("Restoring from checkpoint path", checkpoint_path) trainer.restore(checkpoint_path) # Serving and training loop
        [9, 101, 100, 11, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4,
         heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4]),
    spaces.MultiDiscrete(
        [9, 101, 100, 11, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4,
         heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4]),
    spaces.MultiDiscrete(
        [9, 101, 100, 11, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4,
         heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4]),
    spaces.MultiDiscrete(
        [9, 101, 100, 11, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4,
         heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4]),
    spaces.MultiDiscrete(
        [9, 101, 100, 11, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4,
         heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4]),
    spaces.MultiDiscrete(
        [9, 101, 100, 11, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4,
         heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4])
))

DEFAULT_CONFIG["env_config"]["action_space"] = spaces.MultiDiscrete([7, 9, 9])

ray.init()

trainer = PPOTrainer(config=DEFAULT_CONFIG, env=RandomEnv)
trainer2 = PPOTrainer(config=DEFAULT_CONFIG, env=RandomEnv)

checkpoint_path = "checkpoints/"
checkpoint1 = "checkpoint_000001/checkpoint-1"
fullpath1 = checkpoint_path + checkpoint1
checkpoint2 = "checkpoint_000005/checkpoint-5"
fullpath2 = checkpoint_path + checkpoint2

sum1a = 0
sum1b = 0
sum2a = 0
sum2b = 0
    def test_minibatch_sequencing(self):
        ModelCatalog.register_custom_model("rnn", RNNSpyModel)
        register_env("counter", lambda _: DebugCounterEnv())
        ppo = PPOTrainer(
            env="counter",
            config={
                "shuffle_sequences": False,  # for deterministic testing
                "num_workers": 0,
                "rollout_fragment_length": 20,
                "train_batch_size": 20,
                "sgd_minibatch_size": 10,
                "num_sgd_iter": 1,
                "model": {
                    "custom_model": "rnn",
                    "max_seq_len": 4,
                    "vf_share_layers": True,
                },
                "framework": "tf",
            })
        ppo.train()
        ppo.train()

        # first epoch: 20 observations get split into 2 minibatches of 8
        # four observations are discarded
        batch0 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0"))
        batch1 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1"))
        if batch0["sequences"][0][0][0] > batch1["sequences"][0][0][0]:
            batch0, batch1 = batch1, batch0  # sort minibatches
        self.assertEqual(batch0["seq_lens"].tolist(), [4, 4, 2])
        self.assertEqual(batch1["seq_lens"].tolist(), [2, 3, 4, 1])
        check(batch0["sequences"], [
            [[0], [1], [2], [3]],
            [[4], [5], [6], [7]],
            [[8], [9], [0], [0]],
        ])
        check(batch1["sequences"], [
            [[10], [11], [0], [0]],
            [[12], [13], [14], [0]],
            [[0], [1], [2], [3]],
            [[4], [0], [0], [0]],
        ])

        # second epoch: 20 observations get split into 2 minibatches of 8
        # four observations are discarded
        batch2 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_2"))
        batch3 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_3"))
        if batch2["sequences"][0][0][0] > batch3["sequences"][0][0][0]:
            batch2, batch3 = batch3, batch2
        self.assertEqual(batch2["seq_lens"].tolist(), [4, 4, 2])
        self.assertEqual(batch3["seq_lens"].tolist(), [4, 4, 2])
        check(batch2["sequences"], [
            [[0], [1], [2], [3]],
            [[4], [5], [6], [7]],
            [[8], [9], [0], [0]],
        ])
        check(batch3["sequences"], [
            [[5], [6], [7], [8]],
            [[9], [10], [11], [12]],
            [[13], [14], [0], [0]],
        ])
        done = self.episode_len >= 10
        # r = -abs(obs - action)
        reward = -sum(abs(self.cur_obs - action))
        # Set a new observation (random sample).
        self.cur_obs = self.observation_space.sample()
        return self.cur_obs, reward, done, {}


# Create an RLlib Trainer instance to learn how to act in the above
# environment.
trainer = PPOTrainer(
    config={
        # Env class to use (here: our gym.Env sub-class from above).
        "env": ParrotEnv,
        # Config dict to be passed to our custom env's constructor.
        "env_config": {
            "parrot_shriek_range": gym.spaces.Box(-5.0, 5.0, (1, ))
        },
        # Parallelize environment rollouts.
        "num_workers": 3,
    })

# Train for n iterations and report results (mean episode rewards).
# Since we have to guess 10 times and the optimal reward is 0.0
# (exact match between observation and action value),
# we can expect to reach an optimal episode reward of 0.0.
for i in range(5):
    results = trainer.train()
    print(f"Iter: {i}; avg. reward={results['episode_reward_mean']}")

# Perform inference (action computations) based on given env observations.
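# The inference step is cut off in this fragment. A minimal sketch of what it
# could look like, assuming a locally constructed ParrotEnv with the same
# env_config as above (variable names below are illustrative, not from the
# original):
env = ParrotEnv({"parrot_shriek_range": gym.spaces.Box(-5.0, 5.0, (1, ))})
obs = env.reset()
done = False
total_reward = 0.0
while not done:
    # Query the trained policy for a single action given the current observation.
    action = trainer.compute_action(obs)
    obs, reward, done, info = env.step(action)
    total_reward += reward
print(f"Played one episode; total reward = {total_reward}")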
        if agent_id % 2 == 0:
            return "ppo_policy"
        else:
            return "dqn_policy"

    ppo_trainer = PPOTrainer(
        env="multi_agent_cartpole",
        config={
            "multiagent": {
                "policies": policies,
                "policy_mapping_fn": policy_mapping_fn,
                "policies_to_train": ["ppo_policy"],
            },
            "model": {
                "vf_share_layers": True,
            },
            "num_sgd_iter": 6,
            "vf_loss_coeff": 0.01,
            # disable filters, otherwise we would need to synchronize those
            # as well to the DQN agent
            "observation_filter": "MeanStdFilter",
            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
            "framework": args.framework,
        },
    )

    dqn_trainer = DQNTrainer(
        env="multi_agent_cartpole",
        config={
            "multiagent": {
    }

    # Logger
    logger.configure(
        experiment_name=f"Bertrand_competition_discrete_k_{k}_trainer_{trainer_choice}",
    )

    if trainer_choice != 'QL':
        register_env('Bertrand', lambda env_config: env)
        ray.init(num_cpus=4)

        for _ in range(epochs):
            if trainer_choice == 'DQN':
                trainer = DQNTrainer(config=config, env='Bertrand')
            elif trainer_choice == 'PPO':
                trainer = PPOTrainer(config=config, env='Bertrand')
            elif trainer_choice == 'A3C':
                trainer = A3CTrainer(config=config, env='Bertrand')

            result = trainer.train()
            # TODO: this is not working because env does not store prices.
            # Fix it or find a better way to evaluate.
            log_stats(env)
    else:
        # Q-learning
        players = ['agent_' + str(i) for i in range(num_agents)]

        # Hyperparameters
        alpha = 0.05
        beta = 0.2
class KandboxAgentRLLibPPO(KandboxAgentPlugin):
    title = "Kandbox Plugin - Agent - realtime - by rllib ppo"
    slug = "ri_agent_rl_ppo"
    author = "Kandbox"
    author_url = "https://github.com/qiyangduan"
    description = "RLLibPPO for GYM for RL."
    version = "0.1.0"

    default_config = {
        "nbr_of_actions": 4,
        "max_epochs": 1000,
        "nbr_of_days_planning_window": 6,
        "model_path": "default_model_path",
        "working_dir": "/tmp",
        "checkpoint_path_key": "ppo_checkpoint_path",
    }
    config_form_spec = {
        "type": "object",
        "properties": {},
    }

    def __init__(self, agent_config, kandbox_config):
        self.agent_config = agent_config
        self.current_best_episode_reward_mean = -99
        env_config = agent_config["env_config"]
        if "rules_slug_config_list" not in env_config.keys():
            if "rules" not in env_config.keys():
                log.error("no rules_slug_config_list and no rules ")
            else:
                env_config["rules_slug_config_list"] = [
                    [rule.slug, rule.config] for rule in env_config["rules"]
                ]
                env_config.pop("rules", None)

        # self.env_class = env_class = agent_config["env"]
        self.kandbox_config = self.default_config.copy()
        self.kandbox_config.update(kandbox_config)
        # self.trained_model = trained_model
        self.kandbox_config["create_datetime"] = datetime.now()

        # self.trainer = None
        self.env_config = env_config
        # self.load_model(env_config=self.env_config)
        print(
            f"KandboxAgentRLLibPPO __init__ called, at time {self.kandbox_config['create_datetime']}"
        )
        # import pdb
        # pdb.set_trace()

        if not ray.is_initialized():
            ray.init(ignore_reinit_error=True, log_to_driver=False)
            # ray.init(redis_address="localhost:6379")

    def build_model(self):
        trainer_config = DEFAULT_CONFIG.copy()
        trainer_config["num_workers"] = 0
        # trainer_config["train_batch_size"] = 640
        # trainer_config["sgd_minibatch_size"] = 160
        # trainer_config["num_sgd_iter"] = 100

        trainer_config["exploration_config"] = {
            "type": "Random",
        }  # EpsilonGreedy(Exploration):
        # trainer_config["exploration_config"] = {
        #     "type": "Curiosity",
        #     "eta": 0.2,
        #     "lr": 0.001,
        #     "feature_dim": 128,
        #     "feature_net_config": {
        #         "fcnet_hiddens": [],
        #         "fcnet_activation": "relu",
        #     },
        #     "sub_exploration": {
        #         "type": "StochasticSampling",
        #     }
        # }
        # trainer_config["log_level"] = "DEBUG"
        """
        if env_config is not None:
            for x in env_config.keys():
                trainer_config[x] = env_config[x]
        """
        # trainer_config["env_config"] = copy.deepcopy(env_config)  # {"rules": "qiyang_role"}
        trainer_config.update(self.agent_config)

        self.trainer = PPOTrainer(trainer_config, self.agent_config["env"])
        # self.config["trainer"] = self.trainer
        return self.trainer

    def load_model(self):  # , allow_empty = None
        env_config = self.agent_config["env_config"]
        self.trainer = self.build_model()
        # if (model_path is not None) & (os.path.exists(model_path)):
        if "ppo_checkpoint_path" in env_config.keys():
            # raise FileNotFoundError("can not find model at path: {}".format(model_path))
            if os.path.exists(env_config["ppo_checkpoint_path"]):
                self.trainer.restore(env_config["ppo_checkpoint_path"])
                print("Reloaded model from path: {} ".format(
                    env_config["ppo_checkpoint_path"]))
            else:
                print(
                    "Env_config has ppo_checkpoint_path = {}, but no files found. "
                    "I am returning an initial model".format(
                        env_config["ppo_checkpoint_path"]))
        else:
            print(
                "Env_config has no ppo_checkpoint_path, returning an initial model"
            )

        # self.config["model_path"] = model_path
        # self.config["trainer"] = self.trainer
        # self.config["policy"] = self.trainer.workers.local_worker().get_policy()
        self.policy = self.trainer.workers.local_worker().get_policy()
        return self.trainer

    def train_model(self):
        # self.trainer = self.build_model()
        for i in range(self.kandbox_config["max_epochs"]):
            result = self.trainer.train()
            # print(pretty_print(result))
            print(
                "Finished training iteration {}, Result: episodes_this_iter:{}, policy_reward_max: {}, episode_reward_max {}, episode_reward_mean {}, info.num_steps_trained: {}..."
                .format(
                    i,
                    result["episodes_this_iter"],
                    result["policy_reward_max"],
                    result["episode_reward_max"],
                    result["episode_reward_mean"],
                    result["info"]["num_steps_trained"],
                ))
            if result["episode_reward_mean"] > self.current_best_episode_reward_mean * 1.1:
                model_path = self.save_model()
                print(
                    "Model is saved after 10 percent increase, episode_reward_mean = {}, file = {}"
                    .format(result["episode_reward_mean"], model_path))
                self.current_best_episode_reward_mean = result["episode_reward_mean"]

        return self.save_model()

    def save_model(self):
        checkpoint_dir = "{}/model_checkpoint_org_{}_team_{}".format(
            self.agent_config["env_config"]["working_dir"],
            self.agent_config["env_config"]["org_code"],
            self.agent_config["env_config"]["team_id"],
        )
        _path = self.trainer.save(checkpoint_dir=checkpoint_dir)
        # exported_model_dir = "{}/exported_ppo_model_org_{}_team_{}".format(
        #     self.agent_config["env_config"]["working_dir"], self.agent_config["env_config"]["org_code"], self.agent_config["env_config"]["team_id"]
        # )
        # self.trainer.get_policy().export_model(exported_model_dir + "/1")
        return _path  # self.trainer

    def predict_action(self, observation=None):
        action = self.trainer.compute_action(observation)
        return action

    def predict_action_list(self, env=None, job_code=None, observation=None):
        actions = []
        if env is not None:
            self.env = env
        else:
            env = self.env

        if job_code is None:
            job_i = env.current_job_i
        else:
            job_i = env.jobs_dict[job_code].job_index

        observation = env._get_observation()
        # export_dir = "/Users/qiyangduan/temp/kandbox/exported_ppo_model_org_duan3_team_3/1"
        # loaded_policy = tf.saved_model.load(export_dir)
        # loaded_policy.signatures["serving_default"](observations=observation)
        predicted_action = self.trainer.compute_action(observation)
        # V predicted_action = self.policy.compute_action(observation)

        for _ in range(len(env.workers)):  # hist_job_workers_ranked:
            if len(actions) >= self.config["nbr_of_actions"]:
                return actions
            actions.append(list(predicted_action).copy())
            max_i = np.argmax(predicted_action[0:len(env.workers)])
            predicted_action[max_i] = 0

        return actions

    def predict_action_dict_list(self, env=None, job_code=None, observation=None):
        if env is not None:
            self.env = env
        else:
            env = self.env

        curr_job = copy.deepcopy(env.jobs_dict[job_code])
        if job_code is None:
            job_i = env.current_job_i
        else:
            job_i = curr_job.job_index
        env.current_job_i = job_i

        observation = env._get_observation()
        action = self.predict_action(observation=observation)
        action_dict = env.decode_action_into_dict_native(action=action)

        action_day = int(action_dict.scheduled_start_minutes / 1440)
        curr_job.requested_start_min_minutes = action_day * 1440
        curr_job.requested_start_max_minutes = (action_day + 1) * 1440

        action_dict_list = self.env.recommendation_server.search_action_dict_on_worker_day(
            a_worker_code_list=action_dict.scheduled_worker_codes,
            curr_job=curr_job,
            max_number_of_matching=3,
        )
        return action_dict_list
        done = self._counter >= self._horizon
        return self.state, reward, done, {}

    def reset(self):
        self.state = np.random.normal(0, 1, 2)
        self._counter = 0
        return self.state


trainer_config = DEFAULT_CONFIG.copy()
trainer_config['num_workers'] = 1
trainer_config["train_batch_size"] = 64
trainer_config["sgd_minibatch_size"] = 64
trainer_config["num_sgd_iter"] = 10

trainer = PPOTrainer(trainer_config, MyEnv)
for i in range(50):
    print("Training iteration {}...".format(i))
    trainer.train()

cumulative_reward_list = []
M = 100
for rep in range(M):
    env = MyEnv({})
    state = env.reset()
    done = False
    cumulative_reward = 0
    while not done:
        action = trainer.compute_action(state)
        # print(action, state)
env_tmp = RLCardWrapped(None)
policies = {
    "ppo_policy_1": (PPOTFPolicy, env_tmp.observation_space,
                     env_tmp.action_space, ppo_trainer_config),
    "rand_policy": (RandomPolicy, env_tmp.observation_space,
                    env_tmp.action_space, {}),
}

# Instantiate the PPO trainer eval
trainer_eval = PPOTrainer(
    config={
        "env": rlcard_env_id,
        "multiagent": {
            "policies_to_train": ['ppo_policy_1'],
            "policies": policies,
            "policy_mapping_fn": lambda agent_id: "ppo_policy_1"
            if agent_id == "player_1" else "rand_policy",
        },
        # "num_gpus": 0.5,
    })

trainer = PPOTrainer(
    config={
        "env": rlcard_env_id,
        "multiagent": {
            "policies_to_train": ['ppo_policy_1'],
            "policies": policies,
            "policy_mapping_fn": lambda agent_id: "ppo_policy_1",
        },
    def testPPOSampleWaste(self):
        ray.init(num_cpus=4)

        # Check we at least collect the initial wave of samples
        ppo = PPOTrainer(
            env="CartPole-v0",
            config={
                "sample_batch_size": 200,
                "train_batch_size": 128,
                "num_workers": 3,
            })
        ppo.train()
        self.assertEqual(ppo.optimizer.num_steps_sampled, 600)
        ppo.stop()

        # Check we collect at least the specified amount of samples
        ppo = PPOTrainer(
            env="CartPole-v0",
            config={
                "sample_batch_size": 200,
                "train_batch_size": 900,
                "num_workers": 3,
            })
        ppo.train()
        self.assertEqual(ppo.optimizer.num_steps_sampled, 1000)
        ppo.stop()

        # Check in vectorized mode
        ppo = PPOTrainer(
            env="CartPole-v0",
            config={
                "sample_batch_size": 200,
                "num_envs_per_worker": 2,
                "train_batch_size": 900,
                "num_workers": 3,
            })
        ppo.train()
        self.assertEqual(ppo.optimizer.num_steps_sampled, 1200)
        ppo.stop()

        # Check legacy mode
        ppo = PPOTrainer(
            env="CartPole-v0",
            config={
                "sample_batch_size": 200,
                "train_batch_size": 128,
                "num_workers": 3,
                "straggler_mitigation": True,
            })
        ppo.train()
        self.assertEqual(ppo.optimizer.num_steps_sampled, 200)
        ppo.stop()
"lr": 0.0001, # started at 0.0001 "num_sgd_iter": 5, "vf_loss_coeff": 0.001, "log_level": "WARN", "train_batch_size": 512, "sgd_minibatch_size": 32, "clip_param": 0.3, "vf_clip_param": 10.0 } last_improve = 150 iteration = 22 improved = 0 while True: trainer = PPOTrainer(env="fire_mage", config=rnn_config) print(dir(trainer)) #trainer.restore('./checkpoints_flush/checkpoint_379/checkpoint-379') step = 0 best_val = 0.0 if False: save_0 = trainer.save_to_object() while True: if False: save_0 = trainer.save_to_object() result = trainer.train() while result['episode_reward_mean'] > best_val: print('UPENING') best_save = deepcopy(save_0) best_val = result['episode_reward_mean']
        trainer.get_policy("low_level_policy").set_state(importedPolicy)
        chk_freq = 1  # Only needed once at the start, to save the imported model.

    while True:
        result = trainer.train()
        tune.report(**result)
        if (trainer._iteration % chk_freq == 0):
            with tune.checkpoint_dir(
                    step=trainer._iteration) as checkpoint_dir:
                trainer.save(checkpoint_dir)


if __name__ == "__main__":
    ray.init(ignore_reinit_error=True)

    config_hier["multiagent"]["policies_to_train"] = ["high_level_policy"]

    resources = PPOTrainer.default_resource_request(config_hier).to_json()

    tune.run(
        train,
        name="HWalk_Hier_Mimic",
        # resume=resume,
        restore="/home/aditya/ray_results/{}/{}/checkpoint_{}/checkpoint_{}/checkpoint-{}"
        .format(
            experiment_name_hier,
            experiment_id_hier,
            checkpoint_num_hier,
            checkpoint_num_hier,
            checkpoint_num_hier,
        ) if resumeFromCheckpoint else "",
        stop={"episode_reward_mean": 10000},
        config=config_hier,
        phase1_time = result["timesteps_total"]
    state = agent1.save()
    agent1.stop()

    # Train for n iterations with low LR
    config["lr"] = 0.0001
    agent2 = PPOTrainer(env="CartPole-v0", config=config)
    agent2.restore(state)
    for _ in range(iterations):
        result = agent2.train()
        result["phase"] = 2
        result["timesteps_total"] += phase1_time  # keep time moving forward
        reporter(**result)
    agent2.stop()


if __name__ == "__main__":
    ray.init()
    args = parser.parse_args()
    config = {
        # Special flag signalling `my_train_fn` how many iters to do.
        "train-iterations": 2,
        "lr": 0.01,
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "num_workers": 0,
        "framework": args.framework,
    }
    resources = PPOTrainer.default_resource_request(config)
    tune.run(my_train_fn, resources_per_trial=resources, config=config)
def train(config, checkpoint_dir=None):
    trainer = PPOTrainer(config=config)

    if checkpoint_dir:
        trainer.load_checkpoint(checkpoint_dir)

    chk_freq = 10

    if useModelFromLowLevelTrain:
        config_low["num_workers"] = 0
        config_low["num_envs_per_worker"] = 1
        config_low["num_gpus"] = 1
        agentLow = PPOTrainer(config_low)
        agentLow.restore(
            "/home/aditya/ray_results/{}/{}/checkpoint_{}/checkpoint-{}".format(
                experiment_name, experiment_id, checkpoint_num, checkpoint_num))
        lowWeight = agentLow.get_policy().get_weights()
        highWeight = trainer.get_policy("low_level_policy").get_weights()
        lowState = agentLow.get_policy().get_state()
        importedOptState = OrderedDict([
            (k.replace("default_policy", "low_level_policy"), v)
            for k, v in lowState["_optimizer_variables"].items()
        ])
        importedPolicy = {
            hw: lowWeight[lw]
            for hw, lw in zip(highWeight.keys(), lowWeight.keys())
        }
        importedPolicy["_optimizer_variables"] = importedOptState
        trainer.get_policy("low_level_policy").set_state(importedPolicy)
        chk_freq = 1  # Only needed once at the start, to save the imported model.

    while True:
        result = trainer.train()
        tune.report(**result)
        if (trainer._iteration % chk_freq == 0):
            with tune.checkpoint_dir(
                    step=trainer._iteration) as checkpoint_dir:
                trainer.save(checkpoint_dir)
                  config["entropy_coeff_schedule"])
    warmup_steps = config["model"]["custom_options"].get(
        "warmup_steps", 100000)
    TransformerLearningRateSchedule.__init__(
        policy,
        config["model"]["custom_options"]["transformer"]["num_heads"],
        warmup_steps)


TTFPPOPolicy = PPOTFPolicy.with_updates(
    name="TTFPPOPolicy",
    before_loss_init=setup_mixins,
    mixins=[
        TransformerLearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
        ValueNetworkMixin
    ])

TTFPPOPolicyInfer = PPOTFPolicy.with_updates(
    name="TTFPPOPolicyInfer",
    before_loss_init=setup_mixins,
    mixins=[
        LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
        ValueNetworkMixin
    ])

register_trainable(
    "TTFPPO",
    PPOTrainer.with_updates(
        name="TTFPPOTrainer",
        get_policy_class=lambda c: TTFPPOPolicy),
)
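# Once registered with register_trainable, the custom trainer can be launched
# through Tune under its registered name. The following is only a hedged sketch:
# the env and model values are placeholder assumptions (the original file uses
# its own environment and a custom transformer model), but the
# custom_options/transformer/num_heads keys mirror what setup_mixins above reads.
from ray import tune

tune.run(
    "TTFPPO",
    stop={"training_iteration": 100},
    config={
        "env": "CartPole-v0",  # placeholder env, not from the original
        "model": {
            "custom_options": {
                "warmup_steps": 100000,
                "transformer": {"num_heads": 8},
            },
        },
    },
)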
"exploration_config": { "type": "EpsilonGreedy", "initial_epsilon": 1.0, "final_epsilon": 0.02, "epsilon_timesteps": 1000, }, # "learning_starts": 100, # "timesteps_per_iteration": 200, # "log_level": "INFO", })) elif args.run == "PPO": # Example of using PPO (does NOT support off-policy actions). trainer = PPOTrainer( env="srv", config=dict( connector_config, **{ "sample_batch_size": 1000, "train_batch_size": 4000, })) else: raise ValueError("--run must be DQN or PPO") checkpoint_path = CHECKPOINT_FILE.format(args.run) # Attempt to restore from checkpoint if possible. if os.path.exists(checkpoint_path): checkpoint_path = open(checkpoint_path).read() print("Restoring from checkpoint path", checkpoint_path) trainer.restore(checkpoint_path) # Serving and training loop