class AlternateTraining(Trainable):
    def _setup(self, config):
        self.config = config
        self.env = config['env']
        agent_config = self.config
        adv_config = deepcopy(self.config)
        agent_config['multiagent']['policies_to_train'] = ['agent']
        adv_config['multiagent']['policies_to_train'] = ['adversary0']

        self.agent_trainer = PPOTrainer(env=self.env, config=agent_config)
        self.adv_trainer = PPOTrainer(env=self.env, config=adv_config)

    def _train(self):
        # Improve the adversary policy.
        print("-- Adversary Training --")
        print(pretty_print(self.adv_trainer.train()))

        # Swap weights to synchronize.
        self.agent_trainer.set_weights(self.adv_trainer.get_weights(["adversary0"]))

        # Improve the agent policy.
        print("-- Agent Training --")
        output = self.agent_trainer.train()
        print(pretty_print(output))

        # Swap weights to synchronize.
        self.adv_trainer.set_weights(self.agent_trainer.get_weights(["agent"]))

        return output

    def _save(self, tmp_checkpoint_dir):
        return self.agent_trainer._save(tmp_checkpoint_dir)
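A minimal launch sketch for this Trainable, assuming `config` already contains the 'multiagent' block with 'agent' and 'adversary0' policies and a registered env under config['env']; the stopping criterion is illustrative:

from ray import tune

tune.run(
    AlternateTraining,
    config=config,
    stop={"training_iteration": 200},  # illustrative stop criterion
)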
def my_train_fn(config, reporter):
    assert args.num_learners >= 4, 'Requires 4 or more trainable agents'
    ppo_trainer = PPOTrainer(env='c4', config=config)
    while True:
        result = ppo_trainer.train()
        if 'evaluation' in result:
            train_policies = config['multiagent']['policies_to_train']
            scores = {k: v for k, v in result['evaluation']['policy_reward_mean'].items()
                      if k in train_policies}
            scores_dist = softmax(np.array(list(scores.values())) / tau)
            new_trainables = random.choices(list(scores.keys()), scores_dist, k=len(scores))
            # new_trainables = train_policies
            # random.shuffle(new_trainables)

            weights = ppo_trainer.get_weights()
            new_weights = {old_pid: weights[new_pid]
                           for old_pid, new_pid in zip(weights.keys(), new_trainables)}
            # new_weights = {pid: np.zeros_like(wt) for pid, wt in weights.items() if wt is not None}
            # new_weights = {pid: np.ones_like(wt)*-100 for pid, wt in weights.items() if wt is not None}
            # new_weights = {pid: np.random.rand(*wt.shape) for pid, wt in weights.items() if wt is not None}
            print('\n\n################\nSETTING WEIGHTS\n################\n\n')
            ppo_trainer.set_weights(new_weights)

            num_metrics = 4
            c = Counter(new_trainables)
            result['custom_metrics'].update(
                {f'most_common{i:02d}': v[1] for i, v in enumerate(c.most_common(num_metrics))})
            result['custom_metrics'].update(
                {f'scores_dist{i:02d}': v
                 for i, v in enumerate(sorted(scores_dist, reverse=True)[:num_metrics])})
            print('scores_dist', scores_dist)
            # result['custom_metrics'].update(
            #     {f'new_agent{i:02d}': int(v[-2:]) for i, v in enumerate(new_trainables)})
        reporter(**result)
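This snippet leans on several names defined elsewhere (`args`, `tau`, `softmax`, `Counter`); a minimal sketch of the helpers, with the temperature value assumed:

import random
from collections import Counter

import numpy as np

tau = 1.0  # illustrative temperature; lower values sharpen the distribution

def softmax(x):
    # Shift by the max for numerical stability before exponentiating.
    z = np.exp(x - np.max(x))
    return z / z.sum()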
class AlternateTraining(Trainable):
    def _setup(self, config):
        self.config = config
        self.env = config['env']
        agent_config = self.config
        adv_config = deepcopy(self.config)
        agent_config['multiagent']['policies_to_train'] = ['agent']
        adv_config['multiagent']['policies_to_train'] = ['adversary']

        self.agent_trainer = PPOTrainer(env=self.env, config=agent_config)
        self.adv_trainer = PPOTrainer(env=self.env, config=adv_config)

    def _train(self):
        # Improve the adversary policy.
        print("-- Adversary Training --")
        original_weight = self.adv_trainer.get_weights(
            ["adversary"])['adversary']['adversary/fc_1/kernel'][0, 0]
        print(pretty_print(self.adv_trainer.train()))
        first_weight = self.adv_trainer.get_weights(
            ["adversary"])['adversary']['adversary/fc_1/kernel'][0, 0]

        # Check that the weights are updating after training.
        assert original_weight != first_weight, \
            "The adversary's weight has not changed after training."

        # Swap weights to synchronize.
        self.agent_trainer.set_weights(
            self.adv_trainer.get_weights(["adversary"]))

        # Improve the agent policy.
        print("-- Agent Training --")
        output = self.agent_trainer.train()

        new_weight = self.agent_trainer.get_weights(
            ["adversary"])['adversary']['adversary/fc_1/kernel'][0, 0]

        # Check that the adversary is not being trained while the agent trainer trains.
        assert first_weight == new_weight, \
            "The adversary's weights changed, but they should not have been updated!"

        # Swap weights to synchronize.
        self.adv_trainer.set_weights(self.agent_trainer.get_weights(["agent"]))

        return output

    def _save(self, tmp_checkpoint_dir):
        return self.agent_trainer._save(tmp_checkpoint_dir)
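The hard-coded 'adversary/fc_1/kernel' key depends on the TF variable scoping of the default model; a quick sketch for listing the keys that actually exist before relying on one (`adv_trainer` stands for the trainer built in `_setup`):

weights = adv_trainer.get_weights(["adversary"])["adversary"]
print(sorted(weights.keys()))  # e.g. '.../fc_1/kernel', '.../fc_1/bias', ...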
dqn_trainer = DQNTrainer(
    env="multi_cartpole",
    config={
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": policy_mapping_fn,
            "policies_to_train": ["dqn_policy"],
        },
        "gamma": 0.95,
        "n_step": 3,
    })

# You should see both the printed X and Y approach 200 as this trains:
# info:
#   policy_reward_mean:
#     dqn_policy: X
#     ppo_policy: Y
for i in range(args.num_iters):
    print("== Iteration", i, "==")

    # Improve the DQN policy.
    print("-- DQN --")
    print(pretty_print(dqn_trainer.train()))

    # Improve the PPO policy.
    print("-- PPO --")
    print(pretty_print(ppo_trainer.train()))

    # Swap weights to synchronize.
    dqn_trainer.set_weights(ppo_trainer.get_weights(["ppo_policy"]))
    ppo_trainer.set_weights(dqn_trainer.get_weights(["dqn_policy"]))
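This excerpt assumes `policies`, `policy_mapping_fn`, the "multi_cartpole" env, and a matching `ppo_trainer` were set up earlier; a plausible setup in the style of RLlib's two-trainer example (the CartPole spaces and the even/odd split are assumptions):

import gym
from ray.rllib.agents.dqn.dqn_policy import DQNTFPolicy
from ray.rllib.agents.ppo.ppo_policy import PPOTFPolicy
from ray.rllib.tests.test_multi_agent_env import MultiCartpole
from ray.tune.registry import register_env

register_env("multi_cartpole", lambda _: MultiCartpole(4))
single_env = gym.make("CartPole-v0")
obs_space = single_env.observation_space
act_space = single_env.action_space

policies = {
    "ppo_policy": (PPOTFPolicy, obs_space, act_space, {}),
    "dqn_policy": (DQNTFPolicy, obs_space, act_space, {}),
}

def policy_mapping_fn(agent_id):
    # Even agents act with DQN, odd agents with PPO (assumed split).
    return "dqn_policy" if agent_id % 2 == 0 else "ppo_policy"

# The PPO side would be built analogously, e.g.:
# ppo_trainer = PPOTrainer(env="multi_cartpole", config={
#     "multiagent": {"policies": policies,
#                    "policy_mapping_fn": policy_mapping_fn,
#                    "policies_to_train": ["ppo_policy"]}})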
config["num_sgd_iter"] = 5 config["sgd_minibatch_size"] = 8192 config["train_batch_size"] = 20000 config["use_gae"] = True config["vf_clip_param"] = 10 config["vf_loss_coeff"] = 1 config["vf_share_layers"] = False # For better gradient estimates in the later stages # of the training, increase the batch sizes. # config["sgd_minibatch_size"] = 8192 * 4 # config["train_batch_size"] = 20000 * 10 ray.init() trainer = PPOTrainer(config=config, env=InventoryEnv) # Use this when you want to continue from a checkpoint. # trainer.restore( # "/home/enes/ray_results/PPO_InventoryEnv_2020-10-06_04-31-2945lwn1wg/checkpoint_737/checkpoint-737" # ) best_mean_reward = np.NINF while True: result = trainer.train() print(pretty_print(result)) mean_reward = result.get("episode_reward_mean", np.NINF) if mean_reward > best_mean_reward: checkpoint = trainer.save() print("checkpoint saved at", checkpoint) best_mean_reward = mean_reward
import ray
from ray.rllib.agents.ppo.ppo import PPOTrainer
from ray.tune.registry import register_env

import multi_traf_env

ray.init()
register_env("multi_air-v0",
             lambda c: multi_traf_env.AirTrafficGym(num_agents=2))

trainer = PPOTrainer(env="multi_air-v0")
num_train_itr = 50
for i in range(num_train_itr):
    print("****************************Iteration: ", i,
          "****************************")
    print(trainer.train())
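Nothing above persists the trained model; if a checkpoint is wanted, `trainer.save()` can be called once training finishes (a sketch; the file lands in Ray's default results directory):

checkpoint_path = trainer.save()
print("final checkpoint saved at", checkpoint_path)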
# "policy_03": (None, obs_space, act_space, {}), # "policy_04": (None, obs_space, act_space, {}), # }, # "policy_mapping_fn": policy_mapping_fn_eval # }, # }, }) # Uncomment to restore from check-point # ppo_trainer.restore('/home/ubuntu/ray_results/PPO_gfootball_2020-08-05_20-46-06e4wbrrqg/checkpoint_301/checkpoint-301') for i in range(100000): episode_data = [] print("+++++++++++++++Training iteration {!s}+++++++++++++++++++".format(i+1)) result = ppo_trainer.train() print(pretty_print(result)) if ((i > 0) and (i % 10 == 0)): print("===============Swapping weights===================") P4key_P3val = {} # temp storage with "policy_4" keys & "policy_3" values for (k4,v4), (k3,v3) in zip(ppo_trainer.get_policy("policy_04").get_weights().items(), ppo_trainer.get_policy("policy_03").get_weights().items()): P4key_P3val[k4] = v3 P3key_P2val = {} # temp storage with "policy_3" keys & "policy_2" values for (k3,v3), (k2,v2) in zip(ppo_trainer.get_policy("policy_03").get_weights().items(), ppo_trainer.get_policy("policy_02").get_weights().items()):
def _train(self):
    ppo_trainer = PPOTrainer(env='c4', config=self.config)
    while True:
        result = ppo_trainer.train()
        # reporter(**result)
        print('ran iteration')
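For reference, a Tune `Trainable` conventionally builds the trainer once in `_setup` and performs one logical iteration per `_train()` call, returning the result dict; a minimal sketch of that shape (an assumption, not from the source):

def _setup(self, config):
    self.ppo_trainer = PPOTrainer(env='c4', config=config)

def _train(self):
    # One training iteration per call; Tune drives the outer loop.
    return self.ppo_trainer.train()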
def my_train_fn(config, reporter):
    active_policy = None
    threshold = 0.7
    trainer_updates = []

    # ppo_trainer = MyPPOTrainer(env='c4', config=config)
    ppo_trainer = PPOTrainer(env='c4', config=config)
    bandit = Exp3Bandit(len(trainable_policies))

    def func(worker):
        worker.sampler.policy_mapping_fn = learned_vs_random_mapping_fn

    # ppo_trainer.workers.foreach_worker(lambda w: w.sampler.policy_mapping_fn)
    ppo_trainer.workers.foreach_worker(func)

    # trainable_policies = ppo_trainer.workers.foreach_worker(lambda w: w.policies_to_train)[0][:]
    # trainable_policies = ppo_trainer.workers.foreach_worker(
    #     lambda w: w.foreach_trainable_policy(lambda p, i: (i, p))
    # )

    while True:
        result = ppo_trainer.train()
        reporter(**result)

        timestep = result['timesteps_total']
        training_iteration = result['training_iteration']

        # print('\n')
        # print('$$$$$$$$$$$$$$$$$$$$$$$')
        # print('timestep: {:,}'.format(timestep))
        # print('trainable_policies: %s' % trainable_policies)

        # if active_policy is None and timestep > int(5e6):
        # # if active_policy is None and timestep > int(25e4):
        #     active_policy = trainable_policies[0]
        #     # ppo_trainer.workers.foreach_worker(
        #     #     lambda w: w.foreach_trainable_policy(lambda p, i: (i, p))
        #     # )
        #     ppo_trainer.workers.foreach_worker(
        #         lambda w: w.policies_to_train.remove(trainable_policies[1])
        #     )
        #     trainer_updates.append(timestep)
        # elif active_policy == trainable_policies[0] \
        #         and result['policy_reward_mean'][trainable_policies[0]] > threshold:
        #     active_policy = trainable_policies[1]
        #     ppo_trainer.workers.foreach_worker(
        #         lambda w: w.policies_to_train.remove(trainable_policies[0])
        #     )
        #     ppo_trainer.workers.foreach_worker(
        #         lambda w: w.policies_to_train.append(trainable_policies[1])
        #     )
        #     trainer_updates.append(timestep)
        # elif active_policy == trainable_policies[1] \
        #         and result['policy_reward_mean'][trainable_policies[1]] > threshold:
        #     active_policy = trainable_policies[0]
        #     ppo_trainer.workers.foreach_worker(
        #         lambda w: w.policies_to_train.remove(trainable_policies[1])
        #     )
        #     ppo_trainer.workers.foreach_worker(
        #         lambda w: w.policies_to_train.append(trainable_policies[0])
        #     )
        #     trainer_updates.append(timestep)

        # print('active_policy: %s' % active_policy)
        # print('worker TPs: %s' % ppo_trainer.workers.foreach_worker(lambda w: w.policies_to_train)[0])
        # print('trainer updates: %s' % '\n - ' + '\n - '.join('{:,}'.format(tu) for tu in trainer_updates))
        # print('$$$$$$$$$$$$$$$$$$$$$$$')
        # print('\n')

        # if timestep > int(1e6):
        # if training_iteration >= 20:
        #     break

        # if result['episode_reward_mean'] > 200:
        #     phase = 2
        # elif result['episode_reward_mean'] > 100:
        #     phase = 1
        # else:
        #     phase = 0
        # ppo_trainer.workers.foreach_worker(
        #     lambda ev: ev.foreach_env(
        #         lambda env: env.set_phase(phase)))

    # Unreachable unless one of the break conditions above is re-enabled.
    state = ppo_trainer.save()
    ppo_trainer.stop()
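`Exp3Bandit` and `trainable_policies` are defined elsewhere; a minimal sketch of an Exp3 adversarial bandit matching the constructor used above (the gamma value and the [0, 1] reward scaling are assumptions):

import numpy as np

class Exp3Bandit:
    def __init__(self, num_arms, gamma=0.1):
        self.num_arms = num_arms
        self.gamma = gamma
        self.weights = np.ones(num_arms)

    def _probs(self):
        # Exploration-smoothed distribution over arms.
        w = self.weights / self.weights.sum()
        return (1.0 - self.gamma) * w + self.gamma / self.num_arms

    def select(self):
        # Sample an arm to play.
        return int(np.random.choice(self.num_arms, p=self._probs()))

    def update(self, arm, reward):
        # Importance-weighted reward estimate; reward assumed in [0, 1].
        x_hat = reward / self._probs()[arm]
        self.weights[arm] *= np.exp(self.gamma * x_hat / self.num_arms)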
num_policies = 4
policies = {
    "policy_{}".format(i): (None, env.observation_space, env.action_space, {})
    for i in range(num_policies)
}
policy_ids = list(policies.keys())

config = {
    "multiagent": {
        "policies": policies,
        "policy_mapping_fn": (lambda agent_id: random.choice(policy_ids)),
    },
    "framework": "tf",
    # Adjust this according to the number of CPUs on your machine.
    "num_workers": 60,
}

trainer = PPOTrainer(env=TicTacToe, config=config)

best_eps_len = 0
mean_reward_thold = -1
while True:
    results = trainer.train()
    print(pretty_print(results))
    if results["episode_reward_mean"] > mean_reward_thold and \
            results["episode_len_mean"] > best_eps_len:
        trainer.save("ttt_model")
        best_eps_len = results["episode_len_mean"]
        print("--------------------- MODEL SAVED!")
    if results.get("timesteps_total") > 10 ** 7:
        break

ray.shutdown()
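A sketch of how a saved checkpoint might be used for evaluation afterwards, assuming the same `config`, `env`, and `policy_ids` as above (the checkpoint path is hypothetical):

eval_trainer = PPOTrainer(env=TicTacToe, config=config)
eval_trainer.restore("ttt_model/checkpoint_100/checkpoint-100")  # hypothetical path

obs = env.reset()
done = False
while not done:
    # Each agent acts with a randomly drawn policy, mirroring the training
    # mapping function above.
    actions = {
        agent_id: eval_trainer.compute_action(
            agent_obs, policy_id=random.choice(policy_ids))
        for agent_id, agent_obs in obs.items()
    }
    obs, rewards, dones, infos = env.step(actions)
    done = dones["__all__"]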
def main():
    """Main function."""
    ray.init()

    if 'NUM_WORKERS' in os.environ:
        num_of_workers = int(os.environ['NUM_WORKERS'])
    else:
        num_of_workers = DEFAULT_NUM_WORKERS

    if os.path.isfile(WORLDS_JSON_PATH):
        with open(WORLDS_JSON_PATH) as jsonfile:
            dict_worlds = json.load(jsonfile)
    else:
        dict_worlds = None

    if os.path.isfile(MASTER_URI_JSON_PATH):
        with open(MASTER_URI_JSON_PATH) as jsonfile:
            list_master_uri = json.load(jsonfile)['master_uri']
    else:
        list_master_uri = None

    config = ppo.DEFAULT_CONFIG.copy()
    config.update({
        'env_config': {
            'dict_worlds': dict_worlds,
            'list_master_uri': list_master_uri,  # when using the parallel-simulation launcher script
            # 'list_master_uri': None,  # when running a single simulation on the default ROS master URI
            'use_random_heading': True,
            'result_csv': RESULT_CSV_NAME,
            'num_workers': num_of_workers
        },
        'num_gpus': 0,  # set to match the number of GPUs in use
        'num_workers': num_of_workers,
        'train_batch_size': 10000,
        'batch_mode': 'complete_episodes'
    })

    register_env('gazebo', lambda cfg: DroneSimEnv(cfg))
    trainer = PPOTrainer(env='gazebo', config=config)

    num_iteration = 10000

    # Resume from the latest checkpoint, if any.
    latest_index = 0
    checkpoint_path = None
    checkpoint_name = None
    for name in [name for name in os.listdir(CHECKPOINT_PATH_BASE)
                 if 'checkpoint_' in name]:
        index = int(name.replace('checkpoint_', ''))
        if index > latest_index:
            latest_index = index
            checkpoint_path = CHECKPOINT_PATH_BASE + name + '/'
            checkpoint_name = 'checkpoint-' + str(index)
    if checkpoint_name:
        print('Running using (', checkpoint_name, ').')
        trainer.restore(checkpoint_path + checkpoint_name)
        print(checkpoint_name, '==========================================')

    ## goal/collision data init
    success_cnt = 0
    goal_rate_filename = 'goal_rate_{}.csv'.format(
        WORLDS_JSON_NAME.replace('curriculum/', '').replace('.json', ''))
    if not os.path.isfile(goal_rate_filename):
        with open(goal_rate_filename, 'w') as goal_rate_logfile:
            goal_rate_logfile.write("training_iteration,goal_rate\n")

    while True:
        ## goal/collision data create
        with open(RESULT_CSV_NAME, 'w+') as file_:
            pass

        result = trainer.train()
        print(pretty_print(result))

        # Save a recovery checkpoint every 5 iterations.
        if result['training_iteration'] % 5 == 0:
            checkpoint = trainer.save(CHECKPOINT_PATH_BASE)
            print("checkpoint saved at", checkpoint)
        # Save a checkpoint for result inspection every 100 iterations.
        if result['training_iteration'] % 100 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)

        ## goal/collision data read
        with open(RESULT_CSV_NAME, 'r') as file_:
            episodes_raw = file_.read()
        goal_list = episodes_raw.split(',')
        goal_cnt = goal_list.count('1')
        if goal_cnt == 0:
            goal_ratio = 0
        else:
            goal_ratio = goal_cnt / (goal_cnt + goal_list.count('0'))
        print('goal rate:', goal_ratio)
        with open(goal_rate_filename, 'a') as goal_rate_logfile:
            goal_rate_logfile.write(
                str(result['training_iteration']) + ',' + str(goal_ratio) + '\n')

        if goal_ratio >= 0.95:
            success_cnt += 1
            print('successes in a row:', success_cnt)
        else:
            success_cnt = 0
        if success_cnt >= 5 and EXIT_ON_SUCCESS:
            if result['training_iteration'] % 5 != 0:
                checkpoint = trainer.save(CHECKPOINT_PATH_BASE)
                print("checkpoint saved at", checkpoint)
            break
        if result['training_iteration'] >= num_iteration:
            break

    print('PPO training is done.')
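The snippet defines `main()` but never calls it; presumably the module ends with the usual entry-point guard:

if __name__ == '__main__':
    main()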