config["num_sgd_iter"] = 5 config["sgd_minibatch_size"] = 8192 config["train_batch_size"] = 20000 config["use_gae"] = True config["vf_clip_param"] = 10 config["vf_loss_coeff"] = 1 config["vf_share_layers"] = False # For better gradient estimates in the later stages # of the training, increase the batch sizes. # config["sgd_minibatch_size"] = 8192 * 4 # config["train_batch_size"] = 20000 * 10 ray.init() trainer = PPOTrainer(config=config, env=InventoryEnv) # Use this when you want to continue from a checkpoint. # trainer.restore( # "/home/enes/ray_results/PPO_InventoryEnv_2020-10-06_04-31-2945lwn1wg/checkpoint_737/checkpoint-737" # ) best_mean_reward = np.NINF while True: result = trainer.train() print(pretty_print(result)) mean_reward = result.get("episode_reward_mean", np.NINF) if mean_reward > best_mean_reward: checkpoint = trainer.save() print("checkpoint saved at", checkpoint) best_mean_reward = mean_reward
iteration_reward_mean = [
    str(result['policy_reward_mean'][policy_name])
    for policy_name in policy_graphs.keys()
]
iteration_eplen_mean = str(result['episode_len_mean'])
iteration_episodes_total = str(result['episodes_total'])

savedata = ""
savedata += iteration_episodes_total    # first entry is the total number of episodes
savedata += " " + iteration_eplen_mean  # second entry is the mean episode length
for pc in iteration_reward_mean:        # other entries are mean policy rewards
    savedata += " "
    savedata += pc

file_str = 'N_{0}_k_{1}_Seq_{2}_r_{3}_{4:%H_%M_%S_%d%m%Y}_Iteration_Data'.format(
    n_agents, signal_size, S_eq, growth_rate, start_time)
with open(log_dir + "/" + file_str, "a") as f:
    f.write(savedata + "\n")

trainer.save(checkpoint_folder)  # save checkpoint after final iteration

# Outputs: final episode rewards and lengths
episode_reward_final = result['episode_reward_mean']
episode_len_final = result['episode_len_mean']
print("Experiment finished successfully.")

# Create success flag file
success_file_str = 'SUCCESS_N_{0}_k_{1}_Seq_{2}_r_{3}_{4:%H_%M_%S_%d%m%Y}'.format(
    n_agents, signal_size, S_eq, growth_rate, start_time)
open("./" + success_file_str, "a").close()
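# The logging above assumes a multi-agent setup in which `policy_graphs` maps
# policy IDs to (policy_cls, obs_space, act_space, config) tuples and is passed
# to RLlib through the "multiagent" config, so that result['policy_reward_mean']
# is keyed by the same IDs. A hedged sketch of such a setup; the policy names,
# mapping, and spaces below are placeholders (obs_space/act_space would come
# from the environment), and `config` is assumed to be the trainer config dict.
policy_graphs = {
    "agent_{}".format(i): (None, obs_space, act_space, {})  # None -> default PPO policy class
    for i in range(n_agents)
}
config["multiagent"] = {
    "policies": policy_graphs,
    "policy_mapping_fn": lambda agent_id: agent_id,  # each agent trains its own policy
}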
# P4key_P3val ("policy_04" keys with "policy_03" values) is assumed to be built
# in the same way in the preceding, not-shown part of this loop body.
P3key_P2val = {}  # temp storage with "policy_03" keys & "policy_02" values
for (k3, v3), (k2, v2) in zip(ppo_trainer.get_policy("policy_03").get_weights().items(),
                              ppo_trainer.get_policy("policy_02").get_weights().items()):
    P3key_P2val[k3] = v2

P2key_P1val = {}  # temp storage with "policy_02" keys & "policy_01" values
for (k2, v2), (k1, v1) in zip(ppo_trainer.get_policy("policy_02").get_weights().items(),
                              ppo_trainer.get_policy("policy_01").get_weights().items()):
    P2key_P1val[k2] = v1

# Set weights of policies
ppo_trainer.set_weights({
    "policy_04": P4key_P3val,  # weights/values from "policy_03" with "policy_04" keys
    "policy_03": P3key_P2val,  # weights/values from "policy_02" with "policy_03" keys
    "policy_02": P2key_P1val,  # weights/values from "policy_01" with "policy_02" keys
    "policy_01": ppo_trainer.get_policy("policy_01").get_weights(),  # no change
})

# To check
for (k, v), (k2, v2) in zip(ppo_trainer.get_policy("policy_01").get_weights().items(),
                            ppo_trainer.get_policy("policy_02").get_weights().items()):
    print("Check weights have been swapped")
    assert (v == v2).all()

if i % 200 == 0:  # `i` is the surrounding training-loop iteration counter
    # Save checkpoint
    checkpoint = ppo_trainer.save()
    print("checkpoint saved at", checkpoint)
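# Note (assumption, version-dependent): Trainer.set_weights() updates the
# policies held by the local worker. If sampling runs on remote rollout
# workers, their copies may need an explicit broadcast as well; recent 1.x
# RLlib releases expose this as WorkerSet.sync_weights(), e.g.:
# ppo_trainer.workers.sync_weights()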
# PPOTrainer comes from ray.rllib.agents.ppo; Exp3Bandit, trainable_policies,
# and learned_vs_random_mapping_fn are custom objects assumed to be defined at
# module level.
def my_train_fn(config, reporter):
    active_policy = None
    threshold = 0.7
    trainer_updates = []

    # ppo_trainer = MyPPOTrainer(env='c4', config=config)
    ppo_trainer = PPOTrainer(env='c4', config=config)
    bandit = Exp3Bandit(len(trainable_policies))

    def func(worker):
        worker.sampler.policy_mapping_fn = learned_vs_random_mapping_fn
        foo = 1

    # ppo_trainer.workers.foreach_worker(lambda w: w.sampler.policy_mapping_fn)
    ppo_trainer.workers.foreach_worker(func)

    # trainable_policies = ppo_trainer.workers.foreach_worker(lambda w: w.policies_to_train)[0][:]
    # trainable_policies = ppo_trainer.workers.foreach_worker(
    #     lambda w: w.foreach_trainable_policy(lambda p, i: (i, p))
    # )

    while True:
        result = ppo_trainer.train()
        reporter(**result)
        foo = 1
        timestep = result['timesteps_total']
        training_iteration = result['training_iteration']

        # print('\n')
        # print('$$$$$$$$$$$$$$$$$$$$$$$')
        # print('timestep: {:,}'.format(timestep))
        # print('trainable_policies: %s' % trainable_policies)

        # if active_policy is None and timestep > int(5e6):
        # # if active_policy is None and timestep > int(25e4):
        #     active_policy = trainable_policies[0]
        #     # ppo_trainer.workers.foreach_worker(
        #     #     lambda w: w.foreach_trainable_policy(lambda p, i: (i, p))
        #     # )
        #     ppo_trainer.workers.foreach_worker(
        #         lambda w: w.policies_to_train.remove(trainable_policies[1])
        #     )
        #     trainer_updates.append(timestep)
        # elif active_policy == trainable_policies[0] \
        #         and result['policy_reward_mean'][trainable_policies[0]] > threshold:
        #     active_policy = trainable_policies[1]
        #     ppo_trainer.workers.foreach_worker(
        #         lambda w: w.policies_to_train.remove(trainable_policies[0])
        #     )
        #     ppo_trainer.workers.foreach_worker(
        #         lambda w: w.policies_to_train.append(trainable_policies[1])
        #     )
        #     trainer_updates.append(timestep)
        # elif active_policy == trainable_policies[1] \
        #         and result['policy_reward_mean'][trainable_policies[1]] > threshold:
        #     active_policy = trainable_policies[0]
        #     ppo_trainer.workers.foreach_worker(
        #         lambda w: w.policies_to_train.remove(trainable_policies[1])
        #     )
        #     ppo_trainer.workers.foreach_worker(
        #         lambda w: w.policies_to_train.append(trainable_policies[0])
        #     )
        #     trainer_updates.append(timestep)
        #
        # print('active_policy: %s' % active_policy)
        # print('worker TPs: %s' % ppo_trainer.workers.foreach_worker(lambda w: w.policies_to_train)[0])
        # print('trainer updates: %s' % '\n - ' + '\n - '.join('{:,}'.format(tu) for tu in trainer_updates))
        # print('$$$$$$$$$$$$$$$$$$$$$$$')
        # print('\n')

        # if timestep > int(1e6):
        # if training_iteration >= 20:
        #     break

        # if result['episode_reward_mean'] > 200:
        #     phase = 2
        # elif result['episode_reward_mean'] > 100:
        #     phase = 1
        # else:
        #     phase = 0
        # ppo_trainer.workers.foreach_worker(
        #     lambda ev: ev.foreach_env(
        #         lambda env: env.set_phase(phase)))

    state = ppo_trainer.save()
    ppo_trainer.stop()
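# learned_vs_random_mapping_fn is referenced above but not shown. A hedged
# sketch of what such a policy mapping function typically looks like for a
# two-seat game; the agent IDs and policy IDs below are placeholders, not the
# original project's names.
def learned_vs_random_mapping_fn(agent_id):
    return "learned_policy" if agent_id == "player_1" else "random_policy"

# The custom train function itself can be launched through Tune's function API,
# e.g. tune.run(my_train_fn, config=config).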
import random

import ray
from ray.rllib.agents.ppo import PPOTrainer
from ray.tune.logger import pretty_print

# TicTacToe is the custom MultiAgentEnv defined elsewhere; `env` is assumed to
# be an instance of it, created earlier, so its spaces can be read here.
num_policies = 4
policies = {
    "policy_{}".format(i): (None, env.observation_space, env.action_space, {})
    for i in range(num_policies)
}
policy_ids = list(policies.keys())

config = {
    "multiagent": {
        "policies": policies,
        "policy_mapping_fn": (lambda agent_id: random.choice(policy_ids)),
    },
    "framework": "tf",
    "num_workers": 60,  # Adjust this according to the number of CPUs on your machine.
}

ray.init()
trainer = PPOTrainer(env=TicTacToe, config=config)
best_eps_len = 0
mean_reward_thold = -1
while True:
    results = trainer.train()
    print(pretty_print(results))
    if (results["episode_reward_mean"] > mean_reward_thold
            and results["episode_len_mean"] > best_eps_len):
        trainer.save("ttt_model")
        best_eps_len = results["episode_len_mean"]
        print("--------------------- MODEL SAVED!")
    if results.get("timesteps_total") > 10**7:
        break

ray.shutdown()
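# After training, the checkpoints written to "ttt_model" can be reloaded into a
# fresh trainer and the learned policies queried directly. A hedged sketch, not
# part of the original script: `obs` is a placeholder for a single agent's
# observation in whatever format TicTacToe produces, and "policy_0" is one of
# the policy IDs defined above.
checkpoint_path = trainer.save("ttt_model")  # trainer.save() returns the checkpoint path
new_trainer = PPOTrainer(env=TicTacToe, config=config)
new_trainer.restore(checkpoint_path)
action = new_trainer.compute_action(obs, policy_id="policy_0")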
def main():
    """main function"""
    ray.init()

    if 'NUM_WORKERS' in os.environ:
        num_of_workers = int(os.environ['NUM_WORKERS'])
    else:
        num_of_workers = DEFAULT_NUM_WORKERS

    if os.path.isfile(WORLDS_JSON_PATH):
        with open(WORLDS_JSON_PATH) as jsonfile:
            dict_worlds = json.load(jsonfile)
    else:
        dict_worlds = None

    if os.path.isfile(MASTER_URI_JSON_PATH):
        with open(MASTER_URI_JSON_PATH) as jsonfile:
            list_master_uri = json.load(jsonfile)['master_uri']
    else:
        list_master_uri = None

    config = ppo.DEFAULT_CONFIG.copy()
    config.update({
        'env_config': {
            'dict_worlds': dict_worlds,
            'list_master_uri': list_master_uri,  # when using the parallel-simulation launcher script
            # 'list_master_uri': None,  # when running a single simulation on the default ROS master URI
            'use_random_heading': True,
            'result_csv': RESULT_CSV_NAME,
            'num_workers': num_of_workers
        },
        'num_gpus': 0,  # set this to match the number of GPUs you use
        'num_workers': num_of_workers,
        'train_batch_size': 10000,
        'batch_mode': 'complete_episodes'
    })
    register_env('gazebo', lambda cfg: DroneSimEnv(cfg))
    trainer = PPOTrainer(env='gazebo', config=config)

    num_iteration = 10000

    latest_index = 0
    checkpoint_path = None
    checkpoint_name = None
    for name in [
            name for name in os.listdir(CHECKPOINT_PATH_BASE)
            if 'checkpoint_' in name
    ]:
        index = int(name.replace('checkpoint_', ''))
        if index > latest_index:
            latest_index = index
            checkpoint_path = CHECKPOINT_PATH_BASE + name + '/'
            checkpoint_name = 'checkpoint-' + str(index)
    if checkpoint_name:
        print('Running using (', checkpoint_name, ').')
        trainer.restore(checkpoint_path + checkpoint_name)
        print(checkpoint_name, '==========================================')

    ## goal/collision data init
    success_cnt = 0
    goal_rate_filename = 'goal_rate_{}.csv'.format(
        WORLDS_JSON_NAME.replace('curriculum/', '').replace('.json', ''))
    if not os.path.isfile(goal_rate_filename):
        with open(goal_rate_filename, 'w') as goal_rate_logfile:
            goal_rate_logfile.write("training_iteration,goal_rate\n")

    while True:
        ## goal/collision data create
        with open(RESULT_CSV_NAME, 'w+') as file_:
            pass

        result = trainer.train()
        print(pretty_print(result))

        # Save a recovery checkpoint every 5 iterations.
        if result['training_iteration'] % 5 == 0:
            checkpoint = trainer.save(CHECKPOINT_PATH_BASE)
            print("checkpoint saved at", checkpoint)
        # Save a checkpoint for result inspection every 100 iterations.
        if result['training_iteration'] % 100 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)

        ## goal/collision data read
        with open(RESULT_CSV_NAME, 'r') as file_:
            episodes_raw = file_.read()
        goal_list = episodes_raw.split(',')
        goal_cnt = goal_list.count('1')
        if goal_cnt == 0:
            goal_ratio = 0
        else:
            goal_ratio = goal_cnt / (goal_cnt + goal_list.count('0'))
        print('goal rate:', goal_ratio)
        with open(goal_rate_filename, 'a') as goal_rate_logfile:
            goal_rate_logfile.write(
                str(result['training_iteration']) + ',' + str(goal_ratio) + '\n')

        if goal_ratio >= 0.95:
            success_cnt += 1
            print('success in a row:', success_cnt)
        else:
            success_cnt = 0
        if success_cnt >= 5 and EXIT_ON_SUCCESS:
            if result['training_iteration'] % 5 != 0:
                checkpoint = trainer.save(CHECKPOINT_PATH_BASE)
                print("checkpoint saved at", checkpoint)
            break
        if result['training_iteration'] >= num_iteration:
            break

    print('PPO training is done.')
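# The script above relies on module-level imports and constants that are not
# shown. A hedged sketch of that setup; every path and value below is a
# placeholder, not the original project's configuration, and DroneSimEnv is the
# project's custom Gazebo/ROS environment.
import json
import os

import ray
from ray.rllib.agents import ppo
from ray.rllib.agents.ppo import PPOTrainer
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env

# from drone_sim_env import DroneSimEnv  # custom environment module (assumed)

DEFAULT_NUM_WORKERS = 4
WORLDS_JSON_NAME = 'curriculum/worlds.json'  # placeholder
WORLDS_JSON_PATH = './' + WORLDS_JSON_NAME   # placeholder
MASTER_URI_JSON_PATH = './master_uri.json'   # placeholder
RESULT_CSV_NAME = 'result.csv'               # placeholder
CHECKPOINT_PATH_BASE = './checkpoints/'      # placeholder
EXIT_ON_SUCCESS = True

if __name__ == '__main__':
    main()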