config["num_sgd_iter"] = 5
config["sgd_minibatch_size"] = 8192
config["train_batch_size"] = 20000
config["use_gae"] = True
config["vf_clip_param"] = 10
config["vf_loss_coeff"] = 1
config["vf_share_layers"] = False

# For better gradient estimates in the later stages
# of the training, increase the batch sizes.
# config["sgd_minibatch_size"] = 8192 * 4
# config["train_batch_size"] = 20000 * 10

ray.init()
trainer = PPOTrainer(config=config, env=InventoryEnv)

# Use this when you want to continue from a checkpoint.
# trainer.restore(
#   "/home/enes/ray_results/PPO_InventoryEnv_2020-10-06_04-31-2945lwn1wg/checkpoint_737/checkpoint-737"
# )

best_mean_reward = np.NINF
while True:
    result = trainer.train()
    print(pretty_print(result))
    mean_reward = result.get("episode_reward_mean", np.NINF)
    if mean_reward > best_mean_reward:
        checkpoint = trainer.save()
        print("checkpoint saved at", checkpoint)
        best_mean_reward = mean_reward
    # Per-iteration logging. The names policy_graphs, n_agents, signal_size,
    # S_eq, growth_rate, start_time and log_dir are assumed to be defined by
    # the surrounding experiment setup (not shown here).
    iteration_reward_mean = [str(result['policy_reward_mean'][policy_name])
                             for policy_name in policy_graphs.keys()]
    iteration_eplen_mean = str(result['episode_len_mean'])
    iteration_episodes_total = str(result['episodes_total'])
    savedata = iteration_episodes_total       # first entry: total number of episodes
    savedata += " " + iteration_eplen_mean    # second entry: mean episode length
    for pc in iteration_reward_mean:          # remaining entries: mean policy rewards
        savedata += " " + pc
    file_str = 'N_{0}_k_{1}_Seq_{2}_r_{3}_{4:%H_%M_%S_%d%m%Y}_Iteration_Data'.format(
        n_agents, signal_size, S_eq, growth_rate, start_time)
    f = open(log_dir + "/" + file_str, "a")
    f.write(savedata + "\n")
    f.close()
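    # Without a stopping condition this loop runs until interrupted and the
    # code below is never reached. A hedged sketch of a stop check, mirroring
    # the one used in the TicTacToe example further down (threshold is
    # illustrative):
    # if result.get("timesteps_total", 0) > 10**7:
    #     break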
      

trainer.save(checkpoint_folder)  # save a checkpoint after the final iteration (checkpoint_folder is assumed to be defined above)

# Outputs: Final episode rewards and lengths
episode_reward_final = result['episode_reward_mean']
episode_len_final = result['episode_len_mean']    

print("Experiment finished successfully.")

# Create success flag file
success_file_str = 'SUCCESS_N_{0}_k_{1}_Seq_{2}_r_{3}_{4:%H_%M_%S_%d%m%Y}'.format(n_agents, signal_size, S_eq, growth_rate, start_time)
f = open("./" + success_file_str, "a")
f.close()
    
    
    
    
Example #3
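        # This fragment comes from inside the training loop of a four-policy
        # self-play setup; the loop counter `i` and the dict P4key_P3val
        # (built the same way as P3key_P2val / P2key_P1val below) are defined
        # in the part of the example that is not shown. Each cycle the weights
        # are shifted one policy down the chain, so policy_02..policy_04 end up
        # holding progressively older snapshots of the learning policy_01.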
        P3key_P2val = {}  # temp storage with "policy_03" keys & "policy_02" values
        for (k3,v3), (k2,v2) in zip(ppo_trainer.get_policy("policy_03").get_weights().items(),
                                    ppo_trainer.get_policy("policy_02").get_weights().items()):

            P3key_P2val[k3] = v2


        P2key_P1val = {} # temp storage with "policy_2" keys & "policy_1" values
        for (k2,v2), (k1,v1) in zip(ppo_trainer.get_policy("policy_02").get_weights().items(),
                                    ppo_trainer.get_policy("policy_01").get_weights().items()):

            P2key_P1val[k2] = v1

        #Set weights of policies
        ppo_trainer.set_weights({
                                    "policy_04":P4key_P3val, # weights or values from "policy_03" with "policy_04" keys
                                    "policy_03":P3key_P2val, # weights or values from "policy_02" with "policy_03" keys
                                    "policy_02":P2key_P1val, # weights or values from "policy_01" with "policy_02" keys
                                    "policy_01":ppo_trainer.get_policy("policy_01").get_weights() # no change
                                })
        # Sanity check: policy_02 should now hold the same weights as policy_01.
        for (k, v), (k2, v2) in zip(ppo_trainer.get_policy("policy_01").get_weights().items(),
                                    ppo_trainer.get_policy("policy_02").get_weights().items()):
            assert (v == v2).all()
        print("Check passed: weights have been swapped")
        
    if i % 200 == 0:
        # Save checkpoint
        checkpoint = ppo_trainer.save()
        print("checkpoint saved at", checkpoint)
    def my_train_fn(config, reporter):
        active_policy = None
        threshold = 0.7
        trainer_updates = []

        # ppo_trainer = MyPPOTrainer(env='c4', config=config)
        ppo_trainer = PPOTrainer(env='c4', config=config)
        bandit = Exp3Bandit(len(trainable_policies))
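        # `trainable_policies` and `learned_vs_random_mapping_fn` are assumed
        # to be defined elsewhere in the example. A hedged guess at the shape
        # of such a mapping function (policy names are placeholders):
        # def learned_vs_random_mapping_fn(agent_id):
        #     return "learned" if agent_id == 0 else "random"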

        def func(worker):
            worker.sampler.policy_mapping_fn = learned_vs_random_mapping_fn
            foo = 1

        # ppo_trainer.workers.foreach_worker(lambda w: w.sampler.policy_mapping_fn)
        ppo_trainer.workers.foreach_worker(func)

        # trainable_policies = ppo_trainer.workers.foreach_worker(lambda w: w.policies_to_train)[0][:]
        # trainable_policies = ppo_trainer.workers.foreach_worker(
        #     lambda w: w.foreach_trainable_policy(lambda p, i: (i, p))
        # )

        while True:
            result = ppo_trainer.train()
            reporter(**result)

            foo = 1

            timestep = result['timesteps_total']
            training_iteration = result['training_iteration']
            # print('\n')
            # print('$$$$$$$$$$$$$$$$$$$$$$$')
            # print('timestep: {:,}'.format(timestep))
            # print('trainable_policies: %s' % trainable_policies)
            # if active_policy is None and timestep > int(5e6):
            # # if active_policy is None and timestep > int(25e4):
            #     active_policy = trainable_policies[0]
            #     # ppo_trainer.workers.foreach_worker(
            #     #     lambda w: w.foreach_trainable_policy(lambda p, i: (i, p))
            #     # )
            #     ppo_trainer.workers.foreach_worker(
            #         lambda w: w.policies_to_train.remove(trainable_policies[1])
            #     )
            #     trainer_updates.append(timestep)
            # elif active_policy == trainable_policies[0] \
            #         and result['policy_reward_mean'][trainable_policies[0]] > threshold:
            #     active_policy = trainable_policies[1]
            #     ppo_trainer.workers.foreach_worker(
            #         lambda w: w.policies_to_train.remove(trainable_policies[0])
            #     )
            #     ppo_trainer.workers.foreach_worker(
            #         lambda w: w.policies_to_train.append(trainable_policies[1])
            #     )
            #     trainer_updates.append(timestep)
            # elif active_policy == trainable_policies[1] \
            #         and result['policy_reward_mean'][trainable_policies[1]] > threshold:
            #     active_policy = trainable_policies[0]
            #     ppo_trainer.workers.foreach_worker(
            #         lambda w: w.policies_to_train.remove(trainable_policies[1])
            #     )
            #     ppo_trainer.workers.foreach_worker(
            #         lambda w: w.policies_to_train.append(trainable_policies[0])
            #     )
            #     trainer_updates.append(timestep)
            #
            # print('active_policy: %s' % active_policy)
            # print('worker TPs: %s' % ppo_trainer.workers.foreach_worker(lambda w: w.policies_to_train)[0])
            # print('trainer updates: %s' % '\n  - ' + '\n  - '.join('{:,}'.format(tu) for tu in trainer_updates))
            # print('$$$$$$$$$$$$$$$$$$$$$$$')
            # print('\n')

            # if timestep > int(1e6):
            # if training_iteration >= 20:
            #     break

            # if result['episode_reward_mean'] > 200:
            #     phase = 2
            # elif result['episode_reward_mean'] > 100:
            #     phase = 1
            # else:
            #     phase = 0
            # ppo_trainer.workers.foreach_worker(
            #     lambda ev: ev.foreach_env(
            #         lambda env: env.set_phase(phase)))
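            # Note: with every break condition above commented out, this loop
            # never exits, so the save()/stop() calls below are reached only if
            # one of the stop checks is re-enabled.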

        state = ppo_trainer.save()
        ppo_trainer.stop()
Example #5
    num_policies = 4
    policies = {
        "policy_{}".format(i):
        (None, env.observation_space, env.action_space, {})
        for i in range(num_policies)
    }
    policy_ids = list(policies.keys())
    config = {
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": (lambda agent_id: random.choice(policy_ids)),
        },
        "framework": "tf",
        "num_workers":
        60  # Adjust this according to the number of CPUs on your machine.
    }
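    # The mapping function assigns each agent one of the four policies at
    # random, so the policies continually play against one another (self-play).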
    trainer = PPOTrainer(env=TicTacToe, config=config)
    best_eps_len = 0
    mean_reward_thold = -1
    while True:
        results = trainer.train()
        print(pretty_print(results))
        if results["episode_reward_mean"] > mean_reward_thold and results[
                "episode_len_mean"] > best_eps_len:
            trainer.save("ttt_model")
            best_eps_len = results["episode_len_mean"]
            print("--------------------- MODEL SAVED!")
        if results.get("timesteps_total") > 10**7:
            break
    ray.shutdown()
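    # A hedged sketch of how the saved model could be reloaded in a later run
    # for evaluation (the checkpoint file name is illustrative):
    # trainer = PPOTrainer(env=TicTacToe, config=config)
    # trainer.restore("ttt_model/checkpoint_100/checkpoint-100")
    # action = trainer.compute_action(obs, policy_id="policy_0")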
Example #6
def main():
    """main function"""

    ray.init()

    if 'NUM_WORKERS' in os.environ:
        num_of_workers = int(os.environ['NUM_WORKERS'])
    else:
        num_of_workers = DEFAULT_NUM_WORKERS

    if os.path.isfile(WORLDS_JSON_PATH):
        with open(WORLDS_JSON_PATH) as jsonfile:
            dict_worlds = json.load(jsonfile)
    else:
        dict_worlds = None

    if os.path.isfile(MASTER_URI_JSON_PATH):
        with open(MASTER_URI_JSON_PATH) as jsonfile:
            list_master_uri = json.load(jsonfile)['master_uri']
    else:
        list_master_uri = None

    config = ppo.DEFAULT_CONFIG.copy()
    config.update({
        'env_config': {
            'dict_worlds': dict_worlds,
            'list_master_uri': list_master_uri,  # used with the parallel-simulation launch script
            # 'list_master_uri': None,  # to run a single simulation on the default ROS master URI
            'use_random_heading': True,
            'result_csv': RESULT_CSV_NAME,
            'num_workers': num_of_workers
        },
        'num_gpus': 0,  # set according to the number of GPUs available
        'num_workers': num_of_workers,
        'train_batch_size': 10000,
        'batch_mode': 'complete_episodes'
    })

    register_env('gazebo', lambda cfg: DroneSimEnv(cfg))
    trainer = PPOTrainer(env='gazebo', config=config)
    num_iteration = 10000

    latest_index = 0
    checkpoint_path = None
    checkpoint_name = None
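    # Resume from the most recent checkpoint found under CHECKPOINT_PATH_BASE, if any.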
    for name in [
            name for name in os.listdir(CHECKPOINT_PATH_BASE)
            if 'checkpoint_' in name
    ]:
        index = int(name.replace('checkpoint_', ''))
        if index > latest_index:
            latest_index = index
            checkpoint_path = CHECKPOINT_PATH_BASE + name + '/'
            checkpoint_name = 'checkpoint-' + str(index)
    if checkpoint_name:
        print('Resuming from checkpoint:', checkpoint_name)
        trainer.restore(checkpoint_path + checkpoint_name)

    print(checkpoint_name, '==========================================')

    ## goal/collision data init
    success_cnt = 0

    goal_rate_filename = 'goal_rate_{}.csv'.format(
        WORLDS_JSON_NAME.replace('curriculum/', '').replace('.json', ''))

    if not os.path.isfile(goal_rate_filename):
        with open(goal_rate_filename, 'w') as goal_rate_logfile:
            goal_rate_logfile.write("training_iteration,goal_rate\n")

    while True:
        ## goal/collision data create
        with open(RESULT_CSV_NAME, 'w+') as file_:
            pass
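        # The environment instances append a '1' (goal reached) or '0' (no goal,
        # e.g. collision) for every finished episode to RESULT_CSV_NAME, which is
        # passed to them via env_config['result_csv']. Truncating the file here
        # means the read below only covers episodes from the current iteration.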

        result = trainer.train()
        print(pretty_print(result))

        # Save a recovery checkpoint every 5 iterations
        if result['training_iteration'] % 5 == 0:
            checkpoint = trainer.save(CHECKPOINT_PATH_BASE)
            print("checkpoint saved at", checkpoint)

        # Save a checkpoint for result inspection every 100 iterations
        if result['training_iteration'] % 100 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)

        ## goal/collision data read
        with open(RESULT_CSV_NAME, 'r') as file_:
            episodes_raw = file_.read()
            goal_list = episodes_raw.split(',')
            goal_cnt = goal_list.count('1')
            if goal_cnt == 0:
                goal_ratio = 0
            else:
                goal_ratio = goal_cnt / (goal_cnt + goal_list.count('0'))
            print('goal rate:', goal_ratio)
            with open(goal_rate_filename, 'a') as goal_rate_logfile:
                goal_rate_logfile.write(
                    str(result['training_iteration']) + ',' + str(goal_ratio) +
                    '\n')

        if goal_ratio >= 0.95:
            success_cnt += 1
            print('success in raw:', success_cnt)
        else:
            success_cnt = 0

        if success_cnt >= 5 and EXIT_ON_SUCCESS:
            # Save a final checkpoint unless this iteration was already checkpointed above.
            if result['training_iteration'] % 5 != 0:
                checkpoint = trainer.save(CHECKPOINT_PATH_BASE)
                print("checkpoint saved at", checkpoint)
            break

        if result['training_iteration'] >= num_iteration:
            break

    print('PPO training is done.')