Example #1
class AlternateTraining(Trainable):
    def _setup(self, config):
        self.config = config
        self.env = config['env']
        agent_config = self.config
        adv_config = deepcopy(self.config)
        agent_config['multiagent']['policies_to_train'] = ['agent']
        adv_config['multiagent']['policies_to_train'] = ['adversary0']

        self.agent_trainer = PPOTrainer(env=self.env, config=agent_config)
        self.adv_trainer = PPOTrainer(env=self.env, config=adv_config)

    def _train(self):
        # improve the Adversary policy
        print("-- Adversary Training --")
        print(pretty_print(self.adv_trainer.train()))

        # swap weights to synchronize
        self.agent_trainer.set_weights(self.adv_trainer.get_weights(["adversary0"]))

        # improve the Agent policy
        print("-- Agent Training --")
        output = self.agent_trainer.train()
        print(pretty_print(output))

        # swap weights to synchronize
        self.adv_trainer.set_weights(self.agent_trainer.get_weights(["agent"]))
        return output

    def _save(self, tmp_checkpoint_dir):
        return self.agent_trainer._save(tmp_checkpoint_dir)
    def my_train_fn(config, reporter):
        assert args.num_learners >= 4, 'Requires 4 or more trainable agents'
        ppo_trainer = PPOTrainer(env='c4', config=config)
        while True:
            result = ppo_trainer.train()
            if 'evaluation' in result:
                train_policies = config['multiagent']['policies_to_train']
                scores = {k: v for k, v in result['evaluation']['policy_reward_mean'].items() if k in train_policies}

                scores_dist = softmax(np.array(list(scores.values())) / tau)
                new_trainables = random.choices(list(scores.keys()), scores_dist, k=len(scores))
                # new_trainables = train_policies
                # random.shuffle(new_trainables)

                weights = ppo_trainer.get_weights()
                new_weights = {old_pid: weights[new_pid] for old_pid, new_pid in zip(weights.keys(), new_trainables)}
                # new_weights = {pid: np.zeros_like(wt) for pid, wt in weights.items() if wt is not None}
                # new_weights = {pid: np.ones_like(wt)*-100 for pid, wt in weights.items() if wt is not None}
                # new_weights = {pid: np.random.rand(*wt.shape) for pid, wt in weights.items() if wt is not None}

                print('\n\n################\nSETTING WEIGHTS\n################\n\n')
                ppo_trainer.set_weights(new_weights)

                num_metrics = 4
                c = Counter(new_trainables)
                result['custom_metrics'].update(
                    {f'most_common{i:02d}': v[1] for i, v in enumerate(c.most_common(num_metrics))})
                result['custom_metrics'].update(
                    {f'scores_dist{i:02d}': v for i, v in enumerate(sorted(scores_dist, reverse=True)[:num_metrics])})
                print('scores_dist', scores_dist)
                # result['custom_metrics'].update(
                #     {f'new_agent{i:02d}': int(v[-2:]) for i, v in enumerate(new_trainables)})
            reporter(**result)
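# The function above relies on a softmax helper and a temperature tau that are
# defined elsewhere in the original source. A plain, numerically stable version
# of such a helper (an assumption, not the original code) could look like this:
import numpy as np

def softmax(x):
    """Turn a vector of scores into a probability distribution."""
    z = np.asarray(x, dtype=np.float64)
    z = z - z.max()            # shift by the max for numerical stability
    e = np.exp(z)
    return e / e.sum()

tau = 1.0  # hypothetical temperature; smaller values sharpen the distribution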
class AlternateTraining(Trainable):
    def _setup(self, config):
        self.config = config
        self.env = config['env']
        agent_config = self.config
        adv_config = deepcopy(self.config)
        agent_config['multiagent']['policies_to_train'] = ['agent']
        adv_config['multiagent']['policies_to_train'] = ['adversary']

        self.agent_trainer = PPOTrainer(env=self.env, config=agent_config)
        self.adv_trainer = PPOTrainer(env=self.env, config=adv_config)

    def _train(self):
        # improve the Adversary policy
        print("-- Adversary Training --")
        original_weight = self.adv_trainer.get_weights(
            ["adversary"])['adversary']['adversary/fc_1/kernel'][0, 0]
        print(pretty_print(self.adv_trainer.train()))
        first_weight = self.adv_trainer.get_weights(
            ["adversary"])['adversary']['adversary/fc_1/kernel'][0, 0]

        # Check that the weights are updating after training
        assert original_weight != first_weight, \
            "The adversary's weights have not changed after training"

        # swap weights to synchronize
        self.agent_trainer.set_weights(
            self.adv_trainer.get_weights(["adversary"]))

        # improve the Agent policy
        print("-- Agent Training --")
        output = self.agent_trainer.train()

        # The copy of the adversary weights held by the agent trainer should not have changed
        new_weight = self.agent_trainer.get_weights(
            ["adversary"])['adversary']['adversary/fc_1/kernel'][0, 0]

        # Check that the adversary is not being trained while the agent trainer is training
        assert first_weight == new_weight, \
            "The adversary's weights changed, but they should not have been updated during agent training!"

        # swap weights to synchronize
        self.adv_trainer.set_weights(self.agent_trainer.get_weights(["agent"]))

        return output

    def _save(self, tmp_checkpoint_dir):
        return self.agent_trainer._save(tmp_checkpoint_dir)
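# A minimal sketch (not part of the original example) of how a Trainable like
# this is typically launched with Tune. The stopping criterion and the contents
# of `config` are assumptions and would need to match a real registered env plus
# a multiagent setup that defines the "agent" and "adversary" policies.
#
# from ray import tune
# tune.run(
#     AlternateTraining,
#     config=config,
#     stop={"training_iteration": 200},
#     checkpoint_freq=10,
# )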
Example #4
    dqn_trainer = DQNTrainer(env="multi_cartpole",
                             config={
                                 "multiagent": {
                                     "policies": policies,
                                     "policy_mapping_fn": policy_mapping_fn,
                                     "policies_to_train": ["dqn_policy"],
                                 },
                                 "gamma": 0.95,
                                 "n_step": 3,
                             })
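
    # The loop below also uses ppo_trainer, which this excerpt does not show
    # being constructed. It is presumably built analogously to dqn_trainer,
    # training only the PPO policy; a sketch (the exact config is an assumption):
    ppo_trainer = PPOTrainer(env="multi_cartpole",
                             config={
                                 "multiagent": {
                                     "policies": policies,
                                     "policy_mapping_fn": policy_mapping_fn,
                                     "policies_to_train": ["ppo_policy"],
                                 },
                             })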

    # You should see both the printed X and Y approach 200 as this trains:
    # info:
    #   policy_reward_mean:
    #     dqn_policy: X
    #     ppo_policy: Y
    for i in range(args.num_iters):
        print("== Iteration", i, "==")

        # improve the DQN policy
        print("-- DQN --")
        print(pretty_print(dqn_trainer.train()))

        # improve the PPO policy
        print("-- PPO --")
        print(pretty_print(ppo_trainer.train()))

        # swap weights to synchronize
        dqn_trainer.set_weights(ppo_trainer.get_weights(["ppo_policy"]))
        ppo_trainer.set_weights(dqn_trainer.get_weights(["dqn_policy"]))
config["num_sgd_iter"] = 5
config["sgd_minibatch_size"] = 8192
config["train_batch_size"] = 20000
config["use_gae"] = True
config["vf_clip_param"] = 10
config["vf_loss_coeff"] = 1
config["vf_share_layers"] = False

# For better gradient estimates in the later stages
# of the training, increase the batch sizes.
# config["sgd_minibatch_size"] = 8192 * 4
# config["train_batch_size"] = 20000 * 10

ray.init()
trainer = PPOTrainer(config=config, env=InventoryEnv)

# Use this when you want to continue from a checkpoint.
# trainer.restore(
#   "/home/enes/ray_results/PPO_InventoryEnv_2020-10-06_04-31-2945lwn1wg/checkpoint_737/checkpoint-737"
# )

best_mean_reward = np.NINF
while True:
    result = trainer.train()
    print(pretty_print(result))
    mean_reward = result.get("episode_reward_mean", np.NINF)
    if mean_reward > best_mean_reward:
        checkpoint = trainer.save()
        print("checkpoint saved at", checkpoint)
        best_mean_reward = mean_reward
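
# Because the while-loop above never terminates on its own, evaluation would
# happen in a separate run. A minimal sketch (an assumption, not part of the
# original) of rolling out the saved policy after restoring a checkpoint:
#
# trainer.restore(checkpoint)            # path returned by trainer.save() above
# env = InventoryEnv()                   # constructor args, if any, are assumed
# obs = env.reset()
# done, total_reward = False, 0.0
# while not done:
#     action = trainer.compute_action(obs)
#     obs, reward, done, info = env.step(action)
#     total_reward += reward
# print("evaluation episode reward:", total_reward)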
Example #6
import numpy as np
import matplotlib.pyplot as plt
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2, A2C

import gym
import traf_env
import multi_traf_env
import ray
from ray import tune
from ray.rllib.policy import Policy
from ray.rllib.tests.test_multi_agent_env import MultiCartpole
from ray.tune.registry import register_env
from ray.rllib.agents.ppo.ppo import PPOTrainer
from ray.rllib.agents.ppo.ppo_policy import PPOTFPolicy

ray.init()
register_env("multi_air-v0",
             lambda c: multi_traf_env.AirTrafficGym(num_agents=2))
trainer = PPOTrainer(env="multi_air-v0")
num_train_itr = 50
for i in range(num_train_itr):
    print("****************************Iteration: ", i,
          "****************************")
    print(trainer.train())
Example #7
        #             "policy_03": (None, obs_space, act_space, {}),
        #             "policy_04": (None, obs_space, act_space, {}),
        #         },
        #         "policy_mapping_fn": policy_mapping_fn_eval
        #     },
        # },

      })

# Uncomment to restore from check-point
# ppo_trainer.restore('/home/ubuntu/ray_results/PPO_gfootball_2020-08-05_20-46-06e4wbrrqg/checkpoint_301/checkpoint-301')
for i in range(100000):

    episode_data = []
    print("+++++++++++++++Training iteration {!s}+++++++++++++++++++".format(i+1))
    result = ppo_trainer.train()
    print(pretty_print(result))

    if ((i > 0) and (i % 10 == 0)):
        print("===============Swapping weights===================")

        P4key_P3val = {} # temp storage with "policy_4" keys & "policy_3" values
        for (k4,v4), (k3,v3) in zip(ppo_trainer.get_policy("policy_04").get_weights().items(),
                                    ppo_trainer.get_policy("policy_03").get_weights().items()):
            P4key_P3val[k4] = v3


        P3key_P2val = {} # temp storage with "policy_3" keys & "policy_2" values
        for (k3,v3), (k2,v2) in zip(ppo_trainer.get_policy("policy_03").get_weights().items(),
                                    ppo_trainer.get_policy("policy_02").get_weights().items()):
            P3key_P2val[k3] = v2
    def _train(self):
        ppo_trainer = PPOTrainer(env='c4', config=self.config)
        while True:
            result = ppo_trainer.train()
            # reporter(**result)
            print('ran iteration')
    def my_train_fn(config, reporter):
        active_policy = None
        threshold = 0.7
        trainer_updates = []

        # ppo_trainer = MyPPOTrainer(env='c4', config=config)
        ppo_trainer = PPOTrainer(env='c4', config=config)
        bandit = Exp3Bandit(len(trainable_policies))

        def func(worker):
            worker.sampler.policy_mapping_fn = learned_vs_random_mapping_fn
            foo = 1

        # ppo_trainer.workers.foreach_worker(lambda w: w.sampler.policy_mapping_fn)
        ppo_trainer.workers.foreach_worker(func)

        # trainable_policies = ppo_trainer.workers.foreach_worker(lambda w: w.policies_to_train)[0][:]
        # trainable_policies = ppo_trainer.workers.foreach_worker(
        #     lambda w: w.foreach_trainable_policy(lambda p, i: (i, p))
        # )

        while True:
            result = ppo_trainer.train()
            reporter(**result)

            foo = 1

            timestep = result['timesteps_total']
            training_iteration = result['training_iteration']
            # print('\n')
            # print('$$$$$$$$$$$$$$$$$$$$$$$')
            # print('timestep: {:,}'.format(timestep))
            # print('trainable_policies: %s' % trainable_policies)
            # if active_policy is None and timestep > int(5e6):
            # # if active_policy is None and timestep > int(25e4):
            #     active_policy = trainable_policies[0]
            #     # ppo_trainer.workers.foreach_worker(
            #     #     lambda w: w.foreach_trainable_policy(lambda p, i: (i, p))
            #     # )
            #     ppo_trainer.workers.foreach_worker(
            #         lambda w: w.policies_to_train.remove(trainable_policies[1])
            #     )
            #     trainer_updates.append(timestep)
            # elif active_policy == trainable_policies[0] \
            #         and result['policy_reward_mean'][trainable_policies[0]] > threshold:
            #     active_policy = trainable_policies[1]
            #     ppo_trainer.workers.foreach_worker(
            #         lambda w: w.policies_to_train.remove(trainable_policies[0])
            #     )
            #     ppo_trainer.workers.foreach_worker(
            #         lambda w: w.policies_to_train.append(trainable_policies[1])
            #     )
            #     trainer_updates.append(timestep)
            # elif active_policy == trainable_policies[1] \
            #         and result['policy_reward_mean'][trainable_policies[1]] > threshold:
            #     active_policy = trainable_policies[0]
            #     ppo_trainer.workers.foreach_worker(
            #         lambda w: w.policies_to_train.remove(trainable_policies[1])
            #     )
            #     ppo_trainer.workers.foreach_worker(
            #         lambda w: w.policies_to_train.append(trainable_policies[0])
            #     )
            #     trainer_updates.append(timestep)
            #
            # print('active_policy: %s' % active_policy)
            # print('worker TPs: %s' % ppo_trainer.workers.foreach_worker(lambda w: w.policies_to_train)[0])
            # print('trainer updates: %s' % '\n  - ' + '\n  - '.join('{:,}'.format(tu) for tu in trainer_updates))
            # print('$$$$$$$$$$$$$$$$$$$$$$$')
            # print('\n')

            # if timestep > int(1e6):
            # if training_iteration >= 20:
            #     break

            # if result['episode_reward_mean'] > 200:
            #     phase = 2
            # elif result['episode_reward_mean'] > 100:
            #     phase = 1
            # else:
            #     phase = 0
            # ppo_trainer.workers.foreach_worker(
            #     lambda ev: ev.foreach_env(
            #         lambda env: env.set_phase(phase)))

        state = ppo_trainer.save()
        ppo_trainer.stop()
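# learned_vs_random_mapping_fn, trainable_policies and Exp3Bandit are referenced
# above but defined elsewhere in the original source. One plausible shape for the
# mapping function (purely an assumption) is to route one agent id to the learned
# policy and every other agent to a frozen/random opponent:
#
# def learned_vs_random_mapping_fn(agent_id):
#     return "learned" if agent_id == 0 else "random"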
Example #10
    num_policies = 4
    policies = {
        "policy_{}".format(i):
        (None, env.observation_space, env.action_space, {})
        for i in range(num_policies)
    }
    policy_ids = list(policies.keys())
    config = {
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": (lambda agent_id: random.choice(policy_ids)),
        },
        "framework": "tf",
        "num_workers":
        60  # Adjust this according to the number of CPUs on your machine.
    }
    trainer = PPOTrainer(env=TicTacToe, config=config)
    best_eps_len = 0
    mean_reward_thold = -1
    while True:
        results = trainer.train()
        print(pretty_print(results))
        if results["episode_reward_mean"] > mean_reward_thold and results[
                "episode_len_mean"] > best_eps_len:
            trainer.save("ttt_model")
            best_eps_len = results["episode_len_mean"]
            print("--------------------- MODEL SAVED!")
        if results.get("timesteps_total") > 10**7:
            break
    ray.shutdown()
Example #11
def main():
    """main function"""

    ray.init()

    if 'NUM_WORKERS' in os.environ:
        num_of_workers = int(os.environ['NUM_WORKERS'])
    else:
        num_of_workers = DEFAULT_NUM_WORKERS

    if os.path.isfile(WORLDS_JSON_PATH):
        with open(WORLDS_JSON_PATH) as jsonfile:
            dict_worlds = json.load(jsonfile)
    else:
        dict_worlds = None

    if os.path.isfile(MASTER_URI_JSON_PATH):
        with open(MASTER_URI_JSON_PATH) as jsonfile:
            list_master_uri = json.load(jsonfile)['master_uri']
    else:
        list_master_uri = None

    config = ppo.DEFAULT_CONFIG.copy()
    config.update({
        'env_config': {
            'dict_worlds': dict_worlds,
            'list_master_uri': list_master_uri,  # when using the parallel-simulation launch script
            # 'list_master_uri': None, # to run a single simulation on the default ROS master URI
            'use_random_heading': True,
            'result_csv': RESULT_CSV_NAME,
            'num_workers': num_of_workers
        },
        'num_gpus': 0,  # set this to match the number of GPUs in use
        'num_workers': num_of_workers,
        'train_batch_size': 10000,
        'batch_mode': 'complete_episodes'
    })

    register_env('gazebo', lambda cfg: DroneSimEnv(cfg))
    trainer = PPOTrainer(env='gazebo', config=config)
    num_iteration = 10000

    latest_index = 0
    checkpoint_path = None
    checkpoint_name = None
    for name in [
            name for name in os.listdir(CHECKPOINT_PATH_BASE)
            if 'checkpoint_' in name
    ]:
        index = int(name.replace('checkpoint_', ''))
        if index > latest_index:
            latest_index = index
            checkpoint_path = CHECKPOINT_PATH_BASE + name + '/'
            checkpoint_name = 'checkpoint-' + str(index)
    if checkpoint_name:
        print('Running using (', checkpoint_name, ').')
        trainer.restore(checkpoint_path + checkpoint_name)

    print(checkpoint_name, '==========================================')

    ## goal/collision data init
    success_cnt = 0

    goal_rate_filename = 'goal_rate_{}.csv'.format(
        WORLDS_JSON_NAME.replace('curriculum/', '').replace('.json', ''))

    if not os.path.isfile(goal_rate_filename):
        with open(goal_rate_filename, 'w') as goal_rate_logfile:
            goal_rate_logfile.write("training_iteration,goal_rate\n")

    while True:
        ## goal/collision data create
        with open(RESULT_CSV_NAME, 'w+') as file_:
            pass

        result = trainer.train()
        print(pretty_print(result))

        # Save a recovery checkpoint every 5 iterations
        if result['training_iteration'] % 5 == 0:
            checkpoint = trainer.save(CHECKPOINT_PATH_BASE)
            print("checkpoint saved at", checkpoint)

        # Save a checkpoint for result inspection every 100 iterations
        if result['training_iteration'] % 100 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)

        ## goal/collision data read
        with open(RESULT_CSV_NAME, 'r') as file_:
            episodes_raw = file_.read()
            goal_list = episodes_raw.split(',')
            goal_cnt = goal_list.count('1')
            if goal_cnt == 0:
                goal_ratio = 0
            else:
                goal_ratio = goal_cnt / (goal_cnt + goal_list.count('0'))
            print('goal rate:', goal_ratio)
            with open(goal_rate_filename, 'a') as goal_rate_logfile:
                goal_rate_logfile.write(
                    str(result['training_iteration']) + ',' + str(goal_ratio) +
                    '\n')

        if goal_ratio >= 0.95:
            success_cnt += 1
            print('success in raw:', success_cnt)
        else:
            success_cnt = 0

        if success_cnt >= 5 and EXIT_ON_SUCCESS:
            # Save a final checkpoint unless one was already saved this iteration.
            if result['training_iteration'] % 5 != 0:
                checkpoint = trainer.save(CHECKPOINT_PATH_BASE)
                print("checkpoint saved at", checkpoint)
            break

        if result['training_iteration'] >= num_iteration:
            break

    print('PPO training is done.')