Example #1
    def train(config, checkpoint_dir=None):
        trainer = PPOTrainer(config=config, env='BomberMan-v0')
        init_w = trainer.get_policy('policy_01').get_weights()
        trainer.restore(
            'C:\\Users\\Florian\\ray_results\\PPO_BomberMan-v0_2021-03-25_08-56-43eo23nmho\\checkpoint_002360\\checkpoint-2360'
        )
        trainer.workers.foreach_worker(
            lambda w: w.get_policy('policy_01').set_weights(init_w))
        trainer.restore('.\\kill-policy-0\\checkpoint')
        # trainer.import_model()  # import_model() requires a model path; call left disabled
        iter = 0

        #def update_phase(ev):
        #    ev.foreach_env(lambda e: e.set_phase(phase))

        while True:
            iter += 1
            result = trainer.train()
            if iter % 200 == 0:
                if not os.path.exists(f'./model-{iter}'):
                    #trainer.get_policy('policy_01').export_model(f'./model-{iter}')
                    trainer.export_policy_model(f'./model-{iter}/main',
                                                'policy_01')
                    trainer.export_policy_model(f'./model-{iter}/collect',
                                                'policy_collect')
                    trainer.export_policy_model(f'./model-{iter}/destroy',
                                                'policy_destroy')
                    trainer.export_policy_model(f'./model-{iter}/kill',
                                                'policy_kill')

                else:
                    print("model already saved")
Example #2
    def train(config, checkpoint_dir=None):
        trainer = PPOTrainer(config=config, env='BomberMan-v0')
        trainer.restore(
            'C:\\Users\\Florian\\ray_results\\PPO_BomberMan-v0_2021-03-26_20-15-082mjvde9i\\checkpoint_008980\\checkpoint-8980'
        )
        iter = 0

        while True:
            iter += 1
            result = trainer.train()
            if iter % 200 == 0:
                if not os.path.exists(f'./model-{iter}'):
                    trainer.get_policy('policy_01').export_model(
                        f'./model-{iter}')
                else:
                    print("model already saved")
Example #3
    def train(config, checkpoint_dir=None):
        trainer = PPOTrainer(config=config, env='BomberMan-v0')
        #trainer.restore('C:\\Users\\Florian\\ray_results\\PPO_BomberMan-v0_2021-03-16_09-20-44984tj3ip\\checkpoint_002770\\checkpoint-2770')
        iter = 0

        #def update_phase(ev):
        #    ev.foreach_env(lambda e: e.set_phase(phase))

        while True:
            iter += 1
            result = trainer.train()
            if iter % 200 == 1:
                if not os.path.exists(f'./model-{iter}'):
                    trainer.get_policy('policy_01').export_model(f'./model-{iter}')
                else:
                    print("model already saved")
Example #4
def train(config, checkpoint_dir=None):
    trainer = PPOTrainer(config=config)

    if checkpoint_dir:
        trainer.load_checkpoint(checkpoint_dir)

    chk_freq = 10

    if useModelFromLowLevelTrain:
        config_low["num_workers"] = 0
        config_low["num_envs_per_worker"] = 1
        config_low["num_gpus"] = 1
        agentLow = PPOTrainer(config_low)
        agentLow.restore(
            "/home/aditya/ray_results/{}/{}/checkpoint_{}/checkpoint-{}".
            format(experiment_name, experiment_id, checkpoint_num,
                   checkpoint_num))
        lowWeight = agentLow.get_policy().get_weights()
        highWeight = trainer.get_policy("low_level_policy").get_weights()
        lowState = agentLow.get_policy().get_state()
        importedOptState = OrderedDict([
            (k.replace("default_policy", "low_level_policy"), v)
            for k, v in lowState["_optimizer_variables"].items()
        ])
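        # Note: importedPolicy below maps weights positionally (zip over the two
        # key lists), which assumes both policies define their variables in the
        # same order.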
        importedPolicy = {
            hw: lowWeight[lw]
            for hw, lw in zip(highWeight.keys(), lowWeight.keys())
        }
        importedPolicy["_optimizer_variables"] = importedOptState
        trainer.get_policy("low_level_policy").set_state(importedPolicy)
        chk_freq = 1  # Only needs to run once at the start to save the model produced by the import

    while True:
        result = trainer.train()
        tune.report(**result)
        if (trainer._iteration % chk_freq == 0):
            with tune.checkpoint_dir(
                    step=trainer._iteration) as checkpoint_dir:
                trainer.save(checkpoint_dir)
Example #5
    def train(config, checkpoint_dir=None):
        trainer = PPOTrainer(config=config, env='BomberMan-v0')
        #trainer.restore('C:\\Users\\Florian\\ray_results\\PPO_BomberMan-v0_2021-03-10_14-16-50n_4knahb\\checkpoint_002700\\checkpoint-2700')
        iter = 0

        def update_phase(ev):
            ev.foreach_env(lambda e: e.set_phase(phase))

        phase = 2
        trainer.workers.foreach_worker(update_phase)

        while True:
            iter += 1
            result = trainer.train()
            if iter % 250 == 0:
                if not os.path.exists(f'./model-{iter}'):
                    trainer.get_policy('policy_01').export_model(f'./model-{iter}')
                else:
                    print("model already saved")
            '''
            if phase == 1 and result["policy_reward_mean"]["policy_01"] > 2:
                print(f'Phase 2 now.')
                phase = 2
                trainer.workers.foreach_worker(update_phase)
            '''

            if phase == 1 and result["policy_reward_mean"]["policy_01"] > 3:
                print(f'Phase 2 now.')
                phase = 2
                trainer.workers.foreach_worker(update_phase)
                #trainer.config['gamma'] = 0.995

            if phase == 0 and result["policy_reward_mean"]["policy_01"] > 3.5:
                print(f'Phase 1 now.')
                phase = 1
                trainer.workers.foreach_worker(update_phase)
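
The update_phase helper above assumes the environment exposes a set_phase method. A hypothetical sketch of that environment-side hook (names are placeholders, not part of the original snippet):

    import gym

    class CurriculumBomberManEnv(gym.Env):
        """Hypothetical curriculum env; only the phase hook is sketched."""

        def __init__(self, env_config=None):
            self.phase = 0

        def set_phase(self, phase):
            # Called via trainer.workers.foreach_worker(
            #     lambda w: w.foreach_env(lambda e: e.set_phase(phase)))
            # to change reward shaping / difficulty during training.
            self.phase = phase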
Example #6
def main():
    ray.init()

    #  Hyperparameters of PPO are not well tuned. Most of them refer to https://github.com/xtma/pytorch_car_caring/blob/master/train.py
    trainer = PPOTrainer(env=MyEnv,
                         config={
                             "use_pytorch": True,
                             "model": {
                                 "custom_model": "mymodel",
                                 "custom_options": {
                                     'encoder_path': args.encoder_path,
                                     'train_encoder': args.train_encoder
                                 },
                                 "custom_action_dist": "mydist",
                             },
                             "env_config": {
                                 'game': 'CarRacing'
                             },
                             "num_workers": args.num_workers,
                             "num_envs_per_worker": args.num_envs_per_worker,
                             "num_gpus": args.num_gpus,
                             "use_gae": args.use_gae,
                             "batch_mode": args.batch_mode,
                             "vf_loss_coeff": args.vf_loss_coeff,
                             "vf_clip_param": args.vf_clip_param,
                             "lr": args.lr,
                             "kl_coeff": args.kl_coeff,
                             "num_sgd_iter": args.num_sgd_iter,
                             "grad_clip": args.grad_clip,
                             "clip_param": args.clip_param,
                             "rollout_fragment_length":
                             args.rollout_fragment_length,
                             "train_batch_size": args.train_batch_size,
                             "sgd_minibatch_size": args.sgd_minibatch_size
                         })

    for i in range(args.train_epochs):
        trainer.train()
        print("%d Train Done" % (i), "Save Freq: %d" % (args.model_save_freq))
        if (i + 1) % args.model_save_freq == 0:
            print("%d Episodes Done" % (i))
            weights = trainer.get_policy().get_weights()
            torch.save(weights, args.model_save_path + "%d-mode.pt" % (i + 1))
    trainer.save(args.trainer_save_path)
    print("Done All!")
    trainer.stop()
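
A hedged follow-up sketch (not part of the original): the files written with torch.save above hold plain weight dicts from get_policy().get_weights(), so they can be pushed back into a compatible trainer before it is stopped. The file name is a placeholder:

    import torch

    saved = torch.load(args.model_save_path + "100-mode.pt")  # placeholder file name
    trainer.get_policy().set_weights(saved)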
Example #7
fullpath1 = checkpoint_path + checkpoint1

checkpoint2 = "checkpoint_000005/checkpoint-5"
fullpath2 = checkpoint_path + checkpoint2

sum1a = 0
sum1b = 0
sum2a = 0
sum2b = 0


if os.path.exists(fullpath1):
    print('path FOUND!')
    print("Restoring from checkpoint path", fullpath1)
    trainer.restore(fullpath1)
    temp = trainer.get_policy().model._curiosity_feature_net
    sum1a = sum(v.sum() for k, v in trainer.get_policy().get_weights().items())
    sum1b = sum(
        v.eval(trainer.get_policy()._sess).sum() for v in trainer.get_policy().model._curiosity_feature_net.variables())
else:
    print("That path does not exist!")


if os.path.exists(fullpath2):
    print('path FOUND!')
    print("Restoring from checkpoint path", fullpath2)
    trainer2.restore(fullpath2)
    sum2a = sum(v.sum() for k, v in trainer2.get_policy().get_weights().items())
    sum2b = sum(
        v.eval(trainer2.get_policy()._sess).sum() for v in trainer2.get_policy().model._curiosity_feature_net.variables())
else:
    print("That path does not exist!")
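
A possible follow-up (not in the original) that uses the four sums to check whether the two restored checkpoints actually differ:

    print("all-weight sums:", sum1a, "vs", sum2a)
    print("curiosity feature-net sums:", sum1b, "vs", sum2b)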
Example #8
# (210, 160, 3)
prep.transform(env.reset()).shape
# (84, 84, 3)
# __preprocessing_observations_end__

# __query_action_dist_start__
# Get a reference to the policy
import numpy as np
from ray.rllib.agents.ppo import PPOTrainer

trainer = PPOTrainer(env="CartPole-v0",
                     config={
                         "framework": "tf2",
                         "num_workers": 0
                     })
policy = trainer.get_policy()
# <ray.rllib.policy.eager_tf_policy.PPOTFPolicy_eager object at 0x7fd020165470>

# Run a forward pass to get model output logits. Note that complex observations
# must be preprocessed as in the above code block.
logits, _ = policy.model({"obs": np.array([[0.1, 0.2, 0.3, 0.4]])})
# (<tf.Tensor: id=1274, shape=(1, 2), dtype=float32, numpy=...>, [])

# Compute action distribution given logits
policy.dist_class
# <class_object 'ray.rllib.models.tf.tf_action_dist.Categorical'>
dist = policy.dist_class(logits, policy.model)
# <ray.rllib.models.tf.tf_action_dist.Categorical object at 0x7fd02301d710>

# Query the distribution for samples, sample logps
dist.sample()
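
The fragment stops at dist.sample(). A hedged continuation, assuming the logp method exposed by RLlib's Categorical action distribution:

    # Sample actions and query their log-probabilities.
    actions = dist.sample()
    # <tf.Tensor: shape=(1,), dtype=int64, numpy=...>
    logps = dist.logp(actions)
    # <tf.Tensor: shape=(1,), dtype=float32, numpy=...>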
Example #9
    agentLow = PPOTrainer(config_low)
    experiment_name = "HWalk_Low_Mimic"
    experiment_id = "PPO_HumanoidBulletEnvLow-v0_699c9_00000_0_2021-04-18_22-14-39"
    checkpoint_num = "1930"
    agentLow.restore(
        "/home/aditya/ray_results/{}/{}/checkpoint_{}/checkpoint-{}".format(
            experiment_name, experiment_id, checkpoint_num, checkpoint_num))

    # agent.export_policy_model("out/model", "default_policy")
    # agent.import_model("out/model")

    # agent.get_policy("default_policy").import_model_from_h5

    agentHigh = PPOTrainer(config_hier)
    lowWeight = agentLow.get_policy().get_weights()
    highWeight = agentHigh.get_policy("low_level_policy").get_weights()
    importedPolicy = {
        hw: lowWeight[lw]
        for hw, lw in zip(highWeight.keys(), lowWeight.keys())
    }
    s1 = agentLow.get_policy().get_state()
    s11 = OrderedDict([(k.replace("default_policy", "low_level_policy"), v)
                       for k, v in s1['_optimizer_variables'].items()])
    importedPolicy['_optimizer_variables'] = s11
    agentHigh.get_policy("low_level_policy").set_state(importedPolicy)

    obs = single_env.low_level_obs_space.sample()
    print(agentLow.compute_action(obs))
    print(agentHigh.compute_action(obs, policy_id='low_level_policy'))
    print("=============================================================")
Example #10
    del config['num_workers']
    del config['num_gpus']

ray.init(num_cpus=8, num_gpus=1)
PPOagent = PPOTrainer(env=env_name, config=config)
PPOagent.restore(checkpoint_path)


reward_sum = 0
frame_list = []
i = 0
env.reset()

for agent in env.agent_iter():
    observation, reward, done, info = env.last()
    reward_sum += reward
    if done:
        action = None
    else:
        action, _, _ = PPOagent.get_policy("policy_0").compute_single_action(observation)

    env.step(action)
    i += 1
    if i % (len(env.possible_agents)+1) == 0:
        frame_list.append(PIL.Image.fromarray(env.render(mode='rgb_array')))
env.close()


print(reward_sum)
frame_list[0].save("out.gif", save_all=True, append_images=frame_list[1:], duration=3, loop=0)
Example #11
    obs_space = TrainerConfig.OBS_SPACE_CONNECT3
    print("The observation space is: ")
    print(obs_space)
    print("The action space is: ")
    act_space = TrainerConfig.ACT_SPACE_CONNECT3
    print(act_space)
    trainer_obj = PPOTrainer(config=TrainerConfig.PPO_TRAINER_CONNECT3, )
    restored_weights = []
    weights = np.load(weights_file, allow_pickle=True)
    weights_name = ["p" + str(i + 1) for i in range(weights_to_keep)]
    for name in weights_name:
        restored_weights.append(weights[()][name])
        trainer_obj.callbacks.add_weights(restored_weights[-1])

    for i, weights in enumerate(restored_weights):
        trainer_obj.get_policy("player1").set_weights(weights)

        model_to_evaluate = trainer_obj.get_policy("player1").model
        updated_weights = trainer_obj.get_policy("player1").get_weights()

        print("there are " + str(len(weights)) + " weights")

        indx = 0
        equal_weights = []
        for w1, w2 in zip(weights, updated_weights):
            if np.array_equal(w1, w2):
                equal_weights.append(indx)
            indx += 1
        print(equal_weights)

        elo_diff, model_score, minimax_score, draw = model_vs_minimax_connect3(
Example #12
ppo_config['num_workers'] = 4  # noptepochs (int) Number of epochs when optimizing the surrogate
ppo_config['clip_param'] = 0.2  # cliprange (float or callable) Clipping parameter, it can be a function
ppo_config['vf_clip_param'] = 1  # cliprange_vf (float or callable) Clipping parameter for the value function,
# it can be a function. This is a parameter specific to the OpenAI implementation. If None is passed (default), then
# cliprange (that is used for the policy) will be used. IMPORTANT: this clipping depends on the reward scaling. To
# deactivate value function clipping (and recover the original PPO implementation), you have to pass a negative value
# (e.g. -1).
ppo_config['env_config'] = env_config
ppo_config['train_batch_size'] = 4000
ppo_config['explore'] = False

PPO_agent = PPOTrainer(config=ppo_config, env=SSA_Tasker_Env)
PPO_agent.restore(ppo_checkpoint)
PPO_agent.get_policy().config['explore'] = False

logdir = '/home/ash/ray_results/ssa_experiences/agent_visible_greedy_spoiled/' + str(
    env_config['rso_count']) + 'RSOs_jones_flatten_10000episodes/'

marwil_config = MARWIL_CONFIG.copy()
marwil_config['evaluation_num_workers'] = 1
marwil_config['env_config'] = env_config
marwil_config['evaluation_interval'] = 1
marwil_config['evaluation_config'] = {'input': 'sampler'}
marwil_config['beta'] = 1  # 0
marwil_config['input'] = logdir
marwil_config['env_config'] = env_config
marwil_config['explore'] = False

MARWIL_agent = MARWILTrainer(config=marwil_config, env=SSA_Tasker_Env)
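
A hedged sketch of what typically follows (not in the original): iterating the MARWIL trainer on the offline experiences referenced by the input directory, with periodic on-policy evaluation from the 'sampler' evaluation config. The iteration count is a placeholder:

    for i in range(100):
        result = MARWIL_agent.train()
        eval_metrics = result.get("evaluation", {})
        print(i, eval_metrics.get("episode_reward_mean"))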
Example #13
def load_agent():

    # Initialize training environment

    ray.init()

    def environment_creater(params=None):
        agent = SimpleAvoidAgent(noise=0.05)
        return TronRayEnvironment(board_size=13, num_players=4)

    env = environment_creater()
    tune.register_env("tron_multi_player", environment_creater)
    ModelCatalog.register_custom_preprocessor("tron_prep", TronExtractBoard)

    # Configure Deep Q Learning with reasonable values
    config = DEFAULT_CONFIG.copy()
    config['num_workers'] = 4
    ## config['num_gpus'] = 1
    #config["timesteps_per_iteration"] = 1024
    #config['target_network_update_freq'] = 256
    #config['buffer_size'] = 100_000
    #config['schedule_max_timesteps'] = 200_000
    #config['exploration_fraction'] = 0.02
    #config['compress_observations'] = False
    #config['n_step'] = 2
    #config['seed'] = SEED

    #Configure for PPO
    #config["sample_batch_size"]= 100
    #config["train_batch_size"]=200
    #config["sgd_minibatch_size"]=60
    #Configure A3C with reasonable values

    # We will use a simple convolution network with 3 layers as our feature extractor
    config['model']['vf_share_layers'] = True
    config['model']['conv_filters'] = [(512, 5, 1), (256, 3, 2), (128, 3, 2)]
    config['model']['fcnet_hiddens'] = [256]
    config['model']['custom_preprocessor'] = 'tron_prep'

    # All of the models will use the same network as before
    agent_config = {
        "model": {
            "vf_share_layers": True,
            "conv_filters": [(512, 5, 1), (256, 3, 2), (128, 3, 2)],
            "fcnet_hiddens": [256],
            "custom_preprocessor": 'tron_prep'
        }
    }

    def policy_mapping_function(x):
        if x == '0':
            return "trainer"
        return "opponent"

    config['multiagent'] = {
        "policy_mapping_fn": policy_mapping_function,
        "policies": {
            "trainer":
            (None, env.observation_space, env.action_space, agent_config),
            "opponent":
            (None, env.observation_space, env.action_space, agent_config)
        },
        "policies_to_train": ["trainer"]
    }

    # Begin training or evaluation
    #trainer = DDPGTrainer(config, "tron_single_player")
    #trainer = A3CTrainer(config, "tron_single_player")
    #trainer = MARWILTrainer(config, "tron_single_player")
    trainer = PPOTrainer(config, "tron_multi_player")

    trainer.restore("./sp_checkpoint_1802/checkpoint-1802")

    return trainer.get_policy("trainer")
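
A hedged usage sketch for the returned policy (not part of the original; the observation is sampled from the policy's observation space as a stand-in for a preprocessed Tron board):

    policy = load_agent()
    obs = policy.observation_space.sample()  # placeholder observation
    action, _, _ = policy.compute_single_action(obs)
    print("action:", action)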
Example #14
    def execute(self):
        timesteps = 0
        best_period_value = None

        if self.pr.agent.name() == "A2C":
            trainer = A2CTrainer(config=self.rllib_config,
                                 logger_creator=rllib_logger_creator)
        elif self.pr.agent.name() == "PPO":
            trainer = PPOTrainer(config=self.rllib_config,
                                 logger_creator=rllib_logger_creator)
            # import pdb; pdb.set_trace()
        else:
            raise ValueError('There is no rllib trainer with name ' +
                             self.pr.agent.name())

        tf_writer = SummaryWriter(
            self.pr.save_logs_to) if self.pr.save_logs_to else None

        reward_metric = Metric(short_name='rews',
                               long_name='trajectory reward',
                               formatting_string='{:5.1f}',
                               higher_is_better=True)
        time_step_metric = Metric(short_name='steps',
                                  long_name='total number of steps',
                                  formatting_string='{:5.1f}',
                                  higher_is_better=True)

        metrics = [reward_metric, time_step_metric]

        if self.pr.train:
            start_time = time.time()
            policy_save_tag = 0
            while timesteps < self.pr.total_steps:

                result = trainer.train()

                timesteps = result["timesteps_total"]
                reward_metric.log(result['evaluation']['episode_reward_mean'])
                time_step_metric.log(result['evaluation']['episode_len_mean'])
                # import pdb; pdb.set_trace()
                # # Get a metric list from each environment.
                # if hasattr(trainer, "evaluation_workers"):
                #     metric_lists = sum(trainer.evaluation_workers.foreach_worker(lambda w: w.foreach_env(lambda e: e.metrics)), [])
                # else:
                #     metric_lists = sum(trainer.workers.foreach_worker(lambda w: w.foreach_env(lambda e: e.metrics)), [])

                # metrics = metric_lists[0]

                # # Aggregate metrics from all other environments.
                # for metric_list in metric_lists[1:]:
                #     for i, metric in enumerate(metric_list):
                #         metrics[i]._values.extend(metric._values)

                save_logs_to = self.pr.save_logs_to
                model_save_paths_dict = self.pr.model_save_paths_dict
                # Consider whether to save a model.
                saved = False
                if model_save_paths_dict is not None and metrics[
                        0].currently_optimal:
                    # trainer.get_policy().model.save(model_save_paths_dict)
                    policy_save_tag += 1
                    trainer.get_policy().model.save_model_in_progress(
                        model_save_paths_dict, policy_save_tag)
                    saved = True

                # Write the metrics for this reporting period.
                total_seconds = time.time() - start_time
                logger.write_and_condense_metrics(total_seconds, 'iters',
                                                  timesteps, saved, metrics,
                                                  tf_writer)

                # Clear the metrics, both those maintained by the training workers and by the evaluation ones.
                condense_fn = lambda environment: [
                    m.condense_values() for m in environment.metrics
                ]
                trainer.workers.foreach_worker(
                    lambda w: w.foreach_env(condense_fn))
                if hasattr(trainer, "evaluation_workers"):
                    trainer.evaluation_workers.foreach_worker(
                        lambda w: w.foreach_env(condense_fn))

        else:
            start_time = time.time()
            env = trainer.workers.local_worker().env
            metrics = env.metrics
            worker = trainer.workers.local_worker()
            steps = steps_since_report = 0

            while True:
                batch = worker.sample()
                current_steps = len(batch["obs"])
                steps += current_steps
                steps_since_report += current_steps

                if steps_since_report >= self.pr.reporting_interval:
                    total_seconds = time.time() - start_time

                    # Write the metrics for this reporting period.
                    logger.write_and_condense_metrics(total_seconds, 'iters',
                                                      steps, False, metrics,
                                                      tf_writer)

                    steps_since_report = 0
                    if steps >= self.pr.total_steps:
                        break

            env.close()

        # Get a summary metric for the entire stage, based on the environment's first metric.
        summary_metric = logger.summarize_stage(metrics[0])

        # Temporary workaround for https://github.com/ray-project/ray/issues/8205
        ray.shutdown()
        _register_all()

        return summary_metric
Example #15
#         results_path = os.path.split(checkpoint_path)[0]
#     else:
#         results_path = args.results_path
#     evaluator.evaluate(trainer, results_path)

###########################################################
# Visualize salient map
if args.visualize_salient_obj:
    HIGH_RES_OUTPUT = True
    if HIGH_RES_OUTPUT:
        out = cv2.VideoWriter('salient_obj_video.mp4', cv2.VideoWriter_fourcc(*"MJPG"), 30, (320, 320))
    else:
        out = cv2.VideoWriter('salient_obj_video.mp4', cv2.VideoWriter_fourcc(*"MJPG"), 30,
                              eval(config["env_config"]["resized_input_shape"]))

    model = tf.keras.models.clone_model(trainer.get_policy().model.base_model)  # type: tf.keras.Model
    env = launch_and_wrap_env(config["env_config"])
    obs_wrappers, _, _ = get_wrappers(env)
    env.reset()
    obs = env.reset()
    done = False
    while not done:
        salient_map_mean, action_dist_params = nvidia_salient_map(model, obs)
        if HIGH_RES_OUTPUT:
            render = env.render_obs()
            displayed_obs = obs_wrappers[0].observation(render)  # Clipping wrapper, shouldn't be hardcoded
            displayed_obs = cv2.resize(displayed_obs, (displayed_obs.shape[0], displayed_obs.shape[0]),
                                       interpolation=cv2.INTER_AREA)
            displayed_obs = (displayed_obs / 255.).astype(np.float32)
        else:
            displayed_obs = obs
Example #16
    if restore_ckpt:
        best_ckpt = restore_training(trainer_obj, ckpt_dir,
                                     custom_metrics_file)

    else:
        best_ckpt = 0
        print("Starting training from scratch")

    for epoch in tqdm(range(best_ckpt + 1, epochs)):
        print("Epoch " + str(epoch))
        results = trainer_obj.train()
        p1_score = results["custom_metrics"]["player1_score"]
        minimax_score = results["custom_metrics"]["player2_score"]
        score_difference = results["custom_metrics"]["score_difference"]
        actual_depth = trainer_obj.get_policy("minimax").depth

        if epoch % ckpt_step == 0 and epoch != 0:
            custom_metrics = results["custom_metrics"]
            save_checkpoint(trainer_obj, ckpt_dir, custom_metrics_file,
                            custom_metrics, ckpt_to_keep)

        if p1_score >= minimax_score:
            print("Player 1 was able to beat MiniMax algorithm with depth " +
                  str(actual_depth))
            new_depth = actual_depth + 1
            print("Increasing Minimax depth to " + str(new_depth))
            trainer_obj.get_policy("minimax").depth = new_depth
            trainer_obj.save(Config.IMPORTANT_CKPT_PATH)

            if new_depth > max_depth:
Example #17
def run_saved(args):
    if args.OSM[0] == 1 and args.OSM[1] == 0:
        setting = "RLvsOSM"
    elif args.OSM[0] == 1 and args.OSM[1] == 1:
        setting = "OSMvsOSM"
    else:
        setting = "RL{0}".format(len(args.alphas) - sum(args.honest))
    if args.save_path == 'none':
        checkpointnum = 0
    else:
        checkpointnum = args.save_path.split('-')[-1]
    env_name = "{setting}_{spirit}_{blocks}_{alpha:04d}_{spy}_{checkpointnum}".format(
        spirit=int(args.team_spirit * 100),
        blocks=int(args.blocks),
        alpha=int(args.alphas[0] * 10000),
        spy=args.spy[1],
        setting=setting,
        checkpointnum=checkpointnum)
    ray.init(local_mode=True,
             memory=700 * 1024 * 1024,
             object_store_memory=100 * 1024 * 1024,
             driver_object_store_memory=100 * 1024 * 1024)
    print("Testing {0}".format(setting), env_name)

    def select_policy(agent_id):
        return agent_id

    ModelCatalog.register_custom_model("pa_model", ParametricActionsModel)
    register_env(env_name, lambda config: ParametricBitcoin(config))

    if args.extended:
        action_n = 6
    else:
        action_n = 4
    # define the state space, one for parties that have access to spy info and one without
    spy_state_space = constants.make_spy_space(len(args.alphas), args.blocks)
    blind_state_space = constants.make_blind_space(len(args.alphas),
                                                   args.blocks)
    policies = dict()
    osm_space = spaces.Box(
        low=np.zeros(4),
        high=np.array([args.blocks + 4, args.blocks + 4, args.blocks + 4, 3.]))
    if sum(args.OSM) > 0:
        osm = OSM_strategy(
            osm_space, spaces.Discrete(4), {
                'alpha': args.alphas[0],
                'gamma': args.gammas[0],
                'blocks': args.blocks
            })

    blind_dim = 0
    for space in blind_state_space:
        blind_dim += get_preprocessor(space)(space).size

    spy_dim = 0
    for space in spy_state_space:
        spy_dim += get_preprocessor(space)(space).size

    spy_state_space_wrapped = spaces.Dict({
        "action_mask":
        spaces.Box(0, 1, shape=(action_n, )),
        "avail_actions":
        spaces.Box(-10, 10, shape=(action_n, action_n)),
        "bitcoin":
        spaces.Box(0, np.inf, shape=(spy_dim, ))
    })
    blind_state_space_wrapped = spaces.Dict({
        "action_mask":
        spaces.Box(0, 1, shape=(action_n, )),
        "avail_actions":
        spaces.Box(-10, 10, shape=(action_n, action_n)),
        "bitcoin":
        spaces.Box(0, np.inf, shape=(blind_dim, ))
    })
    preps = [None for i in range(len(args.alphas))]
    for i in range(len(args.alphas)):
        if args.spy[i] == 1:
            policies[str(i)] = (None, spy_state_space_wrapped,
                                spaces.Discrete(action_n), {
                                    "model": {
                                        "use_lstm": args.use_lstm,
                                        "custom_model": "pa_model",
                                        "custom_options": {
                                            "parties": len(args.alphas),
                                            "spy": True,
                                            "blocks": args.blocks,
                                            "extended": args.extended
                                        }
                                    }
                                })
            preps[i] = get_preprocessor(spy_state_space_wrapped)(
                spy_state_space_wrapped)
        elif args.OSM[i] == 1:
            policies[str(i)] = (OSM_strategy, osm_space, spaces.Discrete(4), {
                'alpha': args.alphas[0],
                'gamma': args.gammas[0],
                'blocks': args.blocks
            })
        elif args.honest[i] == 1:
            policies[str(i)] = (Honest, osm_space, spaces.Discrete(6), {
                'alpha': args.alphas[0],
                'gamma': args.gammas[0],
                'blocks': args.blocks,
                'fiftyone': args.fiftyone[i],
                'extended': args.extended
            })
        else:
            policies[str(i)] = (None, blind_state_space_wrapped,
                                spaces.Discrete(action_n), {
                                    "model": {
                                        "use_lstm": args.use_lstm,
                                        "custom_model": "pa_model",
                                        "custom_options": {
                                            "parties": len(args.alphas),
                                            "spy": False,
                                            "blocks": args.blocks,
                                            "extended": args.extended
                                        }
                                    }
                                })
            preps[i] = get_preprocessor(blind_state_space_wrapped)(
                blind_state_space_wrapped)
    env_config = {
        'max_hidden_block': args.blocks,
        'alphas': args.alphas,
        'gammas': args.gammas,
        'ep_length': args.ep_length,
        'print': args.debug,
        'spy': args.spy,
        'team_spirit': args.team_spirit,
        'OSM': args.OSM,
        'extended': args.extended,
        'honest': args.honest,
    }
    policies_to_train = [
        str(i) for i in range(len(args.alphas))
        if args.OSM[i] != 1 and args.honest[i] != 1
    ]
    env = ParametricBitcoin(env_config=env_config)
    if len(policies_to_train) != 0:
        if args.trainer == 'PPO':
            trainer = PPOTrainer(env=BitcoinEnv,
                                 config={
                                     "num_workers": 0,
                                     "multiagent": {
                                         "policies_to_train":
                                         policies_to_train,
                                         "policies": policies,
                                         "policy_mapping_fn": select_policy,
                                     },
                                     "env_config": env_config
                                 })
        else:
            trainer = DQNTrainer(env=env_name,
                                 config={
                                     "eager": True,
                                     "multiagent": {
                                         "policies_to_train":
                                         policies_to_train,
                                         "policies": policies,
                                         "policy_mapping_fn": select_policy,
                                     },
                                     "env_config": env_config
                                 })
            model = trainer.get_policy().model
            print(model.base_model.summary())
        print("Restoring model")
        trainer.restore(args.save_path)
    loaded_policies = dict()
    for k in range(len(args.alphas)):
        if args.OSM[k] == 1:
            loaded_policies[str(k)] = osm
        elif args.honest[k] == 1:
            honest = Honest(
                osm_space,
                spaces.Discrete(6),
                {
                    'alpha': args.alphas[0],
                    'gamma': args.gammas[0],
                    'blocks': args.blocks,
                    'fiftyone': args.fiftyone[k],
                    'extended': args.extended
                },
            )
            loaded_policies[str(k)] = honest
            preps[k] = None
        else:
            loaded_policies[str(k)] = trainer.get_policy(str(k))
    trials = 100000
    reslist = []
    for j in range(3):
        blocks = np.zeros(len(args.alphas) + 1)
        event_blocks = np.zeros(len(args.alphas) + 1)
        action_dist = {
            str(i): np.zeros(action_n)
            for i in range(len(args.alphas))
        }
        res = dict()
        for i in range(trials):
            obs = env.reset()
            isDone = False
            RNNstates = {str(i): [] for i in range(len(args.alphas))}
            while not isDone:
                action_dict = dict()
                for k in range(len(policies)):
                    prep = preps[k]
                    if not prep:
                        action_dict[str(k)], _, _ = loaded_policies[str(
                            k)].compute_single_action(obs=obs[str(k)],
                                                      state=[])
                    else:
                        action_dict[str(k)], _, _ = loaded_policies[str(
                            k)].compute_single_action(obs=prep.transform(
                                obs[str(k)]),
                                                      state=[])
                    action_dist[str(k)][action_dict[str(k)]] += 1
                obs, _, done, _ = env.step(action_dict)
                isDone = done['__all__']
            if i == 0 and j == 0:
                with open(
                        os.path.join('/afs/ece/usr/charlieh/eval_results',
                                     env_name + '_trace.txt'), 'w+') as f:
                    f.write(env.wrapped._debug_string)
            blocks += env.wrapped._accepted_blocks
            event_blocks += env.wrapped._total_blocks
            total_event_blocks = np.sum(event_blocks)
            if i % 100 == 0:
                print("Relative rewards", blocks / np.sum(blocks))
                print("Relative received", event_blocks / total_event_blocks)
                for p in range(len(args.alphas)):
                    print("Action dist", str(p),
                          action_dist[str(p)] / np.sum(action_dist[str(p)]))
        res['blocks'] = blocks
        res['action dist'] = action_dist
        res['blocks norm'] = blocks / np.sum(blocks)
        res['actions norm'] = {
            str(i): action_dist[str(i)] / np.sum(action_dist[str(i)])
            for i in range(len(args.alphas))
        }
        reslist.append(res)
    np.save(os.path.join('/afs/ece/usr/charlieh/eval_results', env_name),
            reslist,
            allow_pickle=True)
Example #18
# Set up env
ray.init(**config["ray_init_config"])
register_env('Duckietown', launch_and_wrap_env)

###########################################################
# Restore agent
trainer = PPOTrainer(config=config["rllib_config"])
trainer.restore(checkpoint_path)

print_config(trainer.config)

###########################################################
# Visualize
HIGH_RES_OUTPUT = True
model = tf.keras.models.clone_model(
    trainer.get_policy().model.base_model)  # type: tf.keras.Model

cap = cv2.VideoCapture('./docs/Real.mp4')

# fourcc = cv2.VideoWriter_fourcc(*'FMP4')
fps = cap.get(cv2.CAP_PROP_FPS)
if HIGH_RES_OUTPUT:
    out = cv2.VideoWriter('salient_obj_video.mp4',
                          cv2.VideoWriter_fourcc(*"MJPG"), fps, (320, 320))
else:
    out = cv2.VideoWriter('salient_obj_video.mp4',
                          cv2.VideoWriter_fourcc(*"MJPG"), fps,
                          eval(config["env_config"]["resized_input_shape"]))

dummy_env = wrap_env(config["env_config"])
obs_wrappers, _, _ = get_wrappers(dummy_env)
Example #19
        # restoring checkpoints require ray
        ray.init()
        # best_ckpt=restore_training(trainer_obj, ckpt_dir,custom_metrics_file)
        with open(Config.MINIMAX_DEPTH_PATH) as json_file:
            data = json.load(json_file)
            minimax_depth = 3  #data["minimax_depth"]

        # restore weights from a previous run
        restored_weights = []
        weights = np.load(weights_file, allow_pickle=True)
        weights_name = ["p" + str(i + 1) for i in range(weights_to_keep)]
        for name in weights_name:
            restored_weights.append(weights[()][name])
            trainer_obj.callbacks.add_weights(restored_weights[-1])
        # give player 1 the best weights
        trainer_obj.get_policy("player1").set_weights(restored_weights[-1])

        ray.shutdown()

    else:
        best_ckpt = 0
        minimax_depth = 1
        print("Starting training from scratch")

    # Import moved here; otherwise there are version-compatibility issues when
    # using the log_creator.
    import tensorflow as tf

    number_of_stochastic_moves = 5

    logdir = str(trainer_obj._logdir)
Example #20
#update_percentage = update_times * 0.01
epoch_update = 0

for epoch in range(num_epoch):
    print("Training iteration: {}".format(epoch), end='\t')
    res = trainer.train()
    win_percentage = (res["policy_reward_mean"]["trainer"] -
                      res["episode_len_mean"]) / 11 - 10 / 11 + 1
    print("Win percentage: ", win_percentage, end='\t')
    print("Average reward: ", res["policy_reward_mean"]["trainer"])
    update_percentage = update_times * 0.01
    if win_percentage > 0.72 + update_percentage or win_percentage > 0.82:
        #    and res["policy_reward_mean"]["trainer"] > 18 + update_times:
        if epoch_update == 0:
            epoch_update = epoch

        if epoch >= epoch_update + 5:
            update_times += 1
            epoch_update = epoch
            print("UPDATING OPPONENTS")
            trainer_weights = trainer.get_policy("trainer").get_weights()
            trainer.get_policy("opponent").set_weights(trainer_weights)
            reward = env.test(trainer)
    if epoch % save_epochs == 0:
        trainer.save()
    #print(res)
    #print("Average reward: ", res["policy_reward_mean"]["trainer"] )

    if epoch % 1 == 0:
        reward = env.test(trainer)
Example #21
 # Serving and training loop
 env = trainer.env_creator({})
 # obs_state = {}
 # obs_state["obs"] = obs[list(obs.keys())[0]]
 player1 = Connect4Config.PLAYER1
 player1_id = Connect4Config.PLAYER1_ID
 player2 = Connect4Config.PLAYER2
 player2_id = Connect4Config.PLAYER2_ID
 actual_player = player1
 actual_player_id = player1_id
 obs = env.reset(player1_id)
 obs = {"obs": obs[actual_player]}
 action_dict = {}
 while True:
     # action, state, info_trainer = trainer.get_policy(actual_player).compute_single_action(obs)#compute_action(obs[actual_player],policy_id=actual_player,explore=False)#, full_fetch=True)
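     # forward() returns raw action logits; taking the argmax below picks the
     # greedy (deterministic) move for the current player.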
     action_logits, _ = trainer.get_policy(actual_player).model.forward(
         obs, None, None)
     action = np.argmax(action_logits[0])
     action_dict = {actual_player: action}
     print("Player " + str(actual_player_id + 1) + " picked column: " +
           str(action + 1))
     obs, reward, done, info = env.step(action_dict)
     print(env)
     if done["__all__"]:
         print("Player " + str(actual_player_id + 1) + " WON!!!!!!")
         obs = env.reset()
         break
     if actual_player == player1:
         actual_player = player2
         actual_player_id = player2_id
     else:
         actual_player = player1
Example #22
import os

import ray
import ray.tune as tune
from ray.tune import sample_from
from fast_image_env import FastImageEnv
from fast_model import TorchFastModel, TorchCustomFastModel
from ray.rllib.models import ModelCatalog
from ray.rllib.agents.ppo import PPOTrainer

if __name__ == "__main__":

    ray.shutdown()
    ray.init()

    config = {
        "env": FastImageEnv,
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": 1,
        "num_workers": 4,
        "framework": "torch",
    }

    trainer = PPOTrainer(config=config)
    print(trainer.get_policy().model)
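    # The trainer built above is only used to inspect the model; the actual
    # training run is delegated to tune.run() below.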

    results = tune.run("PPO", config=config, verbose=3)
    print(results)

    ray.shutdown()
Example #23
    TOTAL_STEPS = int(args.total_steps)
    launch_script = "./launchClient_quiet.sh"

    register_env(ENV_NAME, create_env)

    # update config with evaluation resources and switch exploration off
    config = get_config(checkpoint_file)
    config["num_workers"] = args.num_workers
    config["num_gpus"] = args.num_gpus
    config["explore"] = False

    # Load agent
    ray.init()
    trainer = PPOTrainer(config)
    trainer.restore(checkpoint_file)
    policy = trainer.get_policy()

    # Start Malmo instances
    GAME_INSTANCE_PORTS = [COMMAND_PORT + i for i in range(NUM_WORKERS)]
    instances = launch_minecraft(GAME_INSTANCE_PORTS,
                                 launch_script=launch_script)

    # Connect to the Java instances
    env = create_env(config)

    # Custom evaluation loop
    print(f"running evaluations for {EPISODES} episodes")
    for ep in range(EPISODES):
        state = env.reset()
        done = False
        ep_length = 0
Example #24
    },
    "observation_filter": "NoFilter",
    "clip_actions": False,
    "framework": "torch"
},
                       env="MinerEnv-v0")

id = 2050
checkpoint_dir = "/home/lucius/ray_results/gold_miner_2/PPO_MinerEnv-v0_0_2020-09-13_00-54-26q3mjnpej"
checkpoint = "{}/checkpoint_{}/checkpoint-{}".format(checkpoint_dir, id, id)

ppo_agent.restore(checkpoint)

for i in range(8):
    mem_size = 0
    weights = ppo_agent.get_policy(f"policy_{i}").get_weights()
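    # Count this policy's parameters and convert each numpy array to a torch
    # tensor before saving.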
    for key in weights:
        parameters = 1
        for value in weights[key].shape:
            parameters *= value

        mem_size += parameters

        weights[key] = torch.tensor(weights[key])
    print(mem_size)
    torch.save(
        weights,
        f"/home/lucius/working/projects/gold_miner/resources/TrainedModels/model_{i}.pt"
    )

Example #25
    # model = FourthModel(constants.OBS_SPACE, constants.ACT_SPACE, 6, {}, "model", constants.NUM_FEATURES)
    lstm_weights = np.load(best_weights_npy,allow_pickle=True)
    number_of_evaluation_games = Config.NUMBER_OF_EVALUATION_GAMES #  100 
    number_of_games_to_test = Config.NUMBER_OF_GAMES_TO_TEST #[1,2,3,4,5]
    depth_list = Config.DEPTH_LIST # [1,4,6]
    number_of_stochastic_moves = 6
    sequence_len = lstm_timesteps
    
    npy_weights_file = os.path.join(data_dir,"weights.npy")
    weights = np.load(npy_weights_file,allow_pickle=True)[()]  
    
    play = True
    
    trainer_obj = PPOTrainer(
        config=TrainerConfig.PPO_TRAINER_CONNECT3,
    )
    model = trainer_obj.get_policy("player1").model

    # =============================================================================
    # TEST THE MODEL 
    # =============================================================================
    import tensorflow as tf 
    lstm_model = LSTM_model(batch_size,(lstm_timesteps,features_len),output_len,lstm_hidden,False)  
    
    # generate a fake input to define the model stucture and then load the weights 
    # [batch,timestep,features]
    # random_input = np.random.rand(1,lstm_timesteps,features_len)
    random_input = np.random.rand(1,lstm_timesteps,features_len)
    random_input = random_input.astype('float32')
    lstm_model(random_input)
    lstm_model.set_weights(lstm_weights[()])
    
Example #26
}

ppo_trainer_config = {
    "env": "ParametricScopone",
    "multiagent": {
        "policies_to_train": ["ppo_policy_nico"],
        "policies":
        policies,
        "policy_mapping_fn":
        lambda agent_id: "ppo_policy_albi"
        if agent_id in ("player_1", "player_3") else "ppo_policy_nico",
    },
    "observation_filter": "NoFilter",
    "callbacks": PlayerScoreCallbacks
}

trainer = PPOTrainer(config=ppo_trainer_config)
if restore_checkpoint:
    trainer.restore(checkpoint_path)

trainer.get_policy("ppo_policy_albi").model.base_model.summary()
trainer.get_policy("ppo_policy_nico").model.base_model.summary()

for i in range(10000):
    res = trainer.train()
    print("Iteration {}. policy_reward_mean: {}".format(
        i, res['policy_reward_mean']))
    if i % checkpoint_every == 0:
        trainer.save()

print('Training finished, check the results in ~/ray_results/<dir>/')