Example #1
}

ppo_trainer_config = {
    "env": "ParametricScopone",
    "multiagent": {
        "policies_to_train": ["ppo_policy_nico"],
        "policies":
        policies,
        "policy_mapping_fn":
        lambda agent_id: "ppo_policy_albi"
        if agent_id in ("player_1", "player_3") else "ppo_policy_nico",
    },
    "observation_filter": "NoFilter",
    "callbacks": PlayerScoreCallbacks
}

trainer = PPOTrainer(config=ppo_trainer_config)
if restore_checkpoint:
    trainer.restore(checkpoint_path)

trainer.get_policy("ppo_policy_albi").model.base_model.summary()
trainer.get_policy("ppo_policy_nico").model.base_model.summary()

for i in range(10000):
    res = trainer.train()
    print("Iteration {}. policy_reward_mean: {}".format(
        i, res['policy_reward_mean']))
    if i % checkpoint_every == 0:
        trainer.save()

print('Training finished, check the results in ~/ray_results/<dir>/')
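
# After training, a saved checkpoint can be reloaded into a fresh trainer built
# from the same config and queried per policy. Minimal sketch, assuming
# `checkpoint_path` points at one of the checkpoints written by trainer.save()
# above; the randomly sampled observation is only a placeholder (a parametric
# env would normally supply real observations).
eval_trainer = PPOTrainer(config=ppo_trainer_config)
eval_trainer.restore(checkpoint_path)
obs = eval_trainer.get_policy("ppo_policy_albi").observation_space.sample()
print(eval_trainer.compute_action(obs, policy_id="ppo_policy_albi"))
print(eval_trainer.compute_action(obs, policy_id="ppo_policy_nico"))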
Example #2
        return out, []

    @override(ModelV2)
    def value_function(self):
        return tf.reshape(self._value_out, [-1])


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    ModelCatalog.register_custom_model("bn_model", BatchNormModel)

    config = {
        "env": "Pendulum-v0" if args.run == "DDPG" else "CartPole-v0",
        "model": {
            "custom_model": "bn_model",
        },
        "num_workers": 0,
    }

    from ray.rllib.agents.ppo import PPOTrainer
    trainer = PPOTrainer(config=config)
    trainer.train()

    tune.run(
        args.run,
        stop={"training_iteration": args.num_iters},
        config=config,
    )
Example #3
        "my_model",
        TorchMobileV2PlusRNNModel if args.torch else MobileV2PlusRNNModel)

    # Configure our Trainer.
    config = {
        "framework": "torch" if args.torch else "tf",
        "model": {
            "custom_model": "my_model",
            # Extra config passed to the custom model's c'tor as kwargs.
            "custom_model_config": {
                "cnn_shape": cnn_shape_torch if args.torch else cnn_shape,
            },
            "max_seq_len": 20,
        },
        "vf_share_layers": True,
        "num_workers": 0,  # no parallelism
        "env_config": {
            "action_space":
            Discrete(2),
            # Test a simple Image observation space.
            "observation_space":
            Box(0.0,
                1.0,
                shape=cnn_shape_torch if args.torch else cnn_shape,
                dtype=np.float32)
        },
    }

    trainer = PPOTrainer(config=config, env=RandomEnv)
    print(trainer.train())
Example #4
def my_train_fn(config, reporter):
    iterations = config.pop("train-iterations", 10)

    # Train for n iterations with high LR
    agent1 = PPOTrainer(env="CartPole-v0", config=config)
    for _ in range(iterations):
        result = agent1.train()
        result["phase"] = 1
        reporter(**result)
        phase1_time = result["timesteps_total"]
    state = agent1.save()
    agent1.stop()

    # Train for n iterations with low LR
    config["lr"] = 0.0001
    agent2 = PPOTrainer(env="CartPole-v0", config=config)
    agent2.restore(state)
    for _ in range(iterations):
        result = agent2.train()
        result["phase"] = 2
        result["timesteps_total"] += phase1_time  # keep time moving forward
        reporter(**result)
    agent2.stop()
Example #5
    # If not None, clip gradients during optimization at this value.
    agent_cfg["grad_clip"] = None

# ================= Configure the learning algorithm =================

# agent_cfg = copy.deepcopy(DEFAULT_CONFIG)
# agent_cfg["lr"] = 5.0e-6
# agent_cfg["lr_schedule"] = [
#     [      0, 5.0e-6],
#     [ 100000, 1.0e-6],
#     [ 400000, 1.0e-6],
#     [ 800000, 1.0e-7],
#     [1000000, 1.0e-7],
#     [1200000, 1.0e-9],
# ]
train_agent = Trainer(agent_cfg, env="my_custom_env")

# ================= Run the optimization =================

timesteps_total = 400000
results_fields_filter = [
    "training_iteration", "time_total_s", "timesteps_total",
    "episode_reward_max", "episode_reward_mean",
    [
        "info",
        [
            "sample_time_ms", "grad_time_ms", "opt_peak_throughput",
            "sample_peak_throughput"
        ]
    ]
]
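
# The snippet stops before the loop that actually consumes these settings. A
# plausible continuation (sketch only), assuming `results_fields_filter` is
# meant to reduce each train() result to the listed fields, with nested
# entries of the form [key, [subkeys]] selecting keys inside sub-dicts:
result = {}
while result.get("timesteps_total", 0) < timesteps_total:
    result = train_agent.train()
    for field in results_fields_filter:
        if isinstance(field, str):
            print(f"{field}: {result.get(field)}")
        else:
            key, subkeys = field
            sub = result.get(key, {})
            for subkey in subkeys:
                print(f"{key}/{subkey}: {sub.get(subkey)}")
print("Done after", result.get("timesteps_total"), "timesteps")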
Example #6
    # Train the "main" policy to play really well using self-play.
    results = None
    if not args.from_checkpoint:
        results = tune.run(
            "PPO",
            config=config,
            stop=stop,
            checkpoint_at_end=True,
            checkpoint_freq=10,
            verbose=1)

    # Restore trained trainer (set to non-explore behavior) and play against
    # human on command line.
    if args.num_episodes_human_play > 0:
        num_episodes = 0
        trainer = PPOTrainer(config=dict(config, **{"explore": False}))
        if args.from_checkpoint:
            trainer.restore(args.from_checkpoint)
        else:
            trainer.restore(results.get_last_checkpoint())

        # Play from the command line against the trained agent
        # in an actual (non-RLlib-wrapped) open-spiel env.
        human_player = 1
        env = Environment("connect_four")

        while num_episodes < args.num_episodes_human_play:
            print("You play as {}".format("o" if human_player else "x"))
            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
Example #7
    omega_r, omega_l = u_r / k_r_inv, u_l / k_l_inv
    vel = (omega_r + omega_l) * radius / 2
    angle = (omega_r * radius - vel) / (0.5 * baseline)
    angle_backup = (vel - omega_l * radius) / (0.5 * baseline)
    # print("angle is {}, angle backup is {}".format(angle, angle_backup))
    assert math.isclose(angle, angle_backup)
    return np.array([vel, angle])


# Set up env
ray.init(**config["ray_init_config"])
register_env('Duckietown', launch_and_wrap_env)

###########################################################
# Restore agent
trainer = PPOTrainer(config=config["rllib_config"])
trainer.restore(checkpoint_path)

print_config(trainer.config)

# add seed to env config
seed = args.seed

actions = []

###########################################################
###########################################################
# Simple demonstration of closed loop performance
if not (args.analyse_trajectories or args.visualize_salient_obj
        or args.reward_plots or args.visualize_dot_trajectories):
    # env = Monitor(env, "gym_monitor_results", write_upon_reset=True, force=True)
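    # The demonstration loop itself is cut off here. A minimal closed-loop
    # rollout sketch (assumes launch_and_wrap_env, registered above, accepts
    # the env_config dict from the training configuration):
    env = launch_and_wrap_env(config["rllib_config"].get("env_config", {}))
    obs = env.reset()
    done, episode_reward = False, 0.0
    while not done:
        action = trainer.compute_action(obs)
        actions.append(action)
        obs, reward, done, info = env.step(action)
        episode_reward += reward
    print("Closed-loop episode reward:", episode_reward)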
Example #8
        'num_workers': 0,
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": select_policy,
            },
        "clip_actions": True,
        "framework": "torch",
        #"num_sgd_iter": 4,
        "lr": 1e-4,
        #"kl_target": 0.03,
        #"train_batch_size": 1024,
        "rollout_fragment_length": 100,
        #"sgd_minibatch_size": 32
    }

    trainer = PPOTrainer(env="wanderer_roborobo", config=config)
    print(trainer.config.get('no_final_linear'))
    print('model built')
    stop_iter = 2000

    #%%
    import numpy as np
    for i in range(stop_iter):
        print("== Iteration", i, "==")
        result_ppo = trainer.train()
        print(pretty_print(result_ppo))
        if (i+1) % 200 == 0:
            trainer.save('model')
    trainer.save('model')
    del trainer
    ray.shutdown()
Example #9
    ray.init(address='auto')  # address=None when running locally; address='auto' when running on AWS.
    obs_state_processor = SimpleObsStateProcessor(pathogen_sorting_strategy=infected_population_sorting_per_city)
    act_state_processor = SimpleActStateProcessor(sort_pathogens=obs_state_processor.sort_pathogens)

    # Notice that trial_max will only work for stochastic policies
    register_env("ic20env",
                 lambda _: SimplifiedIC20Environment(obs_state_processor, act_state_processor, UnstableReward(),
                                                     trial_max=10))
    ten_gig = 10737418240

    trainer = PPOTrainer(
        env="ic20env",
        config=merge_dicts(DEFAULT_CONFIG, {
            # -- Rollout-Worker
            'num_gpus': 0,
            'num_workers': 5,
            "num_envs_per_worker": 1,
            "num_cpus_per_worker": 1,
            "memory_per_worker": ten_gig,
            'gamma': 0.99,
        }))

    # Attempt to restore from checkpoint if possible.
    if os.path.exists(CHECKPOINT_FILE):
        checkpoint_path = open(CHECKPOINT_FILE).read()
        print("Restoring from checkpoint path", checkpoint_path)
        trainer.restore(checkpoint_path)

    # Serving and training loop
    while True:
        print(pretty_print(trainer.train()))
Example #10
    # Train the "main" policy to play really well using self-play.
    results = None
    if not args.from_checkpoint:
        results = tune.run("PPO",
                           config=config,
                           stop=stop,
                           checkpoint_at_end=True,
                           checkpoint_freq=10,
                           verbose=3)

    # Restore trained trainer (set to non-explore behavior) and play against
    # human on command line.
    if args.num_episodes_human_play > 0:
        num_episodes = 0
        trainer = PPOTrainer(config=dict(config, **{"explore": False}))
        if args.from_checkpoint:
            trainer.restore(args.from_checkpoint)
        else:
            checkpoint = results.get_last_checkpoint()
            if not checkpoint:
                raise ValueError("No last checkpoint found in results!")
            trainer.restore(checkpoint)

        # Play from the command line against the trained agent
        # in an actual (non-RLlib-wrapped) open-spiel env.
        human_player = 1
        env = Environment(args.env)

        while num_episodes < args.num_episodes_human_play:
            print("You play as {}".format("o" if human_player else "x"))
Example #11
    def testLocal(self):
        ray.init(local_mode=True)
        cf = DEFAULT_CONFIG.copy()
        agent = PPOTrainer(cf, "CartPole-v0")
        print(agent.train())
Example #12
                                         "type": "EpsilonGreedy",
                                         "initial_epsilon": 1.0,
                                         "final_epsilon": 0.02,
                                         "epsilon_timesteps": 1000,
                                     },
                                     "learning_starts": 100,
                                     "timesteps_per_iteration": 200,
                                     "log_level": "INFO",
                                     "framework": args.framework,
                                 }))
    elif args.run == "PPO":
        # Example of using PPO (does NOT support off-policy actions).
        trainer = PPOTrainer(env=env,
                             config=dict(
                                 connector_config, **{
                                     "sample_batch_size": 1000,
                                     "train_batch_size": 4000,
                                     "framework": args.framework,
                                 }))
    else:
        raise ValueError("--run must be DQN or PPO")

    checkpoint_path = CHECKPOINT_FILE.format(args.run)

    # Attempt to restore from checkpoint if possible.
    if os.path.exists(checkpoint_path):
        checkpoint_path = open(checkpoint_path).read()
        print("Restoring from checkpoint path", checkpoint_path)
        trainer.restore(checkpoint_path)

    # Serving and training loop
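    # (The loop body is cut off in this snippet. A minimal sketch of what such
    # a loop typically does: train, checkpoint, and persist the checkpoint path
    # so the restore logic above can find it on the next run. Assumes
    # pretty_print is imported from ray.tune.logger.)
    while True:
        print(pretty_print(trainer.train()))
        checkpoint = trainer.save()
        print("Last checkpoint", checkpoint)
        with open(CHECKPOINT_FILE.format(args.run), "w") as f:
            f.write(checkpoint)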
Example #13
                 [9, 101, 100, 11, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4]),
             spaces.MultiDiscrete(
                 [9, 101, 100, 11, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4]),
             spaces.MultiDiscrete(
                 [9, 101, 100, 11, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4]),
             spaces.MultiDiscrete(
                 [9, 101, 100, 11, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4]),
             spaces.MultiDiscrete(
                 [9, 101, 100, 11, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4]),
             spaces.MultiDiscrete(
                 [9, 101, 100, 11, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4, heroId, 4])
             ))
DEFAULT_CONFIG["env_config"]["action_space"] = spaces.MultiDiscrete([7, 9, 9])

ray.init()
trainer = PPOTrainer(config=DEFAULT_CONFIG, env=RandomEnv)
trainer2 = PPOTrainer(config=DEFAULT_CONFIG, env=RandomEnv)

checkpoint_path = "checkpoints/"
checkpoint1 = "checkpoint_000001/checkpoint-1"
fullpath1 = checkpoint_path + checkpoint1

checkpoint2 = "checkpoint_000005/checkpoint-5"
fullpath2 = checkpoint_path + checkpoint2

sum1a = 0
sum1b = 0
sum2a = 0
sum2b = 0
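
# The snippet ends before the checkpoints are used. A hedged continuation
# sketch: load one snapshot into each trainer and compare them. What the
# sum1a/sum1b/sum2a/sum2b accumulators measure is not shown in the snippet.
import numpy as np  # in case numpy is not already imported above

trainer.restore(fullpath1)
trainer2.restore(fullpath2)

# get_weights() is assumed to return a dict of numpy arrays for the default
# policy; the diff just gives a rough sense of how far apart the snapshots are.
w1 = trainer.get_policy().get_weights()
w2 = trainer2.get_policy().get_weights()
diff = sum(
    float(np.abs(a - b).sum())
    for a, b in zip(list(w1.values()), list(w2.values())))
print("Total absolute weight difference between the two checkpoints:", diff)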

Example #14
    def test_minibatch_sequencing(self):
        ModelCatalog.register_custom_model("rnn", RNNSpyModel)
        register_env("counter", lambda _: DebugCounterEnv())
        ppo = PPOTrainer(
            env="counter",
            config={
                "shuffle_sequences": False,  # for deterministic testing
                "num_workers": 0,
                "rollout_fragment_length": 20,
                "train_batch_size": 20,
                "sgd_minibatch_size": 10,
                "num_sgd_iter": 1,
                "model": {
                    "custom_model": "rnn",
                    "max_seq_len": 4,
                    "vf_share_layers": True,
                },
                "framework": "tf",
            })
        ppo.train()
        ppo.train()

        # first epoch: 20 observations get split into 2 minibatches of 8
        # four observations are discarded
        batch0 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0"))
        batch1 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1"))
        if batch0["sequences"][0][0][0] > batch1["sequences"][0][0][0]:
            batch0, batch1 = batch1, batch0  # sort minibatches
        self.assertEqual(batch0["seq_lens"].tolist(), [4, 4, 2])
        self.assertEqual(batch1["seq_lens"].tolist(), [2, 3, 4, 1])
        check(batch0["sequences"], [
            [[0], [1], [2], [3]],
            [[4], [5], [6], [7]],
            [[8], [9], [0], [0]],
        ])
        check(batch1["sequences"], [
            [[10], [11], [0], [0]],
            [[12], [13], [14], [0]],
            [[0], [1], [2], [3]],
            [[4], [0], [0], [0]],
        ])

        # second epoch: 20 observations get split into 2 minibatches of 8
        # four observations are discarded
        batch2 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_2"))
        batch3 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_3"))
        if batch2["sequences"][0][0][0] > batch3["sequences"][0][0][0]:
            batch2, batch3 = batch3, batch2
        self.assertEqual(batch2["seq_lens"].tolist(), [4, 4, 2])
        self.assertEqual(batch3["seq_lens"].tolist(), [4, 4, 2])
        check(batch2["sequences"], [
            [[0], [1], [2], [3]],
            [[4], [5], [6], [7]],
            [[8], [9], [0], [0]],
        ])
        check(batch3["sequences"], [
            [[5], [6], [7], [8]],
            [[9], [10], [11], [12]],
            [[13], [14], [0], [0]],
        ])
Example #15
        done = self.episode_len >= 10
        # r = -abs(obs - action)
        reward = -sum(abs(self.cur_obs - action))
        # Set a new observation (random sample).
        self.cur_obs = self.observation_space.sample()
        return self.cur_obs, reward, done, {}


# Create an RLlib Trainer instance to learn how to act in the above
# environment.
trainer = PPOTrainer(
    config={
        # Env class to use (here: our gym.Env sub-class from above).
        "env": ParrotEnv,
        # Config dict to be passed to our custom env's constructor.
        "env_config": {
            "parrot_shriek_range": gym.spaces.Box(-5.0, 5.0, (1, ))
        },
        # Parallelize environment rollouts.
        "num_workers": 3,
    })

# Train for n iterations and report results (mean episode rewards).
# Since we have to guess 10 times and the optimal reward is 0.0
# (exact match between observation and action value),
# we can expect to reach an optimal episode reward of 0.0.
for i in range(5):
    results = trainer.train()
    print(f"Iter: {i}; avg. reward={results['episode_reward_mean']}")

# Perform inference (action computations) based on given env observations.
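# (The snippet is cut off here. A minimal continuation sketch, assuming
#  ParrotEnv takes the same env_config dict passed to the trainer above.)
env = ParrotEnv({"parrot_shriek_range": gym.spaces.Box(-5.0, 5.0, (1, ))})
obs = env.reset()
done = False
total_reward = 0.0
while not done:
    action = trainer.compute_action(obs)
    obs, reward, done, info = env.step(action)
    total_reward += reward
print(f"Played one episode, total reward = {total_reward}")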
Example #16
        if agent_id % 2 == 0:
            return "ppo_policy"
        else:
            return "dqn_policy"

    ppo_trainer = PPOTrainer(
        env="multi_agent_cartpole",
        config={
            "multiagent": {
                "policies": policies,
                "policy_mapping_fn": policy_mapping_fn,
                "policies_to_train": ["ppo_policy"],
            },
            "model": {
                "vf_share_layers": True,
            },
            "num_sgd_iter": 6,
            "vf_loss_coeff": 0.01,
            # disable filters, otherwise we would need to synchronize those
            # as well to the DQN agent
            "observation_filter": "MeanStdFilter",
            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
            "framework": args.framework,
        },
    )

    dqn_trainer = DQNTrainer(
        env="multi_agent_cartpole",
        config={
            "multiagent": {
Example #17
    }

    # Logger
    logger.configure(
        experiment_name=f"Bertrand_competition_discrete_k_{k}_trainer_{trainer_choice}")

    if trainer_choice != 'QL':
        register_env('Bertrand', lambda env_config: env)
        ray.init(num_cpus=4)

        for _ in range(epochs):
            if trainer_choice == 'DQN':
                trainer = DQNTrainer(config=config, env='Bertrand')
            elif trainer_choice == 'PPO':
                trainer = PPOTrainer(config=config, env='Bertrand')
            elif trainer_choice == 'A3C':
                trainer = A3CTrainer(config=config, env='Bertrand')

            result = trainer.train()
            # TODO: this is not working because env does not store prices.
            # Fix it or find a better way to evaluate.
            log_stats(env)
    else:
        # Q-learning

        players = ['agent_' + str(i) for i in range(num_agents)]

        # Hyperparameters
        alpha = 0.05
        beta = 0.2
Example #18
class KandboxAgentRLLibPPO(KandboxAgentPlugin):
    title = "Kandbox Plugin - Agent - realtime - by rllib ppo"
    slug = "ri_agent_rl_ppo"
    author = "Kandbox"
    author_url = "https://github.com/qiyangduan"
    description = "RLLibPPO for GYM for RL."
    version = "0.1.0"
    default_config = {
        "nbr_of_actions": 4,
        "max_epochs": 1000,
        "nbr_of_days_planning_window": 6,
        "model_path": "default_model_path",
        "working_dir": "/tmp",
        "checkpoint_path_key": "ppo_checkpoint_path",
    }
    config_form_spec = {
        "type": "object",
        "properties": {},
    }

    def __init__(self, agent_config, kandbox_config):
        self.agent_config = agent_config
        self.current_best_episode_reward_mean = -99

        env_config = agent_config["env_config"]

        if "rules_slug_config_list" not in env_config.keys():
            if "rules" not in env_config.keys():
                log.error("no rules_slug_config_list and no rules ")
            else:
                env_config["rules_slug_config_list"] = [
                    [rule.slug, rule.config] for rule in env_config["rules"]
                ]
                env_config.pop("rules", None)

        # self.env_class = env_class = agent_config["env"]

        self.kandbox_config = self.default_config.copy()
        self.kandbox_config.update(kandbox_config)

        # self.trained_model = trained_model
        self.kandbox_config["create_datetime"] = datetime.now()

        # self.trainer = None
        self.env_config = env_config
        # self.load_model(env_config=self.env_config)
        print(
            f"KandboxAgentRLLibPPO __init__ called, at time {self.kandbox_config['create_datetime']}"
        )
        # import pdb

        # pdb.set_trace()
        if not ray.is_initialized():
            ray.init(ignore_reinit_error=True, log_to_driver=False)
        # ray.init(redis_address="localhost:6379")

    def build_model(self):

        trainer_config = DEFAULT_CONFIG.copy()

        trainer_config["num_workers"] = 0
        # trainer_config["train_batch_size"] = 640
        # trainer_config["sgd_minibatch_size"] = 160
        # trainer_config["num_sgd_iter"] = 100

        trainer_config["exploration_config"] = {
            "type": "Random",
        }
        # EpsilonGreedy(Exploration):
        # trainer_config["exploration_config"] = {
        #         "type": "Curiosity",
        #         "eta": 0.2,
        #         "lr": 0.001,
        #         "feature_dim": 128,
        #         "feature_net_config": {
        #             "fcnet_hiddens": [],
        #             "fcnet_activation": "relu",
        #         },
        #         "sub_exploration": {
        #             "type": "StochasticSampling",
        #         }
        #     }

        # trainer_config["log_level"] = "DEBUG"
        """
        if env_config is not None:
            for x in env_config.keys():
                trainer_config[x] = env_config[x]
        """

        # trainer_config["env_config"] = copy.deepcopy(env_config)  #  {"rules": "qiyang_role"}

        trainer_config.update(self.agent_config)

        self.trainer = PPOTrainer(trainer_config, self.agent_config["env"])
        # self.config["trainer"] = self.trainer
        return self.trainer

    def load_model(self):  # , allow_empty = None
        env_config = self.agent_config["env_config"]
        self.trainer = self.build_model()

        # if (model_path is not None) & (os.path.exists(model_path)):
        if "ppo_checkpoint_path" in env_config.keys():
            # raise FileNotFoundError("can not find model at path: {}".format(model_path))
            if os.path.exists(env_config["ppo_checkpoint_path"]):
                self.trainer.restore(env_config["ppo_checkpoint_path"])
                print("Reloaded model from path: {} ".format(
                    env_config["ppo_checkpoint_path"]))

            else:
                print(
                    "Env_config has ppo_checkpoint_path = {}, but no files found. I am returning an initial model"
                    .format(env_config["ppo_checkpoint_path"]))

        else:
            print(
                "Env_config has no ppo_checkpoint_path, returning an initial model"
            )
        # self.config["model_path"] = model_path
        # self.config["trainer"] = self.trainer
        # self.config["policy"] = self.trainer.workers.local_worker().get_policy()
        self.policy = self.trainer.workers.local_worker().get_policy()
        return self.trainer

    def train_model(self):

        # self.trainer = self.build_model()
        for i in range(self.kandbox_config["max_epochs"]):
            result = self.trainer.train()
            # print(pretty_print(result))
            print(
                "Finished training iteration {}, Result: episodes_this_iter:{}, policy_reward_max: {}, episode_reward_max {}, episode_reward_mean {}, info.num_steps_trained: {}..."
                .format(
                    i,
                    result["episodes_this_iter"],
                    result["policy_reward_max"],
                    result["episode_reward_max"],
                    result["episode_reward_mean"],
                    result["info"]["num_steps_trained"],
                ))
            if result[
                    "episode_reward_mean"] > self.current_best_episode_reward_mean * 1.1:
                model_path = self.save_model()
                print(
                    "Model is saved after 10 percent increase, episode_reward_mean = {},  file = {}"
                    .format(result["episode_reward_mean"], model_path))
                self.current_best_episode_reward_mean = result[
                    "episode_reward_mean"]

        return self.save_model()

    def save_model(self):

        checkpoint_dir = "{}/model_checkpoint_org_{}_team_{}".format(
            self.agent_config["env_config"]["working_dir"],
            self.agent_config["env_config"]["org_code"],
            self.agent_config["env_config"]["team_id"],
        )
        _path = self.trainer.save(checkpoint_dir=checkpoint_dir)

        # exported_model_dir = "{}/exported_ppo_model_org_{}_team_{}".format(
        #     self.agent_config["env_config"]["working_dir"], self.agent_config["env_config"]["org_code"], self.agent_config["env_config"]["team_id"]
        # )
        # self.trainer.get_policy().export_model(exported_model_dir + "/1")

        return _path  # self.trainer

    def predict_action(self, observation=None):

        action = self.trainer.compute_action(observation)
        return action

    def predict_action_list(self, env=None, job_code=None, observation=None):
        actions = []
        if env is not None:
            self.env = env
        else:
            env = self.env

        if job_code is None:
            job_i = env.current_job_i
        else:
            job_i = env.jobs_dict[job_code].job_index

        observation = env._get_observation()

        # export_dir = "/Users/qiyangduan/temp/kandbox/exported_ppo_model_org_duan3_team_3/1"
        # loaded_policy = tf.saved_model.load(export_dir)
        # loaded_policy.signatures["serving_default"](observations=observation)

        predicted_action = self.trainer.compute_action(observation)
        # V predicted_action = self.policy.compute_action(observation)

        for _ in range(len(env.workers)):  # hist_job_workers_ranked:
            if len(actions) >= self.config["nbr_of_actions"]:
                return actions
            actions.append(list(predicted_action).copy())
            max_i = np.argmax(predicted_action[0:len(env.workers)])
            predicted_action[max_i] = 0

        return actions

    def predict_action_dict_list(self,
                                 env=None,
                                 job_code=None,
                                 observation=None):
        if env is not None:
            self.env = env
        else:
            env = self.env

        curr_job = copy.deepcopy(env.jobs_dict[job_code])

        if job_code is None:
            job_i = env.current_job_i
        else:
            job_i = curr_job.job_index
            env.current_job_i = job_i

        observation = env._get_observation()

        action = self.predict_action(observation=observation)
        action_dict = env.decode_action_into_dict_native(action=action)

        action_day = int(action_dict.scheduled_start_minutes / 1440)
        curr_job.requested_start_min_minutes = action_day * 1440
        curr_job.requested_start_max_minutes = (action_day + 1) * 1440

        action_dict_list = self.env.recommendation_server.search_action_dict_on_worker_day(
            a_worker_code_list=action.scheduled_worker_codes,
            curr_job=curr_job,
            max_number_of_matching=3,
        )
        return action_dict_list
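

# Usage sketch with hypothetical placeholder values; the real agent_config is
# supplied by the surrounding Kandbox application (registered env id, spaces,
# scheduling rules, etc.).
agent = KandboxAgentRLLibPPO(
    agent_config={
        "env": "kandbox_env",            # hypothetical registered env id
        "env_config": {
            "working_dir": "/tmp",       # hypothetical
            "org_code": "org_demo",      # hypothetical
            "team_id": 1,                # hypothetical
            "rules_slug_config_list": [],
        },
    },
    kandbox_config={"max_epochs": 10},
)
agent.load_model()                       # builds the trainer, restores a checkpoint if available
best_checkpoint = agent.train_model()    # trains and checkpoints on improvement
print("Checkpoint saved to", best_checkpoint)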
Example #19
        done = self._counter >= self._horizon
        return self.state, reward, done, {}

    def reset(self):
        self.state = np.random.normal(0, 1, 2)
        self._counter = 0
        return self.state


trainer_config = DEFAULT_CONFIG.copy()
trainer_config['num_workers'] = 1
trainer_config["train_batch_size"] = 64
trainer_config["sgd_minibatch_size"] = 64
trainer_config["num_sgd_iter"] = 10

trainer = PPOTrainer(trainer_config, MyEnv)
for i in range(50):
    print("Training iteration {}...".format(i))
    trainer.train()

cumulative_reward_list = []
M = 100
for rep in range(M):
    env = MyEnv({})
    state = env.reset()

    done = False
    cumulative_reward = 0
    while not done:
        action = trainer.compute_action(state)
        #print(action, state)
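        # (Continuation sketch; the original snippet is cut off here.)
        state, reward, done, _ = env.step(action)
        cumulative_reward += reward
    cumulative_reward_list.append(cumulative_reward)

print("Mean cumulative reward over {} episodes: {}".format(
    M, sum(cumulative_reward_list) / M))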
Example #20
env_tmp = RLCardWrapped(None)
policies = {
    "ppo_policy_1": (PPOTFPolicy, env_tmp.observation_space,
                     env_tmp.action_space, ppo_trainer_config),
    "rand_policy":
    (RandomPolicy, env_tmp.observation_space, env_tmp.action_space, {}),
}

# Instantiate the PPO trainer eval
trainer_eval = PPOTrainer(
    config={
        "env": rlcard_env_id,
        "multiagent": {
            "policies_to_train": ['ppo_policy_1'],
            "policies":
            policies,
            "policy_mapping_fn":
            lambda agent_id: "ppo_policy_1"
            if agent_id == "player_1" else "rand_policy",
        },
        # "num_gpus": 0.5,
    })

trainer = PPOTrainer(
    config={
        "env": rlcard_env_id,
        "multiagent": {
            "policies_to_train": ['ppo_policy_1'],
            "policies": policies,
            "policy_mapping_fn": lambda agent_id: "ppo_policy_1",
        },
Example #21
    def testPPOSampleWaste(self):
        ray.init(num_cpus=4)

        # Check we at least collect the initial wave of samples
        ppo = PPOTrainer(env="CartPole-v0",
                         config={
                             "sample_batch_size": 200,
                             "train_batch_size": 128,
                             "num_workers": 3,
                         })
        ppo.train()
        self.assertEqual(ppo.optimizer.num_steps_sampled, 600)
        ppo.stop()

        # Check we collect at least the specified amount of samples
        ppo = PPOTrainer(env="CartPole-v0",
                         config={
                             "sample_batch_size": 200,
                             "train_batch_size": 900,
                             "num_workers": 3,
                         })
        ppo.train()
        self.assertEqual(ppo.optimizer.num_steps_sampled, 1000)
        ppo.stop()

        # Check in vectorized mode
        ppo = PPOTrainer(env="CartPole-v0",
                         config={
                             "sample_batch_size": 200,
                             "num_envs_per_worker": 2,
                             "train_batch_size": 900,
                             "num_workers": 3,
                         })
        ppo.train()
        self.assertEqual(ppo.optimizer.num_steps_sampled, 1200)
        ppo.stop()

        # Check legacy mode
        ppo = PPOTrainer(env="CartPole-v0",
                         config={
                             "sample_batch_size": 200,
                             "train_batch_size": 128,
                             "num_workers": 3,
                             "straggler_mitigation": True,
                         })
        ppo.train()
        self.assertEqual(ppo.optimizer.num_steps_sampled, 200)
        ppo.stop()
Example #22
        "lr": 0.0001,  # started at 0.0001
        "num_sgd_iter": 5,
        "vf_loss_coeff": 0.001,
        "log_level": "WARN",
        "train_batch_size": 512,
        "sgd_minibatch_size": 32,
        "clip_param": 0.3,
        "vf_clip_param": 10.0
    }

    last_improve = 150

    iteration = 22
    improved = 0
    while True:
        trainer = PPOTrainer(env="fire_mage", config=rnn_config)
        print(dir(trainer))
        #trainer.restore('./checkpoints_flush/checkpoint_379/checkpoint-379')

        step = 0
        best_val = 0.0
        if False:
            save_0 = trainer.save_to_object()
        while True:
            if False:
                save_0 = trainer.save_to_object()
                result = trainer.train()
                while result['episode_reward_mean'] > best_val:
                    print('UPENING')
                    best_save = deepcopy(save_0)
                    best_val = result['episode_reward_mean']
Example #23
        trainer.get_policy("low_level_policy").set_state(importedPolicy)
        chk_freq = 1  # Only needed once at the start, to save the imported model

    while True:
        result = trainer.train()
        tune.report(**result)
        if (trainer._iteration % chk_freq == 0):
            with tune.checkpoint_dir(
                    step=trainer._iteration) as checkpoint_dir:
                trainer.save(checkpoint_dir)


if __name__ == "__main__":
    ray.init(ignore_reinit_error=True)
    config_hier["multiagent"]["policies_to_train"] = ["high_level_policy"]
    resources = PPOTrainer.default_resource_request(config_hier).to_json()
    tune.run(
        train,
        name="HWalk_Hier_Mimic",
        # resume=resume,
        restore=
        "/home/aditya/ray_results/{}/{}/checkpoint_{}/checkpoint_{}/checkpoint-{}"
        .format(
            experiment_name_hier,
            experiment_id_hier,
            checkpoint_num_hier,
            checkpoint_num_hier,
            checkpoint_num_hier,
        ) if resumeFromCheckpoint else "",
        stop={"episode_reward_mean": 10000},
        config=config_hier,
Example #24
        phase1_time = result["timesteps_total"]
    state = agent1.save()
    agent1.stop()

    # Train for n iterations with low LR
    config["lr"] = 0.0001
    agent2 = PPOTrainer(env="CartPole-v0", config=config)
    agent2.restore(state)
    for _ in range(iterations):
        result = agent2.train()
        result["phase"] = 2
        result["timesteps_total"] += phase1_time  # keep time moving forward
        reporter(**result)
    agent2.stop()


if __name__ == "__main__":
    ray.init()
    args = parser.parse_args()
    config = {
        # Special flag signalling `my_train_fn` how many iters to do.
        "train-iterations": 2,
        "lr": 0.01,
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "num_workers": 0,
        "framework": args.framework,
    }
    resources = PPOTrainer.default_resource_request(config)
    tune.run(my_train_fn, resources_per_trial=resources, config=config)
Example #25
def train(config, checkpoint_dir=None):
    trainer = PPOTrainer(config=config)

    if checkpoint_dir:
        trainer.load_checkpoint(checkpoint_dir)

    chk_freq = 10

    if useModelFromLowLevelTrain:
        config_low["num_workers"] = 0
        config_low["num_envs_per_worker"] = 1
        config_low["num_gpus"] = 1
        agentLow = PPOTrainer(config_low)
        agentLow.restore(
            "/home/aditya/ray_results/{}/{}/checkpoint_{}/checkpoint-{}".
            format(experiment_name, experiment_id, checkpoint_num,
                   checkpoint_num))
        lowWeight = agentLow.get_policy().get_weights()
        highWeight = trainer.get_policy("low_level_policy").get_weights()
        lowState = agentLow.get_policy().get_state()
        importedOptState = OrderedDict([
            (k.replace("default_policy", "low_level_policy"), v)
            for k, v in lowState["_optimizer_variables"].items()
        ])
        importedPolicy = {
            hw: lowWeight[lw]
            for hw, lw in zip(highWeight.keys(), lowWeight.keys())
        }
        importedPolicy["_optimizer_variables"] = importedOptState
        trainer.get_policy("low_level_policy").set_state(importedPolicy)
        chk_freq = 1  # Only needed once at the start, to save the imported model

    while True:
        result = trainer.train()
        tune.report(**result)
        if (trainer._iteration % chk_freq == 0):
            with tune.checkpoint_dir(
                    step=trainer._iteration) as checkpoint_dir:
                trainer.save(checkpoint_dir)
Example #26
                                  config["entropy_coeff_schedule"])
    warmup_steps = config["model"]["custom_options"].get(
        "warmup_steps", 100000)
    TransformerLearningRateSchedule.__init__(
        policy, config["model"]["custom_options"]["transformer"]["num_heads"],
        warmup_steps)


TTFPPOPolicy = PPOTFPolicy.with_updates(name="TTFPPOPolicy",
                                        before_loss_init=setup_mixins,
                                        mixins=[
                                            TransformerLearningRateSchedule,
                                            EntropyCoeffSchedule, KLCoeffMixin,
                                            ValueNetworkMixin
                                        ])

TTFPPOPolicyInfer = PPOTFPolicy.with_updates(name="TTFPPOPolicyInfer",
                                             before_loss_init=setup_mixins,
                                             mixins=[
                                                 LearningRateSchedule,
                                                 EntropyCoeffSchedule,
                                                 KLCoeffMixin,
                                                 ValueNetworkMixin
                                             ])

register_trainable(
    "TTFPPO",
    PPOTrainer.with_updates(name="TTFPPOTrainer",
                            get_policy_class=lambda c: TTFPPOPolicy),
)
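
# Once registered under the name "TTFPPO", the custom trainer can be launched
# through Tune like any built-in algorithm. Sketch only: `ttf_config` is a
# hypothetical stand-in for the full PPO config (env, transformer model with
# its custom_options, etc.) that the original script builds elsewhere.
from ray import tune

ttf_config = {}  # hypothetical; must contain the env and transformer settings
tune.run(
    "TTFPPO",
    stop={"training_iteration": 100},
    config=ttf_config,
)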
                    "exploration_config": {
                        "type": "EpsilonGreedy",
                        "initial_epsilon": 1.0,
                        "final_epsilon": 0.02,
                        "epsilon_timesteps": 1000,
                    },
                    # "learning_starts": 100,
                    # "timesteps_per_iteration": 200,
                    # "log_level": "INFO",
                }))
    elif args.run == "PPO":
        # Example of using PPO (does NOT support off-policy actions).
        trainer = PPOTrainer(
            env="srv",
            config=dict(
                connector_config, **{
                    "sample_batch_size": 1000,
                    "train_batch_size": 4000,
                }))
    else:
        raise ValueError("--run must be DQN or PPO")

    checkpoint_path = CHECKPOINT_FILE.format(args.run)

    # Attempt to restore from checkpoint if possible.
    if os.path.exists(checkpoint_path):
        checkpoint_path = open(checkpoint_path).read()
        print("Restoring from checkpoint path", checkpoint_path)
        trainer.restore(checkpoint_path)

    # Serving and training loop