Example #1
    def test_old_configs(self):
        """Tests creating various Trainers (Algorithms) using 1.10 config dicts."""
        from ray.rllib.tests.backward_compat.old_ppo import DEFAULT_CONFIG
        from ray.rllib.agents.ppo import PPOTrainer

        config = DEFAULT_CONFIG.copy()
        trainer = PPOTrainer(config=config, env="CartPole-v0")
        trainer.train()
        trainer.stop()
Example #2
def Hunter_trainer(config, reporter):
    multi_hunter_trainer = PPOTrainer(env=MultiHunterEnv, config=config)
    for _ in range(100):
        environment.simulate()  # `environment` comes from the surrounding module (not shown in this snippet)
        result = multi_hunter_trainer.train()
        result["phase"] = 1
        reporter(**result)
        phase1_time = result["timesteps_total"]
    state = multi_hunter_trainer.save()
    multi_hunter_trainer.stop()
Example #3
def main():
    ray.init()

    # PPO hyperparameters are not well tuned; most of them follow https://github.com/xtma/pytorch_car_caring/blob/master/train.py
    trainer = PPOTrainer(env=MyEnv,
                         config={
                             "use_pytorch": True,
                             "model": {
                                 "custom_model": "mymodel",
                                 "custom_options": {
                                     'encoder_path': args.encoder_path,
                                     'train_encoder': args.train_encoder
                                 },
                                 "custom_action_dist": "mydist",
                             },
                             "env_config": {
                                 'game': 'CarRacing'
                             },
                             "num_workers": args.num_workers,
                             "num_envs_per_worker": args.num_envs_per_worker,
                             "num_gpus": args.num_gpus,
                             "use_gae": args.use_gae,
                             "batch_mode": args.batch_mode,
                             "vf_loss_coeff": args.vf_loss_coeff,
                             "vf_clip_param": args.vf_clip_param,
                             "lr": args.lr,
                             "kl_coeff": args.kl_coeff,
                             "num_sgd_iter": args.num_sgd_iter,
                             "grad_clip": args.grad_clip,
                             "clip_param": args.clip_param,
                             "rollout_fragment_length":
                             args.rollout_fragment_length,
                             "train_batch_size": args.train_batch_size,
                             "sgd_minibatch_size": args.sgd_minibatch_size
                         })

    for i in range(args.train_epochs):
        trainer.train()
        print("%d Train Done" % (i), "Save Freq: %d" % (args.model_save_freq))
        if (i + 1) % args.model_save_freq == 0:
            print("%d Episodes Done" % (i))
            weights = trainer.get_policy().get_weights()
            torch.save(weights, args.model_save_path + "%d-mode.pt" % (i + 1))
    trainer.save(args.trainer_save_path)
    print("Done All!")
    trainer.stop()
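
The policy weights written with torch.save above can later be pushed back into a freshly built trainer. A minimal sketch, assuming the same model config and the MyEnv class from the surrounding module; the checkpoint filename is a placeholder:

import torch
from ray.rllib.agents.ppo import PPOTrainer

# Rebuild a trainer whose model matches the one the weights were saved from
# (the full config from main() above is omitted here for brevity).
restored_trainer = PPOTrainer(env=MyEnv, config={"use_pytorch": True})

# Load the torch-saved weight dict and push it into the local policy.
weights = torch.load("checkpoints/100-mode.pt")
restored_trainer.get_policy().set_weights(weights)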
Example #4
def my_train_fn(config, reporter):
    # Train for n iterations with high LR
    agent1 = PPOTrainer(env="CartPole-v0", config=config)
    for _ in range(10):
        result = agent1.train()
        result["phase"] = 1
        reporter(**result)
        phase1_time = result["timesteps_total"]
    state = agent1.save()
    agent1.stop()

    # Train for n iterations with low LR
    config["lr"] = 0.0001
    agent2 = PPOTrainer(env="CartPole-v0", config=config)
    agent2.restore(state)
    for _ in range(10):
        result = agent2.train()
        result["phase"] = 2
        result["timesteps_total"] += phase1_time  # keep time moving forward
        reporter(**result)
    agent2.stop()
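
A function trainable with the (config, reporter) signature, such as my_train_fn, is normally handed to Tune directly. A minimal launch sketch; the resource request and hyperparameter values here are illustrative assumptions, not taken from the original:

import ray
from ray import tune

ray.init()
# Run the custom training function as a single Tune trial.
tune.run(
    my_train_fn,
    config={"lr": 0.01, "num_workers": 0},
    resources_per_trial={"cpu": 1},
)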
Example #5
    def test_ppo_sample_waste(self):
        # Check we at least collect the initial wave of samples
        ppo = PPOTrainer(env="CartPole-v0",
                         config={
                             "rollout_fragment_length": 200,
                             "train_batch_size": 128,
                             "num_workers": 3,
                         })
        result = ppo.train()
        self.assertEqual(result["info"]["num_steps_sampled"], 600)
        ppo.stop()

        # Check we collect at least the specified amount of samples
        ppo = PPOTrainer(env="CartPole-v0",
                         config={
                             "rollout_fragment_length": 200,
                             "train_batch_size": 900,
                             "num_workers": 3,
                         })
        result = ppo.train()
        self.assertEqual(result["info"]["num_steps_sampled"], 1200)
        ppo.stop()

        # Check in vectorized mode
        ppo = PPOTrainer(env="CartPole-v0",
                         config={
                             "rollout_fragment_length": 200,
                             "num_envs_per_worker": 2,
                             "train_batch_size": 900,
                             "num_workers": 3,
                         })
        result = ppo.train()
        self.assertEqual(result["info"]["num_steps_sampled"], 1200)
        ppo.stop()
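
The asserted sample counts follow from simple worker arithmetic. A standalone sketch of the reasoning behind the first two assertions, using the numbers from the configs above:

# Each sampling wave returns one fragment per worker.
num_workers, rollout_fragment_length = 3, 200
steps_per_wave = num_workers * rollout_fragment_length
assert steps_per_wave == 600  # the "initial wave" checked first

# A 900-step train batch is not covered by a single 600-step wave,
# so a second wave is collected, giving 1200 sampled steps in total.
waves_needed = -(-900 // steps_per_wave)  # ceiling division
assert waves_needed * steps_per_wave == 1200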
Example #6
    def testPPOSampleWaste(self):
        ray.init(num_cpus=4)

        # Check we at least collect the initial wave of samples
        ppo = PPOTrainer(env="CartPole-v0",
                         config={
                             "sample_batch_size": 200,
                             "train_batch_size": 128,
                             "num_workers": 3,
                         })
        ppo.train()
        self.assertEqual(ppo.optimizer.num_steps_sampled, 600)
        ppo.stop()

        # Check we collect at least the specified amount of samples
        ppo = PPOTrainer(env="CartPole-v0",
                         config={
                             "sample_batch_size": 200,
                             "train_batch_size": 900,
                             "num_workers": 3,
                         })
        ppo.train()
        self.assertEqual(ppo.optimizer.num_steps_sampled, 1000)
        ppo.stop()

        # Check in vectorized mode
        ppo = PPOTrainer(env="CartPole-v0",
                         config={
                             "sample_batch_size": 200,
                             "num_envs_per_worker": 2,
                             "train_batch_size": 900,
                             "num_workers": 3,
                         })
        ppo.train()
        self.assertEqual(ppo.optimizer.num_steps_sampled, 1200)
        ppo.stop()

        # Check legacy mode
        ppo = PPOTrainer(env="CartPole-v0",
                         config={
                             "sample_batch_size": 200,
                             "train_batch_size": 128,
                             "num_workers": 3,
                             "straggler_mitigation": True,
                         })
        ppo.train()
        self.assertEqual(ppo.optimizer.num_steps_sampled, 200)
        ppo.stop()
Example #7
    def test_ppo_sample_waste(self):
        # Check we at least collect the initial wave of samples
        ppo = PPOTrainer(
            env="CartPole-v0",
            config={
                "sample_batch_size": 200,
                "train_batch_size": 128,
                "num_workers": 3,
            })
        ppo.train()
        self.assertEqual(ppo.optimizer.num_steps_sampled, 600)
        ppo.stop()

        # Check we collect at least the specified amount of samples
        ppo = PPOTrainer(
            env="CartPole-v0",
            config={
                "sample_batch_size": 200,
                "train_batch_size": 900,
                "num_workers": 3,
            })
        ppo.train()
        self.assertEqual(ppo.optimizer.num_steps_sampled, 1000)
        ppo.stop()

        # Check in vectorized mode
        ppo = PPOTrainer(
            env="CartPole-v0",
            config={
                "sample_batch_size": 200,
                "num_envs_per_worker": 2,
                "train_batch_size": 900,
                "num_workers": 3,
            })
        ppo.train()
        self.assertEqual(ppo.optimizer.num_steps_sampled, 1200)
        ppo.stop()
Example #8
agent_cfg["shuffle_sequences"] = True  # Whether to shuffle sequences in the batch when training
agent_cfg["grad_clip"] = None  # Clamp the norm of the gradient during optimization (None to disable)

# ====================== Run the optimization ======================

agent_cfg["lr"] = 1.0e-4
agent_cfg["lr_schedule"] = None

train_agent = Trainer(agent_cfg, "env", logger_creator)
checkpoint_path = train(train_agent, max_timesteps=100000)

# ===================== Enjoy the trained agent ======================

test_agent = Trainer(agent_cfg, "env", logger_creator)
test_agent.restore(checkpoint_path)
test(test_agent, explore=False)

# =================== Terminate Ray backend ====================

train_agent.stop()
test_agent.stop()
ray.shutdown()
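
The train and test helpers used above are project-specific. A hypothetical minimal version of train that matches how it is called here (iterate until a timestep budget, then return a checkpoint path) could look like:

def train(agent, max_timesteps):
    """Run training iterations until max_timesteps, then checkpoint."""
    timesteps = 0
    while timesteps < max_timesteps:
        result = agent.train()
        timesteps = result["timesteps_total"]
    return agent.save()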

Example #9
    def test_local(self):
        cf = DEFAULT_CONFIG.copy()
        for _ in framework_iterator(cf):
            agent = PPOTrainer(cf, "CartPole-v0")
            print(agent.train())
            agent.stop()
Example #10
    # Create a new dummy Trainer to "fix" our checkpoint.
    new_trainer = PPOTrainer(config=config)
    # Get untrained weights for all policies.
    untrained_weights = new_trainer.get_weights()
    # Restore all policies from checkpoint.
    new_trainer.restore(best_checkpoint)
    # Set back all weights (except for 1st agent) to original
    # untrained weights.
    new_trainer.set_weights(
        {pid: w
         for pid, w in untrained_weights.items() if pid != "policy_0"})
    # Create the checkpoint from which tune can pick up the
    # experiment.
    new_checkpoint = new_trainer.save()
    new_trainer.stop()
    print(".. checkpoint to restore from (all policies reset, "
          f"except policy_0): {new_checkpoint}")

    print("Starting new tune.run")

    # Start our actual experiment.
    stop = {
        "episode_reward_mean": args.stop_reward,
        "timesteps_total": args.stop_timesteps,
        "training_iteration": args.stop_iters,
    }

    # Make sure, the non-1st policies are not updated anymore.
    config["multiagent"]["policies_to_train"] = [
        pid for pid in policy_ids if pid != "policy_0"
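
The patched config and the stop dict then feed the follow-up experiment. A sketch of the tune.run call this snippet leads into, using only standard tune.run parameters and the variables defined above:

from ray import tune

# Resume the experiment from the prepared checkpoint.
results = tune.run(
    "PPO",
    config=config,
    stop=stop,
    restore=new_checkpoint,
    verbose=1,
)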
Example #11
    #                     else random.choice(["fb_1", "fb_2"])
    #         },
    #     },
    # )

    trainer = PPOTrainer(
        env="rcrs_env",
        config={
            "env": "rcrs_env",
            "num_workers": 1,
            "multiagent": {
                "policies": {
                    "fb_1": (None, obs_space, act_space, {}),
                    "fb_2": (None, obs_space, act_space, {}),
                },
                "policy_mapping_fn":
                lambda agent_id: "fb_1" if agent_id.startswith("fb_1_") else
                random.choice(["fb_1", "fb_2"])
            },
        },
    )

    for i in range(2):
        result = trainer.train()
        print(pretty_print(result))
        if i % 1 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)
    state = trainer.save()
    trainer.stop()
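
The checkpoint returned by trainer.save() can be loaded back into an identically configured trainer via Trainer.restore. A brief sketch, assuming the inline multi-agent config above has been bound to a config variable:

# Rebuild the trainer with the same config, then restore the saved state.
restored_trainer = PPOTrainer(env="rcrs_env", config=config)
restored_trainer.restore(checkpoint)
print(pretty_print(restored_trainer.train()))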