Example #1
    def test_preprocessing_disabled(self):
        config = ppo.DEFAULT_CONFIG.copy()
        config["seed"] = 42
        config["env"] = "ray.rllib.examples.env.random_env.RandomEnv"
        config["env_config"] = {
            "config": {
                "observation_space":
                Dict({
                    "a":
                    Discrete(5),
                    "b":
                    Dict({
                        "ba": Discrete(4),
                        "bb": Box(-1.0, 1.0, (2, 3), dtype=np.float32),
                    }),
                    "c":
                    Tuple((MultiDiscrete([2, 3]), Discrete(1))),
                    "d":
                    Box(-1.0, 1.0, (1, ), dtype=np.int32),
                }),
            },
        }
        # Set this to True to enforce no preprocessors being used.
        # Complex observations now arrive directly in the model as
        # structures of batches, e.g. {"a": tensor, "b": [tensor, tensor]}
        # for obs-space=Dict(a=..., b=Tuple(..., ...)).
        config["_disable_preprocessor_api"] = True
        # Speed things up a little.
        config["train_batch_size"] = 100
        config["sgd_minibatch_size"] = 10
        config["rollout_fragment_length"] = 5
        config["num_sgd_iter"] = 1

        num_iterations = 1
        # Only supported for tf so far.
        for _ in framework_iterator(config):
            trainer = ppo.PPOTrainer(config=config)
            for i in range(num_iterations):
                results = trainer.train()
                check_train_results(results)
                print(results)
            check_compute_single_action(trainer)
            trainer.stop()
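
As the comment above notes, with `_disable_preprocessor_api=True` the model receives observations in their original nested structure instead of a flattened vector. A quick way to see that structure (a standalone sketch, assuming gym and numpy are installed) is to sample the same observation space directly:

import numpy as np
from gym.spaces import Box, Dict, Discrete, MultiDiscrete, Tuple

# Same observation space as in the test above.
space = Dict({
    "a": Discrete(5),
    "b": Dict({
        "ba": Discrete(4),
        "bb": Box(-1.0, 1.0, (2, 3), dtype=np.float32),
    }),
    "c": Tuple((MultiDiscrete([2, 3]), Discrete(1))),
    "d": Box(-1.0, 1.0, (1,), dtype=np.int32),
})

# Prints a nested dict/tuple, which mirrors the (batched) structure the model's
# forward() receives when the preprocessor API is disabled.
print(space.sample())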
Example #2
    def test_traj_view_lstm_prev_actions_and_rewards(self):
        """Tests, whether Policy/Model return correct LSTM ViewRequirements.
        """
        config = ppo.DEFAULT_CONFIG.copy()
        config["model"] = config["model"].copy()
        # Activate LSTM + prev-action + rewards.
        config["model"]["use_lstm"] = True
        config["model"]["lstm_use_prev_action"] = True
        config["model"]["lstm_use_prev_reward"] = True

        for _ in framework_iterator(config):
            trainer = ppo.PPOTrainer(config, env="CartPole-v0")
            policy = trainer.get_policy()
            view_req_model = policy.model.view_requirements
            view_req_policy = policy.view_requirements
            # 7=obs, prev-a + r, 2x state-in, 2x state-out.
            assert len(view_req_model) == 7, view_req_model
            assert len(view_req_policy) == 20,\
                (len(view_req_policy), view_req_policy)
            for key in [
                    SampleBatch.OBS, SampleBatch.ACTIONS, SampleBatch.REWARDS,
                    SampleBatch.DONES, SampleBatch.NEXT_OBS,
                    SampleBatch.VF_PREDS, SampleBatch.PREV_ACTIONS,
                    SampleBatch.PREV_REWARDS, "advantages", "value_targets",
                    SampleBatch.ACTION_DIST_INPUTS, SampleBatch.ACTION_LOGP
            ]:
                assert key in view_req_policy

                if key == SampleBatch.PREV_ACTIONS:
                    assert view_req_policy[key].data_col == SampleBatch.ACTIONS
                    assert view_req_policy[key].shift == -1
                elif key == SampleBatch.PREV_REWARDS:
                    assert view_req_policy[key].data_col == SampleBatch.REWARDS
                    assert view_req_policy[key].shift == -1
                elif key not in [
                        SampleBatch.NEXT_OBS, SampleBatch.PREV_ACTIONS,
                        SampleBatch.PREV_REWARDS
                ]:
                    assert view_req_policy[key].data_col is None
                else:
                    assert view_req_policy[key].data_col == SampleBatch.OBS
                    assert view_req_policy[key].shift == 1
            trainer.stop()
Example #3
def get_rl_agent(agent_name, config, env_to_agent):
    if agent_name == A2C:
        import ray.rllib.agents.a3c as a2c
        agent = a2c.A2CTrainer(config=config, env=env_to_agent)
    elif agent_name == A3C:
        import ray.rllib.agents.a3c as a3c
        agent = a3c.A3CTrainer(config=config, env=env_to_agent)
    elif agent_name == BC:
        import ray.rllib.agents.marwil as bc
        agent = bc.BCTrainer(config=config, env=env_to_agent)
    elif agent_name == DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.DQNTrainer(config=config, env=env_to_agent)
    elif agent_name == APEX_DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.ApexTrainer(config=config, env=env_to_agent)
    elif agent_name == IMPALA:
        import ray.rllib.agents.impala as impala
        agent = impala.ImpalaTrainer(config=config, env=env_to_agent)
    elif agent_name == MARWIL:
        import ray.rllib.agents.marwil as marwil
        agent = marwil.MARWILTrainer(config=config, env=env_to_agent)
    elif agent_name == PG:
        import ray.rllib.agents.pg as pg
        agent = pg.PGTrainer(config=config, env=env_to_agent)
    elif agent_name == PPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.PPOTrainer(config=config, env=env_to_agent)
    elif agent_name == APPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.APPOTrainer(config=config, env=env_to_agent)
    elif agent_name == SAC:
        import ray.rllib.agents.sac as sac
        agent = sac.SACTrainer(config=config, env=env_to_agent)
    elif agent_name == LIN_UCB:
        import ray.rllib.contrib.bandits.agents.lin_ucb as lin_ucb
        agent = lin_ucb.LinUCBTrainer(config=config, env=env_to_agent)
    elif agent_name == LIN_TS:
        import ray.rllib.contrib.bandits.agents.lin_ts as lin_ts
        agent = lin_ts.LinTSTrainer(config=config, env=env_to_agent)
    else:
        raise Exception("Not valid agent name")
    return agent
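
The if/elif dispatch above can equivalently be written as a lookup table. Below is a possible sketch, assuming the agent-name constants (A2C, A3C, DQN, PPO, SAC, ...) are defined in the same module as above; the registry and helper names are hypothetical, not part of the original code:

import importlib

# Hypothetical registry: agent-name constant -> (module path, trainer class name).
_TRAINER_REGISTRY = {
    A2C: ("ray.rllib.agents.a3c", "A2CTrainer"),
    A3C: ("ray.rllib.agents.a3c", "A3CTrainer"),
    DQN: ("ray.rllib.agents.dqn", "DQNTrainer"),
    PPO: ("ray.rllib.agents.ppo", "PPOTrainer"),
    SAC: ("ray.rllib.agents.sac", "SACTrainer"),
}

def get_rl_agent_from_registry(agent_name, config, env_to_agent):
    try:
        module_path, cls_name = _TRAINER_REGISTRY[agent_name]
    except KeyError:
        raise Exception("Invalid agent name: {}".format(agent_name))
    # Import lazily, exactly like the if/elif version does.
    trainer_cls = getattr(importlib.import_module(module_path), cls_name)
    return trainer_cls(config=config, env=env_to_agent)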
Example #4
    def test(self, algo, path, lr, fc_hid, fc_act):
        """Test trained agent for a single episode. Return the episode reward"""
        # instantiate env class
        unused_shared = []
        unused_own = []
        unsatisfied_shared = []
        unsatisfied_own = []

        episode_reward = 0
        self.config_test["num_workers"] = 0
        self.config_test["lr"] = lr
        self.config_test['model']["fcnet_hiddens"] = fc_hid
        self.config_test['model']["fcnet_activation"] = fc_act

        if algo == "ppo":
            self.agent = ppo.PPOTrainer(config=self.config_test)
        if algo == "ddpg":
            self.agent = ddpg.DDPGTrainer(config=self.config_test)
        if algo == "a3c":
            self.agent = a3c.A3CTrainer(config=self.config_test)
        if algo == "td3":
            self.agent = ddpg.TD3Trainer(config=self.config_test)
        if algo == "appo":
            self.agent = ppo.APPOTrainer(config=self.config_test)

        self.agent.restore(path)
        env = self.agent.workers.local_worker().env

        obs = env.reset()
        done = False

        while not done:
            action = self.agent.compute_action(obs)
            obs, reward, done, info = env.step(action)
            episode_reward += reward

            unused_shared.append(info["unused_shared"])
            unused_own.append(info["unused_own"])
            unsatisfied_shared.append(info["unsatisfied_shared"])
            unsatisfied_own.append(info["unsatisfied_own"])

        return episode_reward, unused_shared, unused_own, unsatisfied_shared, unsatisfied_own
Example #5
def get_rllib_agent(agent_name, env_name, env, env_to_agent):
    config = get_config(env_name, env, 1) if is_rllib_agent(agent_name) else {}
    if agent_name == RLLIB_A2C:
        import ray.rllib.agents.a3c as a2c
        agent = a2c.A2CTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_A3C:
        import ray.rllib.agents.a3c as a3c
        agent = a3c.A3CTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_BC:
        import ray.rllib.agents.marwil as bc
        agent = bc.BCTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.DQNTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_APEX_DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.ApexTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_IMPALA:
        import ray.rllib.agents.impala as impala
        agent = impala.ImpalaTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_MARWIL:
        import ray.rllib.agents.marwil as marwil
        agent = marwil.MARWILTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_PG:
        import ray.rllib.agents.pg as pg
        agent = pg.PGTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_PPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.PPOTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_APPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.APPOTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_SAC:
        import ray.rllib.agents.sac as sac
        agent = sac.SACTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_LIN_UCB:
        import ray.rllib.contrib.bandits.agents.lin_ucb as lin_ucb
        agent = lin_ucb.LinUCBTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_LIN_TS:
        import ray.rllib.contrib.bandits.agents.lin_ts as lin_ts
        agent = lin_ts.LinTSTrainer(config=config, env=env_to_agent)
    return agent
Example #6
    def test_ppo_exploration_setup(self):
        """Tests, whether PPO runs with different exploration setups."""
        config = copy.deepcopy(ppo.DEFAULT_CONFIG)
        config["num_workers"] = 0  # Run locally.
        config["env_config"] = {"is_slippery": False, "map_name": "4x4"}
        obs = np.array(0)

        # Test against all frameworks.
        for fw in framework_iterator(config):
            # Default Agent should be setup with StochasticSampling.
            trainer = ppo.PPOTrainer(config=config, env="FrozenLake-v0")
            # explore=False, always expect the same (deterministic) action.
            a_ = trainer.compute_action(
                obs,
                explore=False,
                prev_action=np.array(2),
                prev_reward=np.array(1.0))
            # Test whether this is really the argmax action over the logits.
            if fw != "tf":
                last_out = trainer.get_policy().model.last_output()
                if fw == "torch":
                    check(a_, np.argmax(last_out.detach().cpu().numpy(), 1)[0])
                else:
                    check(a_, np.argmax(last_out.numpy(), 1)[0])
            for _ in range(50):
                a = trainer.compute_action(
                    obs,
                    explore=False,
                    prev_action=np.array(2),
                    prev_reward=np.array(1.0))
                check(a, a_)

            # With explore=True (default), expect stochastic actions.
            actions = []
            for _ in range(300):
                actions.append(
                    trainer.compute_action(
                        obs,
                        prev_action=np.array(2),
                        prev_reward=np.array(1.0)))
            check(np.mean(actions), 1.5, atol=0.2)
            trainer.stop()
Example #7
    def test_counting_by_agent_steps(self):
        """Test whether a PPOTrainer can be built with all frameworks."""
        config = copy.deepcopy(ppo.DEFAULT_CONFIG)

        num_agents = 3

        config["num_workers"] = 2
        config["num_sgd_iter"] = 2
        config["framework"] = "torch"
        config["rollout_fragment_length"] = 21
        config["train_batch_size"] = 147
        config["multiagent"] = {
            "policies": {f"p{i}"
                         for i in range(num_agents)},
            "policy_mapping_fn": lambda aid, **kwargs: "p{}".format(aid),
            "count_steps_by": "agent_steps",
        }
        # Env setup.
        config["env"] = MultiAgentPendulum
        config["env_config"] = {"num_agents": num_agents}

        num_iterations = 2
        trainer = ppo.PPOTrainer(config=config)
        results = None
        for i in range(num_iterations):
            results = trainer.train()
        self.assertEqual(results["agent_timesteps_total"],
                         results["timesteps_total"])
        self.assertEqual(
            results["num_env_steps_trained"] * num_agents,
            results["num_agent_steps_trained"],
        )
        self.assertGreaterEqual(
            results["agent_timesteps_total"],
            num_iterations * config["train_batch_size"],
        )
        self.assertLessEqual(
            results["agent_timesteps_total"],
            (num_iterations + 1) * config["train_batch_size"],
        )
        trainer.stop()
Example #8
def load_trained_agent(new_checkpoint):
    # Previous trainer
    prev_trainer = ppo.PPOTrainer(env=DummyTrainer,
                                  config={
                                      "env_config": {},
                                      "framework": "torch",
                                      "num_gpus": 0,
                                      "num_workers": 0,
                                      "explore": False
                                  })

    # restore an older model for the previous trainer
    prev_checkpoint_index = new_checkpoint
    try:
        prev_trainer.restore(
            f"models/checkpoint_{prev_checkpoint_index}/checkpoint-{prev_checkpoint_index}"
        )
    except FileNotFoundError:
        return None

    return prev_trainer.workers.local_worker().get_policy()
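
A brief usage sketch for the helper above; the checkpoint index is a placeholder, and `get_weights()` is just one way to consume the restored policy:

prev_policy = load_trained_agent(new_checkpoint=100)  # hypothetical checkpoint index
if prev_policy is None:
    print("No checkpoint found for that index.")
else:
    # For example, inspect (or transfer) the restored policy's weights.
    weights = prev_policy.get_weights()
    print("restored policy with {} weight tensors".format(len(weights)))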
Example #9
    def setup(self, config):
        path1 = config["path"]
        path_invariant = config["path_invariant"]
        batch_size = config["batch_size"]
        train_data = GridSearchDataset()
        val_data = GridSearchDataset()
        train_loader = DataLoader(train_data,
                                  batch_size=batch_size,
                                  shuffle=True)
        val_loader = DataLoader(val_data, batch_size=batch_size)

        invariant_model = torch.nn.Sequential(torch.nn.Linear(2, 50),
                                              torch.nn.ReLU(),
                                              torch.nn.Linear(50, 1),
                                              torch.nn.Tanh())
        invariant_model.load_state_dict(
            torch.load(
                path_invariant,
                map_location=torch.device('cpu')))  # load the invariant model
        invariant_model.cuda()
        config = get_PPO_config(1234)
        trainer = ppo.PPOTrainer(config=config)
        trainer.restore(path1)
        policy = trainer.get_policy()
        sequential_nn = convert_ray_policy_to_sequential(
            policy)  # load the agent model
        sequential_nn.cuda()

        model = sequential_nn
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=config.get("lr", 1e-3))
        loss = RetrainLoss(invariant_model)  # torch.nn.MSELoss()

        self.models, self.optimizer, self.criterion = self.register(
            models=[model, invariant_model],
            optimizers=optimizer,
            criterion=loss)
        self.model = self.models[0]
        self.register_data(train_loader=train_loader,
                           validation_loader=val_loader)
Example #10
def get_PPO_trainer(use_gpu=1):
    ModelCatalog.register_custom_model("my_model", TorchCustomModel)
    config = {"env": allCars,  #
              "model": {"custom_model": "my_model", "fcnet_hiddens": [64, 64], "fcnet_activation": "relu"},  # model config,"
              "vf_share_layers": False,
              "lr": 5e-4,
              "num_gpus": use_gpu,
              "vf_clip_param": 100000,
              "grad_clip": 2500,
              "num_workers": 8,  # parallelism
              "batch_mode": "complete_episodes",
              "evaluation_interval": 10,
              "use_gae": True,  #
              "lambda": 0.95,  # gae lambda param
              "num_envs_per_worker": 10,
              "train_batch_size": 4000,
              "evaluation_num_episodes": 20,
              "rollout_fragment_length": 1000,
              "framework": "torch",
              "horizon": 100}
    trainer = ppo.PPOTrainer(config=config)
    return config, trainer
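
A short usage sketch for the helper above, assuming `ray.init()` has already been called and that `allCars` and `TorchCustomModel` are importable in this module:

config, trainer = get_PPO_trainer(use_gpu=0)
for i in range(5):
    result = trainer.train()
    print(i, result["episode_reward_mean"])
checkpoint = trainer.save()  # optionally persist the trained weights
print("checkpoint saved at", checkpoint)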
Example #11
    def test_traj_view_attention_net(self):
        config = ppo.DEFAULT_CONFIG.copy()
        # Setup attention net.
        config["model"] = config["model"].copy()
        config["model"]["max_seq_len"] = 50
        config["model"]["custom_model"] = GTrXLNet
        config["model"]["custom_model_config"] = {
            "num_transformer_units": 1,
            "attention_dim": 64,
            "num_heads": 2,
            "memory_inference": 50,
            "memory_training": 50,
            "head_dim": 32,
            "ff_hidden_dim": 32,
        }
        # Test with odd batch numbers.
        config["train_batch_size"] = 1031
        config["sgd_minibatch_size"] = 201
        config["num_sgd_iter"] = 5
        config["num_workers"] = 0
        config["callbacks"] = MyCallbacks
        config["env_config"] = {
            "config": {
                "start_at_t": 1
            }
        }  # first obs is [1.0]

        for _ in framework_iterator(config, frameworks="tf2"):
            trainer = ppo.PPOTrainer(
                config,
                env="ray.rllib.examples.env.debug_counter_env.DebugCounterEnv",
            )
            rw = trainer.workers.local_worker()
            sample = rw.sample()
            assert sample.count == config["rollout_fragment_length"]
            results = trainer.train()
            assert results["train_batch_size"] == config["train_batch_size"]
            trainer.stop()
Example #12
def train():
    ray.init()
    config = ppo.DEFAULT_CONFIG.copy()
    config["num_gpus"] = 0
    config["env_config"] = {
        "history_len": 10,
        "features": "sent latency inflation,latency ratio,send ratio"
    }
    config["num_workers"] = 6
    config["eager"] = False
    config["log_level"] = "INFO"
    config["monitor"] = True
    config["num_cpus_per_worker"] = 0
    trainer = ppo.PPOTrainer(config=config, env=SimulatedNetworkEnv)

    for i in range(1000):
        # Perform one iteration of training the policy with PPO
        result = trainer.train()
        print(pretty_print(result))

        if i % 100 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)
Example #13
    def _continuous_run(self):
        import ray
        from ray import tune
        from ray.rllib.agents import ppo, ddpg
        ray.init(num_cpus=4, num_gpus=1, local_mode=True)
        configs = {
            'num_gpus': 1,
            'num_workers': 4,
            # 'num_gpus_per_worker': 1,
            'framework': 'torch',
            "simple_optimizer": True,
        }
        AGENT_CONFIG = {
            'ddpg': ddpg.DDPGTrainer(config=configs, env="MountainCarContinuous-v0"),
            'ppo': ppo.PPOTrainer(config=configs, env="MountainCarContinuous-v0"),
        }
        trainer = AGENT_CONFIG[self.configs['algorithm']]
        # tune.run(agent, config={"env": "MountainCarContinuous-v0", "framework": "torch", "num_gpus": 0})
        for i in range(2000):  # 2000 training iterations
            result = trainer.train()  # one training iteration
            print(result)

        return
Example #14
def solve():
    length = 20
    ray.init()
    trainer = ppo.PPOTrainer(env=CorridorEnv,
                             config={"env_config": {
                                 "length": length
                             }})
    while True:
        results = trainer.train()
        training_iteration = results.get("training_iteration")
        episode_length_mean = results.get("episode_len_mean")
        episodes_total = results.get("episodes_total")
        total_time = results.get("time_total_s")
        print("\n============")
        print(pretty_print(results))
        print(f"\nCorridorEnv (length: {length}) "
              f"Training Iteration {training_iteration}:"
              f"\n\tIteration episode mean length: {episode_length_mean}"
              f"\n\tEpisodes total: {episodes_total}"
              f"\n\tTime total: {round(total_time, 1)}sec")
        if episode_length_mean <= length:
            break
    print(f"\nProblem solved in {episodes_total} training episodes")
Example #15
    def test_plain(self):
        config = ppo.DEFAULT_CONFIG.copy()
        for _ in framework_iterator(config, frameworks="torch"):
            trainer = ppo.PPOTrainer(config, env="CartPole-v0")
            policy = trainer.get_policy()
            view_req_model = policy.model.inference_view_requirements()
            view_req_policy = policy.training_view_requirements()
            assert len(view_req_model) == 1
            assert len(view_req_policy) == 6
            for key in [
                    SampleBatch.OBS, SampleBatch.ACTIONS, SampleBatch.REWARDS,
                    SampleBatch.DONES, SampleBatch.NEXT_OBS,
                    SampleBatch.VF_PREDS
            ]:
                assert key in view_req_policy
                # None of the view cols has a special underlying data_col,
                # except next-obs.
                if key != SampleBatch.NEXT_OBS:
                    assert view_req_policy[key].data_col is None
                else:
                    assert view_req_policy[key].data_col == SampleBatch.OBS
                    assert view_req_policy[key].shift == 1
            trainer.stop()
Example #16
def training_PPO(start_train_date, end_train_date, resume, diff_days):
    config = ppo.DEFAULT_CONFIG.copy()
    config["observation_filter"] = 'MeanStdFilter'
    config["batch_mode"] = "complete_episodes"
    config["lr"] = 1e-4
    config["num_workers"] = num_cores
    config["env_config"] = {
        "settings": settings,
        "main_path": curr_path,
        "start_train": start_train_date,
        "end_train": end_train_date,
        "train/test": "train",
        "sc_volt_start_train": sc_volt_train,
        "diff_days": diff_days,
        "GT_hour_start": 0,
    }
    trainer = ppo.PPOTrainer(config=config, env="simplePible")

    if resume_path != "":
        print("Restoring checkpoint: ", resume)
        sleep(5)
        trainer.restore(
            resume
        )  # Can optionally call trainer.restore(path) to load a checkpoint.

    for i in range(0, int(settings[0]["training_iterations"])):
        result = trainer.train()
        print(pretty_print(result))

        if int(result["training_iteration"]) % 10 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)
            checkp_split = checkpoint.split('/')
            parent_dir = '/'.join(checkp_split[0:-2])

    # Remove previous agents and save the new agent into Agents_Saved
    Ember_RL_func.rm_old_save_new_agent(parent_dir, save_agent_folder)
Example #17
    def test_lstm_prev_actions_and_rewards(self):
        config = ppo.DEFAULT_CONFIG.copy()
        config["model"] = config["model"].copy()
        # Activate LSTM + prev-action + rewards.
        config["model"]["use_lstm"] = True
        config["model"]["lstm_use_prev_action_reward"] = True

        for _ in framework_iterator(config, frameworks="torch"):
            trainer = ppo.PPOTrainer(config, env="CartPole-v0")
            policy = trainer.get_policy()
            view_req_model = policy.model.inference_view_requirements()
            view_req_policy = policy.training_view_requirements()
            assert len(view_req_model) == 3  # obs, prev_a, prev_r
            assert len(view_req_policy) == 8
            for key in [
                    SampleBatch.OBS, SampleBatch.ACTIONS, SampleBatch.REWARDS,
                    SampleBatch.DONES, SampleBatch.NEXT_OBS,
                    SampleBatch.VF_PREDS, SampleBatch.PREV_ACTIONS,
                    SampleBatch.PREV_REWARDS
            ]:
                assert key in view_req_policy

                if key == SampleBatch.PREV_ACTIONS:
                    assert view_req_policy[key].data_col == SampleBatch.ACTIONS
                    assert view_req_policy[key].shift == -1
                elif key == SampleBatch.PREV_REWARDS:
                    assert view_req_policy[key].data_col == SampleBatch.REWARDS
                    assert view_req_policy[key].shift == -1
                elif key not in [
                        SampleBatch.NEXT_OBS, SampleBatch.PREV_ACTIONS,
                        SampleBatch.PREV_REWARDS
                ]:
                    assert view_req_policy[key].data_col is None
                else:
                    assert view_req_policy[key].data_col == SampleBatch.OBS
                    assert view_req_policy[key].shift == 1
            trainer.stop()
Example #18
    def test_ppo_compilation_and_lr_schedule(self):
        """Test whether a PPOTrainer can be built with all frameworks."""
        config = copy.deepcopy(ppo.DEFAULT_CONFIG)
        # For checking lr-schedule correctness.
        config["callbacks"] = MyCallbacks

        config["num_workers"] = 1
        config["num_sgd_iter"] = 2
        # Settings in case we use an LSTM.
        config["model"]["lstm_cell_size"] = 10
        config["model"]["max_seq_len"] = 20
        # Use default-native keras models whenever possible.
        config["model"]["_use_default_native_models"] = True

        config["train_batch_size"] = 128
        # Test with compression.
        config["compress_observations"] = True
        num_iterations = 2

        for _ in framework_iterator(config):
            for env in ["CartPole-v0", "MsPacmanNoFrameskip-v4"]:
                print("Env={}".format(env))
                for lstm in [True, False]:
                    print("LSTM={}".format(lstm))
                    config["model"]["use_lstm"] = lstm
                    config["model"]["lstm_use_prev_action"] = lstm
                    config["model"]["lstm_use_prev_reward"] = lstm

                    trainer = ppo.PPOTrainer(config=config, env=env)
                    for i in range(num_iterations):
                        trainer.train()
                    check_compute_single_action(
                        trainer,
                        include_prev_action_reward=True,
                        include_state=lstm)
                    trainer.stop()
Example #19
    def test_no_curiosity(self):
        config = ppo.DEFAULT_CONFIG
        env = "CartPole-v0"
        dummy_obs = np.array([0.0, 0.1, 0.0, 0.0])
        prev_a = np.array(0)
        config["framework"] = "torch"
        config["exploration_config"] = {"type": "ParameterNoise"}

        trainer = ppo.PPOTrainer(config=config, env=env)
        trainer.train()

        # Make sure all actions drawn are the same, given same
        # observations. Tests the exploration API.

        actions = []
        for _ in range(5):
            actions.append(
                trainer.compute_action(
                    observation=dummy_obs,
                    explore=False,
                    prev_action=prev_a,
                    prev_reward=1.0 if prev_a is not None else None))
            check(actions[-1], actions[0])
        print(actions)
Example #20
    def test_ppo_loss_function(self):
        """Tests the PPO loss function math."""
        config = copy.deepcopy(ppo.DEFAULT_CONFIG)
        config["num_workers"] = 0  # Run locally.
        config["gamma"] = 0.99
        config["model"]["fcnet_hiddens"] = [10]
        config["model"]["fcnet_activation"] = "linear"
        config["model"]["vf_share_layers"] = True

        for fw, sess in framework_iterator(config, session=True):
            trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
            policy = trainer.get_policy()

            # Check no free log std var by default.
            if fw == "torch":
                matching = [
                    v for (n, v) in policy.model.named_parameters()
                    if "log_std" in n
                ]
            else:
                matching = [
                    v for v in policy.model.trainable_variables()
                    if "log_std" in str(v)
                ]
            assert len(matching) == 0, matching

            # Post-process (calculate simple (non-GAE) advantages) and attach
            # to train_batch dict.
            # A = [0.99^2 * 0.5 + 0.99 * -1.0 + 1.0, 0.99 * 0.5 - 1.0, 0.5] =
            # [0.50005, -0.505, 0.5]
            train_batch = compute_gae_for_sample_batch(policy,
                                                       FAKE_BATCH.copy())
            if fw == "torch":
                train_batch = policy._lazy_tensor_dict(train_batch)

            # Check Advantage values.
            check(train_batch[Postprocessing.VALUE_TARGETS],
                  [0.50005, -0.505, 0.5])

            # Calculate actual PPO loss.
            if fw in ["tf2", "tfe"]:
                ppo_surrogate_loss_tf(policy, policy.model, Categorical,
                                      train_batch)
            elif fw == "torch":
                ppo_surrogate_loss_torch(policy, policy.model,
                                         TorchCategorical, train_batch)

            vars = policy.model.variables() if fw != "torch" else \
                list(policy.model.parameters())
            if fw == "tf":
                vars = policy.get_session().run(vars)
            expected_shared_out = fc(train_batch[SampleBatch.CUR_OBS],
                                     vars[0 if fw != "torch" else 2],
                                     vars[1 if fw != "torch" else 3],
                                     framework=fw)
            expected_logits = fc(expected_shared_out,
                                 vars[2 if fw != "torch" else 0],
                                 vars[3 if fw != "torch" else 1],
                                 framework=fw)
            expected_value_outs = fc(expected_shared_out,
                                     vars[4],
                                     vars[5],
                                     framework=fw)

            kl, entropy, pg_loss, vf_loss, overall_loss = \
                self._ppo_loss_helper(
                    policy, policy.model,
                    Categorical if fw != "torch" else TorchCategorical,
                    train_batch,
                    expected_logits, expected_value_outs,
                    sess=sess
                )
            if sess:
                policy_sess = policy.get_session()
                k, e, pl, v, tl = policy_sess.run(
                    [
                        policy._mean_kl,
                        policy._mean_entropy,
                        policy._mean_policy_loss,
                        policy._mean_vf_loss,
                        policy._total_loss,
                    ],
                    feed_dict=policy._get_loss_inputs_dict(train_batch,
                                                           shuffle=False))
                check(k, kl)
                check(e, entropy)
                check(pl, np.mean(-pg_loss))
                check(v, np.mean(vf_loss), decimals=4)
                check(tl, overall_loss, decimals=4)
            else:
                check(policy._mean_kl, kl)
                check(policy._mean_entropy, entropy)
                check(policy._mean_policy_loss, np.mean(-pg_loss))
                check(policy._mean_vf_loss, np.mean(vf_loss), decimals=4)
                check(policy._total_loss, overall_loss, decimals=4)
            trainer.stop()
Example #21
import logging.config
import sys
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

helpers.register_madras()
ray.init()
config = ppo.DEFAULT_CONFIG.copy()
# Full config is here: https://github.com/ray-project/ray/blob/d51583dbd6dc9c082764b9ec06349678aaa71078/rllib/agents/trainer.py#L42
config["num_gpus"] = 0
config["num_workers"] = 1
config["eager"] = False
config["vf_clip_param"] = 20  # originally it was 10; we should consider scaling down the rewards to keep the episode reward under 2000
# config["gamma"] = 0.7
# config["lr"] = 5e-7
# config["batch_mode"] = "complete_episodes"
# config["train_batch_size"] = 10000

trainer = ppo.PPOTrainer(config=config, env="madras_env")

# Can optionally call trainer.restore(path) to load a checkpoint.

for i in range(10000):
    # Perform one iteration of training the policy with PPO
    result = trainer.train()
    print(pretty_print(result))

    if i % 10 == 0:
        checkpoint = trainer.save()
        logging.info("checkpoint saved at", checkpoint)
Example #22
File: test_ppo.py  Project: kylinLiu/ray
    def test_ppo_loss_function(self):
        """Tests the PPO loss function math."""
        config = ppo.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.
        config["gamma"] = 0.99
        config["model"]["fcnet_hiddens"] = [10]
        config["model"]["fcnet_activation"] = "linear"
        config["vf_share_layers"] = True

        # Fake CartPole episode of n time steps.
        train_batch = {
            SampleBatch.CUR_OBS: np.array(
                [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
                 [0.9, 1.0, 1.1, 1.2]],
                dtype=np.float32),
            SampleBatch.ACTIONS: np.array([0, 1, 1]),
            SampleBatch.REWARDS: np.array([1.0, -1.0, .5], dtype=np.float32),
            SampleBatch.DONES: np.array([False, False, True]),
            SampleBatch.VF_PREDS: np.array([0.5, 0.6, 0.7], dtype=np.float32),
            SampleBatch.ACTION_DIST_INPUTS: np.array(
                [[-2., 0.5], [-3., -0.3], [-0.1, 2.5]], dtype=np.float32),
            SampleBatch.ACTION_LOGP: np.array(
                [-0.5, -0.1, -0.2], dtype=np.float32),
        }

        for fw in ["tf", "torch"]:
            print("framework={}".format(fw))
            config["use_pytorch"] = fw == "torch"
            config["eager"] = fw == "tf"

            trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
            policy = trainer.get_policy()

            # Post-process (calculate simple (non-GAE) advantages) and attach
            # to train_batch dict.
            # A = [0.99^2 * 0.5 + 0.99 * -1.0 + 1.0, 0.99 * 0.5 - 1.0, 0.5] =
            # [0.50005, -0.505, 0.5]
            if fw == "tf":
                train_batch = postprocess_ppo_gae_tf(policy, train_batch)
            else:
                train_batch = postprocess_ppo_gae_torch(policy, train_batch)
                train_batch = policy._lazy_tensor_dict(train_batch)

            # Check Advantage values.
            check(train_batch[Postprocessing.VALUE_TARGETS],
                  [0.50005, -0.505, 0.5])

            # Calculate actual PPO loss (results are stored in policy.loss_obj)
            # for tf.
            if fw == "tf":
                ppo_surrogate_loss_tf(policy, policy.model, Categorical,
                                      train_batch)
            else:
                ppo_surrogate_loss_torch(policy, policy.model,
                                         TorchCategorical, train_batch)

            vars = policy.model.variables() if fw == "tf" else \
                list(policy.model.parameters())
            expected_shared_out = fc(train_batch[SampleBatch.CUR_OBS], vars[0],
                                     vars[1])
            expected_logits = fc(expected_shared_out, vars[2], vars[3])
            expected_value_outs = fc(expected_shared_out, vars[4], vars[5])

            kl, entropy, pg_loss, vf_loss, overall_loss = \
                self._ppo_loss_helper(
                    policy, policy.model,
                    Categorical if fw == "tf" else TorchCategorical,
                    train_batch,
                    expected_logits, expected_value_outs
                )
            check(policy.loss_obj.mean_kl, kl)
            check(policy.loss_obj.mean_entropy, entropy)
            check(policy.loss_obj.mean_policy_loss, np.mean(-pg_loss))
            check(policy.loss_obj.mean_vf_loss, np.mean(vf_loss), decimals=4)
            check(policy.loss_obj.loss, overall_loss, decimals=4)
Example #23
def _main():
    """ Training loop """
    # Args
    logger.info('Arguments: %s', str(ARGS))

    # Results
    metrics_dir, checkpoint_dir, best_checkpoint_dir, debug_dir, eval_dir = results_handler(
        ARGS)

    # Initialize the simulation.
    # ray.init()
    # ray.init(memory=52428800, object_store_memory=78643200) ## minimum values
    # ray.init(address='auto', _redis_password='******') ## attach
    ray.init(num_cpus=ARGS.ray_cpus,
             num_gpus=ARGS.ray_gpus,
             _memory=ARGS.ray_mem_gb * GB,
             object_store_memory=ARGS.ray_store_gb * GB)

    # Load default Scenario configuration
    experiment_config = load_json_file(ARGS.config)

    # Associate the agents with something
    env_config = {
        'metrics_dir': metrics_dir,
        'checkpoint_dir': checkpoint_dir,
        'agent_init': load_json_file(experiment_config['agents_init']),
        'scenario_config': experiment_config['marl_env_config'],
    }
    ## fix the config for learning:
    env_config['agent_init']['eval'] = {}

    marl_env = None
    if ARGS.env == 'MARL':
        ray.tune.registry.register_env(
            'marl_env', complexparkingstochasticdeepmarlenv.env_creator)
        marl_env = complexparkingstochasticdeepmarlenv.CSParkingPersuasiveDeepMARLEnv(
            env_config)
    else:
        raise Exception('Unknown environment %s' % ARGS.env)

    # Persuasive PPO algorithm.
    policy_class = ppo.PPOTFPolicy
    policy_conf = persuasive_ppo_conf(rollout_size=ARGS.rollout_size,
                                      agents=len(marl_env.get_agents()),
                                      debug_folder=debug_dir,
                                      eval_folder=eval_dir,
                                      alpha=ARGS.alpha,
                                      gamma=ARGS.gamma)
    # Gen config
    agent = marl_env.get_agents()[0]
    policies = {
        'unique': (policy_class, marl_env.get_obs_space(agent),
                   marl_env.get_action_space(agent), {})
    }
    policy_conf['multiagent']['policies'] = policies
    policy_conf['multiagent']['policy_mapping_fn'] = lambda agent_id: 'unique'
    policy_conf['env_config'] = env_config
    # policy_conf['evaluation_config']['env_config'] = {
    #     'metrics_dir': metrics_dir,
    #     'checkpoint_dir': checkpoint_dir,
    #     'agent_init': load_json_file(experiment_config['agents_init']),
    #     'scenario_config': experiment_config['marl_env_config'],
    # }
    logger.info('Configuration: \n%s', pformat(policy_conf))

    def default_logger_creator(config):
        """
            Creates a Unified logger with a default logdir prefix
            containing the agent name and the env id
        """
        log_dir = os.path.join(os.path.normpath(ARGS.dir), 'logs')
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)
        return UnifiedLogger(config, log_dir, loggers=[NoopLogger])

    trainer = ppo.PPOTrainer(
        # env=deepmarlenvironment.PersuasiveDeepMARLEnv,
        env='marl_env',
        config=policy_conf,
        logger_creator=default_logger_creator)

    last_checkpoint = get_last_checkpoint(checkpoint_dir)
    if last_checkpoint is not None:
        trainer.restore(last_checkpoint)
        logger.info('Restored checkpoint: %s', last_checkpoint)

    # Restoring the latest best metrics
    for metric in CHECKPOINT_METRICS:
        CURRENT_METRICS[metric]['value'] = get_last_best_of(
            os.path.join(best_checkpoint_dir, metric))
    logger.info('Restored metrics: \n%s', pformat(CURRENT_METRICS))

    counter = 0
    unchanged_window = 0
    final_result = None
    while counter < ARGS.training_iterations:
        # Do one training step.
        result = trainer.train()
        checkpoint = trainer.save(checkpoint_dir)
        logger.info('Checkpoint saved in %s', checkpoint)
        counter = result['iterations_since_restore']
        # counter = result['training_iteration']
        # steps += result['info']['num_steps_trained']
        # steps += result['timesteps_this_iter']
        final_result = result
        print_selected_results(result, SELECTION)
        metric_file = os.path.join(
            metrics_dir,
            'metrics_{}.json'.format(result['training_iteration']))
        with open(metric_file, 'w') as fstream:
            # the evaluation metrics are not saved in 'results.json'
            json.dump(result, fstream, cls=NPEncoder)
            # fstream.write('\n')
            print(
                '############################# METRIC SAVED #############################'
            )
        ############################################################################################
        if 'evaluation' not in result:
            continue
        changes = False
        for metric in CHECKPOINT_METRICS:
            old = CURRENT_METRICS[metric]['value']
            new = CURRENT_METRICS[metric]['get'](result)
            # if np.isnan(new):
            #     pprint(result['evaluation'])
            #     raise Exception(metric, old, new)
            if CURRENT_METRICS[metric]['check'](new, old):
                # Save the "best" checkout
                if metric in STOPPING_METRICS:
                    changes = True
                CURRENT_METRICS[metric]['value'] = new
                cleanup(os.path.join(best_checkpoint_dir, metric))
                current_checkpoint = trainer.save(
                    os.path.join(best_checkpoint_dir, metric))
                current_info_file = os.path.join(best_checkpoint_dir, metric,
                                                 'info.json')
                current_value = {'value': str(new)}
                with open(current_info_file, 'w') as fstream:
                    json.dump(current_value, fstream, indent=4)
                if old is None:
                    old = -1.0
                logger.info('UPDATING %s: %.2f (%.2f). Checkpoint saved in %s',
                            metric, new, old, current_checkpoint)
            else:
                logger.info('UNCHANGED %s ---> Best: %.2f - New: %.2f', metric,
                            old, new)
        if changes:
            unchanged_window = 0
        else:
            unchanged_window += 1
            logger.info(
                'Nothing has changed for the last %d training runs in the monitored metrics [%s].',
                unchanged_window, str(STOPPING_METRICS))
        if unchanged_window >= 10:
            break
        ############################################################################################

    # pprint(final_result)
    print_selected_results(final_result, SELECTION)
Example #24
    env_params['res_nutilda_tol'] = 1.0e-4
    env_params['reward_type'] = 1  # 1: cl/cd, 2: cd
    env_params['states_type'] = 1  # 1: single state, 2: k states history
    env_params['vx'] = 25.75
    data = np.loadtxt('control_points_range.csv',
                      delimiter=',',
                      skiprows=1,
                      usecols=range(1, 4))
    env_params['controlparams_low'] = data[:, 1]
    env_params['controlparams_high'] = data[:, 2]

    config["env_config"] = env_params

    # Trainer
    #    trainer = appo.APPOTrainer(config=config, env="myenv")
    trainer = ppo.PPOTrainer(config=config, env="myenv")
    #    trainer = a3c.A3CTrainer(config=config, env="myenv")
    #    trainer = a2c.A2CTrainer(config=config, env="myenv")
    trainer_time = time()

    #    trainer.restore('./PPO_myenv_2020-07-23_18-53-59cb7gh16j/checkpoint_51/checkpoint-51')

    file_results = 'Training_iterations_ppo.txt'

    # Can optionally call trainer.restore(path) to load a checkpoint.
    result = {'episodes_total': 0}
    results = []
    with open(file_results, 'wb', 0) as f:
        #        for i in range(ncount):
        i = 0
        while result['episodes_total'] <= 3000:
Example #25
    def test_traj_view_simple_performance(self):
        """Test whether PPOTrainer runs faster w/ `_use_trajectory_view_api`.
        """
        config = copy.deepcopy(ppo.DEFAULT_CONFIG)
        action_space = Discrete(2)
        obs_space = Box(-1.0, 1.0, shape=(700, ))

        from ray.rllib.examples.env.random_env import RandomMultiAgentEnv
        from ray.tune import register_env
        register_env(
            "ma_env",
            lambda c: RandomMultiAgentEnv({
                "num_agents": 2,
                "p_done": 0.0,
                "max_episode_len": 104,
                "action_space": action_space,
                "observation_space": obs_space
            }))

        config["num_workers"] = 3
        config["num_envs_per_worker"] = 8
        config["num_sgd_iter"] = 1  # Put less weight on training.

        policies = {
            "pol0": (None, obs_space, action_space, {}),
        }

        def policy_fn(agent_id):
            return "pol0"

        config["multiagent"] = {
            "policies": policies,
            "policy_mapping_fn": policy_fn,
        }
        num_iterations = 2
        for _ in framework_iterator(config, frameworks="torch"):
            print("w/ traj. view API")
            config["_use_trajectory_view_api"] = True
            trainer = ppo.PPOTrainer(config=config, env="ma_env")
            learn_time_w = 0.0
            sampler_perf_w = {}
            start = time.time()
            for i in range(num_iterations):
                out = trainer.train()
                ts = out["timesteps_total"]
                sampler_perf_ = out["sampler_perf"]
                sampler_perf_w = {
                    k:
                    sampler_perf_w.get(k, 0.0) + (sampler_perf_[k] * 1000 / ts)
                    for k, v in sampler_perf_.items()
                }
                delta = out["timers"]["learn_time_ms"] / ts
                learn_time_w += delta
                print("{}={}s".format(i, delta))
            sampler_perf_w = {
                k: sampler_perf_w[k] / (num_iterations if "mean_" in k else 1)
                for k, v in sampler_perf_w.items()
            }
            duration_w = time.time() - start
            print("Duration: {}s "
                  "sampler-perf.={} learn-time/iter={}s".format(
                      duration_w, sampler_perf_w,
                      learn_time_w / num_iterations))
            trainer.stop()

            print("w/o traj. view API")
            config["_use_trajectory_view_api"] = False
            trainer = ppo.PPOTrainer(config=config, env="ma_env")
            learn_time_wo = 0.0
            sampler_perf_wo = {}
            start = time.time()
            for i in range(num_iterations):
                out = trainer.train()
                ts = out["timesteps_total"]
                sampler_perf_ = out["sampler_perf"]
                sampler_perf_wo = {
                    k: sampler_perf_wo.get(k, 0.0) +
                    (sampler_perf_[k] * 1000 / ts)
                    for k, v in sampler_perf_.items()
                }
                delta = out["timers"]["learn_time_ms"] / ts
                learn_time_wo += delta
                print("{}={}s".format(i, delta))
            sampler_perf_wo = {
                k: sampler_perf_wo[k] / (num_iterations if "mean_" in k else 1)
                for k, v in sampler_perf_wo.items()
            }
            duration_wo = time.time() - start
            print("Duration: {}s "
                  "sampler-perf.={} learn-time/iter={}s".format(
                      duration_wo, sampler_perf_wo,
                      learn_time_wo / num_iterations))
            trainer.stop()

            # Assert `_use_trajectory_view_api` is faster.
            self.assertLess(sampler_perf_w["mean_raw_obs_processing_ms"],
                            sampler_perf_wo["mean_raw_obs_processing_ms"])
            self.assertLess(sampler_perf_w["mean_action_processing_ms"],
                            sampler_perf_wo["mean_action_processing_ms"])
            self.assertLess(duration_w, duration_wo)
Example #26
        self.action_space = gym.spaces.Discrete(2)  # right/left
        self.observation_space = gym.spaces.Discrete(self.end_pos)

    def reset(self):
        self.cur_pos = 0
        return self.cur_pos

    def step(self, action):
        if action == 0 and self.cur_pos > 0:  # move left (back towards the start)
            self.cur_pos -= 1
        elif action == 1:  # move right (towards the goal at end_pos)
            self.cur_pos += 1
        if self.cur_pos >= self.end_pos:
            return 0, 1.0, True, {}
        else:
            return self.cur_pos, -0.1, False, {}


ray.init()
config = {
    "env": SimpleCorridor,
    "env_config": {
        "corridor_length": 5,
    },
}

trainer = ppo.PPOTrainer(config=config)
for _ in range(3):
    print(trainer.train())
# __rllib-custom-gym-env-end__
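
A hedged follow-up to the snippet above: roll the trained policy out in a fresh SimpleCorridor instance. Passing {"corridor_length": 5} to the constructor mirrors the env_config used in the trainer config and is an assumption about the (not shown) __init__:

env = SimpleCorridor({"corridor_length": 5})
obs = env.reset()
done, total_reward = False, 0.0
while not done:
    action = trainer.compute_action(obs)  # same API the other examples use
    obs, reward, done, info = env.step(action)
    total_reward += reward
print("rollout return:", total_reward)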
Example #27
            steps (list): list of global steps after each episode
            returns (list): list of total return of each episode
        """
        box = np.ones(self.log_frequency) / self.log_frequency
        returns_smooth = np.convolve(self.returns[1:], box, mode='same')
        plt.clf()
        plt.plot(self.steps[1:], returns_smooth)
        plt.title('Status Report')
        plt.ylabel('Return')
        plt.xlabel('Steps')
        plt.savefig('returns.png')

        with open('returns.txt', 'w') as f:
            for step, value in zip(self.steps[1:], self.returns[1:]):
                f.write("{}\t{}\n".format(step, value))


if __name__ == '__main__':
    ray.init()
    trainer = ppo.PPOTrainer(
        env=DiamondCollector,
        config={
            'env_config': {},  # No environment parameters to configure
            'framework': 'torch',  # Use pytorch instead of tensorflow
            'num_gpus': 0,  # We aren't using GPUs
            'num_workers': 0  # We aren't using parallelism
        })

    while True:
        print(trainer.train())
Example #28
            "Colisions for feet:                                             ",
            self.env.env.robot.calc_state()[20], "   ",
            self.env.env.robot.calc_state()[21]
        )  # returns states; the last 2 numbers indicate whether a foot is in contact with the ground
        self.dts_taken_so_far += 1
        if self.debug:
            print("Time elapsed in episode: ",
                  self.dts_taken_so_far * self.env.env.scene.dt)
            print("Number of dt's taken in episode: ", self.dts_taken_so_far)
        return self.env.step(action)


policy = CustomPolicy(env.observation_space, env.action_space, {})
workers = WorkerSet(policy=CustomPolicy,
                    env_creator=lambda c: gym.make("CartPole-v0"),
                    num_workers=10)
from ray.tune.registry import register_env

register_env("walkerbulletenv", lambda config: WalkerEnv(config))

#trainer = ppo.PPOTrainer(config=config, env="walkerbulletenv")

ray.init()
config = ppo.DEFAULT_CONFIG.copy()
config['num_workers'] = 0

ckpt_path = "/home/roman/ray_results/PPO_walkerbulletenv_2020-05-28_22-26-581q14o5cv/checkpoint_991/checkpoint-991"  #path to saved policy
agent = ppo.PPOTrainer(config, env="walkerbulletenv")
agent.restore(ckpt_path)  # restore agent (policy) from checkpoint
policy = agent.workers.local_worker().get_policy()  # get the policy
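
A possible next step after grabbing the policy (sketch only; constructing WalkerEnv with an empty config dict is an assumption):

env = WalkerEnv({})
obs = env.reset()
# Query the restored policy directly with a batch of one observation.
actions, state_outs, info = policy.compute_actions([obs])
print("first action:", actions[0])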
Example #29
from ray.rllib.agents import ppo
from ray.rllib.models import ModelCatalog
from tqdm import tqdm

from aie import plotting
from aie.aie_env import AIEEnv
from rl.conf import BASE_PPO_CONF, OUT_DIR
from rl.models.tf.fcnet import FCNet

# %%
ray.init()
ModelCatalog.register_custom_model("my_model", FCNet)

# %%
trainer = ppo.PPOTrainer(config={
    **BASE_PPO_CONF,
    "num_workers": 0,
})

ckpt_path = OUT_DIR / 'PPO_AIEEnv_2021-02-19_16-19-44p6xanojq/checkpoint_1777/checkpoint-1777'

trainer.restore(str(ckpt_path))

# %%
env = AIEEnv({}, force_dense_logging=True)
obs = env.reset()

for t in tqdm(range(1000)):
    results = {
        k: trainer.compute_action(
            v,
            policy_id='learned',
Example #30
    "horizon": 2000,
    "num_gpus": 1,
    "explore": False
    #"replay_sequence_length": 5,
    #"num_workers": 4,
    #"num_envs_per_worker": 2,
}

ray.init(local_mode=True)

checkpoint_number = 790

env = Herding({"sheep_count": 3
               #"agents_layout": "simple"
               })
agent = ppo.PPOTrainer(config=config, env=HerdingEnvWrapper)
agent.restore(
    rf"C:\Users\Mateusz\ray_results\Herding\Herding\checkpoint_{checkpoint_number}\checkpoint-{checkpoint_number}"
)

while True:
    episode_reward = 0
    done = False
    steps = 0
    obs = env.reset()
    while (not done) and (steps != 300):
        action = agent.compute_action(obs[0], policy_id="policy")
        obs, reward, done, info = env.step(np.array([[2, action]]))
        env.render()
        episode_reward += reward
        steps += 1