Example #1
    def testPPOSampleWaste(self):
        ray.init(num_cpus=4, object_store_memory=1000 * 1024 * 1024)

        # Check we at least collect the initial wave of samples
        ppo = PPOTrainer(env="CartPole-v0",
                         config={
                             "sample_batch_size": 200,
                             "train_batch_size": 128,
                             "num_workers": 3,
                         })
        ppo.train()
        self.assertEqual(ppo.optimizer.num_steps_sampled, 600)
        ppo.stop()

        # Check we collect at least the specified amount of samples
        ppo = PPOTrainer(env="CartPole-v0",
                         config={
                             "sample_batch_size": 200,
                             "train_batch_size": 900,
                             "num_workers": 3,
                         })
        ppo.train()
        self.assertEqual(ppo.optimizer.num_steps_sampled, 1000)
        ppo.stop()

        # Check in vectorized mode
        ppo = PPOTrainer(env="CartPole-v0",
                         config={
                             "sample_batch_size": 200,
                             "num_envs_per_worker": 2,
                             "train_batch_size": 900,
                             "num_workers": 3,
                         })
        ppo.train()
        self.assertEqual(ppo.optimizer.num_steps_sampled, 1200)
        ppo.stop()
Example #2
    iteration = 22
    improved = 0
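    # Rebuild the trainer on each outer iteration; the inner loop trains and checkpoints whenever episode_reward_mean improves on best_val.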
    while True:
        trainer = PPOTrainer(env="fire_mage", config=rnn_config)
        print(dir(trainer))
        #trainer.restore('./checkpoints_flush/checkpoint_379/checkpoint-379')

        step = 0
        best_val = 0.0
        if False:
            save_0 = trainer.save_to_object()
        while True:
            if False:
                save_0 = trainer.save_to_object()
                result = trainer.train()
                while result['episode_reward_mean'] > best_val:
                    print('UPENING')
                    best_save = deepcopy(save_0)
                    best_val = result['episode_reward_mean']
                    save_0 = trainer.save_to_object()
                    trainer.save('./checkpoints_flush')
                    result = trainer.train()
                print('REVERTING')
                trainer.restore_from_object(best_save)
            else:
                result = trainer.train()
                if result['episode_reward_mean'] > best_val:
                    improved = step
                    best_val = result['episode_reward_mean']
                    trainer.save('./checkpoints_iter_' + str(iteration))
Example #3
    # You should see both the printed X and Y approach 200 as this trains:
    # info:
    #   policy_reward_mean:
    #     dqn_policy: X
    #     ppo_policy: Y
    for i in range(args.stop_iters):
        print("== Iteration", i, "==")

        # improve the DQN policy
        print("-- DQN --")
        result_dqn = dqn_trainer.train()
        print(pretty_print(result_dqn))

        # improve the PPO policy
        print("-- PPO --")
        result_ppo = ppo_trainer.train()
        print(pretty_print(result_ppo))

        # Test passed gracefully.
        if args.as_test and \
                result_dqn["episode_reward_mean"] > args.stop_reward and \
                result_ppo["episode_reward_mean"] > args.stop_reward:
            print("test passed (both agents above requested reward)")
            quit(0)

        # swap weights to synchronize
        dqn_trainer.set_weights(ppo_trainer.get_weights(["ppo_policy"]))
        ppo_trainer.set_weights(dqn_trainer.get_weights(["dqn_policy"]))

    # Desired reward not reached.
    if args.as_test:
        raise ValueError(
            "Desired reward ({}) not reached!".format(args.stop_reward))
Example #4
        # Note that we do not immediately return value, but rather save it for `value_function`
        model_out, self._value = self.base_model(input_dict["obs"])
        # l = np.array([last_r])
        # if l.shape == (1,):
        #     l = l.reshape((1, 1))
        return model_out, state

    def value_function(self):
        return self._value


ModelCatalog.register_custom_model("image-ppo", RLLibPPOCritic)

ray.init()
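# Build a PyTorch PPO trainer that uses the custom "image-ppo" model registered above.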
trainer = PPOTrainer(
    env="CartPole-v0",
    config={
        "framework": "torch",
        "model": {
            "custom_model": "image-ppo",
        },
    }
)

plot = plotter.Plotter('ppo_cartpole')
for epoch in range(10):
    results = trainer.train()
    plot.add_results(results)

plot.plot(title='PPO CartPole-v0')
Example #5


trainer_config = DEFAULT_CONFIG.copy()
trainer_config['num_workers'] = 1
trainer_config["train_batch_size"] = 400
trainer_config["sgd_minibatch_size"] = 64
trainer_config["num_sgd_iter"] = 10




trainer = PPOTrainer(trainer_config, SIR)
for i in range(200):
    print("Training iteration {}...".format(i))
    trainer.train()



env = SIR()
state = env.reset()

done = False
#max_state = -1
cumulative_reward = 0

total_states = list()
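# Roll out the trained policy until the episode terminates.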
while not done:
    action = trainer.compute_action(state)
    state, reward, done, results = env.step(action)
    #max_state = max(max_state, state)
Example #6
    def test_local(self):
        cf = DEFAULT_CONFIG.copy()
        for _ in framework_iterator(cf):
            agent = PPOTrainer(cf, "CartPole-v0")
            print(agent.train())
            agent.stop()
Example #7
    def testLocal(self):
        ray.init(local_mode=True)
        cf = DEFAULT_CONFIG.copy()
        agent = PPOTrainer(cf, "CartPole-v0")
        print(agent.train())
Example #8
    number_of_stochastic_moves = 5

    logdir = str(trainer_obj._logdir)
    additional_metrics = {"additional_metrics": {}}
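    # Write custom TensorBoard summaries into the trainer's own log directory.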
    file_writer = tf.summary.create_file_writer(logdir)
    file_writer.set_as_default()

    for epoch in tqdm(range(best_ckpt + 1, epochs)):
        print("Epoch " + str(epoch))
        # When we call train() we update the weights, but the reported
        # win_rate still refers to the previous weights that were used to
        # collect the rollouts.
        # get_weights() could return a reference, in which case prev_weights
        # would change when the policy is updated internally (tested:
        # prev_weights does not change).
        prev_weights = trainer_obj.get_policy("player1").get_weights()
        results = trainer_obj.train()

        # Check whether the updated weight arrays differ from the previous ones
        #updated_weights = trainer_obj.get_policy("player1").get_weights()
        # print("there are " + str(len(prev_weights)) + " weights")
        # indx = 0
        # equal_weights = []
        # for w1, w2 in zip(prev_weights,updated_weights):
        #     if np.array_equal(w1, w2):
        #         equal_weights.append(indx)
        #     indx += 1
        # print(equal_weights)
        # input("Press Enter...")

        player1_win_rate = results["custom_metrics"]["player1_win_rate"]
        # instead of score_diff we use the win_ratio
Example #9
    def test_basic(self):
        ppo = PPOTrainer(env="CartPole-v0",
                         config={"lr_schedule": [[0, 1e-5], [1000, 0.0]]})
        for _ in range(10):
            result = ppo.train()
        assert result["episode_reward_mean"] < 100, "should not have learned"
Example #10
                "interaction_hidden_size": 4,
            },
        },
        "clip_actions": True,
        "framework": "torch",
        "num_sgd_iter": 3,
        "lr": 1e-4,
        #"kl_target": 0.03,
        "no_done_at_end": False,
        "soft_horizon": True,
        "train_batch_size": 100,
        "rollout_fragment_length": 100,
        "sgd_minibatch_size": 32
    }

    trainer = PPOTrainer(env="negotiate_roborobo", config=config)
    print(trainer.config.get('no_final_linear'))
    print('model built')
    stop_iter = 2000

    #%%
    import numpy as np
    for i in range(stop_iter):
        print("== Iteration", i, "==")
        result_ppo = trainer.train()
        if (i + 1) % 1 == 0:
            trainer.save('model_nego')
    trainer.save('model_nego')
    del trainer
    ray.shutdown()
Example #11
import ray
from ray.rllib.agents.ppo import PPOTrainer, DEFAULT_CONFIG
from ray.tune.logger import pretty_print
#ray.shutdown()
ray.init(num_cpus=4, ignore_reinit_error=True, log_to_driver=False)
config = DEFAULT_CONFIG.copy()
config['num_workers'] = 1
config['num_sgd_iter'] = 30
config['sgd_minibatch_size'] = 128
config['model']['fcnet_hiddens'] = [100, 100]
# This avoids running out of resources in the notebook environment when this cell is re-executed.
config['num_cpus_per_worker'] = 0

agent1 = PPOTrainer(config, 'CartPole-v0')
for i in range(2):
    result = agent1.train()
    print(pretty_print(result))

config2 = DEFAULT_CONFIG.copy()
config2['num_workers'] = 4
config2['num_sgd_iter'] = 30
config2['sgd_minibatch_size'] = 128
config2['model']['fcnet_hiddens'] = [100, 100]
config2['num_cpus_per_worker'] = 0

agent2 = PPOTrainer(config2, 'CartPole-v0')
for i in range(2):
    result = agent2.train()
    print(pretty_print(result))

checkpoint_path = agent2.save()
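
# A minimal follow-up sketch (assuming config2 is unchanged): the checkpoint path
# returned by save() can be restored into a freshly built trainer.
agent3 = PPOTrainer(config2, 'CartPole-v0')
agent3.restore(checkpoint_path)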
Example #12
    def execute(self):
        timesteps = 0
        best_period_value = None

        if self.pr.agent.name() == "A2C":
            trainer = A2CTrainer(config=self.rllib_config,
                                 logger_creator=rllib_logger_creator)
        elif self.pr.agent.name() == "PPO":
            trainer = PPOTrainer(config=self.rllib_config,
                                 logger_creator=rllib_logger_creator)
            # import pdb; pdb.set_trace()
        else:
            raise ValueError('There is no rllib trainer with name ' +
                             self.pr.agent.name())

        tf_writer = SummaryWriter(
            self.pr.save_logs_to) if self.pr.save_logs_to else None
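        # tf_writer mirrors metrics to TensorBoard when a log directory is configured.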

        reward_metric = Metric(short_name='rews',
                               long_name='trajectory reward',
                               formatting_string='{:5.1f}',
                               higher_is_better=True)
        time_step_metric = Metric(short_name='steps',
                                  long_name='total number of steps',
                                  formatting_string='{:5.1f}',
                                  higher_is_better=True)

        metrics = [reward_metric, time_step_metric]

        if self.pr.train:
            start_time = time.time()
            policy_save_tag = 0
            while timesteps < self.pr.total_steps:

                result = trainer.train()

                timesteps = result["timesteps_total"]
                reward_metric.log(result['evaluation']['episode_reward_mean'])
                time_step_metric.log(result['evaluation']['episode_len_mean'])
                # import pdb; pdb.set_trace()
                # # Get a metric list from each environment.
                # if hasattr(trainer, "evaluation_workers"):
                #     metric_lists = sum(trainer.evaluation_workers.foreach_worker(lambda w: w.foreach_env(lambda e: e.metrics)), [])
                # else:
                #     metric_lists = sum(trainer.workers.foreach_worker(lambda w: w.foreach_env(lambda e: e.metrics)), [])

                # metrics = metric_lists[0]

                # # Aggregate metrics from all other environments.
                # for metric_list in metric_lists[1:]:
                #     for i, metric in enumerate(metric_list):
                #         metrics[i]._values.extend(metric._values)

                save_logs_to = self.pr.save_logs_to
                model_save_paths_dict = self.pr.model_save_paths_dict
                # Consider whether to save a model.
                saved = False
                if model_save_paths_dict is not None and metrics[
                        0].currently_optimal:
                    # trainer.get_policy().model.save(model_save_paths_dict)
                    policy_save_tag += 1
                    trainer.get_policy().model.save_model_in_progress(
                        model_save_paths_dict, policy_save_tag)
                    saved = True

                # Write the metrics for this reporting period.
                total_seconds = time.time() - start_time
                logger.write_and_condense_metrics(total_seconds, 'iters',
                                                  timesteps, saved, metrics,
                                                  tf_writer)

                # Clear the metrics, both those maintained by the training workers and by the evaluation ones.
                condense_fn = lambda environment: [
                    m.condense_values() for m in environment.metrics
                ]
                trainer.workers.foreach_worker(
                    lambda w: w.foreach_env(condense_fn))
                if hasattr(trainer, "evaluation_workers"):
                    trainer.evaluation_workers.foreach_worker(
                        lambda w: w.foreach_env(condense_fn))

        else:
            start_time = time.time()
            env = trainer.workers.local_worker().env
            metrics = env.metrics
            worker = trainer.workers.local_worker()
            steps = steps_since_report = 0

            while True:
                batch = worker.sample()
                current_steps = len(batch["obs"])
                steps += current_steps
                steps_since_report += current_steps

                if steps_since_report >= self.pr.reporting_interval:
                    total_seconds = time.time() - start_time

                    # Write the metrics for this reporting period.
                    logger.write_and_condense_metrics(total_seconds, 'iters',
                                                      steps, False, metrics,
                                                      tf_writer)

                    steps_since_report = 0
                    if steps >= self.pr.total_steps:
                        break

            env.close()

        # Get a summary metric for the entire stage, based on the environment's first metric.
        summary_metric = logger.summarize_stage(metrics[0])

        # Temporary workaround for https://github.com/ray-project/ray/issues/8205
        ray.shutdown()
        _register_all()

        return summary_metric
Example #13
def main():
    ray.init()
    logging.getLogger().setLevel(logging.INFO)
    date = datetime.now().strftime('%Y%m%d_%H%M%S')
    parser = argparse.ArgumentParser()
    # parser.add_argument('--scenario', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--config',
                        type=str,
                        default='config/global_config.json',
                        help='config file')
    parser.add_argument('--algo',
                        type=str,
                        default='PPO',
                        choices=['PPO', 'DQN', 'DDQN', 'DuelDQN'],
                        help='choose an algorithm')
    parser.add_argument('--inference',
                        action="store_true",
                        help='inference or training')
    parser.add_argument('--ckpt', type=str, help='checkpoint path to restore from')
    parser.add_argument('--epoch',
                        type=int,
                        default=10,
                        help='number of training epochs')
    parser.add_argument(
        '--num_step',
        type=int,
        default=10**3,
        help='number of timesteps for one episode, and for inference')
    parser.add_argument('--save_freq',
                        type=int,
                        default=100,
                        help='model saving frequency')
    parser.add_argument('--batch_size',
                        type=int,
                        default=128,
                        help='training batch size')
    parser.add_argument('--state_time_span',
                        type=int,
                        default=5,
                        help='state interval to receive long term state')
    parser.add_argument('--time_span',
                        type=int,
                        default=30,
                        help='time interval to collect data')

    args = parser.parse_args()

    config_env = env_config(args)
    # ray.tune.register_env('gym_cityflow', lambda env_config:CityflowGymEnv(config_env))

    config_agent = agent_config(config_env)

    # # build cityflow environment

    trainer = PPOTrainer(env=CityflowGymEnv, config=config_agent)
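    # Train for up to 1000 iterations, saving a checkpoint every 30th iteration.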
    for i in range(1000):
        # Perform one iteration of training the policy with PPO
        result = trainer.train()
        print(pretty_print(result))

        if i % 30 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)
Example #14
class KandboxAgentRLLibPPO(KandboxAgentPlugin):
    title = "Kandbox Plugin - Agent - realtime - by rllib ppo"
    slug = "ri_agent_rl_ppo"
    author = "Kandbox"
    author_url = "https://github.com/qiyangduan"
    description = "RLLibPPO for GYM for RL."
    version = "0.1.0"
    default_config = {
        "nbr_of_actions": 4,
        "n_epochs": 1000,
        "nbr_of_days_planning_window": 6,
        "model_path": "default_model_path",
        "working_dir": "/tmp",
        "checkpoint_path_key": "ppo_checkpoint_path",
    }
    config_form_spec = {
        "type": "object",
        "properties": {},
    }

    def __init__(self, agent_config, kandbox_config):
        self.agent_config = agent_config
        self.current_best_episode_reward_mean = -99

        env_config = agent_config["env_config"]

        if "rules_slug_config_list" not in env_config.keys():
            if "rules" not in env_config.keys():
                log.error("no rules_slug_config_list and no rules ")
            else:
                env_config["rules_slug_config_list"] = [
                    [rule.slug, rule.config] for rule in env_config["rules"]
                ]
                env_config.pop("rules", None)

        # self.env_class = env_class = agent_config["env"]

        self.kandbox_config = self.default_config.copy()
        self.kandbox_config.update(kandbox_config)

        # self.trained_model = trained_model
        self.kandbox_config["create_datetime"] = datetime.now()

        # self.trainer = None
        self.env_config = env_config
        # self.load_model(env_config=self.env_config)
        print(
            f"KandboxAgentRLLibPPO __init__ called, at time {self.kandbox_config['create_datetime']}"
        )
        # import pdb

        # pdb.set_trace()
        if not ray.is_initialized():
            ray.init(ignore_reinit_error=True, log_to_driver=False)
        # ray.init(redis_address="localhost:6379")

    def build_model(self):

        trainer_config = DEFAULT_CONFIG.copy()

        trainer_config["num_workers"] = 0
        # trainer_config["train_batch_size"] = 640
        # trainer_config["sgd_minibatch_size"] = 160
        # trainer_config["num_sgd_iter"] = 100

        trainer_config["exploration_config"] = {
            "type": "Random",
        }
        # EpsilonGreedy(Exploration):
        # trainer_config["exploration_config"] = {
        #         "type": "Curiosity",
        #         "eta": 0.2,
        #         "lr": 0.001,
        #         "feature_dim": 128,
        #         "feature_net_config": {
        #             "fcnet_hiddens": [],
        #             "fcnet_activation": "relu",
        #         },
        #         "sub_exploration": {
        #             "type": "StochasticSampling",
        #         }
        #     }

        # trainer_config["log_level"] = "DEBUG"
        """
        if env_config is not None:
            for x in env_config.keys():
                trainer_config[x] = env_config[x]
        """

        # trainer_config["env_config"] = copy.deepcopy(env_config)  #  {"rules": "qiyang_role"}

        trainer_config.update(self.agent_config)

        self.trainer = PPOTrainer(trainer_config, self.agent_config["env"])
        # self.config["trainer"] = self.trainer
        return self.trainer

    def load_model(self):  # , allow_empty = None
        env_config = self.agent_config["env_config"]
        self.trainer = self.build_model()

        # if (model_path is not None) & (os.path.exists(model_path)):
        if "ppo_checkpoint_path" in env_config.keys():
            # raise FileNotFoundError("can not find model at path: {}".format(model_path))
            if os.path.exists(env_config["ppo_checkpoint_path"]):
                self.trainer.restore(env_config["ppo_checkpoint_path"])
                print("Reloaded model from path: {} ".format(
                    env_config["ppo_checkpoint_path"]))

            else:
                print(
                    "Env_config has ppo_checkpoint_path = {}, but no files found. I am returning an initial model"
                    .format(env_config["ppo_checkpoint_path"]))

        else:
            print(
                "Env_config has no ppo_checkpoint_path, returning an initial model"
            )
        # self.config["model_path"] = model_path
        # self.config["trainer"] = self.trainer
        # self.config["policy"] = self.trainer.workers.local_worker().get_policy()
        self.policy = self.trainer.workers.local_worker().get_policy()
        return self.trainer

    def train_model(self):

        # self.trainer = self.build_model()
        for i in range(self.kandbox_config["n_epochs"]):
            result = self.trainer.train()
            # print(pretty_print(result))
            print(
                "Finished training iteration {}, Result: episodes_this_iter:{}, policy_reward_max: {}, episode_reward_max {}, episode_reward_mean {}, info.num_steps_trained: {}..."
                .format(
                    i,
                    result["episodes_this_iter"],
                    result["policy_reward_max"],
                    result["episode_reward_max"],
                    result["episode_reward_mean"],
                    result["info"]["num_steps_trained"],
                ))
            if result[
                    "episode_reward_mean"] > self.current_best_episode_reward_mean * 1.1:
                model_path = self.save_model()
                print(
                    "Model is saved after 10 percent increase, episode_reward_mean = {},  file = {}"
                    .format(result["episode_reward_mean"], model_path))
                self.current_best_episode_reward_mean = result[
                    "episode_reward_mean"]

        return self.save_model()

    def save_model(self):

        checkpoint_dir = "{}/model_checkpoint_org_{}_team_{}".format(
            self.agent_config["env_config"]["working_dir"],
            self.agent_config["env_config"]["org_code"],
            self.agent_config["env_config"]["team_id"],
        )
        _path = self.trainer.save(checkpoint_dir=checkpoint_dir)

        # exported_model_dir = "{}/exported_ppo_model_org_{}_team_{}".format(
        #     self.agent_config["env_config"]["working_dir"], self.agent_config["env_config"]["org_code"], self.agent_config["env_config"]["team_id"]
        # )
        # self.trainer.get_policy().export_model(exported_model_dir + "/1")

        return _path  # self.trainer

    def predict_action(self, observation=None):

        action = self.trainer.compute_action(observation)
        return action

    def predict_action_list(self, env=None, job_code=None, observation=None):
        actions = []
        if env is not None:
            self.env = env
        else:
            env = self.env

        if job_code is None:
            job_i = env.current_job_i
        else:
            job_i = env.jobs_dict[job_code].job_index

        observation = env._get_observation()

        # export_dir = "/Users/qiyangduan/temp/kandbox/exported_ppo_model_org_duan3_team_3/1"
        # loaded_policy = tf.saved_model.load(export_dir)
        # loaded_policy.signatures["serving_default"](observations=observation)

        predicted_action = self.trainer.compute_action(observation)
        # V predicted_action = self.policy.compute_action(observation)

        for _ in range(len(env.workers)):  # hist_job_workers_ranked:
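            # Build up to nbr_of_actions candidate actions by repeatedly zeroing out the currently highest-scoring worker entry.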
            if len(actions) >= self.kandbox_config["nbr_of_actions"]:
                return actions
            actions.append(list(predicted_action).copy())
            max_i = np.argmax(predicted_action[0:len(env.workers)])
            predicted_action[max_i] = 0

        return actions

    def predict_action_dict_list(self,
                                 env=None,
                                 job_code=None,
                                 observation=None):
        if env is not None:
            self.env = env
        else:
            env = self.env

        curr_job = copy.deepcopy(env.jobs_dict[job_code])

        if job_code is None:
            job_i = env.current_job_i
        else:
            job_i = curr_job.job_index
            env.current_job_i = job_i

        observation = env._get_observation()

        action = self.predict_action(observation=observation)
        action_dict = env.decode_action_into_dict_native(action=action)

        action_day = int(action_dict.scheduled_start_minutes / 1440)
        curr_job.requested_start_min_minutes = action_day * 1440
        curr_job.requested_start_max_minutes = (action_day + 1) * 1440

        action_dict_list = self.env.recommendation_server.search_action_dict_on_worker_day(
            a_worker_code_list=action.scheduled_worker_codes,
            curr_job=curr_job,
            max_number_of_matching=3,
        )
        return action_dict_list
Example #15
def fulltest(total_trials,
             training_trials,
             d,
             m,
             q,
             train_check,
             evaluation_trials=5000,
             lr=0.00005,
             num_workers=4,
             num_gpus=0.25,
             SDP=True,
             LG=False,
             local_SDP=False,
             dep=True,
             rngvec=np.ones(1000)):
    quantization = 20
    separable = True
    bigvec = np.zeros((total_trials, int(training_trials / train_check) + 1))
    vec_SDP = []
    vec_local_SDP = []
    vec_LG = []

    for j in range(total_trials):
        print("Starting round", j, "of", total_trials)
        rho, _ = qsdl.generate_initial_state(d,
                                             m,
                                             rng=rngvec[j],
                                             depolarized=dep)

        if local_SDP == True:
            lg = max_SDP_sim_order(q, rho, len(d), 1250, d)
            vec_local_SDP.append(lg)
            print("local SDP-based")
            print(lg)
        if SDP == True:
            sdpr = sdp.SDP(rho, q, len(d))
            vec_SDP.append(sdpr)
            print("SDP")
            print(sdpr)
        if LG == True:
            lg = LG_sim_order(copy.copy(q), copy.copy(rho), len(d), 2500, d)
            vec_LG.append(lg)
            print("LG")
            print(lg)

        print("RLNN: ")
        print(bigvec[-1])
        defaultconfig = {
            "rho": copy.copy(rho),
            "q": copy.copy(q),
            "quantization": quantization,
            "d": d,
            "separable": True
        }
        vec = []
        ray.shutdown()
        ray.init(**ray_init_kwargs)
        config = ppo.DEFAULT_CONFIG.copy()
        if num_gpus > 0:
            config["num_gpus"] = num_gpus
        config["num_workers"] = num_workers
        config["lr"] = lr
        config["train_batch_size"] = 8000
        config["num_sgd_iter"] = 5
        config["env_config"] = defaultconfig
        trainer = Trainer(config=config, env=qsdl.QSDEnv)
        for i in range(training_trials):
            result = trainer.train()
            print("train iteration", i + 1, "/", training_trials,
                  " avg_reward =", result["episode_reward_mean"],
                  " timesteps =", result["timesteps_total"])
            #         if i % check == check-1:
            #             checkpoint = trainer.save()
            #             print("checkpoint saved at", checkpoint)
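            # Periodically evaluate the current policy over evaluation_trials fresh episodes and record the average reward.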
            if i == 0 or (i + 1) % train_check == 0:
                rew = 0
                for _ in range(evaluation_trials):
                    env = qsdl.QSDEnv(defaultconfig)
                    obs = env.reset()
                    done = False
                    while not done:
                        action = trainer.compute_action(obs)
                        obs, r, done, _ = env.step(action)
                        rew += r
                vec.append(rew / evaluation_trials)
        bigvec[j] = vec
    return bigvec, vec_SDP, vec_local_SDP, vec_LG
Example #16
            "policy_mapping_fn": policy_mapping_fn,
        },
        "framework":
        "tf",
    }

    # Create the Trainer used for Policy serving.
    trainer = PPOTrainer(env="fake_unity", config=config)

    # Attempt to restore from checkpoint if possible.
    checkpoint_path = CHECKPOINT_FILE.format(args.env)
    if not args.no_restore and os.path.exists(checkpoint_path):
        checkpoint_path = open(checkpoint_path).read()
        print("Restoring from checkpoint path", checkpoint_path)
        trainer.restore(checkpoint_path)

    # Serving and training loop.
    count = 0
    while True:
        # Calls to train() will block on the configured `input` in the Trainer
        # config above (PolicyServerInput).
        print(trainer.train())
        if count % args.checkpoint_freq == 0:
            print("Saving learning progress to checkpoint file.")
            checkpoint = trainer.save()
            # Write the latest checkpoint location to CHECKPOINT_FILE,
            # so we can pick up from the latest one after a server re-start.
            with open(checkpoint_path, "w") as f:
                f.write(checkpoint)
        count += 1
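
    # Rough client-side counterpart (hypothetical; the address/port are placeholders,
    # not part of this snippet): an external env would connect to the
    # PolicyServerInput above via a PolicyClient, e.g.:
    #   from ray.rllib.env.policy_client import PolicyClient
    #   client = PolicyClient("http://localhost:9900", inference_mode="remote")
    #   eid = client.start_episode(training_enabled=True)
    #   action = client.get_action(eid, obs)
    #   client.log_returns(eid, reward)
    #   client.end_episode(eid, obs)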
Example #17
    "sample_batch_size": 20,
    "sgd_minibatch_size": 500,
    "num_sgd_iter": 10,
    "num_workers": 1,  # 32
    "num_envs_per_worker": 1,  #5
    "num_gpus": 1,
    "model": {
        "dim": 64
    }
})


def env_creator(env_config):
    return PodWorldEnv(max_steps=10000, reward_factor=10000.0)


register_env("podworld_env", env_creator)
agent = PPOTrainer(config=config, env="podworld_env")
agent_save_path = None

for i in range(50):
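    # Train for 50 iterations; every 5th iteration save a checkpoint and remember the first save path.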
    stats = agent.train()
    # print(pretty_print(stats))
    if i % 5 == 0 and i > 0:
        path = agent.save()
        if agent_save_path is None:
            agent_save_path = path
            print('Saved agent at', agent_save_path)
    logger.write((i, stats['episode_reward_min']))
    print('episode_reward_min', stats['episode_reward_min'])
Example #18
def train_func():
    default_config = {
        'env': 'JSSEnv:jss-v1',
        'seed': 0,
        'framework': 'tf',
        'log_level': 'WARN',
        'num_gpus': 1,
        'instance_path': 'instances/ta41',
        'evaluation_interval': None,
        'metrics_smoothing_episodes': 2000,
        'gamma': 1.0,
        'num_workers': mp.cpu_count(),
        'layer_nb': 2,
        'train_batch_size': mp.cpu_count() * 4 * 704,
        'num_envs_per_worker': 4,
        'rollout_fragment_length': 704,  # TO TUNE
        'sgd_minibatch_size': 33000,
        'layer_size': 319,
        'lr': 0.0006861,  # TO TUNE
        'lr_start': 0.0006861,  # TO TUNE
        'lr_end': 0.00007783,  # TO TUNE
        'clip_param': 0.541,  # TO TUNE
        'vf_clip_param': 26,  # TO TUNE
        'num_sgd_iter': 12,  # TO TUNE
        "vf_loss_coeff": 0.7918,
        "kl_coeff": 0.496,
        'kl_target': 0.05047,  # TO TUNE
        'lambda': 1.0,
        'entropy_coeff': 0.0002458,  # TUNE LATER
        'entropy_start': 0.0002458,
        'entropy_end': 0.002042,
        'entropy_coeff_schedule': None,
        "batch_mode": "truncate_episodes",
        "grad_clip": None,
        "use_critic": True,
        "use_gae": True,
        "shuffle_sequences": True,
        "vf_share_layers": False,
        "observation_filter": "NoFilter",
        "simple_optimizer": False,
        "_fake_gpus": False,
    }

    wandb.init(config=default_config)
    ray.init()
    tf.random.set_seed(0)
    np.random.seed(0)
    random.seed(0)

    config = wandb.config

    ModelCatalog.register_custom_model("fc_masked_model_tf",
                                       FCMaskedActionsModelTF)

    config['model'] = {
        "fcnet_activation": "relu",
        "custom_model": "fc_masked_model_tf",
        'fcnet_hiddens':
        [config['layer_size'] for k in range(config['layer_nb'])],
        "vf_share_layers": False,
    }
    config['env_config'] = {
        'env_config': {
            'instance_path': config['instance_path']
        }
    }

    config = with_common_config(config)
    config['seed'] = 0
    config['callbacks'] = CustomCallbacks
    config['train_batch_size'] = config['sgd_minibatch_size']

    config['lr'] = config['lr_start']
    config['lr_schedule'] = [[0, config['lr_start']],
                             [15000000, config['lr_end']]]

    config['entropy_coeff'] = config['entropy_start']
    config['entropy_coeff_schedule'] = [[0, config['entropy_start']],
                                        [15000000, config['entropy_end']]]

    config.pop('instance_path', None)
    config.pop('layer_size', None)
    config.pop('layer_nb', None)
    config.pop('lr_start', None)
    config.pop('lr_end', None)
    config.pop('entropy_start', None)
    config.pop('entropy_end', None)

    stop = {
        "time_total_s": 10 * 60,
    }

    start_time = time.time()
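    # Train until the 10-minute wall-clock budget is spent, logging each result to Weights & Biases.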
    trainer = PPOTrainer(config=config)
    while start_time + stop['time_total_s'] > time.time():
        result = trainer.train()
        result = wandb_tune._clean_log(result)
        log, config_update = _handle_result(result)
        wandb.log(log)
        # wandb.config.update(config_update, allow_val_change=True)
    # trainer.export_policy_model("/home/jupyter/JSS/JSS/models/")

    ray.shutdown()
Example #19
            "policies": policies,
            "policy_mapping_fn": lambda agent_id: "ppo_policy",
        },
        # "num_gpus": 0,
        # "num_gpus_per_worker": 0,
        "callbacks": PlayerScoreCallbacks
    })

if restore_checkpoint:
    trainer.restore(checkpoint_path)

start = time.time()

try:
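    # Main training loop; if anything interrupts it, the except clause below saves a final checkpoint.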
    for i in range(num_iter):
        res = trainer.train()
        print("Iteration {}. policy result: {}".format(i, res))
        if i % eval_every == 0:
            trainer_eval.set_weights(trainer.get_weights(["ppo_policy"]))
            res = trainer_eval.train()
        if i % checkpoint_every == 0:
            trainer.save()
except:
    trainer.save()

stop = time.time()
train_duration = time.strftime('%H:%M:%S', time.gmtime(stop - start))
print(
    'Training finished ({}), check the results in ~/ray_results/<dir>/'.format(
        train_duration))
Example #20
}

# analysis = tune.run(
#     "PPO",
#     stop={
#       "episode_reward_mean": 500000
#     },
#     config=trainer_config,
#     loggers=DEFAULT_LOGGERS + (WandbLogger, ),
#     checkpoint_at_end=True
# )

## debug code
ray.init(num_gpus=num_gpus, local_mode=True)
agent = PPOTrainer(env="TradingEnv", config=trainer_config)
agent.train()

# compute final reward
# ray.init(num_gpus=1, local_mode=False)
# env = build_env({
#     "window_size": 25
# })
# episode_reward = 0
# done = False
# obs = env.reset()
#
# while not done:
#     action = agent.compute_action(obs)
#     obs, reward, done, info = env.step(action)
#     episode_reward += reward
# print(f'reward: {episode_reward}')
Example #21
    ten_gig = 10737418240

    trainer = PPOTrainer(
        env="ic20env",
        config=merge_dicts(
            DEFAULT_CONFIG,
            {
                # -- Rollout-Worker
                'num_gpus': 1,
                'num_workers': 10,
                "num_envs_per_worker": 1,
                "num_cpus_per_worker": 1,
                "memory_per_worker": ten_gig,
                'gamma': 0.99,
                'lambda': 0.95
            }))

    # Attempt to restore from checkpoint if possible.
    if os.path.exists(CHECKPOINT_FILE):
        checkpoint_path = open(CHECKPOINT_FILE).read()
        print("Restoring from checkpoint path", checkpoint_path)
        trainer.restore(checkpoint_path)

    # Serving and training loop
    while True:
        print(pretty_print(trainer.train()))
        checkpoint_path = trainer.save()
        print("Last checkpoint", checkpoint_path)
        with open(CHECKPOINT_FILE, "w") as f:
            f.write(checkpoint_path)
Example #22
    def test_local(self):
        cf = DEFAULT_CONFIG.copy()
        agent = PPOTrainer(cf, "CartPole-v0")
        print(agent.train())
Example #23
    def test_simple_optimizer_sequencing(self):
        ModelCatalog.register_custom_model("rnn", RNNSpyModel)
        register_env("counter", lambda _: DebugCounterEnv())
        ppo = PPOTrainer(
            env="counter",
            config={
                "num_workers": 0,
                "rollout_fragment_length": 10,
                "train_batch_size": 10,
                "sgd_minibatch_size": 10,
                "num_sgd_iter": 1,
                "simple_optimizer": True,
                "model": {
                    "custom_model": "rnn",
                    "max_seq_len": 4,
                    "vf_share_layers": True,
                },
                "framework": "tf",
            },
        )
        ppo.train()
        ppo.train()

        batch0 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0")
        )
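        # RNNSpyModel stored the exact minibatch it received under "rnn_spy_in_0"; verify its sequence chunking, seq_lens, and initial RNN states.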
        self.assertEqual(
            batch0["sequences"].tolist(),
            [[[0], [1], [2], [3]], [[4], [5], [6], [7]], [[8], [9], [0], [0]]],
        )
        self.assertEqual(batch0[SampleBatch.SEQ_LENS].tolist(), [4, 4, 2])
        self.assertEqual(batch0["state_in"][0][0].tolist(), [0, 0, 0])
        self.assertEqual(batch0["state_in"][1][0].tolist(), [0, 0, 0])
        self.assertGreater(abs(np.sum(batch0["state_in"][0][1])), 0)
        self.assertGreater(abs(np.sum(batch0["state_in"][1][1])), 0)
        self.assertTrue(
            np.allclose(
                batch0["state_in"][0].tolist()[1:], batch0["state_out"][0].tolist()[:-1]
            )
        )
        self.assertTrue(
            np.allclose(
                batch0["state_in"][1].tolist()[1:], batch0["state_out"][1].tolist()[:-1]
            )
        )

        batch1 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1")
        )
        self.assertEqual(
            batch1["sequences"].tolist(),
            [
                [[10], [11], [12], [13]],
                [[14], [0], [0], [0]],
                [[0], [1], [2], [3]],
                [[4], [0], [0], [0]],
            ],
        )
        self.assertEqual(batch1[SampleBatch.SEQ_LENS].tolist(), [4, 1, 4, 1])
        self.assertEqual(batch1["state_in"][0][2].tolist(), [0, 0, 0])
        self.assertEqual(batch1["state_in"][1][2].tolist(), [0, 0, 0])
        self.assertGreater(abs(np.sum(batch1["state_in"][0][0])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][1][0])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][0][1])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][1][1])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][0][3])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][1][3])), 0)
Example #24
def main() -> None:
    ray.init()
    np.random.seed(0)

    # instructions = {
    #     0: [Instruction(time=0, x=5, y=5)],
    #     1: [Instruction(time=1, x=5, y=5), Instruction(time=1, x=1, y=5)],
    #     2: [Instruction(time=2, x=5, y=5, rng=np.random.default_rng())],
    # }
    # task = Task(
    #     target_x=1,
    #     target_y=5,
    #     instructions=instructions,
    #     tot_frames=4,
    #     width=42,
    #     height=42,
    # )

    # task = ODR(target_x=1, target_y=5, width=42, height=42)
    # task = Gap(target_x=1, target_y=5, width=42, height=42)
    task = ODRDistract(target_x=1, target_y=5, width=42, height=42)

    def env_creator(env_config):
        return Environment(env_config)  # return an env instance

    register_env("my_env", env_creator)

    # trainer_config = DEFAULT_CONFIG.copy()
    # trainer_config["num_workers"] = 1
    # trainer_config["train_batch_size"] = 20  # 100
    # trainer_config["sgd_minibatch_size"] = 15  # 32
    # trainer_config["num_sgd_iter"] = 50

    trainer = PPOTrainer(
        env="my_env",
        config={
            "env_config": {"task": task},
            "framework": "torch",
            "num_workers": 1,
            "train_batch_size": 10,
            "sgd_minibatch_size": 5,
            "num_sgd_iter": 10,
            # "model": {
            #     # Whether to wrap the model with an LSTM.
            #     "use_lstm": True,
            #     # Max seq len for training the LSTM, defaults to 20.
            #     "max_seq_len": task.tot_frames - 1,
            #     # # Size of the LSTM cell.
            #     "lstm_cell_size": task.tot_frames - 1,
            #     # # Whether to feed a_{t-1}, r_{t-1} to LSTM.
            #     # # "lstm_use_prev_action_reward": False,
            # },
        },
    )
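    # Note: the A2CTrainer below replaces the PPOTrainer constructed above; only the A2C trainer is trained in the loop that follows.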

    trainer = A2CTrainer(
        env="my_env",
        config={
            "env_config": {"task": task},
            "framework": "torch",
            "num_workers": 1,
            "train_batch_size": 10,
            # "model": {
            #     # Whether to wrap the model with an LSTM.
            #     "use_lstm": True,
            #     # Max seq len for training the LSTM, defaults to 20.
            #     "max_seq_len": task.tot_frames - 1,
            #     # # Size of the LSTM cell.
            #     "lstm_cell_size": task.tot_frames - 1,
            #     # # Whether to feed a_{t-1}, r_{t-1} to LSTM.
            #     # # "lstm_use_prev_action_reward": False,
            # },
        },
    )

    # trainer = DQNTrainer(
    #     env="my_env",
    #     config={
    #         "env_config": {"task": task},
    #         "framework": "torch",
    #         "num_workers": 1,
    #         "train_batch_size": 10,
    #         # "model": {
    #         #     # Whether to wrap the model with an LSTM.
    #         #     "use_lstm": True,
    #         #     # Max seq len for training the LSTM, defaults to 20.
    #         #     "max_seq_len": task.tot_frames - 1,
    #         #     # # Size of the LSTM cell.
    #         #     "lstm_cell_size": task.tot_frames - 1,
    #         #     # # Whether to feed a_{t-1}, r_{t-1} to LSTM.
    #         #     # # "lstm_use_prev_action_reward": False,
    #         # },
    #     },
    # )

    env = Environment(env_config={"task": task})

    for i in range(200):
        print(f"Training iteration {i}...")
        trainer.train()

        done = False
        cumulative_reward = 0.0
        observation = env.reset()

        while not done:
            action = trainer.compute_action(observation)

            observation, reward, done, results = env.step(action)
            print(f"Time: {env.time}. Action: {action}")
            cumulative_reward += reward
        print(
            f"Last step reward: {reward: .3e}; Cumulative reward: {cumulative_reward:.3e}"
        )
Example #25
    def test_minibatch_sequencing(self):
        ModelCatalog.register_custom_model("rnn", RNNSpyModel)
        register_env("counter", lambda _: DebugCounterEnv())
        ppo = PPOTrainer(
            env="counter",
            config={
                "shuffle_sequences": False,  # for deterministic testing
                "num_workers": 0,
                "rollout_fragment_length": 20,
                "train_batch_size": 20,
                "sgd_minibatch_size": 10,
                "vf_share_layers": True,
                "simple_optimizer": False,
                "num_sgd_iter": 1,
                "model": {
                    "custom_model": "rnn",
                    "max_seq_len": 4,
                    "state_shape": [3, 3],
                },
                "framework": "tf",
            })
        ppo.train()
        ppo.train()

        # first epoch: 20 observations get split into 2 minibatches of 8
        # four observations are discarded
        batch0 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0"))
        batch1 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1"))
        if batch0["sequences"][0][0][0] > batch1["sequences"][0][0][0]:
            batch0, batch1 = batch1, batch0  # sort minibatches
        self.assertEqual(batch0["seq_lens"].tolist(), [4, 4])
        self.assertEqual(batch1["seq_lens"].tolist(), [4, 3])
        self.assertEqual(batch0["sequences"].tolist(), [
            [[0], [1], [2], [3]],
            [[4], [5], [6], [7]],
        ])
        self.assertEqual(batch1["sequences"].tolist(), [
            [[8], [9], [10], [11]],
            [[12], [13], [14], [0]],
        ])

        # second epoch: 20 observations get split into 2 minibatches of 8
        # four observations are discarded
        batch2 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_2"))
        batch3 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_3"))
        if batch2["sequences"][0][0][0] > batch3["sequences"][0][0][0]:
            batch2, batch3 = batch3, batch2
        self.assertEqual(batch2["seq_lens"].tolist(), [4, 4])
        self.assertEqual(batch3["seq_lens"].tolist(), [2, 4])
        self.assertEqual(batch2["sequences"].tolist(), [
            [[5], [6], [7], [8]],
            [[9], [10], [11], [12]],
        ])
        self.assertEqual(batch3["sequences"].tolist(), [
            [[13], [14], [0], [0]],
            [[0], [1], [2], [3]],
        ])
Example #26
def train_poker_approx_best_response_nfsp(
        br_player,
        ray_head_address,
        scenario,
        general_trainer_config_overrrides,
        br_policy_config_overrides,
        get_stopping_condition,
        avg_policy_specs_for_players: Dict[int, StrategySpec],
        results_dir: str,
        trainer_class_override=None,
        br_policy_class_override=None,
        print_train_results: bool = True):
    env_class = scenario.env_class
    env_config = scenario.env_config

    other_player = 1 - br_player
    env_config["discrete_actions_for_players"] = [other_player]

    policy_classes: Dict[str, Type[Policy]] = scenario.policy_classes

    if br_policy_class_override is not None:
        policy_classes["best_response"] = br_policy_class_override

    get_trainer_config = scenario.get_trainer_config
    should_log_result_fn = scenario.ray_should_log_result_filter

    init_ray_for_scenario(scenario=scenario,
                          head_address=ray_head_address,
                          logging_level=logging.INFO)

    def log(message, level=logging.INFO):
        logger.log(level, message)

    def select_policy(agent_id):
        if agent_id == br_player:
            return "best_response"
        else:
            return "average_policy"

    tmp_env = env_class(env_config=env_config)

    all_discrete_action_env_config = env_config.copy()
    all_discrete_action_env_config["discrete_actions_for_players"] = [0, 1]
    all_discrete_action_tmp_env = env_class(env_config=all_discrete_action_env_config)

    avg_policy_model_config = get_trainer_config(
        all_discrete_action_tmp_env)["model"]

    from ray.rllib.agents.ppo import PPOTrainer, PPOTorchPolicy
    from grl.rl_apps.scenarios.trainer_configs.loss_game_configs import loss_game_psro_ppo_params

    br_trainer_config = {
        "log_level": "INFO",
        # "callbacks": None,
        "env": env_class,
        "env_config": env_config,
        "gamma": 1.0,
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        # "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "num_gpus": 0.0,
        "num_workers": 0,
        "num_gpus_per_worker": 0.0,
        "num_envs_per_worker": 1,
        "multiagent": {
            "policies_to_train": ["best_response"],
            "policies": {
                "average_policy":
                (policy_classes["average_policy"], tmp_env.observation_space,
                 tmp_env.discrete_action_space, {
                     "model": avg_policy_model_config,
                     "explore": False,
                 }),
                "best_response": (PPOTorchPolicy, tmp_env.observation_space,
                                  tmp_env.continuous_action_space, {}),
            },
            "policy_mapping_fn": select_policy,
        },
    }
    # br_trainer_config = merge_dicts(br_trainer_config, get_trainer_config(tmp_env))
    br_trainer_config = merge_dicts(br_trainer_config,
                                    loss_game_psro_ppo_params(tmp_env))

    br_trainer = PPOTrainer(config=br_trainer_config,
                            logger_creator=get_trainer_logger_creator(
                                base_dir=results_dir,
                                scenario_name="approx_br",
                                should_log_result_fn=should_log_result_fn))

    def _set_avg_policy(worker: RolloutWorker):
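        # Load the opponent's frozen average-policy strategy spec into this worker's "average_policy".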
        avg_policy = worker.policy_map["average_policy"]
        load_pure_strat(
            policy=avg_policy,
            pure_strat_spec=avg_policy_specs_for_players[1 - br_player])

    br_trainer.workers.foreach_worker(_set_avg_policy)

    br_trainer.latest_avg_trainer_result = None
    train_iter_count = 0

    stopping_condition: StoppingCondition = get_stopping_condition()

    max_reward = None
    while True:
        # Do a step (or several) in the main RL loop.
        train_iter_results = br_trainer.train()
        br_reward_this_iter = train_iter_results["policy_reward_mean"][
            "best_response"]

        if max_reward is None or br_reward_this_iter > max_reward:
            max_reward = br_reward_this_iter

        train_iter_count += 1
        if print_train_results:
            # Delete verbose debugging info before printing
            if "hist_stats" in train_iter_results:
                del train_iter_results["hist_stats"]
            if "td_error" in train_iter_results["info"]["learner"][
                    "best_response"]:
                del train_iter_results["info"]["learner"]["best_response"][
                    "td_error"]
            print(pretty_dict_str(train_iter_results))
            log(f"Trainer logdir is {br_trainer.logdir}")

        if stopping_condition.should_stop_this_iter(
                latest_trainer_result=train_iter_results):
            print("stopping condition met.")
            break

    return max_reward, None
Example #27
config['num_workers'] = 1
config['num_gpus'] = 1
config['framework'] = "torch"
config['gamma'] = 0.1

config['monitor'] = False

# PPO config ...
# config['lr'] = 1e-4
# config['train_batch_size']
config['model']['dim'] = 21
config['model']['conv_filters'] = [[8, [4, 4], 2], [16, [2, 2], 2],
                                   [512, [6, 6], 1]]  #,
#[config['train_batch_size'], 4, 1, 1]]

# trainer = PPOTrainer(config=config, env="mars_explorer:explorer-v01")
trainer = PPOTrainer(config=config, env="custom-explorer")
# import pdb; pdb.set_trace()

PATH = "/home/dkoutras/ray_results/290_out_of_400/checkpoint_2991/checkpoint-2991"
trainer.restore(PATH)
import pdb
pdb.set_trace()

for _ in range(10):
    initial_time = time.time()
    result = trainer.train()
    print(
        f"mean:{result['episode_reward_mean']} time:{time.time() - initial_time:.2f}[sec]"
    )
Example #28
import ray
from ray.rllib.agents.ppo import PPOTrainer

from rlcard.rllib_utils.model import ParametricActionsModel
from ray.rllib.models import ModelCatalog

from rlcard.rllib_utils.examples.envs.rps_env import RockPaperScissors
from ray.tune.registry import register_env

# Register env and model to be used by rllib
register_env("ParametricRPS", lambda _: RockPaperScissors())
ModelCatalog.register_custom_model("parametric_model_tf",
                                   ParametricActionsModel)

# Initialize ray
ray.init(num_cpus=4)

# Train the ParametricActionsModel on RockPaperScissors with PPO
ppo_trainer_config = {
    "env": "ParametricRPS",  # RockPaperScissors
    "model": {
        "custom_model": "parametric_model_tf",  # ParametricActionsModel,
    },
}
trainer = PPOTrainer(config=ppo_trainer_config)
for i in range(5):
    res = trainer.train()
    print("Iteration {}. episode_reward_mean: {}".format(
        i, res['episode_reward_mean']))

print('Training finished, check the results in ~/ray_results/<dir>/')
Example #29
import gym
import ray
from ray.rllib.agents.ppo import PPOTrainer, DEFAULT_CONFIG
from ray.tune.logger import pretty_print

ray.init(num_gpus=1)

config = DEFAULT_CONFIG.copy()
config['num_gpus'] = 1
config['num_workers'] = 1
config['num_sgd_iter'] = 30
config['sgd_minibatch_size'] = 128
config['model']['fcnet_hiddens'] = [100, 100]
# This avoids running out of resources in the notebook environment when this cell is re-executed.
config['num_cpus_per_worker'] = 0

agent = PPOTrainer(config, 'CartPole-v0')

for i in range(5):
    result = agent.train()
    print(pretty_print(result))
Example #30
    "training_iteration", "time_total_s", "timesteps_total",
    "episode_reward_max", "episode_reward_mean",
    [
        "info",
        [
            "sample_time_ms", "grad_time_ms", "opt_peak_throughput",
            "sample_peak_throughput"
        ]
    ]
]

try:
    result = {"timesteps_total": 0}
    while result["timesteps_total"] < timesteps_total:
        # Perform one iteration of training the policy
        result = train_agent.train()

        # Print the training status
        for field in results_fields_filter:
            if not isinstance(field, list):
                if field in result.keys():
                    print(f"{field}: {result[field]}")
            else:
                for subfield in field[1]:
                    if subfield in result[field[0]].keys():
                        print(f"{subfield} : {result[field[0]][subfield]}")
        print("============================")
except KeyboardInterrupt:
    print("Interrupting training...")
finally:
    checkpoint_path = train_agent.save()