Example #1
def run_gym(
    params,
    offline_train,
    score_bar,
    gpu_id,
    save_timesteps_to_dataset=None,
    start_saving_from_score=None,
    path_to_pickled_transitions=None,
):
    logger.info("Running gym with params")
    logger.info(params)
    rl_parameters = RLParameters(**params["rl"])

    env_type = params["env"]
    model_type = params["model_type"]

    epsilon, epsilon_decay, minimum_epsilon = create_epsilon(
        offline_train, rl_parameters, params
    )
    env = OpenAIGymEnvironment(
        env_type,
        epsilon,
        rl_parameters.softmax_policy,
        rl_parameters.gamma,
        epsilon_decay,
        minimum_epsilon,
    )
    replay_buffer = create_replay_buffer(
        env, params, model_type, offline_train, path_to_pickled_transitions
    )

    use_gpu = gpu_id != USE_CPU
    trainer = create_trainer(params["model_type"], params, rl_parameters, use_gpu, env)
    predictor = create_predictor(trainer, model_type, use_gpu, env.action_dim)

    c2_device = core.DeviceOption(
        caffe2_pb2.CUDA if use_gpu else caffe2_pb2.CPU, int(gpu_id)
    )
    return train(
        c2_device,
        env,
        offline_train,
        replay_buffer,
        model_type,
        trainer,
        predictor,
        "{} test run".format(env_type),
        score_bar,
        **params["run_details"],
        save_timesteps_to_dataset=save_timesteps_to_dataset,
        start_saving_from_score=start_saving_from_score,
    )
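A minimal usage sketch for the run_gym entry point above. The dictionary lists only the keys the function reads directly; all values are illustrative placeholders, and the trainer-specific keys expected by create_trainer and create_replay_buffer are not shown.

# Hypothetical parameter skeleton; values are placeholders, not the library's actual schema.
example_params = {
    "env": "CartPole-v0",   # any OpenAI Gym environment id
    "model_type": "DQN",    # assumed model-type string
    "rl": {},               # expanded into RLParameters(**params["rl"])
    "run_details": {},      # forwarded as **params["run_details"] to train()
}
# run_gym(example_params, offline_train=False, score_bar=195.0, gpu_id=USE_CPU)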
Example #2
def run_gym(
    params,
    use_gpu,
    score_bar,
    embed_rl_dataset: RLDataset,
    gym_env: Env,
    mdnrnn: MemoryNetwork,
    max_embed_seq_len: int,
):
    rl_parameters = RLParameters(**params["rl"])
    env_type = params["env"]
    model_type = params["model_type"]
    epsilon, epsilon_decay, minimum_epsilon = create_epsilon(
        offline_train=True, rl_parameters=rl_parameters, params=params)

    replay_buffer = OpenAIGymMemoryPool(params["max_replay_memory_size"])
    for row in embed_rl_dataset.rows:
        replay_buffer.insert_into_memory(**row)

    state_mem = np.array([m[0] for m in replay_buffer.replay_memory])
    state_min_value = np.amin(state_mem)
    state_max_value = np.amax(state_mem)
    state_embed_env = StateEmbedGymEnvironment(gym_env, mdnrnn,
                                               max_embed_seq_len,
                                               state_min_value,
                                               state_max_value)
    open_ai_env = OpenAIGymEnvironment(
        state_embed_env,
        epsilon,
        rl_parameters.softmax_policy,
        rl_parameters.gamma,
        epsilon_decay,
        minimum_epsilon,
    )
    rl_trainer = create_trainer(params["model_type"], params, rl_parameters,
                                use_gpu, open_ai_env)
    rl_predictor = create_predictor(rl_trainer, model_type, use_gpu,
                                    open_ai_env.action_dim)

    return train_gym_offline_rl(
        open_ai_env,
        replay_buffer,
        model_type,
        rl_trainer,
        rl_predictor,
        "{} offline rl state embed".format(env_type),
        score_bar,
        max_steps=params["run_details"]["max_steps"],
        avg_over_num_episodes=params["run_details"]["avg_over_num_episodes"],
        offline_train_epochs=params["run_details"]["offline_train_epochs"],
        bcq_imitator_hyper_params=None,
    )
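A standalone sketch of the state-bound computation above: the first element of every replay-memory entry is assumed to be the state, the entries are stacked, and the global min/max are handed to StateEmbedGymEnvironment.

import numpy as np

# Illustrative replay memory; each entry is a tuple whose first element is the state,
# matching the m[0] access in the comprehension above.
replay_memory = [(np.array([0.1, -0.2]), "rest of transition"),
                 (np.array([0.4, 0.3]), "rest of transition")]
state_mem = np.array([m[0] for m in replay_memory])
state_min_value = np.amin(state_mem)  # -0.2
state_max_value = np.amax(state_mem)  # 0.4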
Example #3
    def test_minibatches_per_step(self):
        _epochs = self.epochs
        self.epochs = 2
        rl_parameters = RLParameters(gamma=0.95,
                                     target_update_rate=0.9,
                                     maxq_learning=True)
        rainbow_parameters = RainbowDQNParameters(double_q_learning=True,
                                                  dueling_architecture=False)
        training_parameters1 = TrainingParameters(
            layers=self.layers,
            activations=self.activations,
            minibatch_size=1024,
            minibatches_per_step=1,
            learning_rate=0.25,
            optimizer="ADAM",
        )
        training_parameters2 = TrainingParameters(
            layers=self.layers,
            activations=self.activations,
            minibatch_size=128,
            minibatches_per_step=8,
            learning_rate=0.25,
            optimizer="ADAM",
        )
        env1 = Env(self.state_dims, self.action_dims)
        env2 = Env(self.state_dims, self.action_dims)
        model_parameters1 = DiscreteActionModelParameters(
            actions=env1.actions,
            rl=rl_parameters,
            rainbow=rainbow_parameters,
            training=training_parameters1,
        )
        model_parameters2 = DiscreteActionModelParameters(
            actions=env2.actions,
            rl=rl_parameters,
            rainbow=rainbow_parameters,
            training=training_parameters2,
        )
        # minibatch_size / 8, minibatches_per_step * 8 should give the same result
        logger.info("Training model 1")
        trainer1 = self._train(model_parameters1, env1)
        SummaryWriterContext._reset_globals()
        logger.info("Training model 2")
        trainer2 = self._train(model_parameters2, env2)

        weight1 = trainer1.q_network.fc.layers[-1].weight.detach().numpy()
        weight2 = trainer2.q_network.fc.layers[-1].weight.detach().numpy()

        # Due to numerical stability this tolerance has to be fairly high
        self.assertTrue(np.allclose(weight1, weight2, rtol=0.0, atol=1e-3))
        self.epochs = _epochs
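The two configurations above push the same number of samples through the optimizer per step (1024 * 1 = 128 * 8), which is why the final-layer weights are expected to agree. The closing assertion uses a purely absolute tolerance; a tiny sketch of that semantics:

import numpy as np

a = np.array([1.0000, 2.0000])
b = np.array([1.0004, 1.9994])
# With rtol=0.0 the relative term vanishes, so the check is |a - b| <= atol element-wise.
assert np.allclose(a, b, rtol=0.0, atol=1e-3)
assert not np.allclose(a, b, rtol=0.0, atol=1e-4)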
Example #4
    def test_trainer_maxq(self):
        environment = Gridworld()
        maxq_sarsa_parameters = DiscreteActionModelParameters(
            actions=environment.ACTIONS,
            rl=RLParameters(
                gamma=DISCOUNT,
                target_update_rate=0.5,
                reward_burnin=10,
                maxq_learning=True,
            ),
            training=TrainingParameters(
                layers=[-1, 1],
                activations=["linear"],
                minibatch_size=self.minibatch_size,
                learning_rate=0.01,
                optimizer="ADAM",
            ),
        )
        # construct the new trainer that uses maxq learning
        maxq_trainer = DiscreteActionTrainer(
            maxq_sarsa_parameters, environment.normalization
        )

        samples = environment.generate_samples(100000, 1.0)
        predictor = maxq_trainer.predictor()
        tdps = environment.preprocess_samples(samples, self.minibatch_size)
        evaluator = GridworldEvaluator(environment, True)

        evaluator.evaluate(predictor)
        print(
            "Pre-Training eval: ",
            evaluator.mc_loss[-1],
            evaluator.reward_doubly_robust[-1],
        )
        self.assertGreater(evaluator.mc_loss[-1], 0.3)

        for _ in range(5):
            for tdp in tdps:
                maxq_trainer.train_numpy(tdp, None)

        evaluator.evaluate(predictor)
        print(
            "Post-Training eval: ",
            evaluator.mc_loss[-1],
            evaluator.reward_doubly_robust[-1],
        )
        self.assertLess(evaluator.mc_loss[-1], 0.1)

        self.assertGreater(
            evaluator.reward_doubly_robust[-1], evaluator.reward_doubly_robust[-2]
        )
Example #5
    def test_trainer_maxq(self):
        env = Env(self.state_dims, self.action_dims)
        env.seed(42)
        maxq_parameters = DiscreteActionModelParameters(
            actions=env.actions,
            rl=RLParameters(
                gamma=0.99,
                target_update_rate=0.5,
                reward_burnin=10,
                maxq_learning=True
            ),
            training=TrainingParameters(
                layers=self.layers,
                activations=self.activations,
                minibatch_size=self.minibatch_size,
                learning_rate=0.01,
                optimizer='ADAM',
            )
        )
        maxq_trainer = DiscreteActionTrainer(
            maxq_parameters,
            env.normalization
        )
        # predictor = maxq_trainer.predictor()

        logger.info('Generating constant_reward MDPs..')

        states, actions, rewards, next_states, next_actions, is_terminal, \
            possible_next_actions, reward_timelines = \
            env.generate_samples_discrete(self.num_samples)

        logger.info('Preprocessing constant_reward MDPs..')

        tdps = env.preprocess_samples_discrete(states, actions, rewards, next_states,
            next_actions, is_terminal, possible_next_actions, reward_timelines,
            self.minibatch_size,)

        for epoch in range(self.epochs):
            logger.info('Training epoch %d', epoch)
            for tdp in tdps:
                maxq_trainer.train_numpy(tdp, None)
            logger.info(
                'Training epoch %d average q values %f td_loss %s',
                epoch,
                np.mean(workspace.FetchBlob(maxq_trainer.q_score_output)),
                workspace.FetchBlob(maxq_trainer.loss_blob),
            )

        # Q value should converge to very close to 100
        avg_q_value_after_training = np.mean(
            workspace.FetchBlob(maxq_trainer.q_score_output))

        self.assertLess(avg_q_value_after_training, 101)
        self.assertGreater(avg_q_value_after_training, 99)
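The Q values asserted on above are read out of the Caffe2 workspace by blob name. A minimal round-trip sketch of that mechanism, assuming caffe2 is importable; the blob name below is a placeholder, not the trainer's actual output blob.

import numpy as np
from caffe2.python import workspace

# Feed an array under a name, then fetch it back, as the test does with
# maxq_trainer.q_score_output and maxq_trainer.loss_blob.
workspace.FeedBlob("example_q_scores", np.array([99.7, 100.2], dtype=np.float32))
print(np.mean(workspace.FetchBlob("example_q_scores")))  # ~99.95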
Example #6
def main(params):
    # Set minibatch size based on # of devices being used to train
    params["shared_training"]["minibatch_size"] *= minibatch_size_multiplier(
        params["use_gpu"], params["use_all_avail_gpus"])

    rl_parameters = RLParameters(**params["rl"])
    training_parameters = DDPGTrainingParameters(**params["shared_training"])
    actor_parameters = DDPGNetworkParameters(**params["actor_training"])
    critic_parameters = DDPGNetworkParameters(**params["critic_training"])

    model_params = DDPGModelParameters(
        rl=rl_parameters,
        shared_training=training_parameters,
        actor_training=actor_parameters,
        critic_training=critic_parameters,
    )

    state_normalization = BaseWorkflow.read_norm_file(
        params["state_norm_data_path"])
    action_normalization = BaseWorkflow.read_norm_file(
        params["action_norm_data_path"])

    writer = SummaryWriter(log_dir=params["model_output_path"])
    logger.info("TensorBoard logging location is: {}".format(writer.log_dir))

    preprocess_handler = ContinuousPreprocessHandler(
        Preprocessor(state_normalization, False),
        Preprocessor(action_normalization, False),
        PandasSparseToDenseProcessor(),
    )

    workflow = ContinuousWorkflow(
        model_params,
        preprocess_handler,
        state_normalization,
        action_normalization,
        params["use_gpu"],
        params["use_all_avail_gpus"],
    )

    train_dataset = JSONDatasetReader(
        params["training_data_path"],
        batch_size=training_parameters.minibatch_size)
    eval_dataset = JSONDatasetReader(params["eval_data_path"], batch_size=16)

    with summary_writer_context(writer):
        workflow.train_network(train_dataset, eval_dataset,
                               int(params["epochs"]))
    return export_trainer_and_predictor(workflow.trainer,
                                        params["model_output_path"])  # noqa
Example #7
 def get_td3_parameters(self, use_2_q_functions=False):
     return TD3ModelParameters(
         rl=RLParameters(gamma=DISCOUNT, target_update_rate=0.01),
         training=TD3TrainingParameters(
             minibatch_size=self.minibatch_size,
             use_2_q_functions=use_2_q_functions,
             q_network_optimizer=OptimizerParameters(),
             actor_network_optimizer=OptimizerParameters(),
         ),
         q_network=FeedForwardParameters(layers=[128, 64],
                                         activations=["relu", "relu"]),
         actor_network=FeedForwardParameters(layers=[128, 64],
                                             activations=["relu", "relu"]),
     )
Example #8
    def setUp(self):
        super(self.__class__, self).setUp()
        np.random.seed(0)
        random.seed(0)

        self.state_dim, self.action_dim = 2, 3

        self._env = MockEnv(self.state_dim, self.action_dim)

        self._rl_parameters = RLParameters(
            gamma=0.9,
            target_update_rate=0.5,
            reward_burnin=10,
            maxq_learning=False,
        )
        self._rl_parameters_maxq = RLParameters(
            gamma=0.9,
            target_update_rate=0.5,
            reward_burnin=10,
            maxq_learning=True,
        )
        self._rl_parameters = ContinuousActionModelParameters(
            rl=self._rl_parameters,
            training=TrainingParameters(
                layers=[
                    -1, self._env.num_states * self._env.num_actions * 2, 1
                ],
                activations=['linear', 'linear'],
                minibatch_size=1024,
                learning_rate=0.01,
                optimizer='ADAM',
            ),
            knn=KnnParameters(model_type='DQN', ))
        self._trainer = ContinuousActionDQNTrainer(
            self._env.normalization, self._env.normalization_action,
            self._rl_parameters)
Example #9
 def get_sarsa_parameters(self):
     return ContinuousActionModelParameters(
         rl=RLParameters(gamma=DISCOUNT,
                         target_update_rate=1.0,
                         maxq_learning=False),
         training=TrainingParameters(
             layers=[-1, 256, 128, -1],
             activations=["relu", "relu", "linear"],
             minibatch_size=self.minibatch_size,
             learning_rate=0.05,
             optimizer="ADAM",
         ),
         rainbow=RainbowDQNParameters(double_q_learning=True,
                                      dueling_architecture=False),
     )
Example #10
 def get_sarsa_parameters(self):
     return ContinuousActionModelParameters(
         rl=RLParameters(
             gamma=DISCOUNT,
             target_update_rate=0.5,
             reward_burnin=10,
             maxq_learning=False,
         ),
         training=TrainingParameters(
             layers=[-1, 200, 1],
             activations=['linear', 'linear'],
             minibatch_size=1024,
             learning_rate=0.01,
             optimizer='ADAM',
         ),
         knn=KnnParameters(model_type='DQN', ))
Example #11
    def test_trainer_maxq(self):
        environment = Gridworld()
        maxq_sarsa_parameters = DiscreteActionModelParameters(
            actions=environment.ACTIONS,
            rl=RLParameters(gamma=DISCOUNT,
                            target_update_rate=0.5,
                            reward_burnin=10,
                            maxq_learning=True),
            training=TrainingParameters(
                layers=[-1, 1],
                activations=['linear'],
                minibatch_size=self.minibatch_size,
                learning_rate=0.01,
                optimizer='ADAM',
            ))
        # construct the new trainer that uses maxq learning
        maxq_trainer = DiscreteActionTrainer(
            maxq_sarsa_parameters,
            environment.normalization,
        )
        states, actions, rewards, next_states, next_actions, is_terminal,\
            possible_next_actions, reward_timelines = \
            environment.generate_samples(100000, 1.0)
        predictor = maxq_trainer.predictor()
        tdps = environment.preprocess_samples(
            states,
            actions,
            rewards,
            next_states,
            next_actions,
            is_terminal,
            possible_next_actions,
            reward_timelines,
            self.minibatch_size,
        )
        evaluator = GridworldEvaluator(environment, True)
        print("Pre-Training eval", evaluator.evaluate(predictor))
        self.assertGreater(evaluator.evaluate(predictor), 0.3)

        for _ in range(2):
            for tdp in tdps:
                maxq_trainer.stream_tdp(tdp, None)
            evaluator.evaluate(predictor)

        print("Post-Training eval", evaluator.evaluate(predictor))
        self.assertLess(evaluator.evaluate(predictor), 0.1)
Example #12
 def get_sarsa_trainer(self, environment):
     rl_parameters = RLParameters(gamma=DISCOUNT,
                                  target_update_rate=0.5,
                                  reward_burnin=10,
                                  maxq_learning=False)
     training_parameters = TrainingParameters(
         layers=[-1, 1],
         activations=['linear'],
         minibatch_size=1024,
         learning_rate=0.01,
         optimizer='ADAM',
     )
     return DiscreteActionTrainer(
         environment.normalization,
         DiscreteActionModelParameters(actions=environment.ACTIONS,
                                       rl=rl_parameters,
                                       training=training_parameters))
Example #13
def run_gym(
    params,
    score_bar,
    gpu_id,
    save_timesteps_to_dataset=None,
    start_saving_from_episode=0,
):

    # Caffe2 core uses the min of caffe2_log_level and minloglevel
    # to determine loglevel. See caffe2/caffe2/core/logging.cc for more info.
    core.GlobalInit(["caffe2", "--caffe2_log_level=2", "--minloglevel=2"])

    logger.info("Running gym with params")
    logger.info(params)
    rl_parameters = RLParameters(**params["rl"])

    env_type = params["env"]
    env = OpenAIGymEnvironment(
        env_type,
        rl_parameters.epsilon,
        rl_parameters.softmax_policy,
        rl_parameters.gamma,
    )
    replay_buffer = OpenAIGymMemoryPool(params["max_replay_memory_size"])
    model_type = params["model_type"]

    use_gpu = gpu_id != USE_CPU
    trainer = create_trainer(params["model_type"], params, rl_parameters, use_gpu, env)
    predictor = create_predictor(trainer, model_type, use_gpu)

    c2_device = core.DeviceOption(
        caffe2_pb2.CUDA if use_gpu else caffe2_pb2.CPU, gpu_id
    )
    return train_sgd(
        c2_device,
        env,
        replay_buffer,
        model_type,
        trainer,
        predictor,
        "{} test run".format(env_type),
        score_bar,
        **params["run_details"],
        save_timesteps_to_dataset=save_timesteps_to_dataset,
        start_saving_from_episode=start_saving_from_episode,
    )
Example #14
def train_network(params):
    logger.info("Running DQN workflow with params:")
    logger.info(params)

    action_names = np.array(params["actions"])
    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])

    trainer_params = DiscreteActionModelParameters(
        actions=params["actions"],
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
    )

    dataset = JSONDataset(params["training_data_path"],
                          batch_size=training_parameters.minibatch_size)
    norm_data = JSONDataset(params["state_norm_data_path"])
    state_normalization = read_norm_params(norm_data.read_all())

    num_batches = int(len(dataset) / training_parameters.minibatch_size)

    logger.info("Read in batch data set {} of size {} examples. Data split "
                "into {} batches of size {}.".format(
                    params["training_data_path"],
                    len(dataset),
                    num_batches,
                    training_parameters.minibatch_size,
                ))

    trainer = DQNTrainer(trainer_params, state_normalization,
                         params["use_gpu"])

    for epoch in range(params["epochs"]):
        for batch_idx in range(num_batches):
            helpers.report_training_status(batch_idx, num_batches, epoch,
                                           params["epochs"])
            batch = dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(action_names, batch,
                                                state_normalization)
            trainer.train(tdp)

    logger.info("Training finished. Saving PyTorch model to {}".format(
        params["pytorch_output_path"]))
    helpers.save_model_to_file(trainer, params["pytorch_output_path"])
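As in Example #6, a hedged skeleton of the params dictionary this DQN workflow reads; keys are taken from the lookups in train_network(), values are placeholders.

dqn_workflow_params = {
    "actions": ["action_A", "action_B"],    # discrete action names
    "rl": {},                               # -> RLParameters
    "training": {"minibatch_size": 1024},   # -> TrainingParameters
    "rainbow": {},                          # -> RainbowDQNParameters
    "training_data_path": "/path/to/training_data.json",
    "state_norm_data_path": "/path/to/state_norm.json",
    "use_gpu": False,
    "epochs": 10,
    "pytorch_output_path": "/path/to/model_output",
}
# train_network(dqn_workflow_params)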
Example #15
 def get_sarsa_parameters(self):
     return ContinuousActionModelParameters(
         rl=RLParameters(
             gamma=DISCOUNT,
             target_update_rate=1.0,
             reward_burnin=100,
             maxq_learning=False,
         ),
         training=TrainingParameters(
             layers=[-1, 256, 128, -1],
             activations=["relu", "relu", "linear"],
             minibatch_size=self.minibatch_size,
             learning_rate=0.1,
             optimizer="ADAM",
         ),
         knn=KnnParameters(model_type="DQN"),
     )
Example #16
def run_gym(
    params,
    offline_train,
    score_bar,
    gpu_id,
    save_timesteps_to_dataset=None,
    start_saving_from_episode=0,
):
    logger.info("Running gym with params")
    logger.info(params)
    rl_parameters = RLParameters(**params["rl"])

    env_type = params["env"]
    if offline_train:
        # take random actions during data collection
        epsilon = 1.0
    else:
        epsilon = rl_parameters.epsilon
    env = OpenAIGymEnvironment(
        env_type, epsilon, rl_parameters.softmax_policy, rl_parameters.gamma
    )
    replay_buffer = OpenAIGymMemoryPool(params["max_replay_memory_size"])
    model_type = params["model_type"]

    use_gpu = gpu_id != USE_CPU
    trainer = create_trainer(params["model_type"], params, rl_parameters, use_gpu, env)
    predictor = create_predictor(trainer, model_type, use_gpu)

    c2_device = core.DeviceOption(
        caffe2_pb2.CUDA if use_gpu else caffe2_pb2.CPU, int(gpu_id)
    )
    return train_sgd(
        c2_device,
        env,
        offline_train,
        replay_buffer,
        model_type,
        trainer,
        predictor,
        "{} test run".format(env_type),
        score_bar,
        **params["run_details"],
        save_timesteps_to_dataset=save_timesteps_to_dataset,
        start_saving_from_episode=start_saving_from_episode,
    )
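Examples #1 and #2 delegate this same decision to a create_epsilon helper. A hypothetical reconstruction based on the inline branch here; the real helper also returns epsilon_decay and minimum_epsilon, and the field names below are assumptions, not the library's actual schema.

def create_epsilon_sketch(offline_train, rl_parameters, params):
    # Offline data collection takes purely random actions, mirroring the branch above.
    epsilon = 1.0 if offline_train else rl_parameters.epsilon
    # Assumed location for the decay settings; the defaults are placeholders.
    run_details = params.get("run_details", {})
    epsilon_decay = run_details.get("epsilon_decay", 1.0)
    minimum_epsilon = run_details.get("minimum_epsilon", 0.0)
    return epsilon, epsilon_decay, minimum_epsilon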
Example #17
 def test_sarsa_layer_validation(self):
     env = Gridworld()
     invalid_sarsa_params = DiscreteActionModelParameters(
         actions=env.ACTIONS,
         rl=RLParameters(gamma=DISCOUNT,
                         target_update_rate=0.5,
                         reward_burnin=10,
                         maxq_learning=False),
         training=TrainingParameters(
             layers=[-1, 3],
             activations=['linear'],
             minibatch_size=32,
             learning_rate=0.1,
             optimizer='SGD',
         ))
     with self.assertRaises(Exception):
         # layers[-1] should be 1
         DiscreteActionTrainer(env.normalization, invalid_sarsa_params)
Example #18
def main(args):
    parser = argparse.ArgumentParser(
        description="Train a RL net to play in an OpenAI Gym environment.")
    parser.add_argument("-p",
                        "--parameters",
                        help="Path to JSON parameters file.")
    parser.add_argument("-s",
                        "--score-bar",
                        help="Bar for averaged tests scores.",
                        type=float,
                        default=None)
    parser.add_argument(
        "-g",
        "--gpu_id",
        help="If set, will use GPU with specified ID. Otherwise will use CPU.",
        default=USE_CPU)
    args = parser.parse_args(args)
    with open(args.parameters, 'r') as f:
        params = json.load(f)

    rl_settings = params['rl']
    training_settings = params['training']
    rl_settings['gamma'] = rl_settings['reward_discount_factor']
    del rl_settings['reward_discount_factor']
    training_settings['gamma'] = training_settings['learning_rate_decay']
    del training_settings['learning_rate_decay']

    env_type = params['env']
    env = OpenAIGymEnvironment(env_type, rl_settings['epsilon'])

    trainer_params = DiscreteActionModelParameters(
        actions=env.actions,
        rl=RLParameters(**rl_settings),
        training=TrainingParameters(**training_settings))

    device = core.DeviceOption(
        caffe2_pb2.CPU if args.gpu_id == USE_CPU else caffe2_pb2.CUDA,
        args.gpu_id)
    with core.DeviceScope(device):
        trainer = DiscreteActionTrainer(env.normalization,
                                        trainer_params,
                                        skip_normalization=True)
        return run(env, trainer, "{} test run".format(env_type),
                   args.score_bar, **params["run_details"])
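A possible invocation of this entry point, assuming the sketch lives in the same module as main() above; the JSON file name is a placeholder and the file must contain the "env", "rl", "training", and "run_details" sections that the function reads.

# Hypothetical shell usage:  python gym_example.py -p gym_params.json -s 195.0
if __name__ == "__main__":
    main(["--parameters", "gym_params.json", "--score-bar", "195.0"])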
Example #19
    def test_trainer_maxq(self):
        environment = GridworldContinuous()
        rl_parameters = self.get_sarsa_parameters()
        new_rl_parameters = ContinuousActionModelParameters(
            rl=RLParameters(
                gamma=DISCOUNT,
                target_update_rate=0.5,
                reward_burnin=10,
                maxq_learning=True,
            ),
            training=rl_parameters.training,
            knn=rl_parameters.knn
        )
        maxq_trainer = ContinuousActionDQNTrainer(
            new_rl_parameters,
            environment.normalization,
            environment.normalization_action,
        )

        states, actions, rewards, next_states, next_actions, is_terminal,\
            possible_next_actions, reward_timelines = \
            environment.generate_samples(100000, 1.0)
        predictor = maxq_trainer.predictor()
        tdps = environment.preprocess_samples(
            states,
            actions,
            rewards,
            next_states,
            next_actions,
            is_terminal,
            possible_next_actions,
            reward_timelines,
            self.minibatch_size,
        )
        evaluator = GridworldContinuousEvaluator(environment, True)
        self.assertGreater(evaluator.evaluate(predictor), 0.4)

        for _ in range(2):
            for tdp in tdps:
                maxq_trainer.train_numpy(tdp, None)
            evaluator.evaluate(predictor)

        self.assertLess(evaluator.evaluate(predictor), 0.1)
Example #20
 def get_sac_parameters(self, use_2_q_functions=False):
     return SACModelParameters(
         rl=RLParameters(gamma=DISCOUNT,
                         target_update_rate=0.5,
                         reward_burnin=100),
         training=SACTrainingParameters(
             minibatch_size=self.minibatch_size,
             use_2_q_functions=use_2_q_functions,
             q_network_optimizer=OptimizerParameters(),
             value_network_optimizer=OptimizerParameters(),
             actor_network_optimizer=OptimizerParameters(),
         ),
         q_network=FeedForwardParameters(layers=[128, 64],
                                         activations=["relu", "relu"]),
         value_network=FeedForwardParameters(layers=[128, 64],
                                             activations=["relu", "relu"]),
         actor_network=FeedForwardParameters(layers=[128, 64],
                                             activations=["relu", "relu"]),
     )
Example #21
 def get_sarsa_parameters(self):
     return ContinuousActionModelParameters(
         rl=RLParameters(
             gamma=DISCOUNT,
             target_update_rate=1.0,
             reward_burnin=100,
             maxq_learning=False,
         ),
         training=TrainingParameters(
             layers=[-1, 256, 128, -1],
             activations=["relu", "relu", "linear"],
             minibatch_size=self.minibatch_size,
             learning_rate=0.05,
             optimizer="ADAM",
         ),
         knn=KnnParameters(model_type="DQN"),
         rainbow=RainbowDQNParameters(double_q_learning=True,
                                      dueling_architecture=False),
         in_training_cpe=InTrainingCPEParameters(mdp_sampled_rate=0.1),
     )
Example #22
 def get_sarsa_trainer_reward_boost(self, environment, reward_shape):
     rl_parameters = RLParameters(
         gamma=DISCOUNT,
         target_update_rate=0.5,
         reward_burnin=10,
         maxq_learning=False,
         reward_boost=reward_shape,
     )
     training_parameters = TrainingParameters(
         layers=[-1, -1],
         activations=["linear"],
         minibatch_size=self.minibatch_size,
         learning_rate=0.01,
         optimizer="ADAM",
     )
     return DiscreteActionTrainer(
         DiscreteActionModelParameters(
             actions=environment.ACTIONS,
             rl=rl_parameters,
             training=training_parameters,
         ),
         environment.normalization,
     )
Example #23
 def get_sarsa_parameters(self, environment, reward_shape, dueling, clip_grad_norm):
     rl_parameters = RLParameters(
         gamma=DISCOUNT,
         target_update_rate=1.0,
         maxq_learning=False,
         reward_boost=reward_shape,
     )
     training_parameters = TrainingParameters(
         layers=[-1, 128, -1] if dueling else [-1, -1],
         activations=["relu", "linear"] if dueling else ["linear"],
         minibatch_size=self.minibatch_size,
         learning_rate=0.05,
         optimizer="ADAM",
         clip_grad_norm=clip_grad_norm,
     )
     return DiscreteActionModelParameters(
         actions=environment.ACTIONS,
         rl=rl_parameters,
         training=training_parameters,
         rainbow=RainbowDQNParameters(
             double_q_learning=True, dueling_architecture=dueling
         ),
     )
Example #24
    def test_trainer_maxq(self):
        env = Env(self.state_dims, self.action_dims)
        maxq_parameters = DiscreteActionModelParameters(
            actions=env.actions,
            rl=RLParameters(gamma=0.95,
                            target_update_rate=0.9,
                            maxq_learning=True),
            rainbow=RainbowDQNParameters(double_q_learning=True,
                                         dueling_architecture=False),
            training=TrainingParameters(
                layers=self.layers,
                activations=self.activations,
                minibatch_size=1024,
                learning_rate=0.25,
                optimizer="ADAM",
            ),
        )

        # Q value should converge to very close to 20
        trainer = self._train(maxq_parameters, env)
        avg_q_value_after_training = torch.mean(trainer.all_action_scores)
        self.assertLess(avg_q_value_after_training, 22)
        self.assertGreater(avg_q_value_after_training, 18)
Example #25
 def get_ddpg_parameters(self):
     return DDPGModelParameters(
         rl=RLParameters(gamma=DISCOUNT,
                         target_update_rate=0.5,
                         maxq_learning=True),
         shared_training=DDPGTrainingParameters(
             minibatch_size=self.minibatch_size,
             final_layer_init=0.003,
             optimizer="ADAM",
         ),
         actor_training=DDPGNetworkParameters(
             layers=[-1, 256, 128, -1],
             activations=["relu", "relu", "tanh"],
             learning_rate=0.05,
             l2_decay=0.01,
         ),
         critic_training=DDPGNetworkParameters(
             layers=[-1, 256, 256, 128, -1],
             activations=["relu", "relu", "relu", "linear"],
             learning_rate=0.05,
             l2_decay=0.01,
         ),
     )
Example #26
def run_gym(params, score_bar, gpu_id):
    rl_settings = params['rl']
    training_settings = params['training']
    rl_settings['gamma'] = rl_settings['reward_discount_factor']
    del rl_settings['reward_discount_factor']
    training_settings['gamma'] = training_settings['learning_rate_decay']
    del training_settings['learning_rate_decay']

    env_type = params['env']
    env = OpenAIGymEnvironment(env_type, rl_settings['epsilon'])
    trainer_params = DiscreteActionModelParameters(
        actions=env.actions,
        rl=RLParameters(**rl_settings),
        training=TrainingParameters(**training_settings))

    device = core.DeviceOption(
        caffe2_pb2.CPU if gpu_id == USE_CPU else caffe2_pb2.CUDA,
        gpu_id,
    )
    with core.DeviceScope(device):
        if env.img:
            trainer = DiscreteActionConvTrainer(
                DiscreteActionConvModelParameters(
                    fc_parameters=trainer_params,
                    cnn_parameters=CNNModelParameters(**params['cnn']),
                    num_input_channels=env.num_input_channels,
                    img_height=env.height,
                    img_width=env.width),
                env.normalization,
            )
        else:
            trainer = DiscreteActionTrainer(
                trainer_params,
                env.normalization,
            )
        return run(env, trainer, "{} test run".format(env_type), score_bar,
                   **params["run_details"])
Example #27
 def get_sac_parameters(
     self,
     use_2_q_functions=False,
     logged_action_uniform_prior=True,
     constrain_action_sum=False,
 ):
     return SACModelParameters(
         rl=RLParameters(gamma=DISCOUNT, target_update_rate=0.5),
         training=SACTrainingParameters(
             minibatch_size=self.minibatch_size,
             use_2_q_functions=use_2_q_functions,
             q_network_optimizer=OptimizerParameters(),
             value_network_optimizer=OptimizerParameters(),
             actor_network_optimizer=OptimizerParameters(),
             logged_action_uniform_prior=logged_action_uniform_prior,
         ),
         q_network=FeedForwardParameters(layers=[128, 64],
                                         activations=["relu", "relu"]),
         value_network=FeedForwardParameters(layers=[128, 64],
                                             activations=["relu", "relu"]),
         actor_network=FeedForwardParameters(layers=[128, 64],
                                             activations=["relu", "relu"]),
         constrain_action_sum=constrain_action_sum,
     )
Example #28
 def get_sarsa_trainer_reward_boost(
     self,
     environment,
     reward_shape,
     dueling,
     use_gpu=False,
     use_all_avail_gpus=False,
 ):
     rl_parameters = RLParameters(
         gamma=DISCOUNT,
         target_update_rate=1.0,
         reward_burnin=10,
         maxq_learning=False,
         reward_boost=reward_shape,
     )
     training_parameters = TrainingParameters(
         layers=[-1, 128, -1] if dueling else [-1, -1],
         activations=["relu", "linear"] if dueling else ["linear"],
         minibatch_size=self.minibatch_size,
         learning_rate=0.05,
         optimizer="ADAM",
     )
     return DQNTrainer(
         DiscreteActionModelParameters(
             actions=environment.ACTIONS,
             rl=rl_parameters,
             training=training_parameters,
             rainbow=RainbowDQNParameters(
                 double_q_learning=True, dueling_architecture=dueling
             ),
             in_training_cpe=InTrainingCPEParameters(mdp_sampled_rate=0.1),
         ),
         environment.normalization,
         use_gpu=use_gpu,
         use_all_avail_gpus=use_all_avail_gpus,
     )
Example #29
def run_gym(params, score_bar, gpu_id, save_timesteps_to_dataset=None):
    logger.info("Running gym with params")
    logger.info(params)
    rl_parameters = RLParameters(**params["rl"])

    env_type = params["env"]
    env = OpenAIGymEnvironment(
        env_type,
        rl_parameters.epsilon,
        rl_parameters.softmax_policy,
        params["max_replay_memory_size"],
    )
    model_type = params["model_type"]
    c2_device = core.DeviceOption(
        caffe2_pb2.CPU if gpu_id == USE_CPU else caffe2_pb2.CUDA, gpu_id
    )

    if model_type == ModelType.DISCRETE_ACTION.value:
        with core.DeviceScope(c2_device):
            training_settings = params["training"]
            training_parameters = TrainingParameters(**training_settings)
            if env.img:
                assert (
                    training_parameters.cnn_parameters is not None
                ), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters = CNNParameters(
                    **training_settings["cnn_parameters"]
                )
                training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
                training_parameters.cnn_parameters.input_height = env.height
                training_parameters.cnn_parameters.input_width = env.width
                training_parameters.cnn_parameters.num_input_channels = (
                    env.num_input_channels
                )
            else:
                assert (
                    training_parameters.cnn_parameters is None
                ), "Extra CNN parameters for non-image input"
            trainer_params = DiscreteActionModelParameters(
                actions=env.actions, rl=rl_parameters, training=training_parameters
            )
            trainer = DiscreteActionTrainer(trainer_params, env.normalization)
    elif model_type == ModelType.PARAMETRIC_ACTION.value:
        with core.DeviceScope(c2_device):
            training_settings = params["training"]
            training_parameters = TrainingParameters(**training_settings)
            if env.img:
                assert (
                    training_parameters.cnn_parameters is not None
                ), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters = CNNParameters(
                    **training_settings["cnn_parameters"]
                )
                training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            else:
                assert (
                    training_parameters.cnn_parameters is None
                ), "Extra CNN parameters for non-image input"
            trainer_params = ContinuousActionModelParameters(
                rl=rl_parameters,
                training=training_parameters,
                knn=KnnParameters(model_type="DQN"),
            )
            trainer = ContinuousActionDQNTrainer(
                trainer_params, env.normalization, env.normalization_action
            )
    elif model_type == ModelType.CONTINUOUS_ACTION.value:
        training_settings = params["shared_training"]
        actor_settings = params["actor_training"]
        critic_settings = params["critic_training"]
        trainer_params = DDPGModelParameters(
            rl=rl_parameters,
            shared_training=DDPGTrainingParameters(**training_settings),
            actor_training=DDPGNetworkParameters(**actor_settings),
            critic_training=DDPGNetworkParameters(**critic_settings),
        )

        # DDPG can handle continuous and discrete action spaces
        if env.action_type == EnvType.CONTINUOUS_ACTION:
            action_range = env.action_space.high
        else:
            action_range = None

        trainer = DDPGTrainer(
            trainer_params,
            env.normalization,
            env.normalization_action,
            use_gpu=False,
            action_range=action_range,
        )

    else:
        raise NotImplementedError("Model of type {} not supported".format(model_type))

    return run(
        c2_device,
        env,
        model_type,
        trainer,
        "{} test run".format(env_type),
        score_bar,
        **params["run_details"],
        save_timesteps_to_dataset=save_timesteps_to_dataset,
    )
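For reference, the params keys each model_type branch above consumes, collected from the code; the branch labels paraphrase the ModelType members, and the nested CNN key applies only to image environments.

common_keys = ["env", "model_type", "rl", "max_replay_memory_size", "run_details"]
branch_keys = {
    "DISCRETE_ACTION": ["training"],    # plus training["cnn_parameters"] when env.img
    "PARAMETRIC_ACTION": ["training"],  # plus training["cnn_parameters"] when env.img
    "CONTINUOUS_ACTION": ["shared_training", "actor_training", "critic_training"],
}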
Example #30
def train_network(params):
    writer = None
    if params["model_output_path"] is not None:
        writer = SummaryWriter(log_dir=params["model_output_path"])

    logger.info("Running DQN workflow with params:")
    logger.info(params)

    # Set minibatch size based on # of devices being used to train
    params["training"]["minibatch_size"] *= minibatch_size_multiplier(
        params["use_gpu"], params["use_all_avail_gpus"])

    action_names = np.array(params["actions"])
    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])

    trainer_params = DiscreteActionModelParameters(
        actions=params["actions"],
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
    )

    dataset = JSONDataset(params["training_data_path"],
                          batch_size=training_parameters.minibatch_size)
    eval_dataset = JSONDataset(params["eval_data_path"], batch_size=16)
    state_normalization = read_norm_file(params["state_norm_data_path"])

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
    logger.info("Read in batch data set {} of size {} examples. Data split "
                "into {} batches of size {}.".format(
                    params["training_data_path"],
                    len(dataset),
                    num_batches,
                    training_parameters.minibatch_size,
                ))

    trainer = DQNTrainer(
        trainer_params,
        state_normalization,
        use_gpu=params["use_gpu"],
        use_all_avail_gpus=params["use_all_avail_gpus"],
    )
    trainer = update_model_for_warm_start(trainer)
    preprocessor = Preprocessor(state_normalization, False)

    evaluator = Evaluator(
        trainer_params.actions,
        trainer_params.rl.gamma,
        trainer,
        metrics_to_score=trainer.metrics_to_score,
    )

    start_time = time.time()
    for epoch in range(int(params["epochs"])):
        dataset.reset_iterator()
        for batch_idx in range(num_batches):
            report_training_status(batch_idx, num_batches, epoch,
                                   int(params["epochs"]))
            batch = dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(preprocessor, batch,
                                                action_names)

            tdp.set_type(trainer.dtype)
            trainer.train(tdp)

        eval_dataset.reset_iterator()
        accumulated_edp = None
        while True:
            batch = eval_dataset.read_batch(batch_idx)
            if batch is None:
                break
            tdp = preprocess_batch_for_training(preprocessor, batch,
                                                action_names)
            edp = EvaluationDataPage.create_from_tdp(tdp, trainer)
            if accumulated_edp is None:
                accumulated_edp = edp
            else:
                accumulated_edp = accumulated_edp.append(edp)
        accumulated_edp = accumulated_edp.compute_values(trainer.gamma)

        cpe_start_time = time.time()
        details = evaluator.evaluate_post_training(accumulated_edp)
        details.log()
        logger.info("CPE evaluation took {} seconds.".format(time.time() -
                                                             cpe_start_time))

    through_put = (len(dataset) * int(params["epochs"])) / (time.time() -
                                                            start_time)
    logger.info("Training finished. Processed ~{} examples / s.".format(
        round(through_put)))

    if writer is not None:
        writer.close()

    return export_trainer_and_predictor(trainer, params["model_output_path"])
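The batch count in this workflow (and in Example #14) is plain integer division, so a trailing partial batch is dropped. A worked illustration with placeholder sizes:

# Illustrative sizes only: 1,000,000 logged examples and an effective
# minibatch_size of 1024 after the device multiplier applied above.
dataset_size, minibatch_size = 1_000_000, 1024
num_batches = int(dataset_size / minibatch_size)
print(num_batches)  # 976; the remaining 576 examples never form a full batch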