Example #1
0
    def get_trainer(
        self, environment, parameters=None, use_gpu=False, use_all_avail_gpus=False
    ):
        # Fall back to the default SARSA parameters when none are provided.
        parameters = parameters or self.get_sarsa_parameters()
        # Q-network that scores (state, action) pairs; only the interior entries
        # of the configured layers/activations (the hidden layers) are passed in.
        q_network = FullyConnectedParametricDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=get_num_output_features(environment.normalization_action),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
        reward_network = FullyConnectedParametricDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=get_num_output_features(environment.normalization_action),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
        if use_gpu:
            # Move both networks to GPU and, if requested, wrap them for
            # multi-GPU (distributed data parallel) training.
            q_network = q_network.cuda()
            reward_network = reward_network.cuda()
            if use_all_avail_gpus:
                q_network = q_network.get_distributed_data_parallel_model()
                reward_network = reward_network.get_distributed_data_parallel_model()

        # Target network: a copy of the Q-network used to compute TD targets.
        q_network_target = q_network.get_target_network()
        trainer = ParametricDQNTrainer(
            q_network, q_network_target, reward_network, parameters
        )
        return trainer
Example #2
0
def create_parametric_dqn_trainer_from_params(
    model: ContinuousActionModelParameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    use_all_avail_gpus: bool = False,
):
    q_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(state_normalization_parameters),
        action_dim=get_num_output_features(action_normalization_parameters),
        sizes=model.training.layers[1:-1],
        activations=model.training.activations[:-1],
    )
    reward_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(state_normalization_parameters),
        action_dim=get_num_output_features(action_normalization_parameters),
        sizes=model.training.layers[1:-1],
        activations=model.training.activations[:-1],
    )
    q_network_target = q_network.get_target_network()

    if use_gpu and torch.cuda.is_available():
        q_network = q_network.cuda()
        q_network_target = q_network_target.cuda()
        reward_network = reward_network.cuda()

    if use_all_avail_gpus:
        q_network = q_network.get_distributed_data_parallel_model()
    q_network_target = q_network_target.get_distributed_data_parallel_model()
        reward_network = reward_network.get_distributed_data_parallel_model()

    return ParametricDQNTrainer(
        q_network, q_network_target, reward_network, model, use_gpu
    )
Example #3
0
    def get_trainer(self,
                    environment,
                    parameters=None,
                    use_gpu=False,
                    use_all_avail_gpus=False):
        layers = [256, 128]
        activations = ["relu", "relu"]
        parameters = parameters or self.get_sarsa_parameters()
        q_network = FullyConnectedParametricDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=get_num_output_features(
                environment.normalization_action),
            sizes=layers,
            activations=activations,
        )
        reward_network = FullyConnectedParametricDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=get_num_output_features(
                environment.normalization_action),
            sizes=layers,
            activations=activations,
        )
        if use_gpu:
            q_network = q_network.cuda()
            reward_network = reward_network.cuda()
            if use_all_avail_gpus:
                q_network = q_network.get_distributed_data_parallel_model()
                reward_network = reward_network.get_distributed_data_parallel_model()

        q_network_target = q_network.get_target_network()
        param_dict = parameters.asdict()  # type: ignore
        trainer = ParametricDQNTrainer(q_network, q_network_target,
                                       reward_network, **param_dict)
        return trainer
Example #4
0
def create_parametric_dqn_trainer_from_params(
    model: ContinuousActionModelParameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    use_all_avail_gpus: bool = False,
):
    q_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(state_normalization_parameters),
        action_dim=get_num_output_features(action_normalization_parameters),
        sizes=model.training.layers[1:-1],
        activations=model.training.activations[:-1],
    )
    reward_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(state_normalization_parameters),
        action_dim=get_num_output_features(action_normalization_parameters),
        sizes=model.training.layers[1:-1],
        activations=model.training.activations[:-1],
    )
    q_network_target = q_network.get_target_network()

    if use_gpu:
        q_network = q_network.cuda()
        q_network_target = q_network_target.cuda()
        reward_network = reward_network.cuda()

    if use_all_avail_gpus:
        q_network = q_network.get_distributed_data_parallel_model()
    q_network_target = q_network_target.get_distributed_data_parallel_model()
        reward_network = reward_network.get_distributed_data_parallel_model()

    trainer_parameters = ParametricDQNTrainerParameters(  # type: ignore
        rl=model.rl,
        double_q_learning=model.rainbow.double_q_learning,
        minibatch_size=model.training.minibatch_size,
        optimizer=OptimizerParameters(
            optimizer=model.training.optimizer,
            learning_rate=model.training.learning_rate,
            l2_decay=model.training.l2_decay,
        ),
    )

    return ParametricDQNTrainer(
        q_network,
        q_network_target,
        reward_network,
        use_gpu=use_gpu,
        **trainer_parameters.asdict()  # type: ignore
    )
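A minimal, hypothetical call site for the factory above (not part of the source): `model` is assumed to be a ContinuousActionModelParameters instance and `state_norm`/`action_norm` the state and action normalization dicts produced earlier in the pipeline; only the signature shown above is relied on.

# Hypothetical usage sketch; `model`, `state_norm`, and `action_norm` are
# placeholders assumed to be built elsewhere in the pipeline.
trainer = create_parametric_dqn_trainer_from_params(
    model=model,
    state_normalization_parameters=state_norm,
    action_normalization_parameters=action_norm,
    use_gpu=False,
    use_all_avail_gpus=False,
)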
Example #5
0
    def get_modular_sarsa_trainer_exporter(self,
                                           environment,
                                           parameters=None,
                                           use_gpu=False,
                                           use_all_avail_gpus=False):
        parameters = parameters or self.get_sarsa_parameters()
        q_network = FullyConnectedParametricDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=get_num_output_features(
                environment.normalization_action),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
        reward_network = FullyConnectedParametricDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=get_num_output_features(
                environment.normalization_action),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
        if use_gpu:
            q_network = q_network.cuda()
            reward_network = reward_network.cuda()
            if use_all_avail_gpus:
                q_network = q_network.get_data_parallel_model()
                reward_network = reward_network.get_data_parallel_model()

        q_network_target = q_network.get_target_network()
        trainer = _ParametricDQNTrainer(q_network, q_network_target,
                                        reward_network, parameters)
        state_preprocessor = Preprocessor(environment.normalization, False,
                                          True)
        action_preprocessor = Preprocessor(environment.normalization_action,
                                           False, True)
        feature_extractor = PredictorFeatureExtractor(
            state_normalization_parameters=environment.normalization,
            action_normalization_parameters=environment.normalization_action,
        )
        output_transformer = ParametricActionOutputTransformer()
        exporter = ParametricDQNExporter(
            q_network,
            feature_extractor,
            output_transformer,
            state_preprocessor,
            action_preprocessor,
        )
        return (trainer, exporter)
Example #6
0
    def test_slate_q_trainer(self):
        recsim = RecSim(num_users=10)

        # Build memory pool with random policy
        memory_pool = OpenAIGymMemoryPool(10000000)
        random_reward = recsim.rollout_policy(random_policy, memory_pool)

        # Train a model
        q_network = FullyConnectedParametricDQN(
            state_dim=memory_pool.state_dim,
            action_dim=memory_pool.action_dim,
            sizes=[64, 32],
            activations=["relu", "relu"],
        )

        q_network = q_network.eval()
        recsim.reset()
        untrained_policy_reward = recsim.rollout_policy(
            partial(top_k_policy, q_network))
        q_network = q_network.train()

        q_network_target = q_network.get_target_network()
        parameters = SlateQTrainerParameters()
        trainer = SlateQTrainer(q_network, q_network_target, parameters)

        for _i in range(1000):
            tdp = memory_pool.sample_memories(
                128, model_type=ModelType.PYTORCH_PARAMETRIC_DQN.value)
            training_batch = tdp.as_slate_q_training_batch()
            trainer.train(training_batch)

        q_network = q_network.eval()
        recsim.reset()
        trained_policy_reward = recsim.rollout_policy(
            partial(top_k_policy, q_network))

        print(
            f"Reward; random: {random_reward}; untrained: {untrained_policy_reward}; "
            f"trained: {trained_policy_reward}")

        self.assertGreater(trained_policy_reward, untrained_policy_reward)
        self.assertGreater(trained_policy_reward, random_reward)
        self.assertEqual(random_reward, 1384.0)
        self.assertEqual(untrained_policy_reward, 1200.0)
        self.assertEqual(trained_policy_reward, 1432.0)
Example #7
0
    def get_modular_sarsa_trainer_exporter(
        self, environment, parameters=None, use_gpu=False, use_all_avail_gpus=False
    ):
        parameters = parameters or self.get_sarsa_parameters()
        q_network = FullyConnectedParametricDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=get_num_output_features(environment.normalization_action),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
        reward_network = FullyConnectedParametricDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=get_num_output_features(environment.normalization_action),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
        if use_gpu:
            q_network = q_network.cuda()
            reward_network = reward_network.cuda()
            if use_all_avail_gpus:
                q_network = q_network.get_data_parallel_model()
                reward_network = reward_network.get_data_parallel_model()

        q_network_target = q_network.get_target_network()
        trainer = _ParametricDQNTrainer(
            q_network, q_network_target, reward_network, parameters
        )
        feature_extractor = PredictorFeatureExtractor(
            state_normalization_parameters=environment.normalization,
            action_normalization_parameters=environment.normalization_action,
        )
        output_transformer = ParametricActionOutputTransformer()
        exporter = ParametricDQNExporter(
            q_network, feature_extractor, output_transformer
        )
        return (trainer, exporter)