Example #1
    def test_save_load(self):
        state_dim = 8
        action_dim = 4
        q_network = FullyConnectedDQN(state_dim,
                                      action_dim,
                                      sizes=[8, 4],
                                      activations=["relu", "relu"])
        imitator_network = FullyConnectedNetwork(
            layers=[state_dim, 8, 4, action_dim],
            activations=["relu", "relu", "linear"])
        model = BatchConstrainedDQN(
            state_dim=state_dim,
            q_network=q_network,
            imitator_network=imitator_network,
            bcq_drop_threshold=0.05,
        )
        # 6 for DQN + 6 for Imitator Network + 2 for BCQ constants
        expected_num_params, expected_num_inputs, expected_num_outputs = 14, 1, 1
        check_save_load(self, model, expected_num_params, expected_num_inputs,
                        expected_num_outputs)
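The "6 for DQN + 6 for Imitator Network" comment above follows from each fully connected stack containing three Linear layers, each contributing a weight and a bias tensor. A quick sanity check in plain PyTorch (layer sizes taken from the test, stand-alone for illustration):

import torch.nn as nn

# Three Linear layers expose six parameter tensors: one weight and one bias each.
net = nn.Sequential(nn.Linear(8, 8), nn.Linear(8, 4), nn.Linear(4, 4))
print(len(list(net.parameters())))  # 6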
Example #2
    def test_basic(self):
        state_dim = 8
        action_dim = 4
        q_network = FullyConnectedDQN(state_dim,
                                      action_dim,
                                      sizes=[8, 4],
                                      activations=["relu", "relu"])
        imitator_network = FullyConnectedNetwork(
            layers=[state_dim, 8, 4, action_dim],
            activations=["relu", "relu", "linear"])
        model = BatchConstrainedDQN(
            state_dim=state_dim,
            q_network=q_network,
            imitator_network=imitator_network,
            bcq_drop_threshold=0.05,
        )

        input = model.input_prototype()
        self.assertEqual((1, state_dim), input.state.float_features.shape)
        q_values = model(input)
        self.assertEqual((1, action_dim), q_values.q_values.shape)
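The layers/activations convention used above (a full list of layer widths paired with one activation per weight layer) can be mimicked by a small stand-in module. The sketch below is an illustration under that assumption, not ReAgent's actual FullyConnectedNetwork:

import torch
import torch.nn as nn

ACTIVATIONS = {"relu": nn.ReLU, "tanh": nn.Tanh, "linear": nn.Identity}


class TinyFullyConnected(nn.Module):
    """Stand-in: layers=[in, hidden..., out], one activation per weight layer."""

    def __init__(self, layers, activations):
        super().__init__()
        assert len(layers) == len(activations) + 1
        modules = []
        for in_dim, out_dim, act in zip(layers[:-1], layers[1:], activations):
            modules.append(nn.Linear(in_dim, out_dim))
            modules.append(ACTIVATIONS[act]())
        self.net = nn.Sequential(*modules)

    def forward(self, x):
        return self.net(x)


net = TinyFullyConnected([8, 8, 4, 4], ["relu", "relu", "linear"])
print(net(torch.randn(1, 8)).shape)  # torch.Size([1, 4])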
Example #3
    def __init__(self, state_dim, action_dim, sizes, activations, use_batch_norm=False):
        """
        AKA the multivariate beta distribution. Used in cases where the actor's
        action must sum to 1.
        """
        super().__init__()
        assert state_dim > 0, "state_dim must be > 0, got {}".format(state_dim)
        assert action_dim > 0, "action_dim must be > 0, got {}".format(action_dim)
        self.state_dim = state_dim
        self.action_dim = action_dim
        assert len(sizes) == len(
            activations
        ), "The numbers of sizes and activations must match; got {} vs {}".format(
            len(sizes), len(activations)
        )

        # The last layer gives the concentration of the distribution.
        self.fc = FullyConnectedNetwork(
            [state_dim] + sizes + [action_dim],
            activations + ["linear"],
            use_batch_norm=use_batch_norm,
        )
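The constructor above only builds the concentration head. A hypothetical forward step for such a Dirichlet actor could make the network output strictly positive and draw simplex-valued actions from it, as sketched below (an assumption-based illustration, not the library's code):

import torch
import torch.nn.functional as F
from torch.distributions import Dirichlet


def sample_simplex_action(concentration_logits: torch.Tensor) -> torch.Tensor:
    # Make the last layer's output strictly positive so it is a valid concentration.
    concentration = F.softplus(concentration_logits) + 1e-6
    # Reparameterized Dirichlet sample: each row sums to 1 by construction.
    return Dirichlet(concentration).rsample()


print(sample_simplex_action(torch.randn(2, 4)).sum(dim=1))  # ~tensor([1., 1.])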
Example #4
    def __init__(
        self,
        state_dim: int,
        action_dim: int,
        sizes: List[int],
        activations: List[str],
        scale: float = 0.05,
        use_batch_norm: bool = False,
        use_layer_norm: bool = False,
    ):
        super().__init__()
        assert state_dim > 0, "state_dim must be > 0, got {}".format(state_dim)
        assert action_dim > 0, "action_dim must be > 0, got {}".format(
            action_dim)
        self.state_dim = state_dim
        self.action_dim = action_dim
        assert len(sizes) == len(
            activations
        ), "The numbers of sizes and activations must match; got {} vs {}".format(
            len(sizes), len(activations))
        # The last layer is mean & scale for reparameterization trick
        self.fc = FullyConnectedNetwork(
            [state_dim] + sizes + [action_dim * 2],
            activations + ["linear"],
            use_batch_norm=use_batch_norm,
            use_layer_norm=use_layer_norm,
        )
        self.use_layer_norm = use_layer_norm
        if self.use_layer_norm:
            self.loc_layer_norm = torch.nn.LayerNorm(action_dim)
            self.scale_layer_norm = torch.nn.LayerNorm(action_dim)

        # used to calculate log-prob
        self.const = math.log(math.sqrt(2 * math.pi))
        self.eps = 1e-6
        self._log_min_max = (-20.0, 2.0)
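The constructor hints at the rest of the computation: the final layer emits 2 * action_dim values split into a location and a clamped log-scale, and the log(sqrt(2*pi)) constant plus eps suggest a tanh-squashed Gaussian log-prob. A hedged sketch of that reparameterization step (names and details are assumptions, not the library's implementation):

import math
import torch


def reparameterize(fc_output: torch.Tensor, action_dim: int):
    loc, log_scale = fc_output.split(action_dim, dim=-1)
    log_scale = log_scale.clamp(-20.0, 2.0)            # mirrors _log_min_max above
    scale = log_scale.exp()
    raw_action = loc + scale * torch.randn_like(loc)   # differentiable sample
    squashed = torch.tanh(raw_action)                  # keep actions in (-1, 1)
    # Gaussian log-prob plus the tanh change-of-variables correction.
    log_prob = (
        -((raw_action - loc) ** 2) / (2 * scale ** 2)
        - log_scale
        - math.log(math.sqrt(2 * math.pi))
    )
    log_prob = log_prob - torch.log(1 - squashed ** 2 + 1e-6)
    return squashed, log_prob.sum(dim=-1)


action, log_prob = reparameterize(torch.randn(2, 8), action_dim=4)
print(action.shape, log_prob.shape)  # torch.Size([2, 4]) torch.Size([2])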
Example #5
class Seq2RewardTrainer(ReAgentLightningModule):
    """Trainer for Seq2Reward"""
    def __init__(self, seq2reward_network: Seq2RewardNetwork,
                 params: Seq2RewardTrainerParameters):
        super().__init__()
        self.seq2reward_network = seq2reward_network
        self.params = params

        # Turning off Q value output during training:
        self.view_q_value = params.view_q_value
        # permutations used to do planning
        self.all_permut = gen_permutations(params.multi_steps,
                                           len(self.params.action_names))
        self.mse_loss = nn.MSELoss(reduction="mean")

        # Predict how many steps are remaining from the current step
        self.step_predict_network = FullyConnectedNetwork(
            [
                self.seq2reward_network.state_dim,
                self.params.step_predict_net_size,
                self.params.step_predict_net_size,
                self.params.multi_steps,
            ],
            ["relu", "relu", "linear"],
            use_layer_norm=False,
        )
        self.step_loss = nn.CrossEntropyLoss(reduction="mean")

    def configure_optimizers(self):
        optimizers = []
        optimizers.append({
            "optimizer": torch.optim.Adam(self.seq2reward_network.parameters(),
                                          lr=self.params.learning_rate),
        })
        optimizers.append({
            "optimizer": torch.optim.Adam(self.step_predict_network.parameters(),
                                          lr=self.params.learning_rate),
        })
        return optimizers

    def train_step_gen(self, training_batch: rlt.MemoryNetworkInput,
                       batch_idx: int):
        mse_loss = self.get_mse_loss(training_batch)
        detached_mse_loss = mse_loss.cpu().detach().item()
        yield mse_loss

        step_entropy_loss = self.get_step_entropy_loss(training_batch)
        detached_step_entropy_loss = step_entropy_loss.cpu().detach().item()

        if self.view_q_value:
            state_first_step = training_batch.state.float_features[0]
            q_values = (get_Q(
                self.seq2reward_network,
                state_first_step,
                self.all_permut,
            ).cpu().mean(0).tolist())
        else:
            q_values = [0] * len(self.params.action_names)

        step_probability = (get_step_prediction(
            self.step_predict_network,
            training_batch).cpu().mean(dim=0).numpy())
        logger.info(
            f"Seq2Reward trainer output: mse_loss={detached_mse_loss}, "
            f"step_entropy_loss={detached_step_entropy_loss}, q_values={q_values}, "
            f"step_probability={step_probability}")
        self.reporter.log(
            mse_loss=detached_mse_loss,
            step_entropy_loss=detached_step_entropy_loss,
            q_values=[q_values],
        )

        yield step_entropy_loss

    # pyre-ignore inconsistent override because lightning doesn't use types
    def validation_step(self, batch: rlt.MemoryNetworkInput, batch_idx: int):
        detached_mse_loss = self.get_mse_loss(batch).cpu().detach().item()

        detached_step_entropy_loss = (
            self.get_step_entropy_loss(batch).cpu().detach().item())

        state_first_step = batch.state.float_features[0]
        # shape: batch_size, action_dim
        q_values_all_action_all_data = get_Q(
            self.seq2reward_network,
            state_first_step,
            self.all_permut,
        ).cpu()
        q_values = q_values_all_action_all_data.mean(0).tolist()

        action_distribution = torch.bincount(
            torch.argmax(q_values_all_action_all_data, dim=1),
            minlength=len(self.params.action_names),
        )
        # normalize
        action_distribution = (action_distribution.float() /
                               torch.sum(action_distribution)).tolist()

        self.reporter.log(
            eval_mse_loss=detached_mse_loss,
            eval_step_entropy_loss=detached_step_entropy_loss,
            eval_q_values=[q_values],
            eval_action_distribution=[action_distribution],
        )
        return (
            detached_mse_loss,
            detached_step_entropy_loss,
            q_values,
            action_distribution,
        )

    def get_mse_loss(self, training_batch: rlt.MemoryNetworkInput):
        """
        Compute losses:
            MSE(predicted_acc_reward, target_acc_reward)

        :param training_batch:
            training_batch has these fields:
            - state: (SEQ_LEN, BATCH_SIZE, STATE_DIM) torch tensor
            - action: (SEQ_LEN, BATCH_SIZE, ACTION_DIM) torch tensor
            - reward: (SEQ_LEN, BATCH_SIZE) torch tensor

        :returns:
            mse loss on reward
        """
        # pyre-fixme[16]: Optional type has no attribute `flatten`.
        valid_step = training_batch.valid_step.flatten()

        seq2reward_output = self.seq2reward_network(
            training_batch.state,
            rlt.FeatureData(training_batch.action),
            valid_step,
        )
        predicted_acc_reward = seq2reward_output.acc_reward

        seq_len, batch_size = training_batch.reward.size()
        gamma = self.params.gamma
        gamma_mask = (torch.Tensor([[gamma**i for i in range(seq_len)]
                                    for _ in range(batch_size)]).transpose(
                                        0, 1).to(training_batch.reward.device))

        target_acc_rewards = torch.cumsum(training_batch.reward * gamma_mask,
                                          dim=0)
        target_acc_reward = target_acc_rewards[
            valid_step - 1, torch.arange(batch_size)].unsqueeze(1)

        # make sure the prediction and target tensors have the same size
        # the size should both be (BATCH_SIZE, 1) in this case.
        assert (predicted_acc_reward.size() == target_acc_reward.size()
                ), f"{predicted_acc_reward.size()}!={target_acc_reward.size()}"
        return self.mse_loss(predicted_acc_reward, target_acc_reward)

    def get_step_entropy_loss(self, training_batch: rlt.MemoryNetworkInput):
        """
        Compute cross-entropy losses of step predictions

        :param training_batch:
            training_batch has these fields:
            - state: (SEQ_LEN, BATCH_SIZE, STATE_DIM) torch tensor
            - action: (SEQ_LEN, BATCH_SIZE, ACTION_DIM) torch tensor
            - reward: (SEQ_LEN, BATCH_SIZE) torch tensor

        :returns:
            step_entropy_loss on step prediction
        """
        # pyre-fixme[16]: Optional type has no attribute `flatten`.
        valid_step = training_batch.valid_step.flatten()

        first_step_state = training_batch.state.float_features[0]
        valid_step_output = self.step_predict_network(first_step_state)

        # step loss's target is zero-based indexed, so subtract 1 from valid_step
        return self.step_loss(valid_step_output, valid_step - 1)

    def warm_start_components(self):
        components = ["seq2reward_network"]
        return components
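The target construction in get_mse_loss above is easiest to see on a tiny numeric case: discount each reward by gamma**step, take the cumulative sum along the time axis, then pick, for every batch element, the entry at its last valid step. A worked micro-example under the same (SEQ_LEN, BATCH_SIZE) layout:

import torch

gamma = 0.9
reward = torch.tensor([[1.0, 2.0],
                       [1.0, 0.0],
                       [1.0, 3.0]])                  # SEQ_LEN=3, BATCH_SIZE=2
seq_len, batch_size = reward.shape
gamma_mask = torch.tensor([[gamma ** i for i in range(seq_len)]
                           for _ in range(batch_size)]).transpose(0, 1)
acc = torch.cumsum(reward * gamma_mask, dim=0)       # discounted running sums
valid_step = torch.tensor([2, 3])                    # last observed step per element
target = acc[valid_step - 1, torch.arange(batch_size)].unsqueeze(1)
print(target)  # tensor([[1.9000], [4.4300]])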
Example #6
def get_sac_trainer(
    env: OpenAIGymEnvironment,
    rl_parameters: RLParameters,
    trainer_parameters: SACTrainerParameters,
    critic_training: FeedForwardParameters,
    actor_training: FeedForwardParameters,
    sac_value_training: Optional[FeedForwardParameters],
    use_gpu: bool,
) -> SACTrainer:
    assert rl_parameters == trainer_parameters.rl
    state_dim = get_num_output_features(env.normalization)
    action_dim = get_num_output_features(env.normalization_action)
    q1_network = FullyConnectedParametricDQN(state_dim, action_dim,
                                             critic_training.layers,
                                             critic_training.activations)
    q2_network = None
    # TODO:
    # if trainer_parameters.use_2_q_functions:
    #     q2_network = FullyConnectedParametricDQN(
    #         state_dim,
    #         action_dim,
    #         critic_training.layers,
    #         critic_training.activations,
    #     )
    value_network = None
    if sac_value_training:
        value_network = FullyConnectedNetwork(
            [state_dim] + sac_value_training.layers + [1],
            sac_value_training.activations + ["linear"],
        )
    actor_network = GaussianFullyConnectedActor(state_dim, action_dim,
                                                actor_training.layers,
                                                actor_training.activations)

    min_action_range_tensor_training = torch.full((1, action_dim), -1 + 1e-6)
    max_action_range_tensor_training = torch.full((1, action_dim), 1 - 1e-6)
    min_action_range_tensor_serving = (
        torch.from_numpy(env.action_space.low).float().unsqueeze(
            dim=0)  # type: ignore
    )
    max_action_range_tensor_serving = (
        torch.from_numpy(env.action_space.high).float().unsqueeze(
            dim=0)  # type: ignore
    )

    if use_gpu:
        q1_network.cuda()
        if q2_network:
            q2_network.cuda()
        if value_network:
            value_network.cuda()
        actor_network.cuda()

        min_action_range_tensor_training = min_action_range_tensor_training.cuda()
        max_action_range_tensor_training = max_action_range_tensor_training.cuda()
        min_action_range_tensor_serving = min_action_range_tensor_serving.cuda()
        max_action_range_tensor_serving = max_action_range_tensor_serving.cuda()

    return SACTrainer(
        q1_network,
        actor_network,
        trainer_parameters,
        use_gpu=use_gpu,
        value_network=value_network,
        q2_network=q2_network,
        min_action_range_tensor_training=min_action_range_tensor_training,
        max_action_range_tensor_training=max_action_range_tensor_training,
        min_action_range_tensor_serving=min_action_range_tensor_serving,
        max_action_range_tensor_serving=max_action_range_tensor_serving,
    )
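The pair of "training" and "serving" action ranges passed to SACTrainer above exists so that actions produced in roughly (-1, 1) during training can be mapped to the environment's native bounds. A hypothetical rescaling helper illustrating the idea (the real trainer/predictor may handle this differently):

import torch


def rescale_action(action, min_train, max_train, min_serve, max_serve):
    ratio = (max_serve - min_serve) / (max_train - min_train)
    return min_serve + (action - min_train) * ratio


low = torch.tensor([[-2.0, 0.0]])
high = torch.tensor([[2.0, 1.0]])
mid = torch.zeros(1, 2)  # mid-range action in training space
print(rescale_action(mid, torch.full((1, 2), -1.0), torch.full((1, 2), 1.0), low, high))
# tensor([[0.0000, 0.5000]])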
Example #7
class ConvolutionalNetwork(nn.Module):
    def __init__(self, cnn_parameters, layers, activations,
                 use_layer_norm) -> None:
        super().__init__()
        self.conv_dims = cnn_parameters.conv_dims
        self.conv_height_kernels = cnn_parameters.conv_height_kernels
        self.conv_width_kernels = cnn_parameters.conv_width_kernels
        self.use_layer_norm = use_layer_norm

        self.conv_layers: nn.ModuleList = nn.ModuleList()
        self.pool_layers: nn.ModuleList = nn.ModuleList()
        self.layer_norm_layers: nn.ModuleList = nn.ModuleList()

        for i, _ in enumerate(self.conv_dims[1:]):
            self.conv_layers.append(
                nn.Conv2d(
                    self.conv_dims[i],
                    self.conv_dims[i + 1],
                    kernel_size=(
                        self.conv_height_kernels[i],
                        self.conv_width_kernels[i],
                    ),
                ))
            nn.init.kaiming_normal_(self.conv_layers[i].weight)
            if cnn_parameters.pool_types[i] == "max":
                self.pool_layers.append(
                    nn.MaxPool2d(
                        kernel_size=cnn_parameters.pool_kernels_strides[i]))
            else:
                assert False, "Unknown pooling type: {}".format(
                    cnn_parameters.pool_types[i])
            if self.use_layer_norm:
                self.layer_norm_layers.append(
                    nn.GroupNorm(1, self.conv_dims[i + 1]))

        input_size = (
            cnn_parameters.num_input_channels,
            cnn_parameters.input_height,
            cnn_parameters.input_width,
        )
        conv_out = self.conv_forward(torch.ones(1, *input_size))
        self.fc_input_dim = int(np.prod(conv_out.size()[1:]))
        layers[0] = self.fc_input_dim
        self.feed_forward = FullyConnectedNetwork(
            layers, activations, use_layer_norm=use_layer_norm)

    def conv_forward(self, input):
        x = input
        for i, _ in enumerate(self.conv_layers):
            x = self.conv_layers[i](x)
            if self.use_layer_norm:
                x = self.layer_norm_layers[i](x)
            x = F.relu(x)
            x = self.pool_layers[i](x)
        return x

    def forward(self, input) -> torch.FloatTensor:
        """Forward pass for generic convnet DNNs. Assumes activation names
        are valid pytorch activation names.
        :param input: image tensor
        """
        x = self.conv_forward(input)
        x = x.view(-1, self.fc_input_dim)
        # pyre-fixme[7]: Expected `FloatTensor` but got `Tensor`.
        return self.feed_forward.forward(x)
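The fc_input_dim computation above relies on a dummy forward pass: push a single all-ones image through the conv stack and read off the flattened feature count. A minimal standalone illustration of the same trick (the dimensions are made up for the demo):

import torch
import torch.nn as nn

conv = nn.Sequential(nn.Conv2d(3, 8, kernel_size=3), nn.MaxPool2d(2))
dummy = torch.ones(1, 3, 32, 32)                 # (batch, channels, height, width)
fc_input_dim = int(torch.prod(torch.tensor(conv(dummy).shape[1:])))
print(fc_input_dim)  # 8 * 15 * 15 = 1800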
Example #8
    def get_sac_trainer(
        self,
        env,
        use_gpu,
        use_2_q_functions=False,
        logged_action_uniform_prior=True,
        constrain_action_sum=False,
        use_value_network=True,
        use_alpha_optimizer=True,
        entropy_temperature=None,
    ):
        q_network_params = FeedForwardParameters(layers=[128, 64],
                                                 activations=["relu", "relu"])
        value_network_params = FeedForwardParameters(
            layers=[128, 64], activations=["relu", "relu"])
        actor_network_params = FeedForwardParameters(
            layers=[128, 64], activations=["relu", "relu"])

        state_dim = get_num_output_features(env.normalization)
        action_dim = get_num_output_features(
            env.normalization_continuous_action)
        q1_network = FullyConnectedParametricDQN(state_dim, action_dim,
                                                 q_network_params.layers,
                                                 q_network_params.activations)
        q2_network = None
        if use_2_q_functions:
            q2_network = FullyConnectedParametricDQN(
                state_dim,
                action_dim,
                q_network_params.layers,
                q_network_params.activations,
            )
        if constrain_action_sum:
            actor_network = DirichletFullyConnectedActor(
                state_dim,
                action_dim,
                actor_network_params.layers,
                actor_network_params.activations,
            )
        else:
            actor_network = GaussianFullyConnectedActor(
                state_dim,
                action_dim,
                actor_network_params.layers,
                actor_network_params.activations,
            )

        value_network = None
        if use_value_network:
            value_network = FullyConnectedNetwork(
                [state_dim] + value_network_params.layers + [1],
                value_network_params.activations + ["linear"],
            )

        if use_gpu:
            q1_network.cuda()
            if q2_network:
                q2_network.cuda()
            if value_network:
                value_network.cuda()
            actor_network.cuda()

        parameters = SACTrainerParameters(
            rl=RLParameters(gamma=DISCOUNT, target_update_rate=0.5),
            minibatch_size=self.minibatch_size,
            q_network_optimizer=OptimizerParameters(),
            value_network_optimizer=OptimizerParameters(),
            actor_network_optimizer=OptimizerParameters(),
            alpha_optimizer=OptimizerParameters()
            if use_alpha_optimizer else None,
            entropy_temperature=entropy_temperature,
            logged_action_uniform_prior=logged_action_uniform_prior,
        )

        return SACTrainer(
            q1_network,
            actor_network,
            parameters,
            use_gpu=use_gpu,
            value_network=value_network,
            q2_network=q2_network,
        )
Example #9
class Seq2RewardTrainer(Trainer):
    """ Trainer for Seq2Reward """

    def __init__(
        self, seq2reward_network: Seq2RewardNetwork, params: Seq2RewardTrainerParameters
    ):
        self.seq2reward_network = seq2reward_network
        self.params = params
        self.mse_optimizer = torch.optim.Adam(
            self.seq2reward_network.parameters(), lr=params.learning_rate
        )
        self.minibatch_size = self.params.batch_size
        self.loss_reporter = NoOpLossReporter()

        # PageHandler must use this to activate evaluator:
        self.calc_cpe_in_training = True
        # Turning off Q value output during training:
        self.view_q_value = params.view_q_value
        # permutations used to do planning
        self.all_permut = gen_permutations(
            params.multi_steps, len(self.params.action_names)
        )
        self.mse_loss = nn.MSELoss(reduction="mean")

        # Predict how many steps are remaining from the current step
        self.step_predict_network = FullyConnectedNetwork(
            [
                self.seq2reward_network.state_dim,
                self.params.step_predict_net_size,
                self.params.step_predict_net_size,
                self.params.multi_steps,
            ],
            ["relu", "relu", "linear"],
            use_layer_norm=False,
        )
        self.step_loss = nn.CrossEntropyLoss(reduction="mean")
        self.step_optimizer = torch.optim.Adam(
            self.step_predict_network.parameters(), lr=params.learning_rate
        )

    def train(self, training_batch: rlt.MemoryNetworkInput):
        mse_loss, step_entropy_loss = self.get_loss(training_batch)

        self.mse_optimizer.zero_grad()
        mse_loss.backward()
        self.mse_optimizer.step()

        self.step_optimizer.zero_grad()
        step_entropy_loss.backward()
        self.step_optimizer.step()

        detached_mse_loss = mse_loss.cpu().detach().item()
        detached_step_entropy_loss = step_entropy_loss.cpu().detach().item()

        if self.view_q_value:
            state_first_step = training_batch.state.float_features[0]
            q_values = (
                get_Q(
                    self.seq2reward_network,
                    state_first_step,
                    self.all_permut,
                )
                .cpu()
                .mean(0)
                .tolist()
            )
        else:
            q_values = [0] * len(self.params.action_names)

        step_probability = (
            get_step_prediction(self.step_predict_network, training_batch)
            .cpu()
            .mean(dim=0)
            .numpy()
        )
        logger.info(
            f"Seq2Reward trainer output: mse_loss={detached_mse_loss}, "
            f"step_entropy_loss={detached_step_entropy_loss}, q_values={q_values}, "
            f"step_probability={step_probability}"
        )
        # pyre-fixme[16]: `Seq2RewardTrainer` has no attribute `notify_observers`.
        self.notify_observers(
            mse_loss=detached_mse_loss,
            step_entropy_loss=detached_step_entropy_loss,
            q_values=[q_values],
        )
        return (detached_mse_loss, detached_step_entropy_loss, q_values)

    def get_loss(self, training_batch: rlt.MemoryNetworkInput):
        """
        Compute losses:
            MSE(predicted_acc_reward, target_acc_reward)

        :param training_batch:
            training_batch has these fields:
            - state: (SEQ_LEN, BATCH_SIZE, STATE_DIM) torch tensor
            - action: (SEQ_LEN, BATCH_SIZE, ACTION_DIM) torch tensor
            - reward: (SEQ_LEN, BATCH_SIZE) torch tensor

        :returns:
            mse loss on reward
            step_entropy_loss on step prediction
        """
        # pyre-fixme[16]: Optional type has no attribute `flatten`.
        valid_reward_len = training_batch.valid_next_seq_len.flatten()

        first_step_state = training_batch.state.float_features[0]
        valid_reward_len_output = self.step_predict_network(first_step_state)
        step_entropy_loss = self.step_loss(
            valid_reward_len_output, valid_reward_len - 1
        )

        seq2reward_output = self.seq2reward_network(
            training_batch.state,
            rlt.FeatureData(training_batch.action),
            valid_reward_len,
        )
        predicted_acc_reward = seq2reward_output.acc_reward

        seq_len, batch_size = training_batch.reward.size()
        gamma = self.params.gamma
        gamma_mask = (
            torch.Tensor(
                [[gamma ** i for i in range(seq_len)] for _ in range(batch_size)]
            )
            .transpose(0, 1)
            .to(training_batch.reward.device)
        )

        target_acc_rewards = torch.cumsum(training_batch.reward * gamma_mask, dim=0)
        target_acc_reward = target_acc_rewards[
            valid_reward_len - 1, torch.arange(batch_size)
        ].unsqueeze(1)

        # make sure the prediction and target tensors have the same size
        # the size should both be (BATCH_SIZE, 1) in this case.
        assert (
            predicted_acc_reward.size() == target_acc_reward.size()
        ), f"{predicted_acc_reward.size()}!={target_acc_reward.size()}"
        mse = self.mse_loss(predicted_acc_reward, target_acc_reward)
        return mse, step_entropy_loss

    def warm_start_components(self):
        components = ["seq2reward_network"]
        return components
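Both Seq2Reward trainers above rely on gen_permutations to enumerate every possible action sequence of length multi_steps for planning. A hedged sketch of what such a helper could look like, assuming one-hot actions and a (steps, num_sequences, action_dim) output layout (the actual ReAgent shapes may differ):

import torch
import torch.nn.functional as F


def enumerate_action_sequences(steps: int, action_dim: int) -> torch.Tensor:
    # All index combinations of length `steps`, e.g. (0, 0), (0, 1), ...
    idx = torch.cartesian_prod(*[torch.arange(action_dim) for _ in range(steps)])
    idx = idx.reshape(-1, steps)             # (action_dim ** steps, steps)
    one_hot = F.one_hot(idx, action_dim).float()
    return one_hot.transpose(0, 1)           # (steps, action_dim ** steps, action_dim)


print(enumerate_action_sequences(2, 3).shape)  # torch.Size([2, 9, 3])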