Example #1
def train_seq2reward_model(training_data, learning_rate=0.01, num_epochs=5):
    # Infer shapes from the first batch; action tensors are expected to be
    # (SEQ_LEN, batch_size, NUM_ACTION).
    SEQ_LEN, batch_size, NUM_ACTION = next(iter(training_data)).action.shape
    assert SEQ_LEN == 6 and NUM_ACTION == 2

    seq2reward_network = Seq2RewardNetwork(
        state_dim=NUM_ACTION,
        action_dim=NUM_ACTION,
        num_hiddens=64,
        num_hidden_layers=2,
    )

    trainer_param = Seq2RewardTrainerParameters(
        learning_rate=learning_rate,
        multi_steps=SEQ_LEN,
        action_names=["0", "1"],
        gamma=1.0,
        view_q_value=True,
    )

    trainer = Seq2RewardTrainer(
        seq2reward_network=seq2reward_network, params=trainer_param
    )

    # SEED is assumed to be defined at module scope; seeding keeps the run reproducible.
    pl.seed_everything(SEED)
    pl_trainer = pl.Trainer(max_epochs=num_epochs, deterministic=True)
    pl_trainer.fit(trainer, training_data)

    return trainer
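
A minimal sketch of driving the function above, assuming a hypothetical make_string_game_batches helper (not part of the example) that yields ReAgent-style batches whose .action tensor has shape (SEQ_LEN=6, batch_size, NUM_ACTION=2) along with the state and reward fields the trainer consumes:

# Hypothetical usage sketch; make_string_game_batches is illustrative only.
training_data = make_string_game_batches(num_batches=100, batch_size=512)
trainer = train_seq2reward_model(training_data, learning_rate=0.01, num_epochs=5)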
Example #2
    def build_trainer(self, use_gpu: bool) -> Seq2RewardTrainer:
        seq2reward_network = self.net_builder.value.build_value_network(
            self.state_normalization_data
        )
        trainer = Seq2RewardTrainer(
            seq2reward_network=seq2reward_network, params=self.trainer_param
        )
        return trainer
Example #3
    def build_trainer(self) -> Seq2RewardTrainer:
        seq2reward_network = self.net_builder.value.build_value_network(
            self.state_normalization_data)

        if self.use_gpu:
            seq2reward_network = seq2reward_network.cuda()

        return Seq2RewardTrainer(seq2reward_network=seq2reward_network,
                                 params=self.trainer_param)
Example #4
def train_seq2reward(
    env: EnvWrapper,
    trainer: Seq2RewardTrainer,
    trainer_preprocessor,
    num_train_transitions: int,
    seq_len: int,
    batch_size: int,
    num_train_epochs: int,
    # for optional validation
    test_replay_buffer=None,
):
    # Build a replay buffer whose samples come back as length-seq_len stacks
    # of transitions, then fill it by rolling out the environment.
    train_replay_buffer = ReplayBuffer(
        replay_capacity=num_train_transitions,
        batch_size=batch_size,
        stack_size=seq_len,
        return_everything_as_stack=True,
    )
    fill_replay_buffer(env, train_replay_buffer, num_train_transitions)
    num_batch_per_epoch = train_replay_buffer.size // batch_size
    logger.info("Made RBs, starting to train now!")
    # pyre-fixme[16]: `EnvWrapper` has no attribute `observation_space`.
    state_dim = env.observation_space.shape[0]
    for epoch in range(num_train_epochs):
        for i in range(num_batch_per_epoch):
            batch = train_replay_buffer.sample_transition_batch(
                batch_size=batch_size)
            preprocessed_batch = trainer_preprocessor(batch)
            adhoc_padding(preprocessed_batch, state_dim=state_dim)
            losses = trainer.train(preprocessed_batch)
            print_seq2reward_losses(epoch, i, losses)

        # validation
        if test_replay_buffer is not None:
            with torch.no_grad():
                trainer.seq2reward_network.eval()
                test_batch = test_replay_buffer.sample_transition_batch(
                    batch_size=batch_size)
                preprocessed_test_batch = trainer_preprocessor(test_batch)
                adhoc_padding(preprocessed_test_batch, state_dim=state_dim)
                valid_losses = trainer.get_loss(preprocessed_test_batch)
                print_seq2reward_losses(epoch, "validation", valid_losses)
                trainer.seq2reward_network.train()
    return trainer
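
A usage sketch for the loop above, reusing the same ReplayBuffer and fill_replay_buffer calls it makes internally; env, trainer, and trainer_preprocessor are assumed to come from the surrounding test harness, and the numbers are illustrative only:

# Hypothetical driver; all names besides ReplayBuffer/fill_replay_buffer are
# assumed to exist in the caller's scope.
test_replay_buffer = ReplayBuffer(
    replay_capacity=10_000,
    batch_size=512,
    stack_size=6,
    return_everything_as_stack=True,
)
fill_replay_buffer(env, test_replay_buffer, 10_000)
trained = train_seq2reward(
    env=env,
    trainer=trainer,
    trainer_preprocessor=trainer_preprocessor,
    num_train_transitions=100_000,
    seq_len=6,
    batch_size=512,
    num_train_epochs=5,
    test_replay_buffer=test_replay_buffer,
)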
Example #5
    def build_trainer(self) -> Seq2RewardTrainer:
        seq2reward_network = self.net_builder.value.build_value_network(
            self.state_normalization_data)
        trainer = Seq2RewardTrainer(seq2reward_network=seq2reward_network,
                                    params=self.trainer_param)
        if self.use_gpu:
            # Move the reward network, the step-prediction network, and the
            # cached action-permutation tensor onto the GPU.
            trainer.seq2reward_network = trainer.seq2reward_network.cuda()
            trainer.step_predict_network = trainer.step_predict_network.cuda()
            trainer.all_permut = trainer.all_permut.cuda()

        return trainer
Example #6
    def build_trainer(
        self,
        normalization_data_map: Dict[str, NormalizationData],
        use_gpu: bool,
        reward_options: Optional[RewardOptions] = None,
    ) -> Seq2RewardTrainer:
        seq2reward_network = self.net_builder.value.build_value_network(
            normalization_data_map[NormalizationKey.STATE]
        )
        trainer = Seq2RewardTrainer(
            seq2reward_network=seq2reward_network, params=self.trainer_param
        )
        return trainer
Example #7
def train_and_eval_seq2reward_model(training_data,
                                    eval_data,
                                    learning_rate=0.01,
                                    num_epochs=5):
    SEQ_LEN, batch_size, NUM_ACTION = training_data[0].action.shape
    assert SEQ_LEN == 6 and NUM_ACTION == 2

    seq2reward_network = Seq2RewardNetwork(
        state_dim=NUM_ACTION,
        action_dim=NUM_ACTION,
        num_hiddens=64,
        num_hidden_layers=2,
    )

    trainer_param = Seq2RewardTrainerParameters(
        learning_rate=learning_rate,
        multi_steps=SEQ_LEN,
        action_names=["0", "1"],
        batch_size=batch_size,
        gamma=1.0,
        view_q_value=True,
    )

    trainer = Seq2RewardTrainer(seq2reward_network=seq2reward_network,
                                params=trainer_param)

    for _ in range(num_epochs):
        for batch in training_data:
            trainer.train(batch)

    total_eval_mse_loss = 0
    for batch in eval_data:
        mse_loss, _ = trainer.get_loss(batch)
        total_eval_mse_loss += mse_loss.cpu().detach().item()
    eval_mse_loss = total_eval_mse_loss / len(eval_data)

    # Query Q-values for the all-zero initial state across all cached action
    # permutations.
    initial_state = torch.Tensor([[0, 0]])
    q_values = torch.squeeze(
        get_Q(
            trainer.seq2reward_network,
            initial_state,
            trainer.all_permut,
        )
    )
    return eval_mse_loss, q_values
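
A possible follow-up to the function above; assuming get_Q returns one Q-value per action for the single initial state (so q_values squeezes to a 1-D tensor), the greedy first action could be read off like this:

# Hypothetical sketch; the q_values layout is an assumption, not guaranteed
# by the example above.
eval_mse_loss, q_values = train_and_eval_seq2reward_model(training_data, eval_data)
greedy_action = int(torch.argmax(q_values).item())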