def train_seq2reward_model(training_data, learning_rate=0.01, num_epochs=5):
    SEQ_LEN, batch_size, NUM_ACTION = next(iter(training_data)).action.shape
    assert SEQ_LEN == 6 and NUM_ACTION == 2

    seq2reward_network = Seq2RewardNetwork(
        # The state is NUM_ACTION-dimensional in this test setup, so
        # state_dim and action_dim coincide.
        state_dim=NUM_ACTION,
        action_dim=NUM_ACTION,
        num_hiddens=64,
        num_hidden_layers=2,
    )
    trainer_param = Seq2RewardTrainerParameters(
        learning_rate=learning_rate,
        multi_steps=SEQ_LEN,
        action_names=["0", "1"],
        gamma=1.0,
        view_q_value=True,
    )
    trainer = Seq2RewardTrainer(
        seq2reward_network=seq2reward_network, params=trainer_param
    )

    # SEED is assumed to be a module-level constant.
    pl.seed_everything(SEED)
    pl_trainer = pl.Trainer(max_epochs=num_epochs, deterministic=True)
    pl_trainer.fit(trainer, training_data)
    return trainer
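
# A minimal usage sketch, not from the source: `training_data` is assumed to be an
# iterable (e.g. a DataLoader) of batches whose `.action` tensor has shape
# (SEQ_LEN=6, batch_size, NUM_ACTION=2); `make_string_game_batches` is a
# hypothetical helper standing in for whatever actually builds those batches.
training_data = make_string_game_batches(seq_len=6, num_action=2)  # hypothetical
trained_trainer = train_seq2reward_model(
    training_data, learning_rate=0.001, num_epochs=10
)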
def build_trainer(self, use_gpu: bool) -> Seq2RewardTrainer:
    # use_gpu is accepted here but unused; compare the variants below,
    # which move the network to CUDA.
    seq2reward_network = self.net_builder.value.build_value_network(
        self.state_normalization_data
    )
    trainer = Seq2RewardTrainer(
        seq2reward_network=seq2reward_network, params=self.trainer_param
    )
    return trainer
def build_trainer(self) -> Seq2RewardTrainer:
    seq2reward_network = self.net_builder.value.build_value_network(
        self.state_normalization_data
    )
    # Move the network to GPU before the trainer captures a reference to it.
    if self.use_gpu:
        seq2reward_network = seq2reward_network.cuda()
    return Seq2RewardTrainer(
        seq2reward_network=seq2reward_network, params=self.trainer_param
    )
def train_seq2reward(
    env: EnvWrapper,
    trainer: Seq2RewardTrainer,
    trainer_preprocessor,
    num_train_transitions: int,
    seq_len: int,
    batch_size: int,
    num_train_epochs: int,
    # for optional validation
    test_replay_buffer=None,
):
    train_replay_buffer = ReplayBuffer(
        replay_capacity=num_train_transitions,
        batch_size=batch_size,
        stack_size=seq_len,
        return_everything_as_stack=True,
    )
    fill_replay_buffer(env, train_replay_buffer, num_train_transitions)
    num_batch_per_epoch = train_replay_buffer.size // batch_size
    logger.info("Made RBs, starting to train now!")
    # pyre-fixme[16]: `EnvWrapper` has no attribute `observation_space`.
    state_dim = env.observation_space.shape[0]
    for epoch in range(num_train_epochs):
        for i in range(num_batch_per_epoch):
            batch = train_replay_buffer.sample_transition_batch(
                batch_size=batch_size
            )
            preprocessed_batch = trainer_preprocessor(batch)
            adhoc_padding(preprocessed_batch, state_dim=state_dim)
            losses = trainer.train(preprocessed_batch)
            print_seq2reward_losses(epoch, i, losses)

        # validation
        if test_replay_buffer is not None:
            with torch.no_grad():
                trainer.seq2reward_network.eval()
                test_batch = test_replay_buffer.sample_transition_batch(
                    batch_size=batch_size
                )
                preprocessed_test_batch = trainer_preprocessor(test_batch)
                adhoc_padding(preprocessed_test_batch, state_dim=state_dim)
                valid_losses = trainer.get_loss(preprocessed_test_batch)
                print_seq2reward_losses(epoch, "validation", valid_losses)
                trainer.seq2reward_network.train()
    return trainer
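
# A usage sketch under assumptions: `trainer` comes from one of the build_trainer
# variants in this section, `Gym` is a Gym-style EnvWrapper from the surrounding
# codebase, and `make_trainer_preprocessor` is a hypothetical stand-in for however
# the preprocessor is actually built; names and signatures may differ.
env = Gym(env_name="CartPole-v0")  # assumed wrapper
preprocessor = make_trainer_preprocessor(trainer, torch.device("cpu"))  # hypothetical
trained = train_seq2reward(
    env=env,
    trainer=trainer,
    trainer_preprocessor=preprocessor,
    num_train_transitions=10000,
    seq_len=6,
    batch_size=512,
    num_train_epochs=3,
)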
def build_trainer(self) -> Seq2RewardTrainer:
    seq2reward_network = self.net_builder.value.build_value_network(
        self.state_normalization_data
    )
    trainer = Seq2RewardTrainer(
        seq2reward_network=seq2reward_network, params=self.trainer_param
    )
    if self.use_gpu:
        # The trainer owns several device-sensitive members, so each one is
        # moved to CUDA after construction.
        trainer.seq2reward_network = trainer.seq2reward_network.cuda()
        trainer.step_predict_network = trainer.step_predict_network.cuda()
        trainer.all_permut = trainer.all_permut.cuda()
    return trainer
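
# `trainer.all_permut` above enumerates every one-hot action sequence over the
# planning horizon; get_Q scores those sequences against the reward model. A
# sketch of how such a tensor could be built, assuming the
# (seq_len, num_permut, num_action) layout implied by its use in this section;
# the codebase's own helper may differ.
from itertools import product

import torch
import torch.nn.functional as F


def gen_all_permutations(seq_len: int, num_action: int) -> torch.Tensor:
    # All num_action ** seq_len action-index sequences, one row per sequence.
    seqs = torch.tensor(list(product(range(num_action), repeat=seq_len)))
    # One-hot encode and reorder to (seq_len, num_permut, num_action).
    return F.one_hot(seqs, num_action).transpose(0, 1).float()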
def build_trainer(
    self,
    normalization_data_map: Dict[str, NormalizationData],
    use_gpu: bool,
    reward_options: Optional[RewardOptions] = None,
) -> Seq2RewardTrainer:
    seq2reward_network = self.net_builder.value.build_value_network(
        normalization_data_map[NormalizationKey.STATE]
    )
    trainer = Seq2RewardTrainer(
        seq2reward_network=seq2reward_network, params=self.trainer_param
    )
    return trainer
def train_and_eval_seq2reward_model(
    training_data, eval_data, learning_rate=0.01, num_epochs=5
):
    SEQ_LEN, batch_size, NUM_ACTION = training_data[0].action.shape
    assert SEQ_LEN == 6 and NUM_ACTION == 2

    seq2reward_network = Seq2RewardNetwork(
        # The state is NUM_ACTION-dimensional in this test setup.
        state_dim=NUM_ACTION,
        action_dim=NUM_ACTION,
        num_hiddens=64,
        num_hidden_layers=2,
    )
    trainer_param = Seq2RewardTrainerParameters(
        learning_rate=learning_rate,
        multi_steps=SEQ_LEN,
        action_names=["0", "1"],
        batch_size=batch_size,
        gamma=1.0,
        view_q_value=True,
    )
    trainer = Seq2RewardTrainer(
        seq2reward_network=seq2reward_network, params=trainer_param
    )

    for _ in range(num_epochs):
        for batch in training_data:
            trainer.train(batch)

    # Mean MSE over the evaluation batches.
    total_eval_mse_loss = 0.0
    for batch in eval_data:
        mse_loss, _ = trainer.get_loss(batch)
        total_eval_mse_loss += mse_loss.cpu().detach().item()
    eval_mse_loss = total_eval_mse_loss / len(eval_data)

    # Q-values for each first action at a fixed initial state.
    initial_state = torch.Tensor([[0, 0]])
    q_values = torch.squeeze(
        get_Q(
            trainer.seq2reward_network,
            initial_state,
            trainer.all_permut,
        )
    )
    return eval_mse_loss, q_values
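
# Usage sketch: training_data and eval_data are assumed to be lists of
# preprocessed batches satisfying the asserts above (SEQ_LEN=6, NUM_ACTION=2).
eval_mse, q_values = train_and_eval_seq2reward_model(
    training_data, eval_data, learning_rate=0.01, num_epochs=5
)
print(f"eval MSE: {eval_mse:.4f}, Q at initial state: {q_values.tolist()}")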