def build_trainer(self) -> DQNTrainer:
    net_builder = self.net_builder.value
    q_network = net_builder.build_q_network(
        self.state_feature_config,
        self.state_normalization_parameters,
        # pyre-fixme[16]: `DiscreteDQN` has no attribute `action_names`.
        len(self.action_names),
    )

    if self.use_gpu:
        q_network = q_network.cuda()

    q_network_target = q_network.get_target_network()

    reward_network, q_network_cpe, q_network_cpe_target = None, None, None
    if self.trainer_param.evaluation.calc_cpe_in_training:
        # Metrics + reward
        num_output_nodes = (len(self.metrics_to_score) + 1) * len(
            self.trainer_param.actions
        )

        cpe_net_builder = self.cpe_net_builder.value
        reward_network = cpe_net_builder.build_q_network(
            self.state_feature_config,
            self.state_normalization_parameters,
            num_output_nodes,
        )
        q_network_cpe = cpe_net_builder.build_q_network(
            self.state_feature_config,
            self.state_normalization_parameters,
            num_output_nodes,
        )

        if self.use_gpu:
            reward_network.cuda()
            q_network_cpe.cuda()

        q_network_cpe_target = q_network_cpe.get_target_network()

    # pyre-fixme[16]: `DiscreteDQN` has no attribute `_q_network`.
    self._q_network = q_network
    trainer = DQNTrainer(
        q_network,
        q_network_target,
        reward_network,
        self.trainer_param,
        self.use_gpu,
        q_network_cpe=q_network_cpe,
        q_network_cpe_target=q_network_cpe_target,
        metrics_to_score=self.metrics_to_score,
        loss_reporter=NoOpLossReporter(),
    )
    return trainer
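
# Hedged illustration (not from the source): how the CPE output width computed
# above is laid out. With calc_cpe_in_training enabled, the reward and CPE
# Q-networks emit (len(metrics_to_score) + 1) * num_actions columns: the first
# num_actions columns are the reward head, followed by one num_actions-wide
# slice per metric. The names and tensors below are made up for demonstration.
import torch

num_actions = 3
metrics_to_score = ["metric_a", "metric_b"]  # hypothetical metric names
num_output_nodes = (len(metrics_to_score) + 1) * num_actions  # 9 columns

cpe_output = torch.randn(4, num_output_nodes)  # fake batch of 4 states
model_rewards = cpe_output[:, 0:num_actions]   # reward head
model_metrics = cpe_output[:, num_actions:]    # num_actions columns per metric
assert model_metrics.shape[1] == len(metrics_to_score) * num_actions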
@classmethod
def create_from_tensors_dqn(
    cls,
    trainer: DQNTrainer,
    mdp_ids: torch.Tensor,
    sequence_numbers: torch.Tensor,
    states: rlt.FeatureData,
    actions: rlt.FeatureData,
    propensities: torch.Tensor,
    rewards: torch.Tensor,
    possible_actions_mask: torch.Tensor,
    metrics: Optional[torch.Tensor] = None,
):
    # Remember the current train/eval modes so they can be restored at the end.
    old_q_train_state = trainer.q_network.training
    # pyre-fixme[16]: `DQNTrainer` has no attribute `reward_network`.
    old_reward_train_state = trainer.reward_network.training
    # pyre-fixme[16]: `DQNTrainer` has no attribute `q_network_cpe`.
    old_q_cpe_train_state = trainer.q_network_cpe.training
    trainer.q_network.train(False)
    trainer.reward_network.train(False)
    trainer.q_network_cpe.train(False)

    num_actions = trainer.num_actions
    action_mask = actions.float()

    rewards = trainer.boost_rewards(rewards, actions)
    model_values = trainer.q_network_cpe(states)[:, 0:num_actions]
    optimal_q_values, _ = trainer.get_detached_q_values(states)
    # Do we ever really use eval_action_idxs?
    eval_action_idxs = trainer.get_max_q_values(
        optimal_q_values, possible_actions_mask
    )[1]
    model_propensities = masked_softmax(
        optimal_q_values, possible_actions_mask, trainer.rl_temperature
    )
    assert model_values.shape == actions.shape, (
        "Invalid shape: " + str(model_values.shape) + " != " + str(actions.shape)
    )
    assert model_values.shape == possible_actions_mask.shape, (
        "Invalid shape: "
        + str(model_values.shape)
        + " != "
        + str(possible_actions_mask.shape)
    )
    model_values_for_logged_action = torch.sum(
        model_values * action_mask, dim=1, keepdim=True
    )

    rewards_and_metric_rewards = trainer.reward_network(states)

    # In case we reuse the modular for Q-network
    if hasattr(rewards_and_metric_rewards, "q_values"):
        rewards_and_metric_rewards = rewards_and_metric_rewards.q_values

    model_rewards = rewards_and_metric_rewards[:, 0:num_actions]
    assert model_rewards.shape == actions.shape, (
        "Invalid shape: " + str(model_rewards.shape) + " != " + str(actions.shape)
    )
    model_rewards_for_logged_action = torch.sum(
        model_rewards * action_mask, dim=1, keepdim=True
    )

    model_metrics = rewards_and_metric_rewards[:, num_actions:]
    assert model_metrics.shape[1] % num_actions == 0, (
        "Invalid metrics shape: " + str(model_metrics.shape) + " " + str(num_actions)
    )
    num_metrics = model_metrics.shape[1] // num_actions

    if num_metrics == 0:
        model_metrics_values = None
        model_metrics_for_logged_action = None
        model_metrics_values_for_logged_action = None
    else:
        model_metrics_values = trainer.q_network_cpe(states)
        # Backward compatibility
        if hasattr(model_metrics_values, "q_values"):
            model_metrics_values = model_metrics_values.q_values
        model_metrics_values = model_metrics_values[:, num_actions:]
        assert model_metrics_values.shape[1] == num_actions * num_metrics, (
            "Invalid shape: "
            + str(model_metrics_values.shape[1])
            + " != "
            + str(actions.shape[1] * num_metrics)
        )

        model_metrics_for_logged_action_list = []
        model_metrics_values_for_logged_action_list = []
        for metric_index in range(num_metrics):
            metric_start = metric_index * num_actions
            metric_end = (metric_index + 1) * num_actions
            model_metrics_for_logged_action_list.append(
                torch.sum(
                    model_metrics[:, metric_start:metric_end] * action_mask,
                    dim=1,
                    keepdim=True,
                )
            )
            model_metrics_values_for_logged_action_list.append(
                torch.sum(
                    model_metrics_values[:, metric_start:metric_end] * action_mask,
                    dim=1,
                    keepdim=True,
                )
            )
        model_metrics_for_logged_action = torch.cat(
            model_metrics_for_logged_action_list, dim=1
        )
        model_metrics_values_for_logged_action = torch.cat(
            model_metrics_values_for_logged_action_list, dim=1
        )

    # Restore the train/eval modes saved above.
    trainer.q_network_cpe.train(old_q_cpe_train_state)
    trainer.q_network.train(old_q_train_state)
    trainer.reward_network.train(old_reward_train_state)

    return cls(
        mdp_id=mdp_ids,
        sequence_number=sequence_numbers,
        logged_propensities=propensities,
        logged_rewards=rewards,
        action_mask=action_mask,
        model_rewards=model_rewards,
        model_rewards_for_logged_action=model_rewards_for_logged_action,
        model_values=model_values,
        model_values_for_logged_action=model_values_for_logged_action,
        model_metrics_values=model_metrics_values,
        model_metrics_values_for_logged_action=model_metrics_values_for_logged_action,
        model_propensities=model_propensities,
        logged_metrics=metrics,
        model_metrics=model_metrics,
        model_metrics_for_logged_action=model_metrics_for_logged_action,
        # Will compute later
        logged_values=None,
        logged_metrics_values=None,
        possible_actions_mask=possible_actions_mask,
        optimal_q_values=optimal_q_values,
        eval_action_idxs=eval_action_idxs,
    )
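
# Hedged illustration (not from the source): the one-hot action-mask reduction
# used repeatedly above. Multiplying per-action outputs by the mask and summing
# over dim=1 selects the logged action's column. Values are made up.
import torch

action_mask = torch.tensor([[0.0, 1.0], [1.0, 0.0]])   # one-hot logged actions
model_values = torch.tensor([[1.0, 2.0], [3.0, 4.0]])  # per-action Q-values
values_for_logged_action = torch.sum(model_values * action_mask, dim=1, keepdim=True)
# values_for_logged_action == tensor([[2.], [3.]])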
def get_modular_sarsa_trainer_reward_boost(
    self,
    environment,
    reward_shape,
    dueling,
    categorical,
    quantile,
    use_gpu=False,
    use_all_avail_gpus=False,
    clip_grad_norm=None,
):
    # Quantile and categorical heads are mutually exclusive.
    assert not quantile or not categorical
    parameters = self.get_sarsa_parameters(
        environment, reward_shape, dueling, categorical, quantile, clip_grad_norm
    )

    def make_dueling_dqn(num_atoms=None):
        return models.DuelingQNetwork.make_fully_connected(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=len(environment.ACTIONS),
            layers=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
            num_atoms=num_atoms,
        )

    if quantile:
        if dueling:
            q_network = make_dueling_dqn(num_atoms=parameters.rainbow.num_atoms)
        else:
            q_network = models.FullyConnectedDQN(
                state_dim=get_num_output_features(environment.normalization),
                action_dim=len(environment.ACTIONS),
                num_atoms=parameters.rainbow.num_atoms,
                sizes=parameters.training.layers[1:-1],
                activations=parameters.training.activations[:-1],
            )
    elif categorical:
        assert not dueling
        distributional_network = models.FullyConnectedDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=len(environment.ACTIONS),
            num_atoms=parameters.rainbow.num_atoms,
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
        q_network = models.CategoricalDQN(
            distributional_network,
            qmin=-100,
            qmax=200,
            num_atoms=parameters.rainbow.num_atoms,
        )
    else:
        if dueling:
            q_network = make_dueling_dqn()
        else:
            q_network = models.FullyConnectedDQN(
                state_dim=get_num_output_features(environment.normalization),
                action_dim=len(environment.ACTIONS),
                sizes=parameters.training.layers[1:-1],
                activations=parameters.training.activations[:-1],
            )

    q_network_cpe, q_network_cpe_target, reward_network = None, None, None
    if parameters.evaluation and parameters.evaluation.calc_cpe_in_training:
        q_network_cpe = models.FullyConnectedDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=len(environment.ACTIONS),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
        q_network_cpe_target = q_network_cpe.get_target_network()
        reward_network = models.FullyConnectedDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=len(environment.ACTIONS),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )

    if use_gpu:
        q_network = q_network.cuda()
        if parameters.evaluation.calc_cpe_in_training:
            reward_network = reward_network.cuda()
            q_network_cpe = q_network_cpe.cuda()
            q_network_cpe_target = q_network_cpe_target.cuda()
        if use_all_avail_gpus and not categorical:
            q_network = q_network.get_distributed_data_parallel_model()
            reward_network = reward_network.get_distributed_data_parallel_model()
            q_network_cpe = q_network_cpe.get_distributed_data_parallel_model()
            q_network_cpe_target = (
                q_network_cpe_target.get_distributed_data_parallel_model()
            )

    if quantile:
        parameters = QRDQNTrainerParameters.from_discrete_action_model_parameters(
            parameters
        )
        trainer = QRDQNTrainer(
            q_network,
            q_network.get_target_network(),
            parameters,
            use_gpu,
            reward_network=reward_network,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe_target,
        )
    elif categorical:
        parameters = C51TrainerParameters.from_discrete_action_model_parameters(
            parameters
        )
        trainer = C51Trainer(
            q_network, q_network.get_target_network(), parameters, use_gpu
        )
    else:
        parameters = DQNTrainerParameters.from_discrete_action_model_parameters(
            parameters
        )
        trainer = DQNTrainer(
            q_network,
            q_network.get_target_network(),
            reward_network,
            parameters,
            use_gpu,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe_target,
        )
    return trainer
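
# Hedged illustration (not from the source): the value support implied by the
# CategoricalDQN arguments above (qmin=-100, qmax=200). C51 distributes
# probability mass over num_atoms evenly spaced atoms on [qmin, qmax]; the
# atom count used here is an assumption for demonstration.
import torch

qmin, qmax, num_atoms = -100.0, 200.0, 51  # 51 atoms is illustrative
support = torch.linspace(qmin, qmax, num_atoms)
delta_z = (qmax - qmin) / (num_atoms - 1)  # atom spacing: 6.0 with 51 atoms
assert torch.isclose(support[1] - support[0], torch.tensor(delta_z))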
def create_dqn_trainer_from_params(
    model: DiscreteActionModelParameters,
    normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    use_all_avail_gpus: bool = False,
    metrics_to_score=None,
):
    metrics_to_score = metrics_to_score or []

    if model.rainbow.quantile:
        q_network = QuantileDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            num_atoms=model.rainbow.num_atoms,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )
    elif model.rainbow.categorical:
        q_network = CategoricalDQN(  # type: ignore
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            num_atoms=model.rainbow.num_atoms,
            qmin=model.rainbow.qmin,
            qmax=model.rainbow.qmax,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
            use_gpu=use_gpu,
        )
    elif model.rainbow.dueling_architecture:
        q_network = DuelingQNetwork(  # type: ignore
            layers=[get_num_output_features(normalization_parameters)]
            + model.training.layers[1:-1]
            + [len(model.actions)],
            activations=model.training.activations,
        )
    else:
        q_network = FullyConnectedDQN(  # type: ignore
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )

    if use_gpu and torch.cuda.is_available():
        q_network = q_network.cuda()

    q_network_target = q_network.get_target_network()

    reward_network, q_network_cpe, q_network_cpe_target = None, None, None
    if model.evaluation.calc_cpe_in_training:
        # Metrics + reward
        num_output_nodes = (len(metrics_to_score) + 1) * len(model.actions)
        reward_network = FullyConnectedDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=num_output_nodes,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )
        q_network_cpe = FullyConnectedDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=num_output_nodes,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )

        if use_gpu and torch.cuda.is_available():
            reward_network.cuda()
            q_network_cpe.cuda()

        q_network_cpe_target = q_network_cpe.get_target_network()

    if (
        use_all_avail_gpus
        and not model.rainbow.categorical
        and not model.rainbow.quantile
    ):
        q_network = q_network.get_distributed_data_parallel_model()
        reward_network = (
            reward_network.get_distributed_data_parallel_model()
            if reward_network
            else None
        )
        q_network_cpe = (
            q_network_cpe.get_distributed_data_parallel_model()
            if q_network_cpe
            else None
        )

    if model.rainbow.quantile:
        assert (
            not use_all_avail_gpus
        ), "use_all_avail_gpus not implemented for distributional RL"
        parameters = QRDQNTrainerParameters.from_discrete_action_model_parameters(model)
        return QRDQNTrainer(
            q_network,
            q_network_target,
            parameters,
            use_gpu,
            metrics_to_score=metrics_to_score,
            reward_network=reward_network,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe_target,
        )
    elif model.rainbow.categorical:
        assert (
            not use_all_avail_gpus
        ), "use_all_avail_gpus not implemented for distributional RL"
        return C51Trainer(
            q_network,
            q_network_target,
            C51TrainerParameters.from_discrete_action_model_parameters(model),
            use_gpu,
            metrics_to_score=metrics_to_score,
        )
    else:
        parameters = DQNTrainerParameters.from_discrete_action_model_parameters(model)
        return DQNTrainer(
            q_network,
            q_network_target,
            reward_network,
            parameters,
            use_gpu,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe_target,
            metrics_to_score=metrics_to_score,
        )
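
# Hedged illustration (not from the source): the layer/activation slicing used
# throughout the factories above. model.training.layers appears to include
# input and output placeholders (commonly -1 in these configs), so the hidden
# sizes are layers[1:-1] and their activations are activations[:-1], with the
# final activation reserved for the output layer. Values are illustrative.
layers = [-1, 256, 128, -1]              # [input, hidden..., output] placeholders
activations = ["relu", "relu", "linear"]
hidden_sizes = layers[1:-1]              # [256, 128]
hidden_activations = activations[:-1]    # ["relu", "relu"]
dueling_layers = [64] + layers[1:-1] + [4]  # explicit state_dim=64, action_dim=4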