def test_discrete_action(self):
    """Run the input prototype through a batch-norm dueling network and check shapes."""
    num_state_features = 8
    num_actions = 4
    layer_sizes = [num_state_features, 8, 4, num_actions]
    network = DuelingQNetwork(
        layers=layer_sizes,
        activations=["relu", "relu", "linear"],
        use_batch_norm=True,
    )

    prototype = network.input_prototype()
    self.assertEqual(
        (1, num_state_features), prototype.state.float_features.shape
    )

    # The prototype carries a single example, and batch norm in training mode
    # needs more than one, so switch to eval mode before the forward pass.
    network.eval()
    output = network(prototype)
    self.assertEqual((1, num_actions), output.q_values.shape)
def test_save_load_discrete_action_batch_norm(self):
    """Save/load round trip for a dueling Q-network that was put in eval mode."""
    num_state_features, num_actions = 8, 4
    # NOTE(review): the test name mentions batch norm but the network is built
    # with use_batch_norm=False -- confirm whether True was intended.
    network = DuelingQNetwork(
        layers=[num_state_features, 8, 4, num_actions],
        activations=["relu", "relu", "linear"],
        use_batch_norm=False,
    )
    # Freeze batch norm by switching to eval mode.
    network.eval()
    # Per the original author's note, the expected parameter count matches the
    # non-batch-norm test because DuelingQNetwork always initializes its batch
    # norm layer, even when it is not used.
    num_params, num_inputs, num_outputs = 22, 1, 1
    check_save_load(self, network, num_params, num_inputs, num_outputs)
def _get_model(self, training_parameters, dueling_architecture=False):
    """Pick and construct the network described by ``training_parameters``.

    Returns a ``DuelingQNetwork`` when ``dueling_architecture`` is set, a
    ``FullyConnectedNetwork`` when no factorization parameters are given, and
    otherwise a ``ParametricInnerProduct`` over separate state and action
    ``FullyConnectedNetwork`` instances.
    """
    if dueling_architecture:
        return DuelingQNetwork(
            training_parameters.layers,
            training_parameters.activations,
            action_dim=self.num_action_features,
        )

    factorization = training_parameters.factorization_parameters
    if factorization is None:
        return FullyConnectedNetwork(
            training_parameters.layers,
            training_parameters.activations,
            use_noisy_linear_layers=training_parameters.use_noisy_linear_layers,
        )

    # Factorized model: the state network is built first, then the action
    # network, and the two are combined by ParametricInnerProduct.
    state_network = FullyConnectedNetwork(
        factorization.state.layers,
        factorization.state.activations,
    )
    action_network = FullyConnectedNetwork(
        factorization.action.layers,
        factorization.action.activations,
    )
    return ParametricInnerProduct(
        state_network,
        action_network,
        self.num_state_features,
        self.num_action_features,
    )
def build_q_network(
    self,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    output_dim: int,
) -> ModelBase:
    """Build a ``DuelingQNetwork`` from this builder's config.

    The input width is derived from ``state_normalization_parameters`` via
    ``self._get_input_dim``; the hidden widths come from ``self.config.sizes``
    and the final layer has ``output_dim`` units with a "linear" activation.
    """
    input_dim = self._get_input_dim(state_normalization_parameters)
    layer_sizes = [input_dim] + self.config.sizes + [output_dim]
    layer_activations = self.config.activations + ["linear"]
    return DuelingQNetwork(layers=layer_sizes, activations=layer_activations)
def test_save_load_discrete_action(self):
    """Save/load round trip for a dueling Q-network built without batch norm."""
    num_state_features, num_actions = 8, 4
    network = DuelingQNetwork(
        layers=[num_state_features, 8, 4, num_actions],
        activations=["relu", "relu", "linear"],
        use_batch_norm=False,
    )
    num_params, num_inputs, num_outputs = 22, 1, 1
    check_save_load(self, network, num_params, num_inputs, num_outputs)
def create_dqn_trainer_from_params(
    model: DiscreteActionModelParameters,
    normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    use_all_avail_gpus: bool = False,
    metrics_to_score=None,
):
    """Build the Q-network selected by ``model.rainbow`` and wrap it in a trainer.

    Network choice, in priority order: ``QuantileDQN`` (``rainbow.quantile``),
    ``CategoricalDQN`` (``rainbow.categorical``), ``DuelingQNetwork``
    (``rainbow.dueling_architecture``), otherwise ``FullyConnectedDQN``.
    When ``model.evaluation.calc_cpe_in_training`` is set, a reward network
    and a CPE Q-network (plus its target) are built as well.

    Returns a ``QRDQNTrainer``, ``C51Trainer`` or ``DQNTrainer`` respectively.

    Raises:
        AssertionError: if ``use_all_avail_gpus`` is requested together with
            the quantile or categorical (distributional) variants.
    """
    metrics_to_score = metrics_to_score or []

    def _fully_connected_dqn(output_dim):
        # Hidden sizes drop the first and last configured layer entries, and
        # the last configured activation is dropped too.
        return FullyConnectedDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=output_dim,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )

    if model.rainbow.quantile:
        q_network = QuantileDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            num_atoms=model.rainbow.num_atoms,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )
    elif model.rainbow.categorical:
        q_network = CategoricalDQN(  # type: ignore
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            num_atoms=model.rainbow.num_atoms,
            qmin=model.rainbow.qmin,
            qmax=model.rainbow.qmax,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
            use_gpu=use_gpu,
        )
    elif model.rainbow.dueling_architecture:
        input_dim = get_num_output_features(normalization_parameters)
        dueling_layers = (
            [input_dim] + model.training.layers[1:-1] + [len(model.actions)]
        )
        q_network = DuelingQNetwork(  # type: ignore
            layers=dueling_layers,
            activations=model.training.activations,
        )
    else:
        q_network = _fully_connected_dqn(len(model.actions))  # type: ignore

    if use_gpu and torch.cuda.is_available():
        q_network = q_network.cuda()

    # The target network is taken before any data-parallel wrapping below.
    q_network_target = q_network.get_target_network()

    reward_network = q_network_cpe = q_network_cpe_target = None
    if model.evaluation.calc_cpe_in_training:
        # Metrics + reward: one output per action for each of them.
        num_output_nodes = (len(metrics_to_score) + 1) * len(model.actions)
        reward_network = _fully_connected_dqn(num_output_nodes)
        q_network_cpe = _fully_connected_dqn(num_output_nodes)

        if use_gpu and torch.cuda.is_available():
            reward_network.cuda()
            q_network_cpe.cuda()

        q_network_cpe_target = q_network_cpe.get_target_network()

    is_distributional = model.rainbow.categorical or model.rainbow.quantile

    if use_all_avail_gpus and not is_distributional:
        q_network = q_network.get_distributed_data_parallel_model()
        reward_network = (
            reward_network.get_distributed_data_parallel_model()
            if reward_network
            else None
        )
        q_network_cpe = (
            q_network_cpe.get_distributed_data_parallel_model()
            if q_network_cpe
            else None
        )

    if is_distributional:
        assert (
            not use_all_avail_gpus
        ), "use_all_avail_gpus not implemented for distributional RL"
        # Quantile takes precedence over categorical, as in network selection.
        trainer_cls = QRDQNTrainer if model.rainbow.quantile else C51Trainer
        return trainer_cls(
            q_network,
            q_network_target,
            model,
            use_gpu,
            metrics_to_score=metrics_to_score,
        )

    return DQNTrainer(
        q_network,
        q_network_target,
        reward_network,
        model,
        use_gpu,
        q_network_cpe=q_network_cpe,
        q_network_cpe_target=q_network_cpe_target,
        metrics_to_score=metrics_to_score,
    )
def get_modular_sarsa_trainer_reward_boost(
    self,
    environment,
    reward_shape,
    dueling,
    categorical,
    quantile,
    use_gpu=False,
    use_all_avail_gpus=False,
    clip_grad_norm=None,
):
    """Build a trainer for ``environment`` from ``get_sarsa_parameters``.

    The Q-network is chosen from the flags:

    * ``quantile``: ``DuelingQuantileDQN`` if ``dueling`` else ``QuantileDQN``
    * ``categorical``: ``CategoricalDQN`` (``dueling`` must be false)
    * otherwise: ``DuelingQNetwork`` if ``dueling`` else ``FullyConnectedDQN``

    When the parameters enable ``evaluation.calc_cpe_in_training``, a CPE
    Q-network, its target and a reward network are built too; otherwise all
    three stay ``None``.

    Returns:
        A ``QRDQNTrainer`` (quantile), ``C51Trainer`` (categorical) or
        ``DQNTrainer`` (default).

    Raises:
        AssertionError: if both ``quantile`` and ``categorical`` are set, or
            if ``categorical`` is combined with ``dueling``.
    """
    assert not quantile or not categorical
    parameters = self.get_sarsa_parameters(
        environment, reward_shape, dueling, categorical, quantile, clip_grad_norm
    )

    def _fully_connected_dqn():
        # Shared construction for the plain Q-network, the CPE Q-network and
        # the reward network; all use one output per environment action.
        return FullyConnectedDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=len(environment.ACTIONS),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )

    if quantile:
        if dueling:
            q_network = DuelingQuantileDQN(
                layers=[get_num_output_features(environment.normalization)]
                + parameters.training.layers[1:-1]
                + [len(environment.ACTIONS)],
                activations=parameters.training.activations,
                num_atoms=parameters.rainbow.num_atoms,
            )
        else:
            q_network = QuantileDQN(
                state_dim=get_num_output_features(environment.normalization),
                action_dim=len(environment.ACTIONS),
                num_atoms=parameters.rainbow.num_atoms,
                sizes=parameters.training.layers[1:-1],
                activations=parameters.training.activations[:-1],
            )
    elif categorical:
        assert not dueling
        q_network = CategoricalDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=len(environment.ACTIONS),
            num_atoms=parameters.rainbow.num_atoms,
            qmin=-100,
            qmax=200,
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
    elif dueling:
        q_network = DuelingQNetwork(
            layers=[get_num_output_features(environment.normalization)]
            + parameters.training.layers[1:-1]
            + [len(environment.ACTIONS)],
            activations=parameters.training.activations,
        )
    else:
        q_network = _fully_connected_dqn()

    q_network_cpe, q_network_cpe_target, reward_network = None, None, None
    if parameters.evaluation and parameters.evaluation.calc_cpe_in_training:
        q_network_cpe = _fully_connected_dqn()
        q_network_cpe_target = q_network_cpe.get_target_network()
        reward_network = _fully_connected_dqn()

    # The three CPE networks are created together, so one check covers them.
    has_cpe_networks = q_network_cpe is not None

    if use_gpu:
        q_network = q_network.cuda()
        # Key off the networks actually built rather than re-reading
        # parameters.evaluation, which may be unset.
        if has_cpe_networks:
            reward_network = reward_network.cuda()
            q_network_cpe = q_network_cpe.cuda()
            q_network_cpe_target = q_network_cpe_target.cuda()
        # NOTE(review): nesting of the multi-GPU step under use_gpu was
        # reconstructed from whitespace-collapsed source -- confirm.
        if use_all_avail_gpus and not categorical:
            q_network = q_network.get_distributed_data_parallel_model()
            # Without CPE the auxiliary networks are None and must be skipped.
            if has_cpe_networks:
                reward_network = (
                    reward_network.get_distributed_data_parallel_model()
                )
                q_network_cpe = (
                    q_network_cpe.get_distributed_data_parallel_model()
                )
                q_network_cpe_target = (
                    q_network_cpe_target.get_distributed_data_parallel_model()
                )

    if quantile:
        trainer = QRDQNTrainer(
            q_network,
            q_network.get_target_network(),
            parameters,
            use_gpu,
            reward_network=reward_network,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe_target,
        )
    elif categorical:
        trainer = C51Trainer(
            q_network, q_network.get_target_network(), parameters, use_gpu
        )
    else:
        trainer_parameters = (
            DQNTrainerParameters.from_discrete_action_model_parameters(parameters)
        )
        trainer = DQNTrainer(
            q_network,
            q_network.get_target_network(),
            reward_network,
            trainer_parameters,
            use_gpu,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe_target,
        )
    return trainer
def __init__(
    self,
    parameters: DiscreteActionModelParameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    additional_feature_types: AdditionalFeatureTypes = DEFAULT_ADDITIONAL_FEATURE_TYPES,
    metrics_to_score=None,
    gradient_handler=None,
    use_all_avail_gpus: bool = False,
) -> None:
    """Set up the Q-network, its target copy, the optimizer and CPE networks.

    Note that ``parameters.training.layers`` is modified in place: the last
    entry is set to ``self.num_actions`` and, when no CNN parameters are
    given, the first entry is set to the number of state features.
    """
    training = parameters.training

    self.double_q_learning = parameters.rainbow.double_q_learning
    self.warm_start_model_path = training.warm_start_model_path
    self.minibatch_size = training.minibatch_size
    if parameters.actions is not None:
        self._actions = parameters.actions
    else:
        self._actions = []

    if training.cnn_parameters is None:
        self.state_normalization_parameters: Optional[
            Dict[int, NormalizationParameters]
        ] = state_normalization_parameters
        self.num_features = get_num_output_features(
            state_normalization_parameters
        )
        logger.info("Number of state features: " + str(self.num_features))
        # Input width follows the normalized state feature count.
        training.layers[0] = self.num_features
    else:
        self.state_normalization_parameters = None
    # Output width is one unit per action, regardless of the input type.
    training.layers[-1] = self.num_actions

    RLTrainer.__init__(
        self,
        parameters,
        use_gpu,
        additional_feature_types,
        metrics_to_score,
        gradient_handler,
        actions=self._actions,
    )

    # One boost slot per action, zero unless configured in rl.reward_boost.
    self.reward_boosts = torch.zeros([1, len(self._actions)]).type(self.dtype)
    if parameters.rl.reward_boost is not None:
        for action_name, boost in parameters.rl.reward_boost.items():
            self.reward_boosts[0, self._actions.index(action_name)] = boost

    if parameters.rainbow.dueling_architecture:
        self.q_network = DuelingQNetwork(
            training.layers,
            training.activations,
            use_batch_norm=training.use_batch_norm,
        )
    else:
        # Options common to the fully-connected and convolutional networks.
        shared_options = {
            "use_noisy_linear_layers": training.use_noisy_linear_layers,
            "min_std": training.weight_init_min_std,
            "use_batch_norm": training.use_batch_norm,
        }
        if training.cnn_parameters is None:
            self.q_network = FullyConnectedNetwork(
                training.layers,
                training.activations,
                **shared_options,
            )
        else:
            self.q_network = ConvolutionalNetwork(
                training.cnn_parameters,
                training.layers,
                training.activations,
                **shared_options,
            )

    # The target starts as an independent deep copy of the training network.
    self.q_network_target = deepcopy(self.q_network)
    self.q_network._name = "training"
    self.q_network_target._name = "target"

    self._set_optimizer(training.optimizer)
    self.q_network_optimizer = self.optimizer_func(
        self.q_network.parameters(),
        lr=training.learning_rate,
        weight_decay=training.l2_decay,
    )

    self._init_cpe_networks(parameters, use_all_avail_gpus)

    if self.use_gpu:
        self.q_network.cuda()
        self.q_network_target.cuda()

        if use_all_avail_gpus:
            self.q_network = torch.nn.DataParallel(self.q_network)
            self.q_network_target = torch.nn.DataParallel(self.q_network_target)
def create_dqn_trainer_from_params(
    model: DiscreteActionModelParameters,
    normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    metrics_to_score=None,
):
    """Build a ``DQNTrainer`` for ``model``.

    The Q-network is a ``DuelingQNetwork`` when
    ``model.rainbow.dueling_architecture`` is set and a ``FullyConnectedDQN``
    otherwise. When ``model.evaluation.calc_cpe_in_training`` is set, a
    reward network and a CPE Q-network (plus its target) are built as well;
    otherwise those three are passed to the trainer as ``None``.
    """
    metrics_to_score = metrics_to_score or []

    def _fully_connected_dqn(output_dim):
        # Hidden sizes drop the first and last configured layer entries, and
        # the last configured activation is dropped too.
        return FullyConnectedDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=output_dim,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )

    if model.rainbow.dueling_architecture:
        input_dim = get_num_output_features(normalization_parameters)
        dueling_layers = (
            [input_dim] + model.training.layers[1:-1] + [len(model.actions)]
        )
        q_network = DuelingQNetwork(
            layers=dueling_layers,
            activations=model.training.activations,
        )
    else:
        q_network = _fully_connected_dqn(len(model.actions))

    if use_gpu and torch.cuda.is_available():
        q_network = q_network.cuda()

    q_network_target = q_network.get_target_network()

    reward_network = q_network_cpe = q_network_cpe_target = None
    if model.evaluation.calc_cpe_in_training:
        # Metrics + reward: one output per action for each of them.
        num_output_nodes = (len(metrics_to_score) + 1) * len(model.actions)
        reward_network = _fully_connected_dqn(num_output_nodes)
        q_network_cpe = _fully_connected_dqn(num_output_nodes)

        if use_gpu and torch.cuda.is_available():
            reward_network.cuda()
            q_network_cpe.cuda()

        q_network_cpe_target = q_network_cpe.get_target_network()

    return DQNTrainer(
        q_network,
        q_network_target,
        reward_network,
        model,
        use_gpu,
        q_network_cpe=q_network_cpe,
        q_network_cpe_target=q_network_cpe_target,
        metrics_to_score=metrics_to_score,
    )