def metrics_to_score(self) -> List[str]:
    assert self.reward_options is not None
    if self._metrics_to_score is None:
        # pyre-fixme[16]: `ParametricDQNBase` has no attribute `_metrics_to_score`.
        self._metrics_to_score = get_metrics_to_score(
            self._reward_options.metric_reward_values
        )
    return self._metrics_to_score
def setUp(self):
    # preparing various components for PPO trainer initialization
    self.batch_size = 3
    self.state_dim = 10
    self.action_dim = 2
    self.num_layers = 2
    self.sizes = [20 for _ in range(self.num_layers)]
    self.activations = ["relu" for _ in range(self.num_layers)]
    self.use_layer_norm = False
    self.softmax_temperature = 1
    self.actions = [str(i) for i in range(self.action_dim)]
    self.params = PPOTrainerParameters(actions=self.actions, normalize=False)
    self.reward_options = RewardOptions()
    self.metrics_to_score = get_metrics_to_score(
        self.reward_options.metric_reward_values
    )
    self.policy_network = DuelingQNetwork.make_fully_connected(
        state_dim=self.state_dim,
        action_dim=self.action_dim,
        layers=self.sizes,
        activations=self.activations,
    )
    self.sampler = SoftmaxActionSampler(temperature=self.softmax_temperature)
    self.policy = Policy(scorer=self.policy_network, sampler=self.sampler)
    self.value_network = FloatFeatureFullyConnected(
        state_dim=self.state_dim,
        output_dim=1,
        sizes=self.sizes,
        activations=self.activations,
        use_layer_norm=self.use_layer_norm,
    )
def metrics_to_score(self) -> List[str]:
    assert self.reward_options is not None
    if self._metrics_to_score is None:
        self._metrics_to_score = get_metrics_to_score(
            self._reward_options.metric_reward_values
        )
    return self._metrics_to_score
def metrics_to_score(self) -> List[str]:
    assert self._reward_options is not None
    # pyre-fixme[16]: `ActorCriticBase` has no attribute `_metrics_to_score`.
    if self._metrics_to_score is None:
        self._metrics_to_score = get_metrics_to_score(
            self._reward_options.metric_reward_values
        )
    return self._metrics_to_score
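# The three metrics_to_score properties above share one lazy-caching pattern:
# derive the metric-name list from RewardOptions.metric_reward_values once,
# then reuse the cached value. Below is a minimal standalone sketch of that
# pattern; get_metric_names and _MetricsToScoreSketch are illustrative names,
# not the ReAgent implementation, and the assumption that the metric names
# are simply the keys of the reward-weight mapping is mine.
from typing import Dict, List, Optional


def get_metric_names(metric_reward_values: Dict[str, float]) -> List[str]:
    # Assumption: one metric name per entry in the reward-weight mapping.
    return sorted(metric_reward_values.keys())


class _MetricsToScoreSketch:
    def __init__(self, metric_reward_values: Dict[str, float]) -> None:
        self._metric_reward_values = metric_reward_values
        self._metrics_to_score: Optional[List[str]] = None

    @property
    def metrics_to_score(self) -> List[str]:
        # Compute once, cache on the instance, return the cached list afterwards.
        if self._metrics_to_score is None:
            self._metrics_to_score = get_metric_names(self._metric_reward_values)
        return self._metrics_to_score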
def build_trainer(
    self,
    normalization_data_map: Dict[str, NormalizationData],
    use_gpu: bool,
    reward_options: Optional[RewardOptions] = None,
) -> ParametricDQNTrainer:
    net_builder = self.net_builder.value
    # pyre-fixme[16]: `ParametricDQN` has no attribute `_q_network`.
    self._q_network = net_builder.build_q_network(
        normalization_data_map[NormalizationKey.STATE],
        normalization_data_map[NormalizationKey.ACTION],
    )

    # Metrics + reward
    reward_options = reward_options or RewardOptions()
    metrics_to_score = get_metrics_to_score(reward_options.metric_reward_values)
    reward_output_dim = len(metrics_to_score) + 1
    reward_network = net_builder.build_q_network(
        normalization_data_map[NormalizationKey.STATE],
        normalization_data_map[NormalizationKey.ACTION],
        output_dim=reward_output_dim,
    )

    q_network_target = self._q_network.get_target_network()
    return ParametricDQNTrainer(
        q_network=self._q_network,
        q_network_target=q_network_target,
        reward_network=reward_network,
        # pyre-fixme[16]: `ParametricDQNTrainerParameters` has no attribute
        #  `asdict`.
        **self.trainer_param.asdict(),
    )
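# A hedged usage sketch for the builder above: build_trainer only needs a
# normalization_data_map with state and action entries plus RewardOptions.
# The manager instance and the NormalizationData values are assumed to come
# from earlier setup steps outside this snippet, so the call is left
# commented out; parametric_dqn_manager, state_normalization_data, and
# action_normalization_data are hypothetical names.
#
# trainer = parametric_dqn_manager.build_trainer(
#     normalization_data_map={
#         NormalizationKey.STATE: state_normalization_data,
#         NormalizationKey.ACTION: action_normalization_data,
#     },
#     use_gpu=False,
#     reward_options=RewardOptions(),
# )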
def build_trainer(
    self,
    normalization_data_map: Dict[str, NormalizationData],
    use_gpu: bool,
    reward_options: Optional[RewardOptions] = None,
) -> QRDQNTrainer:
    net_builder = self.net_builder.value
    q_network = net_builder.build_q_network(
        normalization_data_map[NormalizationKey.STATE],
        len(self.action_names),
        # pyre-fixme[16]: `QRDQNTrainerParameters` has no attribute `num_atoms`.
        num_atoms=self.trainer_param.num_atoms,
    )
    q_network_target = q_network.get_target_network()

    reward_options = reward_options or RewardOptions()
    metrics_to_score = get_metrics_to_score(reward_options.metric_reward_values)

    reward_network, q_network_cpe, q_network_cpe_target = None, None, None
    if self.eval_parameters.calc_cpe_in_training:
        # Metrics + reward
        num_output_nodes = (len(metrics_to_score) + 1) * len(
            # pyre-fixme[16]: `QRDQNTrainerParameters` has no attribute `actions`.
            self.trainer_param.actions
        )

        cpe_net_builder = self.cpe_net_builder.value
        reward_network = cpe_net_builder.build_q_network(
            self.state_feature_config,
            normalization_data_map[NormalizationKey.STATE],
            num_output_nodes,
        )
        q_network_cpe = cpe_net_builder.build_q_network(
            self.state_feature_config,
            normalization_data_map[NormalizationKey.STATE],
            num_output_nodes,
        )
        q_network_cpe_target = q_network_cpe.get_target_network()

    trainer = QRDQNTrainer(
        q_network=q_network,
        q_network_target=q_network_target,
        reward_network=reward_network,
        q_network_cpe=q_network_cpe,
        q_network_cpe_target=q_network_cpe_target,
        metrics_to_score=metrics_to_score,
        evaluation=self.eval_parameters,
        # pyre-fixme[16]: `QRDQNTrainerParameters` has no attribute `asdict`.
        **self.trainer_param.asdict(),
    )
    return trainer
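# Sizing sketch for the CPE networks built above: each CPE head predicts the
# reward plus every extra metric, per action, so the output width is
# (len(metrics_to_score) + 1) * len(actions). The values below are
# illustrative only, not taken from any real config.
example_metrics_to_score = ["metric_a", "metric_b", "metric_c"]
example_actions = ["0", "1"]
example_num_output_nodes = (len(example_metrics_to_score) + 1) * len(example_actions)
assert example_num_output_nodes == 8  # (3 metrics + 1 reward) * 2 actions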
def setUp(self):
    # preparing various components for QR-DQN trainer initialization
    self.params = QRDQNTrainerParameters(actions=["1", "2"], num_atoms=11)
    self.reward_options = RewardOptions()
    self.metrics_to_score = get_metrics_to_score(
        self.reward_options.metric_reward_values
    )
    self.state_dim = 10
    self.action_dim = 2
    self.sizes = [20, 20]
    self.num_atoms = 11
    self.activations = ["relu", "relu"]
    self.dropout_ratio = 0
    self.q_network = FullyConnectedDQN(
        state_dim=self.state_dim,
        action_dim=self.action_dim,
        sizes=self.sizes,
        num_atoms=self.num_atoms,
        activations=self.activations,
        dropout_ratio=self.dropout_ratio,
    )
    self.q_network_target = self.q_network.get_target_network()
    self.x = FeatureData(float_features=torch.rand(5, 10))
    self.eval_parameters = EvaluationParameters(calc_cpe_in_training=True)
    self.num_output_nodes = (len(self.metrics_to_score) + 1) * len(
        # pyre-fixme[16]: `QRDQNTrainerParameters` has no attribute `actions`.
        self.params.actions
    )
    self.reward_network = FullyConnectedDQN(
        state_dim=self.state_dim,
        action_dim=self.num_output_nodes,
        sizes=self.sizes,
        activations=self.activations,
    )
    self.q_network_cpe = FullyConnectedDQN(
        state_dim=self.state_dim,
        action_dim=self.num_output_nodes,
        sizes=self.sizes,
        activations=self.activations,
    )
    self.q_network_cpe_target = self.q_network_cpe.get_target_network()
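# A hedged sketch of how the components prepared in the setUp above could be
# assembled in a test body. The QRDQNTrainer keyword arguments mirror the
# build_trainer shown earlier in this section; _build_trainer_from_setup is an
# illustrative helper name and the exact QRDQNTrainer constructor surface is
# an assumption, not something confirmed by the original test.
def _build_trainer_from_setup(self):
    return QRDQNTrainer(
        q_network=self.q_network,
        q_network_target=self.q_network_target,
        reward_network=self.reward_network,
        q_network_cpe=self.q_network_cpe,
        q_network_cpe_target=self.q_network_cpe_target,
        metrics_to_score=self.metrics_to_score,
        evaluation=self.eval_parameters,
        **self.params.asdict(),
    )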
def setUp(self):
    # preparing various components for CRR trainer initialization
    self.batch_size = 3
    self.state_dim = 10
    self.action_dim = 2
    self.num_layers = 2
    self.sizes = [20 for _ in range(self.num_layers)]
    self.num_atoms = 11
    self.activations = ["relu" for _ in range(self.num_layers)]
    self.dropout_ratio = 0
    self.exploration_variance = 1e-10
    self.actions = [str(i) for i in range(self.action_dim)]
    self.params = CRRTrainerParameters(actions=self.actions)
    self.reward_options = RewardOptions()
    self.metrics_to_score = get_metrics_to_score(
        self.reward_options.metric_reward_values
    )
    self.actor_network = FullyConnectedActor(
        state_dim=self.state_dim,
        action_dim=self.action_dim,
        sizes=self.sizes,
        activations=self.activations,
        exploration_variance=self.exploration_variance,
    )
    self.actor_network_target = self.actor_network.get_target_network()
    self.q1_network = FullyConnectedDQN(
        state_dim=self.state_dim,
        action_dim=self.action_dim,
        sizes=self.sizes,
        activations=self.activations,
        dropout_ratio=self.dropout_ratio,
    )
    self.q1_network_target = self.q1_network.get_target_network()
    self.q2_network = FullyConnectedDQN(
        state_dim=self.state_dim,
        action_dim=self.action_dim,
        sizes=self.sizes,
        activations=self.activations,
        dropout_ratio=self.dropout_ratio,
    )
    self.q2_network_target = self.q2_network.get_target_network()
    self.num_output_nodes = (len(self.metrics_to_score) + 1) * len(
        self.params.actions
    )
    self.eval_parameters = EvaluationParameters(calc_cpe_in_training=True)
    self.reward_network = FullyConnectedDQN(
        state_dim=self.state_dim,
        action_dim=self.num_output_nodes,
        sizes=self.sizes,
        activations=self.activations,
    )
    self.q_network_cpe = FullyConnectedDQN(
        state_dim=self.state_dim,
        action_dim=self.num_output_nodes,
        sizes=self.sizes,
        activations=self.activations,
    )
    self.q_network_cpe_target = self.q_network_cpe.get_target_network()
    self.inp = DiscreteDqnInput(
        state=FeatureData(
            float_features=torch.rand(self.batch_size, self.state_dim)
        ),
        next_state=FeatureData(
            float_features=torch.rand(self.batch_size, self.state_dim)
        ),
        reward=torch.ones(self.batch_size, 1),
        time_diff=torch.ones(self.batch_size, 1) * 2,
        step=torch.ones(self.batch_size, 1) * 2,
        not_terminal=torch.ones(self.batch_size, 1),  # TODO: check terminal behavior
        action=torch.tensor([[0, 1], [1, 0], [0, 1]]),
        next_action=torch.tensor([[1, 0], [0, 1], [1, 0]]),
        possible_actions_mask=torch.ones(self.batch_size, self.action_dim),
        possible_next_actions_mask=torch.ones(self.batch_size, self.action_dim),
        extras=ExtraData(action_probability=torch.ones(self.batch_size, 1)),
    )
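# A hedged sketch mirroring the DiscreteCRRTrainer construction from the
# build_trainer below, wired to the components prepared in the setUp above.
# _build_trainer_from_setup is an illustrative helper name, not part of the
# original test, and the constructor surface is assumed to match the builder.
def _build_trainer_from_setup(self):
    return DiscreteCRRTrainer(
        actor_network=self.actor_network,
        actor_network_target=self.actor_network_target,
        q1_network=self.q1_network,
        q1_network_target=self.q1_network_target,
        reward_network=self.reward_network,
        q2_network=self.q2_network,
        q2_network_target=self.q2_network_target,
        q_network_cpe=self.q_network_cpe,
        q_network_cpe_target=self.q_network_cpe_target,
        metrics_to_score=self.metrics_to_score,
        evaluation=self.eval_parameters,
        **self.params.asdict(),
    )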
def build_trainer(
    self,
    normalization_data_map: Dict[str, NormalizationData],
    use_gpu: bool,
    reward_options: Optional[RewardOptions] = None,
) -> DiscreteCRRTrainer:
    actor_net_builder = self.actor_net_builder.value
    actor_network = actor_net_builder.build_actor(
        normalization_data_map[NormalizationKey.STATE], len(self.action_names)
    )
    actor_network_target = actor_network.get_target_network()

    # The arguments to q_network1 and q_network2 below are modeled after
    # those in discrete_dqn.py
    critic_net_builder = self.critic_net_builder.value
    q1_network = critic_net_builder.build_q_network(
        self.state_feature_config,
        normalization_data_map[NormalizationKey.STATE],
        len(self.action_names),
    )
    q1_network_target = q1_network.get_target_network()

    q2_network = q2_network_target = None
    # pyre-fixme[16]: `CRRTrainerParameters` has no attribute
    #  `double_q_learning`.
    if self.trainer_param.double_q_learning:
        q2_network = critic_net_builder.build_q_network(
            self.state_feature_config,
            normalization_data_map[NormalizationKey.STATE],
            len(self.action_names),
        )
        q2_network_target = q2_network.get_target_network()

    reward_options = reward_options or RewardOptions()
    metrics_to_score = get_metrics_to_score(reward_options.metric_reward_values)

    reward_network, q_network_cpe, q_network_cpe_target = None, None, None
    if self.eval_parameters.calc_cpe_in_training:
        # Metrics + reward
        num_output_nodes = (len(metrics_to_score) + 1) * len(
            # pyre-fixme[16]: `CRRTrainerParameters` has no attribute `actions`.
            self.trainer_param.actions
        )

        cpe_net_builder = self.cpe_net_builder.value
        reward_network = cpe_net_builder.build_q_network(
            self.state_feature_config,
            normalization_data_map[NormalizationKey.STATE],
            num_output_nodes,
        )
        q_network_cpe = cpe_net_builder.build_q_network(
            self.state_feature_config,
            normalization_data_map[NormalizationKey.STATE],
            num_output_nodes,
        )
        q_network_cpe_target = q_network_cpe.get_target_network()

    trainer = DiscreteCRRTrainer(
        actor_network=actor_network,
        actor_network_target=actor_network_target,
        q1_network=q1_network,
        q1_network_target=q1_network_target,
        reward_network=reward_network,
        q2_network=q2_network,
        q2_network_target=q2_network_target,
        q_network_cpe=q_network_cpe,
        q_network_cpe_target=q_network_cpe_target,
        metrics_to_score=metrics_to_score,
        evaluation=self.eval_parameters,
        # pyre-fixme[16]: `CRRTrainerParameters` has no attribute `asdict`.
        **self.trainer_param.asdict(),
    )
    return trainer