class FullyConnected(ValueNetBuilder):
    __hash__ = param_hash

    sizes: List[int] = field(default_factory=lambda: [256, 128])
    activations: List[str] = field(default_factory=lambda: ["relu", "relu"])
    use_layer_norm: bool = False

    def __post_init_post_parse__(self):
        super().__init__()
        assert len(self.sizes) == len(self.activations), (
            f"Must have the same number of sizes and activations; got: "
            f"{self.sizes}, {self.activations}")

    def build_value_network(self,
                            state_normalization_data: NormalizationData,
                            output_dim: int = 1) -> torch.nn.Module:
        state_dim = get_num_output_features(
            state_normalization_data.dense_normalization_parameters)
        return FloatFeatureFullyConnected(
            state_dim=state_dim,
            output_dim=output_dim,
            sizes=self.sizes,
            activations=self.activations,
            use_layer_norm=self.use_layer_norm,
        )
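# Hedged usage sketch (not from the source): the builder above is a
# dataclass-style config, so it can be constructed with keyword overrides of
# the fields it declares. Building the actual network additionally needs a
# NormalizationData instance, which is elided here.
builder = FullyConnected(
    sizes=[64, 32],
    activations=["relu", "relu"],
    use_layer_norm=True,
)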
class DuelingQuantile(QRDQNNetBuilder):
    __hash__ = param_hash

    sizes: List[int] = field(default_factory=lambda: [256, 128])
    activations: List[str] = field(default_factory=lambda: ["relu", "relu"])

    def __post_init_post_parse__(self):
        assert len(self.sizes) == len(self.activations), (
            f"Must have the same number of sizes and activations; got: "
            f"{self.sizes}, {self.activations}")

    def build_q_network(
        self,
        state_normalization_data: NormalizationData,
        output_dim: int,
        num_atoms: int,
    ) -> ModelBase:
        state_dim = self._get_input_dim(state_normalization_data)
        return DuelingQNetwork.make_fully_connected(
            state_dim,
            output_dim,
            layers=self.sizes,
            activations=self.activations,
            num_atoms=num_atoms,
        )
def __init__(
    self,
    actor_network,
    q1_network,
    q2_network=None,
    # Start TD3TrainerParameters
    rl: RLParameters = field(default_factory=RLParameters),  # noqa: B008
    q_network_optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
    actor_network_optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
    minibatch_size: int = 64,
    noise_variance: float = 0.2,
    noise_clip: float = 0.5,
    delayed_policy_update: int = 2,
    minibatches_per_step: int = 1,
) -> None:
    """
    Args:
        actor_network: states -> actions, trained to maximize value
        q1_network: states, action -> q-value
        q2_network (optional): double q-learning to stabilize training
            from overestimation bias
        rl (optional): an instance of the RLParameters class, which
            defines relevant hyperparameters
        q_network_optimizer (optional): the optimizer class and optimizer
            hyperparameters for the q network(s)
        actor_network_optimizer (optional): see q_network_optimizer
        minibatch_size (optional): the size of the minibatch
        noise_variance (optional): the variance of action noise added to
            smooth q-value estimates
        noise_clip (optional): the maximum absolute value of action noise
            added to smooth q-value estimates
        delayed_policy_update (optional): the ratio of q network updates
            to target and policy network updates
        minibatches_per_step (optional, TODO: currently unused): the
            number of minibatch updates per training step
    """
    super().__init__()
    self.rl_parameters = rl
    self.minibatch_size = minibatch_size
    self.minibatches_per_step = minibatches_per_step or 1
    self.q1_network = q1_network
    self.q1_network_target = copy.deepcopy(self.q1_network)
    self.q_network_optimizer = q_network_optimizer
    self.q2_network = q2_network
    if self.q2_network is not None:
        self.q2_network_target = copy.deepcopy(self.q2_network)
    self.actor_network = actor_network
    self.actor_network_target = copy.deepcopy(self.actor_network)
    self.actor_network_optimizer = actor_network_optimizer
    self.noise_variance = noise_variance
    self.noise_clip_range = (-noise_clip, noise_clip)
    self.delayed_policy_update = delayed_policy_update
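# Hedged sketch, not ReAgent's actual training step: how noise_variance and
# noise_clip_range are typically consumed in TD3's target policy smoothing.
# `next_action` is a hypothetical batch of target-policy actions, and we
# assume noise_variance is a true variance (so the std is its square root).
import torch

noise_variance, noise_clip = 0.2, 0.5
next_action = torch.zeros(64, 4)  # placeholder batch of actions
noise = (torch.randn_like(next_action) * noise_variance ** 0.5).clamp(
    -noise_clip, noise_clip)
smoothed_next_action = next_action + noise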
class FullyConnected(ValueNetBuilder):
    __hash__ = param_hash

    sizes: List[int] = field(default_factory=lambda: [256, 128])
    activations: List[str] = field(default_factory=lambda: ["relu", "relu"])
    use_layer_norm: bool = False

    def __post_init_post_parse__(self):
        super().__init__()
        assert len(self.sizes) == len(self.activations), (
            f"Must have the same number of sizes and activations; got: "
            f"{self.sizes}, {self.activations}")

    # pyre-fixme[14]: `build_value_network` overrides method defined in
    #  `ValueNetBuilder` inconsistently.
    def build_value_network(
            self,
            state_normalization_data: NormalizationData) -> torch.nn.Module:
        state_dim = get_num_output_features(
            state_normalization_data.dense_normalization_parameters)
        return FullyConnectedNetwork(
            [state_dim] + self.sizes + [1],
            self.activations + ["linear"],
            use_layer_norm=self.use_layer_norm,
        )
def __init__(
    self,
    q_network,
    q_network_target,
    reward_network,
    use_gpu: bool = False,
    # Start ParametricDQNTrainerParameters
    rl: rlp.RLParameters = field(  # noqa: B008
        default_factory=rlp.RLParameters),
    double_q_learning: bool = True,
    minibatch_size: int = 1024,
    minibatches_per_step: int = 1,
    optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default),
) -> None:
    super().__init__(rl, use_gpu=use_gpu)
    self.double_q_learning = double_q_learning
    self.minibatch_size = minibatch_size
    self.minibatches_per_step = minibatches_per_step or 1
    self.q_network = q_network
    self.q_network_target = q_network_target
    self.q_network_optimizer = optimizer.make_optimizer(
        self.q_network.parameters())
    self.reward_network = reward_network
    self.reward_network_optimizer = optimizer.make_optimizer(
        self.reward_network.parameters())
class FullyConnected(ParametricDQNNetBuilder):
    __hash__ = param_hash

    sizes: List[int] = field(default_factory=lambda: [128, 64])
    activations: List[str] = field(default_factory=lambda: ["relu", "relu"])
    use_batch_norm: bool = False
    use_layer_norm: bool = False

    def __post_init_post_parse__(self):
        super().__init__()
        assert len(self.sizes) == len(self.activations), (
            f"Must have the same number of sizes and activations; got: "
            f"{self.sizes}, {self.activations}")

    def build_q_network(
        self,
        state_normalization_parameters: Dict[int, NormalizationParameters],
        action_normalization_parameters: Dict[int, NormalizationParameters],
        output_dim: int = 1,
    ) -> ModelBase:
        state_dim = get_num_output_features(state_normalization_parameters)
        action_dim = get_num_output_features(action_normalization_parameters)
        return FullyConnectedCritic(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=self.sizes,
            activations=self.activations,
            use_batch_norm=self.use_batch_norm,
            use_layer_norm=self.use_layer_norm,
            output_dim=output_dim,
        )
class FullyConnected(DiscreteActorNetBuilder):
    __hash__ = param_hash

    sizes: List[int] = field(default_factory=lambda: [128, 64])
    activations: List[str] = field(default_factory=lambda: ["relu", "relu"])
    use_batch_norm: bool = False
    use_layer_norm: bool = False
    action_activation: str = "tanh"
    exploration_variance: Optional[float] = None

    def __post_init_post_parse__(self):
        super().__init__()
        assert len(self.sizes) == len(self.activations), (
            f"Must have the same number of sizes and activations; got: "
            f"{self.sizes}, {self.activations}")

    def build_actor(
        self,
        state_normalization_data: NormalizationData,
        num_actions: int,
    ) -> ModelBase:
        state_dim = get_num_output_features(
            state_normalization_data.dense_normalization_parameters)
        return FullyConnectedActor(
            state_dim=state_dim,
            action_dim=num_actions,
            sizes=self.sizes,
            activations=self.activations,
            use_batch_norm=self.use_batch_norm,
            action_activation=self.action_activation,
            exploration_variance=self.exploration_variance,
        )
def __init__(
    self,
    seq2slate_net: Seq2SlateTransformerNet,
    minibatch_size: int = 1024,
    parameters: Seq2SlateParameters = field(  # noqa: B008
        default_factory=Seq2SlateParameters),
    baseline_net: Optional[BaselineNet] = None,
    baseline_warmup_num_batches: int = 0,
    use_gpu: bool = False,
    policy_optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default),
    baseline_optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default),
) -> None:
    self.seq2slate_net = seq2slate_net
    self.parameters = parameters
    self.use_gpu = use_gpu
    self.minibatch_size = minibatch_size
    self.minibatch = 0
    self.baseline_net = baseline_net
    self.baseline_warmup_num_batches = baseline_warmup_num_batches
    self.rl_opt = policy_optimizer.make_optimizer(
        self.seq2slate_net.parameters())
    if self.baseline_net:
        self.baseline_opt = baseline_optimizer.make_optimizer(
            # pyre-fixme[16]: `Optional` has no attribute `parameters`.
            self.baseline_net.parameters())
    assert (self.parameters.importance_sampling_clamp_max is None
            or not self.parameters.on_policy), (
        "importance_sampling_clamp_max is not useful and should "
        "be set to None in on-policy learning")
def __init__(
    self,
    q_network,
    q_network_target,
    reward_network,
    # Start ParametricDQNTrainerParameters
    rl: rlp.RLParameters = field(  # noqa: B008
        default_factory=rlp.RLParameters),
    double_q_learning: bool = True,
    minibatch_size: int = 1024,
    minibatches_per_step: int = 1,
    optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default),
) -> None:
    super().__init__()
    self.rl_parameters = rl
    self.double_q_learning = double_q_learning
    self.minibatch_size = minibatch_size
    self.minibatches_per_step = minibatches_per_step or 1
    self.q_network = q_network
    self.q_network_target = q_network_target
    self.reward_network = reward_network
    self.optimizer = optimizer
    if rl.q_network_loss == "mse":
        self.q_network_loss = F.mse_loss
    elif rl.q_network_loss == "huber":
        self.q_network_loss = F.smooth_l1_loss
    else:
        raise Exception(
            "Q-Network loss type {} is not a valid loss.".format(
                rl.q_network_loss))
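# Hedged sketch: the two selectable losses above are standard
# torch.nn.functional calls; "huber" maps to smooth_l1_loss, which grows
# linearly (not quadratically) for large TD errors and is therefore less
# sensitive to outliers.
import torch
import torch.nn.functional as F

pred = torch.tensor([0.0, 10.0])
target = torch.tensor([0.0, 0.0])
print(F.mse_loss(pred, target))        # tensor(50.)
print(F.smooth_l1_loss(pred, target))  # tensor(4.7500)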
def __init__(
    self,
    q_network,
    q_network_target,
    use_gpu: bool = False,
    # Start SlateQTrainerParameters
    rl: rlp.RLParameters = field(  # noqa: B008
        default_factory=lambda: rlp.RLParameters(maxq_learning=False)),
    optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default),
    single_selection: bool = True,
    minibatch_size: int = 1024,
    evaluation: rlp.EvaluationParameters = field(  # noqa: B008
        default_factory=lambda: rlp.EvaluationParameters(
            calc_cpe_in_training=False)),
) -> None:
    super().__init__(rl, use_gpu=use_gpu)
    self.minibatches_per_step = 1
    self.minibatch_size = minibatch_size
    self.single_selection = single_selection
    self.q_network = q_network
    self.q_network_target = q_network_target
    self.q_network_optimizer = optimizer.make_optimizer(
        self.q_network.parameters())
class SingleStepSyntheticReward(SyntheticRewardNetBuilder):
    __hash__ = param_hash

    sizes: List[int] = field(default_factory=lambda: [256, 128])
    activations: List[str] = field(default_factory=lambda: ["relu", "relu"])
    last_layer_activation: str = "sigmoid"
    use_batch_norm: bool = False
    use_layer_norm: bool = False

    def build_synthetic_reward_network(
        self,
        state_normalization_data: NormalizationData,
        action_normalization_data: Optional[NormalizationData] = None,
        discrete_action_names: Optional[List[str]] = None,
    ) -> ModelBase:
        state_dim = get_num_output_features(
            state_normalization_data.dense_normalization_parameters)
        if not discrete_action_names:
            assert action_normalization_data is not None
            action_dim = get_num_output_features(
                action_normalization_data.dense_normalization_parameters)
        else:
            action_dim = len(discrete_action_names)
        net = SingleStepSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=self.sizes,
            activations=self.activations,
            last_layer_activation=self.last_layer_activation,
            use_batch_norm=self.use_batch_norm,
            use_layer_norm=self.use_layer_norm,
        )
        return SyntheticRewardNet(net)
class Categorical(CategoricalDQNNetBuilder):
    __hash__ = param_hash

    sizes: List[int] = field(default_factory=lambda: [256, 128])
    activations: List[str] = field(default_factory=lambda: ["relu", "relu"])

    def __post_init_post_parse__(self):
        super().__init__()
        assert len(self.sizes) == len(self.activations), (
            f"Must have the same number of sizes and activations; got: "
            f"{self.sizes}, {self.activations}")

    def build_q_network(
        self,
        state_normalization_data: NormalizationData,
        output_dim: int,
        num_atoms: int,
        qmin: int,
        qmax: int,
    ) -> ModelBase:
        state_dim = self._get_input_dim(state_normalization_data)
        distributional_network = FullyConnectedDQN(
            state_dim=state_dim,
            action_dim=output_dim,
            num_atoms=num_atoms,
            sizes=self.sizes,
            activations=self.activations,
            use_batch_norm=False,
            dropout_ratio=0.0,
        )
        return CategoricalDQN(distributional_network,
                              qmin=qmin,
                              qmax=qmax,
                              num_atoms=num_atoms)
def __init__(
    self,
    q_network,
    q_network_target,
    reward_network,
    rl: rlp.RLParameters = field(  # noqa: B008
        default_factory=rlp.RLParameters),
    double_q_learning: bool = True,
    minibatch_size: int = 1024,
    minibatches_per_step: int = 1,
    optimizer: rlp.OptimizerParameters = field(  # noqa: B008
        default_factory=rlp.OptimizerParameters),
    use_gpu: bool = False,
) -> None:
    super().__init__(rl, use_gpu=use_gpu)
    self.double_q_learning = double_q_learning
    self.minibatch_size = minibatch_size
    self.minibatches_per_step = minibatches_per_step or 1
    self.q_network = q_network
    self.q_network_target = q_network_target
    self._set_optimizer(optimizer.optimizer)
    self.q_network_optimizer = self.optimizer_func(
        self.q_network.parameters(),
        lr=optimizer.learning_rate,
        weight_decay=optimizer.l2_decay,
    )
    self.reward_network = reward_network
    self.reward_network_optimizer = self.optimizer_func(
        self.reward_network.parameters(),
        lr=optimizer.learning_rate,
        weight_decay=optimizer.l2_decay,
    )
class Categorical(CategoricalDQNNetBuilder):
    __hash__ = param_hash

    sizes: List[int] = field(default_factory=lambda: [256, 128])
    activations: List[str] = field(default_factory=lambda: ["relu", "relu"])
    num_atoms: int = 51
    qmin: int = -100
    qmax: int = 200

    def __post_init_post_parse__(self):
        super().__init__()
        assert len(self.sizes) == len(self.activations), (
            f"Must have the same number of sizes and activations; got: "
            f"{self.sizes}, {self.activations}"
        )

    def build_q_network(
        self,
        state_normalization_parameters: Dict[int, NormalizationParameters],
        output_dim: int,
    ) -> ModelBase:
        state_dim = self._get_input_dim(state_normalization_parameters)
        return CategoricalDQN(
            state_dim,
            action_dim=output_dim,
            num_atoms=self.num_atoms,
            qmin=self.qmin,
            qmax=self.qmax,
            sizes=self.sizes,
            activations=self.activations,
            use_batch_norm=False,
            dropout_ratio=0.0,
            use_gpu=False,
        )
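# Hedged sketch (the standard C51 construction, not necessarily this class's
# internals): num_atoms support points evenly spaced over [qmin, qmax], on
# which the categorical head places probability mass.
import torch

num_atoms, qmin, qmax = 51, -100, 200
support = torch.linspace(qmin, qmax, num_atoms)  # 51 evenly spaced atoms
delta_z = (qmax - qmin) / (num_atoms - 1)        # atom spacing: 6.0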
def __init__(
    self,
    q_network,
    q_network_target,
    # Start SlateQTrainerParameters
    rl: rlp.RLParameters = field(  # noqa: B008
        default_factory=lambda: rlp.RLParameters(maxq_learning=False)),
    optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default),
    single_selection: bool = True,
    minibatch_size: int = 1024,
    evaluation: rlp.EvaluationParameters = field(  # noqa: B008
        default_factory=lambda: rlp.EvaluationParameters(
            calc_cpe_in_training=False)),
) -> None:
    """
    Args:
        q_network: states, action -> q-value
        rl (optional): an instance of the RLParameters class, which
            defines relevant hyperparameters
        optimizer (optional): the optimizer class and optimizer
            hyperparameters for the q network(s)
        single_selection (optional): TBD
        minibatch_size (optional): the size of the minibatch
        evaluation (optional): TBD
    """
    super().__init__()
    self.rl_parameters = rl
    self.single_selection = single_selection
    self.q_network = q_network
    self.q_network_target = q_network_target
    self.q_network_optimizer = optimizer
class DQNTrainerParameters:
    __hash__ = rlp.param_hash

    actions: List[str] = field(default_factory=list)
    rl: rlp.RLParameters = field(default_factory=rlp.RLParameters)
    double_q_learning: bool = True
    bcq: Optional[BCQConfig] = None
    minibatch_size: int = 1024
    minibatches_per_step: int = 1
    optimizer: rlp.OptimizerParameters = field(
        default_factory=rlp.OptimizerParameters)
    evaluation: rlp.EvaluationParameters = field(
        default_factory=rlp.EvaluationParameters)

    @classmethod
    def from_discrete_action_model_parameters(
            cls, params: DiscreteActionModelParameters):
        return cls(
            actions=params.actions,
            rl=params.rl,
            double_q_learning=params.rainbow.double_q_learning,
            bcq=BCQConfig(drop_threshold=params.rainbow.bcq_drop_threshold)
            if params.rainbow.bcq else None,
            minibatch_size=params.training.minibatch_size,
            minibatches_per_step=params.training.minibatches_per_step,
            optimizer=rlp.OptimizerParameters(
                optimizer=params.training.optimizer,
                learning_rate=params.training.learning_rate,
                l2_decay=params.training.l2_decay,
            ),
            evaluation=params.evaluation,
        )
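# Hedged usage sketch: constructing the parameters directly, using only the
# fields declared above; the action names are hypothetical.
params = DQNTrainerParameters(
    actions=["up", "down"],
    double_q_learning=True,
    minibatch_size=512,
)
assert params.bcq is None  # BCQ stays off unless a BCQConfig is provided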
class FullyConnected(DiscreteDQNNetBuilder):
    __hash__ = param_hash

    sizes: List[int] = field(default_factory=lambda: [256, 128])
    activations: List[str] = field(default_factory=lambda: ["relu", "relu"])
    dropout_ratio: float = 0.0
    use_batch_norm: bool = False

    def __post_init_post_parse__(self):
        super().__init__()
        assert len(self.sizes) == len(self.activations), (
            f"Must have the same number of sizes and activations; got: "
            f"{self.sizes}, {self.activations}")

    def build_q_network(
        self,
        state_feature_config: rlt.ModelFeatureConfig,
        state_normalization_data: NormalizationData,
        output_dim: int,
    ) -> ModelBase:
        state_dim = self._get_input_dim(state_normalization_data)
        return FullyConnectedDQN(
            state_dim=state_dim,
            action_dim=output_dim,
            sizes=self.sizes,
            activations=self.activations,
            dropout_ratio=self.dropout_ratio,
            use_batch_norm=self.use_batch_norm,
        )
class DirichletFullyConnected(ContinuousActorNetBuilder):
    __hash__ = param_hash

    sizes: List[int] = field(default_factory=lambda: [128, 64])
    activations: List[str] = field(default_factory=lambda: ["relu", "relu"])
    use_batch_norm: bool = False

    def __post_init_post_parse__(self):
        super().__init__()
        assert len(self.sizes) == len(self.activations), (
            f"Must have the same number of sizes and activations; got: "
            f"{self.sizes}, {self.activations}")

    @property
    def default_action_preprocessing(self) -> str:
        return DO_NOT_PREPROCESS

    def build_actor(
        self,
        state_normalization_data: NormalizationData,
        action_normalization_data: NormalizationData,
    ) -> ModelBase:
        state_dim = get_num_output_features(
            state_normalization_data.dense_normalization_parameters)
        action_dim = get_num_output_features(
            action_normalization_data.dense_normalization_parameters)
        return DirichletFullyConnectedActor(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=self.sizes,
            activations=self.activations,
            use_batch_norm=self.use_batch_norm,
        )
class Quantile(QRDQNNetBuilder):
    __hash__ = param_hash

    sizes: List[int] = field(default_factory=lambda: [256, 128])
    activations: List[str] = field(default_factory=lambda: ["relu", "relu"])
    dropout_ratio: float = 0.0

    def __post_init_post_parse__(self):
        super().__init__()
        assert len(self.sizes) == len(self.activations), (
            f"Must have the same number of sizes and activations; got: "
            f"{self.sizes}, {self.activations}")

    def build_q_network(
        self,
        state_normalization_data: NormalizationData,
        output_dim: int,
        num_atoms: int,
    ) -> ModelBase:
        state_dim = self._get_input_dim(state_normalization_data)
        return FullyConnectedDQN(
            state_dim=state_dim,
            action_dim=output_dim,
            sizes=self.sizes,
            num_atoms=num_atoms,
            activations=self.activations,
            dropout_ratio=self.dropout_ratio,
        )
class Seq2RewardModel(WorldModelBase):
    __hash__ = param_hash

    net_builder: ValueNetBuilder__Union = field(
        # pyre-fixme[28]: Unexpected keyword argument `Seq2RewardNetBuilder`.
        default_factory=lambda: ValueNetBuilder__Union(
            Seq2RewardNetBuilder=Seq2RewardNetBuilder()
        )
    )

    trainer_param: Seq2RewardTrainerParameters = field(
        default_factory=Seq2RewardTrainerParameters
    )

    def build_trainer(self) -> Seq2RewardTrainer:
        seq2reward_network = self.net_builder.value.build_value_network(
            self.state_normalization_data, self.action_normalization_data
        )
        if self.use_gpu:
            seq2reward_network = seq2reward_network.cuda()
        return Seq2RewardTrainer(
            seq2reward_network=seq2reward_network, params=self.trainer_param
        )

    def build_serving_module(self) -> torch.nn.Module:
        """
        Returns a TorchScript predictor module
        """
        raise NotImplementedError()
def __init__(
    self,
    seq2slate_net: Seq2SlateTransformerNet,
    reward_net_path: str,
    minibatch_size: int,
    parameters: Seq2SlateParameters,
    baseline_net: Optional[BaselineNet] = None,
    baseline_warmup_num_batches: int = 0,
    use_gpu: bool = False,
    policy_optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
    baseline_optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
) -> None:
    self.reward_net_path = reward_net_path
    # loaded when used
    self.reward_net = None
    self.parameters = parameters
    self.minibatch_size = minibatch_size
    self.use_gpu = use_gpu
    self.device = torch.device("cuda") if use_gpu else torch.device("cpu")
    self.permutation_index = torch.tensor(
        list(
            permutations(
                # pyre-fixme[6]: Expected `Iterable[Variable[itertools._T]]`
                #  for 1st param but got `Tensor`.
                torch.arange(seq2slate_net.max_src_seq_len),
                seq2slate_net.max_tgt_seq_len,
            )
        ),
        device=self.device,
    ).long()

    if self.parameters.simulation_distance_penalty is not None:
        # pyre-fixme[16]: `Optional` has no attribute `__gt__`.
        assert self.parameters.simulation_distance_penalty > 0
        self.permutation_distance = (
            torch.tensor(
                [swap_dist(x.tolist()) for x in self.permutation_index],
                device=self.device,
            )
            .unsqueeze(1)
            .float()
        )
        self.MAX_DISTANCE = torch.max(self.permutation_distance)

    self.trainer = Seq2SlateTrainer(
        seq2slate_net,
        minibatch_size,
        self.parameters,
        baseline_net=baseline_net,
        baseline_warmup_num_batches=baseline_warmup_num_batches,
        use_gpu=use_gpu,
        policy_optimizer=policy_optimizer,
        baseline_optimizer=baseline_optimizer,
    )
    self.seq2slate_net = self.trainer.seq2slate_net
    self.baseline_net = self.trainer.baseline_net
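# Hedged sketch of the permutation_index construction above: all orderings of
# max_src_seq_len source positions taken max_tgt_seq_len at a time. A 1-D
# tensor is iterable, so passing torch.arange works despite the pyre error.
import torch
from itertools import permutations

max_src_seq_len, max_tgt_seq_len = 3, 2
idx = torch.tensor(
    list(permutations(torch.arange(max_src_seq_len), max_tgt_seq_len))
).long()
print(idx.shape)  # torch.Size([6, 2]) -- P(3, 2) = 6 orderings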
class Seq2RewardModel(WorldModelBase):
    __hash__ = param_hash

    net_builder: ValueNetBuilder__Union = field(
        # pyre-fixme[28]: Unexpected keyword argument `Seq2RewardNetBuilder`.
        default_factory=lambda: ValueNetBuilder__Union(
            Seq2RewardNetBuilder=Seq2RewardNetBuilder()))

    compress_net_builder: ValueNetBuilder__Union = field(
        # pyre-fixme[28]: Unexpected keyword argument `FullyConnected`.
        default_factory=lambda: ValueNetBuilder__Union(
            FullyConnected=FullyConnected()))

    trainer_param: Seq2RewardTrainerParameters = field(
        default_factory=Seq2RewardTrainerParameters)

    preprocessing_options: Optional[PreprocessingOptions] = None

    def build_trainer(
        self,
        normalization_data_map: Dict[str, NormalizationData],
        use_gpu: bool,
        reward_options: Optional[RewardOptions] = None,
    ) -> Seq2RewardTrainer:
        seq2reward_network = self.net_builder.value.build_value_network(
            normalization_data_map[NormalizationKey.STATE])
        trainer = Seq2RewardTrainer(
            seq2reward_network=seq2reward_network,
            params=self.trainer_param)
        return trainer

    def get_reporter(self) -> Seq2RewardReporter:
        return Seq2RewardReporter(self.trainer_param.action_names)
def __init__(
    self,
    seq2slate_net: Seq2SlateTransformerNet,
    minibatch_size: int = 1024,
    parameters: Seq2SlateParameters = field(  # noqa: B008
        default_factory=Seq2SlateParameters),
    baseline_net: Optional[BaselineNet] = None,
    baseline_warmup_num_batches: int = 0,
    use_gpu: bool = False,
    policy_optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default),
    baseline_optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default),
    print_interval: int = 100,
) -> None:
    self.seq2slate_net = seq2slate_net
    self.parameters = parameters
    self.use_gpu = use_gpu
    self.print_interval = print_interval
    self.minibatch_size = minibatch_size
    self.minibatch = 0
    self.baseline_net = baseline_net
    self.baseline_warmup_num_batches = baseline_warmup_num_batches
    self.rl_opt = policy_optimizer.make_optimizer(
        self.seq2slate_net.parameters())
    if self.baseline_net:
        self.baseline_opt = baseline_optimizer.make_optimizer(
            # pyre-fixme[16]: `Optional` has no attribute `parameters`.
            self.baseline_net.parameters())
def __init__(
    self,
    q_network,
    q_network_target,
    metrics_to_score=None,
    reward_network=None,
    q_network_cpe=None,
    q_network_cpe_target=None,
    loss_reporter=None,
    use_gpu: bool = False,
    actions: List[str] = field(default_factory=list),  # noqa: B008
    rl: RLParameters = field(default_factory=RLParameters),  # noqa: B008
    double_q_learning: bool = True,
    num_atoms: int = 51,
    minibatch_size: int = 1024,
    minibatches_per_step: int = 1,
    optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default),
    cpe_optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default),
    evaluation: EvaluationParameters = field(  # noqa: B008
        default_factory=EvaluationParameters),
) -> None:
    super().__init__(
        rl,
        use_gpu=use_gpu,
        metrics_to_score=metrics_to_score,
        actions=actions,
        evaluation_parameters=evaluation,
        loss_reporter=loss_reporter,
    )
    self.double_q_learning = double_q_learning
    self.minibatch_size = minibatch_size
    self.minibatches_per_step = minibatches_per_step
    self._actions = actions
    self.q_network = q_network
    self.q_network_target = q_network_target
    self.q_network_optimizer = optimizer.make_optimizer(
        self.q_network.parameters())
    self.num_atoms = num_atoms
    self.quantiles = (
        (0.5 + torch.arange(self.num_atoms, device=self.device).float())
        / float(self.num_atoms)
    ).view(1, -1)
    self._initialize_cpe(reward_network,
                         q_network_cpe,
                         q_network_cpe_target,
                         optimizer=cpe_optimizer)
    self.reward_boosts = torch.zeros([1, len(self._actions)],
                                     device=self.device)
    if rl.reward_boost is not None:
        # pyre-fixme[16]: Optional type has no attribute `keys`.
        for k in rl.reward_boost.keys():
            i = self._actions.index(k)
            # pyre-fixme[16]: Optional type has no attribute `__getitem__`.
            self.reward_boosts[0, i] = rl.reward_boost[k]
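# Hedged check of the quantile construction above: (0.5 + arange(N)) / N
# yields the midpoints of N equal-probability bins, as in QR-DQN.
import torch

num_atoms = 4
quantiles = ((0.5 + torch.arange(num_atoms).float()) / num_atoms).view(1, -1)
print(quantiles)  # tensor([[0.1250, 0.3750, 0.6250, 0.8750]])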
class C51TrainerParameters:
    __hash__ = rlp.param_hash

    actions: List[str] = field(default_factory=list)
    rl: rlp.RLParameters = field(default_factory=rlp.RLParameters)
    double_q_learning: bool = True
    minibatch_size: int = 1024
    minibatches_per_step: int = 1
    num_atoms: int = 51
    qmin: float = -100
    qmax: float = 200
    optimizer: rlp.OptimizerParameters = field(
        default_factory=rlp.OptimizerParameters)
    evaluation: rlp.EvaluationParameters = field(
        default_factory=rlp.EvaluationParameters)

    @classmethod
    def from_discrete_action_model_parameters(
            cls, params: DiscreteActionModelParameters):
        return cls(
            actions=params.actions,
            rl=params.rl,
            double_q_learning=params.rainbow.double_q_learning,
            minibatch_size=params.training.minibatch_size,
            minibatches_per_step=params.training.minibatches_per_step,
            num_atoms=params.rainbow.num_atoms,
            qmin=params.rainbow.qmin,
            qmax=params.rainbow.qmax,
            optimizer=rlp.OptimizerParameters(
                optimizer=params.training.optimizer,
                learning_rate=params.training.learning_rate,
                l2_decay=params.rainbow.c51_l2_decay,
            ),
            evaluation=params.evaluation,
        )
class FullyConnectedWithEmbedding(DiscreteDQNWithIdListNetBuilder):
    __hash__ = param_hash

    sizes: List[int] = field(default_factory=lambda: [256, 128])
    activations: List[str] = field(default_factory=lambda: ["relu", "relu"])
    embedding_dim: int = 64
    dropout_ratio: float = 0.0

    def __post_init_post_parse__(self):
        super().__init__()
        assert len(self.sizes) == len(self.activations), (
            f"Must have the same number of sizes and activations; got: "
            f"{self.sizes}, {self.activations}")

    def build_q_network(
        self,
        state_feature_config: rlt.ModelFeatureConfig,
        state_normalization_parameters: Dict[int, NormalizationParameters],
        output_dim: int,
    ) -> ModelBase:
        state_dim = self._get_input_dim(state_normalization_parameters)
        return FullyConnectedDQNWithEmbedding(
            state_dim=state_dim,
            action_dim=output_dim,
            sizes=self.sizes,
            activations=self.activations,
            model_feature_config=state_feature_config,
            embedding_dim=self.embedding_dim,
            dropout_ratio=self.dropout_ratio,
        )
def __init__(
    self,
    seq2slate_net: Seq2SlateTransformerNet,
    params: Seq2SlateParameters = field(  # noqa: B008
        default_factory=Seq2SlateParameters),
    baseline_net: Optional[BaselineNet] = None,
    baseline_warmup_num_batches: int = 0,
    policy_optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default),
    baseline_optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default),
    policy_gradient_interval: int = 1,
    print_interval: int = 100,
    calc_cpe: bool = False,
    reward_network: Optional[nn.Module] = None,
) -> None:
    super().__init__(
        seq2slate_net,
        params=params,
        baseline_net=baseline_net,
        baseline_warmup_num_batches=baseline_warmup_num_batches,
        policy_optimizer=policy_optimizer,
        baseline_optimizer=baseline_optimizer,
        policy_gradient_interval=policy_gradient_interval,
        print_interval=print_interval,
        calc_cpe=calc_cpe,
        reward_network=reward_network,
    )
    self.sim_param = params.simulation
    assert self.sim_param is not None
    # loaded when used
    self.reward_name_and_net = nn.ModuleDict({})
    self.MAX_DISTANCE = (seq2slate_net.max_src_seq_len *
                         (seq2slate_net.max_src_seq_len - 1) / 2)
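# Hedged sketch of why MAX_DISTANCE is n * (n - 1) / 2: a fully reversed
# sequence has one inversion per pair of items, i.e. n-choose-2, which is the
# largest possible swap distance between two orderings of length n.
n = 5
max_distance = n * (n - 1) / 2  # 10.0 inversions for [4, 3, 2, 1, 0]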
class ParametricDQN(ParametricDQNBase):
    __hash__ = param_hash

    trainer_param: ParametricDQNTrainerParameters = field(
        default_factory=ParametricDQNTrainerParameters)
    net_builder: ParametricDQNNetBuilder__Union = field(
        # pyre-fixme[28]: Unexpected keyword argument `FullyConnected`.
        default_factory=lambda: ParametricDQNNetBuilder__Union(
            FullyConnected=FullyConnected()))

    @property
    def rl_parameters(self):
        return self.trainer_param.rl

    def build_trainer(
        self,
        normalization_data_map: Dict[str, NormalizationData],
        use_gpu: bool,
        reward_options: Optional[RewardOptions] = None,
    ) -> ParametricDQNTrainer:
        net_builder = self.net_builder.value
        # pyre-fixme[16]: `ParametricDQN` has no attribute `_q_network`.
        self._q_network = net_builder.build_q_network(
            normalization_data_map[NormalizationKey.STATE],
            normalization_data_map[NormalizationKey.ACTION],
        )
        # Metrics + reward
        reward_options = reward_options or RewardOptions()
        metrics_to_score = get_metrics_to_score(
            reward_options.metric_reward_values)
        reward_output_dim = len(metrics_to_score) + 1
        reward_network = net_builder.build_q_network(
            normalization_data_map[NormalizationKey.STATE],
            normalization_data_map[NormalizationKey.ACTION],
            output_dim=reward_output_dim,
        )
        q_network_target = self._q_network.get_target_network()
        return ParametricDQNTrainer(
            q_network=self._q_network,
            q_network_target=q_network_target,
            reward_network=reward_network,
            # pyre-fixme[16]: `ParametricDQNTrainerParameters` has no
            #  attribute `asdict`.
            **self.trainer_param.asdict(),
        )

    def build_serving_module(
        self,
        trainer_module: ReAgentLightningModule,
        normalization_data_map: Dict[str, NormalizationData],
    ) -> torch.nn.Module:
        assert isinstance(trainer_module, ParametricDQNTrainer)
        net_builder = self.net_builder.value
        return net_builder.build_serving_module(
            trainer_module.q_network,
            normalization_data_map[NormalizationKey.STATE],
            normalization_data_map[NormalizationKey.ACTION],
        )
class NGramConvNetSyntheticReward(SyntheticRewardNetBuilder):
    __hash__ = param_hash

    sizes: List[int] = field(default_factory=lambda: [256, 128])
    activations: List[str] = field(default_factory=lambda: ["relu", "relu"])
    last_layer_activation: str = "sigmoid"
    context_size: int = 3
    conv_net_params: ConvNetParameters = field(
        default_factory=lambda: ConvNetParameters(
            conv_dims=[256, 128],
            conv_height_kernels=[1, 1],
            pool_types=["max", "max"],
            pool_kernel_sizes=[1, 1],
        ))

    def build_synthetic_reward_network(
        self,
        state_normalization_data: NormalizationData,
        action_normalization_data: Optional[NormalizationData] = None,
        discrete_action_names: Optional[List[str]] = None,
    ) -> ModelBase:
        state_dim = get_num_output_features(
            state_normalization_data.dense_normalization_parameters)
        if not discrete_action_names:
            assert action_normalization_data is not None
            action_dim = get_num_output_features(
                action_normalization_data.dense_normalization_parameters)
        else:
            action_dim = len(discrete_action_names)
        conv_net = synthetic_reward.NGramConvolutionalNetwork(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=self.sizes,
            activations=self.activations,
            last_layer_activation=self.last_layer_activation,
            context_size=self.context_size,
            conv_net_params=self.conv_net_params,
        )
        return NGramSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            context_size=self.context_size,
            net=conv_net,
        )

    def build_serving_module(
        self,
        synthetic_reward_network: ModelBase,
        state_normalization_data: NormalizationData,
        action_normalization_data: Optional[NormalizationData] = None,
        discrete_action_names: Optional[List[str]] = None,
    ) -> torch.nn.Module:
        """
        Returns a TorchScript predictor module
        """
        raise NotImplementedError(
            "N-gram Synthetic Reward Predictor has not been implemented")
class DiscreteC51DQN(DiscreteDQNBase):
    __hash__ = param_hash

    trainer_param: C51TrainerParameters = field(
        default_factory=C51TrainerParameters)
    net_builder: CategoricalDQNNetBuilder__Union = field(
        default_factory=lambda: CategoricalDQNNetBuilder__Union(
            Categorical=Categorical()
        )
    )
    cpe_net_builder: CategoricalDQNNetBuilder__Union = field(
        default_factory=lambda: CategoricalDQNNetBuilder__Union(
            Categorical=Categorical()
        )
    )

    def __post_init_post_parse__(self):
        super().__post_init_post_parse__()
        self.rl_parameters = self.trainer_param.rl
        self.eval_parameters = self.trainer_param.evaluation
        self.action_names = self.trainer_param.actions
        assert len(self.action_names) > 1, (
            "DiscreteC51DQN needs at least 2 actions")
        assert (
            self.trainer_param.minibatch_size % 8 == 0
        ), "The minibatch size must be divisible by 8 for performance reasons."

    def build_trainer(self) -> C51Trainer:
        net_builder = self.net_builder.value
        q_network = net_builder.build_q_network(
            self.state_normalization_parameters, len(self.action_names)
        )
        if self.use_gpu:
            q_network = q_network.cuda()
        q_network_target = q_network.get_target_network()
        self._q_network = q_network
        return C51Trainer(
            q_network,
            q_network_target,
            self.trainer_param,
            self.use_gpu,
            metrics_to_score=self.metrics_to_score,
            loss_reporter=NoOpLossReporter(),
        )

    def build_serving_module(self) -> torch.nn.Module:
        """
        Returns a TorchScript predictor module
        """
        assert self._q_network is not None, "_q_network was not initialized"
        net_builder = self.net_builder.value
        return net_builder.build_serving_module(
            self._q_network,
            self.state_normalization_parameters,
            action_names=self.action_names,
            state_feature_config=self.state_feature_config,
        )