def __init__(self,
             policy_network=None,
             baseline=None,
             z_normalization=True,
             conf_penalty=None,
             weight=1.0,
             input_dim=Ref("exp_global.default_layer_dim"),
             output_dim=2,
             param_init=Ref("exp_global.param_init",
                            default=bare(param_initializers.GlorotInitializer)),
             bias_init=Ref("exp_global.bias_init",
                           default=bare(param_initializers.ZeroInitializer))):
  self.input_dim = input_dim
  self.policy_network = self.add_serializable_component(
      "policy_network", policy_network,
      lambda: transforms.Linear(input_dim=self.input_dim,
                                output_dim=output_dim,
                                param_init=param_init,
                                bias_init=bias_init))
  self.baseline = self.add_serializable_component(
      "baseline", baseline,
      lambda: transforms.Linear(input_dim=self.input_dim,
                                output_dim=1,
                                param_init=param_init,
                                bias_init=bias_init))
  self.confidence_penalty = self.add_serializable_component(
      "conf_penalty", conf_penalty,
      lambda: conf_penalty) if conf_penalty is not None else None
  self.weight = weight
  self.z_normalization = z_normalization
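# A minimal numpy sketch of the z-normalization that `z_normalization=True`
# enables: rewards are standardized within the batch before the policy-gradient
# update, which reduces gradient variance. Hypothetical helper for illustration,
# not this class's actual method.
import numpy as np

def z_normalize(rewards, eps=1e-8):
  # Zero-mean, unit-variance rewards; eps guards against a constant batch.
  return (rewards - rewards.mean()) / (rewards.std() + eps)

print(z_normalize(np.array([0.2, 0.9, 0.4, 0.7])))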
def __init__(self,
             input_dim: numbers.Integral = Ref("exp_global.default_layer_dim"),
             vocab_size: Optional[numbers.Integral] = None,
             vocab: Optional[vocabs.Vocab] = None,
             trg_reader: Optional[input_readers.InputReader] = Ref("model.trg_reader", default=None),
             attender=Ref("model.attender"),
             label_smoothing: numbers.Real = 0.0,
             param_init: param_initializers.ParamInitializer = Ref("exp_global.param_init",
                                                                   default=bare(param_initializers.GlorotInitializer)),
             bias_init: param_initializers.ParamInitializer = Ref("exp_global.bias_init",
                                                                  default=bare(param_initializers.ZeroInitializer)),
             output_projector: Optional[transforms.Linear] = None,
             lexicon_file=None,
             lexicon_alpha=0.001,
             lexicon_type='bias',
             coef_predictor: Optional[transforms.Linear] = None,
             src_vocab=Ref("model.src_reader.vocab", default=None)) -> None:
  self.param_col = param_collections.ParamManager.my_params(self)
  self.input_dim = input_dim
  self.output_dim = self._choose_vocab_size(vocab_size, vocab, trg_reader)
  self.label_smoothing = label_smoothing
  self.output_projector = self.add_serializable_component(
      "output_projector", output_projector,
      lambda: output_projector or transforms.Linear(input_dim=self.input_dim,
                                                    output_dim=self.output_dim,
                                                    param_init=param_init,
                                                    bias_init=bias_init))
  self.coef_predictor = self.add_serializable_component(
      "coef_predictor", coef_predictor,
      lambda: coef_predictor or transforms.Linear(input_dim=self.input_dim,
                                                  output_dim=1,
                                                  param_init=param_init,
                                                  bias_init=bias_init))
  self.lexicon_file = lexicon_file
  self.lexicon_type = lexicon_type
  self.lexicon_alpha = lexicon_alpha
  assert lexicon_type in ["bias", "linear"], "Lexicon type can be either 'bias' or 'linear' only!"
  # References to other parts of the model
  self.src_vocab = src_vocab
  self.trg_vocab = vocab if vocab is not None else trg_reader.vocab
  self.attender = attender
  # Sparse data structure to store the external lexicon probabilities
  self.lexicon = None
  # State of the softmax
  self.lexicon_prob = None
  self.coeff = None
  self.dict_prob = None
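# A minimal numpy sketch of the two lexicon integration modes the assert above
# allows, in the spirit of Arthur et al. (2016): "bias" adds the smoothed log
# lexicon probability to the logits, "linear" interpolates the two distributions
# with a predicted coefficient. Names and shapes are illustrative assumptions,
# not this class's actual API.
import numpy as np

def softmax(x):
  e = np.exp(x - x.max())
  return e / e.sum()

def apply_lexicon(logits, lex_prob, alpha, mode, coeff=0.5):
  if mode == "bias":
    # lexicon_alpha smooths zero lexicon entries before the log.
    return logits + np.log(lex_prob + alpha)
  else:  # "linear"
    return np.log(coeff * lex_prob + (1.0 - coeff) * softmax(logits))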
def __init__(self,
             input_dim: int = Ref("exp_global.default_layer_dim"),
             hidden_dim: int = Ref("exp_global.default_layer_dim"),
             downsample_by: int = 1,
             param_init=Ref("exp_global.param_init",
                            default=bare(param_initializers.GlorotInitializer)),
             projection=None,
             batch_norm=None,
             nonlinearity=None):
  self.projection = self.add_serializable_component(
      "projection", projection,
      lambda: base.TransformSeqTransducer(
          modelparts_transforms.Linear(input_dim=input_dim * downsample_by,
                                       output_dim=hidden_dim,
                                       bias=False,
                                       param_init=param_init),
          downsample_by=downsample_by))
  self.batch_norm = self.add_serializable_component(
      "batch_norm", batch_norm,
      lambda: norms.BatchNorm(hidden_dim=hidden_dim, num_dim=2))
  self.nonlinearity = self.add_serializable_component(
      "nonlinearity", nonlinearity,
      lambda: base.TransformSeqTransducer(modelparts_transforms.Cwise("rectify")))
  self.modules = [self.projection, self.batch_norm, self.nonlinearity]
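# A minimal numpy sketch of why the projection above takes
# input_dim * downsample_by inputs: downsampling concatenates that many
# consecutive timesteps along the feature axis. Shapes are illustrative.
import numpy as np

seq_len, input_dim, downsample_by = 6, 4, 2
states = np.random.randn(seq_len, input_dim)
# Row-major reshape concatenates each pair of adjacent timesteps.
folded = states.reshape(seq_len // downsample_by, input_dim * downsample_by)
print(folded.shape)  # (3, 8) -> fed to the Linear projection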
def __init__(self,
             input_dim: numbers.Integral = Ref("exp_global.default_layer_dim"),
             vocab_size: Optional[numbers.Integral] = None,
             vocab: Optional[vocabs.Vocab] = None,
             trg_reader: Optional[input_readers.InputReader] = Ref("model.trg_reader", default=None),
             label_smoothing: numbers.Real = 0.0,
             param_init: param_initializers.ParamInitializer = Ref("exp_global.param_init",
                                                                   default=bare(param_initializers.GlorotInitializer)),
             bias_init: param_initializers.ParamInitializer = Ref("exp_global.bias_init",
                                                                  default=bare(param_initializers.ZeroInitializer)),
             output_projector: Optional[transforms.Linear] = None) -> None:
  self.param_col = param_collections.ParamManager.my_params(self)
  self.input_dim = input_dim
  self.output_dim = self._choose_vocab_size(vocab_size, vocab, trg_reader)
  self.label_smoothing = label_smoothing
  self.output_projector = self.add_serializable_component(
      "output_projector", output_projector,
      lambda: output_projector or transforms.Linear(input_dim=self.input_dim,
                                                    output_dim=self.output_dim,
                                                    param_init=param_init,
                                                    bias_init=bias_init))
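# A minimal numpy sketch of label-smoothed negative log-likelihood, the role of
# `label_smoothing` above: (1 - eps) mass on the gold label and eps spread
# uniformly over the vocabulary. A standard formulation, not necessarily this
# class's exact implementation.
import numpy as np

def smoothed_nll(log_probs, gold, eps):
  # Equivalent to -sum(target * log_probs) for the smoothed target distribution.
  return -((1.0 - eps) * log_probs[gold] + eps * log_probs.mean())

log_probs = np.log(np.array([0.7, 0.2, 0.1]))
print(smoothed_nll(log_probs, gold=0, eps=0.1))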
def __init__(self,
             transducer: transducers.SeqTransducer,
             input_dim: int = Ref("exp_global.default_layer_dim"),
             softmax_dim: int = Ref("exp_global.default_layer_dim"),
             layer_dim: int = Ref("exp_global.default_layer_dim"),
             linear_layer: Optional[transforms.Linear] = None,
             vocab: Optional[vocabs.Vocab] = None,
             scale: float = 1.0,
             mode: str = "entropy",
             param_init: param_initializers.ParamInitializer = Ref("exp_global.param_init",
                                                                   default=bare(param_initializers.GlorotInitializer)),
             bias_init: param_initializers.ParamInitializer = Ref("exp_global.bias_init",
                                                                  default=bare(param_initializers.ZeroInitializer))):
  self.transducer = transducer
  self.input_dim = input_dim
  if vocab:
    softmax_dim = len(vocab)
  self.softmax_dim = softmax_dim
  self.layer_dim = layer_dim
  self.scale = scale
  self.mode = mode
  self.linear_layer = self.add_serializable_component(
      "linear_layer", linear_layer,
      lambda: transforms.Linear(input_dim=self.softmax_dim,
                                output_dim=self.layer_dim,
                                bias=False,
                                param_init=param_init,
                                bias_init=bias_init))
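# A minimal numpy sketch of what mode="entropy" with `scale` suggests: the
# entropy of a softmax over softmax_dim classes, multiplied by the scale
# factor. Assumed semantics, for illustration only.
import numpy as np

def scaled_entropy(logits, scale=1.0):
  e = np.exp(logits - logits.max())
  p = e / e.sum()
  return -scale * np.sum(p * np.log(p + 1e-12))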
def __init__(self,
             input_dim: int,
             hidden_dim: int,
             nonlinearity: str = "rectify",
             linear_transforms: typing.Optional[typing.Sequence[transforms.Linear]] = None,
             layer_norm: typing.Optional[norms.LayerNorm] = None) -> None:
  w_12 = self.add_serializable_component(
      "linear_transforms", linear_transforms,
      lambda: [transforms.Linear(input_dim, hidden_dim),
               transforms.Linear(hidden_dim, input_dim)])
  self.w_1 = w_12[0]
  self.w_2 = w_12[1]
  self.layer_norm = self.add_serializable_component(
      "layer_norm", layer_norm, lambda: norms.LayerNorm(input_dim))
  self.nonlinearity = getattr(dy, nonlinearity)
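# A minimal numpy sketch of the standard position-wise feed-forward block these
# parameters describe (two linear maps around a nonlinearity, layer norm, and a
# residual connection). The pre-norm arrangement below is one common choice,
# not necessarily this class's exact ordering.
import numpy as np

def relu(x):
  return np.maximum(x, 0.0)

def layer_norm(x, eps=1e-6):
  return (x - x.mean()) / (x.std() + eps)

def feed_forward(x, W1, b1, W2, b2):
  # w_2(nonlinearity(w_1(norm(x)))) plus residual, mirroring w_1/w_2 above.
  return x + W2 @ relu(W1 @ layer_norm(x) + b1) + b2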
def __init__(self,
             baseline: Optional[Serializable] = None,
             evaluation_metric: metrics.SentenceLevelEvaluator = bare(metrics.FastBLEUEvaluator),
             search_strategy: search_strategies.SearchStrategy = bare(search_strategies.SamplingSearch),
             inv_eval: bool = True,
             decoder_hidden_dim: numbers.Integral = Ref("exp_global.default_layer_dim")) -> None:
  self.inv_eval = inv_eval
  self.search_strategy = search_strategy
  self.evaluation_metric = evaluation_metric
  self.baseline = self.add_serializable_component(
      "baseline", baseline,
      lambda: transforms.Linear(input_dim=decoder_hidden_dim, output_dim=1))
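# A minimal sketch of REINFORCE with the learned baseline above: the
# sentence-level metric is turned into a cost (negated when inv_eval is set,
# e.g. for BLEU, where higher is better), the baseline is regressed toward the
# cost, and the advantage weights the sample's log-probability. Illustrative
# names, not this class's actual loss code.
def reinforce_loss(log_prob_sum, eval_score, baseline_pred, inv_eval=True):
  cost = -eval_score if inv_eval else eval_score
  advantage = cost - baseline_pred
  policy_loss = advantage * log_prob_sum        # REINFORCE term
  baseline_loss = (baseline_pred - cost) ** 2   # MSE regression for the baseline
  return policy_loss + baseline_loss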
def __init__(self,
             src_reader: input_readers.InputReader,
             trg_reader: input_readers.InputReader,
             src_embedder: embedders.Embedder = bare(embedders.SimpleWordEmbedder),
             encoder: transducers.SeqTransducer = bare(recurrent.BiLSTMSeqTransducer),
             inference=bare(inferences.IndependentOutputInference),
             hidden_dim: int = Ref("exp_global.default_layer_dim"),
             output_layer: Optional[transforms.Linear] = None,
             generate_per_step: bool = False,
             mode: str = "avg_mlp"):
  super().__init__(src_reader=src_reader, trg_reader=trg_reader)
  self.src_embedder = src_embedder
  self.encoder = encoder
  self.output_layer = self.add_serializable_component(
      "output_layer", output_layer,
      lambda: transforms.Linear(input_dim=hidden_dim,
                                output_dim=len(trg_reader.vocab)))
  self.inference = inference
  self.mode = mode
  self.generate_per_step = generate_per_step
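# A minimal numpy sketch of what mode="avg_mlp" suggests: average the encoder
# states over time, then score the target vocabulary with the output layer.
# Assumed semantics and shapes, for illustration only.
import numpy as np

hidden_dim, vocab_size, seq_len = 512, 32000, 10
encodings = np.random.randn(seq_len, hidden_dim)
W_out = np.random.randn(vocab_size, hidden_dim)
logits = W_out @ encodings.mean(axis=0)  # one prediction for the whole sequence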
def __init__(self,
             dec_layers: int = 1,
             enc_dim: int = Ref("exp_global.default_layer_dim"),
             dec_dim: int = Ref("exp_global.default_layer_dim"),
             param_init: param_initializers.ParamInitializer = Ref("exp_global.param_init",
                                                                   default=bare(param_initializers.GlorotInitializer)),
             bias_init: param_initializers.ParamInitializer = Ref("exp_global.bias_init",
                                                                  default=bare(param_initializers.ZeroInitializer)),
             projector=None):
  self.dec_layers = dec_layers
  self.enc_dim = enc_dim
  self.dec_dim = dec_dim
  self.projector = self.add_serializable_component(
      "projector", projector,
      lambda: transforms.Linear(input_dim=self.enc_dim,
                                output_dim=self.dec_dim,
                                param_init=param_init,
                                bias_init=bias_init))
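# A minimal numpy sketch of the projector's job in this bridge: map the final
# encoder state (enc_dim) to an initial state for each of the dec_layers
# decoder layers (dec_dim). Illustrative shapes only.
import numpy as np

enc_dim, dec_dim, dec_layers = 512, 256, 2
W = np.random.randn(dec_dim, enc_dim)
final_enc_state = np.random.randn(enc_dim)
init_states = [W @ final_enc_state for _ in range(dec_layers)]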
def __init__(self,
             input_dim=Ref("exp_global.default_layer_dim"),
             softmax_dim=Ref("exp_global.default_layer_dim"),
             output_dim=Ref("exp_global.default_layer_dim"),
             dropout=Ref("exp_global.dropout", default=0.0),
             residual=False,
             linear_layer=None,
             vocab=None,
             gumbel=False,
             param_init=Ref("exp_global.param_init",
                            default=bare(param_initializers.GlorotInitializer)),
             bias_init=Ref("exp_global.bias_init",
                           default=bare(param_initializers.ZeroInitializer))):
  param_col = param_collections.ParamManager.my_params(self)
  self.input_dim = input_dim
  if vocab:
    softmax_dim = len(vocab)
  self.softmax_dim = softmax_dim
  self.output_dim = output_dim
  self.dropout_rate = dropout
  self.residual = residual
  self.gumbel = gumbel
  if self.residual:
    assert self.input_dim == self.output_dim
  self.linear_layer = self.add_serializable_component(
      "linear_layer", linear_layer,
      lambda: transforms.Linear(input_dim=self.softmax_dim,
                                output_dim=self.output_dim,
                                bias=False,
                                param_init=param_init,
                                bias_init=bias_init))
  # self.p_W = param_col.add_parameters(dim=(softmax_dim, input_dim),
  #                                     init=param_init.initializer((softmax_dim, input_dim)))
  # self.p_b = param_col.add_parameters(dim=(softmax_dim),
  #                                     init=bias_init.initializer((softmax_dim,)))
  self.p_E = param_col.add_parameters(dim=(output_dim, softmax_dim),
                                      init=param_init.initializer((output_dim, softmax_dim)))
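# A minimal numpy sketch of the `gumbel` option: perturb the logits with Gumbel
# noise, take a temperature softmax, and use the resulting distribution as a
# soft, differentiable lookup into the embedding matrix p_E. Standard
# Gumbel-softmax math, not the class's exact code.
import numpy as np

def gumbel_softmax(logits, tau=1.0):
  g = -np.log(-np.log(np.random.uniform(size=logits.shape)))  # Gumbel(0, 1)
  y = (logits + g) / tau
  e = np.exp(y - y.max())
  return e / e.sum()

softmax_dim, output_dim = 100, 16
p_E = np.random.randn(output_dim, softmax_dim)
soft_emb = p_E @ gumbel_softmax(np.random.randn(softmax_dim))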
def __init__(self,
             head_count: int,
             model_dim: int,
             downsample_factor: int = 1,
             input_dim: typing.Optional[int] = None,
             ignore_masks: bool = False,
             plot_attention: typing.Optional[str] = None,
             diag_gauss_mask: typing.Union[bool, numbers.Real] = False,
             square_mask_std: bool = True,
             cross_pos_encoding_type: typing.Optional[str] = None,
             kq_pos_encoding_type: typing.Optional[str] = None,
             kq_pos_encoding_size: int = 40,
             max_len: int = 1500,
             param_init: xnmt.param_initializers.ParamInitializer = xnmt.param_initializers.GlorotInitializer(),
             bias_init: xnmt.param_initializers.ParamInitializer = xnmt.param_initializers.ZeroInitializer(),
             linear_kvq=None,
             kq_positional_embedder=None,
             layer_norm=None,
             res_shortcut=None,
             desc: typing.Any = None) -> None:
  if input_dim is None:
    input_dim = model_dim
  self.input_dim = input_dim
  assert model_dim % head_count == 0
  self.dim_per_head = model_dim // head_count
  self.model_dim = model_dim
  self.head_count = head_count
  assert downsample_factor >= 1
  self.downsample_factor = downsample_factor
  self.plot_attention = plot_attention
  self.plot_attention_counter = 0
  self.desc = desc
  self.ignore_masks = ignore_masks
  self.diag_gauss_mask = diag_gauss_mask
  self.square_mask_std = square_mask_std
  self.kq_pos_encoding_type = kq_pos_encoding_type
  self.kq_pos_encoding_size = kq_pos_encoding_size
  self.max_len = max_len
  subcol = param_collections.ParamManager.my_params(self)
  if self.kq_pos_encoding_type is None:
    self.linear_kvq = self.add_serializable_component(
        "linear_kvq", linear_kvq,
        lambda: transforms.Linear(input_dim * downsample_factor,
                                  head_count * self.dim_per_head * 3,
                                  param_init=param_init,
                                  bias_init=bias_init))
  else:
    self.linear_kq, self.linear_v = self.add_serializable_component(
        "linear_kvq", linear_kvq,
        lambda: [transforms.Linear(input_dim * downsample_factor + self.kq_pos_encoding_size,
                                   head_count * self.dim_per_head * 2,
                                   param_init=param_init,
                                   bias_init=bias_init),
                 transforms.Linear(input_dim * downsample_factor,
                                   head_count * self.dim_per_head,
                                   param_init=param_init,
                                   bias_init=bias_init)])
    assert self.kq_pos_encoding_type == "embedding"
    self.kq_positional_embedder = self.add_serializable_component(
        "kq_positional_embedder", kq_positional_embedder,
        lambda: embedders.PositionEmbedder(max_pos=self.max_len,
                                           emb_dim=self.kq_pos_encoding_size,
                                           param_init=param_init))
  if self.diag_gauss_mask:
    if self.diag_gauss_mask == "rand":
      rand_init = np.exp(np.random.random(size=(self.head_count,)) * math.log(1000))
      self.diag_gauss_mask_sigma = subcol.add_parameters(
          dim=(1, 1, self.head_count),
          init=dy.NumpyInitializer(rand_init))
    else:
      self.diag_gauss_mask_sigma = subcol.add_parameters(
          dim=(1, 1, self.head_count),
          init=dy.ConstInitializer(self.diag_gauss_mask))
  self.layer_norm = self.add_serializable_component(
      "layer_norm", layer_norm, lambda: norms.LayerNorm(model_dim))
  if model_dim != input_dim * downsample_factor:
    self.res_shortcut = self.add_serializable_component(
        "res_shortcut", res_shortcut,
        lambda: transforms.Linear(input_dim * downsample_factor,
                                  model_dim,
                                  param_init=param_init,
                                  bias_init=bias_init))
  self.cross_pos_encoding_type = cross_pos_encoding_type
  if cross_pos_encoding_type == "embedding":
    self.cross_pos_emb_p1 = subcol.add_parameters(
        dim=(self.max_len, self.dim_per_head, self.head_count),
        init=dy.NormalInitializer(mean=1.0, var=0.001))
    self.cross_pos_emb_p2 = subcol.add_parameters(
        dim=(self.max_len, self.dim_per_head, self.head_count),
        init=dy.NormalInitializer(mean=1.0, var=0.001))
  elif cross_pos_encoding_type is not None:
    raise NotImplementedError()
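# A minimal numpy sketch of the diagonal Gaussian attention bias that
# `diag_gauss_mask` suggests: attention logits are penalized by the squared
# distance from the diagonal, scaled per head by a (learnable) sigma. Assumed
# formulation for illustration; the actual code also handles square_mask_std
# and batching.
import numpy as np

q_len, k_len, sigma = 5, 5, 2.0
i = np.arange(q_len)[:, None]
j = np.arange(k_len)[None, :]
bias = -((i - j) ** 2) / (2.0 * sigma ** 2)  # added to logits before the softmax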
def __init__(self,
             trg_embedder: embedders.DenseWordEmbedder,
             src_reader: input_readers.InputReader = None,
             trg_reader: input_readers.InputReader = None,
             src_embedder=bare(embedders.SimpleWordEmbedder),
             encoder=bare(recurrent.BiLSTMSeqTransducer),
             attender=bare(attenders.MlpAttender),
             dec_lstm=bare(recurrent.UniLSTMSeqTransducer),
             bridge: bridges.Bridge = bare(bridges.CopyBridge),
             transform: transforms.Transform = bare(transforms.AuxNonLinear),
             scorer: scorers.Scorer = bare(scorers.Softmax),
             inference=bare(inferences.IndependentOutputInference),
             max_dec_len: int = 350,
             mode: Optional[str] = None,
             mode_translate: Optional[str] = None,
             mode_transduce: Optional[str] = None,
             unfold_until: str = "eos",
             transducer_loss: bool = False,
             split_regularizer: Union[bool, numbers.Real] = False,
             split_dual: Union[bool, Sequence[numbers.Real]] = False,
             dropout_dec_state: float = 0.0,
             split_dual_proj: Optional[transforms.Linear] = None,
             split_context_transform: Optional[transforms.Transform] = None,
             sampling_prob: numbers.Number = 0.0,
             compute_report: bool = Ref("exp_global.compute_report", default=False)):
  super().__init__(src_reader=src_reader, trg_reader=trg_reader)
  assert mode is None or (mode_translate is None and mode_transduce is None), \
    f"illegal combination: mode={mode}, mode_translate={mode_translate}, mode_transduce={mode_transduce}"
  assert mode or mode_translate or mode_transduce
  if mode_translate or mode_transduce:
    assert mode_translate and mode_transduce
    assert mode_translate != "split"
  self.src_embedder = src_embedder
  self.trg_embedder = trg_embedder
  self.encoder = encoder
  self.attender = attender
  self.dec_lstm = dec_lstm
  self.bridge = bridge
  self.transform = transform
  self.scorer = scorer
  self.inference = inference
  self.max_dec_len = max_dec_len
  self.mode_translate = mode_translate or mode
  self.mode_transduce = mode_transduce or mode
  if transducer_loss:
    assert self.mode_transduce in ["teacher", "split"], \
      f"mode_transduce='{self.mode_transduce}' not supported with transducer_loss option"
  self.unfold_until = unfold_until
  self.transducer_loss = transducer_loss
  if split_regularizer:
    assert self.mode_transduce == "split"
  self.split_regularizer = split_regularizer
  self.dropout_dec_state = dropout_dec_state
  self.split_dual = [0.0, 0.0] if split_dual is True else split_dual
  self.split_context_transform = split_context_transform
  if self.split_dual:
    assert len(self.split_dual) == 2 and max(self.split_dual) <= 1.0 and min(self.split_dual) >= 0.0
    self.split_dual_proj = self.add_serializable_component(
        "split_dual_proj", split_dual_proj,
        lambda: transforms.Linear(input_dim=self.dec_lstm.input_dim * 2,
                                  output_dim=self.dec_lstm.input_dim))
  self.sampling_prob = sampling_prob
  self.compute_report = compute_report
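# A minimal numpy sketch of what split_dual_proj's shapes imply: two decoder
# input representations from the dual "split" paths are concatenated and
# projected back down to the decoder LSTM's input size. Assumed usage,
# illustrative shapes only.
import numpy as np

d = 512
W_proj = np.random.randn(d, 2 * d)
path_a, path_b = np.random.randn(d), np.random.randn(d)
dec_input = W_proj @ np.concatenate([path_a, path_b])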