def __init__(self, args, save_path=None):

    super(LMBase, self).__init__()
    logger.info(self.__class__.__name__)

    self.save_path = save_path

    self.d_model = args.transformer_d_model
    self.n_layers = args.n_layers
    self.n_heads = args.transformer_n_heads
    self.lsm_prob = args.lsm_prob

    self.vocab = args.vocab
    self.eos = 2
    self.pad = 3
    # NOTE: reserved in advance

    # for cache
    self.cache_theta = 0.2  # smoothing parameter
    self.cache_lambda = 0.2  # cache weight
    self.cache_ids = []
    self.cache_keys = []
    self.cache_attn = []

    self.embed = nn.Embedding(self.vocab, self.d_model, padding_idx=self.pad)
    self.pos_enc = PositionalEncoding(self.d_model, args.dropout_in, args.transformer_pe_type)
    self.layers = repeat(TransformerDecoderBlock(
        self.d_model, args.transformer_d_ff,
        args.transformer_attn_type, self.n_heads,
        args.dropout_hidden, args.dropout_att,
        args.transformer_layer_norm_eps, args.transformer_ffn_activation,
        src_tgt_attention=False), self.n_layers)
    self.norm_out = nn.LayerNorm(self.d_model, eps=args.transformer_layer_norm_eps)

    if args.adaptive_softmax:
        self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
            self.d_model, self.vocab,
            cutoffs=[round(self.vocab / 15), 3 * round(self.vocab / 15)],
            # cutoffs=[self.vocab // 25, 3 * self.vocab // 5],
            div_value=4.0)
        self.output = None
    else:
        self.adaptive_softmax = None
        self.output = nn.Linear(self.d_model, self.vocab)

        if args.tie_embedding:
            self.output.weight = self.embed.weight

    self.reset_parameters()
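# A minimal, self-contained sketch of the adaptive softmax path configured above,
# assuming plain PyTorch. The hidden size, vocabulary size, and dummy tensors below
# are illustrative values, not taken from the repository.
import torch
import torch.nn as nn

d_model, vocab = 256, 10000
adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
    d_model, vocab,
    cutoffs=[round(vocab / 15), 3 * round(vocab / 15)],  # same cutoff scheme as above
    div_value=4.0)

hidden = torch.randn(8, d_model)          # flattened (batch * time, d_model) features
targets = torch.randint(0, vocab, (8,))   # target token ids
out = adaptive_softmax(hidden, targets)
print(out.output.shape, out.loss.item())  # per-token target log-probs and mean NLL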
def __init__(self, special_symbols, enc_n_units, attn_type, n_heads, n_layers,
             d_model, d_ff, pe_type, layer_norm_eps, ffn_activation,
             vocab, tie_embedding,
             dropout, dropout_emb, dropout_att,
             lsm_prob, ctc_weight, ctc_lsm_prob, ctc_fc_list,
             backward, global_weight, mtl_per_batch, param_init):

    super(TransformerDecoder, self).__init__()

    self.eos = special_symbols['eos']
    self.unk = special_symbols['unk']
    self.pad = special_symbols['pad']
    self.blank = special_symbols['blank']
    self.vocab = vocab
    self.enc_n_units = enc_n_units
    self.d_model = d_model
    self.n_layers = n_layers
    self.n_heads = n_heads
    self.pe_type = pe_type
    self.lsm_prob = lsm_prob
    self.ctc_weight = ctc_weight
    self.bwd = backward
    self.global_weight = global_weight
    self.mtl_per_batch = mtl_per_batch

    self.prev_spk = ''
    self.lmstate_final = None

    if ctc_weight > 0:
        self.ctc = CTC(eos=self.eos, blank=self.blank,
                       enc_n_units=enc_n_units, vocab=vocab,
                       dropout=dropout, lsm_prob=ctc_lsm_prob,
                       fc_list=ctc_fc_list, param_init=0.1)

    if ctc_weight < global_weight:
        self.embed = nn.Embedding(vocab, d_model, padding_idx=self.pad)
        self.pos_enc = PositionalEncoding(d_model, dropout_emb, pe_type)
        self.layers = repeat(TransformerDecoderBlock(
            d_model, d_ff, attn_type, n_heads,
            dropout, dropout_att,
            layer_norm_eps, ffn_activation, param_init), n_layers)
        self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.output = nn.Linear(d_model, vocab)
        if tie_embedding:
            self.output.weight = self.embed.weight

    if param_init == 'xavier_uniform':
        self.reset_parameters()
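# The repeat(...) helper used in the two constructors above is assumed to clone a
# configured block into an nn.ModuleList, one independent copy per layer. This is a
# plausible sketch of such a helper, not necessarily the repository's exact
# implementation.
import copy
import torch.nn as nn

def repeat(module, n_layers):
    """Return n_layers independent deep copies of `module` as an nn.ModuleList."""
    return nn.ModuleList([copy.deepcopy(module) for _ in range(n_layers)])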
def __init__(self, special_symbols, enc_n_units, attn_type, n_heads, n_layers,
             d_model, d_ff, ffn_bottleneck_dim, pe_type, layer_norm_eps, ffn_activation,
             vocab, tie_embedding,
             dropout, dropout_emb, dropout_att, dropout_layer, dropout_head,
             lsm_prob, ctc_weight, ctc_lsm_prob, ctc_fc_list,
             backward, global_weight, mtl_per_batch, param_init,
             mma_chunk_size, mma_n_heads_mono, mma_n_heads_chunk,
             mma_init_r, mma_eps, mma_std, mma_no_denominator, mma_1dconv,
             mma_quantity_loss_weight, mma_headdiv_loss_weight,
             latency_metric, latency_loss_weight,
             mma_first_layer, share_chunkwise_attention,
             external_lm, lm_fusion):

    super(TransformerDecoder, self).__init__()

    self.eos = special_symbols['eos']
    self.unk = special_symbols['unk']
    self.pad = special_symbols['pad']
    self.blank = special_symbols['blank']
    self.vocab = vocab
    self.enc_n_units = enc_n_units
    self.d_model = d_model
    self.n_layers = n_layers
    self.n_heads = n_heads
    self.pe_type = pe_type
    self.lsm_prob = lsm_prob
    self.att_weight = global_weight - ctc_weight
    self.ctc_weight = ctc_weight
    self.bwd = backward
    self.mtl_per_batch = mtl_per_batch

    # for cache
    self.prev_spk = ''
    self.lmstate_final = None
    self.embed_cache = None

    # for attention plot
    self.aws_dict = {}
    self.data_dict = {}

    # for MMA
    self.attn_type = attn_type
    self.quantity_loss_weight = mma_quantity_loss_weight
    self._quantity_loss_weight = mma_quantity_loss_weight  # for curriculum
    self.mma_first_layer = max(1, mma_first_layer)
    self.headdiv_loss_weight = mma_headdiv_loss_weight

    self.latency_metric = latency_metric
    self.latency_loss_weight = latency_loss_weight
    self.ctc_trigger = (self.latency_metric in ['ctc_sync'])
    if self.ctc_trigger:
        assert 0 < self.ctc_weight < 1

    if ctc_weight > 0:
        self.ctc = CTC(eos=self.eos, blank=self.blank,
                       enc_n_units=enc_n_units, vocab=vocab,
                       dropout=dropout, lsm_prob=ctc_lsm_prob,
                       fc_list=ctc_fc_list, param_init=0.1,
                       backward=backward)

    if self.att_weight > 0:
        # token embedding
        self.embed = nn.Embedding(self.vocab, d_model, padding_idx=self.pad)
        self.pos_enc = PositionalEncoding(d_model, dropout_emb, pe_type, param_init)
        # decoder
        self.layers = nn.ModuleList([copy.deepcopy(TransformerDecoderBlock(
            d_model, d_ff, attn_type, n_heads,
            dropout, dropout_att, dropout_layer,
            layer_norm_eps, ffn_activation, param_init,
            src_tgt_attention=False if lth < mma_first_layer - 1 else True,
            mma_chunk_size=mma_chunk_size,
            mma_n_heads_mono=mma_n_heads_mono,
            mma_n_heads_chunk=mma_n_heads_chunk,
            mma_init_r=mma_init_r,
            mma_eps=mma_eps,
            mma_std=mma_std,
            mma_no_denominator=mma_no_denominator,
            mma_1dconv=mma_1dconv,
            dropout_head=dropout_head,
            lm_fusion=lm_fusion,
            ffn_bottleneck_dim=ffn_bottleneck_dim,
            share_chunkwise_attention=share_chunkwise_attention))
            for lth in range(n_layers)])
        self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)

        self.output = nn.Linear(d_model, self.vocab)
        if tie_embedding:
            self.output.weight = self.embed.weight

        self.lm = external_lm
        if external_lm is not None:
            self.lm_output_proj = nn.Linear(external_lm.output_dim, d_model)

    self.reset_parameters(param_init)
def __init__(self, args, save_path=None):

    super(LMBase, self).__init__()
    logger.info(self.__class__.__name__)

    self.lm_type = args.lm_type
    self.save_path = save_path

    self.d_model = args.transformer_d_model
    self.n_layers = args.n_layers
    self.n_heads = args.transformer_n_heads
    self.lsm_prob = args.lsm_prob

    if args.mem_len > 0:
        self.mem_len = args.mem_len
    else:
        self.mem_len = args.bptt
    if args.recog_mem_len > 0:
        self.mem_len = args.recog_mem_len

    self.vocab = args.vocab
    self.eos = 2
    self.pad = 3
    # NOTE: reserved in advance

    # for cache
    self.cache_theta = 0.2  # smoothing parameter
    self.cache_lambda = 0.2  # cache weight
    self.cache_ids = []
    self.cache_keys = []
    self.cache_attn = []
    self.embed_cache = None

    # positional embedding
    self.pos_emb = XLPositionalEmbedding(self.d_model, args.dropout_in)
    self.u_bias = nn.Parameter(torch.Tensor(self.n_heads, self.d_model // self.n_heads))
    self.v_bias = nn.Parameter(torch.Tensor(self.n_heads, self.d_model // self.n_heads))
    # NOTE: u_bias and v_bias are global parameters

    self.embed = nn.Embedding(self.vocab, self.d_model, padding_idx=self.pad)
    self.scale = math.sqrt(self.d_model)  # for token embedding
    self.dropout_emb = nn.Dropout(p=args.dropout_in)  # for token embedding

    self.layers = nn.ModuleList([copy.deepcopy(TransformerDecoderBlock(
        self.d_model, args.transformer_d_ff, 'scaled_dot', self.n_heads,
        args.dropout_hidden, args.dropout_att, args.dropout_layer,
        args.transformer_layer_norm_eps, args.transformer_ffn_activation,
        args.transformer_param_init,
        src_tgt_attention=False, memory_transformer=True))
        for lth in range(self.n_layers)])

    self.norm_out = nn.LayerNorm(self.d_model, eps=args.transformer_layer_norm_eps)

    self.adaptive_softmax = None
    self.output = None
    if args.adaptive_softmax:
        self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
            self.d_model, self.vocab,
            cutoffs=[round(self.vocab / 15), 3 * round(self.vocab / 15)],
            # cutoffs=[self.vocab // 25, 3 * self.vocab // 5],
            div_value=4.0)
    else:
        self.output = nn.Linear(self.d_model, self.vocab)
        if args.tie_embedding:
            self.output.weight = self.embed.weight

    self.reset_parameters()
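# The u_bias / v_bias parameters above are declared with torch.Tensor and left
# uninitialized until reset_parameters(). A Transformer-XL-style setup commonly
# initializes these global relative-position biases with Xavier uniform; the sketch
# below assumes that choice, and the repository's reset_parameters() may differ.
import torch
import torch.nn as nn

n_heads, d_model = 8, 512
u_bias = nn.Parameter(torch.Tensor(n_heads, d_model // n_heads))
v_bias = nn.Parameter(torch.Tensor(n_heads, d_model // n_heads))
nn.init.xavier_uniform_(u_bias)  # content-based bias term
nn.init.xavier_uniform_(v_bias)  # position-based bias term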
def __init__(self, eos, unk, pad, blank,
             enc_n_units, attn_type, attn_n_heads, n_layers, d_model, d_ff,
             vocab, tie_embedding=False,
             pe_type='add', layer_norm_eps=1e-12,
             dropout=0.0, dropout_emb=0.0, dropout_att=0.0,
             lsm_prob=0.0, focal_loss_weight=0.0, focal_loss_gamma=2.0,
             ctc_weight=0.0, ctc_lsm_prob=0.0, ctc_fc_list=[],
             backward=False, global_weight=1.0, mtl_per_batch=False,
             adaptive_softmax=False):

    super(TransformerDecoder, self).__init__()
    logger = logging.getLogger('training')

    self.eos = eos
    self.unk = unk
    self.pad = pad
    self.blank = blank
    self.vocab = vocab  # NOTE: required by the adaptive softmax cutoffs below
    self.enc_n_units = enc_n_units
    self.d_model = d_model
    self.n_layers = n_layers
    self.attn_n_heads = attn_n_heads
    self.pe_type = pe_type
    self.lsm_prob = lsm_prob
    self.focal_loss_weight = focal_loss_weight
    self.focal_loss_gamma = focal_loss_gamma
    self.ctc_weight = ctc_weight
    self.bwd = backward
    self.global_weight = global_weight
    self.mtl_per_batch = mtl_per_batch

    if ctc_weight > 0:
        self.ctc = CTC(eos=eos, blank=blank,
                       enc_n_units=enc_n_units, vocab=vocab,
                       dropout=dropout, lsm_prob=ctc_lsm_prob,
                       fc_list=ctc_fc_list, param_init=0.1)

    if ctc_weight < global_weight:
        self.embed = Embedding(vocab, d_model,
                               dropout=0,  # NOTE: do not apply dropout here
                               ignore_index=pad)
        self.pos_enc = PositionalEncoding(d_model, dropout_emb, pe_type)
        self.layers = nn.ModuleList([
            TransformerDecoderBlock(d_model, d_ff, attn_type, attn_n_heads,
                                    dropout, dropout_att, layer_norm_eps)
            for _ in range(n_layers)])
        self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)

        if adaptive_softmax:
            self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
                d_model, vocab,
                cutoffs=[round(self.vocab / 15), 3 * round(self.vocab / 15)],
                # cutoffs=[self.vocab // 25, 3 * self.vocab // 5],
                div_value=4.0)
            self.output = None
        else:
            self.adaptive_softmax = None
            self.output = Linear(d_model, vocab)

            # Optionally tie weights as in:
            # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
            # https://arxiv.org/abs/1608.05859
            # and
            # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
            # https://arxiv.org/abs/1611.01462
            if tie_embedding:
                self.output.fc.weight = self.embed.embed.weight

    # Initialize parameters
    self.reset_parameters()
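# The focal_loss_weight / focal_loss_gamma arguments above suggest a focal-loss term
# (Lin et al., 2017): the cross entropy of each target token is scaled by
# (1 - p_t) ** gamma so that easy tokens are down-weighted. A minimal sketch of that
# computation over token-level logits, assuming plain PyTorch; this is not the
# repository's exact implementation.
import torch
import torch.nn.functional as F

def focal_loss(logits, targets, gamma=2.0):
    """logits: (N, vocab) unnormalized scores, targets: (N,) token ids."""
    log_probs = F.log_softmax(logits, dim=-1)
    log_pt = log_probs.gather(1, targets.unsqueeze(1)).squeeze(1)  # log p_t per token
    pt = log_pt.exp()
    return (-(1 - pt) ** gamma * log_pt).mean()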
def __init__(self, args, save_path=None):

    super(LMBase, self).__init__()
    logger = logging.getLogger('training')
    logger.info(self.__class__.__name__)

    self.save_path = save_path

    self.d_model = args.d_model
    self.d_ff = args.d_ff
    self.pe_type = args.pe_type
    self.n_layers = args.n_layers
    self.n_heads = args.attn_n_heads
    self.tie_embedding = args.tie_embedding

    self.vocab = args.vocab
    self.eos = 2
    self.pad = 3
    # NOTE: reserved in advance
    # self.lsm_prob = lsm_prob

    # for cache
    self.cache_theta = 0.2  # smoothing parameter
    self.cache_lambda = 0.2  # cache weight
    self.cache_ids = []
    self.cache_keys = []
    self.cache_attn = []

    self.embed = Embedding(vocab=self.vocab,
                           emb_dim=self.d_model,
                           dropout=0,  # NOTE: do not apply dropout here
                           ignore_index=self.pad)
    self.pos_enc = PositionalEncoding(args.d_model, args.dropout_emb, args.pe_type)

    self.layers = nn.ModuleList([
        TransformerDecoderBlock(args.d_model, args.d_ff, args.attn_type, args.attn_n_heads,
                                args.dropout_hidden, args.dropout_att, args.layer_norm_eps,
                                src_attention=False)
        for _ in range(self.n_layers)])
    self.norm_out = nn.LayerNorm(args.d_model, eps=args.layer_norm_eps)

    if args.adaptive_softmax:
        self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
            args.d_model, self.vocab,
            cutoffs=[round(self.vocab / 15), 3 * round(self.vocab / 15)],
            # cutoffs=[self.vocab // 25, 3 * self.vocab // 5],
            div_value=4.0)
        self.output = None
    else:
        self.adaptive_softmax = None
        self.output = LinearND(self.d_model, self.vocab)

        # Optionally tie weights as in:
        # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
        # https://arxiv.org/abs/1608.05859
        # and
        # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
        # https://arxiv.org/abs/1611.01462
        if args.tie_embedding:
            self.output.fc.weight = self.embed.embed.weight

    # Initialize parameters
    self.reset_parameters()
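# Weight tying as cited in the comments above (Press & Wolf 2016; Inan et al. 2016)
# shares one (vocab, d_model) matrix between the input embedding and the output
# projection. A minimal sketch in plain PyTorch (no custom Embedding/LinearND
# wrappers) with illustrative sizes:
import torch.nn as nn

vocab, d_model = 10000, 512
embed = nn.Embedding(vocab, d_model)
output = nn.Linear(d_model, vocab)
output.weight = embed.weight  # both layers now read and update the same parameter
assert output.weight.data_ptr() == embed.weight.data_ptr()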
def __init__(self, special_symbols, enc_n_units, attn_type, n_heads, n_layers,
             d_model, d_ff, pe_type, layer_norm_eps, ffn_activation,
             vocab, tie_embedding,
             dropout, dropout_emb, dropout_att, dropout_residual,
             lsm_prob, ctc_weight, ctc_lsm_prob, ctc_fc_list,
             backward, global_weight, mtl_per_batch, param_init,
             memory_transformer,
             mocha_chunk_size, mocha_n_heads_mono, mocha_n_heads_chunk,
             mocha_init_r, mocha_eps, mocha_std,
             mocha_quantity_loss_weight, mocha_head_divergence_loss_weight,
             latency_metric, latency_loss_weight,
             mocha_dropout_head, mocha_dropout_hard, mocha_first_layer,
             external_lm, lm_fusion, mem_len):

    super(TransformerDecoder, self).__init__()

    self.eos = special_symbols['eos']
    self.unk = special_symbols['unk']
    self.pad = special_symbols['pad']
    self.blank = special_symbols['blank']
    self.vocab = vocab
    self.enc_n_units = enc_n_units
    self.d_model = d_model
    self.n_layers = n_layers
    self.n_heads = n_heads
    self.pe_type = pe_type
    self.lsm_prob = lsm_prob
    self.ctc_weight = ctc_weight
    self.bwd = backward
    self.global_weight = global_weight
    self.mtl_per_batch = mtl_per_batch

    self.prev_spk = ''
    self.lmstate_final = None

    # for TransformerXL decoder
    self.memory_transformer = memory_transformer
    if memory_transformer:
        assert pe_type == 'none'
    self.mem_len = mem_len

    # for attention plot
    self.aws_dict = {}
    self.data_dict = {}

    # for mocha
    self.attn_type = attn_type
    self.quantity_loss_weight = mocha_quantity_loss_weight
    self.headdiv_loss_weight = mocha_head_divergence_loss_weight
    self.latency_metric = latency_metric
    self.latency_loss_weight = latency_loss_weight
    self.mocha_first_layer = mocha_first_layer

    if ctc_weight > 0:
        self.ctc = CTC(eos=self.eos, blank=self.blank,
                       enc_n_units=enc_n_units, vocab=self.vocab,
                       dropout=dropout, lsm_prob=ctc_lsm_prob,
                       fc_list=ctc_fc_list, param_init=0.1,
                       backward=backward)

    if ctc_weight < global_weight:
        # token embedding
        self.embed = nn.Embedding(self.vocab, d_model, padding_idx=self.pad)
        # positional embedding
        if memory_transformer:
            self.dropout_emb = nn.Dropout(p=dropout_emb)
            self.pos_emb = XLPositionalEmbedding(d_model)
            self.u = nn.Parameter(torch.Tensor(self.n_heads, self.d_model // self.n_heads))
            self.v = nn.Parameter(torch.Tensor(self.n_heads, self.d_model // self.n_heads))
            # NOTE: u and v are global parameters
        else:
            self.pos_enc = PositionalEncoding(d_model, dropout_emb, pe_type, param_init)
        # self-attention
        self.layers = nn.ModuleList([copy.deepcopy(TransformerDecoderBlock(
            d_model, d_ff, attn_type, n_heads,
            dropout, dropout_att, dropout_residual * (l + 1) / n_layers,
            layer_norm_eps, ffn_activation, param_init,
            src_tgt_attention=False if 'mocha' in attn_type and l < mocha_first_layer - 1 else True,
            memory_transformer=memory_transformer,
            mocha_chunk_size=mocha_chunk_size,
            mocha_n_heads_mono=mocha_n_heads_mono,
            mocha_n_heads_chunk=mocha_n_heads_chunk,
            mocha_init_r=mocha_init_r,
            mocha_eps=mocha_eps,
            mocha_std=mocha_std,
            mocha_dropout_head=mocha_dropout_head,
            mocha_dropout_hard=mocha_dropout_hard * (n_layers - l) / n_layers,  # the lower the stronger
            lm_fusion=lm_fusion))
            for l in range(n_layers)])
        self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)

        self.output = nn.Linear(d_model, self.vocab)
        if tie_embedding:
            self.output.weight = self.embed.weight

        self.lm = external_lm
        if external_lm is not None:
            self.lm_output_proj = nn.Linear(external_lm.output_dim, d_model)

    self.reset_parameters(param_init)