def __init__(self, args, save_path=None):

    super(LMBase, self).__init__()
    logger.info(self.__class__.__name__)

    self.save_path = save_path

    self.d_model = args.transformer_d_model
    self.n_layers = args.n_layers
    self.n_heads = args.transformer_n_heads
    self.lsm_prob = args.lsm_prob

    self.vocab = args.vocab
    self.eos = 2
    self.pad = 3
    # NOTE: reserved in advance

    # for cache
    self.cache_theta = 0.2  # smoothing parameter
    self.cache_lambda = 0.2  # cache weight
    self.cache_ids = []
    self.cache_keys = []
    self.cache_attn = []

    self.embed = nn.Embedding(self.vocab, self.d_model, padding_idx=self.pad)
    self.pos_enc = PositionalEncoding(self.d_model, args.dropout_in, args.transformer_pe_type)
    self.layers = repeat(TransformerDecoderBlock(
        self.d_model, args.transformer_d_ff,
        args.transformer_attn_type, self.n_heads,
        args.dropout_hidden, args.dropout_att,
        args.transformer_layer_norm_eps, args.transformer_ffn_activation,
        src_tgt_attention=False), self.n_layers)
    self.norm_out = nn.LayerNorm(self.d_model, eps=args.transformer_layer_norm_eps)

    if args.adaptive_softmax:
        self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
            self.d_model, self.vocab,
            cutoffs=[round(self.vocab / 15), 3 * round(self.vocab / 15)],
            # cutoffs=[self.vocab // 25, 3 * self.vocab // 5],
            div_value=4.0)
        self.output = None
    else:
        self.adaptive_softmax = None
        self.output = nn.Linear(self.d_model, self.vocab)
        if args.tie_embedding:
            self.output.weight = self.embed.weight

    self.reset_parameters()
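
# Illustrative sketch (not part of the original file): `repeat` above is imported
# from the toolkit's utilities. Assuming it simply stacks independent copies of a
# block so each layer gets its own parameters, a minimal stand-in (hypothetical
# name `_repeat_sketch`) could look like this:
def _repeat_sketch(module, n_layers):
    """Return `n_layers` independent deep copies of `module` as an nn.ModuleList."""
    import copy
    return nn.ModuleList([copy.deepcopy(module) for _ in range(n_layers)])
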
def __init__(self, special_symbols, enc_n_units, attn_type, n_heads,
             n_layers, d_model, d_ff, pe_type, layer_norm_eps, ffn_activation,
             vocab, tie_embedding, dropout, dropout_emb, dropout_att,
             lsm_prob, ctc_weight, ctc_lsm_prob, ctc_fc_list, backward,
             global_weight, mtl_per_batch, param_init):

    super(TransformerDecoder, self).__init__()

    self.eos = special_symbols['eos']
    self.unk = special_symbols['unk']
    self.pad = special_symbols['pad']
    self.blank = special_symbols['blank']
    self.vocab = vocab
    self.enc_n_units = enc_n_units
    self.d_model = d_model
    self.n_layers = n_layers
    self.n_heads = n_heads
    self.pe_type = pe_type
    self.lsm_prob = lsm_prob
    self.ctc_weight = ctc_weight
    self.bwd = backward
    self.global_weight = global_weight
    self.mtl_per_batch = mtl_per_batch

    self.prev_spk = ''
    self.lmstate_final = None

    if ctc_weight > 0:
        self.ctc = CTC(eos=self.eos,
                       blank=self.blank,
                       enc_n_units=enc_n_units,
                       vocab=vocab,
                       dropout=dropout,
                       lsm_prob=ctc_lsm_prob,
                       fc_list=ctc_fc_list,
                       param_init=0.1)

    if ctc_weight < global_weight:
        self.embed = nn.Embedding(vocab, d_model, padding_idx=self.pad)
        self.pos_enc = PositionalEncoding(d_model, dropout_emb, pe_type)
        self.layers = repeat(TransformerDecoderBlock(
            d_model, d_ff, attn_type, n_heads,
            dropout, dropout_att, layer_norm_eps, ffn_activation, param_init),
            n_layers)
        self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.output = nn.Linear(d_model, vocab)
        if tie_embedding:
            self.output.weight = self.embed.weight

    if param_init == 'xavier_uniform':
        self.reset_parameters()
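
# Illustrative sketch (not part of the original file): with `tie_embedding`, the
# output projection above shares one weight tensor with the input embedding, so a
# single vocab x d_model matrix is learned. Sizes below are hypothetical.
def _tied_embedding_sketch():
    vocab, d_model = 1000, 256
    embed = nn.Embedding(vocab, d_model)
    output = nn.Linear(d_model, vocab)
    output.weight = embed.weight  # same assignment as in the constructor above
    # nn.Linear stores its weight as (out_features, in_features) = (vocab, d_model),
    # which matches the embedding table exactly, so no transpose is needed.
    assert output.weight is embed.weight
    return embed, output
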
def __init__(self, args, save_path=None):

    super(LMBase, self).__init__()
    logger.info(self.__class__.__name__)

    self.lm_type = args.lm_type
    self.save_path = save_path

    self.emb_dim = args.emb_dim
    self.rnn_type = args.lm_type
    assert args.lm_type in ['lstm', 'gru']
    self.n_units = args.n_units
    self.n_projs = args.n_projs
    self.n_layers = args.n_layers
    self.residual = args.residual
    self.n_units_cv = args.n_units_null_context
    self.lsm_prob = args.lsm_prob

    self.vocab = args.vocab
    self.eos = 2
    self.pad = 3
    # NOTE: reserved in advance

    # for cache
    self.cache_theta = 0.2  # smoothing parameter
    self.cache_lambda = 0.2  # cache weight
    self.cache_ids = []
    self.cache_keys = []
    self.cache_attn = []
    self.embed_cache = None

    self.embed = nn.Embedding(self.vocab, args.emb_dim, padding_idx=self.pad)
    self.dropout_emb = nn.Dropout(p=args.dropout_in)

    rnn = nn.LSTM if args.lm_type == 'lstm' else nn.GRU
    self.rnn = nn.ModuleList()
    self.dropout = nn.Dropout(p=args.dropout_hidden)
    if args.n_projs > 0:
        self.proj = repeat(nn.Linear(args.n_units, args.n_projs), args.n_layers)
    rnn_idim = args.emb_dim + args.n_units_null_context
    for _ in range(args.n_layers):
        self.rnn += [rnn(rnn_idim, args.n_units, 1, batch_first=True)]
        rnn_idim = args.n_units
        if args.n_projs > 0:
            rnn_idim = args.n_projs

    self.glu = None
    if args.use_glu:
        self.glu = LinearGLUBlock(rnn_idim)

    self._odim = rnn_idim

    self.adaptive_softmax = None
    self.output_proj = None
    self.output = None
    if args.adaptive_softmax:
        self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
            rnn_idim, self.vocab,
            # cutoffs=[self.vocab // 10, 3 * self.vocab // 10],
            cutoffs=[self.vocab // 25, self.vocab // 5],
            div_value=4.0)
    elif args.tie_embedding:
        if rnn_idim != args.emb_dim:
            self.output_proj = nn.Linear(rnn_idim, args.emb_dim)
            rnn_idim = args.emb_dim
            self._odim = rnn_idim
        self.output = nn.Linear(rnn_idim, self.vocab)
        self.output.weight = self.embed.weight
    else:
        self.output = nn.Linear(rnn_idim, self.vocab)

    self.reset_parameters(args.param_init)
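
# Illustrative sketch (not part of the original file): how the
# nn.AdaptiveLogSoftmaxWithLoss head built above is typically queried. Sizes are
# hypothetical; the cutoffs mirror the ones used in the constructor.
def _adaptive_softmax_sketch():
    import torch
    n_units, vocab = 512, 10000
    asm = nn.AdaptiveLogSoftmaxWithLoss(n_units, vocab,
                                        cutoffs=[vocab // 25, vocab // 5],
                                        div_value=4.0)
    hidden = torch.randn(8, n_units)          # flattened RNN outputs [B*T, n_units]
    targets = torch.randint(0, vocab, (8,))   # gold token ids [B*T]
    out = asm(hidden, targets)                # NamedTuple with .output and .loss
    log_probs = asm.log_prob(hidden)          # full [B*T, vocab] log-probabilities
    return out.loss, log_probs
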
def __init__(self, special_symbols, enc_n_units, rnn_type, n_units, n_projs,
             n_layers, bottleneck_dim, emb_dim, vocab,
             dropout=0., dropout_emb=0., lsm_prob=0.,
             ctc_weight=0., ctc_lsm_prob=0., ctc_fc_list=[],
             lm_init=None, global_weight=1., mtl_per_batch=False, param_init=0.1):

    super(RNNTransducer, self).__init__()

    self.eos = special_symbols['eos']
    self.unk = special_symbols['unk']
    self.pad = special_symbols['pad']
    self.blank = special_symbols['blank']
    self.vocab = vocab
    self.rnn_type = rnn_type
    assert rnn_type in ['lstm_transducer', 'gru_transducer']
    self.enc_n_units = enc_n_units
    self.dec_n_units = n_units
    self.n_projs = n_projs
    self.n_layers = n_layers
    self.lsm_prob = lsm_prob
    self.ctc_weight = ctc_weight
    self.global_weight = global_weight
    self.mtl_per_batch = mtl_per_batch

    # for cache
    self.prev_spk = ''
    self.lmstate_final = None
    self.state_cache = OrderedDict()

    if ctc_weight > 0:
        self.ctc = CTC(eos=self.eos,
                       blank=self.blank,
                       enc_n_units=enc_n_units,
                       vocab=vocab,
                       dropout=dropout,
                       lsm_prob=ctc_lsm_prob,
                       fc_list=ctc_fc_list,
                       param_init=0.1)

    if ctc_weight < global_weight:
        # import warprnnt_pytorch
        # self.warprnnt_loss = warprnnt_pytorch.RNNTLoss()

        # Prediction network
        rnn_l = nn.LSTM if rnn_type == 'lstm_transducer' else nn.GRU
        self.rnn = nn.ModuleList()
        self.dropout = nn.Dropout(p=dropout)
        if n_projs > 0:
            self.proj = repeat(nn.Linear(n_units, n_projs), n_layers)
        dec_idim = emb_dim
        for _ in range(n_layers):
            self.rnn += [rnn_l(dec_idim, n_units, 1, batch_first=True)]
            dec_idim = n_projs if n_projs > 0 else n_units

        self.embed = nn.Embedding(vocab, emb_dim, padding_idx=self.pad)
        self.dropout_emb = nn.Dropout(p=dropout_emb)

        # Joint network
        self.w_enc = nn.Linear(enc_n_units, bottleneck_dim)
        self.w_dec = nn.Linear(dec_idim, bottleneck_dim, bias=False)
        self.output = nn.Linear(bottleneck_dim, vocab)

    self.reset_parameters(param_init)

    # prediction network initialization with pre-trained LM
    if lm_init is not None:
        assert lm_init.vocab == vocab
        assert lm_init.n_units == n_units
        assert lm_init.n_projs == n_projs
        assert lm_init.n_layers == n_layers
        param_dict = dict(lm_init.named_parameters())
        for n, p in self.named_parameters():
            if n in param_dict.keys() and p.size() == param_dict[n].size():
                if 'output' in n:
                    continue
                p.data = param_dict[n].data
                logger.info('Overwrite %s' % n)
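
# Illustrative sketch (not part of the original file): how the joint network
# defined above (w_enc, w_dec, output) is typically applied in RNN-T. The additive
# broadcast over the time (T) and label (U) axes and the tanh are assumptions about
# the standard formulation, not code taken from this repository.
def _joint_sketch(self, eouts, douts):
    """eouts: encoder outputs [B, T, enc_n_units]; douts: prediction network
    outputs [B, U, dec_idim]; returns logits [B, T, U, vocab]."""
    import torch
    eouts = self.w_enc(eouts).unsqueeze(2)  # [B, T, 1, bottleneck_dim]
    douts = self.w_dec(douts).unsqueeze(1)  # [B, 1, U, bottleneck_dim]
    return self.output(torch.tanh(eouts + douts))
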
def __init__(self, input_dim, attn_type, n_heads, n_layers, d_model, d_ff,
             last_proj_dim, pe_type, layer_norm_eps, ffn_activation,
             dropout_in, dropout, dropout_att,
             n_stacks, n_splices,
             conv_in_channel, conv_channels, conv_kernel_sizes, conv_strides,
             conv_poolings, conv_batch_norm, conv_layer_norm, conv_bottleneck_dim,
             conv_param_init, param_init,
             chunk_size_left, chunk_size_current, chunk_size_right):

    super(TransformerEncoder, self).__init__()

    self.d_model = d_model
    self.n_layers = n_layers
    self.n_heads = n_heads
    self.pe_type = pe_type

    self.chunk_size_left = chunk_size_left
    self.chunk_size_current = chunk_size_current
    self.chunk_size_right = chunk_size_right

    # Setting for CNNs before the Transformer layers
    if conv_channels:
        assert n_stacks == 1 and n_splices == 1
        self.conv = ConvEncoder(input_dim,
                                in_channel=conv_in_channel,
                                channels=conv_channels,
                                kernel_sizes=conv_kernel_sizes,
                                strides=conv_strides,
                                poolings=conv_poolings,
                                dropout=0.,
                                batch_norm=conv_batch_norm,
                                layer_norm=conv_layer_norm,
                                layer_norm_eps=layer_norm_eps,
                                residual=False,
                                bottleneck_dim=d_model,
                                param_init=conv_param_init)
        self._odim = self.conv.output_dim
    else:
        self.conv = None
        self._odim = input_dim * n_splices * n_stacks
        self.embed = nn.Linear(self._odim, d_model)

    self.pos_enc = PositionalEncoding(d_model, dropout_in, pe_type)
    self.layers = repeat(TransformerEncoderBlock(
        d_model, d_ff, attn_type, n_heads,
        dropout, dropout_att, layer_norm_eps, ffn_activation, param_init),
        n_layers)
    self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)
    # the Transformer stack outputs d_model-dimensional features
    self._odim = d_model

    if last_proj_dim > 0 and last_proj_dim != self.output_dim:
        self.bridge = nn.Linear(self._odim, last_proj_dim)
        self._odim = last_proj_dim
    else:
        self.bridge = None

    # calculate subsampling factor
    self._factor = 1
    if self.conv is not None:
        self._factor *= self.conv.subsampling_factor()

    if param_init == 'xavier_uniform':
        self.reset_parameters()
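
# Illustrative sketch (not part of the original file): the standard sinusoidal
# encoding that `PositionalEncoding` is assumed to add to the scaled inputs (the
# real class also applies dropout and supports several `pe_type` variants).
# Assumes an even d_model.
def _sinusoidal_pe_sketch(xs, d_model):
    """xs: [B, T, d_model]; returns xs * sqrt(d_model) + PE, broadcast over the batch."""
    import math
    import torch
    max_len = xs.size(1)
    pos = torch.arange(max_len, dtype=torch.float32).unsqueeze(1)      # [T, 1]
    div = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float32)
                    * -(math.log(10000.0) / d_model))                  # [d_model / 2]
    pe = torch.zeros(max_len, d_model)
    pe[:, 0::2] = torch.sin(pos * div)
    pe[:, 1::2] = torch.cos(pos * div)
    return xs * math.sqrt(d_model) + pe.unsqueeze(0).to(xs.device)
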