def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout,
             device, pad_idx, embedding, att_type='concat', num_layers=1,
             adaptive_softmax=None):
    super(Decoder, self).__init__()
    # The number of directions in the decoder is always 1.
    self.emb_dim = emb_dim
    self.enc_hid_dim = enc_hid_dim
    self.dec_hid_dim = dec_hid_dim
    self.output_dim = output_dim
    self.dropout = dropout
    self.device = device
    self.att_type = att_type
    self.num_layers = num_layers
    if self.att_type == 'concat':
        self.attn = nn.Linear(enc_hid_dim + dec_hid_dim, dec_hid_dim)
    elif self.att_type == 'bilinear':
        self.attn = nn.Linear(enc_hid_dim, dec_hid_dim)
    self.context_linear = nn.Linear(enc_hid_dim * 2, enc_hid_dim)
    self.v = nn.Parameter(torch.rand(dec_hid_dim))
    self.pad_idx = pad_idx
    self.embedding = embedding
    self.attn_linear = nn.Linear(enc_hid_dim + emb_dim, dec_hid_dim)
    self.rnn_layer = nn.GRU(enc_hid_dim, dec_hid_dim, num_layers=self.num_layers)
    self.out = nn.Linear(enc_hid_dim + dec_hid_dim + emb_dim, output_dim)
    # NOTE: replaces the float stored in self.dropout above
    self.dropout = nn.Dropout(dropout)
    vocab_size = output_dim
    self.softmax_layer = nn.AdaptiveLogSoftmaxWithLoss(
        enc_hid_dim + dec_hid_dim + emb_dim, vocab_size,
        cutoffs=[round(vocab_size / 15), 3 * round(vocab_size / 15)])
    self.adaptive_softmax = adaptive_softmax
def __init__(self, vocab, hidden_size, enc_num_layer):
    super(BertNoEmbed, self).__init__()
    self.encoder = BertModelNoEmbed(
        config=BertConfig(vocab_size_or_config_json_file=len(vocab),
                          hidden_size=hidden_size,
                          num_hidden_layers=enc_num_layer,
                          num_attention_heads=8,
                          intermediate_size=3072,
                          type_vocab_size=1,
                          hidden_dropout_prob=0.1,
                          attention_probs_dropout_prob=0.1))
    self.hidden_size = hidden_size
    self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(hidden_size, len(vocab),
                                                          cutoffs=[1000])
def __init__(self):
    super(aspect_rs, self).__init__()
    self.embedding_dim = conf.embedding_dim
    self.num_user = conf.num_users
    self.num_item = conf.num_items
    torch.manual_seed(0)
    self.embedding_user = nn.Embedding(self.num_user, self.embedding_dim)
    torch.manual_seed(0)
    self.embedding_item = nn.Embedding(self.num_item, self.embedding_dim)
    self.rating_loss_function = nn.MSELoss()
    self.review_loss_function = nn.AdaptiveLogSoftmaxWithLoss(
        conf.hidden_size, conf.vocab_sz,
        cutoffs=[round(conf.vocab_sz / 15), 3 * round(conf.vocab_sz / 15)],
        div_value=2)
    self.avg_rating = torch.FloatTensor([conf.avg_rating]).cuda()
def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout=0.1, padding_idx=2):
    super().__init__()
    self.emb_dim = emb_dim
    self.hid_dim = hid_dim
    self.output_dim = output_dim
    self.n_layers = n_layers
    self.dropout = dropout
    self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=padding_idx)
    self.rnn = nn.GRU(emb_dim, 2 * hid_dim, n_layers, dropout=dropout)
    self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(2 * hid_dim, output_dim,
                                                          cutoffs=[10, 12])
    self.dropout = nn.Dropout(dropout)
def __init__(self, vocab, hidden_size, num_layer):
    super(LanGenNoEmbed, self).__init__()
    self.model = BertModelNoEmbed(
        config=BertConfig(vocab_size_or_config_json_file=len(vocab),
                          hidden_size=hidden_size,
                          num_hidden_layers=num_layer,
                          num_attention_heads=8,
                          intermediate_size=3072,
                          type_vocab_size=2,
                          hidden_dropout_prob=0.1,
                          attention_probs_dropout_prob=0.1))
    self.model.encoder.layer = self.model.encoder.layer[:3]
    self.model.eval()
    self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(hidden_size, len(vocab),
                                                          cutoffs=[994])
def __init__(self, args, save_path=None):
    super(LMBase, self).__init__()
    logger.info(self.__class__.__name__)

    self.save_path = save_path

    self.d_model = args.transformer_d_model
    self.n_layers = args.n_layers
    self.n_heads = args.transformer_n_heads
    self.lsm_prob = args.lsm_prob

    self.vocab = args.vocab
    self.eos = 2
    self.pad = 3
    # NOTE: reserved in advance

    # for cache
    self.cache_theta = 0.2  # smoothing parameter
    self.cache_lambda = 0.2  # cache weight
    self.cache_ids = []
    self.cache_keys = []
    self.cache_attn = []

    self.embed = nn.Embedding(self.vocab, self.d_model, padding_idx=self.pad)
    self.pos_enc = PositionalEncoding(self.d_model, args.dropout_in,
                                      args.transformer_pe_type)
    self.layers = nn.ModuleList([copy.deepcopy(TransformerDecoderBlock(
        self.d_model, args.transformer_d_ff,
        args.transformer_attn_type, self.n_heads,
        args.dropout_hidden, args.dropout_att,
        args.dropout_residual * (l + 1) / self.n_layers,
        args.transformer_layer_norm_eps, args.transformer_ffn_activation,
        args.transformer_param_init,
        src_tgt_attention=False)) for l in range(self.n_layers)])
    self.norm_out = nn.LayerNorm(self.d_model, eps=args.transformer_layer_norm_eps)

    self.adaptive_softmax = None
    self.output = None
    if args.adaptive_softmax:
        self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
            self.d_model, self.vocab,
            cutoffs=[round(self.vocab / 15), 3 * round(self.vocab / 15)],
            # cutoffs=[self.vocab // 25, 3 * self.vocab // 5],
            div_value=4.0)
    else:
        self.output = nn.Linear(self.d_model, self.vocab)
        if args.tie_embedding:
            self.output.weight = self.embed.weight

    self.reset_parameters()
def __init__(self, config):
    super(XLMPredLayer, self).__init__()
    self.asm = config.asm
    self.n_words = config.n_words
    self.pad_index = config.pad_index
    dim = config.emb_dim

    if config.asm is False:
        self.proj = nn.Linear(dim, config.n_words, bias=True)
    else:
        self.proj = nn.AdaptiveLogSoftmaxWithLoss(
            in_features=dim,
            n_classes=config.n_words,
            cutoffs=config.asm_cutoffs,
            div_value=config.asm_div_value,
            head_bias=True,  # default is False
        )
def __init__(self, params):
    super().__init__()
    self.asm = params.asm
    self.n_words = params.n_words
    self.pad_index = params.pad_index
    dim = params.emb_dim

    if params.asm is False:
        self.proj = Linear(dim, params.n_words, bias=True)
    else:
        self.proj = nn.AdaptiveLogSoftmaxWithLoss(
            in_features=dim,
            n_classes=params.n_words,
            cutoffs=params.asm_cutoffs,
            div_value=params.asm_div_value,
            head_bias=True,  # default is False
        )
def __init__(self, num_embeddings, embedding_dim, padding_idx, conv_filters,
             n_highways, word_size):
    super().__init__()
    self.char_embedding = CharEmbedding(num_embeddings, 16, padding_idx,
                                        conv_filters, n_highways, embedding_dim)
    self.forward_lm_1 = nn.LSTM(input_size=embedding_dim, hidden_size=4 * embedding_dim,
                                batch_first=True, num_layers=1)
    self.forward_lp_1 = nn.Linear(4 * embedding_dim, embedding_dim)
    self.forward_lm_2 = nn.LSTM(input_size=embedding_dim, hidden_size=4 * embedding_dim,
                                batch_first=True, num_layers=1)
    self.forward_lp_2 = nn.Linear(4 * embedding_dim, embedding_dim)
    self.backward_lm_1 = nn.LSTM(input_size=embedding_dim, hidden_size=4 * embedding_dim,
                                 batch_first=True, num_layers=1)
    self.backward_lp_1 = nn.Linear(4 * embedding_dim, embedding_dim)
    self.backward_lm_2 = nn.LSTM(input_size=embedding_dim, hidden_size=4 * embedding_dim,
                                 batch_first=True, num_layers=1)
    self.backward_lp_2 = nn.Linear(4 * embedding_dim, embedding_dim)
    self.loss = nn.AdaptiveLogSoftmaxWithLoss(embedding_dim, word_size,
                                              [100, 1000, 10000])
def __init__(self, **kwargs):
    super(PersonaModelAlt, self).__init__()
    block_size = kwargs.get('block_size', 100)
    emb_dim = kwargs.get('emb_dim', 50)
    num_LTM = kwargs.get('num_LTM', 5)
    embedding = kwargs.get('embedding', None)
    indexer = kwargs.get('indexer', None)
    assert indexer is not None
    self.block_size = block_size
    self.num_LTM = num_LTM
    self.embedding = embedding
    self.indexer = indexer
    self.embed = nn.Embedding(num_embeddings=len(indexer), embedding_dim=emb_dim)
    if embedding is not None:
        self.embed.weight.data.copy_(torch.from_numpy(embedding))
    else:
        self.embed._parameters['weight'].data.normal_(0.0, 1 / np.sqrt(emb_dim))
    self.long_mem = EntNet(num_LTM, block_size)
    self.query_module_ltm = Attention(block_size)
    self.layernorm1 = nn.LayerNorm(emb_dim)
    self.layernorm2 = nn.LayerNorm(block_size)
    self.dropout = nn.Dropout(.5)
    self.LTM_state = None
    self.attn = Attention(block_size)
    self.self_attn = SelfAttention(block_size)
    self.encoder = nn.GRU(input_size=emb_dim, hidden_size=block_size, batch_first=True)
    self.decoder = nn.GRU(input_size=emb_dim, hidden_size=block_size, batch_first=True)
    # produces queries of size e_d
    self.reverse_embed = nn.AdaptiveLogSoftmaxWithLoss(block_size, len(indexer),
                                                       cutoffs=[7, 21, 500])
    self.nonlin = nn.LeakyReLU(.1)
    self.decoderAUX = nn.GRU(input_size=emb_dim, hidden_size=block_size,
                             batch_first=True)
    self.idxs = np.arange(len(indexer))
def __init__(self, input_size=300, classes=1, hidden_size=512, drop_p=0.5,
             out_of_words=80000):
    super(elmo_model, self).__init__()
    self.dropout = nn.Dropout(drop_p)
    self.num_layers = 1
    self.out_of_words = out_of_words
    self.char_embed = CharEmbedding(num_embeddings=260,
                                    embedding_dim=16,
                                    padding_idx=256,
                                    conv_filters=[(3, 128), (3, 128), (3, 128)],
                                    n_highways=2,
                                    projection_size=hidden_size)
    self.lstm1_f = nn.LSTM(hidden_size, hidden_size, batch_first=True, bidirectional=False)
    self.lstm2_f = nn.LSTM(hidden_size, hidden_size, batch_first=True, bidirectional=False)
    self.lstm1_b = nn.LSTM(hidden_size, hidden_size, batch_first=True, bidirectional=False)
    self.lstm2_b = nn.LSTM(hidden_size, hidden_size, batch_first=True, bidirectional=False)
    self.out = nn.AdaptiveLogSoftmaxWithLoss(in_features=hidden_size * 2,
                                             n_classes=out_of_words,
                                             cutoffs=[20, 200, 1000, 10000],
                                             div_value=4.0,
                                             head_bias=False)
def __init__(self, params):
    super().__init__()
    self.asm = params.asm
    self.n_words = params.n_words
    self.pad_index = params.pad_index
    self.label_smoothing = params.label_smoothing
    dim = params.emb_dim

    if params.asm is False:
        self.proj = Linear(dim, params.n_words, bias=True)
        # if params.label_smoothing > 0:
        #     self.loss_func = LabelSmoothingCriterion(params.label_smoothing, params.n_words)
    else:
        self.proj = nn.AdaptiveLogSoftmaxWithLoss(
            in_features=dim,
            n_classes=params.n_words,
            cutoffs=params.asm_cutoffs,
            div_value=params.asm_div_value,
            head_bias=True,  # default is False
        )
def __init__(self, w_num, w_dim, rnn_unit, num_layers, hidden_dim, dropout, cutoffs):
    super(LM, self).__init__()
    self.w_num = w_num
    self.w_dim = w_dim
    self.word_embed = nn.Embedding(w_num, w_dim)
    rnnunit_map = {'rnn': nn.RNN, 'lstm': nn.LSTM, 'gru': nn.GRU}
    self.rnn = rnnunit_map[rnn_unit](w_dim, hidden_dim, num_layers=num_layers,
                                     dropout=dropout)
    self.soft_max = nn.AdaptiveLogSoftmaxWithLoss(hidden_dim, w_num,
                                                  cutoffs=cutoffs, div_value=4.0)
    self.dropout = nn.Dropout(p=dropout)
    self.reset_parameters()
def __init__(self, in_vocab_size, out_vocab_size, embed_size, hidden_size,
             GRU_count_enc=2, GRU_count_dec=2, ignore_class=None,
             use_feedforward=True):
    super(TranslatorModel, self).__init__()
    self.embed_size = embed_size
    self.hidden_size = hidden_size
    self.GRU_count_enc = GRU_count_enc
    self.GRU_count_dec = GRU_count_dec
    self.ignore_class = ignore_class
    self.use_feedforward = use_feedforward
    self.enc_embed = nn.Embedding(in_vocab_size, self.embed_size)
    self.enc_GRU = nn.GRU(self.embed_size, self.hidden_size,
                          num_layers=self.GRU_count_enc, bidirectional=True,
                          dropout=0.2)
    self.dec_embed = nn.Embedding(out_vocab_size, self.embed_size)
    self.dec_ReLU = nn.ReLU()
    self.dec_GRU = nn.GRU(self.embed_size + self.hidden_size, self.hidden_size,
                          num_layers=self.GRU_count_dec, dropout=0.2)
    self.Adaptive_Softmax = nn.AdaptiveLogSoftmaxWithLoss(
        self.hidden_size, out_vocab_size,
        [round(out_vocab_size / 20), 4 * round(out_vocab_size / 20)])
    if self.use_feedforward:
        self.feedforward_dense = nn.Linear(2 * self.hidden_size, self.hidden_size,
                                           bias=False)
    self.att_Softmax = nn.Softmax(dim=1)
    self.bridge = nn.Linear(2 * self.hidden_size, self.GRU_count_dec * self.hidden_size)
def __init__(
    self,
    num_input_features: int,
    hp: HeadParams,
):
    x_reducer, loss_reducer, num_input_features = self._get_reducers(
        num_input_features=num_input_features, hp=hp)
    num_classes = hp.num_classes
    num_first_bin = round(num_classes / 20)
    head = nn.AdaptiveLogSoftmaxWithLoss(
        num_input_features,
        num_classes,
        cutoffs=[
            num_first_bin,
            5 * num_first_bin,
        ],
        div_value=4,
    )
    super().__init__(num_input_features, num_classes, head, x_reducer, loss_reducer)
def lm_criterion(in_features, vocab_size):
    # if weight_tying:
    #     in_features = 2 * input_size if bidirectional else hidden_size
    # else:
    #     in_features = 2 * hidden_size if bidirectional else hidden_size
    splits = []
    # test the larger vocabulary threshold first, otherwise the
    # _HIGH_TOKENS branch can never be reached
    if vocab_size > _HIGH_TOKENS:
        splits = [4200, 35000, 180000]
    elif vocab_size > _MEDIUM_TOKENS:
        # splits = [2800, 20000, 760000]
        splits = [2800, 20000]
    splits += [vocab_size - 2]
    criterion = nn.AdaptiveLogSoftmaxWithLoss(in_features=in_features,
                                              n_classes=vocab_size,
                                              cutoffs=splits)
    return criterion
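# A minimal validation sketch (hypothetical helper, not from the repos in this
# listing): nn.AdaptiveLogSoftmaxWithLoss requires cutoffs to be unique,
# strictly increasing integers between 1 and n_classes - 1, which is why
# lm_criterion above caps its list with vocab_size - 2.
import torch.nn as nn

def build_adaptive_criterion(in_features, vocab_size, cutoffs):
    # keep only in-range values and enforce a sorted, duplicate-free sequence
    cutoffs = sorted({int(c) for c in cutoffs if 0 < c < vocab_size})
    assert cutoffs, "need at least one cutoff below vocab_size"
    return nn.AdaptiveLogSoftmaxWithLoss(in_features=in_features,
                                         n_classes=vocab_size,
                                         cutoffs=cutoffs)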
def __init__(self, max_seq_len, vocab_size, n_layers=6, dim=1024, d_ff=2048,
             dropout=0.1, heads=8, encoder_only=True, n_langs=15):
    super(Transformer, self).__init__()
    self.n_layers, self.max_seq_len, self.dim = n_layers, max_seq_len, dim
    self.encoder_only = encoder_only
    self.vocab_size = vocab_size
    self.embed = nn.Embedding(vocab_size, dim)
    self.pe = PositionalEncoding(dim=dim, max_seq_len=max_seq_len)
    self.lang_embed = nn.Embedding(n_langs, dim)
    self.encoders = nn.ModuleList([
        TransformerLayer(decoder=False, dim=dim, d_ff=d_ff, dropout=dropout, heads=heads)
        for _ in range(n_layers)
    ])
    if not encoder_only:
        self.decoders = nn.ModuleList([
            TransformerLayer(decoder=True, dim=dim, d_ff=d_ff, dropout=dropout, heads=heads)
            for _ in range(n_layers)
        ])
    self.pred = nn.AdaptiveLogSoftmaxWithLoss(in_features=dim,
                                              n_classes=vocab_size,
                                              cutoffs=[8000, 20000],
                                              head_bias=True)
    self.xnli_fc = nn.Linear(dim, 3)
    torch.nn.init.xavier_uniform_(self.embed.weight)
    torch.nn.init.xavier_uniform_(self.lang_embed.weight)
    torch.nn.init.xavier_uniform_(self.xnli_fc.weight)
def __init__(self, hidden_dim, embed_dim, num_keywords, num_layers, weight,
             num_labels, bidirectional, dropout=0.5, **kwargs):
    super(MTALSTM, self).__init__(**kwargs)
    self.hidden_dim = hidden_dim
    self.embed_dim = embed_dim
    self.num_layers = num_layers
    self.num_labels = num_labels
    self.bidirectional = bidirectional
    if num_layers <= 1:
        self.dropout = 0
    else:
        self.dropout = dropout
    # NOTE: vocab_size comes from the enclosing scope, not the argument list
    self.embedding = nn.Embedding(vocab_size, embed_dim)
    self.embedding_topic = nn.Embedding(28 + 5, embed_dim)  # todo
    # self.embedding = nn.Embedding.from_pretrained(weight)
    # self.embedding.weight.requires_grad = False
    self.Uf = nn.Linear(embed_dim * num_keywords, num_keywords, bias=False)
    # attention decoder
    self.decoder = AttentionDecoder(hidden_size=hidden_dim,
                                    embed_size=embed_dim,
                                    num_layers=num_layers,
                                    dropout=dropout)
    # adaptive softmax
    self.adaptiveSoftmax = nn.AdaptiveLogSoftmaxWithLoss(
        hidden_dim, num_labels,
        cutoffs=[round(num_labels / 20), 4 * round(num_labels / 20)])
def __init__(self, local_rank, vocab, embed_dim, ff_embed_dim, num_heads,
             dropout, layers, smoothing_factor, approx):
    super(BIGLM, self).__init__()
    self.vocab = vocab
    self.embed_dim = embed_dim
    self.tok_embed = Embedding(self.vocab.size, embed_dim, self.vocab.padding_idx)
    self.pos_embed = LearnedPositionalEmbedding(embed_dim, device=local_rank)
    self.layers = nn.ModuleList()
    for i in range(layers):
        self.layers.append(
            TransformerLayer(embed_dim, ff_embed_dim, num_heads, dropout,
                             with_external=True))
    self.emb_layer_norm = LayerNorm(embed_dim)
    self.one_more = nn.Linear(embed_dim, embed_dim)
    self.one_more_layer_norm = LayerNorm(embed_dim)
    self.out_proj = nn.Linear(embed_dim, self.vocab.size)
    self.attn_mask = SelfAttentionMask(device=local_rank)
    self.smoothing = LabelSmoothing(local_rank, self.vocab.size,
                                    self.vocab.padding_idx, smoothing_factor)
    self.dropout = dropout
    self.device = local_rank

    if approx == "none":
        self.approx = None
    elif approx == "adaptive":
        self.approx = nn.AdaptiveLogSoftmaxWithLoss(
            self.embed_dim, self.vocab.size, [10000, 20000, 200000])
    else:
        raise NotImplementedError("%s has not been implemented" % approx)
    self.reset_parameters()
def __init__(
    self,
    policy,
    policy_optim,
    beta,
    beta_optim,
    hidden_size,
    gamma=0.99,
    k=10,
    weight_clip=2.0,
    offpolicy_correction=True,
    topk=True,
    adaptive_softmax=True,
    cutoffs=None,
    device=torch.device("cpu"),
):
    super(Reinforce, self).__init__()
    self.policy = policy
    self.policy_optim = policy_optim
    self.beta = beta
    self.beta_optim = beta_optim
    self.beta_criterion = nn.CrossEntropyLoss()
    self.gamma = gamma
    self.k = k
    self.weight_clip = weight_clip
    self.offpolicy_correction = offpolicy_correction
    self.topk = topk
    self.adaptive_softmax = adaptive_softmax
    if adaptive_softmax:
        assert cutoffs is not None, (
            "must provide cutoffs when using adaptive_softmax"
        )
        self.softmax_loss = nn.AdaptiveLogSoftmaxWithLoss(
            in_features=hidden_size,
            n_classes=policy.item_embeds.weight.size(0),
            cutoffs=cutoffs,
            div_value=4.
        ).to(device)
    self.device = device
def __init__(self, word_vocab, char_vocab):
    super(ELMO, self).__init__()
    self.embedding = CharEmbedding(num_embeddings=len(char_vocab),
                                   embedding_dim=16,
                                   padding_idx=char_vocab.sp.pad.idx,
                                   conv_filters=[(1, 32), (2, 64), (3, 128),
                                                 (4, 128), (5, 256), (6, 256),
                                                 (7, 512)],
                                   n_highways=2,
                                   projection_size=512)
    self.lstm_forward = nn.LSTM(input_size=512, num_layers=1, hidden_size=2048,
                                bidirectional=False, batch_first=True)
    self.linear_forward = nn.Linear(2048, 512)
    self.lstm_forward2 = nn.LSTM(input_size=512, num_layers=1, hidden_size=2048,
                                 bidirectional=False, batch_first=True)
    self.linear_forward2 = nn.Linear(2048, 512)
    self.lstm_backward = nn.LSTM(input_size=512, num_layers=1, hidden_size=2048,
                                 bidirectional=False, batch_first=True)
    self.linear_backward = nn.Linear(2048, 512)
    self.lstm_backward2 = nn.LSTM(input_size=512, num_layers=1, hidden_size=2048,
                                  bidirectional=False, batch_first=True)
    self.linear_backward2 = nn.Linear(2048, 512)
    self.output_layer = nn.AdaptiveLogSoftmaxWithLoss(
        in_features=512, n_classes=len(word_vocab), cutoffs=[100, 1000, 10000])
def __init__(self, input_size, embed_size, hidden_size, class_count, LSTM_count,
             ignore_class=None):
    super(LSTMModel, self).__init__()
    self.LSTM_layers = LSTM_count
    self.hidden_size = hidden_size
    self.ignore_class = ignore_class
    self.embedding = nn.Embedding(input_size, embed_size)
    self.LSTM = nn.LSTM(embed_size, self.hidden_size, num_layers=self.LSTM_layers,
                        bias=True, batch_first=True, dropout=0.25)
    self.Adaptive_Softmax = nn.AdaptiveLogSoftmaxWithLoss(
        self.hidden_size, class_count,
        [round(class_count / 20), 4 * round(class_count / 20)])
def __init__(self, eos, unk, pad, blank, enc_n_units, attn_type, attn_n_heads,
             n_layers, d_model, d_ff, vocab, tie_embedding=False, pe_type='add',
             layer_norm_eps=1e-12, dropout=0.0, dropout_emb=0.0, dropout_att=0.0,
             lsm_prob=0.0, focal_loss_weight=0.0, focal_loss_gamma=2.0,
             ctc_weight=0.0, ctc_lsm_prob=0.0, ctc_fc_list=[], backward=False,
             global_weight=1.0, mtl_per_batch=False, adaptive_softmax=False):
    super(TransformerDecoder, self).__init__()
    logger = logging.getLogger('training')

    self.eos = eos
    self.unk = unk
    self.pad = pad
    self.blank = blank
    self.enc_n_units = enc_n_units
    self.d_model = d_model
    self.n_layers = n_layers
    self.attn_n_heads = attn_n_heads
    self.pe_type = pe_type
    self.lsm_prob = lsm_prob
    self.focal_loss_weight = focal_loss_weight
    self.focal_loss_gamma = focal_loss_gamma
    self.ctc_weight = ctc_weight
    self.bwd = backward
    self.global_weight = global_weight
    self.mtl_per_batch = mtl_per_batch

    if ctc_weight > 0:
        self.ctc = CTC(eos=eos,
                       blank=blank,
                       enc_n_units=enc_n_units,
                       vocab=vocab,
                       dropout=dropout,
                       lsm_prob=ctc_lsm_prob,
                       fc_list=ctc_fc_list,
                       param_init=0.1)

    if ctc_weight < global_weight:
        self.embed = Embedding(vocab, d_model,
                               dropout=0,  # NOTE: do not apply dropout here
                               ignore_index=pad)
        self.pos_enc = PositionalEncoding(d_model, dropout_emb, pe_type)
        self.layers = nn.ModuleList([
            TransformerDecoderBlock(d_model, d_ff, attn_type, attn_n_heads,
                                    dropout, dropout_att, layer_norm_eps)
            for _ in range(n_layers)
        ])
        self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)

        if adaptive_softmax:
            self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
                d_model, vocab,
                cutoffs=[round(vocab / 15), 3 * round(vocab / 15)],
                # cutoffs=[vocab // 25, 3 * vocab // 5],
                div_value=4.0)
            self.output = None
        else:
            self.adaptive_softmax = None
            self.output = Linear(d_model, vocab)

            # Optionally tie weights as in:
            # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
            # https://arxiv.org/abs/1608.05859
            # and
            # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
            # https://arxiv.org/abs/1611.01462
            if tie_embedding:
                self.output.fc.weight = self.embed.embed.weight

    # Initialize parameters
    self.reset_parameters()
def __init__(self, args, save_path=None):
    super(LMBase, self).__init__()
    logger.info(self.__class__.__name__)

    self.lm_type = args.lm_type
    self.save_path = save_path

    self.emb_dim = args.emb_dim
    self.n_units = args.n_units
    self.n_layers = args.n_layers
    self.lsm_prob = args.lsm_prob

    self.vocab = args.vocab
    self.eos = 2
    self.pad = 3
    # NOTE: reserved in advance

    # for cache
    self.cache_theta = 0.2  # smoothing parameter
    self.cache_lambda = 0.2  # cache weight
    self.cache_ids = []
    self.cache_keys = []
    self.cache_attn = []
    self.embed_cache = None

    self.embed = nn.Embedding(self.vocab, args.emb_dim, padding_idx=self.pad)
    self.dropout_embed = nn.Dropout(p=args.dropout_in)

    model_size = args.lm_type.replace('gated_conv_', '')

    blocks = OrderedDict()
    dropout = args.dropout_hidden
    if model_size == 'custom':
        blocks['conv1'] = ConvGLUBlock(args.kernel_size, args.emb_dim, args.n_units,
                                       bottlececk_dim=args.n_projs, dropout=dropout)
        for lth in range(args.n_layers - 1):
            blocks['conv%d' % (lth + 2)] = ConvGLUBlock(args.kernel_size, args.n_units,
                                                        args.n_units,
                                                        bottlececk_dim=args.n_projs,
                                                        dropout=dropout)
        last_dim = args.n_units
    elif model_size == '8':
        blocks['conv1'] = ConvGLUBlock(4, args.emb_dim, 900, dropout=dropout)
        for i in range(1, 8, 1):
            blocks['conv2-%d' % i] = ConvGLUBlock(4, 900, 900, dropout=dropout)
        last_dim = 900
    elif model_size == '8B':
        blocks['conv1'] = ConvGLUBlock(1, args.emb_dim, 512, dropout=dropout)
        for i in range(1, 4, 1):
            blocks['conv2-%d' % i] = ConvGLUBlock(5, 512, 512,
                                                  bottlececk_dim=128, dropout=dropout)
        for i in range(1, 4, 1):
            blocks['conv3-%d' % i] = ConvGLUBlock(5, 512, 512,
                                                  bottlececk_dim=256, dropout=dropout)
        blocks['conv4'] = ConvGLUBlock(1, 512, 2048,
                                       bottlececk_dim=1024, dropout=dropout)
        last_dim = 2048
    elif model_size == '9':
        blocks['conv1'] = ConvGLUBlock(4, args.emb_dim, 807, dropout=dropout)
        for i in range(1, 4, 1):
            blocks['conv2-%d-1' % i] = ConvGLUBlock(4, 807, 807, dropout=dropout)
            blocks['conv2-%d-2' % i] = ConvGLUBlock(4, 807, 807, dropout=dropout)
        last_dim = 807
    elif model_size == '13':
        blocks['conv1'] = ConvGLUBlock(4, args.emb_dim, 1268, dropout=dropout)
        for i in range(1, 13, 1):
            blocks['conv2-%d' % i] = ConvGLUBlock(4, 1268, 1268, dropout=dropout)
        last_dim = 1268
    elif model_size == '14':
        for i in range(1, 4, 1):
            blocks['conv1-%d' % i] = ConvGLUBlock(6, args.emb_dim if i == 1 else 850,
                                                  850, dropout=dropout)
        blocks['conv2'] = ConvGLUBlock(1, 850, 850, dropout=dropout)
        for i in range(1, 5, 1):
            blocks['conv3-%d' % i] = ConvGLUBlock(5, 850, 850, dropout=dropout)
        blocks['conv4'] = ConvGLUBlock(1, 850, 850, dropout=dropout)
        for i in range(1, 4, 1):
            blocks['conv5-%d' % i] = ConvGLUBlock(4, 850, 850, dropout=dropout)
        blocks['conv6'] = ConvGLUBlock(4, 850, 1024, dropout=dropout)
        blocks['conv7'] = ConvGLUBlock(4, 1024, 2048, dropout=dropout)
        last_dim = 2048
    elif model_size == '14B':
        blocks['conv1'] = ConvGLUBlock(5, args.emb_dim, 512, dropout=dropout)
        for i in range(1, 4, 1):
            blocks['conv2-%d' % i] = ConvGLUBlock(5, 512, 512,
                                                  bottlececk_dim=128, dropout=dropout)
        for i in range(1, 4, 1):
            blocks['conv3-%d' % i] = ConvGLUBlock(5, 512 if i == 1 else 1024, 1024,
                                                  bottlececk_dim=512, dropout=dropout)
        for i in range(1, 7, 1):
            blocks['conv4-%d' % i] = ConvGLUBlock(5, 1024 if i == 1 else 2048, 2048,
                                                  bottlececk_dim=1024, dropout=dropout)
        blocks['conv5'] = ConvGLUBlock(5, 2048, 4096,
                                       bottlececk_dim=1024, dropout=dropout)
        last_dim = 4096
    else:
        raise NotImplementedError(model_size)

    self.blocks = nn.Sequential(blocks)

    if args.adaptive_softmax:
        self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
            last_dim, self.vocab,
            # cutoffs=[self.vocab // 10, 3 * self.vocab // 10],
            cutoffs=[self.vocab // 25, self.vocab // 5],
            div_value=4.0)
        self.output = None
    else:
        self.adaptive_softmax = None
        self.output = nn.Linear(last_dim, self.vocab)

        if args.tie_embedding:
            if args.n_units != args.emb_dim:
                raise ValueError(
                    'When using the tied flag, n_units must be equal to emb_dim.')
            self.output.weight = self.embed.weight

    self.reset_parameters(args.param_init)
def __init__(self, args, save_path=None):
    super(LMBase, self).__init__()
    logger.info(self.__class__.__name__)

    self.lm_type = args.lm_type
    self.save_path = save_path

    self.emb_dim = args.emb_dim
    self.rnn_type = args.lm_type
    assert args.lm_type in ['lstm', 'gru']
    self.n_units = args.n_units
    self.n_projs = args.n_projs
    self.n_layers = args.n_layers
    self.residual = args.residual
    self.n_units_cv = args.n_units_null_context
    self.lsm_prob = args.lsm_prob

    self.vocab = args.vocab
    self.eos = 2
    self.pad = 3
    # NOTE: reserved in advance

    # for cache
    self.cache_theta = 0.2  # smoothing parameter
    self.cache_lambda = 0.2  # cache weight
    self.cache_ids = []
    self.cache_keys = []
    self.cache_attn = []
    self.embed_cache = None

    self.embed = nn.Embedding(self.vocab, args.emb_dim, padding_idx=self.pad)
    self.dropout_emb = nn.Dropout(p=args.dropout_in)

    rnn = nn.LSTM if args.lm_type == 'lstm' else nn.GRU
    self.rnn = nn.ModuleList()
    self.dropout = nn.Dropout(p=args.dropout_hidden)
    if args.n_projs > 0:
        self.proj = repeat(nn.Linear(args.n_units, args.n_projs), args.n_layers)
    rnn_idim = args.emb_dim + args.n_units_null_context
    for _ in range(args.n_layers):
        self.rnn += [rnn(rnn_idim, args.n_units, 1, batch_first=True)]
        rnn_idim = args.n_units
        if args.n_projs > 0:
            rnn_idim = args.n_projs

    self.glu = None
    if args.use_glu:
        self.glu = LinearGLUBlock(rnn_idim)

    self._odim = rnn_idim

    self.adaptive_softmax = None
    self.output_proj = None
    self.output = None
    if args.adaptive_softmax:
        self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
            rnn_idim, self.vocab,
            # cutoffs=[self.vocab // 10, 3 * self.vocab // 10],
            cutoffs=[self.vocab // 25, self.vocab // 5],
            div_value=4.0)
    elif args.tie_embedding:
        if rnn_idim != args.emb_dim:
            self.output_proj = nn.Linear(rnn_idim, args.emb_dim)
            rnn_idim = args.emb_dim
            self._odim = rnn_idim
        self.output = nn.Linear(rnn_idim, self.vocab)
        self.output.weight = self.embed.weight
    else:
        self.output = nn.Linear(rnn_idim, self.vocab)

    self.reset_parameters(args.param_init)
def __init__(self, args, save_path=None):
    super(LMBase, self).__init__()
    logger = logging.getLogger('training')
    logger.info(self.__class__.__name__)

    self.save_path = save_path

    self.emb_dim = args.emb_dim
    self.rnn_type = args.lm_type
    assert args.lm_type in ['lstm', 'gru']
    self.n_units = args.n_units
    self.n_projs = args.n_projs
    self.n_layers = args.n_layers
    self.residual = args.residual
    self.use_glu = args.use_glu
    self.n_units_cv = args.n_units_null_context
    self.lsm_prob = args.lsm_prob

    self.vocab = args.vocab
    self.eos = 2
    self.pad = 3
    # NOTE: reserved in advance

    # for cache
    self.cache_theta = 0.2  # smoothing parameter
    self.cache_lambda = 0.2  # cache weight
    self.cache_ids = []
    self.cache_keys = []
    self.cache_attn = []

    self.embed = Embedding(vocab=self.vocab,
                           emb_dim=args.emb_dim,
                           dropout=args.dropout_in,
                           ignore_index=self.pad)

    rnn = nn.LSTM if args.lm_type == 'lstm' else nn.GRU
    self.rnn = nn.ModuleList()
    self.dropout = nn.ModuleList(
        [nn.Dropout(p=args.dropout_hidden) for _ in range(args.n_layers)])
    if args.n_projs > 0:
        self.proj = nn.ModuleList(
            [Linear(args.n_units, args.n_projs) for _ in range(args.n_layers)])
    rnn_idim = args.emb_dim + args.n_units_null_context
    for l in range(args.n_layers):
        self.rnn += [rnn(rnn_idim, args.n_units, 1,
                         bias=True,
                         batch_first=True,
                         dropout=0,
                         bidirectional=False)]
        rnn_idim = args.n_units
        if args.n_projs > 0:
            rnn_idim = args.n_projs

    if self.use_glu:
        self.fc_glu = Linear(rnn_idim, rnn_idim * 2,
                             dropout=args.dropout_hidden)

    if args.adaptive_softmax:
        self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
            rnn_idim, self.vocab,
            # cutoffs=[self.vocab // 10, 3 * self.vocab // 10],
            cutoffs=[self.vocab // 25, self.vocab // 5],
            div_value=4.0)
        self.output = None
    else:
        self.adaptive_softmax = None
        self.output = Linear(rnn_idim, self.vocab,
                             dropout=args.dropout_out)
        # NOTE: include bias even when tying weights

        # Optionally tie weights as in:
        # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
        # https://arxiv.org/abs/1608.05859
        # and
        # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
        # https://arxiv.org/abs/1611.01462
        if args.tie_embedding:
            if args.n_units != args.emb_dim:
                raise ValueError(
                    'When using the tied flag, n_units must be equal to emb_dim.')
            self.output.fc.weight = self.embed.embed.weight

    # Initialize parameters
    self.reset_parameters(args.param_init)

    # Recurrent weights are orthogonalized
    if args.rec_weight_orthogonal:
        self.reset_parameters(args.param_init, dist='orthogonal',
                              keys=['rnn', 'weight'])
ntokens = len(corpus.dictionary)
print("vocabulary size (ntokens): " + str(ntokens))

if not args.adaptivesoftmax:
    criterion = nn.CrossEntropyLoss().to(device)
else:
    print("Adaptive Softmax is on: the performance depends on the cutoff values. "
          "Check that the cutoffs are properly set.")
    print("Cutoffs: " + str(args.cutoffs))
    if args.cutoffs[-1] >= ntokens:
        raise ValueError(
            "the last element of the cutoff list must be lower than the vocab size of the dataset")
    criterion_adaptive = nn.AdaptiveLogSoftmaxWithLoss(
        args.nhid, ntokens, cutoffs=args.cutoffs).to(device)

model = rnn_models.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                            args.nlayers, args.dropout, args.tied,
                            use_cudnn_version=args.cudnn,
                            use_adaptive_softmax=args.adaptivesoftmax,
                            cutoffs=args.cutoffs).to(device)
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("model built, total trainable params: " + str(total_params))
if not args.cudnn:
def __init__(self, mem_slots, head_size, input_size, num_tokens, num_heads=1,
             num_blocks=1, forget_bias=1., input_bias=0., gate_style='unit',
             attention_mlp_layers=2, key_size=None, use_adaptive_softmax=False,
             cutoffs=None):
    super(RelationalMemory, self).__init__()

    ########## generic parameters for RMC ##########
    self.mem_slots = mem_slots
    self.head_size = head_size
    self.num_heads = num_heads
    self.mem_size = self.head_size * self.num_heads

    # a new fixed param needed for the pytorch port of RMC:
    # +1 is the concatenated input per time step; we do self-attention with the
    # concatenated memory & input, so if mem_slots = 1, this value is 2
    self.mem_slots_plus_input = self.mem_slots + 1

    if num_blocks < 1:
        raise ValueError('num_blocks must be >= 1. Got: {}.'.format(num_blocks))
    self.num_blocks = num_blocks

    if gate_style not in ['unit', 'memory', None]:
        raise ValueError(
            'gate_style must be one of [\'unit\', \'memory\', None]. Got: '
            '{}.'.format(gate_style))
    self.gate_style = gate_style

    if attention_mlp_layers < 1:
        raise ValueError(
            'attention_mlp_layers must be >= 1. Got: {}.'.format(attention_mlp_layers))
    self.attention_mlp_layers = attention_mlp_layers

    self.key_size = key_size if key_size else self.head_size

    ########## parameters for multihead attention ##########
    # value_size is the same as head_size
    self.value_size = self.head_size
    # total size for query-key-value
    self.qkv_size = 2 * self.key_size + self.value_size
    self.total_qkv_size = self.qkv_size * self.num_heads  # denoted as F

    # each head has a qkv-sized linear projector;
    # just using one big param is more efficient, rather than this line:
    # self.qkv_projector = [nn.Parameter(torch.randn((self.qkv_size, self.qkv_size))) for _ in range(self.num_heads)]
    self.qkv_projector = nn.Linear(self.mem_size, self.total_qkv_size)
    self.qkv_layernorm = nn.LayerNorm(
        [self.mem_slots_plus_input, self.total_qkv_size])

    # used for the attend_over_memory function
    # NOTE: list multiplication repeats the same nn.Linear instance,
    # so all attention MLP layers share one set of parameters
    self.attention_mlp = nn.ModuleList(
        [nn.Linear(self.mem_size, self.mem_size)] * self.attention_mlp_layers)
    self.attended_memory_layernorm = nn.LayerNorm(
        [self.mem_slots_plus_input, self.mem_size])
    self.attended_memory_layernorm2 = nn.LayerNorm(
        [self.mem_slots_plus_input, self.mem_size])

    ########## parameters for initial embedded input projection ##########
    self.input_size = input_size
    self.input_projector = nn.Linear(self.input_size, self.mem_size)

    ########## parameters for gating ##########
    self.num_gates = 2 * self.calculate_gate_size()
    self.input_gate_projector = nn.Linear(self.mem_size, self.num_gates)
    self.memory_gate_projector = nn.Linear(self.mem_size, self.num_gates)
    # trainable scalar gate bias tensors
    self.forget_bias = nn.Parameter(torch.tensor(forget_bias, dtype=torch.float32))
    self.input_bias = nn.Parameter(torch.tensor(input_bias, dtype=torch.float32))

    ########## parameters for token-to-embed & output-to-token logit for softmax ##########
    self.dropout = nn.Dropout()
    self.num_tokens = num_tokens
    self.token_to_input_encoder = nn.Embedding(self.num_tokens, self.input_size)

    # needs 2 linear layers for tying weights for embedding layers:
    # first match the "output" of the RMC to input_size, which is the embed dim
    self.output_to_embed_decoder = nn.Linear(self.mem_slots * self.mem_size,
                                             self.input_size)
    self.use_adaptive_softmax = use_adaptive_softmax
    if not self.use_adaptive_softmax:
        # then this layer's weight can be tied to the embedding layer
        self.embed_to_logit_decoder = nn.Linear(self.input_size, self.num_tokens)
        # tie embedding weights of encoder & decoder
        self.embed_to_logit_decoder.weight = self.token_to_input_encoder.weight

        ########## loss function ##########
        self.criterion = nn.CrossEntropyLoss()
    else:
        # use adaptive softmax from the self.input_size logits,
        # instead of the tied embed weights above
        self.criterion_adaptive = nn.AdaptiveLogSoftmaxWithLoss(
            self.input_size, self.num_tokens, cutoffs=cutoffs)
def __init__(self, args, save_path=None):
    super(LMBase, self).__init__()
    logger = logging.getLogger('training')
    logger.info(self.__class__.__name__)

    self.save_path = save_path

    self.emb_dim = args.emb_dim
    self.n_units = args.n_units
    self.n_layers = args.n_layers
    self.lsm_prob = args.lsm_prob

    self.vocab = args.vocab
    self.eos = 2
    self.pad = 3
    # NOTE: reserved in advance

    # for cache
    self.cache_theta = 0.2  # smoothing parameter
    self.cache_lambda = 0.2  # cache weight
    self.cache_ids = []
    self.cache_keys = []
    self.cache_attn = []

    self.embed = Embedding(vocab=self.vocab,
                           emb_dim=args.emb_dim,
                           dropout=args.dropout_in,
                           ignore_index=self.pad)

    model_size = args.lm_type.replace('gated_conv_', '')

    blocks = OrderedDict()
    if model_size == 'custom':
        blocks['conv1'] = GLUBlock(args.kernel_size, args.emb_dim, args.n_units,
                                   bottlececk_dim=args.n_projs,
                                   dropout=args.dropout_hidden)
        for l in range(args.n_layers - 1):
            blocks['conv%d' % (l + 2)] = GLUBlock(args.kernel_size, args.n_units,
                                                  args.n_units,
                                                  bottlececk_dim=args.n_projs,
                                                  dropout=args.dropout_hidden)
        last_dim = args.n_units
    elif model_size == '8':
        blocks['conv1'] = GLUBlock(4, args.emb_dim, 900, dropout=args.dropout_hidden)
        for i in range(1, 8, 1):
            blocks['conv2-%d' % i] = GLUBlock(4, 900, 900, dropout=args.dropout_hidden)
        last_dim = 900
    elif model_size == '8B':
        blocks['conv1'] = GLUBlock(1, args.emb_dim, 512, dropout=args.dropout_hidden)
        for i in range(1, 4, 1):
            blocks['conv2-%d' % i] = GLUBlock(5, 512, 512,
                                              bottlececk_dim=128,
                                              dropout=args.dropout_hidden)
        for i in range(1, 4, 1):
            blocks['conv3-%d' % i] = GLUBlock(5, 512, 512,
                                              bottlececk_dim=256,
                                              dropout=args.dropout_hidden)
        blocks['conv4'] = GLUBlock(1, 512, 2048,
                                   bottlececk_dim=1024,
                                   dropout=args.dropout_hidden)
        last_dim = 2048
    elif model_size == '9':
        blocks['conv1'] = GLUBlock(4, args.emb_dim, 807, dropout=args.dropout_hidden)
        for i in range(1, 4, 1):
            blocks['conv2-%d-1' % i] = GLUBlock(4, 807, 807, dropout=args.dropout_hidden)
            blocks['conv2-%d-2' % i] = GLUBlock(4, 807, 807, dropout=args.dropout_hidden)
        last_dim = 807
    elif model_size == '13':
        blocks['conv1'] = GLUBlock(4, args.emb_dim, 1268, dropout=args.dropout_hidden)
        for i in range(1, 13, 1):
            blocks['conv2-%d' % i] = GLUBlock(4, 1268, 1268, dropout=args.dropout_hidden)
        last_dim = 1268
    elif model_size == '14':
        for i in range(1, 4, 1):
            blocks['conv1-%d' % i] = GLUBlock(6, args.emb_dim if i == 1 else 850,
                                              850, dropout=args.dropout_hidden)
        blocks['conv2'] = GLUBlock(1, 850, 850, dropout=args.dropout_hidden)
        for i in range(1, 5, 1):
            blocks['conv3-%d' % i] = GLUBlock(5, 850, 850, dropout=args.dropout_hidden)
        blocks['conv4'] = GLUBlock(1, 850, 850, dropout=args.dropout_hidden)
        for i in range(1, 4, 1):
            blocks['conv5-%d' % i] = GLUBlock(4, 850, 850, dropout=args.dropout_hidden)
        blocks['conv6'] = GLUBlock(4, 850, 1024, dropout=args.dropout_hidden)
        blocks['conv7'] = GLUBlock(4, 1024, 2048, dropout=args.dropout_hidden)
        last_dim = 2048
    elif model_size == '14B':
        blocks['conv1'] = GLUBlock(5, args.emb_dim, 512, dropout=args.dropout_hidden)
        for i in range(1, 4, 1):
            blocks['conv2-%d' % i] = GLUBlock(5, 512, 512,
                                              bottlececk_dim=128,
                                              dropout=args.dropout_hidden)
        for i in range(1, 4, 1):
            blocks['conv3-%d' % i] = GLUBlock(5, 512 if i == 1 else 1024, 1024,
                                              bottlececk_dim=512,
                                              dropout=args.dropout_hidden)
        for i in range(1, 7, 1):
            blocks['conv4-%d' % i] = GLUBlock(5, 1024 if i == 1 else 2048, 2048,
                                              bottlececk_dim=1024,
                                              dropout=args.dropout_hidden)
        blocks['conv5'] = GLUBlock(5, 2048, 4096,
                                   bottlececk_dim=1024,
                                   dropout=args.dropout_hidden)
        last_dim = 4096
    else:
        raise NotImplementedError(model_size)

    self.blocks = nn.Sequential(blocks)

    if args.adaptive_softmax:
        self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
            last_dim, self.vocab,
            # cutoffs=[self.vocab // 10, 3 * self.vocab // 10],
            cutoffs=[self.vocab // 25, self.vocab // 5],
            div_value=4.0)
        self.output = None
    else:
        self.adaptive_softmax = None
        self.output = LinearND(last_dim, self.vocab,
                               dropout=args.dropout_out)
        # NOTE: include bias even when tying weights

        # Optionally tie weights as in:
        # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
        # https://arxiv.org/abs/1608.05859
        # and
        # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
        # https://arxiv.org/abs/1611.01462
        if args.tie_embedding:
            if args.n_units != args.emb_dim:
                raise ValueError(
                    'When using the tied flag, n_units must be equal to emb_dim.')
            self.output.fc.weight = self.embed.embed.weight

    # Initialize parameters
    self.reset_parameters(args.param_init)
def __init__(self, args, save_path=None):
    super(LMBase, self).__init__()
    logger.info(self.__class__.__name__)

    self.lm_type = args.lm_type
    self.save_path = save_path

    self.d_model = args.transformer_d_model
    self.n_layers = args.n_layers
    self.n_heads = args.transformer_n_heads
    self.lsm_prob = args.lsm_prob

    if args.mem_len > 0:
        self.mem_len = args.mem_len
    else:
        self.mem_len = args.bptt
    if args.recog_mem_len > 0:
        self.mem_len = args.recog_mem_len

    self.vocab = args.vocab
    self.eos = 2
    self.pad = 3
    # NOTE: reserved in advance

    # for cache
    self.cache_theta = 0.2  # smoothing parameter
    self.cache_lambda = 0.2  # cache weight
    self.cache_ids = []
    self.cache_keys = []
    self.cache_attn = []
    self.embed_cache = None

    # positional embedding
    self.pos_emb = XLPositionalEmbedding(self.d_model, args.dropout_in)
    self.u_bias = nn.Parameter(torch.Tensor(self.n_heads, self.d_model // self.n_heads))
    self.v_bias = nn.Parameter(torch.Tensor(self.n_heads, self.d_model // self.n_heads))
    # NOTE: u_bias and v_bias are global parameters

    self.embed = nn.Embedding(self.vocab, self.d_model, padding_idx=self.pad)
    self.scale = math.sqrt(self.d_model)  # for token embedding
    self.dropout_emb = nn.Dropout(p=args.dropout_in)  # for token embedding
    self.layers = nn.ModuleList([copy.deepcopy(TransformerDecoderBlock(
        self.d_model, args.transformer_d_ff, 'scaled_dot',
        self.n_heads, args.dropout_hidden, args.dropout_att, args.dropout_layer,
        args.transformer_layer_norm_eps, args.transformer_ffn_activation,
        args.transformer_param_init,
        src_tgt_attention=False, memory_transformer=True))
        for lth in range(self.n_layers)])
    self.norm_out = nn.LayerNorm(self.d_model, eps=args.transformer_layer_norm_eps)

    self.adaptive_softmax = None
    self.output = None
    if args.adaptive_softmax:
        self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
            self.d_model, self.vocab,
            cutoffs=[round(self.vocab / 15), 3 * round(self.vocab / 15)],
            # cutoffs=[self.vocab // 25, 3 * self.vocab // 5],
            div_value=4.0)
    else:
        self.output = nn.Linear(self.d_model, self.vocab)
        if args.tie_embedding:
            self.output.weight = self.embed.weight

    self.reset_parameters()
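# A minimal end-to-end sketch (shapes and sizes assumed for illustration) of
# the pattern shared by the modules above: calling the head with (features,
# targets) returns an (output, loss) named tuple for training, while
# log_prob() and predict() serve inference.
import torch
import torch.nn as nn

vocab_size, hidden = 10000, 256
asm = nn.AdaptiveLogSoftmaxWithLoss(
    hidden, vocab_size,
    cutoffs=[round(vocab_size / 15), 3 * round(vocab_size / 15)],
    div_value=4.0)

features = torch.randn(32, hidden)             # (batch, in_features)
targets = torch.randint(0, vocab_size, (32,))  # (batch,)
out, loss = asm(features, targets)             # out: log p(target) per sample
loss.backward()                                # loss: mean negative log-likelihood

log_probs = asm.log_prob(features)             # (batch, vocab_size) full distribution
preds = asm.predict(features)                  # highest-probability class per sample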