def forward(self,  # pylint: disable=arguments-differ
            inputs: PackedSequence,
            # pylint: disable=unused-argument
            initial_state: torch.Tensor = None) -> Tuple[PackedSequence, torch.Tensor]:
    """
    Parameters
    ----------
    inputs : ``PackedSequence``, required.
        A batch-first ``PackedSequence`` to run the stacked LSTM over.
    initial_state : Tuple[torch.Tensor, torch.Tensor], optional, (default = None)
        Currently, this is ignored.

    Returns
    -------
    output_sequence : ``PackedSequence``
        The encoded sequence of shape (batch_size, sequence_length, hidden_size).
    final_states : ``torch.Tensor``
        The per-layer final (state, memory) states of the LSTM, each with shape
        (num_layers, batch_size, hidden_size).
    """
    inputs, lengths = pad_packed_sequence(inputs, batch_first=True)

    # The kernel takes sequence-length-first tensors.
    inputs = inputs.transpose(0, 1)

    sequence_length, batch_size, _ = inputs.size()
    accumulator_shape = [self.num_layers, sequence_length + 1, batch_size, self.hidden_size]
    state_accumulator = Variable(inputs.data.new(*accumulator_shape).zero_(),
                                 requires_grad=False)
    memory_accumulator = Variable(inputs.data.new(*accumulator_shape).zero_(),
                                  requires_grad=False)

    dropout_weights = inputs.data.new().resize_(self.num_layers, batch_size,
                                                self.hidden_size).fill_(1.0)
    if self.training:
        # Normalize by 1 - dropout_prob to preserve the output statistics of the layer.
        dropout_weights.bernoulli_(1 - self.recurrent_dropout_probability)\
                       .div_(1 - self.recurrent_dropout_probability)
    dropout_weights = Variable(dropout_weights, requires_grad=False)

    gates = Variable(inputs.data.new().resize_(self.num_layers,
                                               sequence_length,
                                               batch_size,
                                               6 * self.hidden_size))
    lengths_variable = Variable(torch.IntTensor(lengths))

    implementation = _AlternatingHighwayLSTMFunction(self.input_size,
                                                     self.hidden_size,
                                                     num_layers=self.num_layers,
                                                     train=self.training)
    output, _ = implementation(inputs, self.weight, self.bias, state_accumulator,
                               memory_accumulator, dropout_weights,
                               lengths_variable, gates)

    # TODO(Mark): Also return the state here by using index_select with the lengths,
    # so we can use this as a Seq2VecEncoder.
    output = output.transpose(0, 1)
    output = pack_padded_sequence(output, lengths, batch_first=True)
    return output, None
def _cudaize_packed(t):
    data = Variable(t.data)
    if torch.cuda.is_available():
        data = data.cuda()
    return PackedSequence(data, t.batch_sizes)
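# Usage sketch for _cudaize_packed (illustrative only, not part of the original
# source; assumes torch, Variable, PackedSequence and pack_padded_sequence are
# imported as in the surrounding snippets):
def _example_cudaize_packed():
    padded = torch.randn(2, 5, 8)  # (batch, max_len, features)
    packed = pack_padded_sequence(padded, [5, 3], batch_first=True)
    # Rewraps .data as a (CUDA) Variable while keeping batch_sizes intact.
    return _cudaize_packed(packed)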
def forward(self, node_features, edge_features, neighbor_indices,
            neighbor_masks, h=None, c=None):
    """
    Update node_features and edge_features via graph convolution and pooling.

    Most of the complexity arises from the need to deal with a different
    number of neighbors for each atom. We use PackedSequence,
    pad_packed_sequence and pack_padded_sequence from torch.nn.utils.rnn to
    realize the transition.

    Args:
        node_features (Tensor, (batch_size, node_embedding_len)):
        edge_features (Tensor, (batch_size, neighbor_len, edge_embedding_len)):
        neighbor_indices (Tensor, (batch_size, neighbor_len)):
        neighbor_masks (Tensor, (batch_size, neighbor_len)):
        h: hidden state for the LSTM remember function
        c: cell state for the LSTM remember function

    Returns:
        node_features_updated, edge_features_updated, (h, c)
    """
    batch_len, neighbor_len, _ = edge_features.shape

    # Calculate the neighbor length of each atom (in the batch) from
    # neighbor_masks. In the fixed-length neighbor_masks, "1" marks a real
    # neighbor and "0" fills the void.
    neighbor_lens = neighbor_masks.sum(dim=1)

    # Make an atom with no neighbors in the batch a neighbor of itself.
    neighbor_masks[neighbor_lens == 0] = torch.Tensor(
        [1.] + [0.] * (neighbor_len - 1)).to(self.device)
    neighbor_lens[neighbor_lens == 0] = 1

    # Concat node_features, the neighbors' node_features and edge_features.
    pair_features = torch.cat([
        node_features.unsqueeze(1).expand(batch_len, neighbor_len,
                                          self.node_embedding_len),
        node_features[neighbor_indices, :]
    ], dim=2)
    concat_features = torch.cat((pair_features, edge_features), dim=2)

    # Change concat_features from fixed-length to variable-length sequences.
    packed_concat_features = pack_padded_sequence(
        concat_features, neighbor_lens, batch_first=True, enforce_sorted=False)

    # Update edge_features and change back to fixed-length sequences.
    edge_features_updated, _ = pad_packed_sequence(
        PackedSequence(
            self.edge_bn(self.edge_linear(packed_concat_features.data)),
            packed_concat_features.batch_sizes,
            packed_concat_features.sorted_indices,
            packed_concat_features.unsorted_indices),
        batch_first=True, total_length=neighbor_len)

    # Use a residual link in the edge feature update.
    # edge_features_updated = self.activation(edge_features + padding_tensor(
    #     edge_features_updated, neighbor_len, batch_len, self.device))
    edge_features_updated = self.activation(edge_features + edge_features_updated)

    # Update packed_concat_features.
    packed_concat_features = pack_padded_sequence(
        torch.cat((pair_features, edge_features_updated), dim=2),
        neighbor_lens, batch_first=True, enforce_sorted=False)

    # Calculate multi-head features for nodes.
    head_features_list = list()
    for attention_linear, value_linear, attention_bn in zip(
            self.attention_linears, self.value_linears, self.attention_bns):
        # Apply attention_linear to packed_concat_features.
        head_attention, _ = pad_packed_sequence(
            PackedSequence(
                attention_linear(packed_concat_features.data),
                packed_concat_features.batch_sizes,
                packed_concat_features.sorted_indices,
                packed_concat_features.unsorted_indices),
            batch_first=True, total_length=neighbor_len)

        # Masked softmax: calculate the standard softmax while ignoring padded values.
        masked_attention = head_attention[:, :, -1:].masked_fill(
            (1 - neighbor_masks.unsqueeze(2)).bool(), float('-inf'))
        head_attention = self.attention_softmax(masked_attention)

        # Change head_attention to a variable-length PackedSequence.
        packed_head_attentions = pack_padded_sequence(
            head_attention, neighbor_lens, batch_first=True, enforce_sorted=False)
        packed_head_values = PackedSequence(
            value_linear(packed_concat_features.data),
            packed_concat_features.batch_sizes,
            packed_concat_features.sorted_indices,
            packed_concat_features.unsorted_indices)

        # head_features tensor.
        head_features = self.activation(
            attention_bn(
                self.attention_drop_layer(packed_head_attentions.data) *
                packed_head_values.data))

        # Change head_features back to a fixed-length tensor.
        head_features, _ = pad_packed_sequence(
            PackedSequence(
                head_features,
                packed_head_attentions.batch_sizes,
                packed_head_attentions.sorted_indices,
                packed_head_attentions.unsorted_indices),
            batch_first=True, total_length=neighbor_len)

        # Use sum pooling over neighbors as the default.
        pooled_head_features = torch.sum(head_features, dim=1)
        head_features_list.append(pooled_head_features)

    # Concat multi-head node_features.
    concat_heads_features = torch.cat(head_features_list, dim=1)

    # If n_head * attention_len != node_embedding_len.
    if self.attention_out_linear is not None:
        node_features_updated = self.output_bn(
            self.activation(
                self.after_concat_heads_bn(
                    self.after_concat_heads_linear(concat_heads_features))))
    else:
        node_features_updated = self.output_bn(concat_heads_features)

    if self.remember_func == "residual":
        node_features_updated = node_features + node_features_updated
    elif self.remember_func == "lstm":
        node_features_updated, (h, c) = self.lstm_func(
            node_features_updated[None, :], h, c)
        node_features_updated = node_features_updated[0]
        node_features_updated = self.lstm_bn(node_features_updated)
    else:
        raise ValueError("remember_func invalid.")

    return node_features_updated, edge_features_updated, (h, c)
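# The pattern above (run a pointwise layer on packed.data, rewrap the result in
# a new PackedSequence with the same batch_sizes/indices, then pad back) recurs
# throughout this file. A minimal, self-contained sketch of just that pattern,
# with hypothetical shapes:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import (PackedSequence, pack_padded_sequence,
                                pad_packed_sequence)

def apply_pointwise_to_packed(layer: nn.Module,
                              packed: PackedSequence) -> PackedSequence:
    # The layer sees one flat (total_steps, features) tensor, so no compute is
    # wasted on padding positions.
    return PackedSequence(layer(packed.data), packed.batch_sizes,
                          packed.sorted_indices, packed.unsorted_indices)

padded = torch.randn(4, 7, 16)  # (batch, max_len, features)
packed = pack_padded_sequence(padded, [7, 5, 3, 2],
                              batch_first=True, enforce_sorted=False)
packed = apply_pointwise_to_packed(nn.Linear(16, 32), packed)
repadded, lengths = pad_packed_sequence(packed, batch_first=True)  # (4, 7, 32)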
def forward(self, input, hx=None):
    is_packed = isinstance(input, PackedSequence)
    # If packed, the input carries the max_batch_size information.
    if is_packed:
        input, batch_sizes = input
        batch_size = batch_sizes[0]
    else:
        batch_sizes = None
        batch_size = input.size(0) if self.batch_first else input.size(1)

    # If the user doesn't provide hx and cx, zero tensors are created.
    if hx is None:
        num_directions = 2 if self.bidirectional else 1
        hx = torch.autograd.Variable(input.data.new(
            self.num_layers * num_directions,
            batch_size,
            self.hidden_size).zero_(), requires_grad=False)
        if self.mode == 'LSTM':
            # LSTM requires a tuple in hx.
            hx = (hx, hx)

    has_flat_weights = None
    # = list(p.data.data_ptr() for p in self.parameters()) == self._data_ptrs

    # TODO: add an assert to avoid shape mismatches.
    # Get all weights from self.parameters().
    seq_length = input.size(0)
    weight_idx = 1
    bx = None
    bh = None
    for weight in self.parameters():
        if weight_idx == 1:
            wx = weight
        elif weight_idx == 2:
            wh = weight
        elif weight_idx == 3:
            bx = weight
        elif weight_idx == 4:
            bh = weight
        weight_idx = weight_idx + 1

    # PyTorch assumes that all parameters passed to Function.forward() are
    # instances of "Variable", and it doesn't accept NoneType. Make it happy.
    if bx is None:
        bx = Variable(torch.Tensor((0)))
    if bh is None:
        bh = Variable(torch.Tensor((0)))

    # Check whether the input seq_length or batch_size exceeds the max values
    # recorded so far.
    if seq_length > self.max_seq_length:
        self.max_seq_length = seq_length
        self.update_workspace = True
    if batch_size > self.max_batch_size:
        self.max_batch_size = batch_size
        self.update_workspace = True

    # Update the workspace on the first call, or whenever the limits grow.
    if self.update_workspace:
        buffer_size = get_workspace_size(self.mode, self.training,
                                         self.num_layers, self.bidirectional,
                                         self.max_seq_length,
                                         self.max_batch_size,
                                         self.input_size, self.hidden_size)
        self.workspace = Variable(torch.zeros(buffer_size), requires_grad=False)
        self.update_workspace = False

    _func = self.IRNNFunc
    if self.mode == 'LSTM':
        cx = hx[1]
        hx = hx[0]
        self.y, self.hy, self.cy = _func(self.workspace, input, hx, cx,
                                         wx, wh, bx, bh)
        if is_packed:
            # Rewrap the output so callers get back the format they passed in.
            output = PackedSequence(self.y, batch_sizes)
            return output, (self.hy, self.cy)
        return self.y, (self.hy, self.cy)
    elif self.mode == 'GRU':
        self.y, self.hy = _func(self.workspace, input, hx, wx, wh, bx, bh)
        if is_packed:
            output = PackedSequence(self.y, batch_sizes)
            return output, self.hy
        return self.y, self.hy
def forward(self, docs, doc_lengths, sent_lengths, attention_masks, token_type_ids):
    """
    :param docs: encoded document-level data; LongTensor (num_docs, padded_doc_length, padded_sent_length)
    :param doc_lengths: unpadded document lengths; LongTensor (num_docs)
    :param sent_lengths: unpadded sentence lengths; LongTensor (num_docs, padded_doc_length)
    :param attention_masks: BERT attention masks; LongTensor (num_docs, padded_doc_length, padded_sent_length)
    :param token_type_ids: BERT token type IDs; LongTensor (num_docs, padded_doc_length, padded_sent_length)
    :return: sentence embeddings, docs permutation indices, docs batch sizes, word attention weights
    """
    # Sort documents by decreasing length.
    doc_lengths, doc_perm_idx = doc_lengths.sort(dim=0, descending=True)
    docs = docs[doc_perm_idx]
    sent_lengths = sent_lengths[doc_perm_idx]

    # Make a long batch of sentences by removing pad-sentences,
    # i.e. `docs` was of size (num_docs, padded_doc_length, padded_sent_length)
    # -> `packed_sents.data` is now of size (num_sents, padded_sent_length).
    packed_sents = pack_padded_sequence(docs, lengths=doc_lengths.tolist(),
                                        batch_first=True)

    # Effective batch size at each timestep.
    docs_valid_bsz = packed_sents.batch_sizes

    # Make a long batch of sentence lengths by removing pad-sentences,
    # i.e. `sent_lengths` was of size (num_docs, padded_doc_length)
    # -> `packed_sent_lengths.data` is now of size (num_sents).
    packed_sent_lengths = pack_padded_sequence(
        sent_lengths, lengths=doc_lengths.tolist(), batch_first=True)

    # Make a long batch of attention masks by removing pad-sentences,
    # i.e. `attention_masks` was of size (num_docs, padded_doc_length, padded_sent_length)
    # -> `packed_attention_masks.data` is now of size (num_sents, padded_sent_length).
    packed_attention_masks = pack_padded_sequence(
        attention_masks, lengths=doc_lengths.tolist(), batch_first=True)

    # Make a long batch of token_type_ids by removing pad-sentences,
    # i.e. `token_type_ids` was of size (num_docs, padded_doc_length, padded_sent_length)
    # -> `packed_token_type_ids.data` is now of size (num_sents, padded_sent_length).
    packed_token_type_ids = pack_padded_sequence(
        token_type_ids, lengths=doc_lengths.tolist(), batch_first=True)

    sents, sent_lengths, attn_masks, token_types = (
        packed_sents.data, packed_sent_lengths.data,
        packed_attention_masks.data, packed_token_type_ids.data)

    # Sort sentences by decreasing sentence length, keeping the masks and
    # token types aligned with the re-ordered sentences.
    sent_lengths, sent_perm_idx = sent_lengths.sort(dim=0, descending=True)
    sents = sents[sent_perm_idx]
    attn_masks = attn_masks[sent_perm_idx]
    token_types = token_types[sent_perm_idx]

    embeddings, pooled_out = self.bert_model(sents, attention_mask=attn_masks,
                                             token_type_ids=token_types)

    packed_words = pack_padded_sequence(embeddings,
                                        lengths=sent_lengths.tolist(),
                                        batch_first=True)

    # Effective batch size at each timestep.
    sentences_valid_bsz = packed_words.batch_sizes

    u_i = torch.tanh(self.word_weight(packed_words.data))
    u_w = self.context_weight(u_i).squeeze(1)
    val = u_w.max()
    att = torch.exp(u_w - val)

    # Restore as sentences by re-padding.
    att, _ = pad_packed_sequence(PackedSequence(att, sentences_valid_bsz),
                                 batch_first=True)
    att_weights = att / torch.sum(att, dim=1, keepdim=True)

    # Restore as sentences by re-padding.
    sents, _ = pad_packed_sequence(packed_words, batch_first=True)

    sents = sents * att_weights.unsqueeze(2)
    sents = sents.sum(dim=1)

    # Restore the original order of sentences (undo the sentence sorting).
    _, sent_unperm_idx = sent_perm_idx.sort(dim=0, descending=False)
    sents = sents[sent_unperm_idx]
    att_weights = att_weights[sent_unperm_idx]

    return sents, doc_perm_idx, docs_valid_bsz, att_weights
def forward(self, word, word_mask, wordchars, wordchars_mask, upos, xpos,
            ufeats, pretrained, word_orig_idx, sentlens, wordlens,
            orig_idx=None, morph_dict=None, start=None, end=None):

    def pack(x):
        # Packs a tensor containing padded sequences of variable length.
        return pack_padded_sequence(x, sentlens, batch_first=True)

    inputs = []
    if self.args['word_emb_dim'] > 0:
        word_emb = self.word_emb(word)
        word_emb = pack(word_emb)
        inputs += [word_emb]

    if self.args['pretrain']:
        pretrained_emb = self.pretrained_emb(pretrained)
        pretrained_emb = self.trans_pretrained(pretrained_emb)
        pretrained_emb = pack(pretrained_emb)
        inputs += [pretrained_emb]

    def pad(x):
        # Inverse operation to pack_padded_sequence(): pads a packed batch of
        # variable-length sequences.
        return pad_packed_sequence(PackedSequence(x, word_emb.batch_sizes),
                                   batch_first=True)[0]

    if self.args['char'] and self.args['char_emb_dim'] > 0:
        char_reps = self.charmodel(wordchars, wordchars_mask, word_orig_idx,
                                   sentlens, wordlens)
        char_reps = PackedSequence(self.trans_char(self.drop(char_reps.data)),
                                   char_reps.batch_sizes)
        inputs += [char_reps]

    lstm_inputs = torch.cat([x.data for x in inputs], 1)
    lstm_inputs = self.worddrop(lstm_inputs, self.drop_replacement)
    lstm_inputs = self.drop(lstm_inputs)
    lstm_inputs = PackedSequence(lstm_inputs, inputs[0].batch_sizes)

    lstm_outputs, _ = self.taggerlstm(lstm_inputs, sentlens, hx=(
        self.taggerlstm_h_init.expand(2 * self.args['num_layers'], word.size(0),
                                      self.args['hidden_dim']).contiguous(),
        self.taggerlstm_c_init.expand(2 * self.args['num_layers'], word.size(0),
                                      self.args['hidden_dim']).contiguous()))
    lstm_outputs = lstm_outputs.data

    upos_hid = F.relu(self.upos_hid(self.drop(lstm_outputs)))
    upos_pred = self.upos_clf(self.drop(upos_hid))

    preds = [pad(upos_pred).max(2)[1]]

    upos = pack(upos).data
    loss = self.crit(upos_pred.view(-1, upos_pred.size(-1)), upos.view(-1))

    if self.share_hid:
        xpos_hid = upos_hid
        ufeats_hid = upos_hid
        clffunc = lambda clf, hid: clf(self.drop(hid))
    else:
        xpos_hid = F.relu(self.xpos_hid(self.drop(lstm_outputs)))
        ufeats_hid = F.relu(self.ufeats_hid(self.drop(lstm_outputs)))

        # This is where we get the upos embeddings.
        if self.training:
            upos_emb = self.upos_emb(upos)
        else:
            # Get the top-5 upos predictions.
            best_5 = [sorted(range(len(x)), key=lambda i: x[i], reverse=True)[:5]
                      for x in upos_pred]
            # Save the upos embedding layer for later.
            upos_temp = self.upos_emb
            upos_emb = self.upos_emb(upos_pred.max(1)[1])

        clffunc = lambda clf, hid: clf(self.drop(hid), self.drop(upos_emb))

    xpos = pack(xpos).data
    if isinstance(self.vocab['xpos'], CompositeVocab):
        xpos_preds = []
        for i in range(len(self.vocab['xpos'])):
            xpos_pred = clffunc(self.xpos_clf[i], xpos_hid)
            loss += self.crit(xpos_pred.view(-1, xpos_pred.size(-1)),
                              xpos[:, i].view(-1))
            xpos_preds.append(pad(xpos_pred).max(2, keepdim=True)[1])
        preds.append(torch.cat(xpos_preds, 2))
    else:
        xpos_pred = clffunc(self.xpos_clf, xpos_hid)
        loss += self.crit(xpos_pred.view(-1, xpos_pred.size(-1)), xpos.view(-1))
        preds.append(pad(xpos_pred).max(2)[1])

    ufeats_preds = []
    ufeats = pack(ufeats).data
    for i in range(len(self.vocab['feats'])):
        ufeats_pred = clffunc(self.ufeats_clf[i], ufeats_hid)
        loss += self.crit(ufeats_pred.view(-1, ufeats_pred.size(-1)),
                          ufeats[:, i].view(-1))
        ufeats_preds.append(pad(ufeats_pred).max(2, keepdim=True)[1])
    preds.append(torch.cat(ufeats_preds, 2))

    # Post-filter only if a morphological dictionary is present.
    if morph_dict:
        # Get the most likely ufeats tag for each of the top-5 upos tags
        # predicted for a word.
        feats_coeffs = list()
        for r in range(5):
            # Condition ufeats on a different upos tag embedding each time.
            upos_2 = torch.LongTensor([x[r] for x in best_5])
            upos_emb2 = upos_temp(upos_2)
            clffunc_temp = lambda clf, hid: clf(self.drop(hid), self.drop(upos_emb2))
            ufeats_preds_temp = []
            for i in range(len(self.vocab['feats'])):
                ufeats_pred = clffunc_temp(self.ufeats_clf[i], ufeats_hid)
                ufeats_preds_temp.append(pad(ufeats_pred).max(2, keepdim=True)[1])
            feats_coeffs.append(torch.cat(ufeats_preds_temp, 2))

        # Unmap all tags into a readable format and unsort them into the
        # original order that matches the sentence order.
        upos_seqs = [self.vocab['upos'].unmap(up) for up in preds[0].tolist()]
        xpos_seqs = [self.vocab['xpos'].unmap(up) for up in preds[1].tolist()]
        feats_seqs = [self.vocab['feats'].unmap(up) for up in preds[2].tolist()]
        pred_tokens = [[[upos_seqs[i][j], xpos_seqs[i][j], feats_seqs[i][j]]
                        for j in range(sentlens[i])] for i in range(word.size(0))]
        pred_tokens = utils.unsort(pred_tokens, orig_idx)

        # Pair the tags with the right words in the right sentences.
        sntncs = self.doc.sentences[start:end]
        sent_tokens = [[x.text for x in sent.tokens] for sent in sntncs]
        pair = [x for x in zip(sent_tokens, pred_tokens)]

        # The 5 most likely upos tags for each token.
        coeff = utils.unsort(pad(upos_pred).tolist(), orig_idx)
        coeff_max = [[sorted(range(len(x)), key=lambda i: x[i], reverse=True)[:5]
                      for x in y] for y in coeff]

        # The most likely feats tag for each of the top-5 predicted upos tags.
        fct = []
        for f in feats_coeffs:
            fct.append(utils.unsort(f, orig_idx))
        fct2 = [list(zip(*[fct[0][i], fct[1][i], fct[2][i], fct[3][i], fct[4][i]]))
                for i in range(len(fct[0]))]
        feats_coeffs = [[list(j[i]) for i in range(len(j))] for j in fct2]

        # Initialise hunspell for Lithuanian.
        if self.args['lang'] == 'lt':
            root = os.path.dirname(os.getcwd())
            hunspell = Hunchecker('lt-LT_morphology', root + '/data_files/hunspell')

        print('Post-filtering...')
        for p in range(len(pair)):
            # Get a sentence.
            words = pair[p][0]
            tags = pair[p][1]
            a = 0
            while a < len(words):
                lemma, upos, xpos, feats = morph_dict.find(words[a])
                if upos is None:
                    lemma, upos, xpos, feats = morph_dict.find(words[a].lower())
                else:
                    lemma2, upos2, xpos2, feats2 = morph_dict.find(words[a].lower())
                    if lemma2:
                        for i in range(len(lemma2)):
                            if upos2[i] not in upos or feats2[i] not in feats:
                                lemma += [lemma2[i]]
                                upos += [upos2[i]]
                                xpos += [xpos2[i]]
                                feats += [feats2[i]]

                if self.args['lang'] == 'lt':
                    if upos is None:
                        lemma, upos, xpos, feats = hunspell.hunspell_to_conll(words[a])
                    else:
                        lemma_h, upos_h, xpos_h, feats_h = hunspell.hunspell_to_conll(words[a])
                        if upos_h is not None:
                            for i in range(len(upos_h)):
                                if upos_h[i] not in upos or feats_h[i] not in feats:
                                    lemma += [lemma_h[i]]
                                    upos += [upos_h[i]]
                                    xpos += [xpos_h[i]]
                                    feats += [feats_h[i]]

                if upos is not None:
                    if tags[a][0] not in upos:
                        new_upos = None
                        tag_idx = None
                        if len(upos) > 1:
                            max_values = self.vocab['upos'].unmap(coeff_max[p][a][1:])
                            # Go through the values in order of likelihood.
                            for m in range(len(max_values)):  # for every max upos tag
                                # Found one of the possible predicted values in
                                # the upos list.
                                if max_values[m] in upos:
                                    indices = [i for i, x in enumerate(upos)
                                               if x == max_values[m]]
                                    if len(indices) > 1:
                                        # More than one upos list item matches
                                        # the max value; check whether an exact
                                        # match can be found, using the most
                                        # informative ufeats tag.
                                        for d in indices:
                                            if feats[d] == self.vocab['feats'].unmap(
                                                    feats_coeffs[p][a][1:])[m] and \
                                                    upos[d] == max_values[m]:
                                                new_upos = upos[d]
                                                tag_idx = d
                                                break
                                    if len(indices) == 1 or new_upos is None:
                                        new_upos = max_values[m]
                                        tag_idx = upos.index(max_values[m])
                                    break
                            if new_upos is None:
                                # Last resort.
                                new_upos = upos[0]
                                tag_idx = 0
                        else:
                            # Only one item in the upos list.
                            new_upos = upos[0]
                            tag_idx = 0
                        new_xpos = xpos[tag_idx]
                        new_feats = feats[tag_idx]

                        # Let the tagger deal with multiword tokens itself.
                        if ('Hyph=Yes' not in new_feats and 'Hyph=Yes' in tags[a][2]) or (
                                'Hyph=Yes' in new_feats and 'Hyph=Yes' not in tags[a][2]):
                            new_upos = new_xpos = new_feats = None

                        if new_upos is not None:
                            preds[0][orig_idx.index(p)][a] = self.vocab['upos'].map([new_upos])[0]
                            # sme has a 2D tensor here, LT has 3D.
                            if not isinstance(self.vocab['xpos'], CompositeVocab):
                                preds[1][orig_idx.index(p)][a] = self.vocab['xpos'].map([new_xpos])[0]
                            else:
                                preds[1][orig_idx.index(p)][a] = torch.LongTensor(
                                    self.vocab['xpos'].map([new_xpos])[0])
                            preds[2][orig_idx.index(p)][a] = torch.LongTensor(
                                self.vocab['feats'].map([new_feats])[0])
                    else:
                        new_xpos = new_feats = None
                        all_found = False
                        for x in range(len(xpos)):
                            if tags[a][1] == xpos[x] and tags[a][2] == feats[x] and upos[x] == tags[a][0]:
                                all_found = True
                                break
                        if not all_found:
                            if len(upos) == 1 or (
                                    False not in [feats[a] == feats[a + 1]
                                                  for a in range(len(feats) - 1)] and
                                    False not in [upos[a] == upos[a + 1]
                                                  for a in range(len(upos) - 1)]):
                                new_feats = feats[0]
                                if '*' not in tags[a][1]:
                                    new_xpos = xpos[0]
                                all_found = True
                        if not all_found:
                            if len([i for i, x in enumerate(upos) if x == tags[a][0]]) == 1:
                                new_feats = feats[upos.index(tags[a][0])]
                                if '*' not in tags[a][1]:
                                    new_xpos = xpos[upos.index(tags[a][0])]
                                all_found = True
                        if not all_found:
                            found_ft = False
                            for x in range(len(xpos)):
                                if tags[a][2] == feats[x] and upos[x] == tags[a][0]:
                                    found_ft = True
                                    if xpos[x] != tags[a][1] and '*' not in tags[a][1]:
                                        new_xpos = xpos[x]
                                    break
                            if not found_ft:
                                for x in range(len(xpos)):
                                    if tags[a][1] == xpos[x] and tags[a][2] != feats[x] and upos[x] == tags[a][0]:
                                        new_feats = feats[x]
                                        break
                        if new_feats:
                            if ('Hyph=Yes' not in new_feats and 'Hyph=Yes' in tags[a][2]) or (
                                    'Hyph=Yes' in new_feats and 'Hyph=Yes' not in tags[a][2]):
                                # Let the tagger deal with multiword tokens itself.
                                new_xpos = new_feats = None
                        if new_xpos is not None:
                            # Non-composite has a 2D tensor here, composite has 3D.
                            if not isinstance(self.vocab['xpos'], CompositeVocab):
                                preds[1][orig_idx.index(p)][a] = self.vocab['xpos'].map([new_xpos])[0]
                            else:
                                preds[1][orig_idx.index(p)][a] = torch.LongTensor(
                                    self.vocab['xpos'].map([new_xpos])[0])
                        if new_feats is not None:
                            preds[2][orig_idx.index(p)][a] = torch.LongTensor(
                                self.vocab['feats'].map([new_feats])[0])
                a += 1
        print('Post-filtering complete.')

    return loss, preds
def forward(self, input_seqs):
    """ Forward pass.

    # Arguments:
        input_seqs: Can be one of Numpy array, Torch.LongTensor,
            Torch.Variable, Torch.PackedSequence.

    # Return:
        Same format as the input format (except for PackedSequence, which is
        returned as Variable).
    """
    # Check whether we have Torch.LongTensor inputs or not Torch.Variable
    # (assume a Numpy array in that case); take note so we can return the
    # same format.
    return_numpy = False
    return_tensor = False
    if isinstance(input_seqs, (torch.LongTensor, torch.cuda.LongTensor)):
        input_seqs = Variable(input_seqs)
        return_tensor = True
    elif not isinstance(input_seqs, Variable):
        input_seqs = Variable(torch.from_numpy(input_seqs.astype('int64')).long())
        return_numpy = True

    # If we don't have packed inputs, let's pack them.
    reorder_output = False
    if not isinstance(input_seqs, PackedSequence):
        ho = self.lstm_0.weight_hh_l0.data.new(2, input_seqs.size()[0],
                                               self.hidden_size).zero_()
        co = self.lstm_0.weight_hh_l0.data.new(2, input_seqs.size()[0],
                                               self.hidden_size).zero_()

        # Reorder the batch by sequence length.
        input_lengths = torch.LongTensor([
            torch.max(input_seqs[i, :].data.nonzero()) + 1
            for i in range(input_seqs.size()[0])
        ])
        input_lengths, perm_idx = input_lengths.sort(0, descending=True)
        input_seqs = input_seqs[perm_idx][:, :input_lengths.max()]

        # Pack the sequence and work on the data tensor to reduce
        # embedding/dropout computations.
        packed_input = pack_padded_sequence(input_seqs,
                                            input_lengths.cpu().numpy(),
                                            batch_first=True)
        reorder_output = True
    else:
        ho = self.lstm_0.weight_hh_l0.data.new(2, input_seqs.size()[0],
                                               self.hidden_size).zero_()
        co = self.lstm_0.weight_hh_l0.data.new(2, input_seqs.size()[0],
                                               self.hidden_size).zero_()
        input_lengths = input_seqs.batch_sizes
        packed_input = input_seqs

    hidden = (Variable(ho, requires_grad=False),
              Variable(co, requires_grad=False))

    # Embed with an activation function to bound the values of the embeddings.
    x = self.embed(packed_input.data)
    x = nn.Tanh()(x)

    # PyTorch's 2D dropout2d operates on axis 1, which is fine for us.
    x = self.embed_dropout(x)

    # Update the packed sequence data for the RNN.
    packed_input = PackedSequence(x, packed_input.batch_sizes)

    # The skip-connection from the embedding to the output eases gradient flow
    # and allows access to lower-level features. The ordering of the merge is
    # important for consistency with the pretrained model.
    lstm_0_output, _ = self.lstm_0(packed_input, hidden)
    lstm_1_output, _ = self.lstm_1(lstm_0_output, hidden)

    # Update the packed sequence data for the attention layer.
    packed_input = PackedSequence(
        torch.cat((lstm_1_output.data, lstm_0_output.data, packed_input.data), dim=1),
        packed_input.batch_sizes)

    input_seqs, _ = pad_packed_sequence(packed_input, batch_first=True)

    x, att_weights = self.attention_layer(input_seqs, input_lengths)

    # Output class probabilities or the penultimate feature vector.
    if not self.feature_output:
        x = self.final_dropout(x)
        outputs = self.output_layer(x)
    else:
        outputs = x

    # Reorder the output if needed.
    if reorder_output:
        reordered = Variable(outputs.data.new(outputs.size()))
        reordered[perm_idx] = outputs
        outputs = reordered

    # Adapt the return format if needed.
    if return_tensor:
        outputs = outputs.data
    if return_numpy:
        outputs = outputs.data.numpy()

    if self.return_attention:
        return outputs, att_weights
    else:
        return outputs
def main():
    import argparse
    parser = argparse.ArgumentParser('Script for training embedding model on SCOP.')

    parser.add_argument('--dev', action='store_true', help='use train/dev split')
    parser.add_argument('-m', '--model', choices=['ssa', 'ua', 'me'], default='ssa',
                        help='alignment scoring method for comparing sequences in embedding space '
                             '[ssa: soft symmetric alignment, ua: uniform alignment, me: mean embedding] (default: ssa)')
    parser.add_argument('--allow-insert', action='store_true', help='model insertions (default: false)')
    parser.add_argument('--norm', choices=['l1', 'l2'], default='l1', help='comparison norm (default: l1)')
    parser.add_argument('--rnn-type', choices=['lstm', 'gru'], default='lstm',
                        help='type of RNN block to use (default: lstm)')
    parser.add_argument('--embedding-dim', type=int, default=100, help='embedding dimension (default: 100)')
    parser.add_argument('--input-dim', type=int, default=512, help='dimension of input to RNN (default: 512)')
    parser.add_argument('--rnn-dim', type=int, default=512, help='hidden units of RNNs (default: 512)')
    parser.add_argument('--num-layers', type=int, default=3, help='number of RNN layers (default: 3)')
    parser.add_argument('--dropout', type=float, default=0, help='dropout probability (default: 0)')
    parser.add_argument('--epoch-size', type=int, default=100000, help='number of examples per epoch (default: 100,000)')
    parser.add_argument('--epoch-scale', type=int, default=5, help='scaling on epoch size (default: 5)')
    parser.add_argument('--num-epochs', type=int, default=100, help='number of epochs (default: 100)')
    parser.add_argument('--batch-size', type=int, default=64, help='minibatch size (default: 64)')
    parser.add_argument('--weight-decay', type=float, default=0, help='L2 regularization (default: 0)')
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--tau', type=float, default=0.5, help='sampling proportion exponent (default: 0.5)')
    parser.add_argument('--augment', type=float, default=0,
                        help='probability of resampling amino acid for data augmentation (default: 0)')
    parser.add_argument('--lm', help='pretrained LM to use as initial embedding')
    parser.add_argument('-o', '--output', help='output file path (default: stdout)')
    parser.add_argument('--save-prefix', help='path prefix for saving models')
    parser.add_argument('-d', '--device', type=int, default=-2, help='compute device to use')

    args = parser.parse_args()
    prefix = args.output

    ## set the device
    d = args.device
    use_cuda = (d != -1) and torch.cuda.is_available()
    if d >= 0:
        torch.cuda.set_device(d)

    ## make the datasets
    astral_train_path = 'data/SCOPe/astral-scopedom-seqres-gd-sel-gs-bib-95-2.06.train.fa'
    astral_testpairs_path = 'data/SCOPe/astral-scopedom-seqres-gd-sel-gs-bib-95-2.06.test.sampledpairs.txt'
    if args.dev:
        astral_train_path = 'data/SCOPe/astral-scopedom-seqres-gd-sel-gs-bib-95-2.06.train.train.fa'
        astral_testpairs_path = 'data/SCOPe/astral-scopedom-seqres-gd-sel-gs-bib-95-2.06.train.dev.sampledpairs.txt'

    alphabet = Uniprot21()

    print('# loading training sequences:', astral_train_path, file=sys.stderr)
    with open(astral_train_path, 'rb') as f:
        names_train, structs_train, sequences_train = scop.parse_astral(f, encoder=alphabet)
    x_train = [torch.from_numpy(x).long() for x in sequences_train]
    if use_cuda:
        x_train = [x.cuda() for x in x_train]
    y_train = torch.from_numpy(structs_train)
    print('# loaded', len(x_train), 'training sequences', file=sys.stderr)

    print('# loading test sequence pairs:', astral_testpairs_path, file=sys.stderr)
    test_pairs_table = pd.read_csv(astral_testpairs_path, sep='\t')
    x0_test = [x.encode('utf-8').upper() for x in test_pairs_table['sequence_A']]
    x0_test = [torch.from_numpy(alphabet.encode(x)).long() for x in x0_test]
    x1_test = [x.encode('utf-8').upper() for x in test_pairs_table['sequence_B']]
    x1_test = [torch.from_numpy(alphabet.encode(x)).long() for x in x1_test]
    if use_cuda:
        x0_test = [x.cuda() for x in x0_test]
        x1_test = [x.cuda() for x in x1_test]
    y_test = test_pairs_table['similarity'].values
    y_test = torch.from_numpy(y_test).long()
    dataset_test = PairedDataset(x0_test, x1_test, y_test)
    print('# loaded', len(x0_test), 'test pairs', file=sys.stderr)

    ## make the dataset iterators
    scale = args.epoch_scale
    epoch_size = args.epoch_size
    batch_size = args.batch_size

    # precompute the similarity pairs
    y_train_levels = torch.cumprod(
        (y_train.unsqueeze(1) == y_train.unsqueeze(0)).long(), 2)

    # data augmentation by resampling amino acids
    augment = None
    p = 0
    if args.augment > 0:
        p = args.augment
        trans = torch.ones(len(alphabet), len(alphabet))
        trans = trans / trans.sum(1, keepdim=True)
        if use_cuda:
            trans = trans.cuda()
        augment = MultinomialResample(trans, p)
    print('# resampling amino acids with p:', p, file=sys.stderr)

    dataset_train = AllPairsDataset(x_train, y_train_levels, augment=augment)

    similarity = y_train_levels.numpy().sum(2)
    levels, counts = np.unique(similarity, return_counts=True)
    order = np.argsort(levels)
    levels = levels[order]
    counts = counts[order]

    print('#', levels, file=sys.stderr)
    print('#', counts / np.sum(counts), file=sys.stderr)

    weight = counts**0.5
    print('#', weight / np.sum(weight), file=sys.stderr)
    weight = counts**0.33
    print('#', weight / np.sum(weight), file=sys.stderr)
    weight = counts**0.25
    print('#', weight / np.sum(weight), file=sys.stderr)

    tau = args.tau
    print('# using tau:', tau, file=sys.stderr)
    print('#', counts**tau / np.sum(counts**tau), file=sys.stderr)

    weights = counts**tau / counts
    weights = weights[similarity].ravel()
    # weights = np.ones(len(dataset_train))
    sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, epoch_size)

    # two dataset iterators for sampling pairs of sequences for training
    train_iterator = torch.utils.data.DataLoader(dataset_train,
                                                 batch_size=batch_size,
                                                 sampler=sampler,
                                                 collate_fn=collate_paired_sequences)
    test_iterator = torch.utils.data.DataLoader(dataset_test,
                                                batch_size=batch_size,
                                                collate_fn=collate_paired_sequences)

    ## initialize the model
    rnn_type = args.rnn_type
    rnn_dim = args.rnn_dim
    num_layers = args.num_layers
    embedding_size = args.embedding_dim
    input_dim = args.input_dim
    dropout = args.dropout
    allow_insert = args.allow_insert

    print('# initializing model with:', file=sys.stderr)
    print('# embedding_size:', embedding_size, file=sys.stderr)
    print('# input_dim:', input_dim, file=sys.stderr)
    print('# rnn_dim:', rnn_dim, file=sys.stderr)
    print('# num_layers:', num_layers, file=sys.stderr)
    print('# dropout:', dropout, file=sys.stderr)
    print('# allow_insert:', allow_insert, file=sys.stderr)

    compare_type = args.model
    print('# comparison method:', compare_type, file=sys.stderr)

    lm = None
    if args.lm is not None:
        lm = torch.load(args.lm)
        lm.eval()
        ## do not update the LM parameters
        for param in lm.parameters():
            param.requires_grad = False
        print('# using LM:', args.lm, file=sys.stderr)

    if num_layers > 0:
        embedding = src.models.embedding.StackedRNN(len(alphabet), input_dim,
                                                    rnn_dim, embedding_size,
                                                    nlayers=num_layers,
                                                    dropout=dropout, lm=lm)
    else:
        embedding = src.models.embedding.Linear(len(alphabet), input_dim,
                                                embedding_size, lm=lm)

    if args.norm == 'l1':
        norm = src.models.comparison.L1()
        print('# norm: l1', file=sys.stderr)
    elif args.norm == 'l2':
        norm = src.models.comparison.L2()
        print('# norm: l2', file=sys.stderr)

    model = src.models.comparison.OrdinalRegression(embedding, 5,
                                                    align_method=compare_type,
                                                    compare=norm,
                                                    allow_insertions=allow_insert)
    if use_cuda:
        model.cuda()

    ## setup training parameters and optimizer
    num_epochs = args.num_epochs
    weight_decay = args.weight_decay
    lr = args.lr

    print('# training with Adam: lr={}, weight_decay={}'.format(lr, weight_decay),
          file=sys.stderr)
    params = [p for p in model.parameters() if p.requires_grad]
    optim = torch.optim.Adam(params, lr=lr, weight_decay=weight_decay)

    ## train the model
    print('# training model', file=sys.stderr)

    save_prefix = args.save_prefix
    output = args.output
    if output is None:
        output = sys.stdout
    else:
        output = open(output, 'w')

    digits = int(np.floor(np.log10(num_epochs))) + 1
    line = '\t'.join(['epoch', 'split', 'loss', 'mse', 'accuracy', 'r', 'rho'])
    print(line, file=output)

    for epoch in range(num_epochs):
        # train epoch
        model.train()
        it = 0
        n = 0
        loss_estimate = 0
        mse_estimate = 0
        acc_estimate = 0

        for x0, x1, y in train_iterator:
            if use_cuda:
                y = y.cuda()
            y = Variable(y)

            b = len(x0)
            x = x0 + x1
            x, order = pack_sequences(x)
            x = PackedSequence(Variable(x.data), x.batch_sizes)
            z = model(x)  # embed the sequences
            z = unpack_sequences(z, order)

            z0 = z[:b]
            z1 = z[b:]

            logits = []
            for i in range(b):
                z_a = z0[i]
                z_b = z1[i]
                logits.append(model.score(z_a, z_b))
            logits = torch.stack(logits, 0)

            loss = F.binary_cross_entropy_with_logits(logits, y.float())
            loss.backward()

            optim.step()
            optim.zero_grad()
            model.clip()  # projected gradient for bounding ordinal regression parameters

            p = F.sigmoid(logits)
            ones = p.new(b, 1).zero_() + 1
            p_ge = torch.cat([ones, p], 1)
            p_lt = torch.cat([1 - p, ones], 1)
            p = p_ge * p_lt
            p = p / p.sum(1, keepdim=True)  # make sure p is normalized

            _, y_hard = torch.max(p, 1)
            levels = torch.arange(5).to(p.device)
            y_hat = torch.sum(p * levels, 1)
            y = torch.sum(y.data, 1)

            loss = F.cross_entropy(p, y)  # calculate cross entropy loss from the p vector

            correct = torch.sum((y == y_hard).float())
            mse = torch.sum((y.float() - y_hat)**2)

            n += b
            delta = b * (loss.item() - loss_estimate)
            loss_estimate += delta / n
            delta = correct.item() - b * acc_estimate
            acc_estimate += delta / n
            delta = mse.item() - b * mse_estimate
            mse_estimate += delta / n

            if (n - b) // 100 < n // 100:
                print('# [{}/{}] training {:.1%} loss={:.5f}, mse={:.5f}, acc={:.5f}'.format(
                    epoch + 1, num_epochs, n / epoch_size,
                    loss_estimate, mse_estimate, acc_estimate),
                    end='\r', file=sys.stderr)
        print(' ' * 80, end='\r', file=sys.stderr)

        line = '\t'.join([str(epoch + 1).zfill(digits), 'train',
                          str(loss_estimate), str(mse_estimate),
                          str(acc_estimate), '-', '-'])
        print(line, file=output)
        output.flush()

        # eval and save model
        model.eval()

        y = []
        logits = []
        with torch.no_grad():
            for x0, x1, y_mb in test_iterator:
                if use_cuda:
                    y_mb = y_mb.cuda()
                y.append(y_mb.long())

                b = len(x0)
                x = x0 + x1
                x, order = pack_sequences(x)
                x = PackedSequence(Variable(x.data), x.batch_sizes)
                z = model(x)  # embed the sequences
                z = unpack_sequences(z, order)

                z0 = z[:b]
                z1 = z[b:]

                for i in range(b):
                    z_a = z0[i]
                    z_b = z1[i]
                    logits.append(model.score(z_a, z_b))

            y = torch.cat(y, 0)
            logits = torch.stack(logits, 0)

            p = F.sigmoid(logits).data
            ones = p.new(p.size(0), 1).zero_() + 1
            p_ge = torch.cat([ones, p], 1)
            p_lt = torch.cat([1 - p, ones], 1)
            p = p_ge * p_lt
            p = p / p.sum(1, keepdim=True)  # make sure p is normalized

            loss = F.cross_entropy(p, y).item()

            _, y_hard = torch.max(p, 1)
            levels = torch.arange(5).to(p.device)
            y_hat = torch.sum(p * levels, 1)

            accuracy = torch.mean((y == y_hard).float()).item()
            mse = torch.mean((y.float() - y_hat)**2).item()

            y = y.cpu().numpy()
            y_hat = y_hat.cpu().numpy()

            r, _ = pearsonr(y_hat, y)
            rho, _ = spearmanr(y_hat, y)

        line = '\t'.join([str(epoch + 1).zfill(digits), 'test', str(loss),
                          str(mse), str(accuracy), str(r), str(rho)])
        print(line, file=output)
        output.flush()

        # save the model
        if save_prefix is not None:
            save_path = save_prefix + '_epoch' + str(epoch + 1).zfill(digits) + '.sav'
            model.cpu()
            torch.save(model, save_path)
            if use_cuda:
                model.cuda()
def forward(
    self,
    input_ids=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    labels=None,
):
    word_ids_lengths = attention_mask.sum(axis=1)
    word_embeddings = self.lookup(input_ids)
    packed_word_embeddings = pack_padded_sequence(word_embeddings,
                                                  lengths=word_ids_lengths,
                                                  batch_first=True,
                                                  enforce_sorted=False)
    words_representation, _ = self.rnn(packed_word_embeddings)

    # This implementation uses the feature sentence_embeddings; the paper uses
    # the hidden state.
    word_attention = self.word_attention(words_representation.data)
    word_attention = torch.tanh(word_attention)

    # Take the dot-product of the attention vectors with the context vector
    # (i.e. the parameter of a linear layer).
    word_attention = self.word_context_vector(word_attention).squeeze(1)  # (n_words)

    # Compute the softmax over the dot-products manually, because it has to be
    # computed only over words in the same sentence.
    # First, take the exponent.
    max_value = word_attention.max()  # scalar, for numerical stability during the exponent calculation
    word_attention = torch.exp(word_attention - max_value)  # (n_words)

    # Re-arrange as sentences by re-padding with 0s (WORDS -> SENTENCES).
    word_attention, _ = pad_packed_sequence(
        PackedSequence(
            data=word_attention,
            batch_sizes=words_representation.batch_sizes,
            sorted_indices=words_representation.sorted_indices,
            unsorted_indices=words_representation.unsorted_indices),
        batch_first=True)  # (n_sentences, max(words_per_sentence))

    # Calculate the softmax values, now that words are arranged in their
    # respective sentences.
    word_alphas = word_attention / torch.sum(word_attention, dim=1, keepdim=True)
    # (n_sentences, max(words_per_sentence))

    # Similarly re-arrange the word-level RNN outputs as sentences by
    # re-padding with 0s (WORDS -> SENTENCES).
    sentences, _ = pad_packed_sequence(words_representation, batch_first=True)
    # (n_sentences, max(words_per_sentence), 2 * word_rnn_size)

    # Find sentence embeddings.
    sentences = sentences * word_alphas.unsqueeze(2)
    # (n_sentences, max(words_per_sentence), 2 * word_rnn_size)

    # Get the representation for each sentence.
    sentences = sentences.sum(dim=1)  # (n_sentences, 2 * word_rnn_size)

    logits = self.classifier(sentences)
    outputs = (logits, sentences)

    if labels is not None:
        loss_fct = nn.BCEWithLogitsLoss()
        loss = loss_fct(logits.view(-1, self.num_labels),
                        labels.view(-1, self.num_labels))
        outputs = (loss, ) + outputs

    return outputs
def forward(self, list_progs, context_embeds, ll=None, target_list=None,
            gen_method='sample', sizes=None, has_stopped=None):
    n_prog = len(list_progs)
    prog_int_seqs = [
        torch.LongTensor([self.vocab[c] for c in expr] + [self.tok_stop]).to(context_embeds.device)
        for expr in list_progs
    ]
    lengths = [v.size(0) for v in prog_int_seqs]
    padded_int_seqs = pad_sequence(prog_int_seqs, batch_first=False,
                                   padding_value=self.tok_pad)

    packed_seq = pack_padded_sequence(padded_int_seqs, lengths=lengths,
                                      batch_first=False, enforce_sorted=False)
    tok_embed = self.tok_embed(packed_seq.data)
    packed_input = PackedSequence(data=tok_embed,
                                  batch_sizes=packed_seq.batch_sizes,
                                  sorted_indices=packed_seq.sorted_indices,
                                  unsorted_indices=packed_seq.unsorted_indices)

    h = self.ctx2h(context_embeds).view(n_prog, 2 * self.rnn_layers, -1).transpose(0, 1)
    c = self.ctx2c(context_embeds).view(n_prog, 2 * self.rnn_layers, -1).transpose(0, 1)
    packed_out, _ = self.lstm(packed_input, (h, c))
    unpacked_out, _ = pad_packed_sequence(packed_out)

    # positions to mod/del
    expr_poses = (padded_int_seqs == self.tok_constexpr) | \
                 (padded_int_seqs == self.tok_subexpr)
    embed_expr = unpacked_out[expr_poses]
    if embed_expr.shape[0]:
        mod_scores = self.modify_score(embed_expr)
        del_scores = self.del_score(embed_expr)
    else:
        mod_scores = del_scores = None

    # positions to insert
    ins_poses = padded_int_seqs == self.tok_start
    insert_scores = self.insert_score(unpacked_out[ins_poses])

    # positions to stop
    stop_poses = padded_int_seqs == self.tok_stop
    stop_scores = self.stop_score(unpacked_out[stop_poses])

    logits = loc_score(mod_scores, del_scores, insert_scores, stop_scores,
                       expr_poses, ins_poses, stop_poses, has_stopped)
    log_prob = F.log_softmax(logits, dim=0).t().contiguous()

    ll_target = None
    predecessors = None
    if target_list is None:
        if gen_method == 'sample':
            target = torch.multinomial(torch.exp(log_prob), 1)
        elif gen_method == 'argmax':
            target = torch.argmax(log_prob, dim=1)
        elif gen_method.startswith('beam'):
            beam_size = int(gen_method.split('-')[-1])
            raw_scores = (log_prob + ll) if ll is not None else log_prob
            predecessors, target, ll_target, sizes = beam_step(raw_scores, sizes, beam_size)
            update_embed = unpacked_out[target, predecessors]
        else:
            raise NotImplementedError
    else:
        target = torch.LongTensor(target_list).to(log_prob.device)

    target = target.view(-1)
    if predecessors is None:
        ll_step = log_prob[range(n_prog), target]
        ll_target = (ll_step.view(ll.shape) + ll) if ll is not None else ll_step
        update_embed = unpacked_out[target, range(n_prog)]

    return ll_target.view(-1, 1), target, update_embed, predecessors, sizes
def iterate(self, src_tuple, target_tuple, training=True):
    # Limit the number of tokens to avoid GPU overload.
    if self.limit_num_tokens is not None:
        src_tuple, target_tuple = self._batch_limit_tokens(src_tuple, target_tuple)
    src, src_length = src_tuple
    target, target_length = target_tuple
    batch_dim, time_dim = (0, 1) if self.batch_first else (1, 0)
    num_words = sum(target_length) - target.size(batch_dim)

    if isinstance(src, PackedSequence) or \
            not isinstance(self.model_with_loss, DataParallel):
        if isinstance(src, PackedSequence):
            src = PackedSequence(src.data.to(self.device),
                                 src.batch_sizes.to(self.device))
        else:
            src = src.to(self.device)
        target = target.to(self.device)

    if self.batch_first:
        inputs = (src, target[:, :-1])
        target_labels = target[:, 1:].contiguous()
    else:
        inputs = (src, target[:-1])
        target_labels = target[1:]

    # compute output
    loss, accuracy = self.model_with_loss(inputs, target_labels)
    loss = loss.sum()
    loss_measure = float(loss / num_words)
    if self.avg_loss_time:
        loss /= num_words
    else:
        loss /= target.size(batch_dim)
    accuracy = float(accuracy.sum().float() / num_words)

    if training:
        # compute gradient and do SGD step
        self.optimizer.zero_grad()
        loss.backward()
        if self.grad_clip is not None:
            if isinstance(self.grad_clip, dict):
                clip_encoder = self.grad_clip.get('encoder', 0)
                clip_decoder = self.grad_clip.get('decoder', 0)
                if clip_encoder > 0:
                    clip_grad_norm_(self.model.encoder.parameters(), clip_encoder)
                if clip_decoder > 0:
                    clip_grad_norm_(self.model.decoder.parameters(), clip_decoder)
            elif self.grad_clip > 0:  # grad_clip is a number
                clip_grad_norm_(self.model.parameters(), self.grad_clip)
        if self.embedding_grad_clip is not None and self.embedding_grad_clip > 0:
            if hasattr(self.model.encoder, 'embedder'):
                clip_grad_norm_(self.model.encoder.embedder.parameters(),
                                self.embedding_grad_clip)
            if hasattr(self.model.decoder, 'embedder'):
                clip_grad_norm_(self.model.decoder.embedder.parameters(),
                                self.embedding_grad_clip)
        self.optimizer.step()
    return loss_measure, accuracy, num_words
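# A minimal, standalone sketch of the clip_grad_norm_ pattern used in
# iterate() above, with a toy model (the model and hyperparameters here are
# illustrative, not from the original trainer):
import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_

toy_model = nn.Linear(4, 2)
toy_loss = toy_model(torch.randn(8, 4)).pow(2).mean()
toy_loss.backward()
clip_grad_norm_(toy_model.parameters(), max_norm=1.0)  # rescales grads in place
torch.optim.SGD(toy_model.parameters(), lr=0.1).step()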
def forward(self, sentences, words_per_sentence):
    """
    Forward propagation.

    :param sentences: encoded sentence-level data, a tensor of dimension
        (n_sentences, word_pad_len, emb_size)
    :param words_per_sentence: sentence lengths, a tensor of dimension (n_sentences)
    :return: sentence embeddings, attention weights of words
    """
    # Get word embeddings and apply dropout.
    sentences = self.dropout(self.embeddings(sentences))  # (n_sentences, word_pad_len, emb_size)

    # Re-arrange as words by removing word-pads (SENTENCES -> WORDS).
    packed_words = pack_padded_sequence(
        sentences,
        lengths=words_per_sentence.tolist(),
        batch_first=True,
        enforce_sorted=False)  # a PackedSequence, where 'data' is the flattened words (n_words, word_emb)

    # Apply the word-level RNN over the word embeddings (PyTorch automatically
    # applies it over the PackedSequence).
    packed_words, _ = self.word_rnn(packed_words)
    # a PackedSequence, where 'data' is the output of the RNN (n_words, 2 * word_rnn_size)

    # Find attention vectors by applying the attention linear layer on the output of the RNN.
    att_w = self.word_attention(packed_words.data)  # (n_words, att_size)
    att_w = torch.tanh(att_w)  # (n_words, att_size)

    # Take the dot-product of the attention vectors with the context vector
    # (i.e. the parameter of a linear layer).
    att_w = self.word_context_vector(att_w).squeeze(1)  # (n_words)

    # Compute the softmax over the dot-products manually, because it has to be
    # computed only over words in the same sentence.
    # First, take the exponent.
    max_value = att_w.max()  # scalar, for numerical stability during the exponent calculation
    att_w = torch.exp(att_w - max_value)  # (n_words)

    # Re-arrange as sentences by re-padding with 0s (WORDS -> SENTENCES).
    att_w, _ = pad_packed_sequence(
        PackedSequence(data=att_w,
                       batch_sizes=packed_words.batch_sizes,
                       sorted_indices=packed_words.sorted_indices,
                       unsorted_indices=packed_words.unsorted_indices),
        batch_first=True)  # (n_sentences, max(words_per_sentence))

    # Calculate the softmax values, now that words are arranged in their respective sentences.
    word_alphas = att_w / torch.sum(att_w, dim=1, keepdim=True)  # (n_sentences, max(words_per_sentence))

    # Similarly re-arrange the word-level RNN outputs as sentences by re-padding with 0s (WORDS -> SENTENCES).
    sentences, _ = pad_packed_sequence(packed_words, batch_first=True)
    # (n_sentences, max(words_per_sentence), 2 * word_rnn_size)

    # Find sentence embeddings.
    sentences = sentences * word_alphas.unsqueeze(2)  # (n_sentences, max(words_per_sentence), 2 * word_rnn_size)
    sentences = sentences.sum(dim=1)  # (n_sentences, 2 * word_rnn_size)

    return sentences, word_alphas
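# The manual softmax above (subtract the max, exponentiate on the packed data,
# pad back, then normalize per row) is equivalent to a length-masked softmax.
# A small self-contained check of that equivalence (shapes are hypothetical;
# this is a sketch, not part of the original module):
import torch
from torch.nn.utils.rnn import (PackedSequence, pack_padded_sequence,
                                pad_packed_sequence)

scores = torch.randn(3, 6)  # (n_sentences, max_words) raw attention scores
lengths = torch.tensor([6, 4, 2])
packed = pack_padded_sequence(scores.unsqueeze(2), lengths,
                              batch_first=True, enforce_sorted=False)
att = torch.exp(packed.data.squeeze(1) - packed.data.max())
att, _ = pad_packed_sequence(
    PackedSequence(att, packed.batch_sizes, packed.sorted_indices,
                   packed.unsorted_indices),
    batch_first=True)  # zeros land on the padded positions
alphas = att / att.sum(dim=1, keepdim=True)  # rows sum to 1 over real words

# Direct masked softmax for comparison:
mask = torch.arange(6).unsqueeze(0) < lengths.unsqueeze(1)
direct = torch.softmax(scores.masked_fill(~mask, float('-inf')), dim=1)
assert torch.allclose(alphas, direct, atol=1e-6)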
def forward(self, documents, sentences_per_document, words_per_sentence):
    """
    Forward propagation.

    :param documents: encoded document-level data, a tensor of dimensions
        (n_documents, sent_pad_len, word_pad_len)
    :param sentences_per_document: document lengths, a tensor of dimensions (n_documents)
    :param words_per_sentence: sentence lengths, a tensor of dimensions (n_documents, sent_pad_len)
    :return: document embeddings, attention weights of words, attention weights of sentences
    """
    # Re-arrange as sentences by removing sentence-pads (DOCUMENTS -> SENTENCES).
    packed_sentences = pack_padded_sequence(
        documents,
        lengths=sentences_per_document.tolist(),
        batch_first=True,
        enforce_sorted=False)  # a PackedSequence, where 'data' is the flattened sentences (n_sentences, word_pad_len)

    # Re-arrange sentence lengths in the same way (DOCUMENTS -> SENTENCES).
    packed_words_per_sentence = pack_padded_sequence(
        words_per_sentence,
        lengths=sentences_per_document.tolist(),
        batch_first=True,
        enforce_sorted=False)  # a PackedSequence, where 'data' is the flattened sentence lengths (n_sentences)

    # Find sentence embeddings by applying the word-level attention module.
    sentences, word_alphas = self.word_attention(
        packed_sentences.data, packed_words_per_sentence.data)
    # (n_sentences, 2 * word_rnn_size), (n_sentences, max(words_per_sentence))
    sentences = self.dropout(sentences)

    # Apply the sentence-level RNN over the sentence embeddings (PyTorch
    # automatically applies it over the PackedSequence).
    packed_sentences, _ = self.sentence_rnn(
        PackedSequence(data=sentences,
                       batch_sizes=packed_sentences.batch_sizes,
                       sorted_indices=packed_sentences.sorted_indices,
                       unsorted_indices=packed_sentences.unsorted_indices))

    # Find attention vectors by applying the attention linear layer on the output of the RNN.
    att_s = self.sentence_attention(packed_sentences.data)  # (n_sentences, att_size)
    att_s = torch.tanh(att_s)  # (n_sentences, att_size)

    # Take the dot-product of the attention vectors with the context vector
    # (i.e. the parameter of a linear layer).
    att_s = self.sentence_context_vector(att_s).squeeze(1)  # (n_sentences)

    # Compute the softmax over the dot-products manually, because it has to be
    # computed only over sentences in the same document.
    # First, take the exponent.
    max_value = att_s.max()  # scalar, for numerical stability during the exponent calculation
    att_s = torch.exp(att_s - max_value)  # (n_sentences)

    # Re-arrange as documents by re-padding with 0s (SENTENCES -> DOCUMENTS).
    att_s, _ = pad_packed_sequence(
        PackedSequence(data=att_s,
                       batch_sizes=packed_sentences.batch_sizes,
                       sorted_indices=packed_sentences.sorted_indices,
                       unsorted_indices=packed_sentences.unsorted_indices),
        batch_first=True)  # (n_documents, max(sentences_per_document))

    # Calculate the softmax values, now that sentences are arranged in their respective documents.
    sentence_alphas = att_s / torch.sum(att_s, dim=1, keepdim=True)  # (n_documents, max(sentences_per_document))

    # Similarly re-arrange the sentence-level RNN outputs as documents by re-padding with 0s (SENTENCES -> DOCUMENTS).
    documents, _ = pad_packed_sequence(packed_sentences, batch_first=True)
    # (n_documents, max(sentences_per_document), 2 * sentence_rnn_size)

    # Find document embeddings.
    documents = documents * sentence_alphas.unsqueeze(2)
    # (n_documents, max(sentences_per_document), 2 * sentence_rnn_size)
    documents = documents.sum(dim=1)  # (n_documents, 2 * sentence_rnn_size)

    return documents, word_alphas, sentence_alphas
def forward(
    self,
    x: Union[torch.Tensor, PackedSequence],
    state_init: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
    r"""
    Implements the forward pass of the DPLSTM when a sequence is input.

    Dimensions as follows:
        - B: Batch size
        - T: Sequence length
        - D: LSTM input hidden size (e.g. from a word embedding)
        - H: LSTM output hidden size
        - L: number of layers in the LSTM
        - P: number of directions (2 if bidirectional, else 1)

    Args:
        x: Input sequence to the DPLSTM of shape ``[T, B, D]``,
            or a PackedSequence.
        state_init: Initial state of the LSTM as a tuple ``(h_0, c_0)``, where:
            - ``h_0`` of shape ``[L*P, B, H]`` contains the initial hidden state
            - ``c_0`` of shape ``[L*P, B, H]`` contains the initial cell state

            This argument can be (and defaults to) None, in which case zero
            tensors will be used.

    Returns:
        ``output, (h_n, c_n)`` where ``output`` is of shape ``[T, B, H * P]``
        and is a tensor containing the output features (``h_t``) from the last
        layer of the DPLSTM for each timestep ``t``. ``h_n`` is of shape
        ``[L * P, B, H]`` and contains the hidden state for ``t = T``. ``c_n``
        is of shape ``[L * P, B, H]`` and contains the cell state for ``t = T``.
    """
    if isinstance(x, PackedSequence):
        x, batch_sizes, sorted_indices, unsorted_indices = x
        B = batch_sizes[0].item()
        _, D = x.shape
        x = x.split(tuple(batch_sizes))
        for layer in self.layers:
            layer.set_max_batch_length(B)
    else:
        sorted_indices = None
        unsorted_indices = None
        batch_sizes = None
        x = self._rearrange_batch_dim(x)
        T, B, D = x.shape

    L = self.num_layers
    P = 2 if self.bidirectional else 1
    H = self.hidden_size

    h_0s, c_0s = state_init or (None, None)
    if h_0s is None:
        h_0s = torch.zeros(
            L,
            P,
            B,
            self.hidden_size,
            dtype=x[0].dtype,
            device=x[0].device,
        )
    else:
        h_0s = h_0s.reshape([L, P, B, H])
        h_0s = self._permute_hidden(h_0s, sorted_indices, 2)
    if c_0s is None:
        c_0s = torch.zeros(
            L,
            P,
            B,
            self.hidden_size,
            dtype=x[0].dtype,
            device=x[0].device,
        )
    else:
        c_0s = c_0s.reshape([L, P, B, H])
        c_0s = self._permute_hidden(c_0s, sorted_indices, 2)

    hs: List[torch.Tensor] = []
    cs: List[torch.Tensor] = []

    for layer, h0, c0 in zip(self.layers, h_0s, c_0s):
        if not self.bidirectional:
            h0 = h0.squeeze(0)
            c0 = c0.squeeze(0)
        x, (h, c) = layer(x, (h0, c0), batch_sizes)
        if not self.bidirectional:
            h = h.unsqueeze(0)  # [1, B, H]
            c = c.unsqueeze(0)  # [1, B, H]
        hs.append(h)
        cs.append(c)

    hs = torch.cat(hs, dim=0)  # [L * P, B, H]
    cs = torch.cat(cs, dim=0)  # [L * P, B, H]

    if batch_sizes is not None:
        seq_lengths = _compute_seq_lengths(batch_sizes)
        packed_data = pack_padded_sequence(
            pad_sequence(x, batch_first=False), seq_lengths, batch_first=True
        )[0]
        out = PackedSequence(
            packed_data, batch_sizes, sorted_indices, unsorted_indices
        )
    else:
        out = self._rearrange_batch_dim(x)

    return out, (
        self._permute_hidden(hs, unsorted_indices),
        self._permute_hidden(cs, unsorted_indices),
    )
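# A usage sketch for the forward above, assuming this is Opacus's DPLSTM
# (the `opacus.layers` import path is an assumption about the reader's
# environment, not stated in the snippet itself):
import torch
from torch.nn.utils.rnn import pack_padded_sequence
from opacus.layers import DPLSTM

lstm = DPLSTM(input_size=8, hidden_size=16, num_layers=2, bidirectional=True)
# [T, B, D] input packed with descending lengths:
x = pack_padded_sequence(torch.randn(5, 3, 8), torch.tensor([5, 4, 2]))
out, (h_n, c_n) = lstm(x)  # out is a PackedSequence; h_n/c_n are [L * P, B, H]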
def forward(self, x, hx=None):
    r"""
    :param x: [batch, seq_len, input_size] input sequence
    :param hx: [batch, hidden_size] initial hidden state; if ``None``, a zero
        tensor is used. Default: ``None``
    :return (output, ht): [batch, seq_len, hidden_size*num_direction] output
        sequence and [batch, hidden_size*num_direction] hidden state at the
        last time step
    """
    is_lstm = self.is_lstm
    is_packed = isinstance(x, PackedSequence)
    if not is_packed:
        seq_len = x.size(1) if self.batch_first else x.size(0)
        max_batch_size = x.size(0) if self.batch_first else x.size(1)
        seq_lens = torch.LongTensor([seq_len for _ in range(max_batch_size)])
        x = pack_padded_sequence(x, seq_lens, batch_first=self.batch_first)
    else:
        max_batch_size = int(x.batch_sizes[0])
    x, batch_sizes = x.data, x.batch_sizes

    if hx is None:
        hx = x.new_zeros(self.num_layers * self.num_directions,
                         max_batch_size, self.hidden_size, requires_grad=True)
        if is_lstm:
            hx = (hx, hx.new_zeros(hx.size(), requires_grad=True))

    mask_x = x.new_ones((max_batch_size, self.input_size))
    mask_out = x.new_ones((max_batch_size,
                           self.hidden_size * self.num_directions))
    mask_h_ones = x.new_ones((max_batch_size, self.hidden_size))
    nn.functional.dropout(mask_x, p=self.input_dropout,
                          training=self.training, inplace=True)
    nn.functional.dropout(mask_out, p=self.hidden_dropout,
                          training=self.training, inplace=True)

    hidden = x.new_zeros((self.num_layers * self.num_directions,
                          max_batch_size, self.hidden_size))
    if is_lstm:
        cellstate = x.new_zeros((self.num_layers * self.num_directions,
                                 max_batch_size, self.hidden_size))

    for layer in range(self.num_layers):
        output_list = []
        input_seq = PackedSequence(x, batch_sizes)
        mask_h = nn.functional.dropout(mask_h_ones, p=self.hidden_dropout,
                                       training=self.training, inplace=False)
        for direction in range(self.num_directions):
            output_x, hidden_x = self._forward_one(
                layer, direction, input_seq, hx,
                mask_x if layer == 0 else mask_out, mask_h)
            output_list.append(output_x.data)
            idx = self.num_directions * layer + direction
            if is_lstm:
                hidden[idx] = hidden_x[0]
                cellstate[idx] = hidden_x[1]
            else:
                hidden[idx] = hidden_x
        x = torch.cat(output_list, dim=-1)

    if is_lstm:
        hidden = (hidden, cellstate)
    if is_packed:
        output = PackedSequence(x, batch_sizes)
    else:
        x = PackedSequence(x, batch_sizes)
        output, _ = pad_packed_sequence(x, batch_first=self.batch_first)

    return output, hidden
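# The mask_x / mask_out / mask_h tensors above implement variational
# ("locked") dropout: one Bernoulli mask per sequence, reused at every time
# step. A minimal standalone sketch of the same idea (names and shapes are
# illustrative):
import torch
import torch.nn.functional as F

def locked_dropout_mask(batch: int, size: int, p: float,
                        training: bool) -> torch.Tensor:
    mask = torch.ones(batch, size)
    # F.dropout on an all-ones tensor yields a mask of 0s and 1/(1-p)s.
    return F.dropout(mask, p=p, training=training, inplace=False)

mask = locked_dropout_mask(4, 32, p=0.3, training=True)
h_t = torch.randn(4, 32)
for _ in range(10):
    # Apply the SAME mask at every step of the recurrence.
    h_t = torch.tanh(h_t) * mask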
def forward(self, word, word_mask, wordchars, wordchars_mask, upos,
            pretrained, head, deprel, word_orig_idx, sentlens, wordlens):

    def pack(x):
        return pack_padded_sequence(x, sentlens, batch_first=True)

    inputs = []
    if self.args['pretrain']:
        pretrained_emb = self.pretrained_emb(pretrained)
        pretrained_emb = self.trans_pretrained(pretrained_emb)
        pretrained_emb = pack(pretrained_emb)
        inputs += [pretrained_emb]

    # def pad(x):
    #     return pad_packed_sequence(PackedSequence(x, pretrained_emb.batch_sizes),
    #                                batch_first=True)[0]

    if self.args['word_emb_dim'] > 0:
        word_emb = self.word_emb(word)
        word_emb = pack(word_emb)
        inputs += [word_emb]

    if self.args['tag_emb_dim'] > 0:
        pos_emb = self.upos_emb(upos)
        pos_emb = pack(pos_emb)
        inputs += [pos_emb]

    if self.args['char'] and self.args['char_emb_dim'] > 0:
        char_reps = self.charmodel(wordchars, wordchars_mask, word_orig_idx,
                                   sentlens, wordlens)
        char_reps = PackedSequence(self.trans_char(self.drop(char_reps.data)),
                                   char_reps.batch_sizes)
        inputs += [char_reps]

    lstm_inputs = torch.cat([x.data for x in inputs], 1)
    lstm_inputs = self.worddrop(lstm_inputs, self.drop_replacement)
    lstm_inputs = self.drop(lstm_inputs)
    lstm_inputs = PackedSequence(lstm_inputs, inputs[0].batch_sizes)

    lstm_outputs, _ = self.parserlstm(
        lstm_inputs, sentlens,
        hx=(self.parserlstm_h_init.expand(
                2 * self.args['num_layers'], word.size(0),
                self.args['hidden_dim']).contiguous(),
            self.parserlstm_c_init.expand(
                2 * self.args['num_layers'], word.size(0),
                self.args['hidden_dim']).contiguous()))
    lstm_outputs, _ = pad_packed_sequence(lstm_outputs, batch_first=True)

    unlabeled_scores = self.unlabeled(self.drop(lstm_outputs),
                                      self.drop(lstm_outputs)).squeeze(3)
    deprel_scores = self.deprel(self.drop(lstm_outputs),
                                self.drop(lstm_outputs))

    if self.args['linearization'] or self.args['distance']:
        head_offset = (torch.arange(word.size(1), device=head.device)
                       .view(1, 1, -1).expand(word.size(0), -1, -1)
                       - torch.arange(word.size(1), device=head.device)
                       .view(1, -1, 1).expand(word.size(0), -1, -1))

    if self.args['linearization']:
        lin_scores = self.linearization(self.drop(lstm_outputs),
                                        self.drop(lstm_outputs)).squeeze(3)
        unlabeled_scores += F.logsigmoid(
            lin_scores * torch.sign(head_offset).float()).detach()

    if self.args['distance']:
        dist_scores = self.distance(self.drop(lstm_outputs),
                                    self.drop(lstm_outputs)).squeeze(3)
        dist_pred = 1 + F.softplus(dist_scores)
        dist_target = torch.abs(head_offset)
        dist_kld = -torch.log((dist_target.float() - dist_pred)**2 / 2 + 1)
        unlabeled_scores += dist_kld.detach()

    diag = torch.eye(head.size(-1) + 1, dtype=torch.uint8,
                     device=head.device).unsqueeze(0)
    unlabeled_scores.masked_fill_(diag, -float('inf'))

    preds = []
    if self.training:
        unlabeled_scores = unlabeled_scores[:, 1:, :]  # exclude attachment for the root symbol
        unlabeled_scores = unlabeled_scores.masked_fill(
            word_mask.unsqueeze(1), -float('inf'))
        unlabeled_target = head.masked_fill(word_mask[:, 1:], -1)
        loss = self.crit(
            unlabeled_scores.contiguous().view(-1, unlabeled_scores.size(2)),
            unlabeled_target.view(-1))

        deprel_scores = deprel_scores[:, 1:]  # exclude attachment for the root symbol
        deprel_scores = torch.gather(
            deprel_scores, 2,
            head.unsqueeze(2).unsqueeze(3).expand(
                -1, -1, -1, len(self.vocab['deprel']))
        ).view(-1, len(self.vocab['deprel']))
        deprel_target = deprel.masked_fill(word_mask[:, 1:], -1)
        loss += self.crit(deprel_scores.contiguous(), deprel_target.view(-1))

        if self.args['linearization']:
            # lin_scores = lin_scores[:, 1:].masked_select(goldmask)
            lin_scores = torch.gather(lin_scores[:, 1:], 2,
                                      head.unsqueeze(2)).view(-1)
            lin_scores = torch.cat([-lin_scores.unsqueeze(1) / 2,
                                    lin_scores.unsqueeze(1) / 2], 1)
            # lin_target = (head_offset[:, 1:] > 0).long().masked_select(goldmask)
            lin_target = torch.gather((head_offset[:, 1:] > 0).long(), 2,
                                      head.unsqueeze(2))
            loss += self.crit(lin_scores.contiguous(), lin_target.view(-1))

        if self.args['distance']:
            # dist_kld = dist_kld[:, 1:].masked_select(goldmask)
            dist_kld = torch.gather(dist_kld[:, 1:], 2, head.unsqueeze(2))
            loss -= dist_kld.sum()

        loss /= wordchars.size(0)  # number of words
    else:
        loss = 0
        preds.append(F.log_softmax(unlabeled_scores, 2).detach().cpu().numpy())
        preds.append(deprel_scores.max(3)[1].detach().cpu().numpy())

    return loss, preds
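# Hedged sketch of the torch.gather pattern used in the loss above: for every
# dependent, select the slice of scores corresponding to its gold head. The
# shapes here are illustrative, not taken from the parser's configuration.
import torch

batch, n_dep, n_head, n_rel = 2, 4, 5, 3
scores = torch.randn(batch, n_dep, n_head, n_rel)   # [b, dep, head, rel]
head = torch.randint(0, n_head, (batch, n_dep))     # gold head per dependent
idx = head.unsqueeze(2).unsqueeze(3).expand(-1, -1, -1, n_rel)
picked = torch.gather(scores, 2, idx).squeeze(2)    # [b, dep, rel]
assert picked.shape == (batch, n_dep, n_rel)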
def iterate(self, src, target, training=True):
    # NOTE: this snippet targets the legacy (pre-0.4) PyTorch API
    # (Variable, volatile=..., clip_grad_norm, loss.data[0]).
    # limit number of tokens to avoid gpu overload
    if self.limit_num_tokens is not None:
        src, target = self._batch_limit_tokens(src, target)
    src, src_length = src
    target, target_length = target
    batch_dim, time_dim = (0, 1) if self.batch_first else (1, 0)
    num_words = sum(target_length) - target.size(batch_dim)
    # Allow packed source sequences - for cudnn rnns
    if isinstance(src, PackedSequence):
        src_pack = src
        src = src.data
    else:
        src_pack = None
    if self.cuda and not isinstance(self.model_with_loss, DataParallel):
        src = src.cuda()
        target = target.cuda()
    src_var = Variable(src, volatile=not training)
    target_var = Variable(target, volatile=not training)
    if src_pack is not None:
        src_var = PackedSequence(src_var, src_pack[1])
    if self.batch_first:
        inputs = (src_var, target_var[:, :-1])
        target_labels = target_var[:, 1:].contiguous()
    else:
        inputs = (src_var, target_var[:-1])
        target_labels = target_var[1:]
    # compute output
    loss = self.model_with_loss(inputs, target_labels).sum()
    loss /= num_words
    if training:
        # compute gradient and do SGD step
        self.optimizer.zero_grad()
        loss.backward()
        if self.grad_clip is not None:
            if isinstance(self.grad_clip, dict):
                clip_encoder = self.grad_clip.get('encoder', 0)
                clip_decoder = self.grad_clip.get('decoder', 0)
                if clip_encoder > 0:
                    clip_grad_norm(self.model.encoder.parameters(),
                                   clip_encoder)
                if clip_decoder > 0:
                    clip_grad_norm(self.model.decoder.parameters(),
                                   clip_decoder)
            elif self.grad_clip > 0:  # grad_clip is a number
                clip_grad_norm(self.model.parameters(), self.grad_clip)
        if self.embedding_grad_clip is not None and self.embedding_grad_clip > 0:
            if hasattr(self.model.encoder, 'embedder'):
                clip_grad_norm(self.model.encoder.embedder.parameters(),
                               self.embedding_grad_clip)
            if hasattr(self.model.decoder, 'embedder'):
                clip_grad_norm(self.model.decoder.embedder.parameters(),
                               self.embedding_grad_clip)
        self.optimizer.step()
    return loss.data[0], num_words
def forward(self, docs, doc_lengths, sent_lengths):
    """
    :param docs: encoded document-level data; LongTensor
        (num_docs, padded_doc_length, padded_sent_length)
    :param doc_lengths: unpadded document lengths; LongTensor (num_docs)
    :param sent_lengths: unpadded sentence lengths; LongTensor
        (num_docs, padded_doc_length)
    :return: document embeddings, attention weights of words, attention
        weights of sentences
    """
    # Sort documents by decreasing length
    doc_lengths, doc_perm_idx = doc_lengths.sort(dim=0, descending=True)
    docs = docs[doc_perm_idx]
    sent_lengths = sent_lengths[doc_perm_idx]

    # Make a long batch of sentences by removing pad-sentences
    # i.e. `docs` was of size (num_docs, padded_doc_length, padded_sent_length)
    # -> `packed_sents.data` is now of size (num_sents, padded_sent_length)
    packed_sents = pack_padded_sequence(docs, lengths=doc_lengths.tolist(),
                                        batch_first=True)

    # effective batch size at each timestep
    valid_bsz = packed_sents.batch_sizes

    # Make a long batch of sentence lengths by removing pad-sentences
    # i.e. `sent_lengths` was of size (num_docs, padded_doc_length)
    # -> `packed_sent_lengths.data` is now of size (num_sents)
    packed_sent_lengths = pack_padded_sequence(
        sent_lengths, lengths=doc_lengths.tolist(), batch_first=True)

    # Word attention module
    sents, word_att_weights = self.word_attention(packed_sents.data,
                                                  packed_sent_lengths.data)

    # NOTE MODIFICATION (FEATURES)
    sents = self.dropout(sents)

    # Sentence-level GRU over sentence embeddings
    packed_sents, _ = self.gru(PackedSequence(sents, valid_bsz))

    # NOTE MODIFICATION (FEATURES)
    if self.use_layer_norm:
        normed_sents = self.layer_norm(packed_sents.data)
    else:
        # operate on the flat data tensor in both branches, so the linear
        # layer below always sees a Tensor rather than a PackedSequence
        normed_sents = packed_sents.data

    # Sentence attention
    att = torch.tanh(self.sent_attention(normed_sents))
    att = self.sentence_context_vector(att).squeeze(1)

    # NOTE MODIFICATION (BUG)
    val = att.max()
    att = torch.exp(att - val)

    # Restore as documents by repadding
    att, _ = pad_packed_sequence(PackedSequence(att, valid_bsz),
                                 batch_first=True)

    # Note MODIFICATION (BUG)
    sent_att_weights = att / torch.sum(att, dim=1, keepdim=True)

    # Restore as documents by repadding
    docs, _ = pad_packed_sequence(packed_sents, batch_first=True)

    # Compute document vectors
    docs = docs * sent_att_weights.unsqueeze(2)
    docs = docs.sum(dim=1)

    # Restore word attention weights as documents by repadding
    word_att_weights, _ = pad_packed_sequence(
        PackedSequence(word_att_weights, valid_bsz), batch_first=True)

    # Restore the original order of documents (undo the first sorting)
    _, doc_unperm_idx = doc_perm_idx.sort(dim=0, descending=False)
    docs = docs[doc_unperm_idx]

    # NOTE MODIFICATION (BUG)
    word_att_weights = word_att_weights[doc_unperm_idx]
    sent_att_weights = sent_att_weights[doc_unperm_idx]

    return docs, word_att_weights, sent_att_weights
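# Minimal sketch of the numerically stable masked softmax used above: subtract
# the max before exponentiating, repad with zeros (so padding contributes
# nothing), then normalize each row. The lengths here are illustrative.
import torch
from torch.nn.utils.rnn import (PackedSequence, pack_padded_sequence,
                                pad_packed_sequence)

lengths = torch.tensor([3, 1])            # two "documents": 3 and 1 sentences
scores = torch.randn(2, 3)                # padded scores [batch, max_len]
packed = pack_padded_sequence(scores, lengths, batch_first=True)
att = torch.exp(packed.data - packed.data.max())        # stable exponent
att, _ = pad_packed_sequence(PackedSequence(att, packed.batch_sizes),
                             batch_first=True)          # pads are exactly 0
weights = att / att.sum(dim=1, keepdim=True)            # each row sums to 1
assert torch.allclose(weights.sum(dim=1), torch.ones(2))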
def forward(self, input, hx=None):
    is_lstm = self.is_lstm
    is_packed = isinstance(input, PackedSequence)
    if not is_packed:
        seq_len = input.size(1) if self.batch_first else input.size(0)
        max_batch_size = input.size(0) if self.batch_first else input.size(1)
        seq_lens = torch.LongTensor([seq_len for _ in range(max_batch_size)])
        input = pack_padded_sequence(input, seq_lens,
                                     batch_first=self.batch_first)
    else:
        max_batch_size = int(input.batch_sizes[0])
    input, batch_sizes = input.data, input.batch_sizes

    if hx is None:
        hx = input.new_zeros(self.num_layers * self.num_directions,
                             max_batch_size, self.hidden_size,
                             requires_grad=True)
        if is_lstm:
            hx = (hx, hx.new_zeros(hx.size(), requires_grad=True))

    mask_x = input.new_ones((max_batch_size, self.input_size))
    mask_out = input.new_ones((max_batch_size,
                               self.hidden_size * self.num_directions))
    mask_h_ones = input.new_ones((max_batch_size, self.hidden_size))
    nn.functional.dropout(mask_x, p=self.input_dropout,
                          training=self.training, inplace=True)
    nn.functional.dropout(mask_out, p=self.hidden_dropout,
                          training=self.training, inplace=True)

    hidden = input.new_zeros((self.num_layers * self.num_directions,
                              max_batch_size, self.hidden_size))
    if is_lstm:
        cellstate = input.new_zeros((self.num_layers * self.num_directions,
                                     max_batch_size, self.hidden_size))
    for layer in range(self.num_layers):
        output_list = []
        input_seq = PackedSequence(input, batch_sizes)
        mask_h = nn.functional.dropout(mask_h_ones, p=self.hidden_dropout,
                                       training=self.training, inplace=False)
        for direction in range(self.num_directions):
            output_x, hidden_x = self._forward_one(
                layer, direction, input_seq, hx,
                mask_x if layer == 0 else mask_out, mask_h)
            output_list.append(output_x.data)
            idx = self.num_directions * layer + direction
            if is_lstm:
                hidden[idx] = hidden_x[0]
                cellstate[idx] = hidden_x[1]
            else:
                hidden[idx] = hidden_x
        input = torch.cat(output_list, dim=-1)
    if is_lstm:
        hidden = (hidden, cellstate)
    if is_packed:
        output = PackedSequence(input, batch_sizes)
    else:
        input = PackedSequence(input, batch_sizes)
        output, _ = pad_packed_sequence(input, batch_first=self.batch_first)
    return output, hidden
def forward(self, word, word_mask, wordchars, wordchars_mask, pos, feats,
            pretrained, word_orig_idx, sentlens, wordlens):

    def pack(x):
        return pack_padded_sequence(x, sentlens, batch_first=True)

    def get_batch_sizes(sentlens):
        # batch_sizes[i] = number of sentences longer than i tokens
        b = []
        for i in range(max(sentlens)):
            c = len([x for x in sentlens if x > i])
            b.append(c)
        return torch.tensor(b)

    def pad(x):
        return pad_packed_sequence(PackedSequence(x, batch_sizes),
                                   batch_first=True)[0]

    inputs = []
    if self.use_word:
        word_emb = self.word_emb(word)
        word_emb = pack(word_emb)
        inputs += [word_emb]
        batch_sizes = word_emb.batch_sizes
    else:
        batch_sizes = get_batch_sizes(sentlens)
    if self.use_pretrained:
        pretrained_emb = self.pretrained_emb(pretrained)
        pretrained_emb = self.trans_pretrained(pretrained_emb)
        pretrained_emb = pack(pretrained_emb)
        inputs += [pretrained_emb]
    if self.use_char:
        char_reps = self.charmodel(wordchars, wordchars_mask, word_orig_idx,
                                   sentlens, wordlens)
        char_reps = PackedSequence(self.trans_char(self.drop(char_reps.data)),
                                   char_reps.batch_sizes)
        inputs += [char_reps]

    lstm_inputs = torch.cat([x.data for x in inputs], 1)
    lstm_inputs = self.worddrop(lstm_inputs, self.drop_replacement)
    lstm_inputs = self.drop(lstm_inputs)
    lstm_inputs = PackedSequence(lstm_inputs, inputs[0].batch_sizes)
    lstm_outputs, _ = self.taggerlstm(
        lstm_inputs, sentlens,
        hx=(self.taggerlstm_h_init.expand(
                2 * self.args['tag_num_layers'], word.size(0),
                self.args['tag_hidden_dim']).contiguous(),
            self.taggerlstm_c_init.expand(
                2 * self.args['tag_num_layers'], word.size(0),
                self.args['tag_hidden_dim']).contiguous()))
    lstm_outputs = lstm_outputs.data

    pos_hid = F.relu(self.pos_hid(self.drop(lstm_outputs)))
    pos_pred = self.pos_clf(self.drop(pos_hid))
    preds = [pad(pos_pred).max(2)[1]]
    pos = pack(pos).data
    loss = self.crit(pos_pred.view(-1, pos_pred.size(-1)), pos.view(-1))

    if self.share_hid:
        feats_hid = pos_hid
        clffunc = lambda clf, hid: clf(self.drop(hid))
    else:
        feats_hid = F.relu(self.feats_hid(self.drop(lstm_outputs)))
        # TODO: self.training is never set, but check if this is a bug
        if self.training:
            pos_emb = self.pos_emb(pos)
        else:
            pos_emb = self.pos_emb(pos_pred.max(1)[1])
        clffunc = lambda clf, hid: clf(self.drop(hid), self.drop(pos_emb))

    feats_preds = []
    feats = pack(feats).data
    for i in range(len(self.vocab['feats'])):
        feats_pred = clffunc(self.feats_clf[i], feats_hid)
        loss += self.crit(feats_pred.view(-1, feats_pred.size(-1)),
                          feats[:, i].view(-1))
        feats_preds.append(pad(feats_pred).max(2, keepdim=True)[1])
    preds.append(torch.cat(feats_preds, 2))

    return loss, preds
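# Quick illustrative check that a hand-rolled batch-size computation like
# get_batch_sizes above agrees with what pack_padded_sequence derives from
# the same sentence lengths:
import torch
from torch.nn.utils.rnn import pack_padded_sequence

sentlens = [4, 2, 1]
b = [len([x for x in sentlens if x > i]) for i in range(max(sentlens))]
packed = pack_padded_sequence(torch.zeros(3, 4, 8), torch.tensor(sentlens),
                              batch_first=True)
assert torch.equal(torch.tensor(b), packed.batch_sizes)  # tensor([3, 2, 1, 1])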
def _add_embeddings_internal(self, sentences: Union[List[Sentence], Sentence]):
    """Add embeddings to all sentences in the given list of sentences. If
    embeddings are already added, update only if embeddings are non-static."""
    if type(sentences) is Sentence:
        sentences = [sentences]

    self.rnn.zero_grad()

    # embed words in the sentence
    self.embeddings.embed(sentences)

    lengths: List[int] = [len(sentence.tokens) for sentence in sentences]
    longest_token_sequence_in_batch: int = max(lengths)

    pre_allocated_zero_tensor = torch.zeros(
        self.embeddings.embedding_length * longest_token_sequence_in_batch,
        dtype=torch.float,
        device=flair.device,
    )

    all_embs: List[torch.Tensor] = list()
    for sentence in sentences:
        all_embs += [
            emb for token in sentence for emb in token.get_each_embedding()
        ]
        nb_padding_tokens = longest_token_sequence_in_batch - len(sentence)
        if nb_padding_tokens > 0:
            t = pre_allocated_zero_tensor[
                :self.embeddings.embedding_length * nb_padding_tokens]
            all_embs.append(t)

    sentence_tensor = torch.cat(all_embs).view([
        len(sentences),
        longest_token_sequence_in_batch,
        self.embeddings.embedding_length,
    ])

    # before-RNN dropout
    if self.dropout:
        sentence_tensor = self.dropout(sentence_tensor)
    if self.locked_dropout:
        sentence_tensor = self.locked_dropout(sentence_tensor)
    if self.word_dropout:
        sentence_tensor = self.word_dropout(sentence_tensor)

    # reproject if set
    if self.reproject_words:
        sentence_tensor = self.word_reprojection_map(sentence_tensor)

    # push through RNN
    packed = pack_padded_sequence(sentence_tensor, lengths,
                                  enforce_sorted=False, batch_first=True)
    rnn_out, hidden = self.rnn(packed)

    # Attention mechanism is inspired by the word attention network in:
    # https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Text-Classification/blob/ec11e234bbbae2adcd7d665489999410911a9fb4/model.py#L173

    # Feed word annotations through a one-layer MLP to get hidden representations
    hidden_rep = self.word_attention(rnn_out.data)
    hidden_rep = torch.tanh(hidden_rep)

    # Measure the importance of each word as the similarity of its hidden
    # representation to the word-level context vector. To obtain normalized
    # attention weights, perform the softmax in steps:
    # 1. Take the dot product of the attention vectors with the context
    #    vector (i.e. the parameter of the linear layer)
    att_weights = self.word_context_vector(hidden_rep).squeeze(1)  # (n_words)

    # 2. Take the exponent
    max_value = att_weights.max()  # scalar, for numerical stability
    att_weights = torch.exp(att_weights - max_value)  # (n_words)

    # Re-arrange attention weights as sentences
    packed_att_w = PackedSequence(data=att_weights,
                                  batch_sizes=rnn_out.batch_sizes,
                                  sorted_indices=rnn_out.sorted_indices,
                                  unsorted_indices=rnn_out.unsorted_indices)
    att_weights, output_lengths = pad_packed_sequence(
        packed_att_w, batch_first=True)  # (n_sentences, max(words_per_sentence))

    # 3. Normalize (equivalently, F.softmax could have been applied before
    #    the re-arrangement)
    att_weights = att_weights / torch.sum(att_weights, dim=1, keepdim=True)
    # (n_sentences, max(words_per_sentence))

    # Re-arrange word-level RNN outputs as sentences by re-padding with 0s
    # (WORDS -> SENTENCES)
    outputs, _ = pad_packed_sequence(rnn_out, batch_first=True)
    # (n_sentences, max(words_per_sentence), 2 * word_rnn_size)

    # Compute sentence embeddings as the attention-weighted sum of word annotations
    outputs = outputs * att_weights.unsqueeze(2)
    outputs = outputs.sum(dim=1)  # (n_sentences, 2 * word_rnn_size)

    # after-RNN dropout
    if self.dropout:
        outputs = self.dropout(outputs)
    if self.locked_dropout:
        outputs = self.locked_dropout(outputs)

    # extract sentence embeddings
    for sentence_no, length in enumerate(lengths):
        embedding = outputs[sentence_no]
        if self.static_embeddings:
            embedding = embedding.detach()
        sentence = sentences[sentence_no]
        sentence.set_embedding(self.name, embedding)
def pad(x):
    # closes over `batch_sizes` from the enclosing scope
    return pad_packed_sequence(PackedSequence(x, batch_sizes),
                               batch_first=True)[0]
def pad(x):
    # inverse operation to pack_padded_sequence(): pads a packed batch of
    # variable-length sequences
    return pad_packed_sequence(PackedSequence(x, word_emb.batch_sizes),
                               batch_first=True)[0]
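# Sketch of the pattern these pad() helpers support: apply a feed-forward
# layer to PackedSequence.data (one flat [total_steps, features] matrix,
# skipping all padding), then restore the [batch, max_len, features] view.
# The sizes below are illustrative.
import torch
import torch.nn as nn
from torch.nn.utils.rnn import (PackedSequence, pack_padded_sequence,
                                pad_packed_sequence)

linear = nn.Linear(8, 5)
x = torch.randn(3, 6, 8)                          # [batch, max_len, in_dim]
packed = pack_padded_sequence(x, torch.tensor([6, 4, 1]), batch_first=True)
out = linear(packed.data)                         # [total_steps, 5]
padded = pad_packed_sequence(PackedSequence(out, packed.batch_sizes),
                             batch_first=True)[0]  # [3, 6, 5]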
def forward(self, src_tokens, src_lengths):
    if LanguagePairDataset.LEFT_PAD_SOURCE:
        # convert left-padding to right-padding
        src_tokens.data = utils.convert_padding_direction(
            src_tokens.data,
            src_lengths.data,
            self.padding_idx,
            left_to_right=True,
        )
    if self.word_dropout_module is not None:
        src_tokens.data = self.word_dropout_module(src_tokens.data)
    bsz, seqlen = src_tokens.size()

    # embed tokens
    x = self.embed_tokens(src_tokens)
    x = F.dropout(x, p=self.dropout_in, training=self.training)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    # Generate packed seq to deal with varying source seq length
    packed_input, batch_sizes = pack_padded_sequence(x, src_lengths)
    final_hiddens, final_cells = [], []
    next_hiddens = []
    for i, rnn_layer in enumerate(self.layers):
        current_hidden_size = (self.hidden_dim // 2
                               if rnn_layer.is_bidirectional
                               else self.hidden_dim)
        if self.cell_type in ['lstm', 'milstm', 'layer_norm_lstm']:
            prev_hidden = (
                x.data.new(bsz, current_hidden_size).zero_(),
                x.data.new(bsz, current_hidden_size).zero_(),
            )
        else:
            raise Exception(f'{self.cell_type} not implemented')

        hidden, current_output = rnn_layer.forward(packed_input,
                                                   prev_hidden,
                                                   batch_sizes)
        next_hiddens.append(hidden)
        prev_hidden = next_hiddens[-1]

        if self.dropout_out != 0:
            current_output = F.dropout(current_output,
                                       p=self.dropout_out,
                                       training=self.training)

        if self.residual_level is not None and i >= self.residual_level:
            packed_input = packed_input.clone() + current_output
        else:
            packed_input = current_output

    final_hiddens, final_cells = zip(*next_hiddens)

    # Reshape to [num_layer, batch_size, hidden_dim]
    final_hiddens = torch.cat(final_hiddens, dim=0).view(
        self.num_layers, *final_hiddens[0].size())
    final_cells = torch.cat(final_cells, dim=0).view(
        self.num_layers, *final_cells[0].size())

    # [max_seqlen, batch_size, hidden_dim]
    padding_value = -np.inf if self.add_encoder_output_as_decoder_input else 0
    unpacked_output, _ = pad_packed_sequence(
        PackedSequence(packed_input, batch_sizes),
        padding_value=padding_value)

    return (
        unpacked_output,
        final_hiddens,
        final_cells,
        src_lengths,
        src_tokens,
    )
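# Small demo of pad_packed_sequence's padding_value, which the encoder above
# switches between 0 and -inf depending on configuration (pads of -inf can
# never win a downstream max or attention comparison):
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

packed = pack_padded_sequence(torch.ones(2, 3), torch.tensor([3, 1]),
                              batch_first=True)
padded, _ = pad_packed_sequence(packed, batch_first=True,
                                padding_value=float('-inf'))
print(padded)  # tensor([[1., 1., 1.], [1., -inf, -inf]])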
def forward(self, documents, sentences_per_document, words_per_sentence):
    # pack sequences (remove word-pads, DOCUMENTS -> SENTENCES)
    packed_sentences = pack_padded_sequence(
        documents,
        lengths=sentences_per_document.tolist(),
        batch_first=True,
        enforce_sorted=False
    )  # a PackedSequence, where 'data' is the flattened sentences (n_sentences, word_pad_len)

    # re-arrange sentence lengths in the same way (DOCUMENTS -> SENTENCES)
    packed_words_per_sentence = pack_padded_sequence(
        words_per_sentence,
        lengths=sentences_per_document.tolist(),
        batch_first=True,
        enforce_sorted=False
    )  # a PackedSequence, where 'data' is the flattened sentence lengths (n_sentences)

    # word encoder, get sentence vectors
    sentences, word_alphas = self.word_encoder(
        packed_sentences.data, packed_words_per_sentence.data
    )  # (n_sentences, 2 * word_rnn_size), (n_sentences, max(words_per_sentence))
    sentences = self.dropout(sentences)

    # run through sentence-level RNN (PyTorch automatically applies it on the PackedSequence)
    packed_sentences, _ = self.sentence_rnn(PackedSequence(
        data=sentences,
        batch_sizes=packed_sentences.batch_sizes,
        sorted_indices=packed_sentences.sorted_indices,
        unsorted_indices=packed_sentences.unsorted_indices
    ))  # a PackedSequence, where 'data' is the output of the RNN (n_sentences, 2 * sentence_rnn_size)

    # unpack sequences (re-pad with 0s, SENTENCES -> DOCUMENTS)
    # we unpack here because attention weights have to be computed only over
    # sentences in the same document
    documents, _ = pad_packed_sequence(
        packed_sentences, batch_first=True
    )  # (n_documents, max(sentences_per_document), 2 * sentence_rnn_size)

    # sentence-level attention
    # eq.8: u_i = tanh(W_s h_i + b_s)
    u_i = self.W_s(documents)  # (n_documents, max(sentences_per_document), att_size)
    u_i = self.tanh(u_i)       # (n_documents, max(sentences_per_document), att_size)

    # eq.9: alpha_i = softmax(u_i u_s)
    sent_alphas = self.u_s(u_i).squeeze(2)   # (n_documents, max(sentences_per_document))
    sent_alphas = self.softmax(sent_alphas)  # (n_documents, max(sentences_per_document))

    # form document vectors
    # eq.10: v = \sum_i α_i h_i
    documents = documents * sent_alphas.unsqueeze(2)
    # (n_documents, max(sentences_per_document), 2 * sentence_rnn_size)
    documents = documents.sum(dim=1)  # (n_documents, 2 * sentence_rnn_size)

    # also re-arrange word_alphas (SENTENCES -> DOCUMENTS)
    word_alphas, _ = pad_packed_sequence(PackedSequence(
        data=word_alphas,
        batch_sizes=packed_sentences.batch_sizes,
        sorted_indices=packed_sentences.sorted_indices,
        unsorted_indices=packed_sentences.unsorted_indices
    ), batch_first=True)
    # (n_documents, max(sentences_per_document), max(words_per_sentence))

    return documents, word_alphas, sent_alphas
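# Hedged sketch of the rewrapping idiom the forward above uses repeatedly:
# when packed.data is transformed element-wise (same length, same order), the
# result can be rewrapped with the original batch_sizes plus the
# sorted/unsorted index bookkeeping, so later unpacking restores input order.
import torch
from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence

x = torch.randn(4, 7, 16)
lens = torch.tensor([5, 7, 2, 4])                 # deliberately unsorted
packed = pack_padded_sequence(x, lens, batch_first=True, enforce_sorted=False)
doubled = PackedSequence(packed.data * 2, packed.batch_sizes,
                         packed.sorted_indices, packed.unsorted_indices)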
def to_cuda(batch_data):
    # move the data tensor of each PackedSequence to the GPU; batch_sizes
    # stays on the CPU, as PyTorch requires
    sentences, gazetteers, batch_tags = batch_data
    return (PackedSequence(sentences.data.cuda(), sentences.batch_sizes),
            PackedSequence(gazetteers.data.cuda(), gazetteers.batch_sizes),
            PackedSequence(batch_tags.data.cuda(), batch_tags.batch_sizes))
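# The helper above predates PackedSequence.to(); on current PyTorch the same
# move can be written directly. A sketch, assuming a CUDA device is present:
import torch
from torch.nn.utils.rnn import pack_padded_sequence

packed = pack_padded_sequence(torch.randn(2, 3, 4), torch.tensor([3, 2]),
                              batch_first=True)
if torch.cuda.is_available():
    packed = packed.to('cuda')   # moves .data; batch_sizes remains on CPU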
def val_emotion(encoder, decoder, vocab, criterion, data_loaders, tags):
    decoder.eval()
    encoder.eval()
    batch_time = AverageMeter()
    losses = [AverageMeter() for _ in range(len(tags))]
    top5accs = [AverageMeter() for _ in range(len(tags))]
    bleu4s = []
    start = time.time()
    # token ids kept under distinct names so they do not clobber the timer
    start_token = vocab.word2idx['<start>']
    end_token = vocab.word2idx['<end>']
    for j in range(len(tags)):
        # references (true captions) for calculating BLEU-4 score
        references = list()
        # hypotheses (predictions)
        hypotheses = list()
        for i, (images, captions, lengths,
                all_captions) in enumerate(data_loaders[j]):
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            lengths = [l - 1 for l in lengths]
            packed_targets = pack_padded_sequence(input=captions[:, 1:],
                                                  lengths=lengths,
                                                  batch_first=True)
            targets = packed_targets.data
            # Forward pass (no gradients needed for validation)
            with torch.no_grad():
                features = encoder(images)
                outputs, alphas = decoder(captions[:, :-1], lengths, features,
                                          teacher_forcing_ratio=0)
                loss = criterion(outputs, targets)
                alpha_c = 1.
                loss += alpha_c * ((1. - alphas.sum(dim=1))**2).mean()
            # Keep track of metrics
            losses[j].update(loss.item(), sum(lengths))
            top5 = accuracy(outputs, targets, 5)
            top5accs[j].update(top5, sum(lengths))
            batch_time.update(time.time() - start)
            # unpack outputs
            scores = outputs.clone()
            scores = PackedSequence(scores, packed_targets.batch_sizes)
            scores = pad_packed_sequence(scores, batch_first=True)
            all_caps = deepcopy(all_captions)
            for caps in all_caps:
                caps = [c.long().tolist() for c in caps]
                caps = [[w for w in c if w != start_token and w != end_token]
                        for c in caps]
                references.append(caps)
            preds = list()
            for s, l in zip(scores[0], scores[1]):
                _, pred = torch.max(s, dim=1)
                pred = pred.tolist()[:l]
                pred = [w for w in pred if w != start_token and w != end_token]
                preds.append(pred)
            hypotheses.extend(preds)
            assert len(references) == len(hypotheses)
            # free
            del images
            del captions
            del lengths
            del all_captions
            del packed_targets
            del outputs
            del alphas
            torch.cuda.empty_cache()
        # Calculate BLEU-4 scores
        bleu4 = corpus_bleu(references, hypotheses)
        bleu4s.append(bleu4)
        feature = features[0].unsqueeze(0)
        sampled_ids = decoder.sample(feature, start_token=start_token,
                                     end_token=end_token)
        sampled_ids = sampled_ids[0].cpu().numpy()
        # Convert word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        print(sampled_caption)
    top5accs = [top5acc.avg for top5acc in top5accs]
    losses = [loss.avg for loss in losses]
    return batch_time.val, top5accs, losses, bleu4s
def forward(self, W):
    # W: PackedSequence of character ids
    X = PackedSequence(self.embedding_dropout(self.char_embedding(W.data)),
                       W.batch_sizes)
    H, h = self.gru(X)
    Y = PackedSequence(self.out(H.data), W.batch_sizes)
    return Y
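# Sketch of why the module above can embed W.data directly: the data field of
# a packed batch of token ids is a flat 1-D LongTensor, which nn.Embedding
# maps to [total_steps, embed_dim] without disturbing the packing. The sizes
# and the throwaway GRU below are illustrative only.
import torch
import torch.nn as nn
from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence

emb = nn.Embedding(100, 16)
ids = torch.randint(0, 100, (4, 12))                      # [words, max_chars]
W = pack_padded_sequence(ids, torch.tensor([12, 9, 9, 3]), batch_first=True)
X = PackedSequence(emb(W.data), W.batch_sizes)            # still packed
out, h = nn.GRU(16, 8)(X)                                 # GRU accepts packed input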
def pack_wrapper(module, att_feats, att_masks):
    if att_masks is not None:
        packed, inv_ix = sort_pack_padded_sequence(
            att_feats, att_masks.data.long().sum(1))
        return pad_unsort_packed_sequence(
            PackedSequence(module(packed[0]), packed[1]), inv_ix)
    else:
        return module(att_feats)
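# On recent PyTorch, the sort/unsort pair wrapped above can be expressed with
# enforce_sorted=False, which stores sorted_indices/unsorted_indices inside
# the PackedSequence itself. A sketch under that assumption:
import torch
from torch.nn.utils.rnn import (PackedSequence, pack_padded_sequence,
                                pad_packed_sequence)

feats = torch.randn(3, 5, 10)
lens = torch.tensor([2, 5, 3])                            # unsorted lengths
packed = pack_padded_sequence(feats, lens, batch_first=True,
                              enforce_sorted=False)
proj = PackedSequence(torch.tanh(packed.data), packed.batch_sizes,
                      packed.sorted_indices, packed.unsorted_indices)
unpacked, _ = pad_packed_sequence(proj, batch_first=True)  # original order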
def extract(strings, modelPackage, batch_size=50, return_char_scores=False,
            dropout_samples=0):
    resultsDF = pd.DataFrame(list(strings), columns=['string'])
    uniqueDF = resultsDF.drop_duplicates()
    uniqueDF['chars'] = uniqueDF['string'].apply(stringToAscii)
    uniqueDF['len'] = [len(c) for c in uniqueDF['chars']]
    uniqueDF = uniqueDF[uniqueDF['len'] > 0]

    batchResults = []
    for batchDF in dfChunks(uniqueDF, batch_size):
        batchDF = batchDF.sort_values('len', ascending=False)
        with torch.no_grad():
            packedChars, _ = bytesToPacked1Hot(list(batchDF['chars']),
                                               clamp_range=(31, 126),
                                               presorted=True)
            if next(modelPackage['model'].parameters()).is_cuda:
                packedChars = packedToCuda(packedChars)

            # Compute point estimates (no dropout)
            modelPackage['model'].eval()
            packedOutput = modelPackage['model'](packedChars)
            packedProbs = PackedSequence(F.sigmoid(packedOutput.data),
                                         packedOutput.batch_sizes)
            paddedProbs, lengths = torch.nn.utils.rnn.pad_packed_sequence(packedProbs)
            packedEntropies = PackedSequence(
                F.binary_cross_entropy_with_logits(packedOutput.data,
                                                   packedProbs.data,
                                                   reduce=False),
                packedOutput.batch_sizes)
            paddedEntropies, lengths = torch.nn.utils.rnn.pad_packed_sequence(packedEntropies)

            batchDF['probs'] = [x[:l].ravel() for x, l in
                                zip(paddedProbs.t().cpu().numpy(), lengths)]
            # batchDF['entropies'] = [x[:l].cpu().numpy() for x, l in zip(paddedEntropies.t(), lengths)]
            batchDF['entropy'] = [x[:l].sum() for x, l in
                                  zip(paddedEntropies.t().cpu().numpy(), lengths)]
            batchDF['matches'] = [tuple(matchesFromProbs(c, p)) for i, c, p in
                                  batchDF[['chars', 'probs']].itertuples()]

            # Estimate uncertainty using dropout samples (if dropout_samples > 0)
            if dropout_samples:
                samples = [paddedProbs]  # use point estimates as first sample
                for i in range(dropout_samples):
                    modelPackage['model'].train()
                    packedOutput = modelPackage['model'](packedChars)
                    packedProbs = PackedSequence(F.sigmoid(packedOutput.data),
                                                 packedOutput.batch_sizes)
                    paddedProbs, lengths = torch.nn.utils.rnn.pad_packed_sequence(packedProbs)
                    samples.append(paddedProbs)
                stds = torch.cat(samples, dim=2).std(dim=2)
                if return_char_scores:
                    batchDF['dropout_sds'] = [x[:l].ravel() for x, l in
                                              zip(stds.t().cpu().numpy(), lengths)]
                batchDF['dropout_sd'] = stds.sum(dim=0).cpu().numpy()

        if not return_char_scores:
            batchDF = batchDF.drop(['probs'], axis=1)
        batchResults.append(batchDF)

    allBatchesDF = pd.concat(batchResults)
    resultsDF = pd.merge_ordered(resultsDF, allBatchesDF, on='string')
    return resultsDF
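# Minimal sketch of the Monte-Carlo-dropout uncertainty estimate used above:
# keep dropout active (model.train()), run several stochastic forward passes,
# and read the per-element standard deviation as an uncertainty score. The
# model below is a stand-in, not the one from extract().
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(8, 8), nn.Dropout(0.5), nn.Linear(8, 1))
x = torch.randn(16, 8)
model.train()                                   # dropout stays on
with torch.no_grad():
    samples = torch.stack([model(x) for _ in range(10)], dim=0)
uncertainty = samples.std(dim=0)                # [16, 1]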
def iterate(self, src_tuple, target_tuple, training=True):
    # limit number of tokens to avoid gpu overload
    if self.limit_num_tokens is not None:
        src_tuple, target_tuple = self._batch_limit_tokens(
            src_tuple, target_tuple)
    src, src_length = src_tuple
    target, target_length = target_tuple
    batch_dim, time_dim = (0, 1) if self.batch_first else (1, 0)
    num_words = sum(target_length) - target.size(batch_dim)
    if isinstance(src, PackedSequence) or \
            not isinstance(self.model_with_loss, DataParallel):
        if isinstance(src, PackedSequence):
            src = PackedSequence(src.data.to(self.device),
                                 src.batch_sizes.to(self.device))
        else:
            src = src.to(self.device)
        target = target.to(self.device)
    if self.batch_first:
        inputs = (src, target[:, :-1])
        target_labels = target[:, 1:].contiguous()
    else:
        inputs = (src, target[:-1])
        target_labels = target[1:]
    # compute output
    loss, accuracy = self.model_with_loss(inputs, target_labels)
    loss = loss.sum()
    loss_measure = float(loss / num_words)
    if self.avg_loss_time:
        loss /= num_words
    else:
        loss /= target.size(batch_dim)
    accuracy = float(accuracy.sum().float() / num_words)
    if training:
        # compute gradient and do SGD step
        self.optimizer.zero_grad()
        loss.backward()
        if self.grad_clip is not None:
            if isinstance(self.grad_clip, dict):
                clip_encoder = self.grad_clip.get('encoder', 0)
                clip_decoder = self.grad_clip.get('decoder', 0)
                if clip_encoder > 0:
                    clip_grad_norm_(self.model.encoder.parameters(),
                                    clip_encoder)
                if clip_decoder > 0:
                    clip_grad_norm_(self.model.decoder.parameters(),
                                    clip_decoder)
            elif self.grad_clip > 0:  # grad_clip is a number
                clip_grad_norm_(self.model.parameters(), self.grad_clip)
        if self.embedding_grad_clip is not None and self.embedding_grad_clip > 0:
            if hasattr(self.model.encoder, 'embedder'):
                clip_grad_norm_(self.model.encoder.embedder.parameters(),
                                self.embedding_grad_clip)
            if hasattr(self.model.decoder, 'embedder'):
                clip_grad_norm_(self.model.decoder.embedder.parameters(),
                                self.embedding_grad_clip)
        self.optimizer.step()
    return loss_measure, accuracy, num_words
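# Hedged sketch of the per-component clipping done above: clip the encoder
# and decoder parameter groups to different max norms between backward() and
# the optimizer step. The two Linear modules stand in for real components.
import torch
from torch.nn.utils import clip_grad_norm_

encoder, decoder = torch.nn.Linear(4, 4), torch.nn.Linear(4, 4)
loss = decoder(encoder(torch.randn(2, 4))).sum()
loss.backward()
clip_grad_norm_(encoder.parameters(), max_norm=1.0)
clip_grad_norm_(decoder.parameters(), max_norm=0.5)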
def forward(self, sequence, hx=None):
    r"""
    Args:
        sequence (~torch.nn.utils.rnn.PackedSequence):
            A packed variable length sequence.
        hx (~torch.Tensor, ~torch.Tensor):
            A tuple composed of two tensors `h` and `c`. `h` of shape
            ``[num_layers*num_directions, batch_size, hidden_size]`` holds the
            initial hidden state for each element in the batch. `c` of shape
            ``[num_layers*num_directions, batch_size, hidden_size]`` holds the
            initial cell state for each element in the batch. If `hx` is not
            provided, both `h` and `c` default to zero. Default: ``None``.

    Returns:
        ~torch.nn.utils.rnn.PackedSequence, (~torch.Tensor, ~torch.Tensor):
            The first is a packed variable length sequence. The second is a
            tuple of tensors `h` and `c`. `h` of shape
            ``[num_layers*num_directions, batch_size, hidden_size]`` holds the
            hidden state for `t=seq_len`. Like output, the layers can be
            separated using ``h.view(num_layers, num_directions, batch_size,
            hidden_size)`` and similarly for `c`. `c` of shape
            ``[num_layers*num_directions, batch_size, hidden_size]`` holds the
            cell state for `t=seq_len`.
    """
    x, batch_sizes = sequence.data, sequence.batch_sizes.tolist()
    batch_size = batch_sizes[0]
    h_n, c_n = [], []

    if hx is None:
        ih = x.new_zeros(self.num_layers * self.num_directions,
                         batch_size, self.hidden_size)
        h, c = ih, ih
    else:
        h, c = self.permute_hidden(hx, sequence.sorted_indices)
    h = h.view(self.num_layers, self.num_directions,
               batch_size, self.hidden_size)
    c = c.view(self.num_layers, self.num_directions,
               batch_size, self.hidden_size)

    for i in range(self.num_layers):
        x = torch.split(x, batch_sizes)
        if self.training:
            mask = SharedDropout.get_mask(x[0], self.dropout)
            x = [i * mask[:len(i)] for i in x]
        x_i, (h_i, c_i) = self.layer_forward(x, (h[i, 0], c[i, 0]),
                                             self.f_cells[i], batch_sizes)
        if self.bidirectional:
            x_b, (h_b, c_b) = self.layer_forward(x, (h[i, 1], c[i, 1]),
                                                 self.b_cells[i],
                                                 batch_sizes, True)
            x_i = torch.cat((x_i, x_b), -1)
            h_i = torch.stack((h_i, h_b))
            c_i = torch.stack((c_i, c_b))
        x = x_i
        h_n.append(h_i)
        c_n.append(c_i)

    x = PackedSequence(x, sequence.batch_sizes,
                       sequence.sorted_indices, sequence.unsorted_indices)
    hx = torch.cat(h_n, 0), torch.cat(c_n, 0)
    hx = self.permute_hidden(hx, sequence.unsorted_indices)

    return x, hx
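# Sketch of the time-major iteration trick used above: splitting
# PackedSequence.data by batch_sizes yields one tensor per timestep, with the
# t-th tensor holding the batch_sizes[t] still-active sequences.
import torch
from torch.nn.utils.rnn import pack_padded_sequence

x = torch.randn(3, 4, 2)
packed = pack_padded_sequence(x, torch.tensor([4, 2, 1]), batch_first=True)
steps = torch.split(packed.data, packed.batch_sizes.tolist())
assert [s.size(0) for s in steps] == packed.batch_sizes.tolist()  # [3, 2, 1, 1]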