    def forward(self, inputs: PackedSequence,  # pylint: disable=arguments-differ
                # pylint: disable=unused-argument
                initial_state: torch.Tensor = None) -> Tuple[PackedSequence, torch.Tensor]:
        """
        Parameters
        ----------
        inputs : ``PackedSequence``, required.
            A batch first ``PackedSequence`` to run the stacked LSTM over.
        initial_state : Tuple[torch.Tensor, torch.Tensor], optional, (default = None)
            Currently, this is ignored.

        Returns
        -------
        output_sequence : ``PackedSequence``
            The encoded sequence of shape (batch_size, sequence_length, hidden_size)
        final_states: ``torch.Tensor``
            The per-layer final (state, memory) states of the LSTM, each with shape
            (num_layers, batch_size, hidden_size).
        """
        inputs, lengths = pad_packed_sequence(inputs, batch_first=True)

        # Kernel takes sequence length first tensors.
        inputs = inputs.transpose(0, 1)

        sequence_length, batch_size, _ = inputs.size()
        accumulator_shape = [self.num_layers, sequence_length + 1, batch_size, self.hidden_size]
        state_accumulator = Variable(inputs.data.new(*accumulator_shape).zero_(), requires_grad=False)
        memory_accumulator = Variable(inputs.data.new(*accumulator_shape).zero_(), requires_grad=False)

        dropout_weights = inputs.data.new().resize_(self.num_layers, batch_size, self.hidden_size).fill_(1.0)
        if self.training:
            # Normalize by 1 - dropout_prob to preserve the output statistics of the layer.
            dropout_weights.bernoulli_(1 - self.recurrent_dropout_probability)\
                .div_((1 - self.recurrent_dropout_probability))

        dropout_weights = Variable(dropout_weights, requires_grad=False)
        gates = Variable(inputs.data.new().resize_(self.num_layers,
                                                   sequence_length,
                                                   batch_size, 6 * self.hidden_size))

        lengths_variable = Variable(torch.IntTensor(lengths))
        implementation = _AlternatingHighwayLSTMFunction(self.input_size,
                                                         self.hidden_size,
                                                         num_layers=self.num_layers,
                                                         train=self.training)
        output, _ = implementation(inputs, self.weight, self.bias, state_accumulator,
                                   memory_accumulator, dropout_weights, lengths_variable, gates)

        # TODO(Mark): Also return the state here by using index_select with the lengths so we can use
        # it as a Seq2VecEncoder.
        output = output.transpose(0, 1)
        output = pack_padded_sequence(output, lengths, batch_first=True)
        return output, None
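
# A minimal, self-contained sketch of the pack/unpack round trip the forward
# above relies on; the sizes and tensors below are illustrative and are not
# taken from the model itself.
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

batch_size, max_len, feature_dim = 3, 5, 4
padded = torch.randn(batch_size, max_len, feature_dim)
lengths = torch.tensor([5, 3, 2])

packed = pack_padded_sequence(padded, lengths, batch_first=True)

# Inside forward: unpad to a dense tensor, then switch to a sequence-length-first
# layout, which is what the highway-LSTM kernel expects.
dense, recovered_lengths = pad_packed_sequence(packed, batch_first=True)
time_major = dense.transpose(0, 1)      # (max_len, batch_size, feature_dim)

# After the kernel runs: transpose back and re-pack before returning.
output = time_major.transpose(0, 1)
repacked = pack_padded_sequence(output, recovered_lengths, batch_first=True)
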
Example #2
def _cudaize_packed(t):
    data = Variable(t.data)
    if torch.cuda.is_available():
        data = data.cuda()
    return PackedSequence(data, t.batch_sizes)
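
# Hedged usage sketch for the helper above. It assumes the legacy Variable API
# (as the helper itself does); on PyTorch >= 0.4, Variable(t.data) is simply a
# tensor. It falls back to the CPU when CUDA is not available.
import torch
from torch.nn.utils.rnn import pack_padded_sequence

padded = torch.randn(2, 4, 3)
packed = pack_padded_sequence(padded, [4, 2], batch_first=True)
packed_on_device = _cudaize_packed(packed)
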
Example #3
    def forward(self,
                node_features,
                edge_features,
                neighbor_indices,
                neighbor_masks,
                h=None,
                c=None):
        """
        Update node_features and edge_features via graph convolution and pooling.
        Most of the complexity arises from having to handle a different number
        of neighbors for each atom. We use PackedSequence, pad_packed_sequence
        and pack_padded_sequence from torch.nn.utils.rnn to convert between the
        fixed-length and variable-length representations.
        Args:
            node_features (Tensor, (batch_size, node_embedding_len)):
            edge_features (Tensor, (batch_size, neighbor_len, edge_embedding_len)):
            neighbor_indices (Tensor, (batch_size, neighbor_len)):
            neighbor_masks (Tensor, (batch_size, neighbor_len)):
            h: hidden state for the LSTM remember function
            c: cell state for the LSTM remember function
        Returns:
            node_features_updated, edge_features_updated, (h, c)
        """
        batch_len, neighbor_len, _ = edge_features.shape

        # compute the number of real neighbors of each atom in the batch from
        # neighbor_masks: in the fixed-length masks, "1" marks a real neighbor
        # and "0" marks padding.
        neighbor_lens = neighbor_masks.sum(dim=1)
        # an atom with no neighbors is treated as its own (and only) neighbor,
        # so that packing never sees a zero-length sequence.
        neighbor_masks[neighbor_lens == 0] = torch.Tensor(
            [1.] + [0.] * (neighbor_len - 1)).to(self.device)
        neighbor_lens[neighbor_lens == 0] = 1

        # concatenate each node's features with its neighbors' node features;
        # edge_features are appended below.
        pair_features = torch.cat([
            node_features.unsqueeze(1).expand(batch_len, neighbor_len,
                                              self.node_embedding_len),
            node_features[neighbor_indices, :]
        ], dim=2)

        concat_features = torch.cat((pair_features, edge_features), dim=2)

        # change concat_features with fixed length to variable length sequence.
        packed_concat_features = pack_padded_sequence(concat_features,
                                                      neighbor_lens,
                                                      batch_first=True,
                                                      enforce_sorted=False)

        # update edge_features and convert back to a fixed-length tensor
        edge_features_updated, _ = pad_packed_sequence(
            PackedSequence(
                self.edge_bn(self.edge_linear(packed_concat_features.data)),
                packed_concat_features.batch_sizes,
                packed_concat_features.sorted_indices,
                packed_concat_features.unsorted_indices),
            batch_first=True,
            total_length=neighbor_len)

        # use residual link in the edge feature update
        # edge_features_updated = self.activation(edge_features + padding_tensor(
        #     edge_features_updated, neighbor_len, batch_len, self.device))
        edge_features_updated = self.activation(edge_features +
                                                edge_features_updated)

        # update packed_concat_features
        packed_concat_features = pack_padded_sequence(torch.cat(
            (pair_features, edge_features_updated), dim=2),
                                                      neighbor_lens,
                                                      batch_first=True,
                                                      enforce_sorted=False)

        # calculate multi-head features for nodes
        head_features_list = list()
        for attention_linear, value_linear, attention_bn in zip(
                self.attention_linears, self.value_linears,
                self.attention_bns):

            # apply attention_linear to packed_concat_features
            head_attention, _ = pad_packed_sequence(PackedSequence(
                attention_linear(packed_concat_features.data),
                packed_concat_features.batch_sizes,
                packed_concat_features.sorted_indices,
                packed_concat_features.unsorted_indices),
                                                    batch_first=True,
                                                    total_length=neighbor_len)

            # Masked softmax: fill the padded neighbor positions with -inf so
            # they receive zero attention weight after the softmax.
            masked_attention = head_attention[:, :, -1:].masked_fill(
                (1 - neighbor_masks.unsqueeze(2)).bool(), float('-inf'))
            head_attention = self.attention_softmax(masked_attention)

            # change head_attention to variable length PackedSequence.
            packed_head_attentions = pack_padded_sequence(head_attention,
                                                          neighbor_lens,
                                                          batch_first=True,
                                                          enforce_sorted=False)

            packed_head_values = PackedSequence(
                value_linear(packed_concat_features.data),
                packed_concat_features.batch_sizes,
                packed_concat_features.sorted_indices,
                packed_concat_features.unsorted_indices)

            # head_features tensor
            head_features = self.activation(
                attention_bn(
                    self.attention_drop_layer(packed_head_attentions.data) *
                    packed_head_values.data))

            # change head_features to tensor of fixed length
            head_features, _ = pad_packed_sequence(PackedSequence(
                head_features, packed_head_attentions.batch_sizes,
                packed_head_attentions.sorted_indices,
                packed_head_attentions.unsorted_indices),
                                                   batch_first=True,
                                                   total_length=neighbor_len)

            # use sum pooling over neighbors as default
            pooled_head_features = torch.sum(head_features, dim=1)
            head_features_list.append(pooled_head_features)

        # concat multi-head node_features
        concat_heads_features = torch.cat(head_features_list, dim=1)

        # if n_head * attention_len != node_embedding_len
        if self.attention_out_linear is not None:
            node_features_updated = self.output_bn(
                self.activation(
                    self.after_concat_heads_bn(
                        self.after_concat_heads_linear(
                            concat_heads_features))))
        else:
            node_features_updated = self.output_bn(concat_heads_features)

        if self.remember_func == "residual":
            node_features_updated = node_features + node_features_updated
        elif self.remember_func == "lstm":
            node_features_updated, (h, c) = self.lstm_func(
                node_features_updated[None, :], h, c)
            node_features_updated = node_features_updated[0]
            node_features_updated = self.lstm_bn(node_features_updated)
        else:
            raise ValueError("remember_func invalid.")

        return node_features_updated, edge_features_updated, (h, c)
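
# The update above repeatedly applies a layer to packed.data, re-wraps the
# result in a PackedSequence, and pads it back out. A minimal sketch of that
# pattern with made-up sizes (nothing below comes from the model itself):
import torch
import torch.nn as nn
from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence, pad_packed_sequence

batch_len, neighbor_len, feat_dim, out_dim = 4, 6, 8, 5
features = torch.randn(batch_len, neighbor_len, feat_dim)
lengths = torch.tensor([6, 3, 2, 1])
linear = nn.Linear(feat_dim, out_dim)

packed = pack_padded_sequence(features, lengths, batch_first=True,
                              enforce_sorted=False)
# Apply the layer to the flat data tensor while keeping the packing metadata.
packed_out = PackedSequence(linear(packed.data), packed.batch_sizes,
                            packed.sorted_indices, packed.unsorted_indices)
# Pad back to (batch_len, neighbor_len, out_dim); total_length keeps the
# original neighbor dimension even if the longest sequence is shorter.
padded_out, _ = pad_packed_sequence(packed_out, batch_first=True,
                                    total_length=neighbor_len)
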
Example #4
    def forward(self, input, hx=None):
        is_packed = isinstance(input, PackedSequence)
        # if packed, the input carries the max batch size as batch_sizes[0]
        if is_packed:
            input, batch_sizes = input
            batch_size = batch_sizes[0]
        else:
            batch_sizes = None
            batch_size = input.size(0) if self.batch_first else input.size(1)
        # if the user doesn't provide hx (and cx), zero-initialized states are created
        if hx is None:
            num_directions = 2 if self.bidirectional else 1
            hx = torch.autograd.Variable(input.data.new(
                self.num_layers * num_directions, batch_size,
                self.hidden_size).zero_(),
                                         requires_grad=False)
            if self.mode == 'LSTM':  #LSTM requires a tuple in hx
                hx = (hx, hx)

        has_flat_weights = None  #= list(p.data.data_ptr() for p in self.parameters()) == self._data_ptrs
        # TODO: add asserts to avoid shape mismatches

        # gather the weights from self.parameters() in declaration order: wx, wh, bx, bh
        seq_length = input.size(0)
        weight_idx = 1
        bx = None
        bh = None
        for weight in self.parameters():
            if weight_idx == 1:
                wx = weight
            elif weight_idx == 2:
                wh = weight
            elif weight_idx == 3:
                bx = weight
            elif weight_idx == 4:
                bh = weight
            weight_idx = weight_idx + 1

        # Pytorch makes the assumption that all parameters passed to Function.forward() must
        # be an instance of "Variable", and it doesn't accept NoneType. Make it happy.
        if bx is None:
            bx = Variable(torch.Tensor((0)))
        if bh is None:
            bh = Variable(torch.Tensor((0)))
        # check whether the input seq_length or batch_size exceeds the recorded maximum
        if seq_length > self.max_seq_length:
            self.max_seq_length = seq_length
            self.update_workspace = True
        if batch_size > self.max_batch_size:
            self.max_batch_size = batch_size
            self.update_workspace = True
        # update the workspace on the first call, or whenever the maxima grow
        if self.update_workspace:
            #print("Updating the workspace ...")
            buffer_size = get_workspace_size(self.mode, self.training,
                                             self.num_layers,
                                             self.bidirectional,
                                             self.max_seq_length,
                                             self.max_batch_size,
                                             self.input_size, self.hidden_size)
            self.workspace = Variable(torch.zeros(buffer_size),
                                      requires_grad=False)
            self.update_workspace = False

        _func = self.IRNNFunc

        if self.mode == 'LSTM':
            cx = hx[1]
            hx = hx[0]
            self.y, self.hy, self.cy = _func(self.workspace, input, hx, cx, wx,
                                             wh, bx, bh)
            if is_packed:
                output = PackedSequence(self.y, batch_sizes)
            else:
                output = self.y
            return output, (self.hy, self.cy)
        elif self.mode == 'GRU':
            self.y, self.hy = _func(self.workspace, input, hx, wx, wh, bx, bh)
            if is_packed:
                output = PackedSequence(self.y, batch_sizes)
            else:
                output = self.y
            return output, self.hy
Example #5
    def forward(self, docs, doc_lengths, sent_lengths, attention_masks,
                token_type_ids):
        """
        :param docs: encoded document-level data; LongTensor (num_docs, padded_doc_length, padded_sent_length)
        :param doc_lengths: unpadded document lengths; LongTensor (num_docs)
        :param sent_lengths: unpadded sentence lengths; LongTensor (num_docs, max_sent_len)
        :param attention_masks: BERT attention masks; LongTensor (num_docs, padded_doc_length, padded_sent_length)
        :param token_type_ids: BERT token type IDs; LongTensor (num_docs, padded_doc_length, padded_sent_length)
        :return: sentences embeddings, docs permutation indices, docs batch sizes, word attention weights
        """

        # Sort documents by decreasing order in length
        doc_lengths, doc_perm_idx = doc_lengths.sort(dim=0, descending=True)
        docs = docs[doc_perm_idx]
        sent_lengths = sent_lengths[doc_perm_idx]

        # Make a long batch of sentences by removing pad-sentences
        # i.e. `docs` was of size (num_docs, padded_doc_length, padded_sent_length)
        # -> `packed_sents.data` is now of size (num_sents, padded_sent_length)
        packed_sents = pack_padded_sequence(docs,
                                            lengths=doc_lengths.tolist(),
                                            batch_first=True)

        # effective batch size at each timestep
        docs_valid_bsz = packed_sents.batch_sizes

        # Make a long batch of sentence lengths by removing pad-sentences
        # i.e. `sent_lengths` was of size (num_docs, padded_doc_length)
        # -> `packed_sent_lengths.data` is now of size (num_sents)
        packed_sent_lengths = pack_padded_sequence(
            sent_lengths, lengths=doc_lengths.tolist(), batch_first=True)

        # Make a long batch of attention masks by removing pad-sentences
        # i.e. `docs` was of size (num_docs, padded_doc_length, padded_sent_length)
        # -> `packed_attention_masks.data` is now of size (num_sents, padded_sent_length)
        packed_attention_masks = pack_padded_sequence(
            attention_masks, lengths=doc_lengths.tolist(), batch_first=True)

        # Make a long batch of token_type_ids by removing pad-sentences
        # i.e. `docs` was of size (num_docs, padded_doc_length, padded_sent_length)
        # -> `token_type_ids.data` is now of size (num_sents, padded_sent_length)
        packed_token_type_ids = pack_padded_sequence(
            token_type_ids, lengths=doc_lengths.tolist(), batch_first=True)

        sents, sent_lengths, attn_masks, token_types = (
            packed_sents.data, packed_sent_lengths.data,
            packed_attention_masks.data, packed_token_type_ids.data)

        # Sort sents by decreasing sentence length, keeping the attention masks
        # and token type ids aligned with the sorted sentences
        sent_lengths, sent_perm_idx = sent_lengths.sort(dim=0, descending=True)
        sents = sents[sent_perm_idx]
        attn_masks = attn_masks[sent_perm_idx]
        token_types = token_types[sent_perm_idx]

        embeddings, pooled_out = self.bert_model(sents,
                                                 attention_mask=attn_masks,
                                                 token_type_ids=token_types)

        packed_words = pack_padded_sequence(embeddings,
                                            lengths=sent_lengths.tolist(),
                                            batch_first=True)

        # effective batch size at each timestep
        sentences_valid_bsz = packed_words.batch_sizes

        u_i = torch.tanh(self.word_weight(packed_words.data))
        u_w = self.context_weight(u_i).squeeze(1)
        val = u_w.max()
        att = torch.exp(u_w - val)

        # Restore as sentences by repadding
        att, _ = pad_packed_sequence(PackedSequence(att, sentences_valid_bsz),
                                     batch_first=True)

        att_weights = att / torch.sum(att, dim=1, keepdim=True)

        # Restore as sentences by repadding
        sents, _ = pad_packed_sequence(packed_words, batch_first=True)

        sents = sents * att_weights.unsqueeze(2)
        sents = sents.sum(dim=1)

        # Restore the original order of sentences (undo the first sorting)
        _, sent_unperm_idx = sent_perm_idx.sort(dim=0, descending=False)
        sents = sents[sent_unperm_idx]

        att_weights = att_weights[sent_unperm_idx]

        return sents, doc_perm_idx, docs_valid_bsz, att_weights
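
# A small sketch of the document-to-sentence flattening used above, with toy
# sizes; the variable names are illustrative and not tied to the model.
import torch
from torch.nn.utils.rnn import pack_padded_sequence

num_docs, max_doc_len, max_sent_len = 3, 4, 7
docs = torch.randint(0, 100, (num_docs, max_doc_len, max_sent_len))
doc_lengths = torch.tensor([2, 4, 1])

# Sort by decreasing document length and remember how to undo the sort.
doc_lengths, perm_idx = doc_lengths.sort(dim=0, descending=True)
docs = docs[perm_idx]

# Drop pad-sentences: packed.data has shape (num_real_sentences, max_sent_len).
packed = pack_padded_sequence(docs, lengths=doc_lengths.tolist(), batch_first=True)
sentences = packed.data

# ... encode `sentences` here, one row per real sentence ...

# The inverse permutation restores the original document order later.
_, unperm_idx = perm_idx.sort(dim=0, descending=False)
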
Example #6
    def forward(self, word, word_mask, wordchars, wordchars_mask, upos, xpos, ufeats, pretrained, word_orig_idx,
                sentlens, wordlens, orig_idx=None, morph_dict=None, start=None, end=None):

        def pack(x):  # Packs a Tensor containing padded sequences of variable length.
            return pack_padded_sequence(x, sentlens, batch_first=True)

        inputs = []
        if self.args['word_emb_dim'] > 0:
            word_emb = self.word_emb(word)
            word_emb = pack(word_emb)
            inputs += [word_emb]

        if self.args['pretrain']:
            pretrained_emb = self.pretrained_emb(pretrained)
            pretrained_emb = self.trans_pretrained(pretrained_emb)
            pretrained_emb = pack(pretrained_emb)
            inputs += [pretrained_emb]

        def pad(x):  # inverse operation to pack_padded_sequence(). Pads a packed batch of variable length sequences.
            return pad_packed_sequence(PackedSequence(x, word_emb.batch_sizes), batch_first=True)[0]

        if self.args['char'] and self.args['char_emb_dim'] > 0:
            char_reps = self.charmodel(wordchars, wordchars_mask, word_orig_idx, sentlens, wordlens)
            char_reps = PackedSequence(self.trans_char(self.drop(char_reps.data)), char_reps.batch_sizes)
            inputs += [char_reps]

        lstm_inputs = torch.cat([x.data for x in inputs],1)
        lstm_inputs = self.worddrop(lstm_inputs, self.drop_replacement)
        lstm_inputs = self.drop(lstm_inputs)
        lstm_inputs = PackedSequence(lstm_inputs, inputs[0].batch_sizes)

        lstm_outputs, _ = self.taggerlstm(lstm_inputs, sentlens, hx=(
            self.taggerlstm_h_init.expand(2 * self.args['num_layers'], word.size(0), self.args['hidden_dim']).contiguous(),
            self.taggerlstm_c_init.expand(2 * self.args['num_layers'], word.size(0), self.args['hidden_dim']).contiguous()))
        lstm_outputs = lstm_outputs.data

        upos_hid = F.relu(self.upos_hid(self.drop(lstm_outputs)))
        upos_pred = self.upos_clf(self.drop(upos_hid))
        preds = [pad(upos_pred).max(2)[1]]

        upos = pack(upos).data
        loss = self.crit(upos_pred.view(-1, upos_pred.size(-1)), upos.view(-1))

        if self.share_hid:
            xpos_hid = upos_hid
            ufeats_hid = upos_hid

            clffunc = lambda clf, hid: clf(self.drop(hid))
        else:
            xpos_hid = F.relu(self.xpos_hid(self.drop(lstm_outputs)))
            ufeats_hid = F.relu(self.ufeats_hid(self.drop(lstm_outputs)))

            # this is where we get upos embeddings
            if self.training:
                upos_emb = self.upos_emb(upos)
            else:
                # get the top 5 upos predictions
                best_5 = [sorted(range(len(x)), key=lambda i: x[i], reverse=True)[:5] for x in upos_pred]
                # save upos emb for later
                upos_temp = self.upos_emb
                upos_emb = self.upos_emb(upos_pred.max(1)[1])

            clffunc = lambda clf, hid: clf(self.drop(hid), self.drop(upos_emb))  # ORG

        xpos = pack(xpos).data
        if isinstance(self.vocab['xpos'], CompositeVocab):
            xpos_preds = []
            for i in range(len(self.vocab['xpos'])):
                xpos_pred = clffunc(self.xpos_clf[i], xpos_hid)
                loss += self.crit(xpos_pred.view(-1, xpos_pred.size(-1)), xpos[:, i].view(-1))
                xpos_preds.append(pad(xpos_pred).max(2, keepdim=True)[1])
            preds.append(torch.cat(xpos_preds, 2))
        else:
            xpos_pred = clffunc(self.xpos_clf, xpos_hid)
            loss += self.crit(xpos_pred.view(-1, xpos_pred.size(-1)), xpos.view(-1))
            preds.append(pad(xpos_pred).max(2)[1])

        ufeats_preds = []
        ufeats = pack(ufeats).data
        for i in range(len(self.vocab['feats'])):
            ufeats_pred = clffunc(self.ufeats_clf[i], ufeats_hid)
            loss += self.crit(ufeats_pred.view(-1, ufeats_pred.size(-1)), ufeats[:, i].view(-1))
            ufeats_preds.append(pad(ufeats_pred).max(2, keepdim=True)[1])
        preds.append(torch.cat(ufeats_preds,2))

        # post-filter only if a morphological dictionary is present
        if morph_dict:

            # get the most likely ufeats tag for each top 5 upos tags predicted for a word
            feats_coeffs = list()
            for r in range(5):  # condition ufeats on a different upos tag embedding each time
                upos_2 = torch.LongTensor([x[r] for x in best_5])
                upos_emb2 = upos_temp(upos_2)
                clffunc_temp = lambda clf, hid: clf(self.drop(hid), self.drop(upos_emb2))

                ufeats_preds_temp = []
                for i in range(len(self.vocab['feats'])):
                    ufeats_pred = clffunc_temp(self.ufeats_clf[i], ufeats_hid)
                    ufeats_preds_temp.append(pad(ufeats_pred).max(2, keepdim=True)[1])
                feats_coeffs.append(torch.cat(ufeats_preds_temp, 2))

            # unmap all tags into readable format and unsort them into the original order that matches the sentence order
            upos_seqs = [self.vocab['upos'].unmap(up) for up in preds[0].tolist()]
            xpos_seqs = [self.vocab['xpos'].unmap(up) for up in preds[1].tolist()]
            feats_seqs = [self.vocab['feats'].unmap(up) for up in preds[2].tolist()]
            pred_tokens = [[[upos_seqs[i][j], xpos_seqs[i][j], feats_seqs[i][j]] for j in range(sentlens[i])] for i in
                           range(word.size(0))]
            pred_tokens = utils.unsort(pred_tokens, orig_idx)

            # pair the tags with the right words in the right sentences.
            sntncs = self.doc.sentences[start:end]
            sent_tokens = [[x.text for x in sent.tokens] for sent in sntncs]
            pair = [x for x in zip(sent_tokens, pred_tokens)]

            # 5 most likely upos tags for the token
            coeff = utils.unsort(pad(upos_pred).tolist(), orig_idx)
            coeff_max = [[sorted(range(len(x)), key=lambda i: x[i], reverse=True)[:5] for x in y] for y in coeff]

            # the most likely feats tag for each top 5 predicted upos tag
            fct = []
            for f in feats_coeffs:
                fct.append(utils.unsort(f, orig_idx))
            fct2 = [list(zip(*[fct[0][i], fct[1][i], fct[2][i], fct[3][i], fct[4][i]])) for i in range(len(fct[0]))]
            feats_coeffs = [[list(j[i]) for i in range(len(j))] for j in fct2]

            # initialise hunspell for Lithuanian
            if self.args['lang'] == 'lt':
                root = os.path.dirname(os.getcwd())
                hunspell = Hunchecker('lt-LT_morphology', root + '/data_files/hunspell')

            print('Post-filtering...')
            for p in range(len(pair)):  # get a sentence
                words = pair[p][0]
                tags = pair[p][1]

                a = 0
                while a < len(words):

                    lemma, upos, xpos, feats = morph_dict.find(words[a])
                    if upos is None:
                        lemma, upos, xpos, feats = morph_dict.find(words[a].lower())
                    else:
                        lemma2, upos2, xpos2, feats2 = morph_dict.find(words[a].lower())
                        if lemma2:
                            for i in range(len(lemma2)):
                                if upos2[i] not in upos or feats2[i] not in feats:
                                    lemma += [lemma2[i]]
                                    upos += [upos2[i]]
                                    xpos += [xpos2[i]]
                                    feats += [feats2[i]]

                    if self.args['lang'] == 'lt':
                        if upos is None:
                            lemma, upos, xpos, feats = hunspell.hunspell_to_conll(words[a])
                        else:
                            lemma_h, upos_h, xpos_h, feats_h = hunspell.hunspell_to_conll(words[a])
                            if upos_h is not None:
                                for i in range(len(upos_h)):
                                    if upos_h[i] not in upos or feats_h[i] not in feats:
                                        lemma += [lemma_h[i]]
                                        upos += [upos_h[i]]
                                        xpos += [xpos_h[i]]
                                        feats += [feats_h[i]]

                    if upos is not None:
                        if tags[a][0] not in upos:
                            new_upos = None
                            tag_idx = None
                            if len(upos) > 1:
                                max_values = self.vocab['upos'].unmap(coeff_max[p][a][1:])
                                # go through the values in the order of the most likely one
                                for m in range(len(max_values)):  # for every max upos tag
                                    # found one of the possible predicted values in the upos list
                                    if max_values[m] in upos:
                                        indices = [i for i, x in enumerate(upos) if x == max_values[m]]
                                        if len(indices) > 1:  # more than one upos list items matches the max value item
                                            # check if an exact match can be found, using the most informative ufeats tag
                                            for d in indices:
                                                if feats[d] == self.vocab['feats'].unmap(feats_coeffs[p][a][1:])[m] and \
                                                        upos[d] == max_values[m]:
                                                    new_upos = upos[d]
                                                    tag_idx = d
                                                    break
                                        if len(indices) == 1 or new_upos is None:
                                            new_upos = max_values[m]
                                            tag_idx = upos.index(max_values[m])
                                        break
                                if new_upos is None:  # last resort
                                    new_upos = upos[0]
                                    tag_idx = 0
                            else:  # only one item in upos list
                                new_upos = upos[0]
                                tag_idx = 0

                            new_xpos = xpos[tag_idx]
                            new_feats = feats[tag_idx]
                            # let the tagger deal with multiword tokens itself
                            if ('Hyph=Yes' not in new_feats and 'Hyph=Yes' in tags[a][2]) or (
                                    'Hyph=Yes' in new_feats and 'Hyph=Yes' not in tags[a][2]):
                                new_upos = new_xpos = new_feats = None

                            if new_upos is not None:
                                preds[0][orig_idx.index(p)][a] = self.vocab['upos'].map([new_upos])[0]
                                # sme has a 2D torch here, LT has 3D
                                if not isinstance(self.vocab['xpos'], CompositeVocab):
                                    preds[1][orig_idx.index(p)][a] = self.vocab['xpos'].map([new_xpos])[0]
                                else:
                                    preds[1][orig_idx.index(p)][a] = torch.LongTensor(
                                        self.vocab['xpos'].map([new_xpos])[0])
                                preds[2][orig_idx.index(p)][a] = torch.LongTensor(
                                    self.vocab['feats'].map([new_feats])[0])

                        else:
                            new_xpos = new_feats = None
                            all_found = False
                            for x in range(len(xpos)):
                                if tags[a][1] == xpos[x] and tags[a][2] == feats[x] and upos[x] == tags[a][0]:
                                    all_found = True
                                    break

                            if not all_found:
                                if len(upos) == 1 or (False not in [feats[a] == feats[a + 1] for a in
                                                                    range(len(feats) - 1)] and False not in [
                                                          upos[a] == upos[a + 1] for a in range(len(upos) - 1)]):
                                    new_feats = feats[0]
                                    if '*' not in tags[a][1]:
                                        new_xpos = xpos[0]
                                    all_found = True

                            if not all_found:
                                if len([i for i, x in enumerate(upos) if x == tags[a][0]]) == 1:
                                    new_feats = feats[upos.index(tags[a][0])]
                                    if '*' not in tags[a][1]:
                                        new_xpos = xpos[upos.index(tags[a][0])]
                                    all_found = True

                            if not all_found:
                                found_ft = False
                                for x in range(len(xpos)):
                                    if tags[a][2] == feats[x] and upos[x] == tags[a][0]:
                                        found_ft = True
                                        if xpos[x] != tags[a][1] and '*' not in tags[a][1]:
                                            new_xpos = xpos[x]
                                        break

                                if not found_ft:
                                    for x in range(len(xpos)):
                                        if tags[a][1] == xpos[x] and tags[a][2] != feats[x] and upos[x] == tags[a][0]:
                                            new_feats = feats[x]
                                            break

                            if new_feats:
                                if ('Hyph=Yes' not in new_feats and 'Hyph=Yes' in tags[a][2]) or (
                                        'Hyph=Yes' in new_feats and 'Hyph=Yes' not in tags[a][2]):
                                    # let the tagger deal with multiword tokens itself
                                    new_xpos = new_feats = None

                            if new_xpos is not None:
                                # non composite has a 2D torch here, composite has 3D
                                if not isinstance(self.vocab['xpos'], CompositeVocab):
                                    preds[1][orig_idx.index(p)][a] = self.vocab['xpos'].map([new_xpos])[0]
                                else:
                                    preds[1][orig_idx.index(p)][a] = torch.LongTensor(
                                        self.vocab['xpos'].map([new_xpos])[0])
                            if new_feats is not None:
                                preds[2][orig_idx.index(p)][a] = torch.LongTensor(
                                    self.vocab['feats'].map([new_feats])[0])

                    a += 1

            print('Post-filtering complete.')
        return loss, preds
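
# The pack()/pad() closures above are the core trick in this tagger: every
# per-token tensor is packed with the same sentence lengths, processed on its
# flat .data tensor, and padded back for decoding. A toy sketch of that idea
# (names and sizes are illustrative only):
import torch
import torch.nn as nn
from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence, pad_packed_sequence

sentlens = [5, 3, 2]
batch, max_len, emb_dim, n_tags = len(sentlens), max(sentlens), 16, 10
word_emb = torch.randn(batch, max_len, emb_dim)
clf = nn.Linear(emb_dim, n_tags)

def pack(x):
    return pack_padded_sequence(x, sentlens, batch_first=True)

packed = pack(word_emb)

def pad(x):
    return pad_packed_sequence(PackedSequence(x, packed.batch_sizes),
                               batch_first=True)[0]

scores = clf(packed.data)          # (num_real_tokens, n_tags)
preds = pad(scores).max(2)[1]      # (batch, max_len) padded predictions
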
Example #7
    def forward(self, input_seqs):
        """ Forward pass.

        # Arguments:
            input_seqs: Can be one of Numpy array, Torch.LongTensor, Torch.Variable, Torch.PackedSequence.

        # Return:
            Same format as input format (except for PackedSequence returned as Variable).
        """
        # Check whether the input is a torch.LongTensor or not a torch.Variable
        # (assume a NumPy array in that case), and remember the format so the
        # output can be returned in the same format.
        return_numpy = False
        return_tensor = False
        if isinstance(input_seqs, (torch.LongTensor, torch.cuda.LongTensor)):
            input_seqs = Variable(input_seqs)
            return_tensor = True
        elif not isinstance(input_seqs, Variable):
            input_seqs = Variable(
                torch.from_numpy(input_seqs.astype('int64')).long())
            return_numpy = True

        # If the input is not already a PackedSequence, pack it ourselves
        reorder_output = False
        if not isinstance(input_seqs, PackedSequence):
            ho = self.lstm_0.weight_hh_l0.data.new(2,
                                                   input_seqs.size()[0],
                                                   self.hidden_size).zero_()
            co = self.lstm_0.weight_hh_l0.data.new(2,
                                                   input_seqs.size()[0],
                                                   self.hidden_size).zero_()

            # Reorder batch by sequence length
            input_lengths = torch.LongTensor([
                torch.max(input_seqs[i, :].data.nonzero()) + 1
                for i in range(input_seqs.size()[0])
            ])
            input_lengths, perm_idx = input_lengths.sort(0, descending=True)
            input_seqs = input_seqs[perm_idx][:, :input_lengths.max()]

            # Pack sequence and work on data tensor to reduce embeddings/dropout computations
            packed_input = pack_padded_sequence(input_seqs,
                                                input_lengths.cpu().numpy(),
                                                batch_first=True)
            reorder_output = True
        else:
            # a PackedSequence has no .size(); its max batch size is batch_sizes[0]
            ho = self.lstm_0.weight_hh_l0.data.new(
                2, int(input_seqs.batch_sizes[0]), self.hidden_size).zero_()
            co = self.lstm_0.weight_hh_l0.data.new(
                2, int(input_seqs.batch_sizes[0]), self.hidden_size).zero_()
            input_lengths = input_seqs.batch_sizes
            packed_input = input_seqs

        hidden = (Variable(ho, requires_grad=False),
                  Variable(co, requires_grad=False))

        # Embed with an activation function to bound the values of the embeddings
        x = self.embed(packed_input.data)
        x = nn.Tanh()(x)

        # PyTorch's dropout2d operates on axis 1, which is what we want here
        x = self.embed_dropout(x)

        # Update packed sequence data for RNN
        packed_input = PackedSequence(x, packed_input.batch_sizes)

        # skip-connection from embedding to output eases gradient-flow and allows access to lower-level features
        # ordering of the way the merge is done is important for consistency with the pretrained model
        lstm_0_output, _ = self.lstm_0(packed_input, hidden)
        lstm_1_output, _ = self.lstm_1(lstm_0_output, hidden)

        # Update packed sequence data for attention layer
        packed_input = PackedSequence(
            torch.cat(
                (lstm_1_output.data, lstm_0_output.data, packed_input.data),
                dim=1), packed_input.batch_sizes)

        input_seqs, _ = pad_packed_sequence(packed_input, batch_first=True)

        x, att_weights = self.attention_layer(input_seqs, input_lengths)

        # output class probabilities or penultimate feature vector
        if not self.feature_output:
            x = self.final_dropout(x)
            outputs = self.output_layer(x)
        else:
            outputs = x

        # Reorder output if needed
        if reorder_output:
            reordered = Variable(outputs.data.new(outputs.size()))
            reordered[perm_idx] = outputs
            outputs = reordered

        # Adapt return format if needed
        if return_tensor:
            outputs = outputs.data
        if return_numpy:
            outputs = outputs.data.numpy()

        if self.return_attention:
            return outputs, att_weights
        else:
            return outputs
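
# Sketch of the skip-connection trick above: because all three packed sequences
# share the same batch_sizes, their flat .data tensors can be concatenated
# feature-wise and re-wrapped as a single PackedSequence. Sizes below are
# illustrative stand-ins, not the real model dimensions.
import torch
from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence

emb_dim, hidden = 6, 4
padded = torch.randn(3, 5, emb_dim)
packed_emb = pack_padded_sequence(padded, [5, 4, 2], batch_first=True)

# Stand-ins for the outputs of two bidirectional LSTM layers run on packed_emb.
lstm_0_out = PackedSequence(torch.randn(packed_emb.data.size(0), 2 * hidden),
                            packed_emb.batch_sizes)
lstm_1_out = PackedSequence(torch.randn(packed_emb.data.size(0), 2 * hidden),
                            packed_emb.batch_sizes)

combined = PackedSequence(
    torch.cat((lstm_1_out.data, lstm_0_out.data, packed_emb.data), dim=1),
    packed_emb.batch_sizes)
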
Example #8
def main():
    import argparse
    parser = argparse.ArgumentParser('Script for training embedding model on SCOP.')

    parser.add_argument('--dev', action='store_true', help='use train/dev split')

    parser.add_argument('-m', '--model', choices=['ssa', 'ua', 'me'], default='ssa', help='alignment scoring method for comparing sequences in embedding space [ssa: soft symmetric alignment, ua: uniform alignment, me: mean embedding] (default: ssa)')
    parser.add_argument('--allow-insert', action='store_true', help='model insertions (default: false)')

    parser.add_argument('--norm', choices=['l1', 'l2'], default='l1', help='comparison norm (default: l1)')

    parser.add_argument('--rnn-type', choices=['lstm', 'gru'], default='lstm', help='type of RNN block to use (default: lstm)')
    parser.add_argument('--embedding-dim', type=int, default=100, help='embedding dimension (default: 100)')
    parser.add_argument('--input-dim', type=int, default=512, help='dimension of input to RNN (default: 512)')
    parser.add_argument('--rnn-dim', type=int, default=512, help='hidden units of RNNs (default: 512)')
    parser.add_argument('--num-layers', type=int, default=3, help='number of RNN layers (default: 3)')
    parser.add_argument('--dropout', type=float, default=0, help='dropout probability (default: 0)')

    parser.add_argument('--epoch-size', type=int, default=100000, help='number of examples per epoch (default: 100,000)')
    parser.add_argument('--epoch-scale', type=int, default=5, help='scaling on epoch size (default: 5)')
    parser.add_argument('--num-epochs', type=int, default=100, help='number of epochs (default: 100)')

    parser.add_argument('--batch-size', type=int, default=64, help='minibatch size (default: 64)')

    parser.add_argument('--weight-decay', type=float, default=0, help='L2 regularization (default: 0)')
    parser.add_argument('--lr', type=float, default=0.001)

    parser.add_argument('--tau', type=float, default=0.5, help='sampling proportion exponent (default: 0.5)')
    parser.add_argument('--augment', type=float, default=0, help='probability of resampling amino acid for data augmentation (default: 0)')
    parser.add_argument('--lm', help='pretrained LM to use as initial embedding')

    parser.add_argument('-o', '--output', help='output file path (default: stdout)')
    parser.add_argument('--save-prefix', help='path prefix for saving models')
    parser.add_argument('-d', '--device', type=int, default=-2, help='compute device to use')

    args = parser.parse_args()


    prefix = args.output


    ## set the device
    d = args.device
    use_cuda = (d != -1) and torch.cuda.is_available()
    if d >= 0:
        torch.cuda.set_device(d)

    ## make the datasets
    astral_train_path = 'data/SCOPe/astral-scopedom-seqres-gd-sel-gs-bib-95-2.06.train.fa'
    astral_testpairs_path = 'data/SCOPe/astral-scopedom-seqres-gd-sel-gs-bib-95-2.06.test.sampledpairs.txt'
    if args.dev:
        astral_train_path = 'data/SCOPe/astral-scopedom-seqres-gd-sel-gs-bib-95-2.06.train.train.fa'
        astral_testpairs_path = 'data/SCOPe/astral-scopedom-seqres-gd-sel-gs-bib-95-2.06.train.dev.sampledpairs.txt'

    alphabet = Uniprot21()

    print('# loading training sequences:', astral_train_path, file=sys.stderr)
    with open(astral_train_path, 'rb') as f:
        names_train, structs_train, sequences_train = scop.parse_astral(f, encoder=alphabet)    
    x_train = [torch.from_numpy(x).long() for x in sequences_train]
    if use_cuda:
        x_train = [x.cuda() for x in x_train]
    y_train = torch.from_numpy(structs_train)

    print('# loaded', len(x_train), 'training sequences', file=sys.stderr)


    print('# loading test sequence pairs:', astral_testpairs_path, file=sys.stderr)
    test_pairs_table = pd.read_csv(astral_testpairs_path, sep='\t') 
    x0_test = [x.encode('utf-8').upper() for x in test_pairs_table['sequence_A']]
    x0_test = [torch.from_numpy(alphabet.encode(x)).long() for x in x0_test]
    x1_test = [x.encode('utf-8').upper() for x in test_pairs_table['sequence_B']]
    x1_test = [torch.from_numpy(alphabet.encode(x)).long() for x in x1_test]
    if use_cuda:
        x0_test = [x.cuda() for x in x0_test]
        x1_test = [x.cuda() for x in x1_test]
    y_test = test_pairs_table['similarity'].values
    y_test = torch.from_numpy(y_test).long()

    dataset_test = PairedDataset(x0_test, x1_test, y_test)
    print('# loaded', len(x0_test), 'test pairs', file=sys.stderr)

    ## make the dataset iterators
    scale = args.epoch_scale

    epoch_size = args.epoch_size
    batch_size = args.batch_size

    # precompute the similarity pairs
    y_train_levels = torch.cumprod((y_train.unsqueeze(1) == y_train.unsqueeze(0)).long(), 2)

    # data augmentation by resampling amino acids
    augment = None
    p = 0
    if args.augment > 0:
        p = args.augment
        trans = torch.ones(len(alphabet),len(alphabet))
        trans = trans/trans.sum(1, keepdim=True)
        if use_cuda:
            trans = trans.cuda()
        augment = MultinomialResample(trans, p)
    print('# resampling amino acids with p:', p, file=sys.stderr)
    dataset_train = AllPairsDataset(x_train, y_train_levels, augment=augment)

    similarity = y_train_levels.numpy().sum(2)
    levels,counts = np.unique(similarity, return_counts=True)
    order = np.argsort(levels)
    levels = levels[order]
    counts = counts[order]

    print('#', levels, file=sys.stderr)
    print('#', counts/np.sum(counts), file=sys.stderr)

    weight = counts**0.5
    print('#', weight/np.sum(weight), file=sys.stderr)

    weight = counts**0.33
    print('#', weight/np.sum(weight), file=sys.stderr)

    weight = counts**0.25
    print('#', weight/np.sum(weight), file=sys.stderr)

    tau = args.tau
    print('# using tau:', tau, file=sys.stderr)
    print('#', counts**tau/np.sum(counts**tau), file=sys.stderr)
    weights = counts**tau/counts
    weights = weights[similarity].ravel()
    #weights = np.ones(len(dataset_train))
    sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, epoch_size)

    # two training dataset iterators for sampling pairs of sequences for training
    train_iterator = torch.utils.data.DataLoader(dataset_train
                                                , batch_size=batch_size
                                                , sampler=sampler
                                                , collate_fn=collate_paired_sequences
                                                )
    test_iterator = torch.utils.data.DataLoader(dataset_test
                                               , batch_size=batch_size
                                               , collate_fn=collate_paired_sequences
                                               )
    

    ## initialize the model 
    rnn_type = args.rnn_type
    rnn_dim = args.rnn_dim
    num_layers = args.num_layers

    embedding_size = args.embedding_dim
    input_dim = args.input_dim

    dropout = args.dropout
    
    allow_insert = args.allow_insert

    print('# initializing model with:', file=sys.stderr)
    print('# embedding_size:', embedding_size, file=sys.stderr)
    print('# input_dim:', input_dim, file=sys.stderr)
    print('# rnn_dim:', rnn_dim, file=sys.stderr)
    print('# num_layers:', num_layers, file=sys.stderr)
    print('# dropout:', dropout, file=sys.stderr)
    print('# allow_insert:', allow_insert, file=sys.stderr)

    compare_type = args.model
    print('# comparison method:', compare_type, file=sys.stderr)

    lm = None
    if args.lm is not None:
        lm = torch.load(args.lm)
        lm.eval()
        ## do not update the LM parameters
        for param in lm.parameters():
            param.requires_grad = False
        print('# using LM:', args.lm, file=sys.stderr)

    if num_layers > 0:
        embedding = src.models.embedding.StackedRNN(len(alphabet), input_dim, rnn_dim, embedding_size
                                                   , nlayers=num_layers, dropout=dropout, lm=lm)
    else:
        embedding = src.models.embedding.Linear(len(alphabet), input_dim, embedding_size, lm=lm)

    if args.norm == 'l1':
        norm = src.models.comparison.L1()
        print('# norm: l1', file=sys.stderr)
    elif args.norm == 'l2':
        norm = src.models.comparison.L2()
        print('# norm: l2', file=sys.stderr)
    model = src.models.comparison.OrdinalRegression(embedding, 5, align_method=compare_type
                                                   , compare=norm, allow_insertions=allow_insert
                                                   )

    if use_cuda:
        model.cuda()

    ## setup training parameters and optimizer
    num_epochs = args.num_epochs

    weight_decay = args.weight_decay
    lr = args.lr

    print('# training with Adam: lr={}, weight_decay={}'.format(lr, weight_decay), file=sys.stderr)
    params = [p for p in model.parameters() if p.requires_grad]
    optim = torch.optim.Adam(params, lr=lr, weight_decay=weight_decay)

    ## train the model
    print('# training model', file=sys.stderr)

    save_prefix = args.save_prefix
    output = args.output
    if output is None:
        output = sys.stdout
    else:
        output = open(output, 'w')
    digits = int(np.floor(np.log10(num_epochs))) + 1
    line = '\t'.join(['epoch', 'split', 'loss', 'mse', 'accuracy', 'r', 'rho' ])
    print(line, file=output)


    for epoch in range(num_epochs):
        # train epoch
        model.train()
        it = 0
        n = 0
        loss_estimate = 0
        mse_estimate = 0
        acc_estimate = 0

        for x0,x1,y in train_iterator: # zip(train_iterator_0, train_iterator_1):

            if use_cuda:
                y = y.cuda()
            y = Variable(y)

            b = len(x0)
            x = x0 + x1

            x,order = pack_sequences(x)
            x = PackedSequence(Variable(x.data), x.batch_sizes)
            z = model(x) # embed the sequences
            z = unpack_sequences(z, order)

            z0 = z[:b]
            z1 = z[b:]

            logits = []
            for i in range(b):
                z_a = z0[i]
                z_b = z1[i]
                logits.append(model.score(z_a, z_b))
            logits = torch.stack(logits, 0)

            loss = F.binary_cross_entropy_with_logits(logits, y.float())
            loss.backward()

            optim.step()
            optim.zero_grad()
            model.clip() # projected gradient for bounding ordinal regression parameters

            p = F.sigmoid(logits) 
            ones = p.new(b,1).zero_() + 1
            p_ge = torch.cat([ones, p], 1)
            p_lt = torch.cat([1-p, ones], 1)
            p = p_ge*p_lt
            p = p/p.sum(1,keepdim=True) # make sure p is normalized

            _,y_hard = torch.max(p, 1)
            levels = torch.arange(5).to(p.device)
            y_hat = torch.sum(p*levels, 1)
            y = torch.sum(y.data, 1)

            loss = F.cross_entropy(p, y) # calculate cross entropy loss from p vector

            correct = torch.sum((y == y_hard).float())
            mse = torch.sum((y.float() - y_hat)**2)

            n += b
            delta = b*(loss.item() - loss_estimate)
            loss_estimate += delta/n
            delta = correct.item() - b*acc_estimate
            acc_estimate += delta/n
            delta = mse.item() - b*mse_estimate
            mse_estimate += delta/n

            
            if (n - b)//100 < n//100:
                print('# [{}/{}] training {:.1%} loss={:.5f}, mse={:.5f}, acc={:.5f}'.format(epoch+1
                                                                , num_epochs
                                                                , n/epoch_size
                                                                , loss_estimate
                                                                , mse_estimate 
                                                                , acc_estimate 
                                                                )
                     , end='\r', file=sys.stderr)
        print(' '*80, end='\r', file=sys.stderr)
        line = '\t'.join([str(epoch+1).zfill(digits), 'train', str(loss_estimate)
                         , str(mse_estimate), str(acc_estimate), '-', '-'])
        print(line, file=output)
        output.flush()

        # eval and save model
        model.eval()

        y = []
        logits = []
        with torch.no_grad():
            for x0,x1,y_mb in test_iterator:

                if use_cuda:
                    y_mb = y_mb.cuda()
                y.append(y_mb.long())

                b = len(x0)
                x = x0 + x1

                x,order = pack_sequences(x)
                x = PackedSequence(Variable(x.data), x.batch_sizes)
                z = model(x) # embed the sequences
                z = unpack_sequences(z, order)

                z0 = z[:b]
                z1 = z[b:]

                for i in range(b):
                    z_a = z0[i]
                    z_b = z1[i]
                    logits.append(model.score(z_a, z_b))

            y = torch.cat(y, 0)
            logits = torch.stack(logits, 0)

            p = F.sigmoid(logits).data 
            ones = p.new(p.size(0),1).zero_() + 1
            p_ge = torch.cat([ones, p], 1)
            p_lt = torch.cat([1-p, ones], 1)
            p = p_ge*p_lt
            p = p/p.sum(1,keepdim=True) # make sure p is normalized

            loss = F.cross_entropy(p, y).item()

            _,y_hard = torch.max(p, 1)
            levels = torch.arange(5).to(p.device)
            y_hat = torch.sum(p*levels, 1)

            accuracy = torch.mean((y == y_hard).float()).item()
            mse = torch.mean((y.float() - y_hat)**2).item()

            y = y.cpu().numpy()
            y_hat = y_hat.cpu().numpy()

            r,_ = pearsonr(y_hat, y)
            rho,_ = spearmanr(y_hat, y)

        line = '\t'.join([str(epoch+1).zfill(digits), 'test', str(loss), str(mse)
                         , str(accuracy), str(r), str(rho)])
        print(line, file=output)
        output.flush()


        # save the model
        if save_prefix is not None:
            save_path = save_prefix + '_epoch' + str(epoch+1).zfill(digits) + '.sav'
            model.cpu()
            torch.save(model, save_path)
            if use_cuda:
                model.cuda()
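
# Sketch of the ordinal-output conversion used in the train and eval loops
# above: the model emits 4 sigmoid scores interpreted as P(y >= k); the code
# heuristically combines them as p_ge * p_lt and renormalizes to obtain a
# distribution over the 5 similarity levels. All values below are illustrative.
import torch

logits = torch.randn(8, 4)                   # batch of 8, 4 ordinal thresholds
p = torch.sigmoid(logits)                    # P(y >= k) for k = 1..4
ones = p.new_ones(p.size(0), 1)
p_ge = torch.cat([ones, p], dim=1)           # prepend P(y >= 0) = 1
p_lt = torch.cat([1 - p, ones], dim=1)       # append P(y < 5) = 1
probs = p_ge * p_lt
probs = probs / probs.sum(1, keepdim=True)   # normalize to a distribution

levels = torch.arange(5, dtype=probs.dtype)
expected_level = (probs * levels).sum(1)     # soft prediction used for the MSE
hard_level = probs.argmax(dim=1)             # hard prediction used for accuracy
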
Example #9
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):

        word_ids_lengths = attention_mask.sum(axis=1)
        word_embeddings = self.lookup(input_ids)

        packed_word_embeddings = pack_padded_sequence(word_embeddings,
                                                      lengths=word_ids_lengths,
                                                      batch_first=True,
                                                      enforce_sorted=False)

        words_representation, _ = self.rnn(packed_word_embeddings)
        # This implementation attends over the RNN output features
        # (sentence_embeddings); the original paper uses the hidden state.
        word_attention = self.word_attention(words_representation.data)
        word_attention = torch.tanh(word_attention)

        # Take the dot-product of the attention vectors with the context vector (i.e. parameter of linear layer)
        word_attention = self.word_context_vector(word_attention).squeeze(
            1)  # (n_words)

        # Compute softmax over the dot-product manually
        # Manually because they have to be computed only over words in the same sentence

        # First, take the exponent
        max_value = word_attention.max(
        )  # scalar, for numerical stability during exponent calculation
        word_attention = torch.exp(word_attention - max_value)  # (n_words)

        # Re-arrange as sentences by re-padding with 0s (WORDS -> SENTENCES)
        word_attention, _ = pad_packed_sequence(
            PackedSequence(
                data=word_attention,
                batch_sizes=words_representation.batch_sizes,
                sorted_indices=words_representation.sorted_indices,
                unsorted_indices=words_representation.unsorted_indices),
            batch_first=True)  # (n_sentences, max(words_per_sentence))

        # Calculate softmax values as now words are arranged in their respective sentences
        word_alphas = word_attention / torch.sum(
            word_attention, dim=1,
            keepdim=True)  # (n_sentences, max(words_per_sentence))

        # Similarly re-arrange word-level RNN outputs as sentences by re-padding with 0s (WORDS -> SENTENCES)
        sentences, _ = pad_packed_sequence(
            words_representation, batch_first=True
        )  # (n_sentences, max(words_per_sentence), 2 * word_rnn_size)

        # Find sentence embeddings
        sentences = sentences * word_alphas.unsqueeze(
            2)  # (n_sentences, max(words_per_sentence), 2 * word_rnn_size)

        # gets the representation for the sentence
        sentences = sentences.sum(dim=1)  # (n_sentences, 2 * word_rnn_size)

        logits = self.classifier(sentences)

        outputs = (logits, sentences)

        if labels is not None:
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(logits.view(-1, self.num_labels),
                            labels.view(-1, self.num_labels))

            outputs = (loss, ) + outputs

        return outputs
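
# Sketch of the manual, numerically stable softmax over re-padded attention
# scores used above: subtract the max before exponentiating, re-pad so padded
# word slots become exactly 0, then normalize per sentence. Shapes are
# illustrative and not taken from the model.
import torch
from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence, pad_packed_sequence

scores_padded = torch.randn(3, 6)                      # (n_sentences, max_words)
lengths = torch.tensor([6, 4, 2])
packed = pack_padded_sequence(scores_padded, lengths, batch_first=True,
                              enforce_sorted=False)

flat = packed.data
flat = torch.exp(flat - flat.max())                    # stable exponent over all words

att, _ = pad_packed_sequence(
    PackedSequence(flat, packed.batch_sizes,
                   packed.sorted_indices, packed.unsorted_indices),
    batch_first=True)                                  # padded slots are 0
alphas = att / att.sum(dim=1, keepdim=True)            # each row sums to 1 over real words
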
Example #10
    def forward(self,
                list_progs,
                context_embeds,
                ll=None,
                target_list=None,
                gen_method='sample',
                sizes=None,
                has_stopped=None):
        n_prog = len(list_progs)
        prog_int_seqs = [
            torch.LongTensor([self.vocab[c] for c in expr] +
                             [self.tok_stop]).to(context_embeds.device)
            for expr in list_progs
        ]
        lengths = [v.size(0) for v in prog_int_seqs]
        padded_int_seqs = pad_sequence(prog_int_seqs,
                                       batch_first=False,
                                       padding_value=self.tok_pad)

        packed_seq = pack_padded_sequence(padded_int_seqs,
                                          lengths=lengths,
                                          batch_first=False,
                                          enforce_sorted=False)
        tok_embed = self.tok_embed(packed_seq.data)
        packed_input = PackedSequence(
            data=tok_embed,
            batch_sizes=packed_seq.batch_sizes,
            sorted_indices=packed_seq.sorted_indices,
            unsorted_indices=packed_seq.unsorted_indices)

        h = self.ctx2h(context_embeds).view(n_prog, 2 * self.rnn_layers,
                                            -1).transpose(0, 1)
        c = self.ctx2c(context_embeds).view(n_prog, 2 * self.rnn_layers,
                                            -1).transpose(0, 1)
        packed_out, _ = self.lstm(packed_input, (h, c))
        unpacked_out, _ = pad_packed_sequence(packed_out)

        # positions to mod/del
        expr_poses = (padded_int_seqs == self.tok_constexpr) | (
            padded_int_seqs == self.tok_subexpr)
        embed_expr = unpacked_out[expr_poses]
        if embed_expr.shape[0]:
            mod_scores = self.modify_score(embed_expr)
            del_scores = self.del_score(embed_expr)
        else:
            mod_scores = del_scores = None
        # positions to insert
        ins_poses = padded_int_seqs == self.tok_start
        insert_scores = self.insert_score(unpacked_out[ins_poses])

        # positions to stop
        stop_poses = padded_int_seqs == self.tok_stop
        stop_scores = self.stop_score(unpacked_out[stop_poses])
        logits = loc_score(mod_scores, del_scores, insert_scores, stop_scores,
                           expr_poses, ins_poses, stop_poses, has_stopped)
        log_prob = F.log_softmax(logits, dim=0).t().contiguous()
        ll_target = None
        predecessors = None
        if target_list is None:
            if gen_method == 'sample':
                target = torch.multinomial(torch.exp(log_prob), 1)
            elif gen_method == 'argmax':
                target = torch.argmax(log_prob, dim=1)
            elif gen_method.startswith('beam'):
                beam_size = int(gen_method.split('-')[-1])
                raw_scores = log_prob + ll if ll is not None else log_prob
                predecessors, target, ll_target, sizes = beam_step(
                    raw_scores, sizes, beam_size)
                update_embed = unpacked_out[target, predecessors]
            else:
                raise NotImplementedError
        else:
            target = torch.LongTensor(target_list).to(log_prob.device)
        target = target.view(-1)
        if predecessors is None:
            ll_step = log_prob[range(n_prog), target]
            ll_target = ll_step.view(
                ll.shape) + ll if ll is not None else ll_step
            update_embed = unpacked_out[target, range(n_prog)]
        return ll_target.view(-1, 1), target, update_embed, predecessors, sizes
Example #11
    def iterate(self, src_tuple, target_tuple, training=True):
        # limit number of tokens to avoid gpu overload
        if self.limit_num_tokens is not None:
            src_tuple, target_tuple = self._batch_limit_tokens(
                src_tuple, target_tuple)
        src, src_length = src_tuple
        target, target_length = target_tuple
        batch_dim, time_dim = (0, 1) if self.batch_first else (1, 0)
        num_words = sum(target_length) - target.size(batch_dim)

        if isinstance(src, PackedSequence) or \
                not isinstance(self.model_with_loss, DataParallel):
            if isinstance(src, PackedSequence):
                src = PackedSequence(src.data.to(self.device),
                                     src.batch_sizes.to(self.device))
            else:
                src = src.to(self.device)
            target = target.to(self.device)

        if self.batch_first:
            inputs = (src, target[:, :-1])
            target_labels = target[:, 1:].contiguous()
        else:
            inputs = (src, target[:-1])
            target_labels = target[1:]

        # compute output
        loss, accuracy = self.model_with_loss(inputs, target_labels)

        loss = loss.sum()
        loss_measure = float(loss / num_words)
        if self.avg_loss_time:
            loss /= num_words
        else:
            loss /= target.size(batch_dim)
        accuracy = float(accuracy.sum().float() / num_words)

        if training:
            # compute gradient and do SGD step
            self.optimizer.zero_grad()
            loss.backward()
            if self.grad_clip is not None:
                if isinstance(self.grad_clip, dict):
                    clip_encoder = self.grad_clip.get('encoder', 0)
                    clip_decoder = self.grad_clip.get('decoder', 0)
                    if clip_encoder > 0:
                        clip_grad_norm_(
                            self.model.encoder.parameters(), clip_encoder)
                    if clip_decoder > 0:
                        clip_grad_norm_(
                            self.model.decoder.parameters(), clip_decoder)
                elif self.grad_clip > 0:  # grad_clip is a number
                    clip_grad_norm_(self.model.parameters(), self.grad_clip)
            if self.embedding_grad_clip is not None and self.embedding_grad_clip > 0:
                if hasattr(self.model.encoder, 'embedder'):
                    clip_grad_norm_(self.model.encoder.embedder.parameters(),
                                    self.embedding_grad_clip)
                if hasattr(self.model.decoder, 'embedder'):
                    clip_grad_norm_(self.model.decoder.embedder.parameters(),
                                    self.embedding_grad_clip)
            self.optimizer.step()
        return loss_measure, accuracy, num_words
Example #12
    def forward(self, sentences, words_per_sentence):
        """
        Forward propagation.
        :param sentences: encoded sentence-level data, a tensor of dimension (n_sentences, word_pad_len, emb_size)
        :param words_per_sentence: sentence lengths, a tensor of dimension (n_sentences)
        :return: sentence embeddings, attention weights of words
        """

        # Get word embeddings, apply dropout
        sentences = self.dropout(self.embeddings(
            sentences))  # (n_sentences, word_pad_len, emb_size)

        # Re-arrange as words by removing word-pads (SENTENCES -> WORDS)
        packed_words = pack_padded_sequence(
            sentences,
            lengths=words_per_sentence.tolist(),
            batch_first=True,
            enforce_sorted=False
        )  # a PackedSequence object, where 'data' is the flattened words (n_words, word_emb)

        # Apply the word-level RNN over the word embeddings (PyTorch automatically applies it on the PackedSequence)
        packed_words, _ = self.word_rnn(
            packed_words
        )  # a PackedSequence object, where 'data' is the output of the RNN (n_words, 2 * word_rnn_size)

        # Find attention vectors by applying the attention linear layer on the output of the RNN
        att_w = self.word_attention(packed_words.data)  # (n_words, att_size)
        att_w = torch.tanh(att_w)  # (n_words, att_size)
        # Take the dot-product of the attention vectors with the context vector (i.e. parameter of linear layer)
        att_w = self.word_context_vector(att_w).squeeze(1)  # (n_words)

        # Compute softmax over the dot-product manually
        # Manually because they have to be computed only over words in the same sentence

        # First, take the exponent
        max_value = att_w.max(
        )  # scalar, for numerical stability during exponent calculation
        att_w = torch.exp(att_w - max_value)  # (n_words)

        # Re-arrange as sentences by re-padding with 0s (WORDS -> SENTENCES)
        att_w, _ = pad_packed_sequence(
            PackedSequence(data=att_w,
                           batch_sizes=packed_words.batch_sizes,
                           sorted_indices=packed_words.sorted_indices,
                           unsorted_indices=packed_words.unsorted_indices),
            batch_first=True)  # (n_sentences, max(words_per_sentence))

        # Calculate softmax values as now words are arranged in their respective sentences
        word_alphas = att_w / torch.sum(
            att_w, dim=1,
            keepdim=True)  # (n_sentences, max(words_per_sentence))

        # Similarly re-arrange word-level RNN outputs as sentences by re-padding with 0s (WORDS -> SENTENCES)
        sentences, _ = pad_packed_sequence(
            packed_words, batch_first=True
        )  # (n_sentences, max(words_per_sentence), 2 * word_rnn_size)

        # Find sentence embeddings
        sentences = sentences * word_alphas.unsqueeze(
            2)  # (n_sentences, max(words_per_sentence), 2 * word_rnn_size)
        sentences = sentences.sum(dim=1)  # (n_sentences, 2 * word_rnn_size)

        return sentences, word_alphas
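The module above packs with enforce_sorted=False, so batches do not need to be pre-sorted by length, and pad_packed_sequence restores the original row order. A small standalone check of that behaviour (toy tensors, assuming a PyTorch version where enforce_sorted exists):

import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

x = torch.arange(12, dtype=torch.float).view(3, 4, 1)   # (batch=3, max_len=4, feat=1)
lengths = torch.tensor([2, 4, 3])                        # deliberately unsorted
packed = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
y, out_lengths = pad_packed_sequence(packed, batch_first=True)
assert torch.equal(out_lengths, lengths)   # lengths come back in the original order
assert torch.equal(y[1], x[1])             # the full-length row is reproduced exactly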
Example #13
    def forward(self, documents, sentences_per_document, words_per_sentence):
        """
        Forward propagation.
        :param documents: encoded document-level data, a tensor of dimensions (n_documents, sent_pad_len, word_pad_len)
        :param sentences_per_document: document lengths, a tensor of dimensions (n_documents)
        :param words_per_sentence: sentence lengths, a tensor of dimensions (n_documents, sent_pad_len)
        :return: document embeddings
        """

        # Re-arrange as sentences by removing sentence-pads (DOCUMENTS -> SENTENCES)
        packed_sentences = pack_padded_sequence(
            documents,
            lengths=sentences_per_document.tolist(),
            batch_first=True,
            enforce_sorted=False
        )  # a PackedSequence object, where 'data' is the flattened sentences (n_sentences, word_pad_len)

        # Re-arrange sentence lengths in the same way (DOCUMENTS -> SENTENCES)
        packed_words_per_sentence = pack_padded_sequence(
            words_per_sentence,
            lengths=sentences_per_document.tolist(),
            batch_first=True,
            enforce_sorted=False
        )  # a PackedSequence object, where 'data' is the flattened sentence lengths (n_sentences)

        # Find sentence embeddings by applying the word-level attention module
        sentences, word_alphas = self.word_attention(
            packed_sentences.data, packed_words_per_sentence.data
        )  # (n_sentences, 2 * word_rnn_size), (n_sentences, max(words_per_sentence))
        sentences = self.dropout(sentences)

        # Apply the sentence-level RNN over the sentence embeddings (PyTorch automatically applies it on the PackedSequence)
        packed_sentences, _ = self.sentence_rnn(
            PackedSequence(data=sentences,
                           batch_sizes=packed_sentences.batch_sizes,
                           sorted_indices=packed_sentences.sorted_indices,
                           unsorted_indices=packed_sentences.unsorted_indices))
        documents, _ = pad_packed_sequence(packed_sentences, batch_first=True)
        # Find attention vectors by applying the attention linear layer on the output of the RNN
        att_s = self.sentence_attention(
            packed_sentences.data)  # (n_sentences, att_size)
        att_s = torch.tanh(att_s)  # (n_sentences, att_size)
        # Take the dot-product of the attention vectors with the context vector (i.e. parameter of linear layer)
        att_s = self.sentence_context_vector(att_s).squeeze(1)  # (n_sentences)

        # Compute softmax over the dot-product manually
        # Manually because they have to be computed only over sentences in the same document

        # First, take the exponent
        max_value = att_s.max(
        )  # scalar, for numerical stability during exponent calculation
        att_s = torch.exp(att_s - max_value)  # (n_sentences)

        # Re-arrange as documents by re-padding with 0s (SENTENCES -> DOCUMENTS)
        att_s, _ = pad_packed_sequence(
            PackedSequence(data=att_s,
                           batch_sizes=packed_sentences.batch_sizes,
                           sorted_indices=packed_sentences.sorted_indices,
                           unsorted_indices=packed_sentences.unsorted_indices),
            batch_first=True)  # (n_documents, max(sentences_per_document))

        # Calculate softmax values as now sentences are arranged in their respective documents
        sentence_alphas = att_s / torch.sum(
            att_s, dim=1,
            keepdim=True)  # (n_documents, max(sentences_per_document))

        # Similarly re-arrange sentence-level RNN outputs as documents by re-padding with 0s (SENTENCES -> DOCUMENTS)
        documents, _ = pad_packed_sequence(
            packed_sentences, batch_first=True
        )  # (n_documents, max(sentences_per_document), 2 * sentence_rnn_size)

        # Find document embeddings
        documents = documents * sentence_alphas.unsqueeze(
            2
        )  # (n_documents, max(sentences_per_document), 2 * sentence_rnn_size)
        return documents
    def forward(
        self,
        x: Union[torch.Tensor, PackedSequence],
        state_init: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        r"""
        Implements the forward pass of the DPLSTM when a sequence is input.

        Dimensions as follows:
            - B: Batch size
            - T: Sequence length
            - D: LSTM input hidden size (eg from a word embedding)
            - H: LSTM output hidden size
            - L: number of layers in the LSTM
            - P: number of directions (2 if bidirectional, else 1)

        Args:
            x: Input sequence to the DPLSTM of shape ``[T, B, D]``. Or it can be a PackedSequence.
            state_init: Initial state of the LSTM as a tuple ``(h_0, c_0)``, where:
                - ``h_0`` of shape ``[L*P, B, H]`` contains the initial hidden state
                - ``c_0`` of shape ``[L*P, B, H]`` contains the initial cell state

                This argument can be (and defaults to) None, in which case zero tensors will be used.

         Returns:
            ``output, (h_n, c_n)`` where, ``output`` is of shape ``[T, B, H * P]`` and is a
            tensor containing the output features (``h_t``) from the last layer of the DPLSTM
            for each timestep ``t``. ``h_n`` is of shape ``[L * P, B, H]`` and contains the
            hidden state for ``t = T``. ``c_n`` is of shape ``[L * P, B, H]`` and contains
            the cell state for ``t = T``.
        """

        if isinstance(x, PackedSequence):
            x, batch_sizes, sorted_indices, unsorted_indices = x
            B = batch_sizes[0].item()
            _, D = x.shape
            x = x.split(tuple(batch_sizes))
            for layer in self.layers:
                layer.set_max_batch_length(B)
        else:
            sorted_indices = None
            unsorted_indices = None
            batch_sizes = None
            x = self._rearrange_batch_dim(x)
            T, B, D = x.shape

        L = self.num_layers
        P = 2 if self.bidirectional else 1
        H = self.hidden_size

        h_0s, c_0s = state_init or (None, None)

        if h_0s is None:
            h_0s = torch.zeros(
                L,
                P,
                B,
                self.hidden_size,
                dtype=x[0].dtype,
                device=x[0].device,
            )
        else:
            h_0s = h_0s.reshape([L, P, B, H])
            h_0s = self._permute_hidden(h_0s, sorted_indices, 2)

        if c_0s is None:
            c_0s = torch.zeros(
                L,
                P,
                B,
                self.hidden_size,
                dtype=x[0].dtype,
                device=x[0].device,
            )
        else:
            c_0s = c_0s.reshape([L, P, B, H])
            c_0s = self._permute_hidden(c_0s, sorted_indices, 2)

        hs: List[torch.Tensor] = []
        cs: List[torch.Tensor] = []

        for layer, h0, c0 in zip(self.layers, h_0s, c_0s):
            if not self.bidirectional:
                h0 = h0.squeeze(0)
                c0 = c0.squeeze(0)
            x, (h, c) = layer(x, (h0, c0), batch_sizes)
            if not self.bidirectional:
                h = h.unsqueeze(0)  # [1, B, H]
                c = c.unsqueeze(0)  # [1, B, H]

            hs.append(h)
            cs.append(c)

        hs = torch.cat(hs, dim=0)  # [L * P, B, H]
        cs = torch.cat(cs, dim=0)  # [L * P, B, H]

        if batch_sizes is not None:
            seq_lengths = _compute_seq_lengths(batch_sizes)
            packed_data = pack_padded_sequence(
                pad_sequence(x, batch_first=False), seq_lengths, batch_first=True
            )[0]
            out = PackedSequence(
                packed_data, batch_sizes, sorted_indices, unsorted_indices
            )
        else:
            out = self._rearrange_batch_dim(x)

        return out, (
            self._permute_hidden(hs, unsorted_indices),
            self._permute_hidden(cs, unsorted_indices),
        )
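The `_compute_seq_lengths` call above recovers per-sequence lengths from `batch_sizes` before re-packing the output. The relationship can be illustrated independently of the library (a sketch of the idea, not Opacus's implementation):

import torch

def seq_lengths_from_batch_sizes(batch_sizes):
    # batch_sizes[t] is the number of sequences still active at timestep t, so the
    # i-th longest sequence has length = number of timesteps with batch_sizes[t] > i.
    return [int((batch_sizes > i).sum()) for i in range(int(batch_sizes[0]))]

print(seq_lengths_from_batch_sizes(torch.tensor([3, 2, 2, 1])))   # [4, 3, 1]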
Example #15
    def forward(self, x, hx=None):
        r"""

        :param x: [batch, seq_len, input_size] input sequence
        :param hx: [batch, hidden_size] initial hidden state; if ``None``, it is initialized with zeros. Default: ``None``
        :return (output, ht): [batch, seq_len, hidden_size*num_direction] output sequence
            and [batch, hidden_size*num_direction] hidden state at the final timestep
        """
        is_lstm = self.is_lstm
        is_packed = isinstance(x, PackedSequence)
        if not is_packed:
            seq_len = x.size(1) if self.batch_first else x.size(0)
            max_batch_size = x.size(0) if self.batch_first else x.size(1)
            seq_lens = torch.LongTensor(
                [seq_len for _ in range(max_batch_size)])
            x = pack_padded_sequence(x, seq_lens, batch_first=self.batch_first)
        else:
            max_batch_size = int(x.batch_sizes[0])
        x, batch_sizes = x.data, x.batch_sizes

        if hx is None:
            hx = x.new_zeros(self.num_layers * self.num_directions,
                             max_batch_size,
                             self.hidden_size,
                             requires_grad=True)
            if is_lstm:
                hx = (hx, hx.new_zeros(hx.size(), requires_grad=True))

        mask_x = x.new_ones((max_batch_size, self.input_size))
        mask_out = x.new_ones(
            (max_batch_size, self.hidden_size * self.num_directions))
        mask_h_ones = x.new_ones((max_batch_size, self.hidden_size))
        nn.functional.dropout(mask_x,
                              p=self.input_dropout,
                              training=self.training,
                              inplace=True)
        nn.functional.dropout(mask_out,
                              p=self.hidden_dropout,
                              training=self.training,
                              inplace=True)

        hidden = x.new_zeros((self.num_layers * self.num_directions,
                              max_batch_size, self.hidden_size))
        if is_lstm:
            cellstate = x.new_zeros((self.num_layers * self.num_directions,
                                     max_batch_size, self.hidden_size))
        for layer in range(self.num_layers):
            output_list = []
            input_seq = PackedSequence(x, batch_sizes)
            mask_h = nn.functional.dropout(mask_h_ones,
                                           p=self.hidden_dropout,
                                           training=self.training,
                                           inplace=False)
            for direction in range(self.num_directions):
                output_x, hidden_x = self._forward_one(
                    layer, direction, input_seq, hx,
                    mask_x if layer == 0 else mask_out, mask_h)
                output_list.append(output_x.data)
                idx = self.num_directions * layer + direction
                if is_lstm:
                    hidden[idx] = hidden_x[0]
                    cellstate[idx] = hidden_x[1]
                else:
                    hidden[idx] = hidden_x
            x = torch.cat(output_list, dim=-1)

        if is_lstm:
            hidden = (hidden, cellstate)

        if is_packed:
            output = PackedSequence(x, batch_sizes)
        else:
            x = PackedSequence(x, batch_sizes)
            output, _ = pad_packed_sequence(x, batch_first=self.batch_first)

        return output, hidden
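The masks built above (mask_x, mask_out, mask_h) implement shared, "variational"-style dropout: one mask is sampled per forward pass and multiplied into every timestep, rather than resampling at each step. A minimal standalone sketch of that idea with toy sizes (not the module's own code):

import torch
import torch.nn.functional as F

batch_size, hidden_size, p = 4, 6, 0.3
mask = F.dropout(torch.ones(batch_size, hidden_size), p=p, training=True)  # entries are 0 or 1/(1-p)
h_t0 = torch.randn(batch_size, hidden_size)
h_t1 = torch.randn(batch_size, hidden_size)
dropped_t0 = h_t0 * mask   # the same hidden units are zeroed
dropped_t1 = h_t1 * mask   # at every timestep of the sequence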
Example #16
    def forward(self, word, word_mask, wordchars, wordchars_mask, upos, pretrained, head, deprel, word_orig_idx, sentlens, wordlens):
        def pack(x):
            return pack_padded_sequence(x, sentlens, batch_first=True)

        inputs = []
        if self.args['pretrain']:
            pretrained_emb = self.pretrained_emb(pretrained)
            pretrained_emb = self.trans_pretrained(pretrained_emb)
            pretrained_emb = pack(pretrained_emb)
            inputs += [pretrained_emb]

        #def pad(x):
        #    return pad_packed_sequence(PackedSequence(x, pretrained_emb.batch_sizes), batch_first=True)[0]

        if self.args['word_emb_dim'] > 0:
            word_emb = self.word_emb(word)
            word_emb = pack(word_emb)
            inputs += [word_emb]

        if self.args['tag_emb_dim'] > 0:
            pos_emb = self.upos_emb(upos)
            pos_emb = pack(pos_emb)
            inputs += [pos_emb]

        if self.args['char'] and self.args['char_emb_dim'] > 0:
            char_reps = self.charmodel(wordchars, wordchars_mask, word_orig_idx, sentlens, wordlens)
            char_reps = PackedSequence(self.trans_char(self.drop(char_reps.data)), char_reps.batch_sizes)
            inputs += [char_reps]

        lstm_inputs = torch.cat([x.data for x in inputs], 1)

        lstm_inputs = self.worddrop(lstm_inputs, self.drop_replacement)
        lstm_inputs = self.drop(lstm_inputs)

        lstm_inputs = PackedSequence(lstm_inputs, inputs[0].batch_sizes)

        lstm_outputs, _ = self.parserlstm(
            lstm_inputs,
            sentlens,
            hx=(self.parserlstm_h_init.expand(
                    2 * self.args['num_layers'], word.size(0),
                    self.args['hidden_dim']).contiguous(),
                self.parserlstm_c_init.expand(
                    2 * self.args['num_layers'], word.size(0),
                    self.args['hidden_dim']).contiguous()))
        lstm_outputs, _ = pad_packed_sequence(lstm_outputs, batch_first=True)

        unlabeled_scores = self.unlabeled(self.drop(lstm_outputs), self.drop(lstm_outputs)).squeeze(3)
        deprel_scores = self.deprel(self.drop(lstm_outputs), self.drop(lstm_outputs))


        if self.args['linearization'] or self.args['distance']:
            head_offset = torch.arange(word.size(1), device=head.device).view(1, 1, -1).expand(word.size(0), -1, -1) \
                - torch.arange(word.size(1), device=head.device).view(1, -1, 1).expand(word.size(0), -1, -1)

        if self.args['linearization']:
            lin_scores = self.linearization(self.drop(lstm_outputs), self.drop(lstm_outputs)).squeeze(3)
            unlabeled_scores += F.logsigmoid(lin_scores * torch.sign(head_offset).float()).detach()

        if self.args['distance']:
            dist_scores = self.distance(self.drop(lstm_outputs), self.drop(lstm_outputs)).squeeze(3)
            dist_pred = 1 + F.softplus(dist_scores)
            dist_target = torch.abs(head_offset)
            dist_kld = -torch.log((dist_target.float() - dist_pred)**2/2 + 1)
            unlabeled_scores += dist_kld.detach()

        diag = torch.eye(head.size(-1)+1, dtype=torch.uint8, device=head.device).unsqueeze(0)
        unlabeled_scores.masked_fill_(diag, -float('inf'))

        preds = []

        if self.training:
            unlabeled_scores = unlabeled_scores[:, 1:, :] # exclude attachment for the root symbol
            unlabeled_scores = unlabeled_scores.masked_fill(word_mask.unsqueeze(1), -float('inf'))
            unlabeled_target = head.masked_fill(word_mask[:, 1:], -1)
            loss = self.crit(unlabeled_scores.contiguous().view(-1, unlabeled_scores.size(2)), unlabeled_target.view(-1))

            deprel_scores = deprel_scores[:, 1:] # exclude attachment for the root symbol
            deprel_scores = torch.gather(deprel_scores, 2, head.unsqueeze(2).unsqueeze(3).expand(-1, -1, -1, len(self.vocab['deprel']))).view(-1, len(self.vocab['deprel']))
            deprel_target = deprel.masked_fill(word_mask[:, 1:], -1)
            loss += self.crit(deprel_scores.contiguous(), deprel_target.view(-1))

            if self.args['linearization']:
                #lin_scores = lin_scores[:, 1:].masked_select(goldmask)
                lin_scores = torch.gather(lin_scores[:, 1:], 2, head.unsqueeze(2)).view(-1)
                lin_scores = torch.cat([-lin_scores.unsqueeze(1)/2, lin_scores.unsqueeze(1)/2], 1)
                #lin_target = (head_offset[:, 1:] > 0).long().masked_select(goldmask)
                lin_target = torch.gather((head_offset[:, 1:] > 0).long(), 2, head.unsqueeze(2))
                loss += self.crit(lin_scores.contiguous(), lin_target.view(-1))

            if self.args['distance']:
                #dist_kld = dist_kld[:, 1:].masked_select(goldmask)
                dist_kld = torch.gather(dist_kld[:, 1:], 2, head.unsqueeze(2))
                loss -= dist_kld.sum()

            loss /= wordchars.size(0) # number of words
        else:
            loss = 0
            preds.append(F.log_softmax(unlabeled_scores, 2).detach().cpu().numpy())
            preds.append(deprel_scores.max(3)[1].detach().cpu().numpy())

        return loss, preds
Example #17
    def iterate(self, src, target, training=True):
        # limit number of tokens to avoid gpu overload
        if self.limit_num_tokens is not None:
            src, target = self._batch_limit_tokens(src, target)
        src, src_length = src
        target, target_length = target
        batch_dim, time_dim = (0, 1) if self.batch_first else (1, 0)
        num_words = sum(target_length) - target.size(batch_dim)

        # Allow packed source sequences - for cudnn rnns
        if isinstance(src, PackedSequence):
            src_pack = src
            src = src.data
        else:
            src_pack = None

        if self.cuda and not isinstance(self.model_with_loss, DataParallel):
            src = src.cuda()
            target = target.cuda()

        src_var = Variable(src, volatile=not training)
        target_var = Variable(target, volatile=not training)

        if src_pack is not None:
            src_var = PackedSequence(src_var, src_pack[1])

        if self.batch_first:
            inputs = (src_var, target_var[:, :-1])
            target_labels = target_var[:, 1:].contiguous()
        else:
            inputs = (src_var, target_var[:-1])
            target_labels = target_var[1:]

        # compute output
        loss = self.model_with_loss(inputs, target_labels).sum()
        loss /= num_words

        if training:
            # compute gradient and do SGD step
            self.optimizer.zero_grad()
            loss.backward()
            if self.grad_clip is not None:
                if isinstance(self.grad_clip, dict):
                    clip_encoder = self.grad_clip.get('encoder', 0)
                    clip_decoder = self.grad_clip.get('decoder', 0)
                    if clip_encoder > 0:
                        clip_grad_norm(self.model.encoder.parameters(),
                                       clip_encoder)
                    if clip_decoder > 0:
                        clip_grad_norm(self.model.decoder.parameters(),
                                       clip_decoder)
                elif self.grad_clip > 0:  # grad_clip is a number
                    clip_grad_norm(self.model.parameters(), self.grad_clip)
            if self.embedding_grad_clip is not None and self.embedding_grad_clip > 0:
                if hasattr(self.model.encoder, 'embedder'):
                    clip_grad_norm(self.model.encoder.embedder.parameters(),
                                   self.embedding_grad_clip)
                if hasattr(self.model.decoder, 'embedder'):
                    clip_grad_norm(self.model.decoder.embedder.parameters(),
                                   self.embedding_grad_clip)
            self.optimizer.step()
        return loss.data[0], num_words
    def forward(self, docs, doc_lengths, sent_lengths):
        """
        :param docs: encoded document-level data; LongTensor (num_docs, padded_doc_length, padded_sent_length)
        :param doc_lengths: unpadded document lengths; LongTensor (num_docs)
        :param sent_lengths: unpadded sentence lengths; LongTensor (num_docs, padded_doc_length)
        :return: document embeddings, attention weights of words, attention weights of sentences
        """
        # Sort documents by decreasing order in length
        doc_lengths, doc_perm_idx = doc_lengths.sort(dim=0, descending=True)
        docs = docs[doc_perm_idx]
        sent_lengths = sent_lengths[doc_perm_idx]

        # Make a long batch of sentences by removing pad-sentences
        # i.e. `docs` was of size (num_docs, padded_doc_length, padded_sent_length)
        # -> `packed_sents.data` is now of size (num_sents, padded_sent_length)
        packed_sents = pack_padded_sequence(docs,
                                            lengths=doc_lengths.tolist(),
                                            batch_first=True)

        # effective batch size at each timestep
        valid_bsz = packed_sents.batch_sizes

        # Make a long batch of sentence lengths by removing pad-sentences
        # i.e. `sent_lengths` was of size (num_docs, padded_doc_length)
        # -> `packed_sent_lengths.data` is now of size (num_sents)
        packed_sent_lengths = pack_padded_sequence(
            sent_lengths, lengths=doc_lengths.tolist(), batch_first=True)

        # Word attention module
        sents, word_att_weights = self.word_attention(packed_sents.data,
                                                      packed_sent_lengths.data)

        # NOTE MODIFICATION (FEATURES)
        sents = self.dropout(sents)

        # Sentence-level GRU over sentence embeddings
        packed_sents, _ = self.gru(PackedSequence(sents, valid_bsz))

        # NOTE MODIFICATION (FEATURES)
        if self.use_layer_norm:
            normed_sents = self.layer_norm(packed_sents.data)
        else:
            normed_sents = packed_sents

        # Sentence attention
        att = torch.tanh(self.sent_attention(normed_sents))
        att = self.sentence_context_vector(att).squeeze(1)

        # NOTE MODIFICATION (BUG)
        val = att.max()
        att = torch.exp(att - val)

        # Restore as documents by repadding
        att, _ = pad_packed_sequence(PackedSequence(att, valid_bsz),
                                     batch_first=True)

        # Note MODIFICATION (BUG)
        sent_att_weights = att / torch.sum(att, dim=1, keepdim=True)

        # Restore as documents by repadding
        docs, _ = pad_packed_sequence(packed_sents, batch_first=True)

        # Compute document vectors
        docs = docs * sent_att_weights.unsqueeze(2)
        docs = docs.sum(dim=1)

        # Restore as documents by repadding
        word_att_weights, _ = pad_packed_sequence(PackedSequence(
            word_att_weights, valid_bsz),
                                                  batch_first=True)

        # Restore the original order of documents (undo the first sorting)
        _, doc_unperm_idx = doc_perm_idx.sort(dim=0, descending=False)
        docs = docs[doc_unperm_idx]

        # NOTE MODIFICATION (BUG)
        word_att_weights = word_att_weights[doc_unperm_idx]
        sent_att_weights = sent_att_weights[doc_unperm_idx]

        return docs, word_att_weights, sent_att_weights
Example #19
    def forward(self, input, hx=None):
        is_lstm = self.is_lstm
        is_packed = isinstance(input, PackedSequence)
        if not is_packed:
            seq_len = input.size(1) if self.batch_first else input.size(0)
            max_batch_size = input.size(0) if self.batch_first else input.size(
                1)
            seq_lens = torch.LongTensor(
                [seq_len for _ in range(max_batch_size)])
            input = pack_padded_sequence(input,
                                         seq_lens,
                                         batch_first=self.batch_first)
        else:
            max_batch_size = int(input.batch_sizes[0])
        input, batch_sizes = input.data, input.batch_sizes

        if hx is None:
            hx = input.new_zeros(self.num_layers * self.num_directions,
                                 max_batch_size,
                                 self.hidden_size,
                                 requires_grad=True)
            if is_lstm:
                hx = (hx, hx.new_zeros(hx.size(), requires_grad=True))

        mask_x = input.new_ones((max_batch_size, self.input_size))
        mask_out = input.new_ones(
            (max_batch_size, self.hidden_size * self.num_directions))
        mask_h_ones = input.new_ones((max_batch_size, self.hidden_size))
        nn.functional.dropout(mask_x,
                              p=self.input_dropout,
                              training=self.training,
                              inplace=True)
        nn.functional.dropout(mask_out,
                              p=self.hidden_dropout,
                              training=self.training,
                              inplace=True)

        hidden = input.new_zeros((self.num_layers * self.num_directions,
                                  max_batch_size, self.hidden_size))
        if is_lstm:
            cellstate = input.new_zeros((self.num_layers * self.num_directions,
                                         max_batch_size, self.hidden_size))
        for layer in range(self.num_layers):
            output_list = []
            input_seq = PackedSequence(input, batch_sizes)
            mask_h = nn.functional.dropout(mask_h_ones,
                                           p=self.hidden_dropout,
                                           training=self.training,
                                           inplace=False)
            for direction in range(self.num_directions):
                output_x, hidden_x = self._forward_one(
                    layer, direction, input_seq, hx,
                    mask_x if layer == 0 else mask_out, mask_h)
                output_list.append(output_x.data)
                idx = self.num_directions * layer + direction
                if is_lstm:
                    hidden[idx] = hidden_x[0]
                    cellstate[idx] = hidden_x[1]
                else:
                    hidden[idx] = hidden_x
            input = torch.cat(output_list, dim=-1)

        if is_lstm:
            hidden = (hidden, cellstate)

        if is_packed:
            output = PackedSequence(input, batch_sizes)
        else:
            input = PackedSequence(input, batch_sizes)
            output, _ = pad_packed_sequence(input,
                                            batch_first=self.batch_first)

        return output, hidden
Example #20
    def forward(self, word, word_mask, wordchars, wordchars_mask, pos, feats,
                pretrained, word_orig_idx, sentlens, wordlens):
        def pack(x):
            return pack_padded_sequence(x, sentlens, batch_first=True)

        def get_batch_sizes(sentlens):
            b = []
            for i in range(max(sentlens)):
                c = len([x for x in sentlens if x > i])
                b.append(c)
            return torch.tensor(b)

        def pad(x):
            return pad_packed_sequence(PackedSequence(x, batch_sizes),
                                       batch_first=True)[0]

        inputs = []
        if self.use_word:
            word_emb = self.word_emb(word)
            word_emb = pack(word_emb)
            inputs += [word_emb]
            batch_sizes = word_emb.batch_sizes
        else:
            batch_sizes = get_batch_sizes(sentlens)

        if self.use_pretrained:
            pretrained_emb = self.pretrained_emb(pretrained)
            pretrained_emb = self.trans_pretrained(pretrained_emb)
            pretrained_emb = pack(pretrained_emb)
            inputs += [pretrained_emb]

        if self.use_char:
            char_reps = self.charmodel(wordchars, wordchars_mask,
                                       word_orig_idx, sentlens, wordlens)
            char_reps = PackedSequence(
                self.trans_char(self.drop(char_reps.data)),
                char_reps.batch_sizes)
            inputs += [char_reps]

        lstm_inputs = torch.cat([x.data for x in inputs], 1)
        lstm_inputs = self.worddrop(lstm_inputs, self.drop_replacement)
        lstm_inputs = self.drop(lstm_inputs)
        lstm_inputs = PackedSequence(lstm_inputs, inputs[0].batch_sizes)

        lstm_outputs, _ = self.taggerlstm(
            lstm_inputs,
            sentlens,
            hx=(self.taggerlstm_h_init.expand(
                2 * self.args['tag_num_layers'], word.size(0),
                self.args['tag_hidden_dim']).contiguous(),
                self.taggerlstm_c_init.expand(
                    2 * self.args['tag_num_layers'], word.size(0),
                    self.args['tag_hidden_dim']).contiguous()))
        lstm_outputs = lstm_outputs.data

        pos_hid = F.relu(self.pos_hid(self.drop(lstm_outputs)))
        pos_pred = self.pos_clf(self.drop(pos_hid))

        preds = [pad(pos_pred).max(2)[1]]

        pos = pack(pos).data
        loss = self.crit(pos_pred.view(-1, pos_pred.size(-1)), pos.view(-1))

        if self.share_hid:
            feats_hid = pos_hid
            clffunc = lambda clf, hid: clf(self.drop(hid))
        else:
            feats_hid = F.relu(self.feats_hid(self.drop(lstm_outputs)))
            # TODO: self.training is never set, but check if this is a bug
            #if self.training: pos_emb = self.pos_emb(pos) else:
            pos_emb = self.pos_emb(pos_pred.max(1)[1])
            clffunc = lambda clf, hid: clf(self.drop(hid), self.drop(pos_emb))

        feats_preds = []
        feats = pack(feats).data
        for i in range(len(self.vocab['feats'])):
            feats_pred = clffunc(self.feats_clf[i], feats_hid)
            loss += self.crit(feats_pred.view(-1, feats_pred.size(-1)),
                              feats[:, i].view(-1))
            feats_preds.append(pad(feats_pred).max(2, keepdim=True)[1])
        preds.append(torch.cat(feats_preds, 2))

        return loss, preds
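The nested get_batch_sizes helper above reproduces what pack_padded_sequence computes internally: batch_sizes[t] is the number of sentences longer than t. A quick standalone check (toy lengths, sorted longest-first as pack_padded_sequence expects without enforce_sorted=False):

import torch
from torch.nn.utils.rnn import pack_padded_sequence

sentlens = [5, 3, 3, 1]
x = torch.zeros(len(sentlens), max(sentlens), 2)
packed = pack_padded_sequence(x, sentlens, batch_first=True)
manual = torch.tensor([sum(1 for l in sentlens if l > i) for i in range(max(sentlens))])
assert torch.equal(packed.batch_sizes, manual)   # tensor([4, 3, 3, 1, 1])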
Example #21
    def _add_embeddings_internal(self, sentences: Union[List[Sentence],
                                                        Sentence]):
        """Add embeddings to all sentences in the given list of sentences. If embeddings are already added, update
         only if embeddings are non-static."""

        if type(sentences) is Sentence:
            sentences = [sentences]

        self.rnn.zero_grad()

        # embed words in the sentence
        self.embeddings.embed(sentences)

        lengths: List[int] = [len(sentence.tokens) for sentence in sentences]
        longest_token_sequence_in_batch: int = max(lengths)

        pre_allocated_zero_tensor = torch.zeros(
            self.embeddings.embedding_length * longest_token_sequence_in_batch,
            dtype=torch.float,
            device=flair.device,
        )

        all_embs: List[torch.Tensor] = list()
        for sentence in sentences:
            all_embs += [
                emb for token in sentence
                for emb in token.get_each_embedding()
            ]
            nb_padding_tokens = longest_token_sequence_in_batch - len(sentence)

            if nb_padding_tokens > 0:
                t = pre_allocated_zero_tensor[:self.embeddings.
                                              embedding_length *
                                              nb_padding_tokens]
                all_embs.append(t)

        sentence_tensor = torch.cat(all_embs).view([
            len(sentences),
            longest_token_sequence_in_batch,
            self.embeddings.embedding_length,
        ])

        # before-RNN dropout
        if self.dropout:
            sentence_tensor = self.dropout(sentence_tensor)
        if self.locked_dropout:
            sentence_tensor = self.locked_dropout(sentence_tensor)
        if self.word_dropout:
            sentence_tensor = self.word_dropout(sentence_tensor)

        # reproject if set
        if self.reproject_words:
            sentence_tensor = self.word_reprojection_map(sentence_tensor)

        # push through RNN
        packed = pack_padded_sequence(sentence_tensor,
                                      lengths,
                                      enforce_sorted=False,
                                      batch_first=True)
        rnn_out, hidden = self.rnn(packed)

        # Attention mechanism is inspired by word attention network in:
        # https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Text-Classification/blob/ec11e234bbbae2adcd7d665489999410911a9fb4/model.py#L173

        # Feed word annotation through one layer MLP to get hidden representation
        hidden_rep = self.word_attention(rnn_out.data)
        hidden_rep = torch.tanh(hidden_rep)

        # Measure importance of word as similarity of hidden representation with word level context vector
        # To get normalized attention weights perform softmax function in steps
        # 1. Take the dot-product of the attention vectors with the context vector (i.e. parameter of linear layer)
        att_weights = self.word_context_vector(hidden_rep).squeeze(
            1)  # (n_words)
        # 2. Take the exponent
        max_value = att_weights.max(
        )  # scalar, for numerical stability during exponent calculation
        att_weights = torch.exp(att_weights - max_value)  # (n_words)
        # Re-arrange attention weights as sentences
        packed_att_w = PackedSequence(
            data=att_weights,
            batch_sizes=rnn_out.batch_sizes,
            sorted_indices=rnn_out.sorted_indices,
            unsorted_indices=rnn_out.unsorted_indices)
        att_weights, output_lengths = pad_packed_sequence(
            packed_att_w,
            batch_first=True)  # (n_sentences, max(words_per_sentence))
        # 3. Calculate softmax values: could have called F.softmax here instead of doing exp before re-arrangement?
        att_weights = att_weights / torch.sum(
            att_weights, dim=1,
            keepdim=True)  # (n_sentences, max(words_per_sentence))

        # Re-arrange word-level RNN outputs as sentences by re-padding with 0s (WORDS -> SENTENCES)
        outputs, _ = pad_packed_sequence(
            rnn_out, batch_first=True
        )  # (n_sentences, max(words_per_sentence), 2 * word_rnn_size)

        # Compute sentence embeddings as weighted sum of word annotations based on the attention weights
        outputs = outputs * att_weights.unsqueeze(
            2)  # (n_sentences, max(words_per_sentence), 2 * word_rnn_size)
        outputs = outputs.sum(dim=1)  # (n_sentences, 2 * word_rnn_size)

        # after-RNN dropout
        if self.dropout:
            outputs = self.dropout(outputs)
        if self.locked_dropout:
            outputs = self.locked_dropout(outputs)

        # extract sentence embeddings
        for sentence_no, length in enumerate(lengths):
            embedding = outputs[sentence_no]

            if self.static_embeddings:
                embedding = embedding.detach()

            sentence = sentences[sentence_no]
            sentence.set_embedding(self.name, embedding)
Example #22
 def pad(x):
     return pad_packed_sequence(PackedSequence(x, batch_sizes),
                                batch_first=True)[0]
Example #23
 def pad(x):  # inverse operation to pack_padded_sequence(). Pads a packed batch of variable length sequences.
     return pad_packed_sequence(PackedSequence(x, word_emb.batch_sizes), batch_first=True)[0]
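Both pad helpers rely on the same fact: any flat per-token tensor aligned with a PackedSequence's data can be re-padded using that sequence's batch_sizes. A self-contained sketch with toy tensors (hypothetical shapes, not taken from the snippets above):

import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, PackedSequence

emb = torch.randn(2, 3, 5)                       # (batch, max_len, dim)
packed = pack_padded_sequence(emb, [3, 2], batch_first=True)
scores = packed.data.sum(dim=1)                  # any per-token quantity, shape (n_tokens,)
padded_scores, _ = pad_packed_sequence(
    PackedSequence(scores, packed.batch_sizes), batch_first=True)
print(padded_scores.shape)                       # torch.Size([2, 3])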
Example #24
    def forward(self, src_tokens, src_lengths):
        if LanguagePairDataset.LEFT_PAD_SOURCE:
            # convert left-padding to right-padding
            src_tokens.data = utils.convert_padding_direction(
                src_tokens.data,
                src_lengths.data,
                self.padding_idx,
                left_to_right=True,
            )
        if self.word_dropout_module is not None:
            src_tokens.data = self.word_dropout_module(src_tokens.data)
        bsz, seqlen = src_tokens.size()

        # embed tokens
        x = self.embed_tokens(src_tokens)
        x = F.dropout(x, p=self.dropout_in, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # Generate packed seq to deal with varying source seq length
        packed_input, batch_sizes = pack_padded_sequence(
            x,
            src_lengths,
        )
        final_hiddens, final_cells = [], []
        next_hiddens = []
        for i, rnn_layer in enumerate(self.layers):
            current_hidden_size = self.hidden_dim // 2 if \
                rnn_layer.is_bidirectional else self.hidden_dim
            if self.cell_type in ['lstm', 'milstm', 'layer_norm_lstm']:
                prev_hidden = (
                    x.data.new(bsz, current_hidden_size).zero_(),
                    x.data.new(bsz, current_hidden_size).zero_(),
                )
            else:
                raise Exception(f'{self.cell_type} not implemented')

            hidden, current_output = rnn_layer.forward(
                packed_input,
                prev_hidden,
                batch_sizes,
            )
            next_hiddens.append(hidden)
            prev_hidden = next_hiddens[-1]

            if self.dropout_out != 0:
                current_output = F.dropout(
                    current_output,
                    p=self.dropout_out,
                    training=self.training,
                )

            if self.residual_level is not None and i >= self.residual_level:
                packed_input = packed_input.clone() + current_output
            else:
                packed_input = current_output

        final_hiddens, final_cells = zip(*next_hiddens)
        # Reshape to [num_layer, batch_size, hidden_dim]
        final_hiddens = torch.cat(
            final_hiddens,
            dim=0,
        ).view(self.num_layers, *final_hiddens[0].size())
        final_cells = torch.cat(
            final_cells,
            dim=0,
        ).view(self.num_layers, *final_cells[0].size())

        #  [max_seqlen, batch_size, hidden_dim]
        padding_value = -np.inf if self.add_encoder_output_as_decoder_input else 0
        unpacked_output, _ = pad_packed_sequence(
            PackedSequence(packed_input, batch_sizes),
            padding_value=padding_value,
        )

        return (
            unpacked_output,
            final_hiddens,
            final_cells,
            src_lengths,
            src_tokens,
        )
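The encoder above re-pads with padding_value=-inf so that padded timesteps can never win a later max or attention step. A minimal illustration of pad_packed_sequence's padding_value argument (toy time-major tensors, not the encoder's data):

import numpy as np
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

x = torch.ones(3, 2, 1)                    # (T=3, B=2, C=1), time-major
packed = pack_padded_sequence(x, [3, 1])   # the second sequence has length 1
y, _ = pad_packed_sequence(packed, padding_value=-np.inf)
print(y[:, 1, 0])                          # tensor([1., -inf, -inf])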
Example #25
    def forward(self, documents, sentences_per_document, words_per_sentence):

        # pack sequences (remove word-pads, DOCUMENTS -> SENTENCES)
        packed_sentences = pack_padded_sequence(
            documents,
            lengths=sentences_per_document.tolist(),
            batch_first=True,
            enforce_sorted=False
        )  # a PackedSequence object, where 'data' is the flattened sentences (n_sentences, word_pad_len)

        # re-arrange sentence lengths in the same way (DOCUMENTS -> SENTENCES)
        packed_words_per_sentence = pack_padded_sequence(
            words_per_sentence,
            lengths=sentences_per_document.tolist(),
            batch_first=True,
            enforce_sorted=False
        )  # a PackedSequence object, where 'data' is the flattened sentence lengths (n_sentences)

        # word encoder, get sentence vectors
        sentences, word_alphas = self.word_encoder(
            packed_sentences.data, packed_words_per_sentence.data
        )  # (n_sentences, 2 * word_rnn_size), (n_sentences, max(words_per_sentence))
        sentences = self.dropout(sentences)

        # run through sentence-level RNN (PyTorch automatically applies it on the PackedSequence)
        packed_sentences, _ = self.sentence_rnn(
            PackedSequence(data=sentences,
                           batch_sizes=packed_sentences.batch_sizes,
                           sorted_indices=packed_sentences.sorted_indices,
                           unsorted_indices=packed_sentences.unsorted_indices)
        )  # a PackedSequence object, where 'data' is the output of the RNN (n_sentences, 2 * sentence_rnn_size)

        # unpack sequences (re-pad with 0s, SENTENCES -> DOCUMENTS)
        # we do unpacking here because attention weights have to be computed only over sentences in the same document
        documents, _ = pad_packed_sequence(
            packed_sentences, batch_first=True
        )  # (n_documents, max(sentences_per_document), 2 * sentence_rnn_size)

        # sentence-level attention
        # eq.8: u_i = tanh(W_s h_i + b_s)
        u_i = self.W_s(
            documents)  # (n_documents, max(sentences_per_document), att_size)
        u_i = self.tanh(
            u_i)  # (n_documents, max(sentences_per_document), att_size)

        # eq.9: alpha_i = softmax(u_i u_s)
        sent_alphas = self.u_s(u_i).squeeze(
            2)  # (n_documents, max(sentences_per_document))
        sent_alphas = self.softmax(
            sent_alphas)  # (n_documents, max(sentences_per_document))

        # form document vectors
        # eq.10: v = \sum_i α_i h_i
        documents = documents * sent_alphas.unsqueeze(
            2
        )  # (n_documents, max(sentences_per_document), 2 * sentence_rnn_size)
        documents = documents.sum(
            dim=1)  # (n_documents, 2 * sentence_rnn_size)

        # also re-arrange word_alphas (SENTENCES -> DOCUMENTS)
        word_alphas, _ = pad_packed_sequence(
            PackedSequence(data=word_alphas,
                           batch_sizes=packed_sentences.batch_sizes,
                           sorted_indices=packed_sentences.sorted_indices,
                           unsorted_indices=packed_sentences.unsorted_indices),
            batch_first=True
        )  # (n_documents, max(sentences_per_document), max(words_per_sentence))

        return documents, word_alphas, sent_alphas
Example #26
 def to_cuda(batch_data):
     sentences, gazetteers, batch_tags = batch_data
     return (PackedSequence(sentences.data.cuda(), sentences.batch_sizes),
             PackedSequence(gazetteers.data.cuda(), gazetteers.batch_sizes),
             PackedSequence(batch_tags.data.cuda(), batch_tags.batch_sizes))
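A side note on the to_cuda helper above: recent PyTorch releases also expose .to() / .cuda() directly on PackedSequence, which moves the flat data (and index tensors) while leaving batch_sizes on the CPU as the RNN kernels require. A short sketch, assuming a CUDA device may or may not be present:

import torch
from torch.nn.utils.rnn import pack_padded_sequence

packed = pack_padded_sequence(torch.randn(2, 3, 4), [3, 2], batch_first=True)
if torch.cuda.is_available():
    packed = packed.to('cuda')   # packed.data moves; packed.batch_sizes stays on CPU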
def val_emotion(encoder, decoder, vocab, criterion, data_loaders, tags):
    decoder.eval()
    encoder.eval()

    batch_time = AverageMeter()
    losses = [AverageMeter() for _ in range(len(tags))]
    top5accs = [AverageMeter() for _ in range(len(tags))]
    bleu4s = []
    start = time.time()

    for j in range(len(tags)):

        # references (true captions) for calculating BLEU-4 score
        references = list()
        # hypotheses (predictions)
        hypotheses = list()
        for i, (images, captions, lengths,
                all_captions) in enumerate(data_loaders[j]):
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            lengths = [l - 1 for l in lengths]
            packed_targets = pack_padded_sequence(input=captions[:, 1:],
                                                  lengths=lengths,
                                                  batch_first=True)
            targets = packed_targets.data
            # Forward, backward and optimize
            with torch.no_grad():
                features = encoder(images)
                outputs, alphas = decoder(captions[:, :-1],
                                          lengths,
                                          features,
                                          teacher_forcing_ratio=0)
            loss = criterion(outputs, targets)
            alpha_c = 1.
            loss += alpha_c * ((1. - alphas.sum(dim=1))**2).mean()

            # Keep track of metrics
            losses[j].update(loss.item(), sum(lengths))
            top5 = accuracy(outputs, targets, 5)
            top5accs[j].update(top5, sum(lengths))
            batch_time.update(time.time() - start)

            # unpacked outputs
            scores = outputs.clone()
            scores = PackedSequence(scores, packed_targets.batch_sizes)
            scores = pad_packed_sequence(scores, batch_first=True)

            # use distinct names so the batch timer's `start` (set above) is not clobbered
            start_tok = vocab.word2idx['<start>']
            end_tok = vocab.word2idx['<end>']
            all_caps = deepcopy(all_captions)
            for caps in all_caps:
                caps = [c.long().tolist() for c in caps]
                caps = [[w for w in c if w != start_tok and w != end_tok]
                        for c in caps]
                references.append(caps)

            preds = list()
            for s, l in zip(scores[0], scores[1]):
                _, pred = torch.max(s, dim=1)
                pred = pred.tolist()[:l]
                pred = [w for w in pred if w != start_tok and w != end_tok]
                preds.append(pred)
            hypotheses.extend(preds)

            assert len(references) == len(hypotheses)

            # free
            del images
            del captions
            del lengths
            del all_captions
            del packed_targets
            del outputs
            del alphas

        torch.cuda.empty_cache()

        # Calculate BLEU-4 scores
        bleu4 = corpus_bleu(references, hypotheses)
        bleu4s.append(bleu4)

        feature = features[0].unsqueeze(0)

        start_tok = vocab.word2idx['<start>']
        end_tok = vocab.word2idx['<end>']
        sampled_ids = decoder.sample(feature, start_token=start_tok, end_token=end_tok)
        sampled_ids = sampled_ids[0].cpu().numpy()

        # Convert word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break

        print(sampled_caption)

    top5accs = [top5acc.avg for top5acc in top5accs]
    losses = [loss.avg for loss in losses]
    return batch_time.val, top5accs, losses, bleu4s
Example #28
    def forward(self,W):
        X = PackedSequence(self.embedding_dropout(self.char_embedding(W.data)),W.batch_sizes)
        H,h = self.gru(X)
        Y = PackedSequence(self.out(H.data),W.batch_sizes)

        return Y
Example #29
def pack_wrapper(module, att_feats, att_masks):
    if att_masks is not None:
        packed, inv_ix = sort_pack_padded_sequence(att_feats, att_masks.data.long().sum(1))
        return pad_unsort_packed_sequence(PackedSequence(module(packed[0]), packed[1]), inv_ix)
    else:
        return module(att_feats)
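pack_wrapper applies a module to packed[0] (the flat data) so that padded positions are never computed; sort_pack_padded_sequence and pad_unsort_packed_sequence are helpers from the surrounding codebase. The same idea using only torch's own utilities might look like this (toy tensors and a hypothetical projection layer):

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, PackedSequence

feats = torch.randn(2, 4, 8)
packed = pack_padded_sequence(feats, torch.tensor([4, 2]), batch_first=True,
                              enforce_sorted=False)
proj = nn.Linear(8, 3)                           # any per-timestep module
projected = PackedSequence(proj(packed.data), packed.batch_sizes,
                           packed.sorted_indices, packed.unsorted_indices)
out, _ = pad_packed_sequence(projected, batch_first=True)   # (2, 4, 3); pads stay zero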
Example #30
def extract(strings,modelPackage,batch_size=50,return_char_scores=False,dropout_samples=0):
    resultsDF = pd.DataFrame(list(strings),columns=['string'])

    uniqueDF = resultsDF.drop_duplicates()
    uniqueDF['chars'] = uniqueDF['string'].apply(stringToAscii)
    uniqueDF['len'] = [len(c) for c in uniqueDF['chars']]
    uniqueDF = uniqueDF[uniqueDF['len']>0]

    batchResults = []
    for batchDF in dfChunks(uniqueDF,batch_size):
        batchDF = batchDF.sort_values('len',ascending=False)

        with torch.no_grad():
            packedChars,_ = bytesToPacked1Hot(list(batchDF['chars']),clamp_range=(31,126),presorted=True)

            if next(modelPackage['model'].parameters()).is_cuda:
                packedChars = packedToCuda(packedChars)

            # Compute point estimates (no dropout)
            modelPackage['model'].eval()

            packedOutput = modelPackage['model'](packedChars)

            packedProbs = PackedSequence(F.sigmoid(packedOutput.data),packedOutput.batch_sizes)
            paddedProbs,lengths = torch.nn.utils.rnn.pad_packed_sequence(packedProbs)

            packedEntropies = PackedSequence(F.binary_cross_entropy_with_logits(packedOutput.data,packedProbs.data,reduce=False),packedOutput.batch_sizes)
            paddedEntropies,lengths = torch.nn.utils.rnn.pad_packed_sequence(packedEntropies)

            batchDF['probs'] = [x[:l].ravel() for x,l in zip(paddedProbs.t().cpu().numpy(),lengths)]
            # batchDF['entropies'] = [x[:l].cpu().numpy() for x,l in zip(paddedEntropies.t(),lengths)]
            batchDF['entropy'] = [x[:l].sum() for x,l in zip(paddedEntropies.t().cpu().numpy(),lengths)]

            batchDF['matches'] = [tuple(matchesFromProbs(c,p)) for i,c,p in batchDF[['chars','probs']].itertuples()]

            # Estimate uncertainty using dropout samples (if dropout samples > 0)
            if dropout_samples:

                samples = [paddedProbs] #Use point estimates as first sample
                for i in range(dropout_samples):
                    modelPackage['model'].train()

                    packedOutput = modelPackage['model'](packedChars)

                    packedProbs = PackedSequence(F.sigmoid(packedOutput.data),packedOutput.batch_sizes)
                    paddedProbs,lengths = torch.nn.utils.rnn.pad_packed_sequence(packedProbs)

                    samples.append(paddedProbs)

                stds = torch.cat(samples,dim=2).std(dim=2)

                if return_char_scores:
                    batchDF['dropout_sds'] = [x[:l].ravel() for x,l in zip(stds.t().cpu().numpy(),lengths)]

                batchDF['dropout_sd'] = stds.sum(dim=0).cpu().numpy()

        if not return_char_scores:
            batchDF = batchDF.drop(['probs'],axis=1)

        batchResults.append(batchDF)

    allBatchesDF = pd.concat(batchResults)
    resultsDF = pd.merge_ordered(resultsDF,allBatchesDF,on='string')

    return resultsDF
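
# Illustrative usage only: `modelPackage` is assumed to be a dict holding a trained
# model under the 'model' key, as extract() expects; the input strings are made up.
example_strings = ['ACME HOLDINGS LLC', 'Jane Q. Public', 'ACME HOLDINGS LLC']
resultsDF = extract(example_strings, modelPackage, batch_size=32, dropout_samples=10)
print(resultsDF[['string', 'entropy', 'dropout_sd']])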
Exemplo n.º 31
0
    def iterate(self, src_tuple, target_tuple, training=True):
        # limit the number of tokens to avoid GPU overload
        if self.limit_num_tokens is not None:
            src_tuple, target_tuple = self._batch_limit_tokens(
                src_tuple, target_tuple)
        src, src_length = src_tuple
        target, target_length = target_tuple
        batch_dim, time_dim = (0, 1) if self.batch_first else (1, 0)
        num_words = sum(target_length) - target.size(batch_dim)

        if isinstance(src, PackedSequence) or \
                not isinstance(self.model_with_loss, DataParallel):
            if isinstance(src, PackedSequence):
                # Only the data tensor is moved; batch_sizes must remain a CPU tensor.
                src = PackedSequence(src.data.to(self.device), src.batch_sizes)
            else:
                src = src.to(self.device)
            target = target.to(self.device)

        if self.batch_first:
            inputs = (src, target[:, :-1])
            target_labels = target[:, 1:].contiguous()
        else:
            inputs = (src, target[:-1])
            target_labels = target[1:]

        # compute output
        loss, accuracy = self.model_with_loss(inputs, target_labels)

        loss = loss.sum()
        loss_measure = float(loss / num_words)
        if self.avg_loss_time:
            loss /= num_words
        else:
            loss /= target.size(batch_dim)
        accuracy = float(accuracy.sum().float() / num_words)

        if training:
            # compute gradient and do SGD step
            self.optimizer.zero_grad()
            loss.backward()
            if self.grad_clip is not None:
                if isinstance(self.grad_clip, dict):
                    clip_encoder = self.grad_clip.get('encoder', 0)
                    clip_decoder = self.grad_clip.get('decoder', 0)
                    if clip_encoder > 0:
                        clip_grad_norm_(
                            self.model.encoder.parameters(), clip_encoder)
                    if clip_decoder > 0:
                        clip_grad_norm_(
                            self.model.decoder.parameters(), clip_decoder)
                elif self.grad_clip > 0:  # grad_clip is a number
                    clip_grad_norm_(self.model.parameters(), self.grad_clip)
            if self.embedding_grad_clip is not None and self.embedding_grad_clip > 0:
                if hasattr(self.model.encoder, 'embedder'):
                    clip_grad_norm_(self.model.encoder.embedder.parameters(),
                                    self.embedding_grad_clip)
                if hasattr(self.model.decoder, 'embedder'):
                    clip_grad_norm_(self.model.decoder.embedder.parameters(),
                                    self.embedding_grad_clip)
            self.optimizer.step()
        return loss_measure, accuracy, num_words
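
# Illustrative only: the two forms of `grad_clip` consumed by the clipping branch above.
# Values are max gradient norms; in the dict form, 0 disables clipping for that sub-module.
grad_clip = 5.0                                    # one global norm for all parameters
grad_clip = {'encoder': 5.0, 'decoder': 1.0}       # separate encoder/decoder norms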
Exemplo n.º 32
0
    def forward(self, sequence, hx=None):
        r"""
        Args:
            sequence (~torch.nn.utils.rnn.PackedSequence):
                A packed variable length sequence.
            hx (~torch.Tensor, ~torch.Tensor):
                A tuple composed of two tensors `h` and `c`.
                `h` of shape ``[num_layers*num_directions, batch_size, hidden_size]`` holds the initial hidden state
                for each element in the batch.
                `c` of shape ``[num_layers*num_directions, batch_size, hidden_size]`` holds the initial cell state
                for each element in the batch.
                If `hx` is not provided, both `h` and `c` default to zero.
                Default: ``None``.

        Returns:
            ~torch.nn.utils.rnn.PackedSequence, (~torch.Tensor, ~torch.Tensor):
                The first is a packed variable length sequence.
                The second is a tuple of tensors `h` and `c`.
                `h` of shape ``[num_layers*num_directions, batch_size, hidden_size]`` holds the hidden state for `t=seq_len`.
                Like output, the layers can be separated using ``h.view(num_layers, num_directions, batch_size, hidden_size)``
                and similarly for c.
                `c` of shape ``[num_layers*num_directions, batch_size, hidden_size]`` holds the cell state for `t=seq_len`.
        """
        x, batch_sizes = sequence.data, sequence.batch_sizes.tolist()
        batch_size = batch_sizes[0]
        h_n, c_n = [], []

        if hx is None:
            ih = x.new_zeros(self.num_layers * self.num_directions, batch_size,
                             self.hidden_size)
            h, c = ih, ih
        else:
            h, c = self.permute_hidden(hx, sequence.sorted_indices)
        h = h.view(self.num_layers, self.num_directions, batch_size,
                   self.hidden_size)
        c = c.view(self.num_layers, self.num_directions, batch_size,
                   self.hidden_size)

        for i in range(self.num_layers):
            x = torch.split(x, batch_sizes)
            if self.training:
                mask = SharedDropout.get_mask(x[0], self.dropout)
                x = [t * mask[:len(t)] for t in x]
            x_i, (h_i, c_i) = self.layer_forward(x, (h[i, 0], c[i, 0]),
                                                 self.f_cells[i], batch_sizes)
            if self.bidirectional:
                x_b, (h_b, c_b) = self.layer_forward(x, (h[i, 1], c[i, 1]),
                                                     self.b_cells[i],
                                                     batch_sizes, True)
                x_i = torch.cat((x_i, x_b), -1)
                h_i = torch.stack((h_i, h_b))
                c_i = torch.stack((c_i, c_b))
            x = x_i
            h_n.append(h_i)
            c_n.append(c_i)

        x = PackedSequence(x, sequence.batch_sizes, sequence.sorted_indices,
                           sequence.unsorted_indices)
        hx = torch.cat(h_n, 0), torch.cat(c_n, 0)
        hx = self.permute_hidden(hx, sequence.unsorted_indices)

        return x, hx
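
# Illustrative usage only: `lstm` stands for an instance of the module above, assumed
# here to be built with input_size=100, hidden_size=200, num_layers=2, bidirectional=True.
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

x = torch.randn(4, 15, 100)                        # (batch, seq_len, input_size)
lengths = torch.tensor([15, 12, 9, 5])
packed = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)

output, (h_n, c_n) = lstm(packed)
padded, out_lengths = pad_packed_sequence(output, batch_first=True)
# padded: (4, 15, 2 * 200); h_n and c_n: (num_layers * num_directions, 4, 200)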