Example #1
    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = to_gpu(torch.Tensor([0]))
        tags = to_gpu(
            torch.cat([torch.LongTensor([self.tag_to_ix[START_TAG]]), tags]))

        # print((len(feats), len(self.transitions), len(tags)))
        for i, feat in enumerate(feats):
            score = score + \
                    self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score
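Note: every snippet on this page relies on a to_gpu helper that is never shown. A minimal sketch, assuming it simply moves a tensor or module onto CUDA when a GPU is available (the real project may route through a global USE_GPU flag instead):

    import torch

    def to_gpu(x):
        """Move a tensor or nn.Module to the GPU if one is available."""
        return x.cuda() if torch.cuda.is_available() else x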
Example #2
 def init_hidden(
     self, batch_size
 ) -> Iterable[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]]:
     if self.rnn_type == 'LSTM':
         return [(to_gpu(torch.zeros(1, batch_size, self.hidden_size)),
                  to_gpu(torch.zeros(1, batch_size, self.hidden_size)))
                 for l in range(self.n_layers)]
     elif self.rnn_type == 'SRU' or self.rnn_type == 'GRU':
         return [
             to_gpu(torch.zeros(1, batch_size, self.hidden_size))
             for l in range(self.n_layers)
         ]
Example #3
    def on_epoch(self, X, y):
        model = self.model_wrapper.model

        input_embs, pos_output_embs = model.get_embs(X, y)
        positive_similarity = model.similarity(input_embs,
                                               pos_output_embs).squeeze(1)
        # print(positive_similarity.size())

        batch_size = X.size(0)
        n_samples = batch_size * self.n_negative
        neg_rhs = to_gpu(self.neg_sampling.sample(n_samples))

        _, neg_output_embs = model.get_embs(
            output=neg_rhs)  # (B * n_negative) x dim
        neg_output_embs = neg_output_embs.view(batch_size, self.n_negative,
                                               -1)  # B x n_negative x dim
        negative_similarity = model.similarity(
            input_embs, neg_output_embs).squeeze(1)  # B x n_negative
        # print(negative_similarity.size())

        similarity = model(X)

        loss = self.criterion(positive_similarity, negative_similarity)

        return {'loss': loss, 'logits': torch.max(similarity, dim=-1)[1]}
Example #4
    def generate(self, n_tokens, temperature=1.):
        self.model.eval()
        self.hidden = self.model.init_hidden(1)

        seed = torch.rand(1, 1).mul(n_tokens).long()
        retstr = []
        # retidx = []

        with torch.no_grad():
            for ix in range(n_tokens):
                seed = to_gpu(seed)

                output, self.hidden = self.model(seed, self.hidden)
                word_weights = output.squeeze().data.div(
                    temperature).exp().cpu()

                # filter out inf and negative probabilities
                word_weights[word_weights == float("Inf")] = 0
                word_weights[word_weights < 0] = 0

                word_idx = torch.multinomial(word_weights, 1)[0]
                seed.data.fill_(word_idx)

                word_idx = int(word_idx)
                word = self.featurizer.tokenizer.ix_to_word.get(word_idx, '')
                retstr += [word]
                # retidx += [word_idx]

        self.model.train()

        if self.char_level:
            return ''.join(retstr)
        else:
            return ' '.join(retstr)
Example #5
    def embedded_dropout(self,
                         embed: nn.Module,
                         words: Union[torch.LongTensor, torch.cuda.LongTensor],
                         dropout: float = 0.1,
                         scale=None):

        if dropout:
            mask = embed.weight.data.new().resize_(
                (embed.weight.size(0), 1)).bernoulli_(1 - dropout).expand_as(
                    embed.weight) / (1 - dropout)
            masked_embed_weight = mask * embed.weight
        else:
            masked_embed_weight = embed.weight
        if scale:
            masked_embed_weight = scale.expand_as(
                masked_embed_weight) * masked_embed_weight

        padding_idx = embed.padding_idx
        if padding_idx is None:
            padding_idx = -1

        X = to_gpu(
            torch.nn.functional.embedding(words, masked_embed_weight,
                                          padding_idx, embed.max_norm,
                                          embed.norm_type,
                                          embed.scale_grad_by_freq,
                                          embed.sparse))

        return X
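For reference, a standalone sketch of the same row-level embedding dropout with hypothetical names: it zeroes entire vocabulary rows rather than individual activations, then rescales the surviving rows by 1 / (1 - dropout).

    import torch
    import torch.nn as nn

    embed = nn.Embedding(1000, 64, padding_idx=0)
    words = torch.randint(1, 1000, (8, 20))  # batch of token ids

    dropout = 0.1
    # Drop whole embedding rows, then rescale the rows that survive
    mask = embed.weight.data.new_empty((embed.weight.size(0), 1)) \
        .bernoulli_(1 - dropout).expand_as(embed.weight) / (1 - dropout)
    out = nn.functional.embedding(words, mask * embed.weight, embed.padding_idx)
    # out: 8 x 20 x 64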
Example #6
 def _make_mask_from_seq_lens(self, seq_lens):
     seq_lens = seq_lens.view(-1, 1)
     max_len = torch.max(seq_lens)
     range_tensor = to_gpu(torch.arange(max_len)).unsqueeze(0)
     range_tensor = range_tensor.expand(seq_lens.size(0),
                                        range_tensor.size(1))
     mask = (range_tensor < seq_lens).float()
     return mask
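A quick standalone illustration of the masking logic above (values chosen arbitrarily):

    import torch

    seq_lens = torch.tensor([3, 1, 2]).view(-1, 1)
    range_tensor = torch.arange(int(seq_lens.max())).unsqueeze(0)  # 1 x max_len
    mask = (range_tensor.expand(seq_lens.size(0), -1) < seq_lens).float()
    # tensor([[1., 1., 1.],
    #         [1., 0., 0.],
    #         [1., 1., 0.]])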
Example #7
    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.Tensor(1, self.tagset_size).fill_(-10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = to_gpu(init_vvars)

        for feat in feats:
            next_tag_var = forward_var.view(1, -1).expand(
                self.tagset_size, self.tagset_size) + self.transitions
            _, bptrs_t = torch.max(next_tag_var, dim=1)
            bptrs_t = bptrs_t.squeeze().data.cpu().numpy()
            next_tag_var = next_tag_var.data.cpu().numpy()
            viterbivars_t = next_tag_var[range(len(bptrs_t)), bptrs_t]
            viterbivars_t = torch.FloatTensor(viterbivars_t)

            viterbivars_t = to_gpu(viterbivars_t)

            forward_var = viterbivars_t + feat
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        terminal_var.data[self.tag_to_ix[STOP_TAG]] = -10000.
        terminal_var.data[self.tag_to_ix[START_TAG]] = -10000.

        best_tag_id = argmax(terminal_var.unsqueeze(0))
        path_score = terminal_var[best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path
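This snippet also assumes an argmax helper that is not shown; a minimal sketch returning the best tag index from a 1 x tagset_size score tensor as a Python int:

    def argmax(vec):
        # vec: 1 x tagset_size tensor of scores
        _, idx = torch.max(vec, 1)
        return int(idx.item())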
Example #8
    def forward(self, input_embs):
        batch_size = input_embs.size(0)
        candidate_rhs = to_gpu(
            torch.arange(0, self.n_classes).long().expand(batch_size, -1))
        input_embs, candidate_rhs_repr = self.get_embs(
            input_embs,
            candidate_rhs.contiguous().view(batch_size * self.n_classes))
        candidate_rhs_repr = candidate_rhs_repr.view(batch_size,
                                                     self.n_classes, -1)

        return self.similarity(input_embs, candidate_rhs_repr).squeeze(1)
Example #9
    def __init__(self, config={}):
        super(OvrClassifier, self).__init__()

        self.input_dim = config.get('input_dim', EMBEDDING_DIM)
        self.hidden_size = config.get('hidden_size', 0)
        self.h_dropout_prob = config.get('h_dropout_prob', 0.)
        self.n_classes = config.get('num_classes', 10)

        # ModuleList so each per-class classifier is registered as a submodule
        self.classifiers = nn.ModuleList()
        for ix in range(self.n_classes):
            if self.hidden_size == 0:
                clf = to_gpu(
                    nn.Sequential(nn.Dropout(self.h_dropout_prob),
                                  nn.Linear(self.input_dim, 1)))
            else:
                clf = to_gpu(
                    nn.Sequential(nn.Linear(self.input_dim, self.hidden_size),
                                  nn.Dropout(self.h_dropout_prob),
                                  nn.Sigmoid(), nn.Linear(self.hidden_size,
                                                          1)))
            self.classifiers.append(clf)
Example #10
    def forward(self, sentence):
        words_batch, word_lengths = self._process_sentence(
            [token if len(token) > 0 else UNK_TAG for token in sentence])

        words_batch = to_gpu(words_batch)  # letters x words

        words_batch = self.dropout(
            self.embedding(words_batch))  # letters x words x embeds

        # print('words_batch: %s' % str(words_batch.size()))
        # Sort by length (keep idx)
        word_lengths, idx_sort = np.sort(word_lengths)[::-1], np.argsort(
            -word_lengths)
        idx_unsort = np.argsort(idx_sort)

        idx_sort = to_gpu(torch.from_numpy(idx_sort))

        words_batch = words_batch.index_select(1, idx_sort)

        # Handling padding in Recurrent Networks
        # copy() call is to fix negative strides support in pytorch
        words_packed = pack_padded_sequence(words_batch, word_lengths.copy())
        words_output = self.rnn(words_packed)[0]
        words_output = pad_packed_sequence(words_output)[0]

        # Un-sort by length
        idx_unsort = to_gpu(torch.from_numpy(idx_unsort))

        words_output = words_output.index_select(1, idx_unsort)

        # Max Pooling
        embeds = torch.max(words_output, 0)[0]
        if embeds.ndimension() == 3:
            embeds = embeds.squeeze(0)
            assert embeds.ndimension() == 2

        # print(embeds)

        return embeds  # words x embeds
Example #11
    def on_training_start(self):
        config = self.model_wrapper.config or dict()
        embedding_dim = config.get('embedding_dim', LM_HIDDEN_DIM)

        self.char_level = config.get('char_level', False)

        if self.char_level:
            self.criterion = to_gpu(nn.CrossEntropyLoss())
        else:
            num_words = config.get(
                'num_words', self.model_wrapper.featurizer.tokenizer.num_words)
            splits = []
            if num_words > 500000:
                # One Billion
                # This produces fairly even matrix mults for the buckets:
                # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
                splits = [4200, 35000, 180000]
            elif num_words > 75000:
                # WikiText-103
                splits = [2800, 20000, 76000]
            else:
                splits = [num_words // 3, num_words // 3]

            print('Number of tokens', num_words)
            print('Cross Entropy Splits: Using', splits)

            self.model_wrapper.config['adasoft_cutoffs'] = splits
            self.model_wrapper.config['num_words'] = num_words
            self.criterion = to_gpu(
                SplitCrossEntropyLoss(embedding_dim, splits))

        self.hidden = None

        # regularization
        self.clip_grad = config.get('clip_grad', .25)
        self.alpha = config.get('alpha', 2)
        self.beta = config.get('beta', 1)
        self.batch_size = 0
Example #12
    def forward(self, sent_batch: List[List[str]]):
        max_length = min(max([len(sent) for sent in sent_batch]),
                         self.config.max_position_embeddings)

        words_embeddings = to_gpu(
            torch.FloatTensor(word_to_vec(sent_batch,
                                          pad_to_length=max_length)))

        chars_embeddings = to_gpu(
            torch.stack([
                torch.cat((self.char_encoder(sent),
                           torch.zeros(max_length -
                                       len(sent), self.char_embedding_dim)),
                          dim=0) if len(sent) < max_length else
                self.char_encoder(sent)[:max_length]
                if len(sent) > max_length else self.char_encoder(sent)
                for sent in sent_batch
            ], 0))

        if self.use_position_embeddings:
            position_ids = torch.arange(max_length,
                                        dtype=torch.long,
                                        device=words_embeddings.device)
            position_ids = position_ids.unsqueeze(0).expand(
                words_embeddings.size(0), words_embeddings.size(1))

            position_embeddings = self.position_embeddings(position_ids)

        embeddings = torch.cat([words_embeddings, chars_embeddings], dim=-1)

        if self.use_position_embeddings:
            embeddings = embeddings + position_embeddings

        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
Example #13
 def init_hidden(
     self, batch_size: int
 ) -> Iterable[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]]:
     if self.rnn_type == 'LSTM':
         return [(to_gpu(torch.zeros(2, batch_size, self.hidden_dim // 2)),
                  to_gpu(torch.zeros(2, batch_size, self.hidden_dim // 2)))
                 for l in range(self.n_layers)]
     elif self.rnn_type == 'QRNN':  # 2 hidden layers for each direction
         return [
             to_gpu(torch.zeros(2, batch_size, self.hidden_dim // 2))
             for l in range(self.n_layers)
         ]
     elif self.rnn_type == 'GRU':
         return [
             to_gpu(torch.zeros(2, batch_size, self.hidden_dim // 2))
             for l in range(self.n_layers)
         ]
     elif self.rnn_type == 'SRU':
         return [
             to_gpu(torch.zeros(1, batch_size, self.hidden_dim))
             for l in range(self.n_layers)
         ]
     else:
         return None
Example #14
    def neg_log_likelihood(self, sent_batch, tags):
        word_embeds = to_gpu(
            torch.FloatTensor([word_to_vec(w) for w in sent_batch[0]]))
        word_embeds = self.emb_dropout(word_embeds)

        char_embeds = self.word_encoder(sent_batch[0])

        sentence_in = torch.cat((word_embeds, char_embeds),
                                dim=-1).unsqueeze(1)
        sentence_in = self.dropout(sentence_in)

        feats = self._get_lstm_features(sentence_in)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags[0])
        return feats, forward_score - gold_score
Example #15
    def transform(self, data):
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)

        tokens = [self.tokenize_fn(sent) for sent in data]
        tokens = self.tokenizer.texts_to_sequences(tokens)
        tokens = self.add_ngram(tokens, self.token_indice, self.ngrams)

        max_len = max([len(seq) for seq in tokens])
        if max_len > self.max_len:
            warnings.warn('Max training sequence length is %s, which is higher than max length setting %s' % \
                (max_len, self.max_len), UserWarning)

        tokens = pad_sequences(tokens, maxlen=self.max_len)

        return to_gpu(torch.LongTensor(tokens))
Example #16
    def forward(self,
                sent_batch):  # dont confuse this with _forward_alg above.
        word_embeds = to_gpu(
            torch.FloatTensor([word_to_vec(w) for w in sent_batch[0]]))
        word_embeds = self.emb_dropout(word_embeds)

        char_embeds = self.word_encoder(sent_batch[0])

        sentence_in = torch.cat((word_embeds, char_embeds),
                                dim=-1).unsqueeze(1)

        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence_in)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq, sent_batch[0]
Example #17
    def __init__(self, config):
        super(VNTokenizer, self).__init__()
        self.max_emb_words = config.get('max_emb_words')
        self.embedding_dim = config.get('embedding_dim', EMBEDDING_DIM)
        self.char_embedding_dim = config.get('char_embedding_dim', CHAR_EMBEDDING_DIM)
        self.hidden_dim = config.get('hidden_dim', 1200)
        self.num_layers = config.get('num_layers', 3)
        self.dropout_prob = config.get('dropout_prob', .2)
        self.is_cuda = config.get('is_cuda', torch.cuda.is_available())

        self.word_encoder = to_gpu(BRNNWordEncoder(self.char_embedding_dim, rnn_type='LSTM'))
        self.dropout = nn.Dropout(self.dropout_prob)

        # 0: reserved index by Keras tokenizer
        # num_words + 1: index for oov token
        self.embedding = nn.Embedding(self.max_emb_words + 2, self.embedding_dim)
        self.lstm = nn.LSTM(self.embedding_dim + self.char_embedding_dim,
                            self.hidden_dim // 2,
                            num_layers=self.num_layers,
                            bidirectional=True)
Example #18
    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.Tensor(1, self.tagset_size).fill_(-10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = to_gpu(init_alphas)

        # Iterate through the sentence
        for feat in feats:
            emit_score = feat.view(-1, 1)
            tag_var = forward_var + self.transitions + emit_score
            max_tag_var, _ = torch.max(tag_var, dim=1)
            tag_var = tag_var - max_tag_var.view(-1, 1)
            forward_var = max_tag_var + torch.logsumexp(tag_var, dim=1).view(
                1, -1)

        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = torch.logsumexp(terminal_var, dim=-1)
        return alpha
Example #19
    def init_on_data(self, X, y):
        self.model_wrapper.label_encoder.fit(y)
        self.model_wrapper.n_classes = len(
            self.model_wrapper.label_encoder.classes_)
        self.model_wrapper.config['num_classes'] = self.model_wrapper.n_classes

        config = self.model_wrapper.config
        if 'contexts' in config:
            contexts = config['contexts']
            contexts_list = [
                contexts[label] if label in contexts else []
                for label in self.model_wrapper.label_encoder.classes_
            ]
            self.model_wrapper.config['contexts'] = contexts_list
            # print('number of contexts: %s' % str(len(contexts_list)))

        y_labels = self.model_wrapper.label_encoder.transform(y)
        class_weights = class_weight.compute_class_weight(
            'balanced', np.unique(y_labels), y_labels)
        self.class_weights = to_gpu(torch.from_numpy(class_weights).float())

        self.criterion = nn.CrossEntropyLoss(weight=self.class_weights)
Example #20
    def init_on_data(self, X, y):
        tokens = [self.model_wrapper.tokenize_fn(sent) for sent in X]
        self.model_wrapper.tokenizer.fit_on_texts(tokens)
        self.n_samples = len(tokens)
        # self.n_classes = len(np.unique(y))
        self.buffer_pointer = 0

        self.model_wrapper.label_encoder.fit(y)

        n_classes = len(self.model_wrapper.label_encoder.classes_)
        self.model_wrapper.config['num_classes'] = n_classes

        self.neg_sampling = to_gpu(
            NegativeSampling(n_output=n_classes, n_negative=self.n_negative))
        # self.class_weights = class_weight.compute_class_weight('balanced', np.unique(y), y)

        config = self.model_wrapper.config
        if 'contexts' in config:
            contexts = config['contexts']
            contexts_list = [
                contexts[label] if label in contexts else []
                for label in self.model_wrapper.label_encoder.classes_
            ]
            self.model_wrapper.config['contexts'] = contexts_list
Example #21
    def forward(self, sent_batch):
        sentence = sent_batch[0]
        tokens = self.tokenizer.texts_to_sequences([sentence])

        tokens = to_gpu(torch.LongTensor(tokens))

        word_embeds = self.embedding(tokens).permute(1, 0, 2)
        # print('word_embeds: %s' % str(word_embeds.size()))

        char_embeds = self.word_encoder([
            remove_tone_marks(token) for token in sentence
        ]).unsqueeze(1)
        # print('char_embeds: %s' % str(char_embeds.size()))

        sentence_in = torch.cat((word_embeds, char_embeds), dim=-1)

        seq_len = len(sentence_in)

        # embeds = sentence_in.view(seq_len, 1, -1)  # [seq_len, batch_size, features]
        lstm_out, _ = self.lstm(sentence_in)
        lstm_out = lstm_out.view(seq_len, self.hidden_dim)
        tags = self.hidden2tag(lstm_out).squeeze(1)

        return tags
Example #22
    def __init__(self, config):
        super(BiRNNLanguageModel, self).__init__()
        self.config = config

        self.tie_weights = config.get('tie_weights', True)
        self.embedding_dim = config.get('embedding_dim', LM_HIDDEN_DIM)
        self.hidden_dim = self.embedding_dim if self.tie_weights else config.get(
            'hidden_dim', LM_HIDDEN_DIM)
        self.dropout_emb = config.get('emb_dropout', .2)
        self.dropout_i = config.get('lock_drop', .5)
        self.dropout_h = config.get('h_dropout', .5)
        self.dropout_w = config.get('w_dropout', 0)
        self.num_words = config.get('num_words', LM_VOCAB_SIZE)
        self.rnn_type = config.get('rnn_type', 'SRU')
        self.n_layers = config.get('n_layers', 6)
        self.dropout_rnn = config.get('rnn_dropout', .2)
        self.highway_bias = config.get('highway_bias', -3)
        self.use_adasoft = config.get('use_adasoft', True)
        self.adasoft_cutoffs = config.get(
            'adasoft_cutoffs', [LM_VOCAB_SIZE // 2, LM_VOCAB_SIZE // 2])

        assert self.rnn_type in ['LSTM', 'GRU', 'SRU', 'QRNN']

        self.encoder = nn.Embedding(self.num_words, self.embedding_dim)
        self.lockdrop = to_gpu(LockedDropout())

        # for the mean time weight drop is broken
        if self.rnn_type == 'LSTM':
            self.rnns = [
                nn.LSTM(
                    self.embedding_dim if layer_ix == 0 else self.hidden_dim,
                    self.hidden_dim // 2,
                    bidirectional=True,
                    dropout=self.dropout_rnn)
                for layer_ix in range(self.n_layers)
            ]
            if self.dropout_w:
                self.rnns = [
                    WeightDrop(rnn, ['weight_hh_l0'], dropout=self.dropout_w)
                    for rnn in self.rnns
                ]
        elif self.rnn_type == 'GRU':
            self.rnns = [
                nn.GRU(
                    self.embedding_dim if layer_ix == 0 else self.hidden_dim,
                    self.hidden_dim // 2,
                    bidirectional=True,
                    dropout=self.dropout_rnn)
                for layer_ix in range(self.n_layers)
            ]
            if self.dropout_w:
                self.rnns = [
                    WeightDrop(rnn, ['weight_hh_l0'], dropout=self.dropout_w)
                    for rnn in self.rnns
                ]
        elif self.rnn_type == 'QRNN':
            from torchqrnn import QRNNLayer
            self.rnns = [
                QRNNLayer(
                    self.embedding_dim if layer_ix == 0 else self.hidden_dim,
                    self.hidden_dim // 2,
                    bidirectional=True) for layer_ix in range(self.n_layers)
            ]
            if self.dropout_w:
                for rnn in self.rnns:
                    rnn.linear = WeightDrop(rnn.linear, ['weight'],
                                            dropout=self.dropout_w)
        else:
            from sru import SRU
            self.rnns = [
                to_gpu(
                    SRU(self.embedding_dim
                        if layer_ix == 0 else self.hidden_dim,
                        self.hidden_dim // 2,
                        num_layers=1,
                        rnn_dropout=self.dropout_rnn,
                        dropout=self.dropout_w,
                        rescale=False,
                        highway_bias=self.highway_bias,
                        use_tanh=0,
                        bidirectional=True,
                        v1=True)) for layer_ix in range(self.n_layers)
            ]

        self.rnns = nn.ModuleList(self.rnns)
        self.decoder = nn.Linear(
            self.embedding_dim if self.tie_weights else self.hidden_dim,
            self.num_words)

        # Adaptive softmax
        self.use_adasoft = config.get('use_adasoft', True)

        if self.use_adasoft:
            if 'adasoft_cutoffs' in config:
                splits = config['adasoft_cutoffs']
            else:
                splits = []
                if self.num_words >= 500000:
                    # One Billion
                    # This produces fairly even matrix mults for the buckets:
                    # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
                    splits = [4200, 35000, 180000]
                elif self.num_words >= 75000:
                    # WikiText-103
                    splits = [2800, 20000, 76000]
                elif self.num_words >= 20000:
                    splits = [2000, 4000, 10000]
                else:
                    splits = [self.num_words // 3, self.num_words // 3]

                config['adasoft_cutoffs'] = splits

            # print('Cross Entropy Splits: Using', splits)

            self.adasoft = SplitCrossEntropyLoss(self.hidden_dim,
                                                 splits,
                                                 ignore_index=0)
        else:
            self.adasoft = None

        # Weight tying
        if self.tie_weights:
            self.decoder.weight = self.encoder.weight

        self.init_weights()
Example #23
    def __init__(self, config):
        super(RNNLanguageModel, self).__init__()
        self.config = config

        self.tie_weights = config.get('tie_weights', True)
        self.char_level = config.get('char_level', False)

        self.embedding_dim = config.get(
            'embedding_dim',
            LM_HIDDEN_DIM if self.tie_weights else LM_EMBEDDING_DIM)
        self.hidden_size = self.embedding_dim if self.tie_weights else config.get(
            'hidden_size', LM_HIDDEN_DIM)
        self.num_words = config.get(
            'num_words',
            n_letters + LM_CHAR_RESERVED if self.char_level else LM_VOCAB_SIZE)

        self.dropout_emb = config.get('emb_dropout', .2)
        self.dropout_i = config.get('lock_drop', .5)
        self.dropout_h = config.get('h_dropout', .5)
        self.wdrop = config.get('wdrop', 0)
        self.rnn_type = config.get('rnn_type', 'SRU')
        self.n_layers = config.get('n_layers', 6)
        self.dropout_rnn = config.get('rnn_dropout', .2)
        self.highway_bias = config.get('highway_bias', -3)
        self.adasoft_cutoffs = config.get('adasoft_cutoffs', [LM_VOCAB_SIZE])

        assert self.rnn_type in ['LSTM', 'GRU', 'SRU']

        self.encoder = nn.Embedding(self.num_words, self.embedding_dim)
        self.lockdrop = to_gpu(LockedDropout())

        # for the mean time weight drop is broken
        if self.rnn_type == 'LSTM':
            self.rnns = nn.ModuleList([
                nn.LSTM(
                    self.embedding_dim if layer_ix == 0 else self.hidden_size,
                    self.hidden_size if layer_ix != self.n_layers - 1 else \
                        (self.embedding_dim if self.tie_weights else self.hidden_size),
                )
                for layer_ix in range(self.n_layers)
            ])
        elif self.rnn_type == 'GRU':
            self.rnns = nn.ModuleList([
                nn.GRU(
                    self.embedding_dim if layer_ix == 0 else self.hidden_size,
                    self.hidden_size if layer_ix != self.n_layers - 1 else \
                        (self.embedding_dim if self.tie_weights else self.hidden_size)
                )
                for layer_ix in range(self.n_layers)
            ])
        else:
            from sru import SRU
            self.rnns = nn.ModuleList([
                to_gpu(
                    SRU(self.embedding_dim
                        if layer_ix == 0 else self.hidden_size,
                        self.hidden_size,
                        num_layers=1,
                        rnn_dropout=self.dropout_rnn,
                        dropout=self.wdrop,
                        rescale=False,
                        highway_bias=self.highway_bias,
                        use_tanh=0,
                        v1=True)) for layer_ix in range(self.n_layers)
            ])

        self.decoder = nn.Linear(self.hidden_size, self.num_words)
        # Weight tying
        if self.tie_weights:
            self.decoder.weight = self.encoder.weight

        self.init_weights()
Example #24
    def _viterbi_decode(
        self, emissions: Union[torch.FloatTensor, torch.cuda.FloatTensor],
        mask: Union[torch.FloatTensor,
                    torch.cuda.FloatTensor]) -> torch.Tensor:
        seq_len = emissions.shape[1]
        mask = mask.to(torch.uint8)

        log_prob = emissions[:, 0].clone()
        log_prob += self.transitions[
            self.start_tag, :self.start_tag].unsqueeze(0)

        # At each step, we need to keep track of the total score, as if this step
        # was the last valid step.
        end_scores = log_prob + self.transitions[:self.start_tag,
                                                 self.end_tag].unsqueeze(0)

        best_scores_list = []
        # If the element has only token, empty tensor in best_paths helps
        # torch.cat() from crashing
        best_paths_list = [to_gpu(torch.Tensor().long())]
        best_scores_list.append(end_scores.unsqueeze(1))

        for idx in range(1, seq_len):
            broadcast_emissions = emissions[:, idx].unsqueeze(1)
            broadcast_transmissions = self.transitions[:self.start_tag, :self.
                                                       start_tag].unsqueeze(0)
            broadcast_log_prob = log_prob.unsqueeze(2)

            score = broadcast_emissions + broadcast_transmissions + broadcast_log_prob

            max_scores, max_score_indices = torch.max(score, 1)

            best_paths_list.append(max_score_indices.unsqueeze(1))

            # Storing the scores incase this was the last step.
            end_scores = max_scores + self.transitions[:self.start_tag, self.
                                                       end_tag].unsqueeze(0)

            best_scores_list.append(end_scores.unsqueeze(1))
            log_prob = max_scores

        best_scores = torch.cat(best_scores_list, 1).float()
        best_paths = torch.cat(best_paths_list, 1)

        _, max_indices_from_scores = torch.max(best_scores, 2)

        valid_index_tensor = to_gpu(torch.tensor(0)).long()
        padding_tensor = to_gpu(torch.tensor(self.ignore_index)).long()

        # Label for the last position is always based on the index with max score
        # For illegal timesteps, we set as ignore_index
        labels = max_indices_from_scores[:, seq_len - 1]
        labels = self._mask_tensor(labels, 1.0 - mask[:, seq_len - 1],
                                   padding_tensor)

        all_labels = labels.unsqueeze(1).long()

        # For Viterbi decoding, we start at the last position and go towards first
        for idx in range(seq_len - 2, -1, -1):
            # There are two ways to obtain labels for tokens at a particular position.

            # Option 1: Use the labels obtained from the previous position to index
            # the path in present position. This is used for all positions except
            # last position in the sequence.
            # Option 2: Find the indices with maximum scores obtained during
            # viterbi decoding. This is used for the token at the last position

            # For option 1 need to convert invalid indices to 0 so that lookups
            # dont fail.
            indices_for_lookup = all_labels[:, -1].clone()
            indices_for_lookup = self._mask_tensor(
                indices_for_lookup,
                indices_for_lookup == self.ignore_index,
                valid_index_tensor,
            )

            # Option 1 is used here when previous timestep (idx+1) was valid.
            indices_from_prev_pos = (best_paths[:, idx, :].gather(
                1,
                indices_for_lookup.view(-1, 1).long()).squeeze(1))
            indices_from_prev_pos = self._mask_tensor(indices_from_prev_pos,
                                                      (1.0 - mask[:, idx + 1]),
                                                      padding_tensor)

            # Option 2 is used when last timestep was not valid which means idx+1
            # is the last position in the sequence.
            indices_from_max_scores = max_indices_from_scores[:, idx]
            indices_from_max_scores = self._mask_tensor(
                indices_from_max_scores, mask[:, idx + 1], padding_tensor)

            # We need to combine results from 1 and 2 as rows in a batch can have
            # sequences of varying lengths
            labels = torch.where(
                indices_from_max_scores == self.ignore_index,
                indices_from_prev_pos,
                indices_from_max_scores,
            )

            # Set to ignore_index if present state is not valid.
            labels = self._mask_tensor(labels, (1 - mask[:, idx]),
                                       padding_tensor)
            all_labels = torch.cat((all_labels, labels.view(-1, 1).long()), 1)

        return torch.flip(all_labels, [1])
Example #25
 def repackage_hidden(self, h) -> Union[torch.Tensor, Tuple]:
     if torch.is_tensor(h):
         return to_gpu(h.detach())
     else:
         return tuple(self.repackage_hidden(v) for v in h)
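A standalone illustration of why the hidden state is repackaged between batches (hypothetical, minimal setup): detach() cuts the autograd graph so truncated BPTT does not backpropagate through earlier windows.

    import torch
    import torch.nn as nn

    def repackage_hidden(h):
        if torch.is_tensor(h):
            return h.detach()
        return tuple(repackage_hidden(v) for v in h)

    rnn = nn.LSTM(input_size=8, hidden_size=16)
    hidden = (torch.zeros(1, 4, 16), torch.zeros(1, 4, 16))
    for _ in range(3):                     # three truncated-BPTT windows
        hidden = repackage_hidden(hidden)  # detach from the previous graph
        out, hidden = rnn(torch.randn(5, 4, 8), hidden)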
Example #26
    def fit(self,
            training_data: Iterable = None,
            validation_data: Iterable = None,
            epochs: int = 1,
            minibatches: int = None,
            epoch_start: int = 0,
            batch_size: int = 64,
            shuffle: bool = True,
            optimize_on_cpu: bool = False,
            fp16: bool = False,
            gradient_accumulation_steps: int = 1,
            callbacks: Iterable[object] = [],
            clip_grad: float = 0):

        if self._uneven_batch_size: batch_size = 1

        self._batch_size = batch_size

        self.clip_grad = clip_grad

        if gradient_accumulation_steps and 'gradient_accumulation_steps' in dict(
                inspect.getmembers(
                    self.on_epoch.__func__.__code__))['co_varnames']:
            print('Gradient accumulation is supported by this class')
            self.gradient_accumulation_steps = gradient_accumulation_steps
        else:
            self.gradient_accumulation_steps = 1

        if training_data is not None: self.set_training_data(training_data)
        if validation_data is not None:
            self.set_validation_data(validation_data)

        for callback in callbacks:
            callback.set_learner(self)

        self._callbacks = callbacks or []
        self._n_epochs = epochs
        self._optimize_on_cpu = optimize_on_cpu

        # Preprocess data. If data is already a dataset class
        # then preprocessing logic should be implemented in the class
        if not self._is_dataset:
            X, y = self._data

            # Process input and output data - if needed (tokenization etc.)
            if self.model_wrapper._featurizer is not None:
                self.model_wrapper._featurizer.fit(X)

            X = self.model_wrapper.preprocess_dataset_X(X)
            y = self.model_wrapper.preprocess_dataset_y(y)

            self.init_on_data(X, y)

            # Preprocess all batches of data (adding n-grams etc.)
            # If data should be lazily processed, use the Dataset class instead.

            if self._preprocess_batch:
                if self.model_wrapper._featurizer is not None:
                    dataset = BatchPreprocessedDataset(
                        X,
                        y,
                        input_process_fn=lambda _X: self.model_wrapper.
                        preprocess_input(
                            self.model_wrapper._featurizer.transform(_X)),
                        output_process_fn=self.model_wrapper.preprocess_output,
                        batch_size=batch_size)
                else:
                    dataset = BatchPreprocessedDataset(
                        X,
                        y,
                        input_process_fn=self.model_wrapper.preprocess_input,
                        output_process_fn=self.model_wrapper.preprocess_output,
                        batch_size=batch_size)
            else:
                if self.model_wrapper._featurizer is not None:
                    X = self.model_wrapper._featurizer.transform(X)

                X = self.model_wrapper.preprocess_input(X)
                y = self.model_wrapper.preprocess_output(y)

                if not self._uneven_batch_size:
                    dataset = GenericDataset(X, y)
        else:
            dataset = self._data

            self.init_on_dataset(dataset)

        # Call on_training_start hooks
        self.on_training_start()
        for callback in self.callbacks:
            callback.on_training_start()

        if self._verbose == 2:
            from tqdm import trange
            iterator = trange(epoch_start,
                              self._n_epochs,
                              desc='Epochs',
                              leave=False)
        else:
            iterator = range(epoch_start, self._n_epochs)

        cpu_count = int(
            os.environ.get('NUM_WORKERS', max(mp.cpu_count() - 1, 1)))

        if batch_size is None:
            batch_size = len(dataset)

        if USE_GPU:
            try:
                mp.set_start_method('spawn')
            except RuntimeError:
                warnings.warn(
                    'Error occurred in multiprocessing.set_start_method')

        if not self._uneven_batch_size:
            loader_kwargs = {
                'batch_size': batch_size,
                'num_workers': cpu_count,
                'shuffle': shuffle
            }
            if USE_GPU:
                loader_kwargs['pin_memory'] = True
            if self._collate_fn is not None:
                loader_kwargs['collate_fn'] = self._collate_fn

            data_loader = DataLoader(dataset, **loader_kwargs)
        else:
            data_loader = [([X[idx]], [y[idx]]) for idx in range(len(X))]

        if self.model_wrapper._featurizer is not None:
            self.model_wrapper.config[
                'input_shape'] = self.model_wrapper._featurizer.get_output_shape(
                )

        if self.model_wrapper.model is None:
            self.model_wrapper.init_model()
        self.on_model_init()

        model = self.model_wrapper._model

        # optimizer must be initialized after the model
        if self.optimizer is None and self._auto_optimize:
            optim_params = [(n, param)
                            for n, param in model.named_parameters()
                            if param.requires_grad]

            if self._optimize_on_cpu:
                optim_params = [
                    (n, param.clone().detach().to('cpu').requires_grad_()) \
                    for n, param in optim_params
                ]

            self.optimizer = self._optimizer_fn([p for n, p in optim_params],
                                                **self._optimizer_kwargs)

        if self.model_wrapper.is_pytorch_module() and not hasattr(
                self, 'criterion'):
            raise ValueError(
                'Criterion must be set for the Learner class before training')

        # fp16
        if fp16:
            try:
                from apex import amp, optimizers
                from apex.multi_tensor_apply import multi_tensor_applier

                model, self.optimizer = amp.initialize(model,
                                                       self.optimizer,
                                                       opt_level="O1",
                                                       loss_scale="dynamic")
                self.model_wrapper._model = model
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to run this example."
                )

        # Main training loop
        try:
            for epoch in iterator:
                if self._halt:  # For early stopping
                    self._halt = False
                    break

                self._current_epoch = epoch
                self._metrics = None

                self.on_epoch_start()

                for callback in self._callbacks:
                    callback.on_epoch_start()

                for batch_idx, (X_batch, y_batch) in enumerate(data_loader, 0):
                    if self._halt:  # For early stopping / skipping batches
                        break

                    self._batch_idx = batch_idx

                    for callback in self.callbacks:
                        callback.on_batch_start()

                    if model is not None and self.model_wrapper.is_pytorch_module(
                    ):
                        model.train()

                    args = to_gpu(X_batch), to_gpu(y_batch)
                    kwargs = {}
                    if gradient_accumulation_steps > 1:
                        kwargs[
                            'gradient_accumulation_steps'] = self.gradient_accumulation_steps

                    epoch_ret = self.on_epoch(*args, **kwargs)

                    if epoch_ret is not None:
                        if 'logits' in epoch_ret:
                            with torch.no_grad():
                                batch_metrics = self.calculate_metrics(
                                    epoch_ret['logits'], y_batch) or {}
                        else:
                            batch_metrics = {}

                        if 'loss' in epoch_ret:
                            epoch_loss = epoch_ret['loss']

                            # backward
                            if fp16:
                                with amp.scale_loss(
                                        epoch_loss,
                                        self.optimizer) as scaled_loss:
                                    scaled_loss.backward()

                                if self.clip_grad > 0:
                                    torch.nn.utils.clip_grad_norm_(
                                        amp.master_params(self.optimizer),
                                        self.clip_grad)
                            else:
                                epoch_loss.backward()

                                if self.clip_grad > 0:
                                    torch.nn.utils.clip_grad_norm_(
                                        model.parameters(), self.clip_grad)

                            epoch_ret['loss'] = epoch_loss.detach().cpu().item(
                            )
                            batch_metrics['loss'] = epoch_ret['loss']

                        self._batch_metrics = batch_metrics

                        if self._metrics is None:
                            self._metrics = batch_metrics
                        else:
                            self._metrics = {
                                k: v + batch_metrics[k]
                                for k, v in self._metrics.items()
                            }

                    if self.model_wrapper.is_pytorch_module(
                    ) and self._auto_optimize:
                        if (batch_idx +
                                1) % self.gradient_accumulation_steps == 0:
                            if self._optimize_on_cpu:
                                is_nan = set_optimizer_params_grad(
                                    optim_params,
                                    model.named_parameters(),
                                    test_nan=True)

                                self.optimizer.step()
                                copy_optimizer_params_to_model(
                                    model.named_parameters(), optim_params)

                            else:
                                self.optimizer.step()

                            model.zero_grad()

                    for callback in self.callbacks:
                        callback.on_batch_end()

                    if epochs == 1 and minibatches is not None:
                        if batch_idx >= minibatches:
                            self._halt = True

                self.on_epoch_end()

                for callback in self.callbacks:
                    callback.on_epoch_end()

        except KeyboardInterrupt:
            warnings.warn('Training aborted')

        for callback in self.callbacks:
            callback.on_training_end()

        self.on_training_end()
Example #27
    def init_hidden(self):
        hidden_0 = torch.zeros(self.num_layers * 2, 1, self.hidden_size // 2)
        hidden_1 = torch.zeros(self.num_layers * 2, 1, self.hidden_size // 2)

        return to_gpu(hidden_0), to_gpu(hidden_1)
Example #28
    def init_model(self, update_configs: dict = {}):
        if self._from_fp is None:
            model_state = None
        else:
            self._model = None

            if torch.cuda.is_available():
                model_state = torch.load(self._from_fp)
            else:
                model_state = torch.load(
                    self._from_fp, map_location=lambda storage, loc: storage)

        if model_state is None:
            config = self.config or dict()
        else:
            config = model_state.get('config', dict())
            self._onnx = model_state.get('onnx', None)

        # convert to dotdict
        if isinstance(config, dict) and not isinstance(config, dotdict):
            config = dotdict(config)
            self.config = config

        config.update(update_configs)

        if self.is_pytorch_module():
            # re-initialize model with loaded config
            self._model = self._model_class(config=config,
                                            *self._args,
                                            **self._kwargs)
            if self._use_data_parallel:
                self._model = nn.DataParallel(self._model, dim=1)
            # if fp16: self._model.half()
            self._model = to_gpu(self._model)
        else:
            # initialize model normally
            if self._onnx is None:
                self._model = self._model_class(*self._args, **self._kwargs)

        if model_state is not None:
            featurizer = model_state.get('featurizer', None)

            if featurizer is None:
                if self._featurizer is None:
                    warnings.warn(
                        'Featurizer is not found in this binary. This is likely to be an error'
                    )
            else:
                # print('Featurizer found: ', featurizer)
                self._featurizer = featurizer
            state_dict = model_state.get('state_dict', None)

            if self.is_pytorch_module():
                if state_dict is not None:
                    self._model.load_state_dict(state_dict, strict=False)
            elif self._onnx is not None:
                import onnx
                self._onnx_model = onnx.load(self._onnx)
                print('Loaded ONNX model')

            self.load_state_dict(model_state)

        self.config = config
        self.on_model_init()
Example #29
        inputs, outputs = next(iter(loader))

        outputs = outputs.view(inputs.size(0), inputs.size(1))

        if model._onnx is not None:
            padded_input = torch.zeros(EXPORT_SIZE).long()
            padded_output = torch.zeros(EXPORT_SIZE).long()

            padded_input[:inputs.size(0)] = inputs
            padded_output[:outputs.size(0)] = outputs

            inputs = padded_input
            outputs = padded_output

        # print(inputs.size())
        inputs, outputs = to_gpu(inputs), to_gpu(outputs)
        result, hidden = model(inputs)
        result = torch.max(result, dim=1)[1].view(inputs.size(0),
                                                  inputs.size(1))

        mask = (outputs != 0)
        total_count += mask.sum().item()
        total_correct += (result.masked_select(mask) == outputs.masked_select(
            mask)).sum().item()
        # total_accuracy += accuracy(result.masked_select(mask), outputs.masked_select(mask))

    # total_accuracy /= TEST_EPOCHS
    total_accuracy = total_correct / total_count
    print('Accuracy over %s test sentences: %4f' %
          (TEST_EPOCHS * BATCH_SIZE, total_accuracy * 100))
Example #30
 def on_model_init(self):
     self.criterion = to_gpu(
         MarginRankingLoss(margin=self.model_wrapper.loss_margin))