def _score_sentence(self, feats, tags):
    # Gives the score of a provided tag sequence
    score = to_gpu(torch.Tensor([0]))
    tags = to_gpu(
        torch.cat([torch.LongTensor([self.tag_to_ix[START_TAG]]), tags]))

    for i, feat in enumerate(feats):
        score = score + \
            self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
    score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
    return score
def init_hidden(
        self, batch_size
) -> Iterable[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]]:
    if self.rnn_type == 'LSTM':
        return [(to_gpu(torch.zeros(1, batch_size, self.hidden_size)),
                 to_gpu(torch.zeros(1, batch_size, self.hidden_size)))
                for l in range(self.n_layers)]
    elif self.rnn_type in ('SRU', 'GRU'):
        return [
            to_gpu(torch.zeros(1, batch_size, self.hidden_size))
            for l in range(self.n_layers)
        ]
def on_epoch(self, X, y):
    model = self.model_wrapper.model

    input_embs, pos_output_embs = model.get_embs(X, y)
    positive_similarity = model.similarity(input_embs,
                                           pos_output_embs).squeeze(1)

    batch_size = X.size(0)
    n_samples = batch_size * self.n_negative
    neg_rhs = to_gpu(self.neg_sampling.sample(n_samples))

    _, neg_output_embs = model.get_embs(
        output=neg_rhs)  # (B * n_negative) x dim
    neg_output_embs = neg_output_embs.view(batch_size, self.n_negative,
                                           -1)  # B x n_negative x dim
    negative_similarity = model.similarity(
        input_embs, neg_output_embs).squeeze(1)  # B x n_negative

    similarity = model(X)

    loss = self.criterion(positive_similarity, negative_similarity)

    return {'loss': loss, 'logits': torch.max(similarity, dim=-1)[1]}
def generate(self, n_tokens, temperature=1.):
    self.model.eval()
    self.hidden = self.model.init_hidden(1)

    seed = torch.rand(1, 1).mul(n_tokens).long()
    retstr = []

    with torch.no_grad():
        for ix in range(n_tokens):
            seed = to_gpu(seed)
            output, self.hidden = self.model(seed, self.hidden)

            word_weights = output.squeeze().data.div(
                temperature).exp().cpu()
            # filter out inf and negative probabilities
            word_weights[word_weights == float("Inf")] = 0
            word_weights[word_weights < 0] = 0

            word_idx = torch.multinomial(word_weights, 1)[0]
            seed.data.fill_(word_idx)

            word_idx = int(word_idx)
            word = self.featurizer.tokenizer.ix_to_word.get(word_idx, '')
            retstr += [word]

    self.model.train()

    if self.char_level:
        return ''.join(retstr)
    else:
        return ' '.join(retstr)
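# Illustrative sketch (not part of the original code): the effect of the
# temperature division used in generate() above. Scores are divided by the
# temperature before exponentiating, so temperatures below 1 sharpen the
# sampling distribution and temperatures above 1 flatten it. The logit
# values below are made up for the example.
import torch

example_logits = torch.tensor([2.0, 1.0, 0.1])
for example_temperature in (0.5, 1.0, 2.0):
    weights = example_logits.div(example_temperature).exp()
    print(example_temperature, (weights / weights.sum()).tolist())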
def embedded_dropout(self,
                     embed: nn.Module,
                     words: Union[torch.LongTensor, torch.cuda.LongTensor],
                     dropout: float = 0.1,
                     scale=None):
    if dropout:
        mask = embed.weight.data.new().resize_(
            (embed.weight.size(0), 1)).bernoulli_(1 - dropout).expand_as(
                embed.weight) / (1 - dropout)
        masked_embed_weight = mask * embed.weight
    else:
        masked_embed_weight = embed.weight

    if scale:
        masked_embed_weight = scale.expand_as(
            masked_embed_weight) * masked_embed_weight

    padding_idx = embed.padding_idx
    if padding_idx is None:
        padding_idx = -1

    X = to_gpu(
        torch.nn.functional.embedding(words, masked_embed_weight,
                                      padding_idx, embed.max_norm,
                                      embed.norm_type,
                                      embed.scale_grad_by_freq,
                                      embed.sparse))

    return X
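# Illustrative sketch (not part of the original module): the same row-wise
# embedding dropout applied to a toy nn.Embedding on CPU. Entire embedding
# rows are zeroed with probability `dropout` and the surviving rows are
# rescaled by 1 / (1 - dropout), so each row's expected value is unchanged.
import torch
import torch.nn as nn

toy_embed = nn.Embedding(10, 4)
toy_words = torch.LongTensor([[1, 2, 3]])
toy_dropout = 0.5

row_mask = toy_embed.weight.data.new(toy_embed.weight.size(0), 1) \
    .bernoulli_(1 - toy_dropout).expand_as(toy_embed.weight) / (1 - toy_dropout)
dropped = nn.functional.embedding(toy_words, row_mask * toy_embed.weight)
print(dropped.shape)  # torch.Size([1, 3, 4])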
def _make_mask_from_seq_lens(self, seq_lens):
    seq_lens = seq_lens.view(-1, 1)
    max_len = torch.max(seq_lens)
    range_tensor = to_gpu(torch.arange(max_len)).unsqueeze(0)
    range_tensor = range_tensor.expand(seq_lens.size(0),
                                       range_tensor.size(1))
    mask = (range_tensor < seq_lens).float()
    return mask
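# Illustrative sketch (not part of the original code) of the mask built by
# _make_mask_from_seq_lens above: for sequence lengths [3, 1] the broadcast
# comparison yields a 1 for every valid timestep and a 0 for padding.
import torch

toy_seq_lens = torch.tensor([3, 1]).view(-1, 1)
toy_range = torch.arange(int(toy_seq_lens.max())).unsqueeze(0).expand(
    toy_seq_lens.size(0), -1)
print((toy_range < toy_seq_lens).float())
# tensor([[1., 1., 1.],
#         [1., 0., 0.]])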
def _viterbi_decode(self, feats):
    backpointers = []

    # Initialize the viterbi variables in log space
    init_vvars = torch.Tensor(1, self.tagset_size).fill_(-10000.)
    init_vvars[0][self.tag_to_ix[START_TAG]] = 0

    # forward_var at step i holds the viterbi variables for step i-1
    forward_var = to_gpu(init_vvars)

    for feat in feats:
        next_tag_var = forward_var.view(1, -1).expand(
            self.tagset_size, self.tagset_size) + self.transitions
        _, bptrs_t = torch.max(next_tag_var, dim=1)
        bptrs_t = bptrs_t.squeeze().data.cpu().numpy()

        next_tag_var = next_tag_var.data.cpu().numpy()
        viterbivars_t = next_tag_var[range(len(bptrs_t)), bptrs_t]
        viterbivars_t = to_gpu(torch.FloatTensor(viterbivars_t))

        forward_var = viterbivars_t + feat
        backpointers.append(bptrs_t)

    # Transition to STOP_TAG
    terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
    terminal_var.data[self.tag_to_ix[STOP_TAG]] = -10000.
    terminal_var.data[self.tag_to_ix[START_TAG]] = -10000.

    best_tag_id = argmax(terminal_var.unsqueeze(0))
    path_score = terminal_var[best_tag_id]

    # Follow the back pointers to decode the best path.
    best_path = [best_tag_id]
    for bptrs_t in reversed(backpointers):
        best_tag_id = bptrs_t[best_tag_id]
        best_path.append(best_tag_id)

    # Pop off the start tag (we don't want to return that to the caller)
    start = best_path.pop()
    assert start == self.tag_to_ix[START_TAG]  # Sanity check

    best_path.reverse()
    return path_score, best_path
def forward(self, input_embs):
    batch_size = input_embs.size(0)

    candidate_rhs = to_gpu(
        torch.arange(0, self.n_classes).long().expand(batch_size, -1))
    input_embs, candidate_rhs_repr = self.get_embs(
        input_embs,
        candidate_rhs.contiguous().view(batch_size * self.n_classes))
    candidate_rhs_repr = candidate_rhs_repr.view(batch_size,
                                                 self.n_classes, -1)

    return self.similarity(input_embs, candidate_rhs_repr).squeeze(1)
def __init__(self, config={}):
    super(OvrClassifier, self).__init__()

    self.input_dim = config.get('input_dim', EMBEDDING_DIM)
    self.hidden_size = config.get('hidden_size', 0)
    self.h_dropout_prob = config.get('h_dropout_prob', 0.)
    self.n_classes = config.get('num_classes', 10)

    # Use a ModuleList so the per-class classifiers are registered as
    # submodules and show up in parameters() and state_dict()
    self.classifiers = nn.ModuleList()
    for ix in range(self.n_classes):
        if self.hidden_size == 0:
            clf = to_gpu(
                nn.Sequential(nn.Dropout(self.h_dropout_prob),
                              nn.Linear(self.input_dim, 1)))
        else:
            clf = to_gpu(
                nn.Sequential(nn.Linear(self.input_dim, self.hidden_size),
                              nn.Dropout(self.h_dropout_prob),
                              nn.Sigmoid(),
                              nn.Linear(self.hidden_size, 1)))
        self.classifiers.append(clf)
def forward(self, sentence):
    words_batch, word_lengths = self._process_sentence(
        [token if len(token) > 0 else UNK_TAG for token in sentence])

    words_batch = to_gpu(words_batch)  # letters x words
    words_batch = self.dropout(
        self.embedding(words_batch))  # letters x words x embeds

    # Sort by length (keep idx)
    word_lengths, idx_sort = np.sort(word_lengths)[::-1], np.argsort(
        -word_lengths)
    idx_unsort = np.argsort(idx_sort)

    idx_sort = to_gpu(torch.from_numpy(idx_sort))
    words_batch = words_batch.index_select(1, idx_sort)

    # Handling padding in Recurrent Networks
    # copy() call is to fix negative strides support in pytorch
    words_packed = pack_padded_sequence(words_batch, word_lengths.copy())
    words_output = self.rnn(words_packed)[0]
    words_output = pad_packed_sequence(words_output)[0]

    # Un-sort by length
    idx_unsort = to_gpu(torch.from_numpy(idx_unsort))
    words_output = words_output.index_select(1, idx_unsort)

    # Max Pooling
    embeds = torch.max(words_output, 0)[0]
    if embeds.ndimension() == 3:
        embeds = embeds.squeeze(0)
    assert embeds.ndimension() == 2

    return embeds  # words x embeds
def on_training_start(self):
    config = self.model_wrapper.config or dict()
    embedding_dim = config.get('embedding_dim', LM_HIDDEN_DIM)
    self.char_level = config.get('char_level', False)

    if self.char_level:
        self.criterion = to_gpu(nn.CrossEntropyLoss())
    else:
        num_words = config.get(
            'num_words', self.model_wrapper.featurizer.tokenizer.num_words)

        if num_words > 500000:
            # One Billion
            # This produces fairly even matrix mults for the buckets:
            # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
            splits = [4200, 35000, 180000]
        elif num_words > 75000:
            # WikiText-103
            splits = [2800, 20000, 76000]
        else:
            splits = [num_words // 3, num_words // 3]

        print('Number of tokens', num_words)
        print('Cross Entropy Splits: Using', splits)

        self.model_wrapper.config['adasoft_cutoffs'] = splits
        self.model_wrapper.config['num_words'] = num_words

        self.criterion = to_gpu(
            SplitCrossEntropyLoss(embedding_dim, splits))

    self.hidden = None

    # regularization
    self.clip_grad = config.get('clip_grad', .25)
    self.alpha = config.get('alpha', 2)
    self.beta = config.get('beta', 1)
    self.batch_size = 0
def forward(self, sent_batch: List[List[str]]):
    max_length = min(max([len(sent) for sent in sent_batch]),
                     self.config.max_position_embeddings)

    words_embeddings = to_gpu(
        torch.FloatTensor(
            word_to_vec(sent_batch, pad_to_length=max_length)))
    chars_embeddings = to_gpu(
        torch.stack([
            torch.cat((self.char_encoder(sent),
                       torch.zeros(max_length - len(sent),
                                   self.char_embedding_dim)),
                      dim=0) if len(sent) < max_length else
            self.char_encoder(sent)[:max_length]
            if len(sent) > max_length else self.char_encoder(sent)
            for sent in sent_batch
        ], 0))

    # word-level and character-level features, concatenated per token
    embeddings = torch.cat([words_embeddings, chars_embeddings], dim=-1)

    if self.use_position_embeddings:
        position_ids = torch.arange(max_length,
                                    dtype=torch.long,
                                    device=words_embeddings.device)
        position_ids = position_ids.unsqueeze(0).expand(
            words_embeddings.size(0), words_embeddings.size(1))
        position_embeddings = self.position_embeddings(position_ids)
        embeddings = embeddings + position_embeddings

    embeddings = self.LayerNorm(embeddings)
    embeddings = self.dropout(embeddings)
    return embeddings
def init_hidden(
        self, batch_size: int
) -> Iterable[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]]:
    if self.rnn_type == 'LSTM':
        return [(to_gpu(torch.zeros(2, batch_size, self.hidden_dim // 2)),
                 to_gpu(torch.zeros(2, batch_size, self.hidden_dim // 2)))
                for l in range(self.n_layers)]
    elif self.rnn_type in ('QRNN', 'GRU'):
        # 2 hidden states per layer, one for each direction
        return [
            to_gpu(torch.zeros(2, batch_size, self.hidden_dim // 2))
            for l in range(self.n_layers)
        ]
    elif self.rnn_type == 'SRU':
        return [
            to_gpu(torch.zeros(1, batch_size, self.hidden_dim))
            for l in range(self.n_layers)
        ]
    else:
        return None
def neg_log_likelihood(self, sent_batch, tags):
    word_embeds = to_gpu(
        torch.FloatTensor([word_to_vec(w) for w in sent_batch[0]]))
    word_embeds = self.emb_dropout(word_embeds)

    char_embeds = self.word_encoder(sent_batch[0])

    sentence_in = torch.cat((word_embeds, char_embeds),
                            dim=-1).unsqueeze(1)
    sentence_in = self.dropout(sentence_in)

    feats = self._get_lstm_features(sentence_in)
    forward_score = self._forward_alg(feats)
    gold_score = self._score_sentence(feats, tags[0])

    return feats, forward_score - gold_score
def transform(self, data):
    if self.tokenizer is None:
        self.tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)

    tokens = [self.tokenize_fn(sent) for sent in data]
    tokens = self.tokenizer.texts_to_sequences(tokens)
    tokens = self.add_ngram(tokens, self.token_indice, self.ngrams)

    max_len = max([len(seq) for seq in tokens])
    if max_len > self.max_len:
        warnings.warn(
            'Max training sequence length is %s, which is higher than max length setting %s'
            % (max_len, self.max_len), UserWarning)

    tokens = pad_sequences(tokens, maxlen=self.max_len)

    return to_gpu(torch.LongTensor(tokens))
def forward(self, sent_batch):
    # Don't confuse this with _forward_alg above.
    word_embeds = to_gpu(
        torch.FloatTensor([word_to_vec(w) for w in sent_batch[0]]))
    word_embeds = self.emb_dropout(word_embeds)

    char_embeds = self.word_encoder(sent_batch[0])

    sentence_in = torch.cat((word_embeds, char_embeds),
                            dim=-1).unsqueeze(1)

    # Get the emission scores from the BiLSTM
    lstm_feats = self._get_lstm_features(sentence_in)

    # Find the best path, given the features.
    score, tag_seq = self._viterbi_decode(lstm_feats)

    return score, tag_seq, sent_batch[0]
def __init__(self, config):
    super(VNTokenizer, self).__init__()

    self.max_emb_words = config.get('max_emb_words')
    self.embedding_dim = config.get('embedding_dim', EMBEDDING_DIM)
    self.char_embedding_dim = config.get('char_embedding_dim',
                                         CHAR_EMBEDDING_DIM)
    self.hidden_dim = config.get('hidden_dim', 1200)
    self.num_layers = config.get('num_layers', 3)
    self.dropout_prob = config.get('dropout_prob', .2)
    self.is_cuda = torch.cuda.is_available()

    self.word_encoder = to_gpu(
        BRNNWordEncoder(self.char_embedding_dim, rnn_type='LSTM'))
    self.dropout = nn.Dropout(self.dropout_prob)

    # 0: reserved index by Keras tokenizer
    # num_words + 1: index for oov token
    self.embedding = nn.Embedding(self.max_emb_words + 2,
                                  self.embedding_dim)
    self.lstm = nn.LSTM(self.embedding_dim + self.char_embedding_dim,
                        self.hidden_dim // 2,
                        num_layers=self.num_layers,
                        bidirectional=True)
def _forward_alg(self, feats):
    # Do the forward algorithm to compute the partition function
    init_alphas = torch.Tensor(1, self.tagset_size).fill_(-10000.)
    # START_TAG has all of the score.
    init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

    # Wrap in a variable so that we will get automatic backprop
    forward_var = to_gpu(init_alphas)

    # Iterate through the sentence
    for feat in feats:
        emit_score = feat.view(-1, 1)
        tag_var = forward_var + self.transitions + emit_score

        max_tag_var, _ = torch.max(tag_var, dim=1)
        tag_var = tag_var - max_tag_var.view(-1, 1)
        forward_var = max_tag_var + torch.logsumexp(tag_var, dim=1).view(
            1, -1)

    terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
    alpha = torch.logsumexp(terminal_var, dim=-1)
    return alpha
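# Illustrative sketch (not part of the original code) of the max-shifted
# log-sum-exp used in _forward_alg above: subtracting the per-row maximum
# before exponentiating avoids overflow, and adding it back afterwards gives
# the same result as a direct torch.logsumexp.
import torch

toy_scores = torch.tensor([[1000.0, 999.0, 998.0]])
row_max, _ = torch.max(toy_scores, dim=1)
shifted = toy_scores - row_max.view(-1, 1)
stable = row_max + torch.logsumexp(shifted, dim=1)
print(torch.allclose(stable, torch.logsumexp(toy_scores, dim=1)))  # True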
def init_on_data(self, X, y):
    self.model_wrapper.label_encoder.fit(y)
    self.model_wrapper.n_classes = len(
        self.model_wrapper.label_encoder.classes_)
    self.model_wrapper.config['num_classes'] = self.model_wrapper.n_classes

    config = self.model_wrapper.config
    if 'contexts' in config:
        contexts = config['contexts']
        contexts_list = [
            contexts[label] if label in contexts else []
            for label in self.model_wrapper.label_encoder.classes_
        ]
        self.model_wrapper.config['contexts'] = contexts_list

    y_labels = self.model_wrapper.label_encoder.transform(y)
    class_weights = class_weight.compute_class_weight(
        'balanced', np.unique(y_labels), y_labels)
    self.class_weights = to_gpu(torch.from_numpy(class_weights).float())

    self.criterion = nn.CrossEntropyLoss(weight=self.class_weights)
def init_on_data(self, X, y):
    tokens = [self.model_wrapper.tokenize_fn(sent) for sent in X]
    self.model_wrapper.tokenizer.fit_on_texts(tokens)
    self.n_samples = len(tokens)
    self.buffer_pointer = 0

    self.model_wrapper.label_encoder.fit(y)
    n_classes = len(self.model_wrapper.label_encoder.classes_)
    self.model_wrapper.config['num_classes'] = n_classes

    self.neg_sampling = to_gpu(
        NegativeSampling(n_output=n_classes, n_negative=self.n_negative))

    config = self.model_wrapper.config
    if 'contexts' in config:
        contexts = config['contexts']
        contexts_list = [
            contexts[label] if label in contexts else []
            for label in self.model_wrapper.label_encoder.classes_
        ]
        self.model_wrapper.config['contexts'] = contexts_list
def forward(self, sent_batch):
    sentence = sent_batch[0]

    tokens = self.tokenizer.texts_to_sequences([sentence])
    tokens = to_gpu(torch.LongTensor(tokens))

    word_embeds = self.embedding(tokens).permute(1, 0, 2)
    char_embeds = self.word_encoder([
        remove_tone_marks(token) for token in sentence
    ]).unsqueeze(1)

    sentence_in = torch.cat((word_embeds, char_embeds), dim=-1)
    seq_len = len(sentence_in)

    # [seq_len, batch_size, features]
    lstm_out, _ = self.lstm(sentence_in)
    lstm_out = lstm_out.view(seq_len, self.hidden_dim)

    tags = self.hidden2tag(lstm_out).squeeze(1)
    return tags
def __init__(self, config):
    super(BiRNNLanguageModel, self).__init__()

    self.config = config

    self.tie_weights = config.get('tie_weights', True)
    self.embedding_dim = config.get('embedding_dim', LM_HIDDEN_DIM)
    self.hidden_dim = self.embedding_dim if self.tie_weights else config.get(
        'hidden_dim', LM_HIDDEN_DIM)
    self.dropout_emb = config.get('emb_dropout', .2)
    self.dropout_i = config.get('lock_drop', .5)
    self.dropout_h = config.get('h_dropout', .5)
    self.dropout_w = config.get('w_dropout', 0)
    self.num_words = config.get('num_words', LM_VOCAB_SIZE)
    self.rnn_type = config.get('rnn_type', 'SRU')
    self.n_layers = config.get('n_layers', 6)
    self.dropout_rnn = config.get('rnn_dropout', .2)
    self.highway_bias = config.get('highway_bias', -3)
    self.use_adasoft = config.get('use_adasoft', True)
    self.adasoft_cutoffs = config.get(
        'adasoft_cutoffs', [LM_VOCAB_SIZE // 2, LM_VOCAB_SIZE // 2])

    assert self.rnn_type in ['LSTM', 'GRU', 'SRU', 'QRNN']

    self.encoder = nn.Embedding(self.num_words, self.embedding_dim)

    self.lockdrop = to_gpu(LockedDropout())

    # for the mean time weight drop is broken
    if self.rnn_type == 'LSTM':
        self.rnns = [
            nn.LSTM(
                self.embedding_dim if layer_ix == 0 else self.hidden_dim,
                self.hidden_dim // 2,
                bidirectional=True,
                dropout=self.dropout_rnn)
            for layer_ix in range(self.n_layers)
        ]
        if self.dropout_w:
            self.rnns = [
                WeightDrop(rnn, ['weight_hh_l0'], dropout=self.dropout_w)
                for rnn in self.rnns
            ]
    elif self.rnn_type == 'GRU':
        self.rnns = [
            nn.GRU(
                self.embedding_dim if layer_ix == 0 else self.hidden_dim,
                self.hidden_dim // 2,
                bidirectional=True,
                dropout=self.dropout_rnn)
            for layer_ix in range(self.n_layers)
        ]
        if self.dropout_w:
            self.rnns = [
                WeightDrop(rnn, ['weight_hh_l0'], dropout=self.dropout_w)
                for rnn in self.rnns
            ]
    elif self.rnn_type == 'QRNN':
        from torchqrnn import QRNNLayer
        self.rnns = [
            QRNNLayer(
                self.embedding_dim if layer_ix == 0 else self.hidden_dim,
                self.hidden_dim // 2,
                bidirectional=True) for layer_ix in range(self.n_layers)
        ]
        if self.dropout_w:
            for rnn in self.rnns:
                rnn.linear = WeightDrop(rnn.linear, ['weight'],
                                        dropout=self.dropout_w)
    else:
        from sru import SRU
        self.rnns = [
            to_gpu(
                SRU(self.embedding_dim
                    if layer_ix == 0 else self.hidden_dim,
                    self.hidden_dim // 2,
                    num_layers=1,
                    rnn_dropout=self.dropout_rnn,
                    dropout=self.dropout_w,
                    rescale=False,
                    highway_bias=self.highway_bias,
                    use_tanh=0,
                    bidirectional=True,
                    v1=True)) for layer_ix in range(self.n_layers)
        ]

    self.rnns = nn.ModuleList(self.rnns)

    self.decoder = nn.Linear(
        self.embedding_dim if self.tie_weights else self.hidden_dim,
        self.num_words)

    # Adaptive softmax
    if self.use_adasoft:
        if 'adasoft_cutoffs' in config:
            splits = config['adasoft_cutoffs']
        else:
            if self.num_words >= 500000:
                # One Billion
                # This produces fairly even matrix mults for the buckets:
                # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
                splits = [4200, 35000, 180000]
            elif self.num_words >= 75000:
                # WikiText-103
                splits = [2800, 20000, 76000]
            elif self.num_words >= 20000:
                splits = [2000, 4000, 10000]
            else:
                splits = [self.num_words // 3, self.num_words // 3]

            config['adasoft_cutoffs'] = splits

        self.adasoft = SplitCrossEntropyLoss(self.hidden_dim,
                                             splits,
                                             ignore_index=0)
    else:
        self.adasoft = None

    # Weight tying
    if self.tie_weights:
        self.decoder.weight = self.encoder.weight

    self.init_weights()
def __init__(self, config):
    super(RNNLanguageModel, self).__init__()

    self.config = config

    self.tie_weights = config.get('tie_weights', True)
    self.char_level = config.get('char_level', False)
    self.embedding_dim = config.get(
        'embedding_dim',
        LM_HIDDEN_DIM if self.tie_weights else LM_EMBEDDING_DIM)
    self.hidden_size = self.embedding_dim if self.tie_weights else config.get(
        'hidden_size', LM_HIDDEN_DIM)
    self.num_words = config.get(
        'num_words',
        n_letters + LM_CHAR_RESERVED if self.char_level else LM_VOCAB_SIZE)
    self.dropout_emb = config.get('emb_dropout', .2)
    self.dropout_i = config.get('lock_drop', .5)
    self.dropout_h = config.get('h_dropout', .5)
    self.wdrop = config.get('wdrop', 0)
    self.rnn_type = config.get('rnn_type', 'SRU')
    self.n_layers = config.get('n_layers', 6)
    self.dropout_rnn = config.get('rnn_dropout', .2)
    self.highway_bias = config.get('highway_bias', -3)
    self.adasoft_cutoffs = config.get('adasoft_cutoffs', [LM_VOCAB_SIZE])

    assert self.rnn_type in ['LSTM', 'GRU', 'SRU']

    self.encoder = nn.Embedding(self.num_words, self.embedding_dim)

    self.lockdrop = to_gpu(LockedDropout())

    # for the mean time weight drop is broken
    if self.rnn_type == 'LSTM':
        self.rnns = nn.ModuleList([
            nn.LSTM(
                self.embedding_dim if layer_ix == 0 else self.hidden_size,
                self.hidden_size if layer_ix != self.n_layers - 1 else
                (self.embedding_dim
                 if self.tie_weights else self.hidden_size))
            for layer_ix in range(self.n_layers)
        ])
    elif self.rnn_type == 'GRU':
        self.rnns = nn.ModuleList([
            nn.GRU(
                self.embedding_dim if layer_ix == 0 else self.hidden_size,
                self.hidden_size if layer_ix != self.n_layers - 1 else
                (self.embedding_dim
                 if self.tie_weights else self.hidden_size))
            for layer_ix in range(self.n_layers)
        ])
    else:
        from sru import SRU
        self.rnns = nn.ModuleList([
            to_gpu(
                SRU(self.embedding_dim
                    if layer_ix == 0 else self.hidden_size,
                    self.hidden_size,
                    num_layers=1,
                    rnn_dropout=self.dropout_rnn,
                    dropout=self.wdrop,
                    rescale=False,
                    highway_bias=self.highway_bias,
                    use_tanh=0,
                    v1=True)) for layer_ix in range(self.n_layers)
        ])

    self.decoder = nn.Linear(self.hidden_size, self.num_words)

    # Weight tying
    if self.tie_weights:
        self.decoder.weight = self.encoder.weight

    self.init_weights()
def _viterbi_decode(
        self, emissions: Union[torch.FloatTensor, torch.cuda.FloatTensor],
        mask: Union[torch.FloatTensor,
                    torch.cuda.FloatTensor]) -> torch.Tensor:
    seq_len = emissions.shape[1]
    mask = mask.to(torch.uint8)

    log_prob = emissions[:, 0].clone()
    log_prob += self.transitions[
        self.start_tag, :self.start_tag].unsqueeze(0)

    # At each step, we need to keep track of the total score, as if this step
    # was the last valid step.
    end_scores = log_prob + self.transitions[:self.start_tag,
                                             self.end_tag].unsqueeze(0)

    best_scores_list = []
    # If the element has only one token, the empty tensor in best_paths keeps
    # torch.cat() from crashing
    best_paths_list = [to_gpu(torch.Tensor().long())]
    best_scores_list.append(end_scores.unsqueeze(1))

    for idx in range(1, seq_len):
        broadcast_emissions = emissions[:, idx].unsqueeze(1)
        broadcast_transmissions = self.transitions[:self.start_tag, :self.
                                                   start_tag].unsqueeze(0)
        broadcast_log_prob = log_prob.unsqueeze(2)

        score = broadcast_emissions + broadcast_transmissions \
            + broadcast_log_prob

        max_scores, max_score_indices = torch.max(score, 1)

        best_paths_list.append(max_score_indices.unsqueeze(1))

        # Storing the scores in case this was the last step.
        end_scores = max_scores + self.transitions[:self.start_tag, self.
                                                   end_tag].unsqueeze(0)

        best_scores_list.append(end_scores.unsqueeze(1))
        log_prob = max_scores

    best_scores = torch.cat(best_scores_list, 1).float()
    best_paths = torch.cat(best_paths_list, 1)

    _, max_indices_from_scores = torch.max(best_scores, 2)

    valid_index_tensor = to_gpu(torch.tensor(0)).long()
    padding_tensor = to_gpu(torch.tensor(self.ignore_index)).long()

    # Label for the last position is always based on the index with max score
    # For illegal timesteps, we set as ignore_index
    labels = max_indices_from_scores[:, seq_len - 1]
    labels = self._mask_tensor(labels, 1.0 - mask[:, seq_len - 1],
                               padding_tensor)

    all_labels = labels.unsqueeze(1).long()

    # For Viterbi decoding, we start at the last position and go towards the first
    for idx in range(seq_len - 2, -1, -1):
        # There are two ways to obtain labels for tokens at a particular position.
        # Option 1: Use the labels obtained from the previous position to index
        # the path in the present position. This is used for all positions except
        # the last position in the sequence.
        # Option 2: Find the indices with maximum scores obtained during
        # Viterbi decoding. This is used for the token at the last position.

        # For option 1 we need to convert invalid indices to 0 so that lookups
        # don't fail.
        indices_for_lookup = all_labels[:, -1].clone()
        indices_for_lookup = self._mask_tensor(
            indices_for_lookup,
            indices_for_lookup == self.ignore_index,
            valid_index_tensor,
        )

        # Option 1 is used here when the previous timestep (idx+1) was valid.
        indices_from_prev_pos = (best_paths[:, idx, :].gather(
            1, indices_for_lookup.view(-1, 1).long()).squeeze(1))
        indices_from_prev_pos = self._mask_tensor(indices_from_prev_pos,
                                                  (1.0 - mask[:, idx + 1]),
                                                  padding_tensor)

        # Option 2 is used when the last timestep was not valid, which means
        # idx+1 is the last position in the sequence.
        indices_from_max_scores = max_indices_from_scores[:, idx]
        indices_from_max_scores = self._mask_tensor(
            indices_from_max_scores, mask[:, idx + 1], padding_tensor)

        # We need to combine results from 1 and 2 as rows in a batch can have
        # sequences of varying lengths
        labels = torch.where(
            indices_from_max_scores == self.ignore_index,
            indices_from_prev_pos,
            indices_from_max_scores,
        )

        # Set to ignore_index if the present state is not valid.
        labels = self._mask_tensor(labels, (1 - mask[:, idx]),
                                   padding_tensor)
        all_labels = torch.cat((all_labels, labels.view(-1, 1).long()), 1)

    return torch.flip(all_labels, [1])
def repackage_hidden(self, h) -> Union[torch.Tensor, Tuple]:
    if torch.is_tensor(h):
        return to_gpu(h.detach())
    else:
        return tuple(self.repackage_hidden(v) for v in h)
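# Illustrative usage sketch (assumed, not taken from the original training
# loop): repackage_hidden is typically called between truncated-BPTT windows
# so the hidden state values are carried over while the graph is detached and
# gradients stop at the batch boundary. `learner`, `model` and `data_loader`
# are hypothetical names used only for this example.
#
# hidden = model.init_hidden(batch_size)
# for X_batch, y_batch in data_loader:
#     hidden = learner.repackage_hidden(hidden)  # detach, keep values
#     output, hidden = model(X_batch, hidden)
#     ...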
def fit(self,
        training_data: Iterable = None,
        validation_data: Iterable = None,
        epochs: int = 1,
        minibatches: int = None,
        epoch_start: int = 0,
        batch_size: int = 64,
        shuffle: bool = True,
        optimize_on_cpu: bool = False,
        fp16: bool = False,
        gradient_accumulation_steps: int = 1,
        callbacks: Iterable[object] = [],
        clip_grad: float = 0):
    if self._uneven_batch_size:
        batch_size = 1

    self._batch_size = batch_size
    self.clip_grad = clip_grad

    if gradient_accumulation_steps and 'gradient_accumulation_steps' in dict(
            inspect.getmembers(
                self.on_epoch.__func__.__code__))['co_varnames']:
        print('Gradient accumulation is supported by this class')
        self.gradient_accumulation_steps = gradient_accumulation_steps
    else:
        self.gradient_accumulation_steps = 1

    if training_data is not None:
        self.set_training_data(training_data)
    if validation_data is not None:
        self.set_validation_data(validation_data)

    for callback in callbacks:
        callback.set_learner(self)
    self._callbacks = callbacks or []

    self._n_epochs = epochs
    self._optimize_on_cpu = optimize_on_cpu

    # Preprocess data. If data is already a dataset class
    # then preprocessing logic should be implemented in the class
    if not self._is_dataset:
        X, y = self._data

        # Process input and output data - if needed (tokenization etc.)
        if self.model_wrapper._featurizer is not None:
            self.model_wrapper._featurizer.fit(X)

        X = self.model_wrapper.preprocess_dataset_X(X)
        y = self.model_wrapper.preprocess_dataset_y(y)

        self.init_on_data(X, y)

        # Preprocess all batches of data (adding n-grams etc.)
        # If data should be lazily processed, use the Dataset class instead.
        if self._preprocess_batch:
            if self.model_wrapper._featurizer is not None:
                dataset = BatchPreprocessedDataset(
                    X,
                    y,
                    input_process_fn=lambda _X: self.model_wrapper.
                    preprocess_input(
                        self.model_wrapper._featurizer.transform(_X)),
                    output_process_fn=self.model_wrapper.preprocess_output,
                    batch_size=batch_size)
            else:
                dataset = BatchPreprocessedDataset(
                    X,
                    y,
                    input_process_fn=self.model_wrapper.preprocess_input,
                    output_process_fn=self.model_wrapper.preprocess_output,
                    batch_size=batch_size)
        else:
            if self.model_wrapper._featurizer is not None:
                X = self.model_wrapper._featurizer.transform(X)
            X = self.model_wrapper.preprocess_input(X)
            y = self.model_wrapper.preprocess_output(y)

            if not self._uneven_batch_size:
                dataset = GenericDataset(X, y)
    else:
        dataset = self._data
        self.init_on_dataset(dataset)

    # Call on_training_start hooks
    self.on_training_start()
    for callback in self.callbacks:
        callback.on_training_start()

    if self._verbose == 2:
        from tqdm import trange
        iterator = trange(epoch_start,
                          self._n_epochs,
                          desc='Epochs',
                          leave=False)
    else:
        iterator = range(epoch_start, self._n_epochs)

    cpu_count = int(
        os.environ.get('NUM_WORKERS', max(mp.cpu_count() - 1, 1)))

    if batch_size is None:
        batch_size = len(dataset)

    if USE_GPU:
        try:
            mp.set_start_method('spawn')
        except:
            warnings.warn(
                'Error occurred in multiprocessing.set_start_method')

    if not self._uneven_batch_size:
        loader_kwargs = {
            'batch_size': batch_size,
            'num_workers': cpu_count,
            'shuffle': shuffle
        }
        if USE_GPU:
            loader_kwargs['pin_memory'] = True
        if self._collate_fn is not None:
            loader_kwargs['collate_fn'] = self._collate_fn
        data_loader = DataLoader(dataset, **loader_kwargs)
    else:
        data_loader = [([X[idx]], [y[idx]]) for idx in range(len(X))]

    if self.model_wrapper._featurizer is not None:
        self.model_wrapper.config[
            'input_shape'] = self.model_wrapper._featurizer.get_output_shape()

    if self.model_wrapper.model is None:
        self.model_wrapper.init_model()

    self.on_model_init()

    model = self.model_wrapper._model

    # optimizer must be initialized after the model
    if self.optimizer is None and self._auto_optimize:
        optim_params = [(n, param)
                        for n, param in model.named_parameters()
                        if param.requires_grad]
        if self._optimize_on_cpu:
            optim_params = [
                (n, param.clone().detach().to('cpu').requires_grad_())
                for n, param in optim_params
            ]
        self.optimizer = self._optimizer_fn([p for n, p in optim_params],
                                            **self._optimizer_kwargs)

    if self.model_wrapper.is_pytorch_module() and not hasattr(
            self, 'criterion'):
        raise ValueError(
            'Criterion must be set for the Learner class before training')

    # fp16
    if fp16:
        try:
            from apex import amp, optimizers
            from apex.multi_tensor_apply import multi_tensor_applier
            model, self.optimizer = amp.initialize(model,
                                                   self.optimizer,
                                                   opt_level="O1",
                                                   loss_scale="dynamic")
            self.model_wrapper._model = model
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to run this example."
            )

    # Main training loop
    try:
        for epoch in iterator:
            if self._halt:
                # For early stopping
                self._halt = False
                break

            self._current_epoch = epoch
            self._metrics = None

            self.on_epoch_start()
            for callback in self._callbacks:
                callback.on_epoch_start()

            for batch_idx, (X_batch, y_batch) in enumerate(data_loader, 0):
                if self._halt:
                    # For early stopping / skipping batches
                    break

                self._batch_idx = batch_idx

                for callback in self.callbacks:
                    callback.on_batch_start()

                if model is not None and self.model_wrapper.is_pytorch_module():
                    model.train()

                args = to_gpu(X_batch), to_gpu(y_batch)
                kwargs = {}
                if gradient_accumulation_steps > 1:
                    kwargs['gradient_accumulation_steps'] = \
                        self.gradient_accumulation_steps

                epoch_ret = self.on_epoch(*args, **kwargs)

                if epoch_ret is not None:
                    if 'logits' in epoch_ret:
                        with torch.no_grad():
                            batch_metrics = self.calculate_metrics(
                                epoch_ret['logits'], y_batch) or {}
                    else:
                        batch_metrics = {}

                    if 'loss' in epoch_ret:
                        epoch_loss = epoch_ret['loss']

                        # backward
                        if fp16:
                            with amp.scale_loss(
                                    epoch_loss,
                                    self.optimizer) as scaled_loss:
                                scaled_loss.backward()

                            if self.clip_grad > 0:
                                torch.nn.utils.clip_grad_norm_(
                                    amp.master_params(self.optimizer),
                                    self.clip_grad)
                        else:
                            epoch_loss.backward()

                            if self.clip_grad > 0:
                                torch.nn.utils.clip_grad_norm_(
                                    model.parameters(), self.clip_grad)

                        epoch_ret['loss'] = epoch_loss.detach().cpu().item()
                        batch_metrics['loss'] = epoch_ret['loss']

                    self._batch_metrics = batch_metrics

                    if self._metrics is None:
                        self._metrics = batch_metrics
                    else:
                        self._metrics = {
                            k: v + batch_metrics[k]
                            for k, v in self._metrics.items()
                        }

                if self.model_wrapper.is_pytorch_module() and self._auto_optimize:
                    if (batch_idx + 1) % self.gradient_accumulation_steps == 0:
                        if self._optimize_on_cpu:
                            is_nan = set_optimizer_params_grad(
                                optim_params,
                                model.named_parameters(),
                                test_nan=True)
                            self.optimizer.step()
                            copy_optimizer_params_to_model(
                                model.named_parameters(), optim_params)
                        else:
                            self.optimizer.step()
                        model.zero_grad()

                for callback in self.callbacks:
                    callback.on_batch_end()

                if epochs == 1 and minibatches is not None:
                    if batch_idx >= minibatches:
                        self._halt = True

            self.on_epoch_end()
            for callback in self.callbacks:
                callback.on_epoch_end()
    except KeyboardInterrupt:
        warnings.warn('Training aborted')

    for callback in self.callbacks:
        callback.on_training_end()

    self.on_training_end()
def init_hidden(self):
    hidden_0 = torch.zeros(self.num_layers * 2, 1, self.hidden_size // 2)
    hidden_1 = torch.zeros(self.num_layers * 2, 1, self.hidden_size // 2)
    return to_gpu(hidden_0), to_gpu(hidden_1)
def init_model(self, update_configs: dict = {}):
    if self._from_fp is None:
        model_state = None
    else:
        self._model = None

        if torch.cuda.is_available():
            model_state = torch.load(self._from_fp)
        else:
            model_state = torch.load(
                self._from_fp, map_location=lambda storage, loc: storage)

    if model_state is None:
        config = self.config or dict()
    else:
        config = model_state.get('config', dict())
        self._onnx = model_state.get('onnx', None)

    # convert to dotdict
    if isinstance(config, dict) and not isinstance(config, dotdict):
        config = dotdict(config)

    self.config = config
    config.update(update_configs)

    if self.is_pytorch_module():
        # re-initialize model with loaded config
        self._model = self._model_class(config=config, *self._args,
                                        **self._kwargs)

        if self._use_data_parallel:
            self._model = nn.DataParallel(self._model, dim=1)

        # if fp16: self._model.half()
        self._model = to_gpu(self._model)
    else:
        # initialize model normally
        if self._onnx is None:
            self._model = self._model_class(*self._args, **self._kwargs)

    if model_state is not None:
        featurizer = model_state.get('featurizer', None)

        if featurizer is None:
            if self._featurizer is None:
                warnings.warn(
                    'Featurizer is not found in this binary. This is likely to be an error'
                )
        else:
            self._featurizer = featurizer

        state_dict = model_state.get('state_dict', None)

        if self.is_pytorch_module():
            if state_dict is not None:
                self._model.load_state_dict(state_dict, strict=False)
        elif self._onnx is not None:
            import onnx
            self._onnx_model = onnx.load(self._onnx)
            print('Loaded ONNX model')

        self.load_state_dict(model_state)

    self.config = config

    self.on_model_init()
inputs, outputs = next(iter(loader))
outputs = outputs.view(inputs.size(0), inputs.size(1))

if model._onnx is not None:
    padded_input = torch.zeros(EXPORT_SIZE).long()
    padded_output = torch.zeros(EXPORT_SIZE).long()
    padded_input[:inputs.size(0)] = inputs
    padded_output[:outputs.size(0)] = outputs
    inputs = padded_input
    outputs = padded_output

inputs, outputs = to_gpu(inputs), to_gpu(outputs)

result, hidden = model(inputs)
result = torch.max(result, dim=1)[1].view(inputs.size(0), inputs.size(1))

mask = (outputs != 0)
total_count += mask.sum().item()
total_correct += (result.masked_select(mask) == outputs.masked_select(
    mask)).sum().item()

total_accuracy = total_correct / total_count
print('Accuracy over %s test sentences: %4f' %
      (TEST_EPOCHS * BATCH_SIZE, total_accuracy * 100))
def on_model_init(self):
    self.criterion = to_gpu(
        MarginRankingLoss(margin=self.model_wrapper.loss_margin))