def forward(self, x):
    batch_size = len(x)
    character_ids = batch_to_ids(x).to(tdevice)
    embeddings = self.elmo(character_ids)['elmo_representations']
    X = embeddings[0].view(batch_size, -1, 1024)  # (N, W, D)

    # Pad (or truncate) to self.pad_size words
    if X.size(1) > self.pad_size:
        X = X[:, 0:self.pad_size, :]
    elif X.size(1) < self.pad_size:
        pad = self.pad_size - X.size(1)
        zero_vec = torch.zeros(X.size(0), pad, X.size(2), device=tdevice)
        X = torch.cat((X, zero_vec), dim=1)

    if self.reduce_size > 0:
        X, hn = self.gru(X, None)

    x = X.unsqueeze(1)  # (N, Ci, W, D)
    x_list = []
    for conv in self.convs:
        x_list.append(self.conv_and_pool(x, conv))
    x = torch.cat(x_list, 1)

    x = self.dropout(x)  # (N, len(Ks)*Co)
    logit = self.fc1(x)  # (N, C)
    return logit

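# A minimal sketch of how the attributes used in forward() above might be
# defined; the real module definition is not part of this snippet, so the
# class name, sizes, and kernel widths here are assumptions for illustration.
import torch
import torch.nn as nn
import torch.nn.functional as F
from allennlp.modules.elmo import Elmo, batch_to_ids

class ElmoCnnClassifier(nn.Module):
    def __init__(self, options_file, weight_file, num_classes,
                 pad_size=10, kernel_sizes=(2, 3, 4), num_filters=100):
        super().__init__()
        self.elmo = Elmo(options_file, weight_file, 1, dropout=0)
        self.pad_size = pad_size
        self.reduce_size = 0  # > 0 would route X through self.gru first
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, num_filters, (k, 1024)) for k in kernel_sizes])
        self.dropout = nn.Dropout(0.5)
        self.fc1 = nn.Linear(len(kernel_sizes) * num_filters, num_classes)

    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x)).squeeze(3)                 # (N, Co, W - k + 1)
        return F.max_pool1d(x, x.size(2)).squeeze(2)   # (N, Co)
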
def batch_preprocessing_elmo(batch, args):
    # 2D list of all tokenized sentences from all docs in the batch: [[sent1], [sent2], ...]
    all_batch_sentences = []
    # 1D array of the length of each sentence in each doc, after padding
    modified_sentence_len = []
    # 1D list of doc lengths (number of sentences in each doc)
    batch_docs_len = [len(batch[i]) for i in range(len(batch))]
    # 2D list of sentence lengths per doc: [doc -> len_sent]
    batch_sentences_len = batch_sentences_length(batch)

    max_doc_len = max(batch_docs_len)
    pad_token = [args.padding_symbol]
    for doc in batch:
        if len(doc) < max_doc_len:
            for _ in range(max_doc_len - len(doc)):
                doc.append(pad_token)
        for sent in doc:
            modified_sentence_len.append(len(sent))
            all_batch_sentences.append(sent)

    # Converts a batch of tokenized sentences to a tensor of character ids with
    # shape (len(batch), max sentence length, max word length [chars]).
    all_batch_sents_enc_char = batch_to_ids(all_batch_sentences).to(args.device)

    # Convert back to batch mode: 4D tensor [doc] -> [sent] -> [word] -> [char]
    d0, d1, d2 = all_batch_sents_enc_char.size()
    all_batch_sents_enc_char = all_batch_sents_enc_char.view(
        args.batch_size_train, -1, d1, d2)

    return (all_batch_sents_enc_char, batch_docs_len, batch_sentences_len,
            np.asarray(modified_sentence_len).reshape(args.batch_size_train, -1))

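# Hypothetical usage sketch for batch_preprocessing_elmo: two tokenized
# documents are padded to the same number of sentences and encoded as a 4D
# char-id tensor (doc, sent, word, char). The `args` fields and the external
# helper batch_sentences_length are assumed to exist as used above.
from types import SimpleNamespace

args = SimpleNamespace(padding_symbol='<pad>', device='cpu', batch_size_train=2)
batch = [
    [['The', 'cat', 'sat', '.'], ['It', 'purred', '.']],  # doc 1: 2 sentences
    [['Dogs', 'bark', '.']],                              # doc 2: 1 sentence
]
enc, doc_lens, sent_lens, mod_sent_lens = batch_preprocessing_elmo(batch, args)
print(enc.shape)   # torch.Size([2, 2, 4, 50]): doc x sent x word x 50 char ids
print(doc_lens)    # [2, 1] -- true document lengths before padding
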
def elmo_encode(data, __id2word=id2word):
    data_text = [glove_tokenizer(x, __id2word) for x in data]
    with torch.no_grad():
        character_ids = batch_to_ids(data_text).cuda()
        elmo_emb = elmo(character_ids)['elmo_representations']
        elmo_emb = (elmo_emb[0] + elmo_emb[1]) / 2  # average of the two output layers
    return elmo_emb.cuda()

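# Note: averaging elmo_emb[0] and elmo_emb[1] above assumes the module-level
# `elmo` was built with two output representations; a sketch of that setup
# (the URLs are the standard original-ELMo release, as used in elmo_large below):
from allennlp.modules.elmo import Elmo
options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
elmo = Elmo(options_file, weight_file, 2, dropout=0).cuda()
# Each entry of 'elmo_representations' is then (len(data), max_sent_len, 1024),
# so the two-layer average keeps that shape.
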
def get_scope_elmo(model,
                   ELMO_folder,
                   scope_text,
                   elmo_dim,
                   idx2id_dict,
                   id2idx_dict,
                   device=torch.device('cpu')):
    """Get the ELMo embedding representation of each scope note."""
    with torch.no_grad():
        elmo_embeddings = [
            model(batch_to_ids(i).to(device)) for i in scope_text
        ]
        elmo_scope_embeddings = [
            i['elmo_representations'][0].view(-1, elmo_dim)
            for i in elmo_embeddings
        ]
        # Mean-pool over all tokens of each scope note.
        elmo_scope_embeddings = [
            torch.mean(item, dim=0) for item in elmo_scope_embeddings
        ]
        elmo_scope_embeddings = torch.stack(elmo_scope_embeddings)

    return elmo_scope_embeddings, idx2id_dict

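# Hypothetical call to get_scope_elmo: scope_text is a list of scope notes,
# each itself a small batch of tokenized sentences, and elmo_dim is 1024 for
# the standard models. `model`, `ELMO_folder`, and the two id dicts are
# assumed to come from the surrounding setup.
scope_text = [
    [['Heart', 'diseases']],
    [['Disorders', 'of', 'blood', 'pressure']],
]
embs, idx2id = get_scope_elmo(model, ELMO_folder, scope_text, 1024,
                              idx2id_dict, id2idx_dict)
print(embs.shape)  # torch.Size([2, 1024]): one mean-pooled vector per scope note
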
def load_data_with_diff_vocab(inputFile,
                              src_vocab,
                              tgt_vocab,
                              max_word=200,
                              max_char=50,
                              type='norm'):
    """
    Load data using separate source and target vocabularies;
    src_vocab covers both the lexnorm2015 and the aggressive dataset.
    :param inputFile:
    :param src_vocab:
    :param tgt_vocab:
    :param max_word:
    :param max_char:
    :param type:
    :return:
    """
    char_inputs = []
    word_inputs = []
    outputs = []
    df = pd.read_csv(inputFile, names=['src', 'tgt'])
    for src, tgt in zip(df['src'], df['tgt']):
        elmo_id = batch_to_ids([tokenize(str(src))])
        elmo_id = elmo_id.view(-1, 50)
        word_input = [src_vocab.word_to_id(word) for word in tokenize(str(src))]
        if type == 'norm':
            output = [tgt_vocab.word_to_id(tgt_vocab.SYM_SOS)]
            output.extend(
                [tgt_vocab.word_to_id(word) for word in tgt.strip().split(' ')])
            output.append(tgt_vocab.word_to_id(tgt_vocab.SYM_EOS))
        else:
            output = [tgt_vocab.tag_to_id(tag) for tag in tgt.strip().split()]
        char_inputs.append(elmo_id)
        word_inputs.append(word_input)
        outputs.append(output)

    # Pad every output sequence to the length of the longest one.
    max_output = max(len(sent) for sent in outputs)
    outputs = [d[:max_output] for d in outputs]
    outputs = [
        d + (max_output - len(d)) * [tgt_vocab.word_to_id(tgt_vocab.W_PAD)]
        for d in outputs
    ]
    # Truncate / pad the word inputs to max_word.
    word_inputs = [d[:max_word] for d in word_inputs]
    word_inputs = [
        d + (max_word - len(d)) * [src_vocab.word_to_id(src_vocab.W_PAD)]
        for d in word_inputs
    ]
    char_inputs = pad_sequence(char_inputs, True, max_word, max_char)

    dataset = OurDataset(word_inputs, char_inputs, outputs)
    return dataset

def train(self, training_data):
    '''
    Must be passed the training data - a list of questions from the
    QuizBowlDataset class.
    '''
    print("train")
    # Store each question tokenized by word, and the answers as a list.
    questions = []
    for ques in training_data:
        tokens = self.tokenizer(' '.join(ques.sentences))
        tokens_list = [token.text for token in tokens]
        questions.append(tokens_list)
        self.answers.append(ques.page)

    print("chars to ids")
    character_ids = batch_to_ids(questions)

    print("elmo output")
    elmo_output = self.elmo(character_ids)
    # Index at zero because we only have a single output representation.
    word_embeddings = elmo_output['elmo_representations'][0]

    print("mean")
    # A matrix of size (num_train_questions x embed_length).
    self.question_matrix = word_embeddings.mean(1)
    print("train done")

def pointerEncoder(self, Xin_ELMo, lens):
    # Note: this BatchNorm1d is re-created on every call (original behavior);
    # with affine=False and track_running_stats=False it holds no state.
    self.bn_inputdata = nn.BatchNorm1d(self.word_dim,
                                       affine=False,
                                       track_running_stats=False)

    batch_size = len(Xin_ELMo)

    # Convert the input to ELMo embeddings.
    character_ids = batch_to_ids(Xin_ELMo)
    if self.use_cuda:
        character_ids = character_ids.cuda()
    embeddings = elmo(character_ids)
    X_ELMo = embeddings['elmo_representations'][0]  # first output layer: [batch, length, d_elmo]
    if self.use_cuda:
        X_ELMo = X_ELMo.cuda()

    X = X_ELMo
    if self.isbanor:
        X = X.permute(0, 2, 1)  # N C L
        X = self.bn_inputdata(X)
        X = X.permute(0, 2, 1)  # N L C
    X = self.nnDropout(X)

    encoder_lstm_co_h_o = self.initHidden(self.hidden_dim, batch_size)
    output_encoder, hidden_states_encoder = self._run_rnn_packed(
        self.encoder_rnn, X, lens, encoder_lstm_co_h_o)  # batch_first=True
    output_encoder = output_encoder.contiguous()
    output_encoder = self.nnDropout(output_encoder)
    return output_encoder, hidden_states_encoder

def run_batch_lattice(self, batch, testing=False):
    if testing:
        self.lm.eval()
    else:
        self.lm.train()

    inputs, positions, prevs, rev_prevs, lm_labels, rev_lm_labels, lm_masks, rev_lm_masks = batch
    char_ids = batch_to_ids(inputs).to(self.device)
    lm_labels = torch.from_numpy(lm_labels).to(self.device)
    rev_lm_labels = torch.from_numpy(rev_lm_labels).to(self.device)
    lm_masks = torch.from_numpy(lm_masks).float().to(self.device).view(-1)
    rev_lm_masks = torch.from_numpy(rev_lm_masks).float().to(
        self.device).flip(dims=[1]).view(-1)

    logits_forward, logits_backward, hiddens = self.lm(char_ids, prevs, rev_prevs)
    bs, sl, vs = logits_forward.size()
    logits_forward = logits_forward.view(-1, vs).log_softmax(-1)
    logits_backward = logits_backward.view(-1, vs).log_softmax(-1)
    lm_labels = lm_labels.view(-1, vs).float()
    rev_lm_labels = rev_lm_labels.flip(dims=[1]).view(-1, vs).float()

    # KL divergence against the (soft) label distributions, masked and averaged.
    loss_for = F.kl_div(logits_forward, lm_labels, reduction='none')
    loss_for = (loss_for.sum(-1) * lm_masks).sum() / lm_masks.sum()
    loss_rev = F.kl_div(logits_backward, rev_lm_labels, reduction='none')
    loss_rev = (loss_rev.sum(-1) * rev_lm_masks).sum() / rev_lm_masks.sum()
    loss = (loss_for + loss_rev) / 2

    if not testing:
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.parameters, 0.25)
        self.optimizer.step()

    return loss_for, loss_rev, torch.tensor(0.0), torch.tensor(0.0)

def convert_tokens_to_cids(self, tokens, pad_sequence=True, min_seq_size=10):
    """
    Args:
        pad_sequence, min_seq_size: if pad_sequence is True, pad the sequence
            up to n_ctx (max_seq_size). Otherwise do not pad, except that the
            sequence must be at least min_seq_size long, so shorter sequences
            are still padded up to min_seq_size.
    """
    from allennlp.modules.elmo import batch_to_ids

    pad_cids = [[self.pad_id] * self.char_n_ctx]
    ids = batch_to_ids([tokens])[0].detach().cpu().numpy().tolist()
    # padding
    if pad_sequence:
        padding_length = self.n_ctx - len(ids)
        if padding_length > 0:
            ids += pad_cids * padding_length
    else:
        padding_length = min_seq_size - len(ids)
        if padding_length > 0:
            ids += pad_cids * padding_length
    ids = ids[:self.n_ctx]
    return ids

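# Hypothetical usage of convert_tokens_to_cids, assuming an instance with
# n_ctx=128 and char_n_ctx=50 (the ELMo character width): a short token list
# is extended with all-pad character rows up to n_ctx.
cids = tokenizer.convert_tokens_to_cids(['Hello', 'world'], pad_sequence=True)
print(len(cids), len(cids[0]))  # 128 50
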
def run_batch_lattice(self, batch, testing=False):
    if testing:
        self.slu.eval()
    else:
        self.slu.train()

    inputs, words, positions, prevs, rev_inputs, rev_prevs, labels = batch
    inputs = torch.from_numpy(inputs).to(self.device)
    rev_inputs = torch.from_numpy(rev_inputs).to(self.device)
    labels = torch.from_numpy(labels).to(self.device)

    elmo_emb = None
    if self.use_elmo:
        char_ids = batch_to_ids(words).to(self.device)
        elmo_emb = self.elmo(char_ids, prevs=prevs,
                             rev_prevs=rev_prevs)['elmo_representations'][0]

    logits = self.slu(inputs, positions, prevs, rev_inputs, rev_prevs, elmo_emb)
    loss = F.cross_entropy(logits, labels)

    if not testing:
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.parameters, 1.0)
        self.optimizer.step()

    return loss, logits

def run_batch(self, batch, testing=False):
    if testing:
        self.lm.eval()
    else:
        self.lm.train()

    inputs, outputs, outputs_rev, uids = batch
    char_ids = batch_to_ids(inputs).to(self.device)
    outputs = torch.from_numpy(outputs).to(self.device)
    outputs_rev = torch.from_numpy(outputs_rev).to(self.device)

    logits_forward, logits_backward, hiddens, mask = self.lm(char_ids)
    bs, sl, vs = logits_forward.size()
    logits_forward = logits_forward.view(-1, vs)
    logits_backward = logits_backward.view(-1, vs)
    outputs = outputs.view(-1)
    outputs_rev = outputs_rev.view(-1)
    loss_for = F.cross_entropy(logits_forward, outputs, ignore_index=PAD)
    loss_rev = F.cross_entropy(logits_backward, outputs_rev, ignore_index=PAD)
    loss = (loss_for + loss_rev) / 2

    if not testing:
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.parameters, 0.25)
        self.optimizer.step()

    return loss_for, loss_rev, torch.tensor(0.0), torch.tensor(0.0)

def run_batch(self, batch, testing=False):
    if testing:
        self.slu.eval()
    else:
        self.slu.train()

    if len(batch) == 4:
        inputs, words, positions, labels = batch
    else:
        inputs, words, positions, _, _, _, labels = batch
    inputs = torch.from_numpy(inputs).to(self.device)
    labels = torch.from_numpy(labels).to(self.device)

    elmo_emb = None
    if self.use_elmo:
        char_ids = batch_to_ids(words).to(self.device)
        elmo_emb = self.elmo(char_ids)['elmo_representations'][0]

    logits = self.slu(inputs, positions, elmo_emb)
    loss = F.cross_entropy(logits, labels)

    if not testing:
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.parameters, 1.0)
        self.optimizer.step()

    return loss, logits

def elmo_encode(self, data, __id2word):
    """
    Get id2word from the vocab, then convert tokens to ELMo character ids.
    batch_to_ids pads only up to the longest sentence in the batch, which can
    be shorter than the desired length, so we pad further up to sent_pad_len.
    """
    data_text = [self.glove_tokenizer(x, __id2word) for x in data]
    with torch.no_grad():
        elmo = Elmo(options_file, weight_file, 2, dropout=0).cuda()
        elmo.eval()
        character_ids = batch_to_ids(data_text).cuda()
        row_num = character_ids.shape[0]
        elmo_dim = self.elmo_dim

        if torch.sum(character_ids) != 0:
            elmo_emb = elmo(character_ids)['elmo_representations']
            elmo_emb = (elmo_emb[0] + elmo_emb[1]) / 2  # average of the two layers
        else:
            elmo_emb = torch.zeros([row_num, self.sent_pad_len, elmo_dim],
                                   dtype=torch.float)

        sent_len = elmo_emb.shape[1]
        if sent_len < self.sent_pad_len:
            fill_sent_len = self.sent_pad_len - sent_len
            # Fill the remainder of the sequence dimension with zeros.
            filler = torch.zeros([row_num, fill_sent_len, elmo_dim],
                                 dtype=torch.float)
            elmo_emb = torch.cat((elmo_emb, filler.cuda()), dim=1)
    return elmo_emb.cuda()

def correct_string(self, line):
    premise = line[0].lower()
    Xtype = torch.FloatTensor
    ytype = torch.LongTensor
    is_cuda = torch.cuda.is_available()
    if is_cuda:
        self.model.cuda()
        Xtype = torch.cuda.FloatTensor
        ytype = torch.cuda.LongTensor
        if self.use_background:
            self.model_bg.cuda()

    X, _ = get_line_representation(premise)
    tx = Variable(torch.from_numpy(np.array([X]))).type(Xtype)
    if self.use_elmo or self.use_elmo_bg:
        tx_elmo = Variable(batch_to_ids([premise.split()])).type(ytype)

    SEQ_LEN = len(premise.split())
    if self.use_elmo:
        ty_pred = self.model(tx, tx_elmo, [SEQ_LEN])
    else:
        ty_pred = self.model(tx, [234])
    y_pred = ty_pred.detach().cpu().numpy()
    y_pred = y_pred[0]  # y_pred now is NUM_CLASSES x SEQ_LEN

    if self.use_background:
        if self.use_elmo_bg:
            ty_pred_bg = self.model_bg(tx, tx_elmo, [SEQ_LEN])
        else:
            ty_pred_bg = self.model_bg(tx, [SEQ_LEN])
        y_pred_bg = ty_pred_bg.detach().cpu().numpy()
        y_pred_bg = y_pred_bg[0]

    output_words = []
    self.total_predictions += SEQ_LEN
    for idx in range(SEQ_LEN):
        pred_idx = np.argmax(y_pred[:, idx])
        if pred_idx == utils.WORD_LIMIT:
            # The foreground model predicted UNK; fall back to the background
            # model (or a neutral word) for this position.
            word = premise.split()[idx]
            if self.use_background:
                pred_idx_bg = np.argmax(y_pred_bg[:, idx])
                if pred_idx_bg != self.vocab_size_bg:
                    word = utils.i2w_bg[pred_idx_bg]
            if self.unk_output:
                word = "a"  # choose a sentiment-neutral word
            output_words.append(word)
            self.predicted_unks += 1.0
            if word in utils.w2i:
                self.predicted_unks_in_vocab += 1.0
        else:
            output_words.append(utils.i2w[pred_idx])

    line[0] = " ".join(output_words)
    return line

def load_data_with_diff_vocab(inputFile,
                              src_vocab,
                              tgt_vocab,
                              max_word=200,
                              max_char=50,
                              data_type='norm'):
    """
    Load data using separate source and target vocabularies;
    src_vocab covers both the lexnorm2015 and the aggressive dataset.
    :param inputFile:
    :param src_vocab:
    :param tgt_vocab:
    :param max_word:
    :param max_char:
    :param data_type:
    :return:
    """
    char_inputs = []
    word_inputs = []
    outputs = []
    src, tgt = pickle.load(open(inputFile, 'rb'))
    for s, t in zip(src, tgt):
        # Wrap the token list in a single-sentence batch, as in the CSV-based
        # loader above, so the view yields one 50-char row per word.
        elmo_id = batch_to_ids([s])
        elmo_id = elmo_id.view(-1, 50)
        tokens = ['<sos>'] + s + ['<eos>']
        word_input = [src_vocab.word_to_id(word) for word in tokens]
        if data_type == 'norm':
            output = [tgt_vocab.word_to_id(tgt_vocab.SYM_SOS)]
            output.extend([tgt_vocab.word_to_id(word) for word in t])
            output.append(tgt_vocab.word_to_id(tgt_vocab.SYM_EOS))
        else:
            output = [tgt_vocab.tag_to_id(t)]
        char_inputs.append(elmo_id)
        word_inputs.append(word_input)
        outputs.append(output)

    dataset = OurDataset(word_inputs, char_inputs, outputs, max_word, max_char)
    return dataset

def elmo_large(conll_file):
    dataset, textset = read_conll_corpus(conll_file)

    options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
    weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
    elmo = Elmo(options_file, weight_file, 1)

    sentences = [sent[1:-1] for sent in dataset]
    character_ids = batch_to_ids(sentences)
    embeddings = elmo(character_ids)['elmo_representations'][0].detach()

    vectors = []
    for sent_i, sent in enumerate(embeddings):
        key = sentences[sent_i]
        # Locate the target word whose contextual vector we want; assumes every
        # sentence contains one of the four target words.
        if 'play' in key:
            i = key.index('play')
        elif 'bright' in key:
            i = key.index('bright')
        elif 'light' in key:
            i = key.index('light')
        elif 'smart' in key:
            i = key.index('smart')
        vectors.append(np.array(sent[i]))

    return np.stack(vectors, axis=0), ['\t'.join(sentence) for sentence in sentences]

def transform(self, X, y=None):
    """Transform documents to document ids.

    Uses the vocabulary learned by fit.

    Args:
        X: iterable
            an iterable which yields either str, unicode or file objects.
        y: iterable, label strings.

    Returns:
        features: document id matrix.
        y: label id matrix.
    """
    word_ids = [self._word_vocab.doc2id(doc) for doc in X]
    word_ids = pad_sequences(word_ids, padding='post')

    char_ids = [[self._char_vocab.doc2id(w) for w in doc] for doc in X]
    char_ids = pad_nested_sequences(char_ids)

    character_ids = batch_to_ids(X)
    elmo_embeddings = self._elmo(character_ids)['elmo_representations'][1]
    elmo_embeddings = elmo_embeddings.detach().numpy()

    features = [word_ids, char_ids, elmo_embeddings]

    if y is not None:
        y = [self._label_vocab.doc2id(doc) for doc in y]
        y = pad_sequences(y, padding='post')
        y = to_categorical(y, self.label_size).astype(int)
        # to_categorical can drop the batch dimension for a single document.
        y = y if len(y.shape) == 3 else np.expand_dims(y, axis=0)
        return features, y
    else:
        return features

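# Hypothetical usage of transform() after fit(); `preprocessor` is an instance
# of the surrounding class whose self._elmo was built with at least two output
# representations (index [1] is used above).
X = [['EU', 'rejects', 'German', 'call'], ['Peter', 'Blackburn']]
y = [['B-ORG', 'O', 'B-MISC', 'O'], ['B-PER', 'I-PER']]
features, y_ids = preprocessor.transform(X, y)
word_ids, char_ids, elmo_embeddings = features
print(elmo_embeddings.shape)  # (2, 4, 1024): batch x max_len x ELMo dim
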
def __init__(self, config, path):
    from allennlp.modules.elmo import batch_to_ids

    pad_ids = [config['pad_token_id']] * config['char_n_ctx']
    all_token_ids = []
    all_pos_ids = []
    all_char_ids = []
    all_label_ids = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            items = line.split('\t')
            token_ids = [int(d) for d in items[1].split()]
            pos_ids = [int(d) for d in items[2].split()]
            # compute ELMo character ids
            tokens = items[3].split()
            char_ids = batch_to_ids([tokens])[0].detach().cpu().numpy().tolist()
            for _ in range(len(token_ids) - len(char_ids)):
                char_ids.append(pad_ids)
            label_ids = [int(d) for d in items[0].split()]
            all_token_ids.append(token_ids)
            all_pos_ids.append(pos_ids)
            all_char_ids.append(char_ids)
            all_label_ids.append(label_ids)

    all_token_ids = torch.tensor(all_token_ids, dtype=torch.long)
    all_pos_ids = torch.tensor(all_pos_ids, dtype=torch.long)
    all_char_ids = torch.tensor(all_char_ids, dtype=torch.long)
    all_label_ids = torch.tensor(all_label_ids, dtype=torch.long)

    self.x = TensorDataset(all_token_ids, all_pos_ids, all_char_ids)
    self.y = all_label_ids

def elmoFromPair(pair, elmo):
    # choose the second-layer (index 1) representations
    layer = 1
    character_ids = batch_to_ids(pair)
    input_tensor, output_tensor = elmo(
        character_ids)['elmo_representations'][layer].data
    return (input_tensor, output_tensor)

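# Hypothetical usage of elmoFromPair: the layer-1 indexing above requires an
# Elmo module built with at least two output representations. The unpacking
# splits the (2, max_len, 1024) batch into one tensor per sentence.
from allennlp.modules.elmo import Elmo
elmo = Elmo(options_file, weight_file, 2, dropout=0)  # URLs as elsewhere in this file
pair = [['the', 'quick', 'brown', 'fox'], ['a', 'lazy', 'dog', '.']]
input_tensor, output_tensor = elmoFromPair(pair, elmo)
print(input_tensor.shape)  # torch.Size([4, 1024])
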
def __init__(self, config, path):
    from allennlp.modules.elmo import batch_to_ids

    pad_ids = [config['pad_token_id']] * config['char_n_ctx']
    all_token_ids = []
    all_pos_ids = []
    all_char_ids = []
    all_label_ids = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            items = line.split('\t')
            token_ids = [int(d) for d in items[1].split()]
            pos_ids = [int(d) for d in items[2].split()]
            # Using ELMo's batch_to_ids, compute character ids,
            # e.g. 'The' -> [259, 85, 105, 102, 260, 261, 261, ...]
            # (actually byte-based, char_vocab_size == 262, char_padding_idx == 261)
            tokens = items[3].split()
            char_ids = batch_to_ids([tokens])[0].detach().cpu().numpy().tolist()
            for _ in range(len(token_ids) - len(char_ids)):
                char_ids.append(pad_ids)
            label_ids = [int(d) for d in items[0].split()]
            all_token_ids.append(token_ids)
            all_pos_ids.append(pos_ids)
            all_char_ids.append(char_ids)
            all_label_ids.append(label_ids)

    all_token_ids = torch.tensor(all_token_ids, dtype=torch.long)
    all_pos_ids = torch.tensor(all_pos_ids, dtype=torch.long)
    all_char_ids = torch.tensor(all_char_ids, dtype=torch.long)
    all_label_ids = torch.tensor(all_label_ids, dtype=torch.long)

    self.x = TensorDataset(all_token_ids, all_pos_ids, all_char_ids)
    self.y = all_label_ids

def encode_sent_and_span_paral(self,
                               text,      # batch, max_sent, max_word
                               text_msk,  # batch, max_sent, max_word
                               span,      # batch, max_sent_num, max_span_num, max_word
                               sent_idx   # batch size
                               ):
    from allennlp.modules.elmo import batch_to_ids

    this_text = two_dim_index_select(text['tokens'], sent_idx)  # batch, max_word
    if self.use_elmo:
        this_text_list: List = this_text.tolist()
        text_str_list = []
        for sample in this_text_list:
            s = [self.vocab.get_token_from_index(x) for x in sample]
            text_str_list.append(s)
        character_ids = batch_to_ids(text_str_list).to(self.device)
        this_context = self.elmo(character_ids)
        this_context = this_context['elmo_representations'][0]
    else:
        this_text = {'tokens': this_text}
        this_context = self._text_field_embedder(this_text)

    num_doc, max_word, inp_dim = this_context.size()
    batch_size = sent_idx.size()[0]
    assert batch_size == num_doc

    # text is the original text of the selected sentence.
    this_context_mask = two_dim_index_select(text_msk, sent_idx)  # batch, max_word
    this_span = two_dim_index_select(span, sent_idx)  # batch, nspan, max_word

    concat_rep_of_compression, span_msk, original_sent_rep = self.enc.forward(
        word_emb=this_context,
        word_emb_msk=this_context_mask,
        span=this_span)
    return concat_rep_of_compression, span_msk, original_sent_rep

def batch_to_embeddings(self, batch: List[List[str]]) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Parameters
    ----------
    batch : ``List[List[str]]``, required
        A list of tokenized sentences.

    Returns
    -------
    A tuple of tensors, the first representing activations
    (batch_size, 3, num_timesteps, 1024) and the second a mask
    (batch_size, num_timesteps).
    """
    character_ids = batch_to_ids(batch)
    if self.cuda_device >= 0:
        character_ids = character_ids.cuda(device=self.cuda_device)

    bilm_output = self.elmo_bilm(character_ids)
    layer_activations = bilm_output['activations']
    mask_with_bos_eos = bilm_output['mask']

    # without_bos_eos is a 3-element list of (activation, mask) tensor pairs,
    # each with size (batch_size, num_timesteps, dim) and
    # (batch_size, num_timesteps) respectively.
    without_bos_eos = [remove_sentence_boundaries(layer, mask_with_bos_eos)
                       for layer in layer_activations]
    # Convert the list of (activation, mask) pairs into a single tensor of activations.
    activations = torch.cat([ele[0].unsqueeze(1) for ele in without_bos_eos], dim=1)
    # The mask is the same for each ELMo vector, so just take the first.
    mask = without_bos_eos[0][1]

    return activations, mask

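# Hypothetical usage of batch_to_embeddings, assuming `embedder` is an instance
# of the surrounding class wrapping an AllenNLP biLM (self.elmo_bilm) with
# cuda_device=-1 (CPU):
activations, mask = embedder.batch_to_embeddings(
    [['First', 'sentence', '.'], ['Another', '.']])
print(activations.shape)  # torch.Size([2, 3, 3, 1024]): batch x 3 layers x timesteps x dim
print(mask.shape)         # torch.Size([2, 3]); the second row masks the padded position
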
def elmo_embeddings(self, processed_sent, number_sent, lang='en', args=None):
    '''
    Extract all three ELMo layers for the first number_sent entries;
    character ids have shape (batch, sequence_length, 50).
    Returns three lists of per-layer activation tensors.
    '''
    first_layer_lst = []
    second_layer_lst = []
    third_layer_lst = []
    for index, sent in enumerate(processed_sent[:number_sent]):
        sentences = sent
        character_ids = batch_to_ids(sentences).to(self.device)
        # Run only the internal biLM to get the per-layer activations.
        bilm_output = self.elmo._elmo_lstm(character_ids)
        temp = bilm_output['activations']
        # Strip the BOS/EOS positions; shape = (bsz, seqlen, dim).
        first_layer = temp[0].cpu()[:, 1:-1]
        second_layer = temp[1].cpu()[:, 1:-1]
        third_layer = temp[2].cpu()[:, 1:-1]
        first_layer_lst.append(first_layer)
        second_layer_lst.append(second_layer)
        third_layer_lst.append(third_layer)
        if index % 1000 == 0:
            sys.stdout.write('-')
            sys.stdout.flush()
    sys.stdout.write('\n')
    return first_layer_lst, second_layer_lst, third_layer_lst

def forward(self, sentences, device='cuda'):
    """
    sentences: list[str], len of list: B
    output sent_embs: Tensor B x OUT
    """
    sentences = [WordEncoder.tokenize(s) for s in sentences]
    # e.g. sentences = [['First', 'sentence', '.'], ['Another', '.']]

    # use batch_to_ids to convert sentences to character ids
    character_ids = batch_to_ids(sentences).to(device)
    embeddings = self.elmo(character_ids)

    # embeddings['elmo_representations'] is a length-two list of tensors.
    # Each element contains one layer of ELMo representations with shape
    # (2, 3, 1024):
    #   2    - the batch size
    #   3    - the sequence length of the batch
    #   1024 - the length of each ELMo vector
    sent_embeds = embeddings['elmo_representations'][1]  # B x max_l x 1024

    # Mean-pool each sentence over its true (unpadded) length.
    sent_emb_list = list()
    for si in range(len(sentences)):
        sent_len = len(sentences[si])
        sent_embed = torch.mean(sent_embeds[si, :sent_len, :], dim=0)  # 1024
        sent_emb_list.append(sent_embed)
    sent_embs = torch.stack(sent_emb_list, dim=0)  # B x 1024
    return sent_embs

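# Hypothetical usage of the sentence encoder above; `encoder` is assumed to be
# an instance whose self.elmo was built with two output representations
# (index [1] is used) and whose WordEncoder.tokenize splits on whitespace.
sent_embs = encoder(['First sentence .', 'Another .'], device='cpu')
print(sent_embs.shape)  # torch.Size([2, 1024]): one mean-pooled vector per sentence
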
def batch_to_embeddings(self, batch: List[List[str]]) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Parameters
    ----------
    batch : ``List[List[str]]``, required
        A list of tokenized sentences.

    Returns
    -------
    A tuple of tensors, the first representing activations
    (batch_size, 3, num_timesteps, 1024) and the second a mask
    (batch_size, num_timesteps).
    """
    character_ids = batch_to_ids(batch)
    if self.cuda_device >= 0:
        character_ids = character_ids.cuda(device=self.cuda_device)

    bilm_output = self.elmo_bilm(character_ids, self.bias, self.num_bias,
                                 self.contraction)
    layer_activations = bilm_output['activations']
    mask_with_bos_eos = bilm_output['mask']

    # without_bos_eos is a 3-element list of (activation, mask) tensor pairs,
    # each with size (batch_size, num_timesteps, dim) and
    # (batch_size, num_timesteps) respectively.
    without_bos_eos = [remove_sentence_boundaries(layer, mask_with_bos_eos)
                       for layer in layer_activations]
    # Convert the list of (activation, mask) pairs into a single tensor of activations.
    activations = torch.cat([ele[0].unsqueeze(1) for ele in without_bos_eos], dim=1)
    # The mask is the same for each ELMo vector, so just take the first.
    mask = without_bos_eos[0][1]

    return activations, mask

def elmo_read_and_generate_vecs(korpus_path: str, size: int, test=False):
    """
    INPUT:
        korpus_path - name of the folder containing the corpus (e.g. "korpusGAZETA")
        size - size of the neighborhood that builds a single phrase
               (number of words before and after the surname)
    OUTPUT:
        vecs - vectors produced by ELMo: one vector per phrase built
               from the neighborhood
        corpus_list, person_list, document_dict, person_dict - output of the
               'read_corpus' function; see the comment on that function
    """
    elmo = Elmo(options_file, weight_file, 1, dropout=0)
    path_raw = korpus_path
    [corpus_list, person_list, document_dict, person_dict,
     profession_dict] = read_corpus(path_raw, size, test)

    vecs = []
    step = 1000
    for i in tqdm(range(0, len(corpus_list), step)):
        character_ids = batch_to_ids(corpus_list[i:i + step])
        embeddings = elmo(character_ids)
        vecs.extend(elmo_emb_2_vec(embeddings))

    return [
        vecs, corpus_list, person_list, document_dict, person_dict,
        profession_dict
    ]

def __call__(self, tokens):
    """The ELMo representation is extracted for each candidate in a turn."""
    token_ids = batch_to_ids(tokens)
    if torch.cuda.is_available():
        token_ids = token_ids.cuda()
    embeddings = self.encoder(
        token_ids)["elmo_representations"][0].detach().cpu().data
    return embeddings

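# Hypothetical usage of the __call__ wrapper above, assuming self.encoder is
# Elmo(options_file, weight_file, 1, dropout=0) and `embedder` is an instance
# of the surrounding class:
embs = embedder([['yes'], ['no', 'thanks']])
print(embs.shape)  # torch.Size([2, 2, 1024]): candidates x max_len x dim
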
def create_embed_loaders():
    train, train_labs, test, test_labs, _ = get_data()

    # Keep the true length of every sequence for later masking.
    ttr, tte = [], []
    for t in train:
        ttr.append(len(t))
    for t in test:
        tte.append(len(t))

    # Embed train and test together so they share one padded length.
    lentrain = len(train)
    data = copy(train)
    data.extend(test)

    elmo = get_elmo()
    da_ids = batch_to_ids(data)
    da = elmo(da_ids)['elmo_representations'][0]
    da = da.to(device)

    tr = da[:lentrain, :, :]
    te = da[lentrain:, :, :]
    trdata = ElmoDset(tr, train_labs, ttr)
    tedata = ElmoDset(te, test_labs, tte)
    trload = DataLoader(trdata, shuffle=True, batch_size=2)
    teload = DataLoader(tedata, shuffle=True, batch_size=2)
    return trload, teload

def _word_embed_elmo(self):
    print(':: Initializing ELMo')
    elmo = Elmo(ELMO_OPTIONS_FILE, ELMO_WEIGHT_FILE, 1, dropout=0)

    print(':: Encoding tokens')
    char_ids = batch_to_ids(self.sent_tokens)

    print(':: Calculating ELMo embeddings')
    sent_embeds, sent_lengths = [], []
    for i in range(0, len(char_ids), ELMO_BATCH_SIZE):
        print(' > {}/{} [{}%]'.format(i, len(char_ids),
                                      int(i / len(char_ids) * 100)),
              end='\r')
        elmo_batch = char_ids[i:i + ELMO_BATCH_SIZE]
        sent_batch_embeds = elmo(elmo_batch)
        sent_embeds.append(sent_batch_embeds['elmo_representations'][0])
        sent_lengths.append(sent_batch_embeds['mask'].sum(dim=1))
        # Periodically fold the accumulated batches into one tensor to keep
        # the lists (and peak memory) small.
        if len(sent_embeds) >= 8:
            if self.word_tensor is not None:
                sent_embeds = [self.word_tensor] + sent_embeds
                sent_lengths = [self.sent_lengths] + sent_lengths
            self.word_tensor = torch.cat(sent_embeds)
            self.sent_lengths = torch.cat(sent_lengths)
            sent_embeds, sent_lengths = [], []

    self.word_tensor = torch.cat([self.word_tensor] + sent_embeds)
    self.sent_lengths = torch.cat([self.sent_lengths] + sent_lengths)
    print(' > ELMo embeddings are calculated for all {} sentences'.format(
        len(char_ids)))

def forward(self, input, seq_lens, vocab, batchartoovs=None):
    # Map the id sequences back to word strings so they can be re-encoded
    # as ELMo character ids.
    strings = []
    for examp in input:
        copy = examp.clone().cpu().numpy().astype(int)
        converted = data.outputids2words(
            copy, vocab, (batchartoovs if config.pointer_gen else None))
        strings.append(converted)
    strings = batch_to_ids(strings).cuda()

    embedded = self.elmo_layer(strings)['elmo_representations']
    embedded = embedded[0]  # [batch size, max enc steps, 1024]

    packed = pack_padded_sequence(embedded, seq_lens, batch_first=True)
    output, hidden = self.lstm(packed)

    encoder_outputs, _ = pad_packed_sequence(output, batch_first=True)  # h dim = B x t_k x n
    encoder_outputs = encoder_outputs.contiguous()

    encoder_feature = encoder_outputs.view(-1, 2 * config.hidden_dim)  # B * t_k x 2*hidden_dim
    encoder_feature = self.W_h(encoder_feature)

    return encoder_outputs, encoder_feature, hidden

def forward(self, span, a_vec, raw_span):
    span_vec = self.word_embeddings(span)
    character_ids = batch_to_ids(raw_span).cuda()
    elmo_embeddings = self.elmo(character_ids)
    elmo_representations = torch.cat(
        [span_vec] + elmo_embeddings['elmo_representations'], dim=2)
    generate_output = self.generative_decoder(elmo_representations, a_vec,
                                              elmo_embeddings['mask'], span)

    batch_size, target_iter = a_vec.shape
    gen_out = torch.zeros(batch_size, target_iter).to(generate_output.device)
    for i in range(batch_size):
        gen_out[i, :] = generate_output[i, :, :].max(1)[1]

    # Drop the <sos> position, flatten to (batch * steps, vocab), and smooth
    # the softmax so log() never sees an exact zero.
    generate_output = generate_output[:, 1:, :].contiguous()
    generate_output = generate_output.view(
        generate_output.shape[0] * generate_output.shape[1],
        generate_output.shape[2])
    generate_output = F.softmax(generate_output, dim=1)
    eps = 1e-8
    generate_output = (1 - eps) * generate_output + eps * torch.min(
        generate_output[generate_output != 0])
    generate_loss = self.gen_loss(torch.log(generate_output),
                                  a_vec[:, 1:].contiguous().view(-1))

    loss = generate_loss
    return loss, gen_out

def batchify(x_data, y_data, batch_size=128, shuffle=False):
    batches = []
    for i in range(0, len(x_data), batch_size):
        start, stop = i, i + batch_size
        x_batch = batch_to_ids(x_data[start:stop])
        lengths = Variable(
            torch.from_numpy(
                np.array([max(len(x), 1)
                          for x in x_data[start:stop]])).float()).view(-1, 1)
        if CUDA:
            y_batch = Variable(torch.from_numpy(np.array(y_data[start:stop])).cuda())
        else:
            y_batch = Variable(torch.from_numpy(np.array(y_data[start:stop])))
        batches.append((x_batch, y_batch, lengths))
    if shuffle:
        random.shuffle(batches)
    return batches

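# Hypothetical usage of batchify: x_data holds tokenized sentences, y_data the
# integer labels; each batch pairs padded ELMo char ids with labels and lengths.
x_data = [['good', 'movie'], ['terrible'], ['not', 'bad', 'at', 'all']]
y_data = [1, 0, 1]
batches = batchify(x_data, y_data, batch_size=2)
x_batch, y_batch, lengths = batches[0]
print(x_batch.shape)     # torch.Size([2, 2, 50]): batch x max_len x 50 char ids
print(lengths.view(-1))  # tensor([2., 1.])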