def process_df(xdf, ydf=None, passing_y=False):
    # Keep only the first five sentences of each argument, capped at 500 characters.
    args1 = [' '.join(nltk.sent_tokenize(x)[0:5]) for x in xdf['argument1'].tolist()]
    args1 = [x[0:500] for x in args1]
    args2 = [' '.join(nltk.sent_tokenize(x)[0:5]) for x in xdf['argument2'].tolist()]
    args2 = [x[0:500] for x in args2]

    # Embed the arguments in small batches to keep memory usage low.
    x1_out = []
    for c in chunks(args1, 2):
        sents = [Sentence(x, use_tokenizer=True) for x in c]
        document_embeddings.embed(sents)
        for sent in sents:
            x1_out.append(sent.get_embedding().detach())
        del sents

    x2_out = []
    for c in chunks(args2, 2):
        sents = [Sentence(x, use_tokenizer=True) for x in c]
        document_embeddings.embed(sents)
        for sent in sents:
            x2_out.append(sent.get_embedding().detach())
        del sents

    # Only read the labels when they are actually passed; otherwise use dummy zeros.
    if passing_y and ydf is not None:
        ys = [1 if y else 0 for y in ydf['is_same_side'].tolist()]
    else:
        ys = [0] * len(x1_out)

    return x1_out, x2_out, ys
def chunked_embed(corpus, embeddings, chunk_size=256):
    # Split the text at the n-th space so each chunk stays within the model's limit.
    def find_nth(n, substring, text, start):
        index = start
        for _ in range(n):
            index = text.find(substring, index + 1)
        return index

    try:
        partial_embeddings = []
        i = 0
        while i < len(corpus):
            next_i = find_nth(chunk_size, " ", corpus, i)
            if next_i < i:  # fewer than chunk_size spaces left; take the rest of the text
                next_i = len(corpus)
            chunk = corpus[i:next_i]
            sentence = Sentence(chunk, use_tokenizer=False)
            embeddings.embed(sentence)
            partial_embeddings.append(sentence.get_embedding().numpy())
            i = next_i
        # Average the per-chunk embeddings into a single document vector.
        return np.average(np.asarray(partial_embeddings), axis=0)
    except RuntimeError:
        # The chunk was still too long for the model; retry with half the chunk size.
        print("Ignore the warning above about the sentence being too long; "
              "retrying with a smaller chunk size.")
        return chunked_embed(corpus, embeddings, chunk_size // 2)
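# A minimal usage sketch for chunked_embed(), assuming GloVe word embeddings
# pooled per chunk. The variable names and the example text below are
# illustrative, not taken from the snippet above.
import numpy as np
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings

pool = DocumentPoolEmbeddings([WordEmbeddings('glove')])
long_text = "some very long document " * 500
doc_vector = chunked_embed(long_text, pool, chunk_size=256)
print(doc_vector.shape)  # one averaged vector for the whole document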
def get_embeddings(self, sentence):
    # document_embeddings = DocumentPoolEmbeddings(
    #     [self.glove_embedding,  # initialize the document embeddings, mode = mean
    #      self.flair_embedding_backward,
    #      self.flair_embedding_forward])

    # GloVe + BPE
    document_embeddings = DocumentPoolEmbeddings(
        [self.glove_embedding, self.bpe_embedding])

    # NILC fastText 600 embedding
    # document_embeddings = DocumentPoolEmbeddings([self.fast_text_embedding])

    # Flair
    # document_embeddings = DocumentPoolEmbeddings([self.flair_embedding_forward])

    # ELMo
    # document_embeddings = DocumentPoolEmbeddings([self.elmo_embedding])

    # create an example sentence
    sentence = Sentence(sentence)

    # embed the sentence with our document embedding
    document_embeddings.embed(sentence)

    # now check out the embedded sentence
    return sentence.get_embedding()
def get_pooling_embedding(document):
    # Tokenize with spaCy, then embed the re-joined text with the pooled document embedding.
    tokens = [token.text for token in nlp(document)]
    text = ' '.join(tokens)
    sentence = Sentence(text)
    document_pooling_embeddings.embed(sentence)
    return sentence.get_embedding().squeeze().tolist()
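# Hypothetical module-level setup that get_pooling_embedding() relies on; the
# specific spaCy model and embedding choice are assumptions, not taken from the
# snippet above.
import spacy
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings

nlp = spacy.load('en_core_web_sm')
document_pooling_embeddings = DocumentPoolEmbeddings([WordEmbeddings('glove')])

vector = get_pooling_embedding("The grass is green.")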
def get_sentence_vector(self, text):
    sentence = Sentence(clean_text(text))
    _ = self.embeddings.embed(sentence)
    result = sentence.get_embedding().cpu().detach().numpy()
    # Fall back to random noise if the embedding came back all zeros
    # (e.g. for empty or out-of-vocabulary input).
    if np.sum(result[0:5]) == 0:
        result = np.random.randn(self.n_dims)
    return result
def get_fastText_embeding(text_str):
    '''
    Return the pooled embedding of a text.
    :param text_str: the text to embed
    :return: the embedding vector corresponding to the text
    '''
    text = Sentence(text_str)
    pool_embeddings.embed(text)
    return text.get_embedding()
def construct_vector(self, original_sentence):
    """
    Given a sentence, construct and return a vector based on different stacked embeddings.
    """
    sentence = Sentence(original_sentence)
    self.stacked_embedding.embed(sentence)
    sentence_embedding = sentence.get_embedding()
    return sentence_embedding.detach().numpy()
def other_embeddings(embd):
    # An interactive session is needed for tf.constant(...).eval() below.
    sess = tf.InteractiveSession()
    train_data_list = []
    test_data_list = []
    val_data_list = []

    if embd == 'glove':
        print('Starting GloVe Embedding...')
        glove_embedding = WordEmbeddings('glove')
        document_embeddings = DocumentPoolEmbeddings(embeddings=[glove_embedding])
    elif embd == 'xlnet':
        print('Starting XLNet Embedding...')
        xlnet_embedding = XLNetEmbeddings('xlnet-large-cased')
        document_embeddings = DocumentPoolEmbeddings(embeddings=[xlnet_embedding])
    elif embd == 'fasttext':
        print('Starting fastText Embedding...')
        fasttext_embedding = WordEmbeddings('en')  # English fastText vectors
        document_embeddings = DocumentPoolEmbeddings(embeddings=[fasttext_embedding])
    elif embd == 'elmo':
        print('Starting ELMo Embedding...')
        elmo_embedding = ELMoEmbeddings()
        document_embeddings = DocumentPoolEmbeddings(embeddings=[elmo_embedding])
    else:
        # init Flair embeddings
        flair_forward_embedding = FlairEmbeddings('multi-forward')
        flair_backward_embedding = FlairEmbeddings('multi-backward')
        glove_embedding = WordEmbeddings('glove')
        # now create the DocumentPoolEmbeddings object that combines all embeddings
        document_embeddings = DocumentPoolEmbeddings(embeddings=[
            glove_embedding, flair_forward_embedding, flair_backward_embedding
        ])

    print('Train embedding started...')
    for text in final_train['text'].tolist():
        text = Sentence(text)
        document_embeddings.embed(text)
        emb = text.get_embedding().detach().numpy()
        emb = tf.constant(emb).eval()
        train_data_list.append(emb)
    print('Embedded train data!')

    print('Test embedding started...')
    for text in final_test['text'].tolist():
        text = Sentence(text)
        document_embeddings.embed(text)
        emb = text.get_embedding().detach().numpy()
        emb = tf.constant(emb).eval()
        test_data_list.append(emb)
    print('Embedded test data!')

    print('Validation embedding started...')
    for text in final_val['text'].tolist():
        text = Sentence(text)
        document_embeddings.embed(text)
        emb = text.get_embedding().detach().numpy()
        emb = tf.constant(emb).eval()
        val_data_list.append(emb)
    print('Embedded validation data!')

    return train_data_list, test_data_list, val_data_list
def save_json(jfile):
    global dataset
    user_id = os.path.basename(jfile).split('.')[0]
    save_path = os.path.join('..', DATA_DIR,
                             'pos_tags_{}_embeds'.format(dataset),
                             '{}.json'.format(user_id))
    if os.path.isfile(save_path):
        print("Skipping user {}: file already exists".format(user_id))
        return

    document_embeddings = get_doc_embeddings()
    with open(jfile, encoding='utf-8') as f:
        user_data = json.load(f)

    if len(user_data['tokens']) > 500:
        print('User {}.json has {} posts, NOT skipping'.format(
            user_id, len(user_data['tokens'])))
        # return

    posts_list = user_data['tokens']  # each post is a list of tokens
    pos_tags_list = user_data['posTags']
    posts_lowercase_list = []
    posts_embeddings_list = []
    pos_tags_list_lowercase = []

    for i, (post, pos_tags) in enumerate(zip(posts_list, pos_tags_list)):
        post_lowercase = [token.lower() for token in post]
        # Skip posts containing URLs.
        if any("http" in word for word in post_lowercase):
            continue
        if 0 < len(post_lowercase):
            posts_lowercase_list.append(post_lowercase)
            pos_tags_list_lowercase.append(pos_tags)
            post_sentence = Sentence(' '.join(post_lowercase))
            document_embeddings.embed(post_sentence)
            posts_embeddings_list.append(post_sentence.get_embedding().tolist())
        else:
            # Empty post after lowercasing; skip it.
            continue

    user_data["tokens"] = posts_lowercase_list
    user_data["posTags"] = pos_tags_list_lowercase
    user_data["embeddings"] = posts_embeddings_list

    with open(save_path, 'w') as out_file:
        json.dump(user_data, out_file)
    print('Finished with file {}.json'.format(user_id))
def embed_flair(texts, max_length=100, max_words=1000):
    # Restrict the vocabulary with a Keras tokenizer, then embed token by token.
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    texts = tokenizer.sequences_to_texts(sequences)

    sentence_embeddings = []
    padding = np.zeros(embedding_features)
    count = 0
    step = 3
    total = len(texts)

    for text in texts:
        sentence_embedding = []
        paddings = []
        sentence = Sentence(text)
        embeddings_flair.embed(sentence)
        for token in sentence:
            sentence_embedding.append(token.embedding.cpu().numpy())

        # Left-pad short sentences with zero vectors; truncate long ones to max_length.
        for i in range(max_length - len(sentence_embedding)):
            paddings.append(padding)
        if len(paddings) > 0:
            sentence_embedding = np.concatenate([paddings, sentence_embedding], axis=0)
        else:
            sentence_embedding = np.array(sentence_embedding[:max_length])

        # Simple progress reporting in 3% steps.
        count += 1
        if 100 * count / total > step:
            print(str(step) + '%')
            step += 3

        sentence_embeddings.append(sentence_embedding)

    return np.array(sentence_embeddings)
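# A sketch of the module-level names embed_flair() assumes (embeddings_flair,
# embedding_features, Tokenizer). The particular embedding stack chosen here
# is an assumption for illustration only.
from keras.preprocessing.text import Tokenizer
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings

embeddings_flair = StackedEmbeddings([
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
])
# Per-token embedding size, used for the zero-padding rows above.
embedding_features = embeddings_flair.embedding_length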
def get(self, keys):
    if self._embeddings is None:
        if self._no_cuda:
            import flair
            import torch
            flair.device = torch.device('cpu')
        from .flair_bert import BertEmbeddings
        self._embeddings = BertEmbeddings(
            bert_model_or_path=self._path,
            layers=self._layers,
            pooling_operation=self._pooling_operation,
            use_scalar_mix=self._use_scalar_mix)

    sentences = [Sentence(key) for key in keys]
    # noinspection PyUnresolvedReferences
    self._embeddings.embed(sentences)
    for s_idx, sentence in enumerate(sentences):
        for t_idx, token in enumerate(sentence):
            emb = token.embedding.cpu().numpy()
            tok = str(token)
            yield tok, emb
            del token
        del sentence
def embed_tweet(tweetList):
    # initialize the word embeddings
    tr_embedding = WordEmbeddings('tr')
    char_embedding = CharacterEmbeddings()
    # initialize the document embeddings, mode = mean
    document_embeddings = DocumentPoolEmbeddings([tr_embedding, char_embedding])

    tweetTensors = []
    for tweet in tweetList:
        # print(norm_tweet(tweet))
        sentence = Sentence(norm_tweet(tweet))
        document_embeddings.embed(sentence)
        tweetTensors.append(sentence.get_embedding().data)
    return tweetTensors
def get_word_vectors(self, words: List[str]) -> List[np.ndarray]:
    """
    Vectorizes the list of words, using pretrained Flair embeddings.
    These embeddings are context dependent, so this method is preferred
    over fetching word vectors for single words.
    :param words: The list of words to vectorize.
    :return: A list of word vectors.
    """
    sentence = Sentence(' '.join(words))
    self.model.embed(sentence)
    return [np.array(token.embedding) for token in sentence]
def embed_data(self, sentences):
    sentences = [Sentence(s) for s in sentences]
    self.embedding.embed(sentences)
    if self.method == "average":
        sentences = [torch.stack([word.embedding.detach().cpu()
                                  for word in s]).mean(0) for s in sentences]
    else:
        sentences = [torch.stack([word.embedding.detach().cpu()
                                  for word in s]) for s in sentences]
    return sentences
def find(self, text):
    if not self.compiled:
        raise Exception('You need to compile the vocabulary first.')

    # Apply the general tagger
    if self.ner_tagger:
        text_ = Sentence(text)
        self.ner_tagger.predict(text_)
        for ent in text_.to_dict(tag_type='ner')['entities']:
            yield {
                'text': ent['text'],
                'start_pos': ent['start_pos'],
                'end_pos': ent['end_pos']
            }

    # Apply the specialized vocabulary
    if self.compiled:
        for item in self.compiled.finditer(text):
            span = item.span(0)
            matched_text = item.group(0)
            yield {'text': matched_text, 'start_pos': span[0], 'end_pos': span[1]}
def encode(self, data: 'np.ndarray', *args, **kwargs) -> 'np.ndarray':
    """
    :param data: a 1d array of string type in size `B`
    :return: an ndarray in size `B x D`
    """
    import torch
    from flair.embeddings import Sentence

    c_batch = [Sentence(row) for row in data]
    self.model.embed(c_batch)
    return torch.stack([c_text.get_embedding()
                        for c_text in c_batch]).detach().numpy()
def transform(self, X: dt.Frame):
    X.replace([None, math.inf, -math.inf], self._repl_val)
    from flair.embeddings import WordEmbeddings, BertEmbeddings, DocumentPoolEmbeddings, Sentence

    if self.embedding_name in ["glove", "en"]:
        self.embedding = WordEmbeddings(self.embedding_name)
    elif self.embedding_name in ["bert"]:
        self.embedding = BertEmbeddings()
    self.doc_embedding = DocumentPoolEmbeddings([self.embedding])

    output = []
    X = X.to_pandas()
    text1_arr = X.iloc[:, 0].values
    text2_arr = X.iloc[:, 1].values

    for ind, text1 in enumerate(text1_arr):
        try:
            text1 = Sentence(str(text1).lower())
            self.doc_embedding.embed(text1)
            text2 = text2_arr[ind]
            text2 = Sentence(str(text2).lower())
            self.doc_embedding.embed(text2)
            # Cosine similarity between the two pooled document embeddings.
            score = cosine_similarity(text1.get_embedding().reshape(1, -1),
                                      text2.get_embedding().reshape(1, -1))[0, 0]
            output.append(score)
        except Exception:
            # Fall back to a sentinel score if embedding or similarity fails.
            output.append(-99)

    return np.array(output)
def load_documents_into_embedding(self):
    print("Embedding ", len(self.documents_orig), " Documents")
    # self.documents_orig = self.documents_orig[0:50]
    self.documents = [
        self.elmo.embed(Sentence(elem)) for elem in self.documents_orig
    ]
    self.documents = torch.stack([
        torch.cat([token.embedding.unsqueeze(0) for token in elem[0]],
                  dim=0)[0] for elem in self.documents
    ])
    np.save("./documents_embedded.npy", self.documents)
def get_word_vector(self, word: str) -> Optional[np.ndarray]:
    """
    Returns the word vector for word |word| or None.
    Using this method is discouraged, as it defeats the purpose of Flair
    embeddings; prefer passing the surrounding context as well for a more
    accurate vectorization. In practice, Flair embeddings never return None,
    even for bogus words.
    :param word: The word to vectorize.
    :return: Either the word vector or None.
    """
    dummy_sentence = Sentence(word)
    self.model.embed(dummy_sentence)
    return np.array(list(dummy_sentence)[0].embedding)
def answer_similarity(ans1, real):
    sent1 = Sentence(ans1)
    sent2 = Sentence(real)
    document_embeddings.embed(sent1)
    document_embeddings.embed(sent2)
    emb1 = sent1.get_embedding()
    emb2 = sent2.get_embedding()
    # L2-normalize both embeddings, then clamp the cosine similarity at zero.
    emb1 = emb1 / torch.sqrt((emb1 ** 2).sum())
    emb2 = emb2 / torch.sqrt((emb2 ** 2).sum())
    return max(0., (emb1.T @ emb2).item())
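# A minimal sketch of how answer_similarity() might be wired up; the
# module-level document_embeddings it relies on is an assumption here, and the
# example strings are illustrative.
import torch
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings

document_embeddings = DocumentPoolEmbeddings([WordEmbeddings('glove')])

score = answer_similarity("Mitochondria are the powerhouse of the cell",
                          "Mitochondria produce most of the cell's energy")
print(score)  # clamped cosine similarity in [0, 1]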
def get(self, keys, return_positions):
    from flair.embeddings import Sentence

    sentences = [Sentence(key, use_tokenizer=self._use_tokenizer) for key in keys]
    # noinspection PyUnresolvedReferences
    self._embeddings.embed(sentences)
    for s_idx, (sentence, ret_positions) in enumerate(zip(sentences, return_positions)):
        for t_idx, token in enumerate(sentence):
            if t_idx not in ret_positions:
                continue  # ignore tokens where embeddings have not been requested
            yield s_idx, token.text, token.embedding.cpu().numpy()
def flair_embeddings(x, *args):
    from flair.embeddings import DocumentPoolEmbeddings, DocumentRNNEmbeddings, Sentence

    word_embedders, aggregating_strategy, aggregating_params = args[0], args[1], args[2]
    embedding = None
    if aggregating_strategy == 'pooling':
        # TODO: check if kwargs work
        embedding = DocumentPoolEmbeddings(word_embedders, **aggregating_params)
    if aggregating_strategy == 'rnn':
        # TODO: check if kwargs work
        embedding = DocumentRNNEmbeddings(word_embedders, **aggregating_params)
    if embedding is None:
        raise KeyError("Unknown aggregating strategy: {}".format(aggregating_strategy))

    sentence = Sentence(x)
    embedding.embed(sentence)
    return sentence.embedding.detach().numpy().reshape(-1, 1)
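# An illustrative call of flair_embeddings() with the 'pooling' strategy; the
# argument layout (word embedders, strategy name, kwargs dict) mirrors the
# function above, while the concrete embeddings and kwargs are assumptions.
from flair.embeddings import WordEmbeddings

vec = flair_embeddings("The grass is green.",
                       [WordEmbeddings('glove')],   # word_embedders
                       'pooling',                   # aggregating_strategy
                       {'pooling': 'mean'})         # aggregating_params
print(vec.shape)  # (embedding_dim, 1)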
def criterion(str1, str2, embed):
    try:
        s1 = Sentence(str1)
        s2 = Sentence(str2)
        embed.embed(s1)
        s1_emb = s1.get_embedding()
        embed.embed(s2)
        s2_emb = s2.get_embedding()
        return torch.cosine_similarity(s1_emb.unsqueeze(0), s2_emb.unsqueeze(0)).item()
    except Exception:
        # Fall back to a neutral similarity if embedding fails.
        return 0.5
def run_query(self, query, k=None):
    """Run a query on the given documents based on word embeddings.

    Arguments:
        query {str} -- Query string.

    Keyword Arguments:
        k {int} -- The number of top documents to return (default: 10)

    Returns:
        list[tuple[float, int]] -- Sorted list of tuples containing the score and
        the document id. A made-up example with k=5 to show the formatting:
        [(0.89316645860672, 1567),
         (0.6174346804618835, 125),
         (0.5975501537321234, 1181),
         (0.5779426293373108, 3979),
         (0.5110726475715637, 7155)]
    """
    if k is None:
        k = 10
    sentence = Sentence(query)
    # self.embedding.embed(sentence)
    self.elmo.embed(sentence)
    # Use the first token's embedding as the query vector.
    sentence = [token.embedding.unsqueeze(0) for token in sentence][0]
    # print(sentence)
    return self.knn(sentence, query, k=k)
def get_embeddings(encoder, sentence, input_lang):
    with torch.no_grad():
        if word_vecs == "flair":
            flair_embedding = StackedEmbeddings([
                FlairEmbeddings('de-forward'),
                FlairEmbeddings('de-backward'),
            ])
            sent = Sentence(sentence + " <EOS>")
            flair_embedding.embed(sent)
            input_tensor = [token.embedding for token in sent.tokens]
            input_length = len(input_tensor)
        else:
            input_tensor = tensorFromSentence(input_lang, sentence)
            input_length = input_tensor.size()[0]

        encoder_hidden = encoder.initHidden()
        for ei in range(input_length):
            _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
    return encoder_hidden
def glove_eucleadian(question, sentence_list):
    question = Sentence(question)
    euc = nn.PairwiseDistance(p=2)
    document_embeddings.embed(question)
    q_emd = question.get_embedding()
    q_emd = q_emd.unsqueeze(0)

    sentence_vectors = torch.empty((1, EMBEDDING_DIM))  # .to(device)
    for idx, sent in enumerate(sentence_list):
        sent = Sentence(sent)
        document_embeddings.embed(sent)
        sent_emd = sent.get_embedding()
        if idx == 0:
            sentence_vectors = sent_emd.unsqueeze(0)
        else:
            sentence_vectors = torch.cat((sentence_vectors, sent_emd.unsqueeze(0)))

    output = euc(q_emd, sentence_vectors)
    return output
def glove_cosine_similarity(question, sentence_list):
    question = Sentence(question)
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    document_embeddings.embed(question)
    q_emd = question.get_embedding()
    q_emd = q_emd.unsqueeze(0)

    sentence_vectors = torch.empty((1, EMBEDDING_DIM))  # .to(device)
    for idx, sent in enumerate(sentence_list):
        sent = Sentence(sent)
        document_embeddings.embed(sent)
        sent_emd = sent.get_embedding()
        if idx == 0:
            sentence_vectors = sent_emd.unsqueeze(0)
        else:
            sentence_vectors = torch.cat((sentence_vectors, sent_emd.unsqueeze(0)))

    output = cos(q_emd, sentence_vectors)
    return output
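# A short usage sketch for the two ranking helpers above, assuming a
# module-level GloVe document embedding; the variable names and example
# sentences here are illustrative, not from the original code.
import torch
from torch import nn
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings

glove = WordEmbeddings('glove')
document_embeddings = DocumentPoolEmbeddings([glove])
EMBEDDING_DIM = glove.embedding_length

candidates = ["The sky is blue.", "Grass is green.", "Paris is in France."]
scores = glove_cosine_similarity("What color is the sky?", candidates)
best = candidates[int(torch.argmax(scores))]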
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, Sentence, OneHotEmbeddings, \
    DocumentRNNEmbeddings

# initialize the word embeddings
glove_embedding = WordEmbeddings('glove')
flair_embedding_forward = FlairEmbeddings('news-forward')
flair_embedding_backward = FlairEmbeddings('news-backward')
# embeddings = OneHotEmbeddings(corpus)

# initialize the document embeddings, mode = mean
document_embeddings = DocumentPoolEmbeddings(
    [glove_embedding],
    # flair_embedding_backward, flair_embedding_forward],
    # pooling='min',
    fine_tune_mode='nonlinear')

document_rnn_embeddings = DocumentRNNEmbeddings([glove_embedding])
document_lstm_embeddings = DocumentRNNEmbeddings([glove_embedding], rnn_type='LSTM')

# create an example sentence
sentence = Sentence('The grass is green . And the sky is blue .')

# embed the sentence with our document embedding
document_embeddings.embed(sentence)

# now check out the embedded sentence.
print(sentence.get_embedding())
import numpy as np
from pandas import read_csv
import pickle
from flair.embeddings import WordEmbeddings, DocumentRNNEmbeddings, Sentence

data = read_csv('data/abcnews-date-text.csv', error_bad_lines=False)
documents = data[['headline_text']].values.reshape(-1).tolist()
# documents = list(pickle.load(open("./corpus/df_proyectosFECYT.pkl", "rb"))['LEMAS_UC3M'])

glove_embedding = WordEmbeddings('glove')
document_embeddings = DocumentRNNEmbeddings([glove_embedding], hidden_size=512)

embeddings = []
count = 0
try:
    for document in documents:
        count += 1
        sentence = Sentence(document)
        document_embeddings.embed(sentence)
        embeddings.append(sentence.get_embedding().tolist())
        if count % 1000 == 0:
            print(count)
finally:
    # In case an error occurs before the loop finishes, store the results gathered so far.
    embeddings_array = np.array(embeddings)
    np.save("embeds_abcnews_512_2.npy", embeddings_array)
corpus = pickle.load(open(inputFileName, 'br'))

vectors = {}

for d in corpus:
    print("processing ", d)
    totLen = len(corpus[d]['text'])
    for i, s in enumerate(corpus[d]['text']):
        if i % 10 == 0:
            print("processed {}/{} ".format(i, totLen), end='\r')
        sentence = Sentence(s)
        char_lm_embeddings.embed(sentence)
        for token in sentence:
            if token.text not in vectors:
                # Store each token's embedding as a space-separated line: "<token> <v1> <v2> ...".
                string = token.text
                for v in token.embedding.cpu().numpy():
                    string += ' {}'.format(v)
                vectors[token.text] = string
    print("processed {}/{} ".format(i, totLen))

with open(outputFileName, 'wt') as f:
    for k in vectors:
        f.write(vectors[k])
        f.write("\n")