from flair.data import Sentence
from flair.embeddings import ELMoEmbeddings


def elmo_embeddings_flair_lib_tokenization(sentences, output_file=None):
    if output_file:
        f = open(output_file, 'w')
    # init embedding
    # For English biomedical data you can use 'pubmed'
    embedding = ELMoEmbeddings('original')  # English 4096-hidden, 2 layers, 93.6M parameters
    for i, sent in enumerate(sentences):
        print("Encoding the {}th input sentence!".format(i))
        # create a sentence
        sentence = Sentence(sent[0])  # [sent] --> sent
        # embed words in sentence
        embedding.embed(sentence)
        for token in sentence:
            if output_file:
                f.write(token.text + " "
                        + " ".join([str(num) for num in token.embedding.tolist()]) + '\n')
            else:
                print(token.text + " "
                      + " ".join([str(num) for num in token.embedding.tolist()]) + '\n')
        if output_file:
            # blank line separates sentences in the output file
            f.write('\n')
    if output_file:
        f.close()
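# Usage sketch for the function above, not from the original source. The sample
# sentences and the output file name are assumptions; note that each element of
# `sample_sentences` is itself a one-element list, mirroring the sent[0] access
# inside the function. Requires flair installed with its optional ELMo/allennlp extra.
if __name__ == '__main__':
    sample_sentences = [["The patient was discharged today ."],
                        ["ELMo produces contextual word vectors ."]]
    elmo_embeddings_flair_lib_tokenization(sample_sentences, output_file="elmo_vectors.txt")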
import csv
import os

import numpy as np
import pandas as pd
from flair.data import Sentence
from flair.embeddings import ELMoEmbeddings, FlairEmbeddings, TransformerWordEmbeddings


def main(directory, embeddings, strategy):
    # 1. find corpora in data directory
    corpora = {"train": None, "dev": None, "test": None}
    for labelset in corpora:
        for file in sorted(os.listdir(directory)):
            if infer_split(file) == labelset:
                corpora[labelset] = pd.read_csv(
                    os.path.join(directory, file),
                    sep="\t",
                    names=["text", "pos", "lemma", "label"],
                    engine="python",
                    error_bad_lines=False,
                    quoting=csv.QUOTE_NONE).fillna("")
                break

    # 2. pick the embedder
    if embeddings == "elmo":
        embedder = ELMoEmbeddings("original")
    elif embeddings == "flair":
        embedder = FlairEmbeddings("news-forward")
    elif embeddings == "bert":
        embedder = TransformerWordEmbeddings('bert-base-cased')

    embeddings_dir = os.path.join(directory, embeddings + "_embeddings")
    if not os.path.exists(embeddings_dir):
        os.makedirs(embeddings_dir, exist_ok=True)

    # 3. pooling strategy over the tokens of a vocabulary entry
    strategy = np.mean if strategy == "mean" else np.max if strategy == "max" else np.sum if strategy == "sum" else None

    # 4. embed each unique token and write word2vec-style text files
    for labelset, corpus in corpora.items():
        if corpus is None:
            print(f"empty corpus: {labelset}")
            continue
        voc = sorted(corpus["text"].unique())
        print(f"Unique tokens: {len(voc)}")
        with open(os.path.join(embeddings_dir, labelset + ".w2v"), "w") as f:
            for word in voc:
                sentence = Sentence(word)
                if len(sentence) == 0:
                    continue
                embedder.embed(sentence)
                token_embedding = strategy(
                    [token.embedding.cpu().numpy() for token in sentence], axis=0)
                f.write(word + " "
                        + " ".join([str(num) for num in token_embedding.tolist()]) + '\n')
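# Usage sketch, not from the original source. main() relies on an infer_split()
# helper that is not shown above; the version below is only a plausible stand-in
# that maps file names such as "train.tsv" to their split, and the directory and
# argument values are assumptions.
def infer_split(filename):
    base = os.path.basename(filename).lower()
    for split in ("train", "dev", "test"):
        if split in base:
            return split
    return None


if __name__ == "__main__":
    main("data/conll_corpus", embeddings="elmo", strategy="mean")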
import os
import pickle

from flair.data import Sentence
# BertEmbeddings and ELMoTransformerEmbeddings come from older flair releases
from flair.embeddings import (BertEmbeddings, ELMoEmbeddings, ELMoTransformerEmbeddings,
                              PooledFlairEmbeddings, StackedEmbeddings)


def use_flair_to_extract_context_embeddings(file, dest_folder, embedding_type, embedding_size,
                                            pretrained_model=None):
    if embedding_type.lower() == 'elmo':
        context_embedding = ELMoEmbeddings(model='pubmed')
    elif embedding_type.lower() == 'elmo_transformer':
        context_embedding = ELMoTransformerEmbeddings()
    elif embedding_type.lower() == 'flair':
        # note: PooledFlairEmbeddings usually expects a model name, e.g. 'news-forward'
        context_embedding = PooledFlairEmbeddings()
    elif embedding_type.lower() == 'bioflair':
        flair_1 = PooledFlairEmbeddings('pubmed-forward')
        flair_2 = PooledFlairEmbeddings('pubmed-backward')
        elmo = ELMoEmbeddings(model='pubmed')
        # bert = BertEmbeddings(bert_model_or_path='bert-base-multilingual-cased', layers='-1')
        context_embedding = StackedEmbeddings(embeddings=[flair_1, flair_2, elmo])
    elif embedding_type.lower() in ('biobert', 'bert'):
        context_embedding = BertEmbeddings(bert_model_or_path=pretrained_model, layers='-1')

    data = {}
    dest_name = os.path.basename(file).split('.')
    print(dest_folder)
    with open(file, 'r') as f, open('{}/{}.pickle'.format(dest_folder, dest_name[0]), 'wb') as d:
        sentence = ''
        instance = []
        j = 0
        for line in f.readlines():
            if line != '\n':
                # token-per-line input: the first column is the surface form
                sentence += ' ' + line.split()[0]
            else:
                sent = Sentence(sentence.strip())
                context_embedding.embed(sent)
                for token in sent:
                    instance.append((token.text, token.embedding[:embedding_size]))
                sentence = ''
                if instance:
                    data[j] = list(zip(*(instance.copy())))
                    j += 1
                    instance.clear()
        pickle.dump(data, d)
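# Usage sketch, not from the original source. It assumes a CoNLL-style input file
# (one token per line, blank line between sentences) and an existing destination
# folder; the paths and the 1024-dimension cut-off are illustrative values only.
use_flair_to_extract_context_embeddings(file='corpus/train.conll',
                                        dest_folder='embeddings_out',
                                        embedding_type='elmo',
                                        embedding_size=1024)

# Each pickle entry maps a sentence index to a (tokens, embeddings) pair.
with open('embeddings_out/train.pickle', 'rb') as handle:
    cached = pickle.load(handle)
print(len(cached), "sentences cached")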
from typing import List

from flair.data import Sentence, Token
from flair.embeddings import ELMoEmbeddings


def elmo_embeddings(sentences, tokenized_contents, output_file=None):
    if output_file:
        f = open(output_file, 'w')
    # init embedding
    # For English biomedical data you can use 'pubmed'
    embedding = ELMoEmbeddings('original')  # English 4096-hidden, 2 layers, 93.6M parameters
    for i, (sent, sent_tokens) in enumerate(zip(sentences, tokenized_contents)):
        print("Encoding the {}th input sentence for ELMO embedding!".format(i))
        # Getting the tokens from our own tokenized sentence!
        tokens: List[Token] = [Token(token) for token in sent_tokens]
        if len(tokens) != len(sent_tokens):
            raise ValueError("token length does not match sent_tokens length")
        # Create new empty sentence
        sentence = Sentence()
        # add our own tokens
        sentence.tokens = tokens
        embedding.embed(sentence)
        for token in sentence:
            if output_file:
                f.write(token.text + " "
                        + " ".join([str(num) for num in token.embedding.tolist()]) + '\n')
            else:
                print(token.text + " "
                      + " ".join([str(num) for num in token.embedding.tolist()]) + '\n')
        if output_file:
            # blank line separates sentences in the output file
            f.write('\n')
    if output_file:
        f.close()
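# Usage sketch, not from the original source: toy sentences with pre-computed
# token lists, and an assumed output file name.
sample_sentences = ["Aspirin reduces fever .", "The dose was increased ."]
sample_tokens = [["Aspirin", "reduces", "fever", "."],
                 ["The", "dose", "was", "increased", "."]]
elmo_embeddings(sample_sentences, sample_tokens, output_file="elmo_pretokenized.txt")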
import numpy as np
import torch
from flair.data import Sentence
from flair.embeddings import ELMoEmbeddings


class Database(object):
    def __init__(self, docs):
        # self.documents_orig = np.loadtxt(docs, delimiter='\n', dtype=str)  # only 9999 documents
        self.documents_orig = []
        with open(docs, 'r') as f:
            # getting 10k docs using this
            self.documents_orig = f.readlines()
        self.documents = []
        self.elmo = ELMoEmbeddings()
        # self.embedding = DocumentPoolEmbeddings([self.elmo])
        self.debug = True

    def knn(self, query, query_txt, k):
        # cos_sim = torch.mm(self.documents, query) / (torch.norm(query) * torch.norm(self.documents))
        cos_sim = torch.nn.functional.cosine_similarity(self.documents, query)
        topk, topk_indices = torch.topk(cos_sim, k, 0, True)
        topk_indices = topk_indices.numpy().astype('int')
        topk = topk.numpy().astype('float')
        top_combined = np.vstack((topk, topk_indices)).T
        if self.debug:
            print("\n")
            print("Query: ", query_txt, " | index: ", topk_indices.T)
            [print(self.documents_orig[int(i[1])], " --- ", i[0]) for i in top_combined]
        return list(zip(topk, topk_indices))  # used to return tuples

    def load_documents_into_embedding(self):
        print("Embedding ", len(self.documents_orig), " Documents")
        # self.documents_orig = self.documents_orig[0:50]
        self.documents = [
            self.elmo.embed(Sentence(elem)) for elem in self.documents_orig
        ]
        # keep the first token's embedding of each document as its vector
        self.documents = torch.stack([
            torch.cat([token.embedding.unsqueeze(0) for token in elem[0]], dim=0)[0]
            for elem in self.documents
        ])
        np.save("./documents_embedded.npy", self.documents)

    def run_query(self, query, k=None):
        """Run a query on the given documents based on word embeddings.

        Arguments:
            query {str} -- Query string.

        Keyword Arguments:
            k {int} -- The top documents to return (default: 10)

        Returns:
            list[tuple[float, int]] -- Sorted list of tuples, which contain the score
                and the document id. Made up example to show the formatting with k=5:
                [(0.89316645860672, 1567),
                 (0.6174346804618835, 125),
                 (0.5975501537321234, 1181),
                 (0.5779426293373108, 3979),
                 (0.5110726475715637, 7155)]
        """
        if k is None:
            k = 10
        sentence = Sentence(query)
        # self.embedding.embed(sentence)
        self.elmo.embed(sentence)
        # use the first token's embedding as the query vector
        sentence = [token.embedding.unsqueeze(0) for token in sentence][0]
        return self.knn(sentence, query, k=k)

    def run_query_txt(self, text):
        self.queries = np.loadtxt(text, delimiter='\n', dtype=str)
        results = []
        for query in self.queries:
            out = self.run_query(query)
            results.append(out)
        # saving results
        file = open("results.txt", 'w')
        for elem in results:
            out = ""
            for res in elem:
                out += str(res[0]) + "," + str(res[1]) + ";"
            out = out[:-1]
            out += '\n'
            file.write(out)
        file.close()
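# Usage sketch, not from the original source. "documents.txt" (one document per
# line) and "queries.txt" (one query per line) are assumed file names.
if __name__ == '__main__':
    db = Database("documents.txt")
    db.load_documents_into_embedding()
    db.run_query("contextual word embeddings", k=5)
    db.run_query_txt("queries.txt")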
)
parser.add_argument(
    '--lm_emb_save_path',
    default='./wv/elmo.emb.pkl',
    action='store',
)
args = parser.parse_args()

embedding = ELMoEmbeddings(args.model_name)

flag = args.dataset
dataset = []
with open(f'./datasets/unified/train.{flag}.json') as f:
    dataset += json.load(f)
with open(f'./datasets/unified/valid.{flag}.json') as f:
    dataset += json.load(f)
with open(f'./datasets/unified/test.{flag}.json') as f:
    dataset += json.load(f)

bert_emb_dict = {}
for item in tqdm(dataset):
    tokens = tuple(item['tokens'])
    s = form_sentence(tokens)
    embedding.embed(s)
    emb = get_embs(s)
    bert_emb_dict[tokens] = emb.astype('float16')

with open(args.lm_emb_save_path, 'wb') as f:
    pickle.dump(bert_emb_dict, f)
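# form_sentence() and get_embs() are referenced in the script above but not shown
# there. The helpers below are plausible stand-ins, not the original
# implementations: one builds a whitespace-tokenized flair Sentence from a token
# tuple, the other stacks the per-token vectors into a (num_tokens, dim) array.
import numpy as np
from flair.data import Sentence


def form_sentence(tokens):
    # keep the given tokenization by disabling flair's tokenizer
    return Sentence(' '.join(tokens), use_tokenizer=False)


def get_embs(sentence):
    # one row per token, moved to the CPU before stacking
    return np.stack([token.embedding.cpu().numpy() for token in sentence])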
import numpy as np
import pickle
from tqdm import tqdm

from flair.data import Sentence
from flair.embeddings import ELMoEmbeddings


def get_elmo(word2idx):
    """Returns an ELMo embedding matrix for the word tokens.

    Args:
        word2idx (dict): word to index pairs {'word': index}

    Returns:
        embedding_matrix (np.array): embedding matrix of shape
            (vocabulary_size + 1, embedding_dim=3072)
    """
    # vocabulary_size is expected to be defined in the enclosing scope
    embedding_matrix = np.zeros((vocabulary_size + 1, 3072))
    elmo = ELMoEmbeddings()
    for word, index in tqdm(word2idx.items()):
        try:
            word_ = Sentence(word)
            elmo.embed(word_)
            embedding_vector = word_[0].embedding.cpu().detach().numpy()
            embedding_matrix[index] = embedding_vector
        except KeyError:
            # fall back to random initialisation if the lookup fails
            embedding_matrix[index] = np.random.normal(0, np.sqrt(0.25), 3072)
    return embedding_matrix


# save embeddings (embedding_matrix is assumed to come from get_elmo(word2idx))
pickle_out = open("elmo_3072.pickle", "wb")
pickle.dump(embedding_matrix, pickle_out)
pickle_out.close()
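# End-to-end usage sketch, not from the original source: a toy vocabulary and an
# explicit global vocabulary_size (which get_elmo() reads) stand in for whatever
# the original script defined upstream.
word2idx = {"the": 1, "cat": 2, "sat": 3}
vocabulary_size = len(word2idx)
embedding_matrix = get_elmo(word2idx)
print(embedding_matrix.shape)  # (4, 3072)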