def dump_bert_vecs(df, dump_dir):
    print("Getting BERT vectors...")
    embedding = TransformerWordEmbeddings('bert-base-uncased')
    word_counter = defaultdict(int)
    stop_words = set(stopwords.words('english'))
    stop_words.add("would")
    except_counter = 0

    for index, row in df.iterrows():
        if index % 100 == 0:
            print("Finished sentences: " + str(index) + " out of " + str(len(df)))
        line = row["sentence"]
        sentences = sent_tokenize(line)
        for sentence_ind, sent in enumerate(sentences):
            flag = 0
            i = 0
            sentence = None
            while flag == 0:
                # Retry with a progressively shorter prefix (100 characters less
                # per attempt) if embedding fails, e.g. on over-long inputs.
                sentence = Sentence(sent[:(len(sent) - i * 100)], use_tokenizer=True)
                try:
                    embedding.embed(sentence)
                    flag = 1
                except Exception as e:
                    except_counter += 1
                    print("Length of sentence: ", len(sent) - i * 100)
                    print("Exception Counter while getting BERT: ", except_counter, sentence_ind, index, e)
                    i += 1
            if sentence is None or len(sentence) == 0:
                print("Length of sentence is 0: ", index)
            for token_ind, token in enumerate(sentence):
                word = token.text
                word = word.translate(str.maketrans('', '', string.punctuation))
                if word in stop_words or "/" in word or len(word) == 0:
                    continue
                # One directory per word; each occurrence gets its own pickle.
                word_dump_dir = dump_dir + word
                os.makedirs(word_dump_dir, exist_ok=True)
                fname = word_dump_dir + "/" + str(word_counter[word]) + ".pkl"
                word_counter[word] += 1
                vec = token.embedding.cpu().numpy()
                try:
                    with open(fname, "wb") as handler:
                        pickle.dump(vec, handler)
                except Exception as e:
                    except_counter += 1
                    print("Exception Counter while dumping BERT: ", except_counter, sentence_ind, index, word, e)
def dump_bert_vecs(df, dump_dir):
    print("Getting BERT vectors...")
    embedding = TransformerWordEmbeddings('roberta-base', layers='-1')
    word_counter = defaultdict(int)
    stop_words = set(stopwords.words('english'))
    stop_words.add("would")
    except_counter = 0
    # word_cnt is assumed to be a precomputed {word: frequency} mapping defined
    # elsewhere; only words seen at least 10 times are dumped below.
    key = set(word_cnt.keys())

    for index, row in df.iterrows():
        # Record progress so an interrupted run can be resumed from this index.
        with open("progress.txt", "w+") as file1:
            file1.write(str(index))
        print(index)
        if index % 100 == 0:
            print("Finished sentences: " + str(index) + " out of " + str(len(df)))
        line = row["news"]
        sentences = sent_tokenize(line)
        for sentence_ind, sent in enumerate(sentences):
            sentence = Sentence(sent, use_tokenizer=True)
            try:
                embedding.embed(sentence)
            except Exception as e:
                except_counter += 1
                print("Exception Counter while getting BERT: ", except_counter, sentence_ind, index, e)
                continue
            for token_ind, token in enumerate(sentence):
                word = token.text
                word = word.translate(str.maketrans('', '', string.punctuation))
                if (word in stop_words or "/" in word or len(word) == 0
                        or word not in key or word_cnt[word] < 10):
                    continue
                word_dump_dir = dump_dir + word
                os.makedirs(word_dump_dir, exist_ok=True)
                fname = word_dump_dir + "/" + str(word_counter[word]) + ".pkl"
                word_counter[word] += 1
                vec = token.embedding.cpu().numpy()
                try:
                    with open(fname, "wb") as handler:
                        pickle.dump(vec, handler)
                except Exception as e:
                    except_counter += 1
                    print("Exception Counter while dumping BERT: ", except_counter, sentence_ind, index, word, e)
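# Sketch only: group the per-word vectors dumped above into cluster centers.
# The use of KMeans and the fixed k=2 are assumptions, not taken from these
# snippets; the only requirement implied by contextualize() below is that each
# word directory end up containing a cc.pkl holding an array of cluster centers.
import os
import pickle

import numpy as np
from sklearn.cluster import KMeans


def cluster_word_vecs(dump_dir, cluster_dump_dir, k=2):
    for word in os.listdir(dump_dir):
        word_dir = os.path.join(dump_dir, word)
        vecs = []
        for fname in os.listdir(word_dir):
            with open(os.path.join(word_dir, fname), "rb") as handler:
                vecs.append(pickle.load(handler))
        vecs = np.stack(vecs)
        # With fewer occurrences than clusters, keep a single center (the mean).
        if len(vecs) < k:
            cc = vecs.mean(axis=0, keepdims=True)
        else:
            cc = KMeans(n_clusters=k, random_state=0).fit(vecs).cluster_centers_
        word_cluster_dir = os.path.join(cluster_dump_dir, word)
        os.makedirs(word_cluster_dir, exist_ok=True)
        with open(os.path.join(word_cluster_dir, "cc.pkl"), "wb") as handler:
            pickle.dump(cc, handler)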
def test_transformer_weird_sentences():
    embeddings = TransformerWordEmbeddings('distilbert-base-uncased',
                                           layers='all',
                                           use_scalar_mix=True)

    sentence = Sentence("Hybrid mesons , qq ̄ states with an admixture")
    embeddings.embed(sentence)
    for token in sentence:
        assert len(token.get_embedding()) == 768

    sentence = Sentence("typical proportionalities of ∼ 1nmV − 1 [ 3,4 ] .")
    embeddings.embed(sentence)
    for token in sentence:
        assert len(token.get_embedding()) == 768
class WeVectorizer:

    def __init__(self, op_relations, vectorizer='spacy'):
        if vectorizer == 'spacy':
            self.vectorizer = en_core_web_md.load()
            # spaCy path: averaged word/entity vectors per relation.
            self.vectors = self._vectorizer_data(op_relations)
        else:
            self.vectorizer = TransformerWordEmbeddings('roberta-base')
            # Transformer path: Flair embedding of the sentence.
            self.vectors = self.vectorizer_data(op_relations)

    def _vectorizer_data(self, relations):
        vecs = []
        for sent_id, per_cand, org_cand, sent_raw in tqdm(relations):
            sent = sent_raw.strip("().\n")
            org = org_cand['text']
            per = per_cand['text']
            # Remove the two entity mentions before vectorizing the sentence.
            sent_clean = sent.replace(org, "").replace(per, "")
            vecs.append(self.vec_sent(sent_clean, per, org))
        return np.array(vecs)

    def vectorizer_data(self, relations):
        vecs = []
        for sent_id, per_cand, org_cand, sent_raw in tqdm(relations):
            sent = sent_raw.strip("().\n")
            sent = Sentence(sent)
            self.vectorizer.embed(sent)
            # Keep the embedding of the first token as the sentence representation.
            vecs.append(sent[0].embedding.cpu().detach().numpy())
        return np.array(vecs)

    def vec_sent(self, sent, per_candidate, org_candidate):
        toks = [
            t for t in self.vectorizer(sent)
            if not any([t.is_space, t.is_punct, t.is_stop, t.is_currency]) and t.has_vector
        ]
        sent_vecs = np.array([t.vector for t in toks]).mean(axis=0)
        per_vec = self.vectorize_ent(per_candidate)
        org_vec = self.vectorize_ent(org_candidate)
        return np.concatenate([sent_vecs, per_vec, org_vec])

    def vectorize_ent(self, org_candidate):
        return np.array([t.vector for t in self.vectorizer(org_candidate)]).mean(axis=0)
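# Sketch only: the expected shape of `op_relations`, inferred from the loop in
# WeVectorizer above -- an iterable of (sent_id, per_candidate, org_candidate,
# raw_sentence) tuples whose candidates are dicts with a 'text' field. The
# sample values are made up for illustration.
relations = [
    (0,
     {'text': 'Jane Doe'},
     {'text': 'Acme Corp'},
     "Jane Doe was appointed chief executive of Acme Corp.\n"),
]
vectorizer = WeVectorizer(relations, vectorizer='roberta')
print(vectorizer.vectors.shape)  # (1, embedding_dim)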
def test_transformer_word_embeddings_forward_language_ids():
    cos = torch.nn.CosineSimilarity(dim=0, eps=1e-10)

    sent_en = Sentence(["This", "is", "a", "sentence"], language_code="en")
    sent_de = Sentence(["Das", "ist", "ein", "Satz"], language_code="de")

    embeddings = TransformerWordEmbeddings("xlm-mlm-ende-1024", allow_long_sentences=False)
    embeddings.embed([sent_de, sent_en])

    expected_similarities = [
        0.7102344036102295,
        0.7598986625671387,
        0.7437312602996826,
        0.5584433674812317,
    ]

    for (token_de, token_en, exp_sim) in zip(sent_de, sent_en, expected_similarities):
        sim = cos(token_de.embedding, token_en.embedding).item()
        assert abs(exp_sim - sim) < 1e-5
def test_transformer_word_embeddings():
    embeddings = TransformerWordEmbeddings('distilbert-base-uncased',
                                           layers='-1,-2,-3,-4',
                                           layer_mean=False)

    sentence: Sentence = Sentence("I love Berlin")
    embeddings.embed(sentence)

    for token in sentence.tokens:
        assert len(token.get_embedding()) == 3072
        token.clear_embeddings()
        assert len(token.get_embedding()) == 0

    embeddings = TransformerWordEmbeddings('distilbert-base-uncased',
                                           layers='all',
                                           layer_mean=False)
    embeddings.embed(sentence)

    for token in sentence.tokens:
        assert len(token.get_embedding()) == 5376
        token.clear_embeddings()
        assert len(token.get_embedding()) == 0

    del embeddings

    embeddings = TransformerWordEmbeddings('distilbert-base-uncased',
                                           layers='all',
                                           layer_mean=True)
    embeddings.embed(sentence)

    for token in sentence.tokens:
        assert len(token.get_embedding()) == 768
        token.clear_embeddings()
        assert len(token.get_embedding()) == 0

    del embeddings
def test_transformer_weird_sentences():
    embeddings = TransformerWordEmbeddings('distilbert-base-uncased',
                                           layers='all',
                                           layer_mean=True)

    sentence = Sentence("Hybrid mesons , qq ̄ states with an admixture")
    embeddings.embed(sentence)
    for token in sentence:
        assert len(token.get_embedding()) == 768

    sentence = Sentence("typical proportionalities of ∼ 1nmV − 1 [ 3,4 ] .")
    embeddings.embed(sentence)
    for token in sentence:
        assert len(token.get_embedding()) == 768

    sentence = Sentence("🤟 🤟 🤟 hüllo")
    embeddings.embed(sentence)
    for token in sentence:
        assert len(token.get_embedding()) == 768

    sentence = Sentence("🤟hallo 🤟 🤟 🤟 🤟")
    embeddings.embed(sentence)
    for token in sentence:
        assert len(token.get_embedding()) == 768

    sentence = Sentence("🤟hallo 🤟 🤟 🤟 🤟")
    embeddings.embed(sentence)
    for token in sentence:
        assert len(token.get_embedding()) == 768

    sentence = Sentence("🤟")
    embeddings.embed(sentence)
    for token in sentence:
        assert len(token.get_embedding()) == 768

    sentence = Sentence("🤟")
    sentence_2 = Sentence("second sentence")
    embeddings.embed([sentence, sentence_2])
    for token in sentence:
        assert len(token.get_embedding()) == 768
    for token in sentence_2:
        assert len(token.get_embedding()) == 768
def contextualize(df, cluster_dump_dir):
    def get_cluster(tok_vec, cc):
        # Return the index of the closest cluster center by cosine similarity.
        max_sim = -10
        max_sim_id = -1
        for i, cluster_center in enumerate(cc):
            sim = cosine_similarity(tok_vec, cluster_center)
            if sim > max_sim:
                max_sim = sim
                max_sim_id = i
        return max_sim_id

    print("Contextualizing the corpus..")
    embedding = TransformerWordEmbeddings('bert-base-uncased')
    stop_words = set(stopwords.words('english'))
    stop_words.add('would')
    except_counter = 0
    word_cluster = {}

    for index, row in df.iterrows():
        if index % 100 == 0:
            print("Finished rows: " + str(index) + " out of " + str(len(df)))
        line = row["sentence"]
        sentences = sent_tokenize(line)
        for sentence_ind, sent in enumerate(sentences):
            sentence = Sentence(sent, use_tokenizer=True)
            embedding.embed(sentence)
            for token_ind, token in enumerate(sentence):
                word = token.text
                if word in stop_words:
                    continue
                word_clean = word.translate(str.maketrans('', '', string.punctuation))
                if len(word_clean) == 0 or word_clean in stop_words or "/" in word_clean:
                    continue
                # Look up the word's cluster centers, first in the in-memory
                # cache, then in the on-disk cc.pkl files.
                try:
                    cc = word_cluster[word_clean]
                except KeyError:
                    try:
                        cc = word_cluster[word]
                    except KeyError:
                        word_clean_path = cluster_dump_dir + word_clean + "/cc.pkl"
                        word_path = cluster_dump_dir + word + "/cc.pkl"
                        try:
                            with open(word_clean_path, "rb") as handler:
                                cc = pickle.load(handler)
                            word_cluster[word_clean] = cc
                        except Exception:
                            try:
                                with open(word_path, "rb") as handler:
                                    cc = pickle.load(handler)
                                word_cluster[word] = cc
                            except Exception as e:
                                except_counter += 1
                                print("Exception Counter while getting clusters: ", except_counter, index, e)
                                continue
                if len(cc) > 1:
                    # Rename the token to word$cluster_id, e.g. "apple$1".
                    tok_vec = token.embedding.cpu().numpy()
                    cluster = get_cluster(tok_vec, cc)
                    sentence.tokens[token_ind].text = word + "$" + str(cluster)
            sentences[sentence_ind] = to_tokenized_string(sentence)
        df["sentence"][index] = " . ".join(sentences)

    return df, word_cluster
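# Sketch only: how these pieces could be chained end to end. dump_bert_vecs()
# (first variant, which reads a 'sentence' column) and contextualize() come from
# the snippets in this collection; cluster_word_vecs() is the illustrative
# helper sketched earlier, and the file/directory names are assumptions.
import pandas as pd

df = pd.read_pickle("df.pkl")                      # needs a 'sentence' column
dump_bert_vecs(df, dump_dir="./bert_vecs/")        # one folder of .pkl vectors per word
cluster_word_vecs("./bert_vecs/", "./clusters/")   # writes cc.pkl per word
df, word_cluster = contextualize(df, cluster_dump_dir="./clusters/")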
import numpy as np

from flair.data import Sentence
from flair.embeddings import WordEmbeddings, FlairEmbeddings, TransformerWordEmbeddings

embedding = TransformerWordEmbeddings('allenai/scibert_scivocab_uncased')

f = open('./data/Sameas_Hyp_Entire_Test.txt')
content = f.read()
f1 = open('./data/Final_big_test.txt', 'w')
f1.write('sentences1' + "\t" + 'type1' + '\tsentences2\t' + 'type2' + "\tis_similar" + "\n")

for line in content.split("\n"):
    if line != "":
        if line.split("\t")[0] != 'sentences1':
            cols = line.split("\t")

            sentence1 = Sentence(cols[0])
            embedding.embed(sentence1)
            avg1 = []
            for token in sentence1:
                # print(token.embedding)
                avg1.append(token.embedding)
            avg1 = sum(avg1)

            sentence2 = Sentence(cols[2])
            embedding.embed(sentence2)
            avg2 = []
            for token in sentence2:
                avg2.append(token.embedding)
            avg2 = sum(avg2)

            # The original snippet breaks off mid-expression here. The lines
            # below are an assumed completion: cosine similarity between the
            # summed token embeddings, thresholded at 0.5 to fill the
            # is_similar column. The threshold and label format are guesses.
            a1 = avg1.cpu().numpy()
            a2 = avg2.cpu().numpy()
            sim = float(np.dot(a1, a2) / (np.linalg.norm(a1) * np.linalg.norm(a2)))
            f1.write(cols[0] + "\t" + cols[1] + "\t" + cols[2] + "\t" + cols[3] + "\t" +
                     ("1" if sim >= 0.5 else "0") + "\n")

f.close()
f1.close()
class VWSDataset(Dataset):

    def __init__(self, args, name, asp_word2idx, selected_idx=None, need_neg_senti=False):
        self.asp_word2idx = asp_word2idx
        self.need_neg_senti = need_neg_senti
        self.args = args
        self.embedding = TransformerWordEmbeddings('bert-base-uncased', layers='-1')
        if name == 'train':
            self.load_corpus_with_NULL_ITEM(os.path.join(args.data_dir, args.train), 'train',
                                            selected_idx, filter_null=args.unsupervised)
        elif name == 'dev':
            self.load_corpus_with_NULL_ITEM(os.path.join(args.data_dir, args.dev), 'dev', selected_idx)
        elif name == 'test':
            self.load_corpus_with_NULL_ITEM(os.path.join(args.data_dir, args.test), 'test', selected_idx)
        else:
            raise NotImplementedError
        self.len = len(self.corpus_y)

        # print('-'*50)
        # t0 = time.time()
        # batch_x, batch_y, batch_senti, batch_neg_senti, batch_weight = self.create_one_batch_new_version(np.arange(64))
        # t1 = time.time()
        # print("{:.4f}".format(t1-t0))
        # batch_x = []
        # batch_y = []
        # batch_senti = []
        # batch_neg_senti = []
        # batch_weight = []
        # print('-'*50)
        # t0 = time.time()
        # for idx in np.arange(64):
        #     token_emb, y, senti_, neg_senti_, weight_ = self.create_one_batch(idx)
        #     batch_x.append(token_emb)
        #     batch_y.append(y)
        #     batch_senti.append(senti_)
        #     batch_neg_senti.append(neg_senti_)
        #     batch_weight.append(weight_)
        # t1 = time.time()
        # print("{:.4f}".format(t1-t0))

    def load_corpus(self, path, name, selected_idx=None, filter_null=False):
        args = self.args
        with open(path, "r", encoding="iso-8859-1") as fh:
            lines = fh.readlines()
        if selected_idx is None:
            segs = [line.strip().split('\t\t\t') for line in lines]
        else:
            segs = [line.strip().split('\t\t\t')
                    for line_id, line in enumerate(lines) if line_id in selected_idx]
        corpus_x = [seg[2] for seg in segs]
        keywords = [seg[1].split('\t') for seg in segs]
        senti = []
        weight = []
        valid = []
        senti_words = WordDict()
        for idx, sample in enumerate(keywords):
            sample_weight = []
            sample_senti = []
            sample_valid = False
            for i in range(len(sample)):
                senti_ = sample[i]
                wei_ = 1.
                if " no" in senti_:
                    senti_ = senti_.split()[0].strip()
                    wei_ = -1.
                if senti_ in self.asp_word2idx:
                    sample_senti.append(self.asp_word2idx[senti_])
                    senti_words.add(self.asp_word2idx[senti_])
                    sample_weight.append(wei_)
                    sample_valid = True
            senti.append(sample_senti)
            weight.append(sample_weight)
            valid.append(sample_valid)
        corpus_y = [int(seg[0]) - 1 for seg in segs]
        assert len(corpus_x) == len(corpus_y)
        if filter_null:
            corpus_x = [corpus_x[i] for i, v in enumerate(valid) if v is True]
            corpus_y = [corpus_y[i] for i, v in enumerate(valid) if v is True]
            senti = [senti[i] for i, v in enumerate(valid) if v is True]
            weight = [weight[i] for i, v in enumerate(valid) if v is True]
        print(name, len(corpus_x))
        self.corpus_x = []
        for text in tqdm(corpus_x):
            if len(text.split(' ')) > args.max_len:
                text = ' '.join(text.split(' ')[:args.max_len])
            token_emb = []
            # create a sentence
            sentence = Sentence(text)
            # embed words in sentence
            self.embedding.embed(sentence)
            for i, token in enumerate(sentence):
                if i >= args.max_len:
                    break
                token_emb.append(token.embedding)
            if len(sentence) < args.max_len:
                for i in range(args.max_len - len(sentence)):
                    token_emb.append(torch.zeros_like(token.embedding))
            token_emb = torch.stack(token_emb, dim=0)
            self.corpus_x.append(token_emb)
        self.corpus_y = corpus_y
        self.senti = senti
        self.weight = weight
        self.senti_words = senti_words

    def load_corpus_with_NULL_ITEM(self, path, name, selected_idx=None, filter_null=False):
        args = self.args
        with open(path, "r", encoding="iso-8859-1") as fh:
            lines = fh.readlines()
        if selected_idx is None:
            segs = [line.strip().split('\t\t\t') for line in lines]
        else:
            segs = [line.strip().split('\t\t\t')
                    for line_id, line in enumerate(lines) if line_id in selected_idx]
        corpus_x = [seg[2] for seg in segs]
        asp_senti = [seg[1].split('\t') for seg in segs]
        senti = []
        weight = []
        valid = []
        senti_words = WordDict()
        for idx, sample in enumerate(asp_senti):
            sample_weight = []
            sample_senti = []
            sample_valid = False
            for i in range(len(sample) // 2):
                asp_ = sample[2 * i]
                senti_ = sample[2 * i + 1]
                wei_ = 1.
                if " no" in senti_:
                    senti_ = senti_.split()[0].strip()
                    wei_ = -1.
                if senti_ in self.asp_word2idx:
                    sample_senti.append(self.asp_word2idx[senti_])
                    senti_words.add(self.asp_word2idx[senti_])
                    sample_weight.append(wei_)
                    sample_valid = True
            senti.append(sample_senti)
            weight.append(sample_weight)
            valid.append(sample_valid)
        corpus_y = [int(seg[0]) - 1 for seg in segs]
        assert len(corpus_x) == len(corpus_y)
        if filter_null:
            corpus_x = [corpus_x[i] for i, v in enumerate(valid) if v is True]
            corpus_y = [corpus_y[i] for i, v in enumerate(valid) if v is True]
            senti = [senti[i] for i, v in enumerate(valid) if v is True]
            weight = [weight[i] for i, v in enumerate(valid) if v is True]
        print(name, len(corpus_x))
        self.corpus_x = []
        for text in tqdm(corpus_x):
            if len(text.split(' ')) > args.max_len:
                text = ' '.join(text.split(' ')[:args.max_len])
            token_emb = []
            # create a sentence
            sentence = Sentence(text)
            # embed words in sentence
            self.embedding.embed(sentence)
            for i, token in enumerate(sentence):
                if i >= args.max_len:
                    break
                token_emb.append(token.embedding)
            if len(sentence) < args.max_len:
                for i in range(args.max_len - len(sentence)):
                    token_emb.append(torch.zeros_like(token.embedding))
            token_emb = torch.stack(token_emb, dim=0)
            self.corpus_x.append(token_emb)
        self.corpus_y = corpus_y
        self.senti = senti
        self.weight = weight
        self.senti_words = senti_words

    def create_one_batch(self, idx):
        args = self.args
        need_neg_senti = self.need_neg_senti
        senti = self.senti
        weight = self.weight
        senti_words = self.senti_words
        # batch_y = np.asarray([np.eye(args.score_scale)[y[i]] if y[i] >= 0 else np.zeros(
        #     args.score_scale) for i in idx], dtype=np.float32)
        neg_senti_ = []
        senti_count = senti_words.count
        words = []
        p = []
        # Sample aspect-sentiment words with probability proportional to count ** -0.25.
        for ii, word in enumerate(senti[idx]):
            if word in senti_count:
                words.append(ii)
                p.append(senti_count[word] ** -0.25)
        if len(p) > 0:
            total = sum(p)
            p = [k / total for k in p]
            ran_val = np.random.choice(words, args.num_senti, p=p)
            senti_ = [senti[idx][val] for val in ran_val]
            weight_ = [weight[idx][val] for val in ran_val]
            if need_neg_senti:
                neg_senti_ = []
                for _ in range(args.num_neg):
                    rand_senti = senti_[0]
                    while rand_senti in senti_:
                        rand_senti = senti_words.sample(min_count=args.min_count)
                    neg_senti_.append(rand_senti)
        else:
            # a review has no extracted tuples
            senti_ = [0 for _ in range(args.num_senti)]
            weight_ = [0. for _ in range(args.num_senti)]
            if need_neg_senti:
                neg_senti_ = [int(0) for _ in range(args.num_neg)]
        return self.corpus_x[idx], self.corpus_y[idx], senti_, neg_senti_, weight_

    def create_one_batch_new_version(self, idxs):
        args = self.args
        need_neg_senti = self.need_neg_senti
        senti = self.senti
        weight = self.weight
        senti_words = self.senti_words
        batch_x = []
        batch_y = []
        batch_senti = []
        batch_neg_senti = []
        batch_weight = []
        # batch_y = np.asarray([np.eye(args.score_scale)[y[i]] if y[i] >= 0 else np.zeros(
        #     args.score_scale) for i in idx], dtype=np.float32)
        for idx in idxs:
            neg_senti_ = []
            senti_count = senti_words.count
            words = []
            p = []
            for ii, word in enumerate(senti[idx]):
                if word in senti_count:
                    words.append(ii)
                    p.append(senti_count[word] ** -0.25)
            if len(p) > 0:
                total = sum(p)
                p = [k / total for k in p]
                ran_val = np.random.choice(words, args.num_senti, p=p)
                senti_ = [senti[idx][val] for val in ran_val]
                weight_ = [weight[idx][val] for val in ran_val]
                if need_neg_senti:
                    neg_senti_ = []
                    for _ in range(args.num_neg):
                        rand_senti = senti_[0]
                        while rand_senti in senti_:
                            rand_senti = senti_words.sample(min_count=args.min_count)
                        neg_senti_.append(rand_senti)
            else:
                # a review has no extracted tuples
                senti_ = [0 for _ in range(args.num_senti)]
                weight_ = [0. for _ in range(args.num_senti)]
                if need_neg_senti:
                    neg_senti_ = [int(0) for _ in range(args.num_neg)]
            token_emb = []
            # create a sentence
            sentence = Sentence(self.corpus_x[idx])
            # embed words in sentence
            self.embedding.embed(sentence)
            for ii, token in enumerate(sentence):
                if ii >= args.max_len:
                    break
                token_emb.append(token.embedding)
            if len(sentence) < args.max_len:
                for ii in range(args.max_len - len(sentence)):
                    token_emb.append(torch.zeros_like(token.embedding))
            token_emb = torch.stack(token_emb, dim=0)
            batch_x.append(token_emb)
            batch_y.append(self.corpus_y[idx])
            batch_senti.append(senti_)
            batch_neg_senti.append(neg_senti_)
            batch_weight.append(weight_)
        # Note: this returns only the last example processed; the accumulated
        # batch_* lists above are not returned.
        return token_emb, self.corpus_y[idx], senti_, neg_senti_, weight_

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        batch_x, batch_y, batch_senti, batch_neg_senti, batch_weight = self.create_one_batch(idx)
        return batch_x, batch_y, batch_senti, batch_neg_senti, batch_weight
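# Sketch only: feeding VWSDataset to a torch DataLoader. Every argument value
# below (paths, hyper-parameters, the aspect vocabulary) is a placeholder, not
# taken from the snippets above.
from argparse import Namespace

from torch.utils.data import DataLoader

args = Namespace(data_dir='./data', train='train.txt', dev='dev.txt', test='test.txt',
                 unsupervised=False, max_len=128, num_senti=3, num_neg=5, min_count=5)
asp_word2idx = {'food': 0, 'service': 1, 'price': 2}

train_set = VWSDataset(args, 'train', asp_word2idx, need_neg_senti=True)
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)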
def bert_embeddings(sentences, tokenized_contents, output_file=None):
    # Use the BERT tokenizer to check whether a sequence exceeds 512 wordpiece tokens.
    bert_tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
    if output_file:
        f = open(output_file, 'w')

    # init embedding (bert-large-uncased)
    bert_embedding = TransformerWordEmbeddings('bert-large-uncased')

    long_sent = False
    for i, (sent, sent_tokens) in enumerate(zip(sentences, tokenized_contents)):
        print("Encoding the {}th input sentence for BERT embedding!".format(i))
        # If the wordpiece-tokenized sentence is too long, split it in half and
        # embed the two halves separately.
        if len(bert_tokenizer.tokenize(sent[0])) >= 510:
            long_sent = True
            truncated_tokens = sent_tokens[:len(sent_tokens) // 2]
            sent_tokens = sent_tokens[len(sent_tokens) // 2:]

        # Using our own tokens (our own tokenization)
        tokens: List[Token] = [Token(token) for token in sent_tokens]
        # create an empty sentence and add tokens from our own tokenization
        sentence = Sentence()
        sentence.tokens = tokens
        bert_embedding.embed(sentence)
        for j, (token, st) in enumerate(zip(sentence, sent_tokens)):
            if token.text != st:
                raise ValueError("Invalid token text")
            if output_file:
                f.write(token.text + " " +
                        " ".join([str(num) for num in token.embedding.tolist()]) + '\n')
            else:
                print(token.text + " " +
                      " ".join([str(num) for num in token.embedding.tolist()]) + '\n')

        if long_sent:
            # tokenization for the remaining half of the sentence
            truncated_tokens: List[Token] = [Token(token) for token in truncated_tokens]
            # create an empty sentence and add tokens from our own tokenization
            truncated_sentence = Sentence()
            truncated_sentence.tokens = truncated_tokens
            bert_embedding.embed(truncated_sentence)
            for token in truncated_sentence:
                if output_file:
                    f.write(token.text + " " +
                            " ".join([str(num) for num in token.embedding.tolist()]) + '\n')
                else:
                    print(token.text + " " +
                          " ".join([str(num) for num in token.embedding.tolist()]) + '\n')
            long_sent = False

        if output_file:
            f.write('\n')

    if output_file:
        f.close()
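# Sketch only: calling bert_embeddings() with the input shapes implied above --
# `sentences` is a list whose items expose the raw text at index 0, and
# `tokenized_contents` holds the matching pre-tokenized word lists. The sample
# data and output path are made up for illustration.
sentences = [["Flair wraps transformer models behind a single embedding API ."]]
tokenized_contents = [["Flair", "wraps", "transformer", "models", "behind",
                       "a", "single", "embedding", "API", "."]]
bert_embeddings(sentences, tokenized_contents, output_file="embeddings.txt")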