"abstract": pubs.iloc[idx]["Abstract"], "abstract_length": pubs.iloc[idx]["Abstract Length"], "word_count": pubs.iloc[idx]["Word Count"] }) return results #process text, create model, and sentences pubs['Text Processed'] = pubs.apply(lambda row: preprocess_text(row['Text']), axis=1) pubs['Word Count'] = pubs.apply(lambda row: len(row['Text Processed'].split()), axis=1) text_df = pubs[[ 'Text Processed', ]].copy() embedder = SentenceTransformer('bert-base-nli-mean-tokens') sentences = list(text_df['Text Processed']) # Eaxmple query sentences queries = [ 'How to evolve architecture for constellations and simulation', 'Build behavior of complex aerospace and modeling of safety' ] text_embeddings = embedder.encode(sentences, show_progress_bar=True) # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity print("\nTop 5 most similar sentences in corpus:") for query in queries: pprint(get_search_result(embedder, text_embeddings, query, closest_n=5))
class DistilRobertaQA2Vec(BaseQA2Vec):
    definition = DistilRobertaQAModelDefinition

    def __init__(self):
        self.model = SentenceTransformer('distilroberta-base-msmarco-v1')
        self.vector_length = 768

    @property
    def __name__(self):
        return "distilroberta_qa"

    @catch_vector_errors
    def encode_question(self, question: str, context: str = None):
        return self.model.encode(["[QRY] " + question])[0].tolist()

    @catch_vector_errors
    def bulk_encode_question(self, questions: list):
        return [self.encode_question(q) for q in questions]

    @catch_vector_errors
    def encode_answer(self, answer: str, context: str = None):
        return self.model.encode(["[DOC] " + answer])[0].tolist()

    @catch_vector_errors
    def bulk_encode_answers(self, answers: List[str]):
        return [self.encode_answer(a) for a in answers]

    @catch_vector_errors
    def encode(self, string: str, context_string: str = None, string_type: str = 'answer'):
        """
            Encode a question or an answer with the DistilRoberta QA model.
            Args:
                string: Any string.
                context_string: The context of the string.
                string_type: 'question' or 'answer'.
            Example:
                >>> model = DistilRobertaQA2Vec()
                >>> model.encode("Why?", string_type='answer')
        """
        if string_type.lower() == 'answer':
            return self.encode_answer(string, context=context_string)
        elif string_type.lower() == 'question':
            return self.encode_question(string, context=context_string)

    @catch_vector_errors
    def bulk_encode(self, strings: List[str], context_strings: List[str] = None, string_type: str = 'answer'):
        """
            Bulk encode questions or answers with the DistilRoberta QA model.
            Args:
                strings: List of strings.
                context_strings: List of contexts for the strings.
                string_type: 'question' or 'answer'.
            Example:
                >>> model = DistilRobertaQA2Vec()
                >>> model.bulk_encode(["Why?"], string_type='answer')
        """
        if context_strings is not None:
            return [
                self.encode(x, context_strings[i], string_type=string_type)
                for i, x in enumerate(strings)
            ]
        return [self.encode(x, string_type=string_type) for x in strings]
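# Hypothetical usage of DistilRobertaQA2Vec above (not part of the original
# class). The similarity check is plain numpy cosine similarity.
import numpy as np

qa_model = DistilRobertaQA2Vec()
q_vec = np.array(qa_model.encode_question("What is the capital of France?"))
a_vec = np.array(qa_model.encode_answer("Paris is the capital and largest city of France."))
cosine = float(q_vec @ a_vec / (np.linalg.norm(q_vec) * np.linalg.norm(a_vec)))
print(f"question-answer cosine similarity: {cosine:.3f}")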
input_path_cases = os.listdir('cleaned_files/')
model_sum = []
true_sum = []
embedder = SentenceTransformer('distiluse-base-multilingual-cased')
root = "cleaned_files"
for file in input_path_cases:
    file_path = os.path.join(root, file)
    if '.txt' in file_path:
        with open(file_path) as f:
            corpus = f.read()
        res = corpus.split("@summary ")
        text, corpus_summary = res[0], res[1]
        legal_corpus = text.split(" , ")
        sentence_embeddings = embedder.encode(legal_corpus)
        enc_embedding = sentence_embeddings
        # Cluster the sentence embeddings; roughly sqrt(n) clusters
        n_clusters = int(np.ceil(len(enc_embedding)**0.5))
        kmeans = KMeans(n_clusters=n_clusters, random_state=0)
        kmeans = kmeans.fit(enc_embedding)
        avg = []
        closest = []
        for j in range(n_clusters):
            idx = np.where(kmeans.labels_ == j)[0]
            avg.append(np.mean(idx))
        closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_,
                                                   enc_embedding)
        ordering = sorted(range(n_clusters), key=lambda k: avg[k])
        gen_summary = ' '.join([legal_corpus[closest[idx]] for idx in ordering])
        model_sum.append(gen_summary)
        true_sum.append(corpus_summary)
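# A minimal evaluation sketch (not in the original): score the generated
# summaries against the reference summaries with ROUGE, assuming the `rouge`
# package used elsewhere in this collection is available.
from rouge import Rouge

rouge = Rouge()
print(rouge.get_scores(model_sum, true_sum, avg=True))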
def distilroberta(data):
    sbert_model = SentenceTransformer('distilroberta-base-paraphrase-v1')
    return sbert_model.encode(data.tolist())
            continue
        print('\n')
        print(f'Document: {documents_df.iloc[ix]["documents"]}')
        print(f'{matrix} : {similarity_matrix[doc_id][ix]}')

documents_df = pd.DataFrame(titles, columns=['documents'])
stop_words_l = stopwords.words('english')
documents_df['documents_cleaned'] = documents_df.documents.apply(
    lambda x: " ".join(
        re.sub(r'[^a-zA-Z]', ' ', w).lower() for w in x.split()
        if re.sub(r'[^a-zA-Z]', ' ', w).lower() not in stop_words_l))

sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

start = time.time()
document_embeddings = sbert_model.encode(documents_df['documents_cleaned'])
stop = time.time()
print(stop - start)
# print(document_embeddings)
# pairwise_similarities = cosine_similarity(document_embeddings)
# pairwise_differences = euclidean_distances(document_embeddings)
# most_similar(0, pairwise_similarities, 'Cosine Similarity')
# most_similar(0, pairwise_differences, 'Euclidean Distance')

while True:
    query = input("Please enter your search query : ")
    query_embedding = sbert_model.encode(query)
    maxcosineSimilarity = 0
    document_id = 0
    docfoundat = 0
    # print(query_embedding)
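    # The search loop is truncated in the original; a minimal sketch of how the
    # per-document scan could continue, assuming sklearn's cosine_similarity is
    # imported (it is referenced in the commented-out lines above). The exact
    # original logic is unknown; variable names follow the snippet above.
    for doc_id in range(len(document_embeddings)):
        similarity = cosine_similarity([query_embedding],
                                       [document_embeddings[doc_id]])[0][0]
        if similarity > maxcosineSimilarity:
            maxcosineSimilarity = similarity
            docfoundat = doc_id
    print(f"Best match (score {maxcosineSimilarity:.3f}): "
          f"{documents_df.iloc[docfoundat]['documents']}")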
def personalization(event_id): """ Generate event's vector representation, generate cosine similarity, and store to databasetriggered on event created/updated Args: event_id: id of the event (integer) """ event_description = "" session = DBSession() try: # query event info query_event = session.query(Event).filter( Event.id == event_id).scalar() if query_event is not None: event_description = query_event.description else: raise Exception("Event not found") # convert event description it into vector form sbert_model = SentenceTransformer("src/stsb-roberta-large-model") sentence_embeddings = sbert_model.encode(event_description, show_progress_bar=True) # store event vector into db event_vector = (session.query(EventRecommendation).filter_by( event_id=event_id).first()) if not event_vector: event_vector = EventRecommendation( event_id=event_id, event_vector=sentence_embeddings.tolist()) session.add(event_vector) else: event_vector.event_vector = sentence_embeddings.tolist() session.commit() except: session.rollback() raise finally: session.close() session = DBSession() try: # query all events, events_tag, events_vector from DB tags_ids = [] events_ids = [] vectors = [] events_vectors = (session.query(EventRecommendation).join( EventTag, EventTag.event_id == EventRecommendation.event_id).group_by( EventTag.event_id).with_entities( func.max(EventRecommendation.event_id), func.max(EventRecommendation.event_vector), func.array_agg(EventTag.tag_id), )) if events_vectors.count() == 0: raise Exception("No events available to generate recommendation") for item in events_vectors: delimiter = "," tags = map(str, item[2]) tags_ids.append(delimiter.join(tags)) events_ids.append(item[0]) vectors.append(item[1]) # calculate tf-idf vectorizer = TfidfVectorizer(tokenizer=tokenizer) tf_idf_sparce_array = vectorizer.fit_transform(tags_ids) tf_idf_feature = tf_idf_sparce_array.toarray() # calculate cosine similarity of tf-idf cosine_sim_des_tags = linear_kernel(tf_idf_feature, tf_idf_feature) # calculate cosine similarity of event vector cosine_sim_des_descriptions = cosine_similarity(vectors, vectors) # combine tf-idf with event vector cosine_result = np.mean( [cosine_sim_des_tags, cosine_sim_des_descriptions], axis=0) # sort cosine similarity k_highest_score = [] k = 50 for item in cosine_result: index_of_score = np.argsort(item)[-k:] index_of_score = np.flip(index_of_score) result = np.array( [np.array(events_ids)[index_of_score], item[index_of_score]]).T k_highest_score.append(result) # store score to db for score in k_highest_score: score_result = {} for item in score[1:]: score_result[int(item[0])] = item[1] event_vector = (session.query(EventRecommendation).filter_by( event_id=int(score[0][0])).first()) event_vector.score = score_result session.commit() except: session.rollback() raise finally: session.close()
async def main(req: func.HttpRequest) -> func.HttpResponse: client = GraphqlClient(endpoint="http://localhost:4000/graphql") query = """ query($id: String!, $isChoiceBased: Boolean!) { question(where: { id: $id }) { name elements { choice { label details } answer { label details } primaryEmbedding(isChoiceBased: $isChoiceBased) } } } """ result = list() wordList = list() queryResult = list( filter( filterQuestions, client.execute(query=query, variables={ "id": req.params.get('id'), "isChoiceBased": True if req.params.get('isChoiceBased') == "true" else False })["data"]["question"]["elements"])) for o in queryResult: label = o["choice" if req.params.get('isChoiceBased') == "true" else "answer"]["details" if req.params. get('isTextEntry') == "true" else "label"] if label not in wordList: wordList.append(label) result.append(o) corpus_embeddings = [ torch.FloatTensor(o["primaryEmbedding"]) for o in result ] query = req.params.get('query') if not query: try: req_body = req.get_json() except ValueError: pass else: query = req_body.get('query') if query: embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens') query_embedding = embedder.encode(query, convert_to_tensor=True) searched = util.semantic_search(query_embedding, corpus_embeddings)[0] ret = [] for o in searched: item = result[o["corpus_id"]] item["score"] = o["score"].astype(float) del item["primaryEmbedding"] ret.append(item) return func.HttpResponse(json.dumps(ret)) else: return func.HttpResponse( "This HTTP triggered function executed successfully. Pass a name in the query string or in the request body for a personalized response.", status_code=200)
# The provided file encoded the passages with the model 'msmarco-distilbert-base-v2'
if model_name == 'msmarco-distilbert-base-v2':
    embeddings_filepath = 'simplewiki-2020-11-01-msmarco-distilbert-base-v2.pt'
    if not os.path.exists(embeddings_filepath):
        util.http_get(
            'http://sbert.net/datasets/simplewiki-2020-11-01-msmarco-distilbert-base-v2.pt',
            embeddings_filepath)

    corpus_embeddings = torch.load(embeddings_filepath)
    corpus_embeddings = corpus_embeddings.float()  # Convert embedding file to float
    if torch.cuda.is_available():
        corpus_embeddings = corpus_embeddings.to('cuda')
else:
    # Here, we compute the corpus_embeddings from scratch (which can take a while depending on the GPU)
    corpus_embeddings = bi_encoder.encode(passages,
                                          convert_to_tensor=True,
                                          show_progress_bar=True)

while True:
    query = input("Please enter a question: ")

    # Encode the query using the bi-encoder and find potentially relevant passages
    start_time = time.time()
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # Get the hits for the first query

    # Now, score all retrieved passages with the cross_encoder
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
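    # The snippet ends here in the original; a sketch of the re-ranking step the
    # comment above sets up. It assumes a CrossEncoder named `cross_encoder` was
    # created alongside `bi_encoder` earlier in the script.
    cross_scores = cross_encoder.predict(cross_inp)

    # Attach cross-encoder scores to the bi-encoder hits and re-sort
    for idx in range(len(cross_scores)):
        hits[idx]['cross-score'] = cross_scores[idx]
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)

    print("Results (after {:.3f} seconds):".format(time.time() - start_time))
    for hit in hits[0:3]:
        print("\t{:.3f}\t{}".format(hit['cross-score'], passages[hit['corpus_id']]))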
def get_meanBert(self, df=pd.DataFrame()):
    self.data_df = df
    model = SentenceTransformer('bert-base-nli-mean-tokens')
    embeddings = model.encode(self.data_df[self.sentence_col].values.tolist())
    # Keep one embedding per row as a list so pandas accepts the column
    self.bert_mid_precision = pd.DataFrame({
        self.uniqueID: self.data_df[self.uniqueID].values.tolist(),
        self.sentence_col: self.data_df[self.sentence_col].values.tolist(),
        'embeddings': list(embeddings)
    })
    return np.asarray(embeddings)
class LetsNet: def __init__(self, embedding_sz=5): self.encoder_model = SentenceTransformer('bert-base-nli-mean-tokens') self.rouge = Rouge() self.cluster_n = 5 self.embedding_sz = embedding_sz self.kmeans = KMeans(n_clusters=self.cluster_n) def encode(self, sentences): sentence_embeddings = self.encoder_model.encode(sentences) features_n = len(sentence_embeddings[0]) sentences_n = len(sentences) norm_embedding = [[embed_i[idx] for idx in range(features_n)] for embed_i in sentence_embeddings] for idx in range(features_n): features = [embed_i[idx] for embed_i in sentence_embeddings] min_feature_val = min(features) max_feature_val = max(features) range_feature_val = max_feature_val - min_feature_val for sent_idx in range(sentences_n): norm_embedding[sent_idx][idx] = (norm_embedding[sent_idx][idx]-min_feature_val)/range_feature_val pca_embedding = [np.array([norm_vec[idx] for idx in range(features_n)]) for norm_vec in norm_embedding] # print(pca_embedding) # pca_embedding = np.copy(sentence_embeddings[0, 1, 2, 3, 4, 5]) return pca_embedding def getCentroidRepresentative(self, clusters, sentence_embeddings): centroids = [] for idx in range(self.cluster_n): centroid_id = np.where(clusters.labels_ == idx)[0] centroids.append(np.mean(centroid_id)) closest, _ = pairwise_distances_argmin_min(clusters.cluster_centers_, sentence_embeddings) ordering = sorted(range(self.cluster_n), key=lambda k: centroids[k]) return closest, ordering def evaluate(self, model_sum, gt_sum): """ Gives rouge score :param model_sum: list of summaries returned by the model :param gt_sum: list of ground truth summary from catchphrases :return: ROUGE score """ return self.rouge.get_scores(model_sum, gt_sum, avg=True) def getSentenceSummary(self, sentences: list): """ Returns summary of sentence :param sentences: list of sentences :return: summary text """ sentence_enc = self.encode(sentences) clusters = self.kmeans.fit(sentence_enc) closest, ordering = self.getCentroidRepresentative(clusters, sentence_enc) summary = '.'.join([sentences[closest[idx]] for idx in ordering]).replace('\n', ' ') return summary def main(self): """ Executes the entire pipeline of the code :return: void """ gt = getGroundTruth() model_sum, gt_sum = [], [] doc_n = len(gt) for doc_idx in range(20): print("{}/{}".format(doc_idx, doc_n)) full_text, catch_phrases = gt[doc_idx] summary = self.getSentenceSummary(full_text) model_sum.append(summary) gt_sum.append(".".join(catch_phrases)) print("ROUGE score: {}".format(self.evaluate(model_sum, gt_sum)))
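# Hypothetical usage of LetsNet above. getGroundTruth() is assumed to be
# defined elsewhere, so this sketch only exercises getSentenceSummary() on a
# made-up list of sentences (at least cluster_n sentences are needed).
if __name__ == '__main__':
    summariser = LetsNet()
    sample = [
        "The court considered the appeal.",
        "The appellant argued the contract was void.",
        "The respondent relied on prior conduct.",
        "The judge reviewed the evidence in detail.",
        "Costs were awarded to the respondent.",
    ]
    print(summariser.getSentenceSummary(sample))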
for nr in range(len(all_sentences)):
    all_sentences[nr] = alpha(filter_stopwords(all_sentences[nr], stopwords)).lower()

for nr in range(len(test_set)):
    test_set['sentence'][nr] = alpha(filter_stopwords(test_set['sentence'][nr], stopwords)).lower()

"""**Multilingual model 2**"""

from sentence_transformers import SentenceTransformer, util
import torch

embedder = SentenceTransformer('distiluse-base-multilingual-cased-v2')

# Make example sentence embeddings
multilingual_embeddings = embedder.encode(all_sentences, convert_to_numpy=True)
multilingual_embeddings.shape

# Make test set embeddings
input_sentences = list(test_set['sentence'])
input_embedding = embedder.encode(input_sentences, convert_to_numpy=True)
input_embeddings = np.reshape(input_embedding, (len(test_set['sentence']), 512))
input_embeddings.shape

# Calculate similarities
similarities = cosine_similarity(input_embeddings, multilingual_embeddings)

# Print neighbors and output
i = 0
for similarity in similarities[0:50]:
    neighbors = np.argsort(np.reshape(similarity, 90))[-7:][::-1]
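    # Truncated here in the original; a minimal sketch that prints each test
    # sentence with its 7 nearest training sentences. It assumes `test_set` and
    # `all_sentences` from above are still in scope; the original output format
    # is unknown.
    print("Input:", test_set['sentence'][i])
    for n in neighbors:
        print("  neighbour:", all_sentences[n])
    i += 1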
df.reset_index(inplace=True)
df.drop(columns='index', inplace=True)  # drop the helper column created by reset_index

sentence = []
tidysentence = []
for i in range(len(df)):
    st = df['snippet'][i]
    tidysentence.append(st)
    st = re.sub(r'[^a-zA-Z0-9\s]', '', st)  # fixed character class (was [^a-zA-z0-9\s])
    st = '.'.join(st.rsplit('\n'))
    sentence.append(st)

len(sentence)

sentence_embb = embedder.encode(sentence)
query = ['molded interconnect device MID electronics Two-shot molding']
# Encode the whole query list so it can be zipped with its embeddings below
query_embb = embedder.encode(query)
Closest = len(sentence)
res = []
score = []
count = 0
counts = []
for que, que_embb in zip(query, query_embb):
    distance = sc.distance.cdist([que_embb], sentence_embb, 'correlation')[0]
    result = zip(range(len(distance)), distance)
    result = sorted(result, key=lambda x: x[1])
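    # The loop is truncated in the original; a minimal sketch of reporting the
    # ranked matches, keeping the variable names used above. Showing the top 5
    # snippets per query is an assumption.
    for idx, dist in result[0:5]:
        res.append(tidysentence[idx])
        score.append(dist)
        print(f"Distance: {dist:.4f}\t{tidysentence[idx][:120]}")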
for id in data: if phase == 'claims': claim = data[id]['claim'] title = data[id]['title'] corpus.append(claim) corpus2.append(title) else: text = data[id]['text'] corpus.append(text) embed_dict[phase]['id'].append(id) if phase == 'claims': embeddings1 = model.encode(corpus, batch_size=32) embeddings2 = model.encode(corpus2, batch_size=32) for emb1, emb2 in zip(embeddings1, embeddings2): embed_dict[phase]['embs'].append(emb1.tolist()) embed_dict[phase]['embs2'].append(((emb1+emb2)/2).tolist()) else: embeddings = model.encode(corpus, batch_size=32) for emb in embeddings: embed_dict[phase]['embs'].append(emb.tolist()) json.dump(embed_dict, open(my_loc+'/bert_embs/%s_raw_text.json'%(md), 'w')) for md in model_types: model = SentenceTransformer(md).to(device)
def run_tester(single=False, file=None, model_str='transe'): if model_str == 'transe': model_path = './checkpoint/mapper.pt' if not os.path.exists(model_path): exit("Train mapper first!") device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = mapper().to(device) model.load_state_dict(torch.load(model_path)) model.eval() filter = target_filter() if single: with open(file, 'r') as f: data = f.readlines() usr_r = data[0].strip() text = data[1].strip() nlp = SentenceTransformer('distilbert-base-nli-mean-tokens', device=device) this_x = nlp.encode(text) output = model(torch.from_numpy(this_x).float().to(device)) output = output.detach().cpu().numpy() e, r = load_model(model_str) i_j = {} for j, ee in enumerate(e): d = score_transe(output, r[int(usr_r)], ee) i_j[j] = d sorted_d = { k: v for k, v in sorted(i_j.items(), key=lambda item: item[1]) } entities = load_entities() for i in list(sorted_d)[:10]: print(entities[str(i)]) else: hs, ts, rs, hs_name = load_open_word_test(device, deep_filtered=True) r_t = relation_tail_train() count_1 = 0 count_3 = 0 count_10 = 0 print('Testing...') for i in tqdm(range(len(hs))): this_desc_embedding = hs[i] output = model( torch.from_numpy(this_desc_embedding).float().to(device)) output = output.detach().cpu().numpy() e, r = load_model(model_str) gt = ts[i] this_relation = rs[i] this_name = hs_name[i] this_filter = [] for tail in tuple(filter[this_name + ':' + this_relation]): if tail != gt: this_filter.append(tail) i_j = {} for j, ee in enumerate(e): d = score_transe(output, r[int(this_relation)], ee) if str(j) not in this_filter: i_j[str(j)] = d sorted_d = { k: v for k, v in sorted(i_j.items(), key=lambda item: item[1]) } if gt in list(sorted_d)[:10]: count_10 += 1 if gt in list(sorted_d)[:3]: count_3 += 1 if gt in list(sorted_d)[:1]: count_1 += 1 print('hits@1: %.1f' % (count_1 / len(hs) * 100)) print('hits@3: %.1f' % (count_3 / len(hs) * 100)) print('hits@10: %.1f' % (count_10 / len(hs) * 100)) elif model_str == 'complex': model_r_path = './checkpoint/mapper_complex_r.pt' model_i_path = './checkpoint/mapper_complex_i.pt' if not os.path.exists( model_r_path) and not not os.path.exists(model_i_path): exit("Train mapper first!") device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model_r = mapper().to(device) model_i = mapper().to(device) model_r.load_state_dict(torch.load(model_r_path)) model_i.load_state_dict(torch.load(model_i_path)) model_r.eval() model_i.eval() if single: with open(file, 'r') as f: data = f.readlines() usr_r = int(data[0].strip()) text = data[1].strip() nlp = SentenceTransformer( 'bert-base-wikipedia-sections-mean-tokens', device=device) this_x = nlp.encode(text) output_r = model_r(torch.from_numpy(this_x).float().to(device)) output_i = model_i(torch.from_numpy(this_x).float().to(device)) output_r = output_r.detach().cpu().numpy() output_i = output_i.detach().cpu().numpy() e, r = load_model(model_str) e_r = e[0] e_i = e[1] r_r = r[0] r_i = r[1] i_j = {} for j in range(len(e_r)): d = score_complex(output_r, output_i, e_r[j], e_i[j], r_r[usr_r], r_i[usr_r]) i_j[j] = d sorted_d = { k: v for k, v in sorted(i_j.items(), key=lambda item: item[1]) } entities = load_entities() for i in list(sorted_d)[:10]: print(entities[str(i)]) else: hs, ts, rs, _ = load_open_word_test(device, deep_filtered=True) count_1 = 0 count_3 = 0 count_10 = 0 print('Testing...') filter = target_filter() for i in tqdm(range(len(hs))): this_desc_embedding = hs[i] output_r = model_r( 
torch.from_numpy(this_desc_embedding).float().to(device)) output_i = model_i( torch.from_numpy(this_desc_embedding).float().to(device)) output_r = output_r.detach().cpu().numpy() output_i = output_i.detach().cpu().numpy() e, r = load_model(model_str) e_r = e[0] e_i = e[1] r_r = r[0] r_i = r[1] gt = ts[i] this_relation = rs[i] this_filter = [] for tail in tuple(filter[this_relation]): if tail != gt: this_filter.append(tail) i_j = {} for j in range(len(e_r)): d = score_complex(output_r, output_i, e_r[j], e_i[j], r_r[int(this_relation)], r_i[int(this_relation)]) if str(j) not in this_filter: i_j[str(j)] = d sorted_d = { k: v for k, v in sorted(i_j.items(), key=lambda item: item[1]) } if gt in list(sorted_d)[:10]: count_10 += 1 if gt in list(sorted_d)[:3]: count_3 += 1 if gt in list(sorted_d)[:1]: count_1 += 1 print('hits@1: %.1f' % (count_1 / len(hs) * 100)) print('hits@3: %.1f' % (count_3 / len(hs) * 100)) print('hits@10: %.1f' % (count_10 / len(hs) * 100))
# Read sentences from the NLI dataset
nli_sentences = set()
with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        nli_sentences.add(row['sentence1'])
        nli_sentences.add(row['sentence2'])

nli_sentences = list(nli_sentences)
random.shuffle(nli_sentences)

# To determine the PCA matrix, we need some example sentence embeddings.
# Here, we compute the embeddings for 20k random sentences from the AllNLI dataset.
pca_train_sentences = nli_sentences[0:20000]
train_embeddings = model.encode(pca_train_sentences, convert_to_numpy=True)

# Compute PCA on the train embeddings matrix
pca = PCA(n_components=new_dimension)
pca.fit(train_embeddings)
pca_comp = np.asarray(pca.components_)

# We add a dense layer to the model so that it directly produces embeddings with the new size
dense = models.Dense(in_features=model.get_sentence_embedding_dimension(),
                     out_features=new_dimension,
                     bias=False,
                     activation_function=torch.nn.Identity())
dense.linear.weight = torch.nn.Parameter(torch.tensor(pca_comp))
model.add_module('dense', dense)

# Evaluate the model with the reduced embedding size
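# A minimal sketch of the evaluation step announced above. It assumes a list of
# labelled sentence pairs (sentence_transformers InputExample objects with
# similarity scores), here called `stsb_test_samples`; that name is an
# assumption and is not defined earlier in this file.
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

evaluator = EmbeddingSimilarityEvaluator.from_input_examples(stsb_test_samples, name='sts-test')
print(evaluator(model))  # correlation score with the reduced-size embeddings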
if view_hierarchy_json.endswith('.json') and ( not view_hierarchy_json.startswith('.')): json_file_path = package_dir + '/' + trace_dir + '/' + 'view_hierarchies' + '/' + view_hierarchy_json try: with open(args.dataset + '/' + json_file_path) as f: rico_screen = load_rico_screen_dict( json.load(f)) text = get_all_texts_from_rico_screen( rico_screen) if text == []: text = [""] except TypeError as e: print(str(e) + ': ' + args.dataset) text = [""] word_embs = bert.encode(text) if len(word_embs) > 0: word_avg_emb = np.mean(word_embs, axis=0) else: word_avg_emb = word_embs[0] trace_words.append(word_avg_emb.tolist()) layout_screen = ScreenLayout(args.dataset + '/' + json_file_path) screen_pix = torch.from_numpy( layout_screen.pixels.flatten()).type( torch.FloatTensor) layout_emb = layout_autoencoder.enc(screen_pix) trace_layouts.append(layout_emb.detach().tolist()) vis_screen = ScreenVisualLayout(
sent2idx = {sentence: idx for idx, sentence in enumerate(sentences)}  # map each sentence to its id
duplicates = set((sent2idx[data.texts[0]], sent2idx[data.texts[1]])
                 for data in gold_samples)  # so gold pairs of sentences are not added again

# For simplicity we use a pretrained model
semantic_model_name = 'bert-base-nli-stsb-mean-tokens'
semantic_search_model = SentenceTransformer(semantic_model_name)
logging.info("Encoding unique sentences with semantic search model: {}".format(semantic_model_name))

# Encode all unique sentences present in the training dataset
embeddings = semantic_search_model.encode(sentences,
                                          batch_size=batch_size,
                                          convert_to_tensor=True)

logging.info("Retrieve top-{} with semantic search model: {}".format(top_k, semantic_model_name))

# Retrieve the top-k most similar sentences for each sentence in the dataset
progress = tqdm.tqdm(unit="docs", total=len(sent2idx))
for idx in range(len(sentences)):
    sentence_embedding = embeddings[idx]
    cos_scores = util.pytorch_cos_sim(sentence_embedding, embeddings)[0]
    cos_scores = cos_scores.cpu()
    progress.update(1)

    # Use torch.topk to find the highest-scoring candidates (top_k plus the sentence itself)
    top_results = torch.topk(cos_scores, k=top_k + 1)
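    # The loop is truncated in the original; a minimal sketch of consuming the
    # neighbours. It skips the sentence itself and pairs already in the gold
    # data, then simply logs the remaining candidates (the original
    # post-processing is unknown).
    for score, candidate_idx in zip(top_results[0], top_results[1]):
        candidate_idx = int(candidate_idx)
        if candidate_idx == idx or (idx, candidate_idx) in duplicates:
            continue
        logging.info("{:.3f}\t{}".format(float(score), sentences[candidate_idx]))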
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

dataset = pd.read_csv("csv/AM_210329_COVID7.csv", encoding="utf-8")
data = dataset["MESSAGE"].values.tolist()
category = dataset["CATEGORY"]
# data = fetch_20newsgroups(subset='all')['data']
print(len(data))

# model = SentenceTransformer('distilbert-base-nli-mean-tokens')
model = SentenceTransformer("distiluse-base-multilingual-cased-v1")
embeddings = model.encode(data, show_progress_bar=True)
print(len(embeddings[0]), embeddings[:2])

print("Reduce Dimension Using UMAP...")
umap_embeddings = umap.UMAP(n_neighbors=15, n_components=5,
                            metric='cosine').fit_transform(embeddings)

print("Clustering Using KMeans...")
cluster = KMeans(n_clusters=5).fit_predict(umap_embeddings)
# print("Clustering Using HDBSCAN...")
# cluster = hdbscan.HDBSCAN(min_cluster_size=30, metric='euclidean', cluster_selection_method='eom').fit(umap_embeddings)

print("Save pyplot Image...")
# Prepare data
umap_data = umap.UMAP(n_neighbors=15,
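                      # The call above is truncated in the original; a plausible
                      # completion following the common 2-D UMAP plotting recipe.
                      # The extra parameters, the plotting code, and the output
                      # filename below are assumptions.
                      n_components=2,
                      min_dist=0.0,
                      metric='cosine').fit_transform(embeddings)

result = pd.DataFrame(umap_data, columns=['x', 'y'])
result['labels'] = cluster

plt.figure(figsize=(12, 8))
plt.scatter(result.x, result.y, c=result.labels, s=1.0, cmap='hsv_r')
plt.colorbar()
plt.savefig("cluster_plot.png")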
pip install transformers -U # In[6]: pip install --force-reinstall numpy==1.18.5 # In[4]: from sentence_transformers import SentenceTransformer model = SentenceTransformer('distilbert-base-nli-mean-tokens') doc_embedding = model.encode([doc]) candidate_embeddings = model.encode(candidates) # In[ ]: from sklearn.metrics.pairwise import cosine_similarity top_n = 5 distances = cosine_similarity(doc_embedding, candidate_embeddings) keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]] # In[ ]:
    return embed

# In[394]:

token_sent = []
for sentence in document:
    token_sent.append(word_tokenize(sentence.lower()))

# In[ ]:

token_sent

# In[417]:

embeddings = sentence_model.encode(document)

import pickle

# Store sentences & embeddings on disc
with open('/Users/vishwa/Desktop/embeds/embeddings.pkl', "wb") as fOut:
    pickle.dump({
        'sentences': document,
        'embeddings': embeddings
    }, fOut, protocol=pickle.HIGHEST_PROTOCOL)

# Load sentences & embeddings from disc
with open('/Users/vishwa/Desktop/embeds/embeddings.pkl', "rb") as fIn:
    stored_data = pickle.load(fIn)
    stored_sentences = stored_data['sentences']
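    stored_embeddings = stored_data['embeddings']

# In[ ]:

# A minimal sketch (not in the original notebook) of querying the reloaded
# embeddings with util.semantic_search; `sentence_model` is the embedder used
# above and the query string is a placeholder.
from sentence_transformers import util

query_embedding = sentence_model.encode("example query", convert_to_tensor=True)
hits = util.semantic_search(query_embedding, stored_embeddings, top_k=5)[0]
for hit in hits:
    print(round(hit['score'], 3), stored_sentences[hit['corpus_id']])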
def bert(data):
    sbert_model = SentenceTransformer('bert-base-nli-stsb-mean-tokens')
    return sbert_model.encode(data.tolist())
from collections import defaultdict

from sentence_transformers import SentenceTransformer
from sklearn import datasets

from .preprocessing import tokenize_20_newsgroup
from .utils import get_distance_matrix

if __name__ == "__main__":
    news20 = datasets.fetch_20newsgroups()
    news20_data = news20.data
    news20_target = news20.target
    target_names = news20.target_names

    model = SentenceTransformer('bert-base-nli-mean-tokens')

    sentences = defaultdict(list)
    embeddings = defaultdict(list)
    count = defaultdict(int)
    for i in range(len(news20_data)):
        text = tokenize_20_newsgroup(news20_data[i])
        # Cap each newsgroup at 500 documents
        if count[target_names[news20_target[i]]] < 500:
            sentences[target_names[news20_target[i]]].append(text)
            count[target_names[news20_target[i]]] += 1

    for i in target_names:
        embeddings[i] = model.encode(sentences[i])

    distance_matrix = get_distance_matrix(embeddings, target_names)
def glove(data):
    sbert_model = SentenceTransformer('average_word_embeddings_glove.6B.300d')
    return sbert_model.encode(data.tolist())
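# Hypothetical usage of the three encoder helpers above (distilroberta, bert,
# glove): each takes a pandas Series of texts and returns a matrix of sentence
# embeddings. The example data is made up for illustration.
import pandas as pd

texts = pd.Series(["the model encodes sentences", "embeddings enable semantic search"])
print(glove(texts).shape)          # (2, 300)
print(bert(texts).shape)           # (2, 768)
print(distilroberta(texts).shape)  # (2, 768)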
class UndiagnosedFeatureExtractor: def __init__(self): self.tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') self.gpt = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt').cuda() self.embedder = SentenceTransformer('bert-base-nli-mean-tokens').cuda() self.pos_phrase = "I have an undiagnosed disease. " self.keywords = [term.strip().lower() for term in open('tweet_crawler/terms.txt').read().split('\n') if term != "" and term != "undiagnosed" and term != "disease"] self.udn_examples = list(open('data/UDN_patient_search_TWEET_samples.txt').read().split('\n')) + \ list(open('data/UDN_patient_search_WEB_samples.txt').read().split('\n')) # self.phrase_gpt_score = gpt_log_prob_score([self.phrase], self.gpt, self.tokenizer) self.pos_phrase_emb = self.embedder.encode([self.pos_phrase])[0] def extract_features(self, texts): # SBERT SIMILARITY FEATURE sbert_scores = sentence_bert_score(texts, [self.pos_phrase] * len(texts), self.embedder, return_all=True) # GPT LOG PROBABILITY FEATURES text_gpt_scores = gpt_log_prob_score(texts, self.gpt, self.tokenizer, return_all=True) pos_phrase_and_texts = [text + self.pos_phrase for text in texts] pos_phrase_text_gpt_scores = gpt_log_prob_score(pos_phrase_and_texts, self.gpt, self.tokenizer, return_all=True) # neg_phrase_text_gpt_scores = gpt_log_prob_score(neg_phrase_and_texts, self.gpt, self.tokenizer, return_all=True) phrase_text_mmis = [] for pos_phrase_score, text_score in zip(pos_phrase_text_gpt_scores, text_gpt_scores): # negate loss for log probability phrase_text_mmi = (pos_phrase_score - text_score) / text_score phrase_text_mmis.append(phrase_text_mmi) # TEXT LENGTH FEATURE text_lens = [math.log(len(text.split())) for text in texts] # KEYWORD FEATURE texts_have_keywords = [] for text in texts: text_has_keywords = False text_lower = text.lower() for keyword in self.keywords: if keyword in text_lower: text_has_keywords = True texts_have_keywords.append(text_has_keywords) # DOCTORS FEATURE texts_have_doctors = ['doctor' in text.lower() for text in texts] # UDN EXAMPLES FEATURE udn_features = [] for text in texts: udn_bleu = nltk.translate.bleu_score.sentence_bleu(self.udn_examples, text) udn_features.append(udn_bleu) # this line returns a single feature for ablation testing # return np.array(udn_features)[:, np.newaxis] return np.array(list(zip(sbert_scores, text_gpt_scores, phrase_text_mmis, text_lens, texts_have_keywords, texts_have_doctors, udn_features)))
class EmbeddingRetriever(BaseRetriever): def __init__( self, document_store: Type[BaseDocumentStore], embedding_model: str, gpu: bool = True, model_format: str = "farm", pooling_strategy: str = "reduce_mean", emb_extraction_layer: int = -1, ): """ TODO :param document_store: :param embedding_model: :param gpu: :param model_format: """ self.document_store = document_store self.model_format = model_format self.embedding_model = embedding_model self.pooling_strategy = pooling_strategy self.emb_extraction_layer = emb_extraction_layer logger.info( f"Init retriever using embeddings of model {embedding_model}") if model_format == "farm" or model_format == "transformers": self.embedding_model = Inferencer.load( embedding_model, task_type="embeddings", extraction_strategy=self.pooling_strategy, extraction_layer=self.emb_extraction_layer, gpu=gpu, batch_size=4, max_seq_len=512, num_processes=0) elif model_format == "sentence_transformers": from sentence_transformers import SentenceTransformer # pretrained embedding models coming from: https://github.com/UKPLab/sentence-transformers#pretrained-models # e.g. 'roberta-base-nli-stsb-mean-tokens' self.embedding_model = SentenceTransformer(embedding_model) else: raise NotImplementedError def retrieve(self, query: str, candidate_doc_ids: [str] = None, top_k: int = 10) -> [Document]: query_emb = self.create_embedding(texts=[query]) documents = self.document_store.query_by_embedding( query_emb[0], top_k, candidate_doc_ids) return documents def create_embedding(self, texts: [str]): """ Create embeddings for each text in a list of texts using the retrievers model (`self.embedding_model`) :param texts: texts to embed :return: list of embeddings (one per input text). Each embedding is a list of floats. """ # for backward compatibility: cast pure str input if type(texts) == str: texts = [texts] assert type( texts ) == list, "Expecting a list of texts, i.e. create_embeddings(texts=['text1',...])" if self.model_format == "farm": res = self.embedding_model.inference_from_dicts(dicts=[{ "text": t } for t in texts]) emb = [list(r["vec"]) for r in res] #cast from numpy elif self.model_format == "sentence_transformers": # text is single string, sentence-transformers needs a list of strings res = self.embedding_model.encode( texts) # get back list of numpy embedding vectors emb = [list(r) for r in res] #cast from numpy return emb
from sentence_transformers import SentenceTransformer, util
from PIL import Image

# Load CLIP model
model = SentenceTransformer('clip-ViT-B-32')

# Encode an image:
img_emb = model.encode(Image.open('two_dogs_in_snow.jpg'))

# Encode text descriptions
text_emb = model.encode([
    'Two dogs in the snow', 'A cat on a table', 'A picture of London at night'
])

# Compute cosine similarities
cos_scores = util.cos_sim(img_emb, text_emb)
print(cos_scores)
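# A small extension sketch (not part of the original example): use the same
# CLIP model for text-to-image search over a handful of images. The extra file
# names are placeholders.
img_names = ['two_dogs_in_snow.jpg', 'cat_on_table.jpg', 'london_night.jpg']
img_embs = model.encode([Image.open(name) for name in img_names], convert_to_tensor=True)

query_emb = model.encode(['A photo of a cat'], convert_to_tensor=True)
hits = util.semantic_search(query_emb, img_embs, top_k=1)[0]
print("Best match:", img_names[hits[0]['corpus_id']], round(hits[0]['score'], 3))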
class MemNav: def __init__(self, root_dir='.'): """Load models, preprocess text, precompute embeddings.""" self.root_dir = root_dir # Load language models self.qa = pipeline('question-answering') self.sum = pipeline('summarization') self.text_encoder = SentenceTransformer('msmarco-distilbert-base-v2') self.pair_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6') # Load list of entries self.entries = [ open(self.root_dir + '/' + file).read() for file in sorted(os.listdir(root_dir)) ] # Tokenize entries into sentences self.entries = [sent_tokenize(entry.strip()) for entry in self.entries] # Merge each 3 consecutive sentences into one passage self.entries = list( chain(*[[ ' '.join(entry[start_idx:min(start_idx + 3, len(entry))]) for start_idx in range(0, len(entry), 3) ] for entry in self.entries])) # Pre-compute passage embeddings self.passage_embeddings = self.text_encoder.encode( self.entries, show_progress_bar=True) def retrieval(self, query): """Utility for retrieving passages most relevant to a given query.""" # First pass, find passages most similar to query question_embedding = self.text_encoder.encode(query, convert_to_tensor=True) hits = util.semantic_search(question_embedding, self.passage_embeddings, top_k=100)[0] # Second pass, re-rank passages more thoroughly cross_scores = self.pair_encoder.predict( [[query, self.entries[hit['corpus_id']]] for hit in hits]) for idx in range(len(cross_scores)): hits[idx]['cross-score'] = cross_scores[idx] # Select best few results hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True) results = [] for hit in hits[:5]: if hit['cross-score'] > 1e-3: results += [self.entries[hit['corpus_id']]] return results def search(self, query): """Search knowledge base for passages most relevant to a given query.""" print(*self.retrieval(query), sep='\n\n') def ask(self, question): """Obtain an answer to a question posed to the knowledge base. Provides retrieved passages as context for a question-answering pipeline.""" return self.qa(question, ' '.join(self.retrieval(question)))['answer'] def summarize(self, query): """Obtain a summary related to the query using the knowledge base. Provides retrieved passages as input for a summarization pipeline.""" return self.sum(' '.join(self.retrieval(query)), 130, 30, False)[0]['summary_text']
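# Hypothetical usage of MemNav above; the journal directory path and the
# queries are made up for illustration.
nav = MemNav(root_dir='journal_entries')
nav.search("conversations about model deployment")
print(nav.ask("What did we decide about the release date?"))
print(nav.summarize("progress on the retrieval pipeline"))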
) scores_df = pd.concat( pd.read_parquet(parquet_file) for parquet_file in scores_dir.glob('*.parquet')) logger.info(f'Loaded {str(scores_df.shape[0])} scores') text_df = pd.concat( pd.read_parquet(parquet_file) for parquet_file in random_set_dir.glob('*.parquet')) logger.info(f'Loaded {str(text_df.shape[0])} tweets') df = scores_df.merge(text_df, on="tweet_id", how='inner') logger.info(f'Merged scores and text. Merge size: {str(df.shape[0])}') df['rank'] = df['score'].rank(method='dense', ascending=False) logger.info('Start encoding') corpus_embeddings = embedder.encode(df['text'].tolist(), show_progress_bar=True, convert_to_numpy=True) logger.info('Done encoding') if not os.path.exists(os.path.join(args.output_folder, args.model_type)): os.makedirs(os.path.join(args.output_folder, args.model_type)) output_path = f'{args.output_folder}/{args.model_type}/embeddings-{label}.pkl' with open(output_path, "wb") as fOut: pickle.dump( { 'sentences': df['text'].tolist(), 'rank': df['rank'].tolist(), 'embeddings': corpus_embeddings }, fOut) logger.info(f'Embeddings for {label} saved at {output_path}')
def evaluate_neighbors(index, docMessages, embeddingtolabelmap, docStringLength_avg, droppedClassWithLessLength, docLabelToTextForSentenceTokenizationAndAnalysis, model): k = 3 fp = 0 fn = 0 tp = 0 tn = 0 efp = 0 efn = 0 etp = 0 etn = 0 positivepresent = False exactpositivepresent = False totaldocs = 0 embed = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4') originalout = sys.stdout transformer = SentenceTransformer(model) with open( '../../data/codeGraph/stackoverflow_questions_per_class_func_3M_filtered_new.json', 'r') as data, open('./stackNewJsonAllMask_' + model + '_.txt', 'w') as outputFile: firstJsonCollect = ijson.items(data, 'results.bindings.item') postMap = {} for jsonObject in firstJsonCollect: objectType = jsonObject['class_func_type']['value'].replace( 'http://purl.org/twc/graph4code/ontology/', '') if objectType != 'Class': continue stackText = jsonObject['content_wo_code']+ \ " " + jsonObject['answer_wo_code'] soup = BeautifulSoup(stackText, 'html.parser') for code in soup.find_all('code'): code.decompose() stackText = soup.get_text() classLabel = jsonObject['class_func_label']['value'] if stackText in postMap: postMap[stackText].append(classLabel) else: postMap[stackText] = [classLabel] data.close() newData = open( '../../data/codeGraph/stackoverflow_questions_per_class_func_3M_filtered_new.json', 'r') jsonCollect = ijson.items(newData, 'results.bindings.item') sys.stdout = outputFile for jsonObject in jsonCollect: totaldocs += 1 objectType = jsonObject['class_func_type']['value'].replace( 'http://purl.org/twc/graph4code/ontology/', '') if objectType != 'Class': continue title = jsonObject['title']['value'] classLabel = jsonObject['class_func_label']['value'] originalStackText = jsonObject['content_wo_code']+ \ " " + jsonObject['answer_wo_code'] if classLabel in droppedClassWithLessLength: continue soup = BeautifulSoup(originalStackText, 'html.parser') for code in soup.find_all('code'): code.decompose() stackText = soup.get_text() if len(stackText) < 50: continue # print('\nTitle of Stack Overflow Post:', title) print('Class associated with post:', classLabel, '\n') print('Text of post before masking:', stackText, '\n') maskedText = None for foundLabel in postMap[stackText]: splitLabel = foundLabel.lower().split('.') wholePattern = re.compile(foundLabel.lower(), re.IGNORECASE) maskedText = wholePattern.sub(' ', stackText) for labelPart in splitLabel: partPattern = re.compile(labelPart, re.IGNORECASE) maskedText = partPattern.sub( ' ', maskedText) #maskedText.replace(labelPart, ' ') embeddedText = transformer.encode([maskedText]) #maskedText D, I = index.search(np.asarray(embeddedText, dtype=np.float32), k) distances = D[0] indices = I[0] # print("Distances of related vectors:", distances) # print("Indices of related vectors:", indices) positivepresent = False exactpositivepresent = False for p in range(0, k): properIndex = indices[p] embedding = docMessages[properIndex] adjustedembedding = np.asarray(embedding, dtype=np.float32).tobytes() label = embeddingtolabelmap[adjustedembedding] ##multiple docstrings associated with the same embedding mapped ##array of labels mapped j = 0 for l in label: if l.startswith(classLabel.split(".")[0]): positivepresent = True if j == 0: print( "\n True positive label being contributed by \n", l) else: print("and \t", l) else: print("class not associated", l) if l == classLabel: exactpositivepresent = True print( "\n Exact positive label being contributed by \n", l) j = j + 1 if not positivepresent: fp = fp + 1 print("Loose 
False Positive Present \n") print( "Investigating the reason with sentence tokenized docstring for:", classLabel, "\n") print( sent_tokenize( docLabelToTextForSentenceTokenizationAndAnalysis[ classLabel])) else: tp = tp + 1 # print("Loose True Positive Present -------------------------------------------------------- \n") if not exactpositivepresent: efp = efp + 1 # print("match False Positive Present ------------------------------------------------------- \n") else: etp = etp + 1 # print("match True Positive Present -------------------------------------------------------- \n") print("--------------------------------------------- \n") print(tp / (tp + fp), " Loose Precision at 5 with all masking for model: " + model) print(etp / (etp + efp), "Exact Precision at 5 with all masking for model: " + model) sys.stdout = originalout
class EmbKnn: def __init__(self, path: str, args): self.args = args self.device = torch.device("cuda:0" if torch.cuda.is_available() and not self.args.no_cuda else "cpu") with DisableLogger(): if path is not None and os.path.exists(path): self.model = SentenceTransformer(path) elif 'roberta' in self.args.bert_model: self.model = SentenceTransformer( 'roberta-base-nli-stsb-mean-tokens') else: self.model = SentenceTransformer('bert-base-nli-mean-tokens') self.model.to(self.device) self.cached_embeddings = None def save(self, dir_path): self.model.save(dir_path) def cache(self, example_sentences): self.model.eval() self.cached_embeddings = self.model.encode(example_sentences, show_progress_bar=False) def encode(self, text): self.model.eval() query_embeddings = self.model.encode(text, show_progress_bar=False) return torch.FloatTensor(query_embeddings) def predict(self, text): assert self.cached_embeddings is not None self.model.eval() query_embeddings = self.model.encode(text, show_progress_bar=False) distances = scipy.spatial.distance.cdist(query_embeddings, self.cached_embeddings, "cosine") distances = 1.0 - distances return torch.FloatTensor(distances) def train(self, train_examples, dev_examples, dir_path=None): train_examples = SentencesDataset(train_examples, self.model) dev_examples = SentencesDataset(dev_examples, self.model) train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=self.args.train_batch_size) dev_dataloader = DataLoader(dev_examples, shuffle=False, batch_size=self.args.eval_batch_size) train_loss = losses.CosineSimilarityLoss(model=self.model) evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) warmup_steps = math.ceil( len(train_examples) * self.args.num_train_epochs / self.args.train_batch_size * self.args.warmup_proportion) self.model.zero_grad() self.model.train() self.model.fit(train_objectives=[(train_dataloader, train_loss)], evaluator=evaluator, epochs=self.args.num_train_epochs, evaluation_steps=10000, warmup_steps=warmup_steps, output_path=None, optimizer_params={ 'lr': self.args.learning_rate, 'eps': 1e-6, 'correct_bias': False })
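# Hypothetical usage of EmbKnn above: cache a handful of example sentences and
# score a query against them. `args` is assumed to be an argparse-style
# namespace providing the fields the class reads (no_cuda, bert_model, batch
# sizes, training hyperparameters).
knn = EmbKnn(path=None, args=args)
knn.cache(["book a flight", "play some music", "what is the weather"])
scores = knn.predict(["reserve a plane ticket"])
print(scores)  # cosine similarities to the cached example sentences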