from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('paraphrase-distilroberta-base-v1')

# Sample sentences that are compared with one another
sentences = [
    '我糙你媽',
    '我想跟你作愛',
    '你媽媽沒屁眼',
    '你兒子沒屁眼',
    '我想買一堆房子',
    '你腦袋破洞',
    '你腦袋進水了嗎',
    '這幾個房子我好想要'
]

# Embed every sentence, then score each sentence against every other one
embeddings = model.encode(sentences, convert_to_tensor=True)
cosine_scores = util.pytorch_cos_sim(embeddings, embeddings)

# Collect every unordered pair (upper triangle, diagonal excluded)
n_rows = len(cosine_scores)
pairs = [
    {'index': [row, col], 'score': cosine_scores[row][col]}
    for row in range(n_rows - 1)
    for col in range(row + 1, n_rows)
]

# Highest similarity first
pairs.sort(key=lambda entry: entry['score'], reverse=True)

# Report the ten most similar pairs
for pair in pairs[:10]:
    left, right = pair['index']
    print("{} \t\t {} \t\t Score: {:.4f}".format(sentences[left], sentences[right], pair['score']))
# Input document to summarize. As an example, this is the first section of
# the Wikipedia article on New York City.
document = """ New York City (NYC), often called simply New York, is the most populous city in the United States. With an estimated 2019 population of 8,336,817 distributed over about 302.6 square miles (784 km2), New York City is also the most densely populated major city in the United States. Located at the southern tip of the U.S. state of New York, the city is the center of the New York metropolitan area, the largest metropolitan area in the world by urban landmass. With almost 20 million people in its metropolitan statistical area and approximately 23 million in its combined statistical area, it is one of the world's most populous megacities. New York City has been described as the cultural, financial, and media capital of the world, significantly influencing commerce, entertainment, research, technology, education, politics, tourism, art, fashion, and sports. Home to the headquarters of the United Nations, New York is an important center for international diplomacy. Situated on one of the world's largest natural harbors, New York City is composed of five boroughs, each of which is a county of the State of New York. The five boroughs—Brooklyn, Queens, Manhattan, the Bronx, and Staten Island—were consolidated into a single city in 1898. The city and its metropolitan area constitute the premier gateway for legal immigration to the United States. As many as 800 languages are spoken in New York, making it the most linguistically diverse city in the world. New York is home to more than 3.2 million residents born outside the United States, the largest foreign-born population of any city in the world as of 2016. As of 2019, the New York metropolitan area is estimated to produce a gross metropolitan product (GMP) of $2.0 trillion. If the New York metropolitan area were a sovereign state, it would have the eighth-largest economy in the world. 
New York is home to the highest number of billionaires of any city in the world. New York City traces its origins to a trading post founded by colonists from the Dutch Republic in 1624 on Lower Manhattan; the post was named New Amsterdam in 1626. The city and its surroundings came under English control in 1664 and were renamed New York after King Charles II of England granted the lands to his brother, the Duke of York. The city was regained by the Dutch in July 1673 and was subsequently renamed New Orange for one year and three months; the city has been continuously named New York since November 1674. New York City was the capital of the United States from 1785 until 1790, and has been the largest U.S. city since 1790. The Statue of Liberty greeted millions of immigrants as they came to the U.S. by ship in the late 19th and early 20th centuries, and is a symbol of the U.S. and its ideals of liberty and peace. In the 21st century, New York has emerged as a global node of creativity, entrepreneurship, and environmental sustainability, and as a symbol of freedom and cultural diversity. In 2019, New York was voted the greatest city in the world per a survey of over 30,000 people from 48 cities worldwide, citing its cultural diversity. Many districts and landmarks in New York City are well known, including three of the world's ten most visited tourist attractions in 2013. A record 62.8 million tourists visited New York City in 2017. Times Square is the brightly illuminated hub of the Broadway Theater District, one of the world's busiest pedestrian intersections, and a major center of the world's entertainment industry. Many of the city's landmarks, skyscrapers, and parks are known around the world. Manhattan's real estate market is among the most expensive in the world. 
Providing continuous 24/7 service and contributing to the nickname The City that Never Sleeps, the New York City Subway is the largest single-operator rapid transit system worldwide, with 472 rail stations. The city has over 120 colleges and universities, including Columbia University, New York University, Rockefeller University, and the City University of New York system, which is the largest urban public university system in the United States. Anchored by Wall Street in the Financial District of Lower Manhattan, New York City has been called both the world's leading financial center and the most financially powerful city in the world, and is home to the world's two largest stock exchanges by total market capitalization, the New York Stock Exchange and NASDAQ. """

# Split the document into sentences
sentences = nltk.sent_tokenize(document)
print("Num sentences:", len(sentences))

# Compute the sentence embeddings
embeddings = model.encode(sentences, convert_to_tensor=True)

# Compute the pair-wise cosine similarities. The tensor is moved to host
# memory first: Tensor.numpy() raises for tensors that live on a CUDA device,
# which is where the embeddings end up when the model runs on a GPU.
cos_scores = util.pytorch_cos_sim(embeddings, embeddings).cpu().numpy()

# Compute the centrality for each sentence
centrality_scores = degree_centrality_scores(cos_scores, threshold=None)

# Argsort on the negated scores so the most central sentence comes first
most_central_sentence_indices = np.argsort(-centrality_scores)

# Print the 5 sentences with the highest scores
print("\n\nSummary:")
for idx in most_central_sentence_indices[0:5]:
    print(sentences[idx].strip())
def evaluate_glosses(gloss_fname, bnid_fname, batch_size, all_gloss_sentences, all_gloss_bnids, all_bnids):
    """
    Evaluate gloss-to-gloss retrieval on a random subset of glosses.

    Precomputed features are read from ``gloss_fname + ".sentencebert.h5"``
    (dataset ``"features"``). Each sampled gloss is used as a query against
    all glosses; the query's own column is masked out, and the reported rank
    is the best position of any gloss that shares the query's BNid. Rank
    mean/std and hits@{1,3,5,10} are printed; nothing is returned.

        gloss_fname(str):               Name of file containing glosses for a specific language.
        bnid_fname(str):                Name of file containing BNids (e.g. unique indices) for each gloss in `gloss_fname`.
                                        Not read by this function.
        batch_size(int):                Number of queries scored per similarity call.
        all_gloss_sentences(list[str]): All glosses loaded from `gloss_fname`. Not read by this function.
        all_gloss_bnids:                All gloss BNids loaded from `bnid_fname`. Must support ``.shape``
                                        and element-wise ``==`` (i.e. a NumPy array).
        all_bnids(list[str]):           All BNids/nodes in Visualsem. Not read by this function.
    """
    # evaluate on random subset of (at most) 2000 glosses
    valid_idxs = numpy.random.permutation(all_gloss_bnids.shape[0])[:2000]
    n_examples = len(valid_idxs)
    print("... Number of examples randomly selected for evaluation: ", n_examples)

    # The features are copied into memory, so the file can be closed right away.
    with h5py.File(gloss_fname + ".sentencebert.h5", 'r') as fh_gloss:
        glosses_feats = torch.tensor(fh_gloss["features"][:])

    ranks_predicted = []
    for idxs_ in grouper(batch_size, valid_idxs):
        # presumably `grouper` pads the last group with None -- drop the padding
        idxs = [i for i in idxs_ if i is not None]

        queries = glosses_feats[idxs]
        scores = util.pytorch_cos_sim(queries, glosses_feats).cpu().numpy()

        same_bnid_minibatch = []
        for row, gloss_idx in enumerate(idxs):
            # query vs. itself is on the main diagonal. fill it with a large
            # negative value so the query never retrieves itself.
            scores[row, gloss_idx] = -10.0
            # ground truth: indices of every gloss sharing this gloss's BNid
            same_bnid = numpy.where(all_gloss_bnids == all_gloss_bnids[gloss_idx])[0]
            same_bnid_minibatch.append(same_bnid)

        # argsort is ascending; reverse each row to get high-to-low similarity
        ranks = numpy.argsort(scores)[:, ::-1]

        for row, same_bnid in enumerate(same_bnid_minibatch):
            # position of the best-ranked gloss with the same BNid
            rank_predicted = numpy.where(numpy.isin(ranks[row], same_bnid))[0][0]
            ranks_predicted.append(rank_predicted)

    ranks_predicted = numpy.array(ranks_predicted)
    print("... Rank mean/std: ", ranks_predicted.mean().item(), ranks_predicted.std().item())
    for k in [1, 3, 5, 10]:
        print("... Accuracy (hits@%i): %.2f%%" % (k, (ranks_predicted <= (k - 1)).sum() * 1.0 / ranks_predicted.shape[0] * 100))
def compare(self, claim: str, targets: list):
    """Score a single claim against every target.

    ``claim`` is wrapped in a one-element list and embedded with
    ``self.embed``; ``targets`` is embedded as given. The result of
    ``util.pytorch_cos_sim`` on the two embedding batches is returned
    unchanged.
    """
    return util.pytorch_cos_sim(self.embed([claim]), self.embed(targets))
def ent_sim(self, blob: Tensor, entity: EntityObject):
    """Return, as a float, the cosine similarity between ``blob`` and the encoded ``entity.text``."""
    encoded_entity = self.transformer.encode(entity.text, convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(blob, encoded_entity)
    return float(similarity)
def dist(x, y):
    """Return the first row of ``util.pytorch_cos_sim(x, y)``.

    Despite the name, the value comes from a cosine-similarity call, so
    larger means more similar.
    """
    similarity = util.pytorch_cos_sim(x, y)
    return similarity[0]
def get_cosine(query_embeddings):
    """Sum the first row of cosine scores between ``query_embeddings`` and ``ingredient_embedding``.

    Returns the sum as a Python number.
    """
    first_row = util.pytorch_cos_sim(query_embeddings, ingredient_embedding)[0]
    return first_row.cpu().sum().item()
# Scores of every corpus text for each topic field (query, question, narrative)
n_topics = len(list(topics.keys()))
n_documents = len(corpus)
topics_score = np.zeros((n_topics, n_documents))

# NOTE(review): the loop body appears to continue past the end of this excerpt.
for topic_idx, (n_topic, topic_data) in enumerate(
        tqdm(topics.items(), desc="Topic", position=0, leave=False)):
    # Pull the three text fields out of the topic
    query = topic_data["query"]
    question = topic_data["question"]
    narrative = topic_data["narrative"]

    # Corpus vs. topic query: encode, flatten to one vector, score, move to CPU
    query_embedding = torch.flatten(
        embedder.encode([query], convert_to_tensor=True, show_progress_bar=False),
        start_dim=0)
    cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
    query_cos_scores = cos_scores.cpu()

    # Corpus vs. topic question
    question_embedding = torch.flatten(
        embedder.encode([question], convert_to_tensor=True, show_progress_bar=False),
        start_dim=0)
    cos_scores = util.pytorch_cos_sim(question_embedding, corpus_embeddings)[0]
    question_cos_scores = cos_scores.cpu()

    # Corpus vs. topic narrative
    narrative_embedding = torch.flatten(
        embedder.encode([narrative], convert_to_tensor=True, show_progress_bar=False),
        start_dim=0)
    cos_scores = util.pytorch_cos_sim(narrative_embedding, corpus_embeddings)[0]
def rankDocuments(self, query):
    """Encode ``query`` with ``self.model`` and return its cosine scores against ``self.document`` as a NumPy array."""
    scores = util.pytorch_cos_sim(self.model.encode(query), self.document)
    return scores.numpy()
end = datetime.now() print("time : " + str(end - start)) # 검색 문장 임베딩 생성 start = datetime.now() test_encode = model.encode(test_sentence) end = datetime.now() print("time : " + str(end - start)) # 유사도 추출 start = datetime.now() cos_sim = util.pytorch_cos_sim(sent_encode, test_encode) end = datetime.now() print("time : " + str(end - start)) # 가장 유사한 5개 문장 start = datetime.now() with open("data/output/result.txt", "w", encoding="utf-8") as f: for i in range(len(test_sentence)): simil = [] for j in range(len(sentences)): simil.append( round(
positive_df = df.loc[df['class'] == 1].reset_index(drop=True) # print(f'# positives: {positive_df.shape[0]}') # compute expansion path_initial_labels = '/home/manuto/Documents/world_bank/bert_twitter_labor/twitter-labor-data/data/train_test/US/jan5_iter0/raw' seed_positive_df = pd.read_parquet( os.path.join(path_initial_labels, 'all_labels_with_text.parquet')) seed_positive_df = seed_positive_df.loc[ seed_positive_df[label] == 'yes'].reset_index(drop=True) seed_positive_tweet_list = seed_positive_df['text'].tolist() seed_positive_embeddings = model.encode(seed_positive_tweet_list, convert_to_tensor=True) positive_tweet_list = positive_df['text'].tolist() positive_embeddings = model.encode(positive_tweet_list, convert_to_tensor=True) cosine_scores = util.pytorch_cos_sim(positive_embeddings, seed_positive_embeddings) expansion_rate = (-torch.sum(cosine_scores) / (len(positive_tweet_list) * len(seed_positive_tweet_list))).item() # store results results_dict[inference_folder][label][ 'expansion_rate'] = expansion_rate # organize results results_df = pd.DataFrame.from_dict(results_dict) results_list = list() for inference_folder in inference_folder_dict[args.country_code]: results_iter_df = results_df[inference_folder].apply(pd.Series) iter_number = int(re.findall('iter_(\d)', inference_folder)[0]) results_iter_df['iter'] = iter_number results_list.append(results_iter_df) results_df = pd.concat(results_list)
def similarity_matrix(sentences):
    """Return the absolute pairwise cosine similarities of ``sentences``.

    The sentences are encoded with the module-level ``model``
    (sentence_transformers); the score matrix is moved to the CPU before
    the absolute value is taken.
    """
    vectors = model.encode(sentences, convert_to_tensor=True)
    pairwise = util.pytorch_cos_sim(vectors, vectors).cpu()
    return abs(pairwise)
def find_similar_sentences(
    self,
    query: str,
    cands: List[str],
    top_k: int = 5,
) -> Dict:
    """
    Rank candidate sentences by cosine similarity to a query sentence

    Args:
        query (str): query sentence to be acted as anchor
        cands (List[str]): candidate sentences to be compared
        top_k (int): maximum number of candidates to return (default 5)

    Returns:
        Dict: ``{"query": <stripped query>, "ranking": [...]}`` where each
        ranking entry is a tuple ``(candidate index, stripped candidate,
        score rounded to 2 decimals)``, best match first. The ranking is
        empty when ``cands`` is empty or ``top_k`` is not positive.

    Examples:
        >>> se = Pororo(task="sentence_embedding")
        >>> query = "He is the tallest person in the world"
        >>> cands = [
        >>>     "I hate this guy.",
        >>>     "You are so lovely!.",
        >>>     "Tom is taller than Jim."
        >>> ]
        >>> se.find_similar_sentences(query, cands)
        {
            'query': 'He is the tallest person in the world',
            'ranking': [(2, 'Tom is taller than Jim.', 0.49), (1, 'You are so lovely!.', 0.47), (0, 'I hate this guy.', 0.22)]
        }
        >>> se = Pororo(task="sentence_embedding", lang="ko")
        >>> query = "고양이가 창 밖을 바라본다"
        >>> cands = [
        >>>     "고양이가 카메라를 켠다",
        >>>     "남자와 여자가 걷고 있다",
        >>>     "고양이가 개를 만지려 하고 있다",
        >>>     "두 마리의 고양이가 창문을 보고 있다",
        >>>     "테이블 위에 앉아 있는 고양이가 창밖을 내다보고 있다",
        >>>     "창밖을 내다보는 고양이"
        >>> ]
        >>> se.find_similar_sentences(query, cands)
        {
            'query': '고양이가 창 밖을 바라본다',
            'ranking': [(5, '창밖을 내다보는 고양이', 0.93), (4, '테이블 위에 앉아 있는 고양이가 창밖을 내다보고 있다', 0.91), (3, '두 마리의 고양이가 창문을 보고 있다', 0.78), (0, '고양이가 카메라를 켠다', 0.74), (2, '고양이가 개를 만지려 하고 있다', 0.41)]
        }
        >>> se = Pororo(task="sentence_embedding", lang="ja")
        >>> query = "おはようございます"  # Good morning
        >>> cands = ["こんにちは", "失礼します", "こんばんは"]  # Hello | Please Excuse Me (for Leaving) | Good evening
        >>> se.find_similar_sentences(query, cands)
        {
            'query': 'おはようございます',
            'ranking': [(0, 'こんにちは', 0.58), (2, 'こんばんは', 0.48), (1, '失礼します', 0.27)]
        }
        >>> se = Pororo(task="sentence_embedding", lang="zh")
        >>> query = "欢迎光临"  # Welcome
        >>> cands = ["你好。", "你会说英语吗?", "洗手间在哪里?"]  # Hello | Do you speak English? | Where is the bathroom?
        >>> se.find_similar_sentences(query, cands)
        {
            'query': '欢迎光临',
            'ranking': [(0, '你好。', 0.53), (2, '洗手间在哪里?', 0.2), (1, '你会说英语吗?', 0.09)]
        }

    """
    # Nothing to rank: answer without touching the model.
    if not cands or top_k <= 0:
        return {
            "query": query.strip(),
            "ranking": [],
        }

    query_embedding = self._model.encode(query, convert_to_tensor=True)
    corpus_embeddings = self._model.encode(cands, convert_to_tensor=True)

    cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
    cos_scores = cos_scores.cpu()

    k = min(len(cos_scores), top_k)
    # Passing every position 0..k-1 as `kth` leaves the first k entries fully
    # ordered; negating the scores makes that order "highest score first".
    top_results = np.argpartition(-cos_scores, range(k))[0:k].tolist()

    ranking = [
        (idx, cands[idx].strip(), round(cos_scores[idx].item(), 2))
        for idx in top_results
    ]

    return {
        "query": query.strip(),
        "ranking": ranking,
    }
def cosine_sim(q1, q2):
    """Return the cosine similarity of two texts as a Python float.

    Each text is passed through ``clean_text`` and encoded on its own with
    ``sbert_model``; the single entry of the resulting 1x1 score matrix is
    returned.
    """
    first, second = (
        sbert_model.encode([clean_text(text)], convert_to_tensor=True)
        for text in (q1, q2)
    )
    return util.pytorch_cos_sim(first, second)[0][0].item()
def BertEM(path_train, path_valid, path_test, path_error, epochs_num, warmup_steps_num, evaluation_steps_num):
    """Fine-tune a sentence-pair model and sweep cosine thresholds on the test set.

    The three CSV files must provide the columns ``text_a``, ``text_b`` and
    ``label``. The model is trained with a cosine-similarity loss and
    evaluated during training with a binary-classification evaluator on the
    validation pairs. Afterwards every test pair is scored; pairs whose
    prediction at a fixed 0.80 cut-off disagrees with the label are written
    to ``path_error``, and precision/recall/F1 are printed for 38 thresholds
    starting at 0.20 in steps of 0.02.

    Args:
        path_train: CSV with the training pairs.
        path_valid: CSV with the validation pairs.
        path_test: CSV with the test pairs.
        path_error: Output CSV for the misclassified test pairs.
        epochs_num: Number of training epochs passed to ``model.fit``.
        warmup_steps_num: Warm-up steps passed to ``model.fit``.
        evaluation_steps_num: Evaluation interval passed to ``model.fit``.

    Returns:
        None. Results are printed and the error CSV is written.
    """
    # Progress-bar module alias
    bar = progressbar

    # Model definition
    # NOTE(review): the device is hard-coded to 'cuda:2'.
    model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens', device='cuda:2')

    data_type = {"text_a": str, "text_b": str}
    train_data = pd.read_csv(path_train, encoding='utf-8', dtype=data_type)
    valid_data = pd.read_csv(path_valid, encoding='utf-8', dtype=data_type)
    test_data = pd.read_csv(path_test, encoding='utf-8', dtype=data_type)

    # Training set
    train_examples = []
    for i in bar.progressbar(range(len(train_data))):
        row = train_data.iloc[i]
        train_examples.append(
            InputExample(texts=[str(row['text_a']), str(row['text_b'])],
                         label=float(row['label'])))

    # Validation set
    sentence_a = []
    sentence_b = []
    label_valid = []
    for i in bar.progressbar(range(len(valid_data))):
        row = valid_data.iloc[i]
        label_valid_t = float(row['label'])
        sentence_a.append(row['text_a'])
        sentence_b.append(row['text_b'])
        label_valid.append(label_valid_t)

    # Evaluator
    evaluator = evaluation.BinaryClassificationEvaluator(sentence_a, sentence_b, label_valid)

    # Dataset, data loader and loss function
    train_dataset = SentencesDataset(train_examples, model)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=64)
    train_loss = losses.CosineSimilarityLoss(model)

    # Time the training run. time.clock() was removed in Python 3.8;
    # perf_counter() is the portable replacement for measuring elapsed time.
    start_time = time.perf_counter()
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              epochs=epochs_num,
              warmup_steps=warmup_steps_num,
              evaluator=evaluator,
              evaluation_steps=evaluation_steps_num,
              use_amp=True)
    end_time = time.perf_counter()

    # ========================= evaluation =========================
    # Text columns are turned into str. The frame read above is reused so the
    # test split is parsed with the same dtype as the training split.
    test_data['text_a'] = test_data['text_a'].map(str)
    test_data['text_b'] = test_data['text_b'].map(str)

    # One four-slot counter list per threshold; `statistics_pred` updates
    # them in place.
    # NOTE(review): the slots are presumably confusion-matrix counts; confirm
    # against `statistics_pred` / `compute_score`.
    list_num = 38
    prefix = 'pred_list_'
    test_map = {prefix + str(i): [0, 0, 0, 0] for i in range(list_num)}
    score = 0.20

    # Rows for the misclassified pairs, turned into a DataFrame after the loop
    # (DataFrame.append no longer exists in pandas 2.x).
    error_rows = []

    # Score the test set
    for i in bar.progressbar(range(len(test_data))):
        row = test_data.iloc[i]
        text_a_embedding = model.encode(row['text_a'], convert_to_tensor=True)
        text_b_embedding = model.encode(row['text_b'], convert_to_tensor=True)
        cos_scores = util.pytorch_cos_sim(text_a_embedding, text_b_embedding)[0]
        cos_scores = cos_scores.cpu()

        label = int(row['label'])

        # Record the pairs misclassified at the fixed 0.80 cut-off
        pred_test = 1 if cos_scores.item() >= 0.80 else 0
        if pred_test != label:
            error_rows.append({'id': i,
                               'text_a': row['text_a'],
                               'text_b': row['text_b'],
                               'cos_scores': cos_scores.item()})

        # Update the per-threshold counters
        statistics_pred(score, label, cos_scores, prefix, test_map)

    # NOTE(review): in the collapsed source it is ambiguous whether this write
    # was active or commented out; it is kept active because `path_error` has
    # no other use.
    error_csv = pd.DataFrame(error_rows, columns=['id', 'text_a', 'text_b', 'cos_scores'])
    error_csv.to_csv(path_error, index=0)

    max_f1 = 0
    target_threshold = 0.01
    target_precision = 0.01
    target_recall = 0.01
    threshold = 0.20

    # Report the scores for every threshold and remember the best F1
    for i in range(list_num):
        counts = test_map[prefix + str(i)]
        precision, recall, f1 = compute_score(counts[0], counts[1], counts[2], counts[3])
        if f1 >= max_f1:
            max_f1 = f1
            target_threshold = threshold
            target_precision = precision
            target_recall = recall
        print('The score > {} result is precision: {}, | recall:{}, | f1: {}'.format(round(threshold, 2), precision, recall, f1))
        threshold += 0.02

    # Final summary. The original referenced an undefined name here; the test
    # file path is reported since that is the data the scores refer to.
    print('================dataset_name==================', path_test)
    print('================threshold:{}, target_precision:{}, target_recall:{}, max_f1:{}'.format(target_threshold, target_precision, target_recall, max_f1))
    print('================train_time:{}'.format(str(end_time - start_time)))
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1):
    """Run the image-text retrieval evaluation and return ``self.mR``.

    Images (together with ``self.tags``) and captions are encoded with
    ``model``. For every image the best-scoring captions, and for every
    caption the best-scoring images, are collected by cosine similarity and
    handed to ``self._eval_i2t`` / ``self._eval_t2i``. Results are then
    averaged, logged and written through the corresponding ``self._*``
    helpers.

    Args:
        model: Encoder exposing ``encode(sentences=..., images=..., ...)``.
        output_path: Forwarded to ``self._write_output``.
        epoch: Epoch number used in the log line and output (-1: not given).
        steps: Step number used in the log line and output (-1: not given).

    Returns:
        The value of ``self.mR`` after ``self._average_results()`` has run.
    """
    if epoch != -1:
        out_txt = " after epoch {}:".format(epoch) if steps == -1 else " in epoch {} after {} steps:".format(epoch, steps)
    else:
        out_txt = ":"

    logging.info("Image Text Retrieval Evaluation on " + self.name + " dataset" + out_txt)

    # Reset the per-metric accumulators
    self.i2t_num_hits_at_k = {k: [] for k in self.recall_at_k}
    self.i2t_MRR = {k: [] for k in self.mrr_at_k}
    self.i2t_ndcg = {k: [] for k in self.ndcg_at_k}
    self.i2t_AveP_at_k = {k: [] for k in self.map_at_k}
    self.t2i_num_hits_at_k = {k: [] for k in self.recall_at_k}
    self.t2i_MRR = {k: [] for k in self.mrr_at_k}
    self.t2i_ndcg = {k: [] for k in self.ndcg_at_k}
    self.t2i_AveP_at_k = {k: [] for k in self.map_at_k}
    self.mR = 0

    max_k = max(max(self.mrr_at_k), max(self.ndcg_at_k), max(self.recall_at_k), max(self.map_at_k))

    time_start = time.time()
    image_embeddings = model.encode(sentences=self.tags, images=self.images, show_progress_bar=self.show_progress_bar, batch_size=self.batch_size, convert_to_tensor=True)
    caption_embeddings = model.encode(sentences=self.captions, show_progress_bar=self.show_progress_bar, batch_size=self.batch_size, convert_to_tensor=True)
    delta_time = time.time() - time_start

    i2t_result_list = [[] for _ in range(len(image_embeddings))]
    t2i_result_list = [[] for _ in range(len(caption_embeddings))]

    def collect_i2t(scores, offset):
        # scores: (images in this slice, all captions); top-k over captions
        k = max(min(max_k, scores.shape[1] - 1), 0)
        top_values, top_idx = torch.topk(scores, k, dim=1, largest=True, sorted=False)
        for query_itr, (idx_row, value_row) in enumerate(zip(top_idx.tolist(), top_values.tolist())):
            for captions_id, score in zip(idx_row, value_row):
                i2t_result_list[offset + query_itr].append({'captions_id': self.captions_ids[captions_id], 'score': score})

    def collect_t2i(scores, offset):
        # scores: (all images, captions in this slice); top-k over images.
        # k is bounded by the image axis, which is the axis topk runs over;
        # bounding it by the caption count yields no results for a final
        # batch that holds a single caption.
        k = max(min(max_k, scores.shape[0] - 1), 0)
        top_values, top_idx = torch.topk(scores, k, dim=0, largest=True, sorted=False)
        idx_rows = top_idx.permute(1, 0).tolist()
        value_rows = top_values.permute(1, 0).tolist()
        for query_itr, (idx_row, value_row) in enumerate(zip(idx_rows, value_rows)):
            for image_id, score in zip(idx_row, value_row):
                t2i_result_list[offset + query_itr].append({'image_id': self.images[image_id], 'score': score})

    # Image to Text
    time_start = time.time()
    if self.batched_sim > 0:
        for start_idx in tqdm(range(0, len(image_embeddings), self.batched_sim), desc="Similarity"):
            cos_scores = pytorch_cos_sim(image_embeddings[start_idx:start_idx + self.batched_sim], caption_embeddings)
            collect_i2t(cos_scores, start_idx)
    else:
        # Full similarity matrix; reused below for the text-to-image direction
        cos_scores = pytorch_cos_sim(image_embeddings, caption_embeddings)
        collect_i2t(cos_scores, 0)
    delta_time = time.time() - time_start + delta_time

    self._eval_i2t(i2t_result_list)

    # Text to Image
    time_start = time.time()
    if self.batched_sim > 0:
        for start_idx in tqdm(range(0, len(caption_embeddings), self.batched_sim), desc="Similarity"):
            cos_scores = pytorch_cos_sim(image_embeddings, caption_embeddings[start_idx:start_idx + self.batched_sim])
            collect_t2i(cos_scores, start_idx)
    else:
        collect_t2i(cos_scores, 0)
    delta_time = time.time() - time_start + delta_time

    self._eval_t2i(t2i_result_list)
    self._average_results()
    self._log_results(delta_time)
    self._write_output(output_path, epoch, steps)

    return self.mR
sent_token_list.append(sent_token) token_group = group_list[ind_1:(ind_1 + len(sent_token))] real_group_vec.append(int(max(set(token_group), key=token_group.count))) ind_1 = ind_1 + len(sent_token) sent_weights.append(len(sent_token)) sent_weights = np.array(sent_weights) / sum(sent_weights) real_group_vec = np.array(real_group_vec) n_groups = len(set(real_group_vec)) # Load sentence model sbert_model = SentenceTransformer("all-mpnet-base-v2") # Make the sentence vectors sentence_embeddings = sbert_model.encode(sent_list) # Make sim matrix sim_mat = np.array( util.pytorch_cos_sim(sentence_embeddings, sentence_embeddings)) # Compute the dissimilarity matrix d_ext_mat = similarity_to_dissimilarity(sim_mat, dist_option=dist_option) # Compute the exchange and transition matrices exch_mat, w_mat = exchange_and_transition_matrices(len(sent_list), exch_mat_opt=exch_mat_opt, exch_range=exch_range) # Compute the membership matrix result_matrix = spatial_clustering(d_ext_mat=d_ext_mat, exch_mat=exch_mat, w_mat=w_mat, n_groups=n_groups, alpha=alpha,
def __call__(self, models, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
    """Run retrieve-and-rerank image-text evaluation and return ``self.mR``.

    A bi-encoder embeds images (with ``self.tags``) and captions; the top
    candidates per query are selected by cosine similarity and then
    re-scored with a cross-encoder according to ``self.scoring``. This is
    done for image-to-text and for text-to-image, and the result lists go to
    ``self._eval_i2t`` / ``self._eval_t2i``.

    Args:
        models: Either ``[biencoder, crossencoder]`` or a single model that
            is used in both roles.
        output_path: Forwarded to ``self._write_output``.
        epoch: Epoch number used in the log line and output (-1: not given).
        steps: Step number used in the log line and output (-1: not given).

    Returns:
        The value of ``self.mR`` after ``self._average_results()`` has run.
    """
    if epoch != -1:
        out_txt = " after epoch {}:".format(epoch) if steps == -1 else " in epoch {} after {} steps:".format(epoch, steps)
    else:
        out_txt = ":"

    if isinstance(models, list):
        biencoder, crossencoder = models
    else:
        biencoder = models
        crossencoder = models

    logging.info("Image Text Retrieval Evaluation on " + self.name + " dataset" + out_txt)

    max_k = max(self.retrieve, max(self.mrr_at_k), max(self.ndcg_at_k), max(self.recall_at_k), max(self.map_at_k))

    time_start = time.time()
    # Compute embedding for the images
    image_embeddings = biencoder.encode(sentences=self.tags, images=self.images, show_progress_bar=self.show_progress_bar, batch_size=self.batch_size, convert_to_tensor=True)
    # Compute embedding for the captions
    caption_embeddings = biencoder.encode(sentences=self.captions, show_progress_bar=self.show_progress_bar, batch_size=self.batch_size, convert_to_tensor=True)

    # Init score computation values
    self.i2t_num_hits_at_k = {k: [] for k in self.recall_at_k}
    self.i2t_MRR = {k: [] for k in self.mrr_at_k}
    self.i2t_ndcg = {k: [] for k in self.ndcg_at_k}
    self.i2t_AveP_at_k = {k: [] for k in self.map_at_k}
    self.t2i_num_hits_at_k = {k: [] for k in self.recall_at_k}
    self.t2i_MRR = {k: [] for k in self.mrr_at_k}
    self.t2i_ndcg = {k: [] for k in self.ndcg_at_k}
    self.t2i_AveP_at_k = {k: [] for k in self.map_at_k}
    self.mR = 0

    i2t_result_list = [[] for _ in range(len(image_embeddings))]
    t2i_result_list = [[] for _ in range(len(caption_embeddings))]

    # Compute cosine similarities: rows are images, columns are captions
    cos_scores = pytorch_cos_sim(image_embeddings, caption_embeddings).cpu()

    # ---------------- Image to Text ----------------
    # topk cannot return more entries than there are captions
    i2t_values, i2t_idx = torch.topk(cos_scores, min(max_k, cos_scores.shape[1]), dim=1, largest=True, sorted=False)

    images = []
    captions = []
    assign = []  # (query row, position within that row's top-k)
    # NOTE(review): if any image is skipped by the filter below, the
    # reshape-based scoring modes no longer line up with the rows.
    for i in range(len(self.images)):
        if self.images[i] in self.imageid2captions:
            for c, c_idx in enumerate(i2t_idx[i]):
                images.append(self.images[i])
                if self.tags:
                    captions.append([self.captions[c_idx], self.tags[i]])
                else:
                    captions.append(self.captions[c_idx])
                assign.append((i, c))

    self._rescore_top_k(crossencoder, i2t_values, captions, images, assign, len(self.images))
    delta_time = time.time() - time_start

    for query_itr in range(len(i2t_values)):
        for captions_id, score in zip(i2t_idx[query_itr], i2t_values[query_itr]):
            i2t_result_list[query_itr].append({'captions_id': self.captions_ids[captions_id], 'score': score})

    self._eval_i2t(i2t_result_list)

    # ---------------- Text to Image ----------------
    time_start = time.time()
    # topk runs over the image axis here, so bound k by the number of images
    t2i_values, t2i_idx = torch.topk(cos_scores, min(max_k, cos_scores.shape[0]), dim=0, largest=True, sorted=False)
    # One row per caption
    t2i_values = t2i_values.permute(1, 0)
    t2i_idx = t2i_idx.permute(1, 0)

    images = []
    captions = []
    assign = []  # (query row, position within that row's top-k)
    # NOTE(review): caption ids are looked up in `imageid2captions` here --
    # confirm this is the intended filter.
    for c in range(len(self.captions)):
        if self.captions_ids[c] in self.imageid2captions:
            for i, i_idx in enumerate(t2i_idx[c]):
                images.append(self.images[i_idx])
                if self.tags:
                    captions.append([self.captions[c], self.tags[i_idx]])
                else:
                    captions.append(self.captions[c])
                assign.append((c, i))

    # Queries are captions in this direction, so the per-row modes must walk
    # len(self.captions) rows.
    self._rescore_top_k(crossencoder, t2i_values, captions, images, assign, len(self.captions))
    delta_time = time.time() - time_start + delta_time

    for query_itr in range(len(t2i_values)):
        for image_id, score in zip(t2i_idx[query_itr], t2i_values[query_itr]):
            t2i_result_list[query_itr].append({'image_id': self.images[image_id], 'score': score})

    self._eval_t2i(t2i_result_list)
    self._average_results()
    self._log_results(delta_time)
    self._write_output(output_path, epoch, steps)

    return self.mR


def _rescore_top_k(self, crossencoder, top_k_values, captions, images, assign, n_queries):
    """Overwrite ``top_k_values`` in place with cross-encoder based scores.

    Args:
        crossencoder: Model whose ``encode`` returns one logit row per pair.
        top_k_values: Tensor with one row of bi-encoder scores per query.
        captions: Caption inputs, one per candidate pair.
        images: Image inputs, aligned with ``captions``.
        assign: ``(query row, position in row)`` for every candidate pair.
        n_queries: Number of rows used by the per-row scoring modes.
    """
    logits = crossencoder.encode(captions, images, batch_size=self.batch_size, output_value="logits", show_progress_bar=True, convert_to_tensor=True)
    factor = self.scoring_factor

    if self.scoring == "combination":
        # Weighted mix of cross-encoder probability and embedding score
        sims = torch.softmax(logits, 1)[:, 1]
        for s, (row, pos) in zip(sims, assign):
            top_k_values[row][pos] = (1 - factor) * s + factor * top_k_values[row][pos]
    elif self.scoring == "combination_normalized":
        sims = torch.softmax(logits, 1)[:, 1].reshape(n_queries, -1)
        for row in range(n_queries):
            # NOTE(review): the embedding scores are shifted by |min|, not by
            # min as the cross-encoder scores are; kept as in the original.
            emb_min_abs = torch.abs(torch.min(top_k_values[row]))
            emb_score = (top_k_values[row] - emb_min_abs) / (torch.max(top_k_values[row]) - emb_min_abs)
            sims_score = (sims[row] - torch.min(sims[row])) / (torch.max(sims[row]) - torch.min(sims[row]))
            top_k_values[row] = (1 - factor) * sims_score + factor * emb_score
    elif self.scoring == "rank":
        sims = logits[:, 1].reshape(n_queries, -1)
        for row in range(n_queries):
            # NOTE(review): argsort yields sorting indices, and the weights
            # are swapped relative to the other modes; kept as in the original.
            emb_rank = torch.argsort(top_k_values[row]).float()
            sim_rank = torch.argsort(sims[row]).float()
            top_k_values[row] = (1 - factor) * emb_rank + factor * sim_rank
    else:
        # Plain re-ranking: the cross-encoder logit replaces the embedding score
        sims = logits[:, 1]
        for s, (row, pos) in zip(sims, assign):
            top_k_values[row][pos] = s
def dist_w(x, y):
    """Return row 0 of the cosine-similarity matrix between ``en(x)`` and ``en(y)``.

    Both arguments are passed through the module-level ``en`` helper before
    being compared with ``util.pytorch_cos_sim``.
    """
    encoded_x = en(x)
    encoded_y = en(y)
    similarity_matrix = util.pytorch_cos_sim(encoded_x, encoded_y)
    return similarity_matrix[0]
def __getitem__(self, idx):
    """Return the answers and the sentences most relevant to the item's question.

    The question text is chosen from the ``QuesID`` suffix and concatenated
    with the record title. Every sentence of the record is then scored
    against that query with the strategy named by ``self.method``
    (``'sbert'``, ``'tfidf'`` or ``'bm25'``) and the best
    ``self.max_n_sent`` sentences are kept, highest score first.

    Parameters
    ----------
    idx : int
        Index into ``self.dat``.

    Returns
    -------
    tuple
        ``(answers, sent_ls)`` where ``sent_ls`` is the ranked list of
        selected sentences.

    Raises
    ------
    ValueError
        If ``self.method`` is not one of the supported strategies.
    """
    record = self.dat[idx]
    QuesID = record['QuesID']
    title = record['title']
    answers = record['answer']
    sents = record['sentences']

    if QuesID.split('_')[1] == 'a':
        ques = 'What is the method of induction of disease model?'
    else:
        ques = 'What is the intervention?'
    ques_new = ques + ' ' + title

    def select_top(scores):
        # Stable descending sort on the score, so ties keep sentence order.
        # Slicing already copes with fewer than max_n_sent sentences.
        order = sorted(range(len(scores)), key=lambda pos: scores[pos], reverse=True)
        return [sents[pos] for pos in order[:self.max_n_sent]]

    if self.method == 'sbert':
        # Sentence-BERT similarity between every sentence and the query.
        sent_embeds = self.model.encode(sents, convert_to_tensor=True)
        ques_embed = self.model.encode([ques_new], convert_to_tensor=True)
        cosine_scores = util.pytorch_cos_sim(sent_embeds, ques_embed)
        sent_ls = select_top(cosine_scores[:, 0].tolist())
    elif self.method == 'tfidf':
        # TF-IDF similarity; the query is appended so it shares the
        # vocabulary and IDF statistics with the sentences.
        # ngram_range=(1, 1) uses unigrams; (1, 2) or (2, 2) would give the
        # bigram variants.
        sents_ques = sents + [ques_new]
        vectorizer = CountVectorizer(ngram_range=(1, 1), tokenizer=spacy_tokenizer, min_df=1)
        sents_ques_vec = tfidf_transformer.fit_transform(
            vectorizer.fit_transform(sents_ques)).toarray()
        sents_vec = torch.from_numpy(sents_ques_vec[:-1]).float()
        ques_vec = torch.from_numpy(sents_ques_vec[-1]).float()
        cosine_scores = util.pytorch_cos_sim(sents_vec, ques_vec)
        sent_ls = select_top(cosine_scores[:, 0].tolist())
    elif self.method == 'bm25':
        sent_tokens = [text2tokens(s) for s in sents]
        ques_tokens = text2tokens(ques_new)
        bm25 = BM25(sent_tokens)
        sent_ls = select_top(bm25.get_scores(ques_tokens))
    else:
        # Previously this fell through to an UnboundLocalError on sent_ls.
        raise ValueError(
            "Unknown sentence selection method: {!r}".format(self.method))

    return answers, sent_ls
def choose_top_n(self, claim, targets, n, concat=True, cosine_sim=True, pad=0):
    """
    Chooses top n sentences from targets using cosine similarity to claim

    Parameters
    ----------
    claim : str
        Input claim to match against targets
    targets : list
        List of target sentences
    n : int
        Number of sentences to return
    concat : bool
        If true, the claim embedding is concatenated in front of each of
        the selected target vectors
    cosine_sim : bool
        If true, the cosine similarity is concatenated with each of the
        selected target vectors
    pad : int
        If non-zero, adds rows filled with `pad` until the output has `n`
        rows. When `targets` is empty, an `n`-row tensor filled with `pad`
        is returned regardless of its value.

    Returns
    -------
    tuple
        `(result, indices)`. `result` has one row per selected target and
        its columns are, in order: the claim embedding (if `concat`), the
        cosine similarity (if `cosine_sim`), the target embedding.
        `indices` is a 1-D tensor with the positions of the selected
        targets in `targets` (an empty tensor when `targets` is empty).
    """
    embedded = self.embedder.embed([claim] + targets)
    claim = embedded[0]

    emb_dim = claim.shape[-1]
    # Row width implied by the flags; used so the empty-targets padding has
    # the same width as a regular result.
    width = emb_dim * (2 if concat else 1) + (1 if cosine_sim else 0)

    # If we have an empty targets list, just return back padding
    if not targets:
        empty = torch.full(size=(n, width), fill_value=pad,
                           dtype=claim.dtype, device=claim.device)
        return empty, torch.Tensor([])

    targets = embedded[1:]
    k = min(len(targets), n)
    cosines = util.pytorch_cos_sim(claim, targets)

    # filter the targets/scores with top_indices
    top_k = torch.topk(cosines, k)
    # Drop only the leading query axis: a bare squeeze() would also remove
    # the row axis when k == 1 and break the concatenation below.
    top_indices = top_k.indices.squeeze(0)
    top_cosines = top_k.values.T
    top_targets = targets[top_indices]

    # Assemble the requested columns; with both flags off this is just the
    # selected target embeddings.
    parts = []
    if concat:
        parts.append(claim.unsqueeze(0).expand(k, -1))
    if cosine_sim:
        parts.append(top_cosines)
    parts.append(top_targets)
    result = torch.cat(parts, dim=-1)

    if pad != 0 and k < n:
        add = n - k
        # Match dtype/device of the result so the concatenation is valid.
        empty = torch.full(size=(add, result.shape[-1]), fill_value=pad,
                           dtype=result.dtype, device=result.device)
        result = torch.cat([result, empty], dim=0)

    return result, top_indices
def f_filter(tokenizer, model, device, data_loader, id2sent_dict):
    """Keep, per sample, the candidates that closely match some review sentence.

    For every sample the candidate and review sentence ids are mapped to text
    through ``id2sent_dict`` and embedded with ``get_sentence_embed``. A
    candidate becomes a target when its cosine similarity to at least one
    review sentence exceeds 0.9. At most the first 12000 candidates of a
    sample are considered.

    Parameters
    ----------
    tokenizer, model, device
        Forwarded unchanged to ``get_sentence_embed``.
    data_loader
        Iterable of batches; each batch is an iterable of dicts with the
        keys ``"user"``, ``"item"``, ``"candidate"`` and ``"review"``.
    id2sent_dict
        Mapping from sentence id to sentence text.

    Returns
    -------
    list of dict
        One dict per sample with the keys ``"user"``, ``"item"``,
        ``"target"`` (the retained candidate ids) and ``"review"``.
    """
    sim_threshold = 0.9
    max_candidates = 12000
    large_batch_size = 4000
    small_batch_num = 4

    print("data_loader", len(data_loader))

    new_data = []
    for iter_idx, data in enumerate(data_loader, start=1):
        if iter_idx % 50 == 0:
            print("iter_idx", iter_idx)

        for data_i in data:
            candidate_i = data_i["candidate"]
            review_i = data_i["review"]

            total_candidates = len(candidate_i)
            candidate_num_i = min(total_candidates, max_candidates)

            target_i = []
            if candidate_num_i > 0 and len(review_i) > 0:
                if total_candidates > max_candidates:
                    batch_size = large_batch_size
                else:
                    # Ceiling division: floor division here used to drop
                    # the trailing candidates (and produced empty batches
                    # for fewer than four candidates).
                    batch_size = (candidate_num_i + small_batch_num - 1) // small_batch_num

                candidate_embed_i = []
                for start in range(0, candidate_num_i, batch_size):
                    stop = min(start + batch_size, candidate_num_i)
                    candidate_sentence_batch_i = [
                        id2sent_dict[candidate_i[candidate_idx]]
                        for candidate_idx in range(start, stop)
                    ]
                    candidate_embed_i.append(get_sentence_embed(
                        tokenizer, model, candidate_sentence_batch_i, device))
                candidate_embed_i = torch.cat(candidate_embed_i, dim=0)

                review_sentence_i = [id2sent_dict[review_ik] for review_ik in review_i]
                review_embed_i = get_sentence_embed(tokenizer, model, review_sentence_i, device)

                cos_scores = util.pytorch_cos_sim(candidate_embed_i, review_embed_i)

                # A candidate is kept when any review sentence clears the
                # threshold.
                mask = cos_scores > sim_threshold
                mask_sum = torch.sum(mask, dim=1)
                nonzero_index = torch.nonzero(mask_sum).squeeze(1).cpu().numpy()

                target_i = list(np.array(candidate_i)[nonzero_index])

            new_data.append({
                "user": data_i["user"],
                "item": data_i["item"],
                "target": target_i,
                "review": review_i,
            })

    return new_data
def GetSimilar(self, ljtext='', simtype='COS'):
    """Print and return the corpus entries most similar to ``ljtext``.

    The text is cleaned by ``doRule``, encoded with ``self.embedder`` and
    compared against ``self.corpus_embeddings`` with the metric selected by
    ``simtype``. The best ``self.top_k`` entries are ranked; at most five of
    them are printed and returned.

    Parameters
    ----------
    ljtext : str
        Query text.
    simtype : str
        One of ``'COS'``, ``'DIS'``, ``'DPS'``, ``'BS'``, ``'MD'``, ``'JS'``.

    Returns
    -------
    list or str
        The selected ``self.corpusList`` entries, best first. For an
        unrecognised ``simtype`` the unchanged ``ljtext`` is returned.
    """
    import re

    # simtype -> (name of the scoring helper on ``util``, higher is better?)
    # NOTE(review): 'MD' (Manhattan distance) is ranked with the largest
    # values first, exactly as before; if pytorch_Manhattan_Dis returns a
    # distance this should probably be False like 'DIS' - confirm.
    metrics = {
        'COS': ('pytorch_cos_sim', True),
        'DIS': ('pytorch_Dis', False),
        'DPS': ('pytorch_DPS', True),
        'BS': ('pytorch_BS', True),
        'MD': ('pytorch_Manhattan_Dis', True),
        'JS': ('pytorch_Jaccard_Sim', True),
    }

    def doRule(Situation):
        """Strip noise from the text and drop uninformative tokens."""
        # The patterns are regular expressions, so they must go through
        # re.sub; str.replace treated them as literal text and never matched.
        # NOTE(review): if the corpus embeddings were built from text cleaned
        # by an equivalent (unfixed) rule, re-encode the corpus as well.
        my_regex = r"[\(\(]([^一二三四五六七八九零十1234567890]{1}[^\)\)]+|[一二三四五六七八九零十1234567890]{1}[^\)\)]{1,})[\)\)]"
        Situation = re.sub(my_regex, "", Situation)
        # Placeholder circles
        regex1 = r"○"
        Situation = re.sub(regex1, "", Situation)
        # Runs of ten digits
        regex2 = r"\d{10}"
        Situation = re.sub(regex2, "", Situation)
        # Licence plates
        regex3 = r"[A-Z0-9]{2,3}\-[A-Z0-9]{3,4}"
        Situation = re.sub(regex3, "", Situation)

        Source = Situation
        try:
            # Skip words listed in self.dt_idf and words tagged 'm', 't' or
            # 'c' (presumably jieba part-of-speech flags - confirm).
            kept = [
                w for w, l in pseg.cut(Situation)
                if w not in self.dt_idf and l not in ('m', 't', 'c')
            ]
            Situation = ''.join(kept)
        except Exception:
            # Best effort: fall back to the regex-cleaned text when
            # segmentation fails.
            Situation = Source
        return Situation

    if simtype not in metrics:
        return ljtext

    queries_ori = ljtext
    query = doRule(ljtext)
    top_k = self.top_k

    tStart2 = time.time()
    query_embedding = self.embedder.encode(query, convert_to_tensor=True)

    scorer_name, higher_is_better = metrics[simtype]
    cos_scores = getattr(util, scorer_name)(query_embedding, self.corpus_embeddings)
    if simtype == 'COS':
        # pytorch_cos_sim returns a matrix; take the row of the single query.
        cos_scores = cos_scores[0]
    cos_scores = cos_scores.cpu()

    # We use np.argpartition, to only partially sort the top_k results
    ranking_key = -cos_scores if higher_is_better else cos_scores
    top_results = np.argpartition(ranking_key, range(top_k))[0:top_k]

    print("\n\n\n\n")
    # Echo the query in 50-character chunks; only the first chunk is labelled.
    for qlen in range(0, len(queries_ori), 50):
        print(1 if qlen == 0 else '', "Query:" if qlen == 0 else '\t', queries_ori[qlen:qlen + 50])

    print('')
    SimilarList = []
    for no, idx in enumerate(top_results[0:top_k], start=1):
        entry = self.corpusList[idx]
        SimilarList.append(entry)
        print(no, "  ", entry.id, entry.no, "\t", entry.reason,
              "\t(Score: %.4f)" % (cos_scores[idx]))
        situation = entry.Situation.strip()
        for qlen in range(0, len(situation), 50):
            print(situation[qlen:qlen + 50])
        print('\n')
        # Never report more than five results, whatever top_k is.
        if no >= 5:
            break

    tEnd2 = time.time()  # stop timing
    print("COS相似計算時間", "%f sec" % (tEnd2 - tStart2))
    return SimilarList
def compare_rels(lang_edge, img_edge):
    """Return the cosine similarity between the ``rel`` attributes of two edges.

    Each ``rel`` is encoded with the module-level ``model`` and the pair is
    compared with ``util.pytorch_cos_sim``; its result is returned as-is.
    """
    rel_embeddings = [
        model.encode(edge.rel, convert_to_tensor=True)
        for edge in (lang_edge, img_edge)
    ]
    return util.pytorch_cos_sim(*rel_embeddings)
"This framework provides an easy method to compute dense vector representations for sentences and paragraphs (also known as sentence embeddings). The models are based on transformer networks like BERT / RoBERTa / XLM-RoBERTa etc. and are tuned specificially meaningul sentence embeddings such that sentences with similar meanings are close in vector space.", "Notwithstanding the order provided by the CoI Framework, perhaps the main reason that the framework was widely adopted is the methodological guidelines for measuring each of the presences that constituted a community of inquiry. The first of these presences that required rigorous definition and operational rigor was social presence. Extending the original socio-emotional perspective, social presence is most recently defined as the ability of participants to identify with the community (e.g., course of study), communicate purposefully in a trusting environment, and develop inter-personal relationships by way of projecting their individual personalities", "Another scenario needs to be considered when scrutinizing the Teaching Presence construct. Much as the more general construct of Presence in an online learning environment can be explained more in depth by separating out Teaching, Social, and Cognitive subfactors, it may be that the Teaching Presence construct's potential bifurcation reflects a strength, and not necessarily a weakness in the subscale's construction. That is, since this factor represents a greater chunk of the total variance, results may simply be pointing to the Teaching Presence subscale itself having two or more subscales. At this early stage of development of measures to operationalize the CoI framework it is important not to assume that a subscale's multidimensionality is necessarily a weakness. Further studies conducted with larger samples and within other contexts will help clarify this issue." 
] corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True) # Query sentences: queries = [ 'The second part of my project relates to the research based on social learning in VR. I propose a second mode of giving peer feedback - through a MUVE called Hubs \citep{hubs} (which is VR ready but only requires a the use of a web browser and can even run on mobile). Peer reviews groups would be formed from the working groups, with a small number of random links outside the clusters as well. The emergent network will be evaluated based on 2 groups - those who take part in the new mode of peer review and those who do not. We should be able to model the networks from interactions on the Piazza working groups and deduce whether VR peer review sessions have any effect on emergent leadership, network centrality and strength of network links (trust) to name a few. Quantitative analysis will also be undertaken on pre \& post surveys given to students about their perception of the value of peer feedback, again split by whether or not they participated in the VR sessions.', 'If all goes well, at the end of the research project we should have ample motivation to continue to build a full scale social VR campus. For CS6460 the motivation for students to use it would be the peer feedback focus groups, but for other courses a similar peer feedback system might need to be implemented or another student motivation found to encourage the use of the platform. In any case, as leaders emerge and the VR campus network grows, I would expect moderators to appear who will be enthusiastic in mediating focus groups for their respective courses. 
Luckily in Hubs, moderator super privileges are easily controlled a linked through discord server, and the server side of the campus is hosted on AWS, so the capabilities of scaling the VR campus up significantly are already in place.', 'In order to seed the network, I would like to learn a set of embeddings that represent each students research interests from their research logs. Clustering based on the cosine similarity of these embeddings will create a semantic social network of topics being investigated in the class and within each cluster will be students with a high degree of homophily. I would encourage these clusters to form working groups where they can easily share their research and project updates on Piazza.' ] # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity top_k = 3 for query in queries: query_embedding = embedder.encode(query, convert_to_tensor=True) cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0] cos_scores = cos_scores.cpu() # We use torch.topk to find the highest 5 scores top_results = torch.topk(cos_scores, k=top_k) print("\n\n======================\n\n") print("Query:", query) print("\nTop 5 most similar sentences in corpus:") for score, idx in zip(top_results[0], top_results[1]): print(corpus[idx], "(Score: %.4f)" % (score))
def compare_name(lang_node, img_node):
    """Return the cosine similarity between the ``name`` attributes of two nodes.

    Each ``name`` is encoded with the module-level ``model`` and the pair is
    compared with ``util.pytorch_cos_sim``; its result is returned as-is.
    """
    name_embeddings = [
        model.encode(node.name, convert_to_tensor=True)
        for node in (lang_node, img_node)
    ]
    return util.pytorch_cos_sim(*name_embeddings)
logging.info("Encoding unique sentences with semantic search model: {}".format(
    semantic_model_name))

# Encode all unique sentences present in the training dataset
embeddings = semantic_search_model.encode(sentences, batch_size=batch_size,
                                          convert_to_tensor=True)

logging.info("Retrieve top-{} with semantic search model: {}".format(
    top_k, semantic_model_name))

# Retrieve the top-k most similar sentences for every sentence of the dataset
progress = tqdm.tqdm(unit="docs", total=len(sent2idx))
# One extra neighbour because each sentence matches itself; clamped so that
# torch.topk does not fail on a corpus smaller than top_k + 1.
neighbour_count = min(top_k + 1, len(sentences))
for idx in range(len(sentences)):
    sentence_embedding = embeddings[idx]
    cos_scores = util.pytorch_cos_sim(sentence_embedding, embeddings)[0]
    cos_scores = cos_scores.cpu()
    progress.update(1)

    top_results = torch.topk(cos_scores, k=neighbour_count)

    # Work with plain ints: tensors hash by identity, so tuples holding
    # tensor indices were never found in `duplicates` and mirrored pairs
    # (a, b) / (b, a) were both emitted.
    for iid in top_results[1].tolist():
        if iid != idx and (iid, idx) not in duplicates:
            silver_data.append((sentences[idx], sentences[iid]))
            duplicates.add((idx, iid))

progress.reset()
progress.close()

logging.info("Length of silver_dataset generated: {}".format(len(silver_data)))
def lexrank_nodes_edges_show_text(sentences, encoder, rec_method_name, reduction_method,
                                  reduction_method_params, q):
    """Plot a LexRank sentence graph with the most central sentences labelled.

    Sentences are encoded with ``encoder()``, their pairwise cosine
    similarities are min-max rescaled to [0, 1] and turned into degree
    centrality scores (optionally rescaled with ``q``). The embeddings are
    projected with ``reduction_method(**reduction_method_params)`` and drawn
    as a plotly figure: edges for rescaled similarities above 0.5, marker
    size and colour by centrality, and text only for sentences whose
    centrality exceeds 8.0. The figure is shown; nothing is returned.
    """
    print(encoder)
    sentence_encoder = encoder()

    # Encode the sentences and compute the pair-wise cosine similarities
    embeddings = sentence_encoder.encode_sentences(sentences)
    embedding_array = embeddings.numpy()
    cos_scores = util.pytorch_cos_sim(embedding_array, embedding_array)

    # Rescale all similarities jointly into [0, 1]
    n_sentences = len(embeddings)
    scaler = MinMaxScaler(feature_range=(0, 1))
    flat_scores = cos_scores.flatten().reshape(-1, 1)
    cos_scores = scaler.fit_transform(flat_scores).reshape(n_sentences, n_sentences)

    # Centrality of every sentence, optionally rescaled for visualization
    centrality_scores = degree_centrality_scores(cos_scores, threshold=0.2)
    if q:
        centrality_scores = scale_centrality_scores(centrality_scores, q=q)

    # Reduce the embeddings to plotting coordinates
    print(rec_method_name)
    assert rec_method_name != 'None'
    reducer = reduction_method(**reduction_method_params)
    pos = reducer.fit_transform(embeddings)

    # The similarity matrix doubles as edge weights; self-loops are removed
    # in place.
    weights = cos_scores
    np.fill_diagonal(weights, 0)
    G = nx.from_numpy_array(weights)

    # Line segments for the strong edges; None separates the segments
    edge_x, edge_y = [], []
    for src, dst in G.edges():
        x0, y0 = pos[src]
        x1, y1 = pos[dst]
        if weights[src, dst] > 0.5:
            edge_x.extend((x0, x1, None))
            edge_y.extend((y0, y1, None))

    edge_trace = go.Scatter(x=edge_x, y=edge_y,
                            line=dict(width=0.75, color='#888'),
                            hoverinfo='none', mode='lines')

    node_xy = [tuple(pos[node]) for node in G.nodes()]
    node_x = [x for x, _ in node_xy]
    node_y = [y for _, y in node_xy]

    important_sentences = []
    for sent, weight in zip(sentences, centrality_scores):
        important_sentences.append(sent if weight > 8.0 else 'X')
    print(important_sentences)

    # Available colorscales include 'Greys', 'YlGnBu', 'Greens', 'YlOrRd',
    # 'Bluered', 'RdBu', 'Reds', 'Blues', 'Picnic', 'Rainbow', 'Portland',
    # 'Jet', 'Hot', 'Blackbody', 'Earth', 'Electric', 'Viridis'.
    marker_style = dict(
        showscale=True,
        colorscale='Reds',
        color=[],
        size=[s * 10 for s in centrality_scores],
        colorbar=dict(thickness=15, title='Centrality Score',
                      xanchor='left', titleside='right'),
        line_width=1)
    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode="markers+text",
        name="Markers and Text",
        text=important_sentences,
        textposition="bottom center",
        marker=marker_style)

    # Colour the markers by centrality and replace the placeholder labels:
    # only highly central sentences keep their text.
    node_adjacencies = list(centrality_scores)
    node_text = [
        sentences[node] if weight > 8.0 else ''
        for node, weight in enumerate(centrality_scores)
    ]
    node_trace.marker.color = node_adjacencies
    node_trace.text = node_text

    # NOTE(review): `i` is not defined in this function, so the title relies
    # on a module-level `i` being present - confirm.
    layout = go.Layout(
        title=f'<b>LexRank Summarization File{i}</b>',
        showlegend=False,
        hovermode='closest',
        margin=dict(b=20, l=5, r=5, t=40))
    fig = go.Figure(data=[edge_trace, node_trace], layout=layout)
    fig.show()