def test_paraphrase_mining(self):
    """Check that every confidently mined pair is one of the known paraphrase pairs."""
    model = SentenceTransformer('paraphrase-distilroberta-base-v1')
    sentences = [
        "This is a test",
        "This is a test!",
        "The cat sits on mat",
        "The cat sits on the mat",
        "On the mat a cat sits",
        "A man eats pasta",
        "A woman eats pasta",
        "A man eats spaghetti",
    ]
    # Index pairs (into `sentences`) that are allowed to score above 0.5.
    expected_pairs = ((0, 1), (2, 3), (2, 4), (3, 4), (5, 6), (5, 7), (6, 7))

    mined = util.paraphrase_mining(model, sentences)
    confident_pairs = [(a, b) for score, a, b in mined if score > 0.5]
    for pair in confident_pairs:
        assert pair in expected_pairs
def range_by_allcs(sentences, sent, smodel, threshold=0.7):
    """Return sentences from ``sentences`` that paraphrase ``sent``.

    ``sent`` is appended to a copy of ``sentences`` and all pairs are mined
    with ``util.paraphrase_mining``. Every sentence paired with ``sent`` at a
    score strictly between ``threshold`` and 1.0 is collected.

    Args:
        sentences: List of candidate sentences (not modified).
        sent: The reference sentence.
        smodel: Model handed to ``util.paraphrase_mining``.
        threshold: Exclusive lower bound on the pair score.

    Returns:
        The collected sentences when more than one was found. Otherwise a
        single-element list taken from the first mined pair (the member of
        that pair that is not ``sent``), or an empty list when no pair was
        mined at all.
    """
    sentences = sentences + [sent]
    paraphrases = util.paraphrase_mining(smodel, sentences)

    good_hyp = set()
    for score, i, j in paraphrases:
        # Scores of exactly 1.0 are skipped (upper bound is exclusive).
        if threshold < score < 1.00:
            if sentences[i] == sent:
                good_hyp.add(sentences[j])
            elif sentences[j] == sent:
                good_hyp.add(sentences[i])

    if len(good_hyp) > 1:
        return list(good_hyp)

    # Nothing was mined (e.g. fewer than two sentences): no fallback exists.
    if not paraphrases:
        return []

    # Fall back to the first mined pair. Compare the *sentence* at the index
    # with `sent`; comparing the raw integer index to the sentence was always
    # unequal and could hand back `sent` itself.
    _, first, second = paraphrases[0]
    if sentences[first] != sent:
        return [sentences[first]]
    return [sentences[second]]
def remove_similar(
    self,
    followups: Dict[str, FollowupQuestion],
    answered: List[AnswerInfo],
    similarity_threshold: float = SIMILARITY_THRESHOLD,
) -> Dict[str, FollowupQuestion]:
    """Drop follow-up questions that are too similar to another question.

    The texts of the answered questions and of the follow-ups are mined
    together with ``util.paraphrase_mining``. For every mined pair whose
    score exceeds ``similarity_threshold``, one follow-up taking part in the
    pair is removed (the first member of the pair is preferred).

    Args:
        followups: Candidate follow-up questions; modified in place.
        answered: Questions that have already been answered.
        similarity_threshold: Pairs scoring above this value count as
            duplicates.

    Returns:
        The same ``followups`` dict, with the similar entries removed.
    """
    followup_keys = list(followups)
    if not followup_keys:
        # Nothing can be removed, so skip the mining step.
        return followups

    answered_text = [question.question_text for question in answered]
    followups_text = [followups[key].question for key in followup_keys]
    questions = answered_text + followups_text
    # Indices below this offset refer to answered questions, the rest to follow-ups.
    offset = len(answered_text)

    paraphrases = util.paraphrase_mining(self.transformer, questions)
    for score, i, j in paraphrases:
        if score > similarity_threshold:
            # Map each index back to the dict key it came from, instead of
            # assuming the question text is itself a key of `followups`.
            for index in (i, j):
                if index < offset:
                    continue
                key = followup_keys[index - offset]
                if key in followups:
                    followups.pop(key)
                    break
    return followups
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
    """Run paraphrase mining with ``model`` and score the mined pairs.

    The mined pairs are walked in the order returned by ``paraphrase_mining``
    (assumed to be by decreasing score). At every pair marked as a duplicate
    in ``self.duplicates``, precision, recall and F1 are computed; the best
    F1 and the corresponding threshold are kept, and the precisions are
    accumulated into the average precision.

    Args:
        model: Model passed on to ``paraphrase_mining``.
        output_path: Directory for the CSV results; nothing is written when
            it is ``None`` or ``self.write_csv`` is false.
        epoch: Epoch number used for logging and the CSV row (-1 if unknown).
        steps: Step number used for logging and the CSV row (-1 if unknown).

    Returns:
        The average precision.
    """
    if epoch != -1:
        out_txt = f" after epoch {epoch}:" if steps == -1 else f" in epoch {epoch} after {steps} steps:"
    else:
        out_txt = ":"

    logger.info("Paraphrase Mining Evaluation on " + self.name + " dataset" + out_txt)

    # Mine the candidate pairs
    pairs_list = paraphrase_mining(
        model,
        self.sentences,
        self.show_progress_bar,
        self.batch_size,
        self.query_chunk_size,
        self.corpus_chunk_size,
        self.max_pairs,
        self.top_k,
    )

    logger.info("Number of candidate pairs: " + str(len(pairs_list)))

    # Compute F1 score and Average Precision
    n_extract = n_correct = 0
    threshold = 0
    best_f1 = best_recall = best_precision = 0
    average_precision = 0

    last_idx = len(pairs_list) - 1
    for idx, (score, i, j) in enumerate(pairs_list):
        id1 = self.ids[i]
        id2 = self.ids[j]

        # Compute optimal threshold and F1-score
        n_extract += 1
        if self.duplicates[id1][id2] or self.duplicates[id2][id1]:
            n_correct += 1
            precision = n_correct / n_extract
            recall = n_correct / self.total_num_duplicates
            f1 = 2 * precision * recall / (precision + recall)
            average_precision += precision
            if f1 > best_f1:
                best_f1 = f1
                best_precision = precision
                best_recall = recall
                # Midpoint between this pair's score and the next pair's
                # score (the last pair is paired with itself).
                threshold = (score + pairs_list[min(idx + 1, last_idx)][0]) / 2

    average_precision = average_precision / self.total_num_duplicates

    logger.info("Average Precision: {:.2f}".format(average_precision * 100))
    logger.info("Optimal threshold: {:.4f}".format(threshold))
    logger.info("Precision: {:.2f}".format(best_precision * 100))
    logger.info("Recall: {:.2f}".format(best_recall * 100))
    logger.info("F1: {:.2f}\n".format(best_f1 * 100))

    if output_path is not None and self.write_csv:
        csv_path = os.path.join(output_path, self.csv_file)
        is_new_file = not os.path.isfile(csv_path)
        # newline="" lets the csv module control line endings; without it
        # extra blank rows can appear on platforms that translate "\n".
        with open(csv_path, mode="w" if is_new_file else "a", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            if is_new_file:
                writer.writerow(self.csv_headers)
            writer.writerow([epoch, steps, best_precision, best_recall, best_f1, threshold, average_precision])

    return average_precision
print("\n\n======================\n\n")
print("Query:", query)
print("\nTop 5 most similar sentences in corpus:")

# semantic_search yields one hit list per query; keep the list for the first query.
hits = util.semantic_search(query_embedding, corpus_embeddings2, top_k=5)[0]
for hit in hits:
    print(s3[hit['corpus_id']], "(Score: {:.4f})".format(hit['score']))

# Paraphrase mining: find texts with similar meaning in large collections
# of sentences (10000+).
largecorpus = corpus_test['sentence_A'].unique()
paraphrases = util.paraphrase_mining(model, largecorpus)
df = pd.DataFrame.from_records(paraphrases)
# Columns 1 and 2 hold indices into `largecorpus`; replace them with the sentences.
df[1] = [largecorpus[position] for position in df[1]]
df[2] = [largecorpus[position] for position in df[2]]
# Optional export:
# df.to_csv("Paraphrase_Mining_pl.csv", index=False, header=["score", "sentence1", "sentence2"])

import spacy

nlp = spacy.load("pl_core_news_lg")
s1 = nlp(sentences1)
s2 = nlp(sentences2)
s3 = nlp(sentences3)

# Sentence similarity between the first two processed inputs.
print(s1.similarity(s2))
# The list of questions may be very long (100k sentences or more).
# For demonstration purposes it is limited to a few questions, each of which
# has one duplicate.
questions = [
    'How did you catch your spouse cheating?',
    'How can I find out if my husband is cheating?',
    'Is my wife cheating?',
    'How do I know if my partner is cheating?',
    'Why is Starbucks in India overrated?',
    'Is Starbucks overrated in india?',
    'How can I lose weight fast without exercise?',
    'Can I lose weight without exercise?',
    'Which city is the best in India? Why?',
    'Which is the best city in India?',
    'How can I stay focused in class?',
    'How can I stay focused on my school work?',
    'How can I Remotely hack a mobile phone?',
    'How can I hack my phone?',
    'Where should I stay in Goa?',
    'Which are the best hotels in Goa?',
    'Why does hair turn white?',
    'What causes older peoples hair to turn grey?',
    'What is the easiest way to get followers on Quora?',
    'How do I get more followers for my Quora?',
]

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Given a model and a list of strings, paraphrase mining compares all possible
# combinations by cosine similarity and returns the highest-scoring ones as
# tuples (score, i, j), where i and j are indices into `questions`.
pairs = util.paraphrase_mining(model, questions)

# Output the top-20 pairs.
for score, qid1, qid2 in pairs[:20]:
    first_question = questions[qid1]
    second_question = questions[qid2]
    print("{:.3f}\t{}\t\t\t{}".format(score, first_question, second_question))