Example No. 1
    def test_paraphrase_mining(self):
        model = SentenceTransformer('paraphrase-distilroberta-base-v1')
        sentences = [
            "This is a test", "This is a test!",
            "The cat sits on mat", "The cat sits on the mat", "On the mat a cat sits",
            "A man eats pasta", "A woman eats pasta", "A man eats spaghetti"
        ]
        # util.paraphrase_mining returns a list of (cosine_score, idx_a, idx_b) tuples,
        # sorted by decreasing score
        duplicates = util.paraphrase_mining(model, sentences)

        # Every pair scored above 0.5 must be one of the known paraphrase pairs
        for score, a, b in duplicates:
            if score > 0.5:
                assert (a, b) in [(0, 1), (2, 3), (2, 4), (3, 4), (5, 6), (5, 7), (6, 7)]
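Outside the test class, the same call works standalone. A minimal sketch, assuming sentence-transformers is installed; the model name and the 0.5 cut-off are taken from the test above:

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('paraphrase-distilroberta-base-v1')
sentences = ["This is a test", "This is a test!", "The cat sits on the mat"]

# Returns a list of (cosine_score, i, j) tuples, sorted by decreasing score
pairs = util.paraphrase_mining(model, sentences)
for score, i, j in pairs:
    if score > 0.5:
        print("{:.3f}\t{}\t{}".format(score, sentences[i], sentences[j]))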
Example No. 2

from sentence_transformers import util


def range_by_allcs(sentences, sent, smodel, threshold=0.7):
    # Mine paraphrases of `sent` among the candidate sentences with a SentenceTransformer model
    sentences = sentences + [sent]
    hypothesis = []
    paraphrases = util.paraphrase_mining(smodel, sentences)
    good_hyp = set()
    for score, i, j in paraphrases:
        if threshold < score < 1.00:
            # Keep the sentence paired with `sent`, whichever side of the pair it is on
            if sentences[i] == sent:
                good_hyp.add(sentences[j])
            elif sentences[j] == sent:
                good_hyp.add(sentences[i])
    if len(good_hyp) > 1:
        hypothesis.extend(good_hyp)
    else:
        # Fall back to the single highest-scoring pair overall
        _, top_i, top_j = paraphrases[0]
        if sentences[top_i] != sent:
            hypothesis.append(sentences[top_i])
        else:
            hypothesis.append(sentences[top_j])
    return hypothesis
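A usage sketch for the helper above; the model choice and the candidate sentences below are illustrative, not from the original code:

from sentence_transformers import SentenceTransformer

smodel = SentenceTransformer('paraphrase-distilroberta-base-v1')  # illustrative model choice
candidates = [
    "A man is eating pasta",
    "Someone is eating spaghetti",
    "A dog runs across the field",
]
# Prints the candidate sentences mined as paraphrases of the query sentence
print(range_by_allcs(candidates, "A man eats spaghetti", smodel, threshold=0.6))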
Example No. 3
    def remove_similar(
        self,
        followups: Dict[str, FollowupQuestion],
        answered: List[AnswerInfo],
        similarity_threshold: float = SIMILARITY_THRESHOLD,
    ) -> Dict[str, FollowupQuestion]:
        # Drop follow-up questions that paraphrase an already-answered question
        # (or another, earlier follow-up). Followups are keyed by question text.
        followups_text = [followup.question for followup in followups.values()]
        answered_text = [question.question_text for question in answered]
        questions = answered_text + followups_text
        paraphrases = util.paraphrase_mining(self.transformer, questions)
        for score, i, j in paraphrases:
            if score > similarity_threshold:
                # Remove whichever side of the pair is still a pending follow-up
                if questions[i] in followups:
                    followups.pop(questions[i])
                elif questions[j] in followups:
                    followups.pop(questions[j])
        return followups
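FollowupQuestion, AnswerInfo, SIMILARITY_THRESHOLD, and self.transformer come from the surrounding project and are not shown here. A rough standalone sketch of the same deduplication idea, using hypothetical minimal stand-ins for those types:

from dataclasses import dataclass

from sentence_transformers import SentenceTransformer, util

SIMILARITY_THRESHOLD = 0.7  # assumed value; the real constant lives in the source project


@dataclass
class FollowupQuestion:  # hypothetical stand-in for the project's own class
    question: str


@dataclass
class AnswerInfo:  # hypothetical stand-in for the project's own class
    question_text: str


transformer = SentenceTransformer('paraphrase-MiniLM-L6-v2')  # illustrative model choice
followups = {
    "Is my wife cheating?": FollowupQuestion("Is my wife cheating?"),
    "Where should I stay in Goa?": FollowupQuestion("Where should I stay in Goa?"),
}
answered = [AnswerInfo("How do I know if my partner is cheating?")]

# Mine paraphrases across answered + follow-up questions and drop duplicated follow-ups
questions = [a.question_text for a in answered] + list(followups)
for score, i, j in util.paraphrase_mining(transformer, questions):
    if score > SIMILARITY_THRESHOLD:
        if questions[i] in followups:
            followups.pop(questions[i])
        elif questions[j] in followups:
            followups.pop(questions[j])
print(list(followups))  # the cheating follow-up is likely dropped as a near-duplicate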
Example No. 4

    def __call__(self,
                 model,
                 output_path: str = None,
                 epoch: int = -1,
                 steps: int = -1) -> float:
        if epoch != -1:
            out_txt = f" after epoch {epoch}:" if steps == -1 else f" in epoch {epoch} after {steps} steps:"
        else:
            out_txt = ":"

        logger.info("Paraphrase Mining Evaluation on " + self.name +
                    " dataset" + out_txt)

        # Mine candidate paraphrase pairs among the sentences
        pairs_list = paraphrase_mining(model, self.sentences,
                                       self.show_progress_bar, self.batch_size,
                                       self.query_chunk_size,
                                       self.corpus_chunk_size, self.max_pairs,
                                       self.top_k)

        logger.info("Number of candidate pairs: " + str(len(pairs_list)))

        #Compute F1 score and Average Precision
        n_extract = n_correct = 0
        threshold = 0
        best_f1 = best_recall = best_precision = 0

        average_precision = 0

        for idx in range(len(pairs_list)):
            score, i, j = pairs_list[idx]
            id1 = self.ids[i]
            id2 = self.ids[j]

            #Compute optimal threshold and F1-score
            n_extract += 1
            if self.duplicates[id1][id2] or self.duplicates[id2][id1]:
                n_correct += 1
                precision = n_correct / n_extract
                recall = n_correct / self.total_num_duplicates
                f1 = 2 * precision * recall / (precision + recall)
                average_precision += precision
                if f1 > best_f1:
                    best_f1 = f1
                    best_precision = precision
                    best_recall = recall
                    threshold = (pairs_list[idx][0] +
                                 pairs_list[min(idx + 1,
                                                len(pairs_list) - 1)][0]) / 2

        average_precision = average_precision / self.total_num_duplicates

        logger.info("Average Precision: {:.2f}".format(average_precision *
                                                       100))
        logger.info("Optimal threshold: {:.4f}".format(threshold))
        logger.info("Precision: {:.2f}".format(best_precision * 100))
        logger.info("Recall: {:.2f}".format(best_recall * 100))
        logger.info("F1: {:.2f}\n".format(best_f1 * 100))

        if output_path is not None and self.write_csv:
            csv_path = os.path.join(output_path, self.csv_file)
            if not os.path.isfile(csv_path):
                with open(csv_path, newline='', mode="w", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow(self.csv_headers)
                    writer.writerow([
                        epoch, steps, best_precision, best_recall, best_f1,
                        threshold, average_precision
                    ])
            else:
                with open(csv_path, newline='', mode="a", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow([
                        epoch, steps, best_precision, best_recall, best_f1,
                        threshold, average_precision
                    ])

        return average_precision
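This __call__ belongs to sentence-transformers' ParaphraseMiningEvaluator. A sketch of how such an evaluator is typically constructed and run; the constructor arguments below follow the library version this snippet appears to come from, so treat the exact signature as an assumption:

from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import ParaphraseMiningEvaluator

model = SentenceTransformer('paraphrase-distilroberta-base-v1')

# id -> sentence map plus a list of known duplicate id pairs
sentences_map = {
    'q1': 'How can I lose weight fast without exercise?',
    'q2': 'Can I lose weight without exercise?',
    'q3': 'Why does hair turn white?',
}
duplicates_list = [('q1', 'q2')]

evaluator = ParaphraseMiningEvaluator(sentences_map, duplicates_list, name='dev')
average_precision = evaluator(model, output_path='.')  # writes a CSV row when write_csv is enabled
print(average_precision)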
Example No. 5
    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 5 most similar sentences in corpus:")

    # for score, idx in zip(top_results[0], top_results[1]):
    #     print(s3[idx], "(Score: {:.4f})".format(score))

    hits = util.semantic_search(query_embedding, corpus_embeddings2, top_k=5)
    hits = hits[0]      #Get the hits for the first query
    for hit in hits:
        print(s3[hit['corpus_id']], "(Score: {:.4f})".format(hit['score']))


# Paraphrase mining - finding texts with similar meaning in large collections of sentences (10,000+)
largecorpus = corpus_test['sentence_A'].unique()
paraphrases = util.paraphrase_mining(model, largecorpus)

# Map the returned indices back to the sentences themselves
df = pd.DataFrame.from_records(paraphrases)
df[1] = [largecorpus[idx] for idx in df[1]]
df[2] = [largecorpus[idx] for idx in df[2]]
# df.to_csv("Paraphrase_Mining_pl.csv", index=False, header=["score", "sentence1", "sentence2"])
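A possible follow-up step before writing the CSV; the column names match the commented-out to_csv call, while the 0.8 cut-off and the output filename are illustrative choices, not from the original notebook:

# Name the columns and keep only pairs above an (arbitrary) score threshold
df.columns = ["score", "sentence1", "sentence2"]
high_conf = df[df["score"] > 0.8].sort_values("score", ascending=False)
high_conf.to_csv("Paraphrase_Mining_pl_filtered.csv", index=False)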


import spacy
nlp = spacy.load("pl_core_news_lg")

# sentences1, sentences2, and sentences3 are defined earlier in the notebook
s1 = nlp(sentences1)
s2 = nlp(sentences2)
s3 = nlp(sentences3)
# Sentence similarity based on spaCy's word vectors
print(s1.similarity(s2))
Example No. 6
from sentence_transformers import SentenceTransformer, util

# Questions can be a long list of sentences, up to 100k sentences or more.
# For demonstration purposes, we limit it to a few questions which all have one duplicate.
questions = [
    'How did you catch your spouse cheating?',
    'How can I find out if my husband is cheating?', 'Is my wife cheating?',
    'How do I know if my partner is cheating?',
    'Why is Starbucks in India overrated?', 'Is Starbucks overrated in india?',
    'How can I lose weight fast without exercise?',
    'Can I lose weight without exercise?',
    'Which city is the best in India? Why?',
    'Which is the best city in India?', 'How can I stay focused in class?',
    'How can I stay focused on my school work?',
    'How can I Remotely hack a mobile phone?', 'How can I hack my phone?',
    'Where should I stay in Goa?', 'Which are the best hotels in Goa?',
    'Why does hair turn white?',
    'What causes older peoples hair to turn grey?',
    'What is the easiest way to get followers on Quora?',
    'How do I get more followers for my Quora?'
]

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Given a model and a list of strings (texts), util.paraphrase_mining performs a mining task
# by computing the cosine similarity between all possible pairs and returning the ones with the highest scores.
# It returns a list of tuples (score, i, j) with i, j representing the indices in the questions list.
pairs = util.paraphrase_mining(model, questions)

#Output Top-20 pairs:
for score, qid1, qid2 in pairs[0:20]:
    print("{:.3f}\t{}\t\t\t{}".format(score, questions[qid1], questions[qid2]))