Example #1
import unittest

from sentence_transformers import SentenceTransformer, evaluation


class EvaluatorTest(unittest.TestCase):
    def test_ParaphraseMiningEvaluator(self):
        """Tests that the ParaphraseMiningEvaluator finds the annotated duplicate pairs"""
        model = SentenceTransformer('paraphrase-distilroberta-base-v1')
        sentences = {
            0: "Hello World",
            1: "Hello World!",
            2: "The cat is on the table",
            3: "On the table the cat is"
        }
        data_eval = evaluation.ParaphraseMiningEvaluator(
            sentences, [(0, 1), (2, 3)])
        score = data_eval(model)
        assert score > 0.99
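The four sentences form two clear paraphrase pairs, (0, 1) and (2, 3), which are passed as the gold duplicates. A pretrained paraphrase model should mine both pairs, so the test can assert a near-perfect score.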
Example #2
import csv
import os

from sentence_transformers import evaluation

dataset_path = 'quora-IR-dataset'  # Folder with the Quora duplicate-questions TSV files
evaluators = []  # Collects the evaluators that are run during training

###### Duplicate Questions Mining ######
# Given a large corpus of questions, identify all duplicates in that corpus.

# For faster processing, we limit the development set to 10,000 sentences
max_dev_samples = 10000
dev_sentences = {}  # qid => question
dev_duplicates = []  # [qid1, qid2] pairs annotated as duplicates

with open(os.path.join(dataset_path, "duplicate-mining/dev_corpus.tsv"),
          encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        dev_sentences[row['qid']] = row['question']

        if len(dev_sentences) >= max_dev_samples:
            break

with open(os.path.join(dataset_path, "duplicate-mining/dev_duplicates.tsv"),
          encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        if row['qid1'] in dev_sentences and row['qid2'] in dev_sentences:
            dev_duplicates.append([row['qid1'], row['qid2']])

# The ParaphraseMiningEvaluator computes the cosine similarity between all sentences and
# extracts the pairs with the highest similarity. Given the duplicate annotations in
# dev_duplicates, it then computes an F1 score measuring how well our duplicate mining worked
paraphrase_mining_evaluator = evaluation.ParaphraseMiningEvaluator(
    dev_sentences, dev_duplicates, name='dev')
evaluators.append(paraphrase_mining_evaluator)
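
For reference, the mining step that the evaluator builds on can also be run directly via util.paraphrase_mining. A minimal sketch, assuming dev_sentences has been filled as above; the model name and top_k value are illustrative choices, not taken from the script:

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('paraphrase-distilroberta-base-v1')
corpus = list(dev_sentences.values())

# paraphrase_mining returns [cosine_score, idx_i, idx_j] triples,
# sorted by decreasing similarity
pairs = util.paraphrase_mining(model, corpus, top_k=10)
for score, i, j in pairs[:5]:
    print(f"{score:.3f}\t{corpus[i]}\t{corpus[j]}")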

###### Duplicate Questions Information Retrieval ######
# Given a question and a large corpus of thousands of questions, find the most relevant
# (i.e. duplicate) question in that corpus.

# For faster processing, we limit the development corpus to 100,000 sentences.
max_corpus_size = 100000

ir_queries = {}  # Our queries (qid => question)
ir_needed_qids = set()  # QIDs we need in the corpus
ir_corpus = {}  # Our corpus (qid => question)
ir_relevant_docs = {}  # Relevant documents for a given query (qid => set([relevant_question_ids]))
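
The loop that fills these dicts is truncated in this excerpt. Once they are populated, they feed an InformationRetrievalEvaluator; a minimal sketch of that final step, with name='dev' mirroring the mining evaluator above:

# Sketch: ir_queries, ir_corpus and ir_relevant_docs are assumed to be
# populated from the dataset's information-retrieval TSV files
ir_evaluator = evaluation.InformationRetrievalEvaluator(
    ir_queries, ir_corpus, ir_relevant_docs, name='dev')
evaluators.append(ir_evaluator)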