Example #1
 def rank_documents_bm25f(self, inverted_indexes_bm25f,
                          doc_length_info_bm25f,
                          parameters,
                          path_topics,
                          path_results_dir=r"../trec_eval-master/our_data/",
                          results_file_name="results_BM25F"):
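     """
     Scores and ranks each document for each query using BM25F.

     Given a list of queries, this function determines for each query the
     BM25F score for each document and writes the ranked results to a .txt
     file in a format that can be used as input to the TREC evaluation tool.
     """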
     
     # Retrieve the queries
     queries = self.extract_queries(path_topics)
     
     # The path to the output file
     output_file_path = path_results_dir + results_file_name + ".txt"
     
     # Clear the contents of the output file
     open(output_file_path, "w").close()
     
     query_nr = 1 # Used to keep track of which query is being processed
     for query in queries: # For each query ...
         print(f"Processing query {query_nr}: '{query}'")
         
         # Transform the query terms to the desired form (i.e. tokenized, stemmed, ...)
         query_terms = processQuery(query)
         
         # Compute the BM25 score for each document for the current query
         doc_scores = self.compute_doc_scores_BM25F(query_terms,
                                                    inverted_indexes_bm25f,
                                                    doc_length_info_bm25f,
                                                    parameters)
         
         # Write the top 1000 document scores for this query to a .txt file
         self.write_output_file(query_nr, doc_scores, output_file_path)
         
         # Increment the query number for the next iteration
         query_nr += 1
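
Example #1 depends on compute_doc_scores_BM25F, which is not shown on this page. Below is a minimal sketch of what a field-weighted BM25F scorer can look like; the data layouts and parameter keys (n_docs, k1, per-field b and weight) are assumptions for illustration, not the project's actual structures.

import math
from collections import defaultdict

def compute_doc_scores_bm25f_sketch(query_terms, inverted_indexes,
                                    doc_length_info, parameters):
    # Hypothetical structures: inverted_indexes[field][term] -> {doc_id: tf},
    # doc_length_info[field] -> {"avg": float, "lengths": {doc_id: int}},
    # parameters -> {"n_docs": int, "k1": float,
    #                "b": {field: float}, "weight": {field: float}}
    n_docs, k1 = parameters["n_docs"], parameters["k1"]
    scores = defaultdict(float)
    for term in query_terms:
        weighted_tf = defaultdict(float)  # field-weighted, length-normalised tf
        docs_with_term = set()
        for field, index in inverted_indexes.items():  # e.g. "title", "abstract"
            postings = index.get(term, {})
            docs_with_term.update(postings)
            b = parameters["b"][field]
            w = parameters["weight"][field]
            avg_len = doc_length_info[field]["avg"]
            lengths = doc_length_info[field]["lengths"]
            for doc_id, tf in postings.items():
                norm = 1.0 - b + b * lengths[doc_id] / avg_len
                weighted_tf[doc_id] += w * tf / norm
        if not docs_with_term:
            continue
        df = len(docs_with_term)
        idf = math.log(1.0 + (n_docs - df + 0.5) / (df + 0.5))
        for doc_id, tf in weighted_tf.items():
            # BM25F saturates the combined tf once, across all fields,
            # rather than per field as plain BM25 would
            scores[doc_id] += idf * tf / (k1 + tf)
    return dict(scores)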
Example #2
 def rank_documents(self, inverted_indexes,
                     doc_lengths,
                     parameters,
                     path_topics,
                     path_results_dir=r"../trec_eval-master/our_data/",
                     results_file_name="results"):
     """
     Scores and ranks each document for each query.
     
     Given a list of queries this function determines for each query the
     BM25 score for each document. For each query the best 10,000 documents
     are then ranked based on their score (higher is better) and the
     results are written to a .txt file in such a form that it can be used
     as input to the TREC evaluation tool.
     """
     
     # Retrieve the queries
     queries = self.extract_queries(path_topics)
     
     # The path to the output file
     output_file_path = path_results_dir + results_file_name + ".txt"
     
     # Clear the contents of the output file
     open(output_file_path, "w").close()
     
     query_nr = 1 # Used to keep track of which query is being processed
     for query in queries: # For each query ...
         print(f"Processing query {query_nr}: '{query}'")
         
         # Transform the query terms to the desired form (i.e. tokenized, stemmed, ...)
         query_terms = processQuery(query)
         
         # Compute the BM25 score for each document for the current query
         doc_scores = self.compute_doc_scores(query_terms, inverted_indexes,
                                              doc_lengths, parameters)
         
         # Write the top 1000 document scores for this query to a .txt file
         self.write_output_file(query_nr, doc_scores, output_file_path)
         
         # Increment the query number for the next iteration
         query_nr += 1
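
The docstring promises output that the TREC evaluation tool (trec_eval) can read, but write_output_file is not shown. The standard TREC run format is one line per document: query_id Q0 doc_id rank score run_tag. A sketch of such a writer follows; the function name, run_tag, and top_n cutoff are assumptions, and it appends because the caller writes one block per query.

from operator import itemgetter

def write_output_file_sketch(query_nr, doc_scores, output_file_path,
                             run_tag="BM25", top_n=1000):
    # Emit: <query_id> Q0 <doc_id> <rank> <score> <run_tag>
    ranked = sorted(doc_scores.items(), key=itemgetter(1), reverse=True)[:top_n]
    with open(output_file_path, "a") as f:  # append: one block per query
        for rank, (doc_id, score) in enumerate(ranked, start=1):
            f.write(f"{query_nr} Q0 {doc_id} {rank} {score:.6f} {run_tag}\n")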
Example #3
 def rank_documents_rocchio(self, inverted_indexes,
                            doc_lengths, documents, path_topics,
                            path_results_dir=r"../trec_eval-master/our_data/",
                            results_file_name="results"):
     """
     Scores and ranks each document for each query, then expands the query
     and ranks the documents again.

     Given a list of queries, this function determines for each query the
     BM25 score for each document. For each query the 1,000 best-scoring
     documents are then ranked (higher is better) and the results are
     written to a .txt file in a format that can be used as input to the
     TREC evaluation tool.
     """
            
     # Retrieve the queries
     queries = self.extract_queries(path_topics)

     # The paths to the output files (initial and expanded-query runs)
     output_file_path = path_results_dir + results_file_name + ".txt"
     output_file_path_2 = path_results_dir + results_file_name + "_2.txt"

     # Clear the contents of the output files
     open(output_file_path, "w").close()
     open(output_file_path_2, "w").close()

     query_nr = 1 # Used to keep track of which query is being processed
     for query in queries: # For each query ...
         print(f"Processing query {query_nr}: '{query}'")

         # Transform the query terms to the desired form (i.e. tokenized, stemmed, ...)
         query_terms = processQuery(query)

         # Compute the BM25 score for each document for the current query
         doc_scores = self.compute_doc_scores(query_terms, inverted_indexes,
                                              doc_lengths)

         # Take the top-k highest scored documents as the pseudo-relevant set
         top_k = 20
         rel_docs = dict(sorted(doc_scores.items(),
                                key=itemgetter(1), reverse=True)[:top_k])

         # Expand the query with the terms suggested by Rocchio
         expansion = rocchio(query_terms, rel_docs, inverted_indexes, documents)
         expanded_query = list(expansion.keys())

         # Compute the BM25 score for each document for the expanded query
         doc_scores_2 = self.compute_doc_scores(expanded_query, inverted_indexes,
                                                doc_lengths)
         # Write the top 1000 document scores for both runs to .txt files
         self.write_output_file(query_nr, doc_scores, output_file_path)
         self.write_output_file(query_nr, doc_scores_2, output_file_path_2)

         # Increment the query number for the next iteration
         query_nr += 1

         # Only the first 50 queries are evaluated
         if query_nr > 50:
             break
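
Example #3 calls rocchio(...) to build the expanded query, but the function itself is not shown. Below is a minimal sketch of Rocchio expansion over raw term-frequency vectors; the signature is simplified (it takes a doc_id -> {term: tf} map instead of the inverted indexes used above), and alpha, beta, and n_terms are assumed defaults.

from collections import defaultdict

def rocchio_sketch(query_terms, rel_docs, doc_term_freqs,
                   alpha=1.0, beta=0.75, n_terms=30):
    # Move the query vector towards the centroid of the (pseudo-)relevant
    # documents: q' = alpha * q + (beta / |D_r|) * sum of document vectors
    weights = defaultdict(float)
    for term in query_terms:
        weights[term] += alpha
    for doc_id in rel_docs:
        for term, tf in doc_term_freqs[doc_id].items():
            weights[term] += beta * tf / len(rel_docs)
    # Keep the n_terms highest weighted terms as the expanded query
    ranked = sorted(weights.items(), key=lambda kv: kv[1], reverse=True)
    return dict(ranked[:n_terms])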
Example #4
 def rank_BM25F_with_reranker(self, inverted_indexes_bm25f,
                                        doc_length_info_bm25f,
                                        documents_dict, parameters,
                                        path_topics, 
                                        path_results_dir=r"../trec_eval-master/our_data/",
                                        results_file_name="results__BM25F_rerank"):
     """
     Scores and ranks each document for each query with BM24F, then reranks them.
     
     This function first scores each document with BM25F, then it reranks
     the highest scored 1000 of them using a BERT model trained on MS MARCO
     (real user queries from Bing seach engine).
     """
     
     def rerank(query, documents_dict, doc_scores, model):
         # Blend each document's BM25F score with the semantic similarity
         # between the query and the document text (title + abstract)
         query_embedding = model.encode(query)
         for cord_uid in doc_scores.keys():
             document = documents_dict[cord_uid]
             text_string = document.title + " " + document.abstract

             passage_embedding = model.encode(text_string)
             rerank_score = util.pytorch_cos_sim(query_embedding, passage_embedding)[0][0].item()
             doc_scores[cord_uid] += Constants.rerank_multiplier * rerank_score

     # Load a sentence-transformer bi-encoder trained on MS MARCO
     model = SentenceTransformer('distilroberta-base-msmarco-v2')
     
     # Retrieve the queries
     queries = self.extract_queries(path_topics)
     
     # The path to the output file
     output_file_path = path_results_dir + results_file_name + ".txt"
     
     # Clear the contents of the output file
     open(output_file_path, "w").close()
     
     query_nr = 1 # Used to keep track of which query is being processed
     for query in queries: # For each query ...
         print(f"Processing query {query_nr}: '{query}'")
         
         # Transform the query terms to the desired form (i.e. tokenized, stemmed, ...)
         query_terms = processQuery(query)
          
         # Compute the BM25 score for each document for the current query
         doc_scores = self.compute_doc_scores_BM25F(query_terms,
                                                    inverted_indexes_bm25f,
                                                    doc_length_info_bm25f, parameters)
         
         # Sort by score and select the 1000 highest scored documents
         doc_scores = dict(sorted(doc_scores.items(),
                                  key=itemgetter(1), reverse=True)[:1000])

         # Rerank the selected documents with the bi-encoder
         rerank(query, documents_dict, doc_scores, model)

         # Re-sort by the combined (BM25F + rerank) score
         doc_scores = dict(sorted(doc_scores.items(),
                                  key=itemgetter(1), reverse=True))
         
         # Write the top 1000 document scores for this query to a .txt file
         self.write_output_file(query_nr, doc_scores, output_file_path)
         
         # Increment the query number for the next iteration
         query_nr += 1
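
In Example #4 the cosine similarity (bounded in [-1, 1]) is added directly to an unbounded BM25F score, so the reranker's influence depends on the BM25F score scale; Constants.rerank_multiplier compensates for that. A less scale-sensitive alternative is to normalise both score sets per query before interpolating. The sketch below is one way to do that, assuming per-query min-max normalisation is acceptable and both dicts cover the same documents.

def blend_scores_sketch(bm25f_scores, cosine_scores, weight=0.5):
    # Min-max normalise both score sets per query, then interpolate linearly;
    # assumes bm25f_scores and cosine_scores share the same document keys
    def normalise(scores):
        lo, hi = min(scores.values()), max(scores.values())
        span = (hi - lo) or 1.0  # guard against all-equal scores
        return {doc: (s - lo) / span for doc, s in scores.items()}
    bm25f, cosine = normalise(bm25f_scores), normalise(cosine_scores)
    return {doc: (1 - weight) * bm25f[doc] + weight * cosine[doc]
            for doc in bm25f}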