def compute_bm25(self, query):
    """
    Rank the documents matching *query* with the Okapi BM25 scoring formula.

    The query is tokenized, stripped of stopwords and stemmed before the
    candidate documents are fetched with an OR query. Results are printed
    in descending score order.

    :param query: free-text query string to be conducted.
    """
    # Normalise the query: drop stopwords, stem whatever remains.
    stemmed_terms = [
        ps.stem(token)
        for token in word_tokenize(query)
        if token.casefold() not in stopwords
    ]
    query = " ".join(stemmed_terms)

    # Candidate set: every document containing at least one query term.
    doc_ids = OrQuery(self.index).execute(query)

    # BM25 score of a document = sum over query terms of
    # idf(term) * numerator(term, doc) / denominator(term, doc).
    rank = {
        doc_id: sum(
            self.compute_idf_weight(term)
            * self.compute_numerator(term, doc_id)
            / self.compute_denominator(term, doc_id)
            for term in stemmed_terms
        )
        for doc_id in doc_ids
    }

    print("{} documents found.".format(len(doc_ids)))
    for doc_id, score in sorted(rank.items(), key=itemgetter(1), reverse=True):
        print("Document {} score: {}".format(doc_id, score))
    print()
def stem_index(index):
    """
    Build a new index whose keys are stemmed, merging the postings of terms
    that collapse onto the same stem.

    Note: this is only used for the queries. The index that appears in the
    index.txt file hasn't been modified.

    Postings are concatenated, not deduplicated: a posting appears once per
    occurrence and `get_frequency_of_term_in_doc` counts those repetitions,
    so e.g. {'hello': [1, 2], 'Hello': [1, 3]} becomes {'hello': [1, 2, 1, 3]}.

    :param index: dictionary containing terms and the postings in which they
        appear.
    :return: new index with all keys stemmed and postings merged.
    """
    new_index = {}
    for term, postings in index.items():
        # Stem once per key (the original stemmed up to three times) and
        # always build a fresh list so the source index's lists are never
        # aliased by the returned index.
        stem = ps.stem(term)
        new_index[stem] = new_index.get(stem, []) + postings
    return new_index
def get_postings_list_of_one_term(self, term):
    """
    Look up a term's postings list, i.e. the IDs of the documents it
    appears in.

    :param term: the term; it is stemmed before the lookup.
    :return: list of doc IDs, empty when the stem is not in the index.
    """
    return self.index.get(ps.stem(term), [])
def get_document_frequency(self, term):
    """
    Return the number of distinct documents the term appears in (df_t).

    Delegates to get_postings_list_of_one_term — as this method's original
    description stated — instead of duplicating the index lookup inline.

    :param term: the term we want the frequency of.
    :return: the number of documents the term appears in, or df_t; 0 when
        the term is absent from the index.
    """
    # The postings list holds one entry per occurrence, so dedupe with set()
    # to count documents rather than occurrences.
    return len(set(self.get_postings_list_of_one_term(term)))
def get_frequency_of_term_in_doc(self, term, doc_id):
    """
    Count how many times a document ID occurs in a term's postings list,
    i.e. the term's frequency within that document.

    :param term: the term; it is stemmed before the lookup.
    :param doc_id: the document we're looking at.
    :return: the number of times the term appears in the document, 0 when
        the term is absent from the index.
    """
    return self.index.get(ps.stem(term), []).count(doc_id)
def get_stemmed(self):
    """
    Stem all terms left after stopword removal and record the statistics.

    :return: row describing the stemmed index: the label "stemmed", the
        formatted count of distinct stems, the reduction from the previous
        compression step and the total reduction from the raw term list.
    """
    # A set comprehension dedupes the stems in a single pass.
    self.terms_stemmed = list(
        {ps.stem(term) for term in self.terms_remove_150_stopwords})

    step_reduction = self.get_reduction_percentage(
        self.terms_remove_150_stopwords, self.terms_stemmed)
    overall_reduction = self.get_reduction_percentage(
        self.terms, self.terms_stemmed)

    return [
        "stemmed",
        "{:,}".format(len(self.terms_stemmed)),
        step_reduction,
        overall_reduction,
    ]
def get_postings_lists(self, terms):
    """
    Split the query into individual terms and collect, for each stemmed
    term, the postings list of the documents it appears in.

    :param terms: the user's query string.
    :return: list of postings lists found from the terms in the query.
    """
    self.original_terms = terms
    self.terms = [ps.stem(token) for token in word_tokenize(terms)]
    # Missing terms map to an empty postings list; duplicate stems collapse
    # onto a single dictionary entry, exactly as in the original loop.
    postings_by_term = {term: self.index.get(term, []) for term in self.terms}
    return list(postings_by_term.values())
def compress(self, terms):
    """
    Apply the configured compression steps to a list of terms: stopword
    removal, stemming, case folding and/or number removal. Any combination
    of the four may be enabled on the instance.

    :param terms: list of terms to be compressed.
    :return: compressed list of terms.
    """
    if self.remove_stopwords:
        terms = [word for word in terms if word.lower() not in stopwords]

    if self.stem:
        terms = list(map(ps.stem, terms))

    if self.case_folding:
        terms = [word.casefold() for word in terms]

    if self.remove_numbers:
        # A "number" may contain thousands separators and a decimal point,
        # e.g. "1,234" or "3.14".
        def _is_number(word):
            return word.replace(",", "").replace(".", "").isdigit()

        terms = [word for word in terms if not _is_number(word)]

    return terms