# Imports assumed by the snippets below: `Rake` comes from the rake_nltk
# package; `p`/`pp` (preprocessing), `db`, `preprocess_query`,
# `remove_punctuations`, and `create_snippets` are project-local helpers.
import re
import time
from collections import defaultdict

from rake_nltk import Rake


def get_top_k_docs(self, query):
    query_words = p.preprocess_query(query)  # preprocess the query
    if len(query_words) > 10:  # long query: reduce to RAKE keyphrases
        r = Rake(min_length=1, max_length=4)
        r.extract_keywords_from_text(query)
        phrases = list(set(' '.join(r.get_ranked_phrases()).split()))
        query_words = p.preprocess_query(' '.join(phrases))
    top_k_docs = self.model.get_top_n(query_words, self.corpus, 100)  # top 100 docs
    # Pre-compile one case-insensitive matcher per query word.
    insensitive_comparers = {}
    for qw in query_words:
        insensitive_comparers[qw] = re.compile(re.escape(qw), re.IGNORECASE)
    results = {
        'titles': [],
        'abstracts': [],
        'ids': [],
        'authors': [],
        'links': [],
        'category': []
    }
    for i in top_k_docs:
        abstract = i['abstract'].replace('\n', '')
        title = i['title'].replace('\n', '')
        authors = i['authors'].replace('\n', '')
        doc_id = i['id']  # renamed from `id` to avoid shadowing the builtin
        category = self.cat_data.iloc[doc_id]['pred_category']
        if abstract == '' or title == '' or authors == '':
            continue
        abstract = p.remove_punctuations(abstract)
        doc_text = title.lower() + ' ' + abstract.lower() + ' ' + authors.lower()
        # Skip documents that contain none of the query words.
        query_words_found = False
        for qw in query_words:
            if qw in doc_text:
                query_words_found = True
                break
        if not query_words_found:
            continue
        # Bold-mark query words in the abstract.
        for qw in query_words:
            abstract = insensitive_comparers[qw].sub('<b>' + qw + '</b>', abstract)
        results['titles'].append(title.title())
        results['authors'].append(authors)
        results['abstracts'].append(abstract)
        results['ids'].append(doc_id)
        results['links'].append(i['link'])
        results['category'].append(category)
    return results

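# Note on the bold-marking above: the IGNORECASE substitution rewrites the
# matched span as the lowercase query word, so the document's original casing
# is not preserved. A minimal illustration (hypothetical strings):
#
#   comparer = re.compile(re.escape('bert'), re.IGNORECASE)
#   comparer.sub('<b>' + 'bert' + '</b>', 'BERT improves ranking')
#   # -> '<b>bert</b> improves ranking'
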
def store_relevance_judgements(self, query, doc_id, rel_score):
    """
    Args:
        query: str
        doc_id: unique ID corresponding to a document
        rel_score: '-1' (not relevant) or '1' (relevant), passed as a string
    """
    if rel_score not in ['-1', '1']:
        print('Invalid Relevance Feedback:', rel_score)
        return
    # Append one CSV row per judgement; commas are stripped from the query
    # so the row stays parseable.
    with open(self.relevance_scores_file, 'a') as f:
        f.write(','.join(
            [str(i) for i in (query.replace(',', ''), doc_id, rel_score)]) + '\n')
    query_words = preprocess_query(query)
    if len(query_words) > 10:  # long query: reduce to RAKE keyphrases
        r = Rake(min_length=1, max_length=4)
        r.extract_keywords_from_text(query)
        phrases = r.get_ranked_phrases()
        query_words = ' '.join(phrases).split()
    self.relevance_scores[(tuple(query_words), int(doc_id))] += int(rel_score)

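# Usage sketch (hypothetical names and values; assumes
# preprocess_query('deep learning') returns ['deep', 'learning']):
#
#   engine.store_relevance_judgements('deep learning', 7, '1')
#   # the log gains the row:  deep learning,7,1
#   # engine.relevance_scores[(('deep', 'learning'), 7)] is now 1
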
def get_top_k_docs(model, query, corpus, k=100):
    """
    Args:
        model: search engine exposing a `get_top_n(tokenized_query, corpus, n=k)` method
        query: string
        k: int (default: 100)
    Returns:
        top_k_docs: dictionary with keys: titles, abstracts, ids. dict[key] is a
            list of up to k elements in descending order of relevance
    """
    tokenized_query = preprocess_query(query)
    query_words = preprocess_query(query,
                                   stemming=False,
                                   lower_case=True,
                                   lemma=False,
                                   stopword_removal=True)
    top_k_docs = model.get_top_n(tokenized_query, corpus, n=k)
    results = {'titles': [], 'abstracts': [], 'ids': []}
    for i in top_k_docs:
        abstract = i['abstract'].replace('\n', '')
        if abstract == '':  # fall back to the introduction
            abstract = i['introduction'].replace('\n', '')
        if abstract == '':
            continue
        doc_text = (i['title'].replace('\n', '').lower() + ' ' +
                    i['abstract'].replace('\n', '').lower() + ' ' +
                    i['introduction'].replace('\n', '').lower())
        # Skip documents that contain none of the query words.
        query_words_found = False
        for qw in query_words:
            if qw in doc_text:
                query_words_found = True
                break
        if not query_words_found:
            continue
        results['abstracts'].append(abstract)
        results['ids'].append(i['id'])
        results['titles'].append(i['title'].replace('\n', ''))
    return results

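# Hypothetical wiring for the helper above. rank_bm25's BM25Okapi is one
# model with the required get_top_n(tokenized_query, corpus, n=k) signature;
# the corpus entry mirrors the fields the function reads, and every value
# here is made up.
from rank_bm25 import BM25Okapi

corpus = [{'id': 0,
           'title': 'Probabilistic Ranking',
           'abstract': 'We revisit BM25 ...',
           'introduction': 'Ranking functions ...'}]
tokenized_corpus = [preprocess_query(d['title'] + ' ' + d['abstract'])
                    for d in corpus]
model = BM25Okapi(tokenized_corpus)
top = get_top_k_docs(model, 'probabilistic ranking', corpus, k=10)
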
def find(query, result_count):
    con = db.connection()
    start_time = time.perf_counter_ns()
    query_tokens = pp.preprocess_query(query)
    results = db.get_all_multiword_postings(con, query_tokens)
    end_time = time.perf_counter_ns()
    search_time = round((end_time - start_time) / 1000000)  # ns -> ms
    result_count = min(len(results), result_count)
    tab = ' ' * 2
    freq = 'Frequencies'
    doc = 'Document'
    snip = 'Snippet'
    # Width of the document column: the longest document name, or the header.
    longest_name = 0
    for i in range(result_count):
        if len(results[i].document_name) > longest_name:
            longest_name = len(results[i].document_name)
    longest_name = max(longest_name, len(doc))
    print(f"Results for query: \"{query}\"")
    print()
    print(f"{tab}Results found in {search_time}ms.")
    print(f"{tab}{freq} {doc}{' ' * (longest_name - len(doc))} {snip}")
    print()
    print(f"{tab}{'-' * len(freq)} {'-' * longest_name} {'-' * longest_name}")
    for i in range(result_count):
        result = results[i]
        snippets = create_snippets(result, 3)
        # Join the snippets, inserting '...' between non-adjacent fragments.
        snippet_string = ''
        for j in range(len(snippets)):
            snippet = snippets[j]
            if snippet.front:
                if j == 0:
                    snippet_string += '... '
                elif not snippets[j - 1].back:
                    snippet_string += '... '
            snippet_string += snippet.text
            if snippet.back:
                snippet_string += ' ... '
            else:
                snippet_string += ' '
        print(f"{tab}{result.frequency_sum}{' ' * (len(freq) - len(str(result.frequency_sum)))} "
              f"{result.document_name}{' ' * (longest_name - len(result.document_name))} "
              f"{snippet_string}")
    print()

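# Illustrative console output from find() (all values hypothetical; column
# widths follow the padding logic above):
#
#   Results for query: "inverted index"
#
#     Results found in 12ms.
#     Frequencies Document  Snippet
#
#     ----------- --------- ---------
#     7           notes.txt ... builds the inverted index in two passes ...
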
def __init__(self, model, corpus, relevance_scores_file):
    """
    model: Search Engine that has `get_top_n(query_words, corpus, n=k)` method
    relevance_scores_file: stores (query, doc_id, rel_score) in this file
    """
    self.model = model
    self.corpus = corpus
    self.relevance_scores_file = relevance_scores_file
    self.relevance_scores = defaultdict(int)
    with open(self.relevance_scores_file, 'r') as f:
        for line in f:
            query, doc_id, rel_score = line.replace('\n', '').split(',')
            query_words = preprocess_query(query)
            if len(query_words) > 10:  # long query search
                r = Rake(min_length=1, max_length=4)
                r.extract_keywords_from_text(query)
                phrases = r.get_ranked_phrases()
                query_words = ' '.join(phrases).split()
            self.relevance_scores[(tuple(query_words), int(doc_id))] += int(rel_score)

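# Replay arithmetic (hypothetical log contents): judgements for the same
# (query tokens, doc_id) key accumulate, so a log containing
#
#   deep learning,7,1
#   deep learning,7,1
#   deep learning,7,-1
#
# leaves relevance_scores[(('deep', 'learning'), 7)] == 1, again assuming
# preprocess_query('deep learning') returns ['deep', 'learning'].
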
def get_top_k_docs(self, query, k=100):
    """
    Args:
        query: string
        k: int (default: 100)
    Returns:
        top_k_docs: dictionary with keys: titles, abstracts, ids, links.
            dict[key] is a list of up to k elements in descending order of relevance
    """
    query_words = preprocess_query(query)
    if len(query_words) > 10:  # long query: reduce to RAKE keyphrases
        r = Rake(min_length=1, max_length=4)
        r.extract_keywords_from_text(query)
        phrases = list(set(' '.join(r.get_ranked_phrases()).split()))
        query_words = preprocess_query(' '.join(phrases))
    top_k_docs = self.model.get_top_n(query_words, self.corpus, n=k)
    # Pre-compile one case-insensitive matcher per query word.
    insensitive_comparers = {}
    for qw in query_words:
        insensitive_comparers[qw] = re.compile(re.escape(qw), re.IGNORECASE)
    results = {'titles': [], 'abstracts': [], 'ids': [], 'links': []}
    relevant = {'titles': [], 'abstracts': [], 'ids': [], 'links': []}
    not_relevant = {'titles': [], 'abstracts': [], 'ids': [], 'links': []}
    for i in top_k_docs:
        abstract = i['abstract'].replace('\n', '')
        if abstract == '':  # fall back to the introduction
            abstract = i['introduction'].replace('\n', '')
        if abstract == '':
            continue
        abstract = remove_punctuations(abstract)
        title = i['title'].replace('\n', '')
        if title == '':
            continue
        doc_text = (title.lower() + ' ' + abstract.lower() + ' ' +
                    i['introduction'].replace('\n', '').lower())
        # Skip documents that contain none of the query words.
        query_words_found = False
        for qw in query_words:
            if qw in doc_text:
                query_words_found = True
                break
        if not query_words_found:
            continue
        # Bold mark query words in abstract
        for qw in query_words:
            abstract = insensitive_comparers[qw].sub('<b>' + qw + '</b>', abstract)
        # Route the document by its accumulated relevance feedback.
        rel_score = self.relevance_scores[(tuple(query_words), i['id'])]
        if rel_score > 0:
            relevant['titles'].append(title.title())
            relevant['abstracts'].append(abstract)
            relevant['ids'].append(i['id'])
            relevant['links'].append(i['link'])
        elif rel_score < 0:
            not_relevant['titles'].append(title.title())
            not_relevant['abstracts'].append(abstract)
            not_relevant['ids'].append(i['id'])
            not_relevant['links'].append(i['link'])
        else:
            results['titles'].append(title.title())
            results['abstracts'].append(abstract)
            results['ids'].append(i['id'])
            results['links'].append(i['link'])
    # Judged-relevant docs first, unjudged next, judged-not-relevant last.
    for key in ['abstracts', 'ids', 'titles', 'links']:
        results[key] = relevant[key] + results[key] + not_relevant[key]
    return results

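# End-to-end sketch of the feedback loop (assumed wiring: `SearchEngine` is a
# placeholder name for the class these methods belong to, BM25Okapi from
# rank_bm25 is one model with the required get_top_n signature, and the
# feedback file must already exist because __init__ opens it for reading):
from rank_bm25 import BM25Okapi

tokenized_corpus = [preprocess_query(d['title'] + ' ' + d['abstract'])
                    for d in corpus]
engine = SearchEngine(BM25Okapi(tokenized_corpus), corpus,
                      'relevance_scores.csv')
page = engine.get_top_k_docs('graph neural networks', k=50)
if page['ids']:
    # Mark the top hit relevant; it will be boosted on the next search.
    engine.store_relevance_judgements('graph neural networks',
                                      page['ids'][0], '1')
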