def get_RF(query_title, query_desc, K):
    """Expand a query from the K best-scoring documents in the corpus.

    The title and description words are each joined with spaces and turned
    into term lists via ``utils.get_terms_list``. Documents are then scored
    with ``get_document_scores`` and the ids of the K highest-scoring ones
    are handed to ``utils.query_expansion_corpus`` together with
    ``corpus_dir``.

    Args:
        query_title: iterable of words making up the query title.
        query_desc: iterable of words making up the query description.
        K: number of top-ranked documents to use for the expansion.

    Returns:
        Whatever ``utils.query_expansion_corpus`` returns for the selected
        document ids.
    """
    # Turn the raw title/description words into query terms.
    title_terms = utils.get_terms_list(' '.join(word for word in query_title))
    desc_terms = utils.get_terms_list(' '.join(word for word in query_desc))

    # Rank documents by score, highest first; sorted() is stable, so
    # equally-scored documents keep the mapping's iteration order.
    doc_scores = get_document_scores(title_terms, desc_terms)
    ranked = sorted(doc_scores.items(), key=lambda pair: pair[1], reverse=True)

    # Keep only the document ids of the K best-ranked entries.
    best_doc_ids = [doc_id for doc_id, _ in ranked[:K]]
    return utils.query_expansion_corpus(corpus_dir, best_doc_ids)
def _expand_with_wordnet(words):
    """Return WordNet expansions of ``words`` that occur in the index.

    ``words`` is joined with spaces, converted to terms with
    ``utils.get_terms_list`` and expanded with
    ``utils.query_expansion_wordnet``. Only candidates present in the
    ``title`` or ``abstract`` zone dictionary are kept.
    """
    candidates = utils.query_expansion_wordnet(
        utils.get_terms_list(' '.join(words)))
    return [term for term in candidates
            if term in zones['title']['dict']
            or term in zones['abstract']['dict']]


def search():
    """Run the query in ``query_file`` against the index and write the result.

    Steps:
      1. Load the zone dictionaries and postings files.
      2. Parse the query title and description from ``query_file``.
      3. Optionally expand both, depending on the ``USE_GOOGLE``,
         ``USE_WORDNET`` and ``USE_CORPUS`` flags.
      4. Score documents, pass the scores through
         ``filter_documents_with_threshold`` with 0.15, and write the first
         element of each surviving entry with ``write_output_file``.

    The postings handles stored under ``zones[...]['post']`` are closed when
    the function finishes, including when a step after loading raises.
    """
    # Read in metadata and process zone dictionaries and postings files.
    process_main_dict(dict_file)
    process_main_post(postings_file)

    try:
        # Get title and description terms from the query.
        query_title, query_desc = utils.XML_query_parser(query_file)

        # Expand query title and description using Google.
        if USE_GOOGLE:
            expanded_title, expanded_desc = utils.query_expansion_google(
                query_title)
            query_title += expanded_title
            query_desc += expanded_desc

        # Expand query title and description using WordNet.
        if USE_WORDNET:
            query_title += _expand_with_wordnet(query_title)
            query_desc += _expand_with_wordnet(query_desc)

        # Expand query title and description using RF on the corpus.
        if USE_CORPUS:
            expanded_title, expanded_desc = get_RF(query_title, query_desc, 5)
            query_title += expanded_title
            query_desc += expanded_desc

        # Build the final query terms.
        query_title_terms = utils.get_terms_list(' '.join(query_title))
        query_desc_terms = utils.get_terms_list(' '.join(query_desc))

        # Score documents and keep those that pass the threshold filter.
        scores = get_document_scores(query_title_terms, query_desc_terms)
        output = filter_documents_with_threshold(scores, 0.15)
        # A concrete list (not a lazy map object) so the writer can safely
        # iterate it more than once or take its length.
        output = [entry[0] for entry in output]

        # Write output.
        write_output_file(output)
    finally:
        # Close postings files even if one of the steps above failed.
        for zone_name in zones:
            zones[zone_name]['post'].close()