示例#1
0
def get_RF(query_title, query_desc, K):
    """Build a corpus-based query expansion from the K best-scoring documents.

    The title and description words are each joined with spaces and turned
    into term lists via ``utils.get_terms_list``.  Those terms are scored with
    ``get_document_scores``, the ids of the ``K`` highest-scoring documents
    are selected, and the result of
    ``utils.query_expansion_corpus(corpus_dir, top_ids)`` is returned as is.

    Args:
        query_title: iterable of title words (strings).
        query_desc: iterable of description words (strings).
        K: number of top-ranked documents to feed into the expansion.

    Returns:
        Whatever ``utils.query_expansion_corpus`` returns.
        NOTE(review): the caller in ``search`` unpacks it as an
        ``(expanded_title, expanded_desc)`` pair -- confirm against utils.
    """
    # Turn both query fields into term lists.
    title_terms = utils.get_terms_list(' '.join(query_title))
    desc_terms = utils.get_terms_list(' '.join(query_desc))

    # Rank all scored documents, best first, and keep the ids of the top K.
    doc_scores = get_document_scores(title_terms, desc_terms)
    ranked = sorted(doc_scores.items(), key=lambda pair: pair[1], reverse=True)
    top_doc_ids = [doc_id for doc_id, _ in ranked[:K]]

    return utils.query_expansion_corpus(corpus_dir, top_doc_ids)
示例#2
0
def search():
    """Run the retrieval pipeline for the configured query and write results.

    Reads the module-level settings ``dict_file``, ``postings_file`` and
    ``query_file``, optionally expands the query (controlled by the
    ``USE_GOOGLE``, ``USE_WORDNET`` and ``USE_CORPUS`` flags), scores the
    documents, keeps those returned by ``filter_documents_with_threshold``
    with a threshold of 0.15 and passes the result to ``write_output_file``.

    The postings file handles stored in ``zones[...]['post']`` are closed on
    exit, including when one of the steps raises.
    """
    # read in metadata and process zone dictionaries and postings files
    process_main_dict(dict_file)
    process_main_post(postings_file)

    # Everything below runs with the postings handles in ``zones`` open
    # (presumably opened by the two calls above -- confirm), so guard it with
    # try/finally to avoid leaking them if any step fails.
    try:
        # get title and description terms from query
        query_title, query_desc = utils.XML_query_parser(query_file)

        # Expand query title and description using google
        if USE_GOOGLE:
            expanded_title, expanded_desc = utils.query_expansion_google(query_title)
            query_title += expanded_title
            query_desc += expanded_desc

        # Expand query title and description using wordnet
        if USE_WORDNET:
            title_dict = zones['title']['dict']
            abstract_dict = zones['abstract']['dict']

            def known_expansions(field_words):
                # Keep only expansion terms that occur in the title or the
                # abstract dictionary; other terms cannot match any posting.
                terms = utils.get_terms_list(' '.join(field_words))
                return [term for term in utils.query_expansion_wordnet(terms)
                        if term in title_dict or term in abstract_dict]

            query_title += known_expansions(query_title)
            query_desc += known_expansions(query_desc)

        # Expand query title and description using RF on corpus
        if USE_CORPUS:
            expanded_title, expanded_desc = get_RF(query_title, query_desc, 5)
            query_title += expanded_title
            query_desc += expanded_desc

        # get query terms
        query_title_terms = utils.get_terms_list(' '.join(query_title))
        query_desc_terms = utils.get_terms_list(' '.join(query_desc))

        # get document scores and retrieve documents
        scores = get_document_scores(query_title_terms, query_desc_terms)
        kept = filter_documents_with_threshold(scores, 0.15)
        # Build a real list (not a lazy ``map`` object on Python 3) holding
        # the first element of every kept entry -- presumably the doc id.
        output = [entry[0] for entry in kept]

        # write output
        write_output_file(output)
    finally:
        # close postings files
        for zone_name in zones:
            zones[zone_name]['post'].close()