Example #1
import os
import xml.etree.ElementTree as ET

from nltk.tokenize import word_tokenize

import helper  # project-local module (tokenization, stop words, similar words)

# InvertedIndex, get_meta_data, execute_query, get_top_classes, get_top_members,
# write_to_output, TOP_X_PERCENT_RESULTS and the top_* globals are assumed to be
# defined elsewhere in this module.

def search(dictionary_file, postings_file, query_file, output_file):
    try:
        # Remove previous output file
        os.remove(output_file)
    except OSError:
        pass
    inverted_index = InvertedIndex(dictionary_file, postings_file)
    meta_data = get_meta_data()

    tree = ET.parse(query_file)
    root = tree.getroot()
    title_tokens = []
    description_tokens = []

    raw_tokens = []

    for child in root:
        if child.tag == 'title':
            title_tokens = build_tokens(child.text)
            raw_tokens.extend(word_tokenize(child.text))
        elif child.tag == 'description':
            description_tokens = build_tokens(child.text)
            raw_tokens.extend(word_tokenize(child.text))

    raw_tokens = helper.remove_stop_words_without_normalize(helper.filter_invalid_characters(raw_tokens))
    additional_tokens = []
    for token in set(raw_tokens):
        additional_tokens.extend(helper.get_similar_words(token))

    title_tokens = helper.remove_stop_words(helper.filter_invalid_characters(title_tokens))
    description_tokens = helper.remove_stop_words(helper.filter_invalid_characters(description_tokens))

    # Tight results favour high precision; we use them as a proxy for true positives.
    tight_results = execute_query(title_tokens, description_tokens, [], inverted_index, meta_data)
    global top_UPC_classes
    global top_IPC_classes
    global top_family_members
    global top_cited_by

    # From the true-positive proxy results, collect the top UPC classes, IPC
    # classes, family members, and cited-by documents. These signals help us
    # determine which documents remain similar to the original top results
    # once the additional similar words are added to the query.
    top_UPC_classes = get_top_classes(tight_results, meta_data['UPC_class'], 6)
    top_IPC_classes = get_top_classes(tight_results, meta_data['IPC_class'], 4)
    top_family_members = get_top_members(tight_results, meta_data['family_members'], 20)
    top_cited_by = get_top_members(tight_results, meta_data['cited_by'], 20)
    
    # Query expansion via per-document top terms (currently disabled):
    # supplementary_results = expand_query(tight_results, meta_data['doc_top_terms'], inverted_index, meta_data)

    # Normalize the expansion tokens gathered from synonyms and hypernyms
    additional_tokens = helper.normalize_tokens(list(set(additional_tokens)))

    results = execute_query(title_tokens, description_tokens, additional_tokens, inverted_index, meta_data)

    # Keep only the top X percent of the ranked results
    k = int(TOP_X_PERCENT_RESULTS * len(results))
    # j = int(TOP_X_PERCENT_RESULTS * len(supplementary_results))
    # results = list(set(results[:k] + supplementary_results[:j]))
    write_to_output(output_file, results[:k])
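
For context, search() only inspects the title and description children of the
query root, so a compatible query file can be sketched as follows (the root tag
name and the text contents are hypothetical; only the child tag names are taken
from the parsing loop above):

    <?xml version="1.0"?>
    <query>
      <title>photovoltaic cell efficiency</title>
      <description>Methods for improving the efficiency of solar panels.</description>
    </query>

With such a file on disk, the entry point is a single call, e.g.
search('dictionary.txt', 'postings.txt', 'query.xml', 'output.txt')
(file names illustrative).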
Example #2 (a variant of Example #1 that omits the cited-by signal and uses a wider family-member cutoff; same imports and module-level definitions as above)
def search(dictionary_file, postings_file, query_file, output_file):
    try:
        # Remove previous output file
        os.remove(output_file)
    except OSError:
        pass
    inverted_index = InvertedIndex(dictionary_file, postings_file)
    meta_data = get_meta_data()
    tree = ET.parse(query_file)
    root = tree.getroot()
    title_tokens = []
    description_tokens = []

    raw_tokens = []

    for child in root:
        if child.tag == 'title':
            title_tokens = build_tokens(child.text)
            raw_tokens.extend(word_tokenize(child.text))
        elif child.tag == 'description':
            description_tokens = build_tokens(child.text)
            raw_tokens.extend(word_tokenize(child.text))

    raw_tokens = helper.remove_stop_words_without_normalize(
        helper.filter_invalid_characters(raw_tokens))
    additional_tokens = []
    for token in set(raw_tokens):
        additional_tokens.extend(helper.get_similar_words(token))

    title_tokens = helper.remove_stop_words(
        helper.filter_invalid_characters(title_tokens))
    description_tokens = helper.remove_stop_words(
        helper.filter_invalid_characters(description_tokens))

    # Tight results favour high precision; we use them as a proxy for true positives.
    tight_results = execute_query(title_tokens, description_tokens, [],
                                  inverted_index, meta_data)
    global top_UPC_classes
    global top_IPC_classes
    global top_family_members
    top_UPC_classes = get_top_classes(tight_results, meta_data['UPC_class'], 6)
    top_IPC_classes = get_top_classes(tight_results, meta_data['IPC_class'], 4)
    top_family_members = get_top_members(tight_results,
                                         meta_data['family_members'], 30)
    # Query expansion via per-document top terms (currently disabled):
    # supplementary_results = expand_query(tight_results, meta_data['doc_top_terms'], inverted_index, meta_data)

    # Normalize the expansion tokens gathered from synonyms and hypernyms
    additional_tokens = helper.normalize_tokens(list(set(additional_tokens)))

    results = execute_query(title_tokens, description_tokens,
                            additional_tokens, inverted_index, meta_data)

    k = int(TOP_X_PERCENT_RESULTS * len(results))
    # j = int(TOP_X_PERCENT_RESULTS * len(supplementary_results))
    # results = list(set(results[:k] + supplementary_results[:j]))
    write_to_output(output_file, results[:k])
Example #3
from nltk.tokenize import word_tokenize

import helper

def build_tokens(text):
    # Tokenize raw text, then apply the helper's normalization
    # (e.g. case-folding/stemming; the exact scheme is project-local)
    tokens = word_tokenize(text)
    return helper.normalize_tokens(tokens)
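
A quick usage sketch (output is illustrative; the exact tokens depend on what
helper.normalize_tokens does, which is project-local):

    tokens = build_tokens("Improving photovoltaic cell efficiency")
    # e.g. ['improv', 'photovolta', 'cell', 'effici'] if normalization stems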