Exemplo n.º 1
0
def GetSimilarNames(str):
    """Return processed text made of lemma names related to each word.

    Every whitespace-separated word of ``str`` is looked up with
    ``wn.synsets`` (presumably NLTK's WordNet corpus reader -- confirm
    against the file's imports).  The lower-cased name of every lemma of
    every returned synset is collected, skipping names equal to the word
    itself.  Duplicates are kept.  The collected names are joined with
    single spaces and passed through ``inverted_index.process_text``, whose
    result is returned unchanged.

    Args:
        str: text to expand.  The name shadows the builtin but is kept
            because it is part of the public signature.
    """
    expansions = []
    for token in str.split():
        lemma_names = (
            lemma.name().lower()
            for synset in wn.synsets(token)
            for lemma in synset.lemmas()
        )
        # Compare against the token as given (not lower-cased).
        expansions.extend(name for name in lemma_names if name != token)
    joined = " ".join(expansions)
    return inverted_index.process_text({'TEXT': joined})
Exemplo n.º 2
0
def generate_snippet_html(args):
    """Score a query, build snippet HTML for the ten best hits and show it.

    Args:
        args: dict; ``args['QUERY_STRING']`` holds the raw query text.

    Side effects:
        Loads the inverted index, rebinds the module-level
        ``__bm25_score__`` to a fresh dict that ``__bm25__`` is expected to
        fill, and hands the raw query plus the generated HTML to
        ``Start_Dialog.DisplayResult``.  Nothing is returned.
    """
    global __bm25_score__
    __load_inverted_index__()

    raw_query = args['QUERY_STRING']
    processed_query = inverted_index.process_text({'TEXT': raw_query})

    # Start from an empty score table so only this query contributes.
    __bm25_score__ = {}
    __bm25__({'QUERY_STRING': processed_query})

    # Document ids ordered by descending score; keep the first ten.
    ranked_ids = sorted(__bm25_score__, key=__bm25_score__.get, reverse=True)
    html_args = {
        'BASE_DIRECTORY': __base_directory__,
        'DOCUMENTIDS': ranked_ids[:10],
        'QUERY_STRING': processed_query,
    }
    snippet_html = generate_html(html_args)
    Start_Dialog.DisplayResult(raw_query, snippet_html)
Exemplo n.º 3
0
def generate_bm25(args):
    """Score every query with ``__bm25__`` and save the top 100 hits per query.

    Queries are obtained from ``cacm_query.get_queries`` when
    ``args['FILELOCATION']`` is truthy; otherwise every line of
    ``args['STEM_FILELOCATION']`` is one query, keyed by its 1-based line
    number (as a string).  Queries are processed in ascending numeric order
    of their ids.

    When ``args['EXPAND_QUERY']`` is truthy the query is first scored as-is,
    the words of the 20 best-scoring documents (read from
    ``<BASE_DIRECTORY>/corpus/<doc_id>.txt``) are counted in the module-level
    ``__pseudo_relevance__`` table, and the 20 most frequent words are
    appended to the query before the final scoring run.

    Args:
        args: dict with the keys ``BASE_DIRECTORY``, ``FILELOCATION``,
            ``STEM_FILELOCATION`` (read only when ``FILELOCATION`` is
            falsy), ``RELEVANCE_FILELOCATION``, ``EXPAND_QUERY`` and
            ``FILE_EXT``.

    Side effects:
        Rebinds the module-level ``__bm25_score__``, mutates
        ``__pseudo_relevance__``, prints progress messages and writes one
        ``bm25_[<FILE_EXT>_]query_<id>.txt`` file per query into
        ``BASE_DIRECTORY``.  Nothing is returned.
    """
    global __bm25_score__
    base_directory = args['BASE_DIRECTORY']
    __load_inverted_index__({'BASE_DIRECTORY': base_directory})
    # Single-argument parenthesised print behaves the same on Python 2.
    print("\tCompleted loading inverted index from pickle files.")
    username = getpass.getuser()

    file_location = args['FILELOCATION']
    if file_location:
        query_strings = cacm_query.get_queries(
            {'FILELOCATION': file_location})
    else:
        query_strings = {}
        # Context manager so the query file is closed after reading.
        with open(args['STEM_FILELOCATION']) as stem_file:
            for line_number, query in enumerate(stem_file, 1):
                query_strings[str(line_number)] = query.strip()

    relevance_file_location = args['RELEVANCE_FILELOCATION']
    if relevance_file_location:
        __load_relevance_data__(
            {'RELEVANCE_FILELOCATION': relevance_file_location})

    header = ['query_id', 'Q0', 'doc_id', 'rank', 'BM25_score', 'system_name']
    for query_id in sorted(query_strings, key=int):
        __bm25_score__ = {}
        query_string = inverted_index.process_text(
            {'TEXT': query_strings[query_id]})

        if args['EXPAND_QUERY']:
            # Start this query's feedback from an empty table.  The
            # module-level dict otherwise keeps word counts from documents
            # retrieved for earlier queries, which would leak into this
            # query's expansion terms.
            __pseudo_relevance__.clear()
            __bm25__({'QUERY_STRING': query_string, 'QUERY_ID': query_id})
            feedback_ids = sorted(__bm25_score__,
                                  key=__bm25_score__.get,
                                  reverse=True)[:20]
            for documentid in feedback_ids:
                doc_path = os.path.join(base_directory, 'corpus',
                                        documentid + ".txt")
                # Close each corpus file as soon as it has been read.
                with open(doc_path, 'r') as doc_file:
                    doc_text = doc_file.read()
                for word in doc_text.split():
                    __pseudo_relevance__[word] = (
                        __pseudo_relevance__.get(word, 0) + 1)
            expansion_terms = sorted(__pseudo_relevance__,
                                     key=__pseudo_relevance__.get,
                                     reverse=True)[:20]
            query_string += ' ' + ' '.join(expansion_terms)
            # Discard the first-pass scores before the final run.
            __bm25_score__ = {}

        __bm25__({'QUERY_STRING': query_string, 'QUERY_ID': query_id})

        top_ids = sorted(__bm25_score__,
                         key=__bm25_score__.get,
                         reverse=True)[:100]
        bm25_scores = [header]
        for rank, documentid in enumerate(top_ids, 1):
            bm25_scores.append([
                query_id, 'Q0', documentid, rank, __bm25_score__[documentid],
                username
            ])

        filename = 'bm25_'
        file_extension = args['FILE_EXT']
        if file_extension:
            filename += '%s_' % file_extension
        filename += 'query_%s.txt' % query_id
        with open(os.path.join(base_directory, filename), 'w') as bm25_file:
            bm25_file.write("\n".join("\t\t".join(map(str, row))
                                      for row in bm25_scores))
        print("\t\tProcessed query {}".format(query_id))

    print("\tSaved query results in {} directory".format(base_directory))
Exemplo n.º 4
0
def generate_bm25(args):
    """Score every query with ``__bm25__`` and save rankings plus snippets.

    Queries are obtained from ``cacm_query.get_queries`` when
    ``args['FILELOCATION']`` is truthy; otherwise every line of
    ``args['STEM_FILELOCATION']`` is one query, keyed by its 1-based line
    number (as a string).  Queries are processed in ascending numeric order
    of their ids.

    When ``args['EXPAND_QUERY']`` is truthy the processed query is extended
    with the output of ``Thesaurus.GetSimilarNames`` before scoring.  The
    snippet HTML is always generated from the unexpanded processed query.

    Args:
        args: dict with the keys ``BASE_DIRECTORY``, ``FILELOCATION``,
            ``STEM_FILELOCATION`` (read only when ``FILELOCATION`` is
            falsy), ``RELEVANCE_FILELOCATION``, ``EXPAND_QUERY`` and
            ``FILE_EXT``.

    Side effects:
        Rebinds the module-level ``__bm25_score__``, prints progress
        messages and, for each query, writes
        ``bm25_[<FILE_EXT>_]query_<id>.txt`` (top 100 results) and a
        same-named ``.html`` file (snippets for the top 10) into
        ``BASE_DIRECTORY``.  Nothing is returned.
    """
    global __bm25_score__
    base_directory = args['BASE_DIRECTORY']
    __load_inverted_index__({'BASE_DIRECTORY': base_directory})
    # Single-argument parenthesised print behaves the same on Python 2.
    print("\tCompleted loading inverted index from pickle files.")
    username = getpass.getuser()

    file_location = args['FILELOCATION']
    if file_location:
        query_strings = cacm_query.get_queries(
            {'FILELOCATION': file_location})
    else:
        query_strings = {}
        # Context manager so the query file is closed after reading.
        with open(args['STEM_FILELOCATION']) as stem_file:
            for line_number, query in enumerate(stem_file, 1):
                query_strings[str(line_number)] = query.strip()

    relevance_file_location = args['RELEVANCE_FILELOCATION']
    if relevance_file_location:
        __load_relevance_data__(
            {'RELEVANCE_FILELOCATION': relevance_file_location})

    header = ['query_id', 'Q0', 'doc_id', 'rank', 'BM25_score', 'system_name']
    for query_id in sorted(query_strings, key=int):
        __bm25_score__ = {}
        query_string = inverted_index.process_text(
            {'TEXT': query_strings[query_id]})
        # Keep the unexpanded form: snippets are built from it.
        original_query_string = query_string
        if args['EXPAND_QUERY']:
            query_string += ' ' + Thesaurus.GetSimilarNames(query_string)
        __bm25__({'QUERY_STRING': query_string, 'QUERY_ID': query_id})

        documentids = sorted(__bm25_score__,
                             key=__bm25_score__.get,
                             reverse=True)[:100]
        bm25_scores = [header]
        for rank, documentid in enumerate(documentids, 1):
            bm25_scores.append([
                query_id, 'Q0', documentid, rank, __bm25_score__[documentid],
                username
            ])

        filename = 'bm25_'
        file_extension = args['FILE_EXT']
        if file_extension:
            filename += '%s_' % file_extension
        filename += 'query_%s.txt' % query_id
        with open(os.path.join(base_directory, filename), 'w') as bm25_file:
            bm25_file.write("\n".join("\t\t".join(map(str, row))
                                      for row in bm25_scores))

        # Build the HTML before opening the output file so a failure in
        # snippet generation does not leave an empty .html file behind.
        snippet_html = generate_snippet.generate_html({
            'BASE_DIRECTORY': base_directory,
            'DOCUMENTIDS': documentids[:10],
            'QUERY_STRING': original_query_string,
        })
        heading_html = ('<b>BM25</b><br><br>Query - <b>%s</b><br><br>'
                        % query_strings[query_id])
        html_filename = os.path.splitext(filename)[0] + '.html'
        with open(os.path.join(base_directory, html_filename),
                  'w') as snippet_file:
            snippet_file.write(heading_html + snippet_html)
        print("\t\tProcessed query {}".format(query_id))

    print("\tSaved query results in {} directory".format(base_directory))