Exemplo n.º 1
0
def BKing(file_name, query_string, return_count=10):
    """Rank the corpus documents against a query and print a report.

    The inverted index is read from ``file_name + ".index"``.  The query is
    stemmed, query and document weights are computed by the project helpers
    (``query_weight`` / ``docs_weight``), and documents are scored with
    ``cos_vector_space_model`` (cosine similarity over tf-idf weights,
    according to the original comments).  The best ``return_count`` documents
    are printed, followed by per-term and per-document corpus information.

    Args:
        file_name: Path of the corpus file; the index is looked up next to it.
        query_string: Raw query text; it is stemmed before being used.
        return_count: Maximum number of ranked documents to print.

    Returns:
        None.  All results are written to standard output.

    Raises:
        SystemExit: With exit status 1 when the index file does not exist.

    Note:
        Every ``print`` uses the single-argument parenthesised form so the
        output is the same under the Python 2 print statement and the
        Python 3 print function.
    """
    # Local import: ``sys.exit`` replaces the ``site``-provided ``exit``
    # helper, which is not guaranteed to exist (e.g. under ``python -S``).
    import sys

    idx_file = file_name + ".index"
    if not os.path.isfile(idx_file):
        # Adjacent literals instead of a backslash continuation, which used
        # to embed the source indentation in the middle of the message.
        print("Error: inverted index file (%s) not found. Please run:\n"
              "    python corpusParser.py -f %s -s %s"
              % (idx_file, file_name, 'stopword'))
        sys.exit(1)

    dicts = decompress_dict(idx_file)
    # The second element (originally bound to ``docsyID``) is not used here.
    (N, _unused_docs, docsByName) = getCorpusDoc(file_name)

    query_string = stem_query(query_string)
    (query_table, docs_set, term_index) = query_weight(
        query_string, dicts, idx_file, N)
    docs_table = docs_weight(docs_set, term_index, query_string, query_table)
    docs_score = cos_vector_space_model(query_table, docs_table)

    # One-element tuples keep ``%`` from unpacking a tuple-valued argument.
    print("Query terms: %s" % (query_string,))
    print("Top %s results:" % (return_count,))
    print("doc#\tscore")

    ranked = sorted(docs_score, key=docs_score.get, reverse=True)
    # A non-positive return_count prints no rows, as before.
    shown = ranked[:max(return_count, 0)]
    for doc_id in shown:
        print("%d\t%.6f" % (int(doc_id), docs_score[doc_id]))

    # Report a shortfall only when fewer documents than requested matched;
    # the previous ``count >= 0`` test also fired on an exact match.
    if len(shown) < return_count:
        print("Only have found  %d relevant documents." % len(shown))

    print('\n*****************Some information about corpus*************************')
    print("The query string in corups information:")
    for term in term_index:
        # Only terms that occur in at least one document are reported.
        if term_index[term].df > 0:
            term_index[term].output()

    print('\n*****************Some information about corpus*************************')
    for doc in docsByName.values():
        print("%s  docID: %s  docLength: %s" % (doc[1], doc[0], doc[2]))
Exemplo n.º 2
0
def query(file_name, query_string, return_count=10):
    """Rank the indexed documents against a query and print the best hits.

    The inverted index is read from ``file_name + ".index"``.  The query is
    stemmed, query and document weights are computed by the project helpers
    (``query_weight`` / ``docs_weight``), and documents are scored with
    ``cos_vector_space_model`` (cosine similarity over tf-idf weights,
    according to the original comments).

    Args:
        file_name: Path of the corpus file; the index is looked up next to it.
        query_string: Raw query text; it is stemmed before being used.
        return_count: Maximum number of ranked documents to print.

    Returns:
        None.  The ranking is written to standard output.

    Raises:
        SystemExit: With exit status 1 when the index file does not exist.

    Note:
        Every ``print`` uses the single-argument parenthesised form so the
        output is the same under the Python 2 print statement and the
        Python 3 print function.
    """
    # Local import: ``sys.exit`` replaces the ``site``-provided ``exit``
    # helper, which is not guaranteed to exist (e.g. under ``python -S``).
    import sys

    idx_file = file_name + ".index"
    if not os.path.isfile(idx_file):
        # Name the file that was actually checked; the old text referred to
        # ``.index.dict`` / ``.index.idx`` files this function never reads.
        print("Error: inverted index file (%s) not found." % (idx_file,))
        sys.exit(1)

    dicts = decompress_dict(idx_file)

    query_string = stem_query(query_string)
    (query_table, docs_set, term_index) = query_weight(query_string, dicts,
                                                       idx_file)
    docs_table = docs_weight(docs_set, term_index, query_string, query_table)
    docs_score = cos_vector_space_model(query_table, docs_table)

    # One-element tuples keep ``%`` from unpacking a tuple-valued argument.
    print("Query terms: %s" % (query_string,))
    print("Top %s results:" % (return_count,))
    print("doc#\tscore")

    ranked = sorted(docs_score, key=docs_score.get, reverse=True)
    # Slice instead of counting the parameter down; a non-positive
    # return_count prints no rows, as before.
    for doc_id in ranked[:max(return_count, 0)]:
        print("%d\t%.3f" % (int(doc_id), docs_score[doc_id]))
Exemplo n.º 3
0
def query(file_name, query_string, return_count=10):
    """Rank the indexed documents against a query and print the best hits.

    The inverted index is read from ``file_name + ".index"``.  The query is
    stemmed, query and document weights are computed by the project helpers
    (``query_weight`` / ``docs_weight``), and documents are scored with
    ``cos_vector_space_model`` (cosine similarity over tf-idf weights,
    according to the original comments).

    Args:
        file_name: Path of the corpus file; the index is looked up next to it.
        query_string: Raw query text; it is stemmed before being used.
        return_count: Maximum number of ranked documents to print.

    Returns:
        None.  The ranking is written to standard output.

    Raises:
        SystemExit: With exit status 1 when the index file does not exist.

    Note:
        Every ``print`` uses the single-argument parenthesised form so the
        output is the same under the Python 2 print statement and the
        Python 3 print function.
    """
    # Local import: ``sys.exit`` replaces the ``site``-provided ``exit``
    # helper, which is not guaranteed to exist (e.g. under ``python -S``).
    import sys

    idx_file = file_name + ".index"
    if not os.path.isfile(idx_file):
        # Name the file that was actually checked; the old text referred to
        # ``.index.dict`` / ``.index.idx`` files this function never reads.
        print("Error: inverted index file (%s) not found." % (idx_file,))
        sys.exit(1)

    dicts = decompress_dict(idx_file)

    query_string = stem_query(query_string)
    (query_table, docs_set, term_index) = query_weight(query_string, dicts, idx_file)
    docs_table = docs_weight(docs_set, term_index, query_string, query_table)
    docs_score = cos_vector_space_model(query_table, docs_table)

    # One-element tuples keep ``%`` from unpacking a tuple-valued argument.
    print("Query terms: %s" % (query_string,))
    print("Top %s results:" % (return_count,))
    print("doc#\tscore")

    ranked = sorted(docs_score, key=docs_score.get, reverse=True)
    # Slice instead of counting the parameter down; a non-positive
    # return_count prints no rows, as before.
    for doc_id in ranked[:max(return_count, 0)]:
        print("%d\t%.3f" % (int(doc_id), docs_score[doc_id]))