Пример #1
0
def run_ap_baseline(query_pickle, data_dir):
    """Run the KL-divergence retrieval baseline and report Mean Average Precision.

    Args:
        query_pickle: path to a pickled dict mapping query index -> query info;
            each entry is expected to carry 'answer' plus either a
            'languagemodel' or a 'wordcount' field.
        data_dir: directory with the collection data used to build KLRanker.
    """
    ranker = KLRanker(data_dir)

    query = load_from_pickle(query_pickle)

    _start_time = time.time()

    aps = []

    for query_idx in tqdm(query):
        entry = query[query_idx]

        # Prefer a precomputed language model; otherwise normalize raw counts.
        query_lm = entry.get('languagemodel')
        if query_lm is None:
            query_lm = normalize(entry['wordcount'], inplace=False)

        # Answer keys are numeric indices; the ranker output uses 'T####'
        # document names, so re-key the answer set to match.
        name_answer_set = {
            'T' + str(key).zfill(4): val
            for key, val in entry['answer'].items()
        }

        ret = ranker.rank(query_lm, negquery=None)

        # Calculate Mean Average Precision
        aps.append(average_precision(ret, name_answer_set))

    # Time end
    _end_time = time.time()
    # Guard against an empty query set to avoid ZeroDivisionError.
    mean_ap = sum(aps) / len(aps) if aps else 0.0
    print("Mean Average Precision: {}".format(mean_ap))
    print("Time taken: {} seconds".format(_end_time - _start_time))
Пример #2
0
def test_klranker():
    """Verify that KLRanker reproduces the reference scores for the PTV test set."""
    cwd = os.path.dirname(__file__)
    collection_dir = os.path.join(cwd, 'PTV.test')

    query_pickle = os.path.join(collection_dir, 'PTV.test.query.pickle')
    scores_pickle = os.path.join(collection_dir, 'PTV.test.scores.pickle')

    queries = load_from_pickle(query_pickle)
    expected_scores = load_from_pickle(scores_pickle)
    ranker = KLRanker(collection_dir)

    # Queries and reference scores are aligned pairwise.
    for idx, expected in zip(queries, expected_scores):
        lm = normalize(queries[idx]['wordcount'], inplace=False)
        assert ranker.rank(lm) == expected
Пример #3
0
def test_pickle():
    """Round-trip a dict through save_to_pickle / load_from_pickle."""
    obj = {1: 2, 3: 4}

    test_pickle_file = 'test.pickle'

    utils.save_to_pickle(test_pickle_file, obj)

    try:
        obj2 = utils.load_from_pickle(test_pickle_file)
        assert obj == obj2
    finally:
        # Remove the temp file even if the assertion fails, so a failed run
        # does not leave 'test.pickle' behind to pollute later runs.
        os.remove(test_pickle_file)
Пример #4
0
def _to_docname(doc_idx):
    """Map a numeric document index to its canonical 'T####' name, e.g. 7 -> 'T0007'."""
    return 'T' + str(doc_idx).zfill(4)


def run_reformat(data_dir, out_dir, lex_file, query_pickle):
    """Reformat a legacy retrieval collection into pickled dictionaries.

    Reads the lex/background/doclength/index files plus per-document language
    models from data_dir, re-keys numeric document IDs to 'T####' docnames,
    converts document language models back to integer word counts, and saves
    the results under out_dir (queries are written to query_pickle).

    Args:
        data_dir: source collection directory; its basename prefixes the
            .lex/.background/.doclength/.index files inside it.
        out_dir: output directory (created if missing).
        lex_file: path to the lexicon file to pickle as-is.
        query_pickle: output path for the reformatted query pickle.
    """
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    data_name = os.path.basename(data_dir)

    print("Reading lex...")
    encoded_lex_file = os.path.join(data_dir, data_name + '.lex')
    encoded_lex_dict = reader.readLex(encoded_lex_file)

    print("Reading background...")
    background_file = os.path.join(data_dir, data_name + '.background')
    background = reader.readBackground(background_file, encoded_lex_dict)

    print("Reading doclengs...")
    docleng_file = os.path.join(data_dir, data_name + '.doclength')
    doclengs = reader.readDocLength(docleng_file)

    # Change key from index to docname
    print("Change doclengths keys to document names...")
    namekey_doclengs = {
        _to_docname(doc_idx): length
        for doc_idx, length in doclengs.items()
    }

    print("Reading inverted index...")
    index_file = os.path.join(data_dir, data_name + '.index')
    inverted_index = reader.readInvIndex(index_file)

    print("Converting inverted index docnames...")
    named_inverted_index = {
        wordID: {_to_docname(docID): prob for docID, prob in docs_prob.items()}
        for wordID, docs_prob in tqdm(inverted_index.items())
    }

    print("Reading document models...")
    documents_lm = {}
    docmodel_dir = os.path.join(data_dir, 'docmodel', '*')
    for docpath in tqdm(glob(docmodel_dir)):
        documents_lm[os.path.basename(docpath)] = reader.readDocModel(docpath)

    print(
        "Converting documents language model to wordcount with document lengths"
    )
    documents_wc = {}
    for docname, lm in tqdm(documents_lm.items()):
        length = namekey_doclengs[docname]
        # Recover integer counts from probabilities scaled by document length.
        wc = {
            word_idx: round(word_prob * length)
            for word_idx, word_prob in lm.items()
        }
        documents_wc[docname] = {'wordcount': wc}

    print("Saving lex to pickle...")
    lex_dict = reader.readLex(lex_file)
    utils.save_to_pickle(os.path.join(out_dir, 'lex.pickle'), lex_dict)

    print("Saving documents to pickle...")
    utils.save_to_pickle(os.path.join(out_dir, 'document.pickle'), documents_wc)

    print("Saving indices to pickle...")
    indices = {
        'background': background,
        'doclengs': namekey_doclengs,
        'inverted_index': named_inverted_index
    }
    utils.save_to_pickle(os.path.join(out_dir, 'indices.pickle'), indices)

    print("Saving query to pickle...")
    old_query = utils.load_from_pickle(os.path.join(data_dir, 'query.pickle'))
    query = {}
    for query_lm, ans_dict, q_idx in old_query:
        query_wc = {}
        length = len(query_lm)
        for word_idx, word_prob in query_lm.items():
            count = round(word_prob * length)
            # Each query word is expected to occur exactly once in the
            # original query; anything else signals corrupt input.
            assert count == 1, query_lm
            query_wc[word_idx] = count

        query[q_idx] = {
            'answer': ans_dict,
            'wordcount': query_wc,
            'languagemodel': query_lm,
        }

    utils.save_to_pickle(query_pickle, query)
Пример #5
0
    return query


if __name__ == "__main__":
    # CLI entry point: build the query/answer pickle from a lexicon,
    # a query file, and an answer file.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-l', "--lex_pickle", type=str,
        default='./iscr/searchengine/data/PTV_onebest_fromMATBN_charSeg/lex.pickle')
    parser.add_argument(
        '-q', '--query_file', type=str,
        default='./data/query/PTV.utf8.jieba.query')
    parser.add_argument(
        '-a', '--answer_file', type=str,
        default='./data/query/PTV.ans')
    parser.add_argument(
        '-o', '--out_pickle', type=str,
        default='./queries/dnn.query.pickle')

    args = parser.parse_args()

    lex_dict = load_from_pickle(args.lex_pickle)
    query_answer = build_query_answer(lex_dict, args.query_file,
                                      args.answer_file, args.out_pickle)