示例#1
0
def run(args):
    """Rank documents for every query and write the rankings as CSV.

    Reads these attributes from ``args``: ``build``, ``best``, ``rocchio``,
    ``load_model``, ``query_file`` and ``ranked_list``.

    Model selection:
      * if ``args.build`` is truthy, the model returned by ``build(args)``
        is used directly;
      * otherwise the model is loaded from ``args.load_model``, falling back
        to ``build(args)`` when loading raises.

    Side effects: sets ``args.rocchio = True`` when ``args.best`` is truthy,
    prints progress messages to stdout, and (over)writes ``args.ranked_list``
    with a ``query_id,retrieved_docs`` header followed by one line per query
    holding the query id and at most the first 100 ranked document ids,
    separated by spaces.
    """
    # Keep the explicitly requested build; previously it was always
    # overwritten by the load attempt below, wasting the build.
    model = build(args) if args.build else None

    if args.best:
        args.rocchio = True

    if model is None:
        try:
            print('loading model from %s' % args.load_model)
            model = VSM(model_path=args.load_model)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit still abort the program
            # instead of triggering a rebuild.
            print('failed to load model, build from raw.')
            model = build(args)

    query_list, query_id = process_query(args.query_file)

    # The context manager guarantees the file is flushed and closed even if
    # ranking one of the queries raises.
    with open(args.ranked_list, 'w+') as output_file:
        print('query_id,retrieved_docs', file=output_file)
        for i, query in enumerate(query_list):
            doc_id, _ = model.get_ranking(query, args.rocchio)
            print('%s,%s' % (query_id[i], ' '.join(doc_id[:100])),
                  file=output_file)
示例#2
0
def run():
    """Sweep model parameters and print the mean score of each setting.

    Loads the model from ``./model.pl``, the training queries from
    ``../queries/query-train.xml`` and the reference answers from the
    ``retrieved_docs`` column of ``../queries/ans_train.csv``.  For every
    combination produced by ``product_dict(params)`` (both defined outside
    this function) the model attributes are overwritten, ``_compute_idf()``
    is re-run, each query is ranked, and ``MAP`` is evaluated on the first
    100 returned ids against the matching answer row.  The average over all
    queries is printed together with the parameter values.

    NOTE(review): ``MAP`` is presumably mean average precision -- confirm
    against its definition.
    """
    import pandas as pd

    model = VSM(model_path='./model.pl')
    query_list, _query_id = process_query('../queries/query-train.xml')

    answer_rows = pd.read_csv('../queries/ans_train.csv')[['retrieved_docs']].values
    # Each row holds a single space-separated string of document ids.
    answer = [row[0].strip().split(' ') for row in answer_rows]

    # (model attribute, key in the parameter dict), applied in this order.
    attribute_keys = (
        ('k1', 'k1'),
        ('b', 'b'),
        ('k3', 'k3'),
        ('tf_type', 'tf'),
        ('idf_type', 'idf'),
        ('doc_len_norm', 'norm'),
    )

    for p in product_dict(params):
        for attribute, key in attribute_keys:
            setattr(model, attribute, p[key])
        # Re-run the IDF computation after the attributes above changed.
        model._compute_idf()

        per_query = []
        for index, query in enumerate(query_list):
            ranked_ids, _ = model.get_ranking(query, p['rocchio'], p['n'], p['k'])
            per_query.append(MAP(ranked_ids[:100], answer[index]))
        mean_score = sum(per_query) / len(per_query)

        report_values = (p['k1'], p['b'], p['k3'], p['idf'], p['rocchio'],
                         p['n'], p['k'], mean_score)
        print('k1=%.2f,b=%.2f,k3=%d,idf_type=%s,rocchio=%s,n=%d,k=%d,score=%.5f' % report_values)