def run(args):
    """Build or load a VSM model, rank documents for every query, and write
    the ranked lists as CSV to ``args.ranked_list``.

    Expected ``args`` attributes (argparse namespace): ``build``, ``best``,
    ``rocchio``, ``load_model``, ``query_file``, ``ranked_list``.
    """
    if args.build:
        model = build(args)
    if args.best:
        # --best implies Rocchio pseudo-relevance feedback.
        args.rocchio = True
    try:
        print('loading model from %s' % args.load_model)
        model = VSM(model_path=args.load_model)
    # Was a bare `except:` — narrowed to Exception so KeyboardInterrupt /
    # SystemExit still propagate; the build-from-raw fallback is preserved.
    except Exception:
        print('failed to load model, build from raw.')
        model = build(args)
    query_list, query_id = process_query(args.query_file)
    # `with` guarantees the file is flushed and closed (the original leaked
    # the handle); plain 'w' suffices since the file is only written.
    with open(args.ranked_list, 'w') as output_file:
        print('query_id,retrieved_docs', file=output_file)
        for i, query in enumerate(query_list):
            doc_id, doc_score = model.get_ranking(query, args.rocchio)
            # Keep only the top-100 retrieved documents per query.
            print('%s,%s' % (query_id[i], ' '.join(doc_id[:100])),
                  file=output_file)
def run():
    """Grid-search VSM hyper-parameters on the training queries and print
    the mean average precision (MAP) achieved by each configuration.

    NOTE(review): this function shares the name ``run`` with the CLI entry
    point above — if both live in one module the later definition shadows
    the earlier; confirm which one callers expect.
    """
    import pandas as pd

    model = VSM(model_path='./model.pl')
    queries, _query_ids = process_query('../queries/query-train.xml')
    raw_answers = pd.read_csv('../queries/ans_train.csv')[['retrieved_docs']].values
    ground_truth = [row[0].strip().split(' ') for row in raw_answers]

    for cfg in product_dict(params):
        # Push the candidate hyper-parameters into the model.
        model.k1 = cfg['k1']
        model.b = cfg['b']
        model.k3 = cfg['k3']
        model.tf_type = cfg['tf']
        model.idf_type = cfg['idf']
        model.doc_len_norm = cfg['norm']
        # idf depends on idf_type, so it must be recomputed per configuration.
        model._compute_idf()

        per_query = []
        for query, truth in zip(queries, ground_truth):
            ranked_ids, _scores = model.get_ranking(
                query, cfg['rocchio'], cfg['n'], cfg['k'])
            # MAP is evaluated over the top-100 retrieved documents.
            per_query.append(MAP(ranked_ids[:100], truth))
        mean_ap = sum(per_query) / len(per_query)

        model_str = 'k1=%.2f,b=%.2f,k3=%d,idf_type=%s,rocchio=%s,n=%d,k=%d,score=%.5f' % (
            cfg['k1'], cfg['b'], cfg['k3'], cfg['idf'],
            cfg['rocchio'], cfg['n'], cfg['k'], mean_ap)
        print(model_str)