import copy
import json

import numpy as np

# Project classes (Query, CollectionStats, Performances, GenSqaDocDetails,
# Judgment) are assumed to be imported at module level elsewhere in this repo.


def print_statistics(self, methods):
    """Print collection statistics and, for each method's optimal parameter,
    the derived length-normalization coefficients (beta, c2)."""
    single_queries = Query(self.collection_path).get_queries_of_length(1)
    queries = {ele['num']: ele['title'] for ele in single_queries}
    cs = CollectionStats(self.collection_path)
    performance = Performances(self.collection_path)
    res = performance.gen_optimal_performances_queries(methods, queries.keys())
    avdl = cs.get_avdl()
    total_terms = cs.get_total_terms()
    # Relative collection frequency of each single-term query's term.
    collection_freq = []
    for qid in queries:
        ctf = cs.get_term_collection_occur(queries[qid])
        collection_freq.append(ctf * 1.0 / total_terms)
    print avdl
    print np.mean(collection_freq)
    # Each entry of res is (method label, optimal performance, 'name:value').
    for label, _performance, para_str in res:
        para = float(para_str.split(':')[1])
        print label
        if 'okapi' in label:
            print 'b:', para, 'beta:', 1.2 * para / avdl, 'c2:', 1.2 * (1 - para)
        if 'pivoted' in label:
            print 's:', para, 'beta:', para / avdl, 'c2:', 1 - para
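
# The beta/c2 values printed above rewrite each model's length normalization
# as a linear function of document length, c2 + beta * doc_len (my reading of
# the printed formulas, not stated explicitly in the source). With k1 = 1.2,
# the BM25 TF denominator is
#     tf + k1 * (1 - b + b * doc_len / avdl)
#       = tf + 1.2 * (1 - b) + (1.2 * b / avdl) * doc_len
#       = tf + c2 + beta * doc_len,
# and the pivoted normalizer is
#     1 - s + s * doc_len / avdl = (1 - s) + (s / avdl) * doc_len
#       = c2 + beta * doc_len.
# A hypothetical helper (not in the original code) that makes the mapping
# explicit:
def _norm_coefficients(model, para, avdl, k1=1.2):
    """Return (beta, c2) so the model's length normalizer is c2 + beta * doc_len."""
    if model == 'okapi':
        return k1 * para / avdl, k1 * (1 - para)  # para is BM25's b
    if model == 'pivoted':
        return para / avdl, 1 - para              # para is pivoted's s
    raise ValueError('unknown model: %s' % model)
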
def gen_ranking_list(self, method, _callback, paras):
    """
    Read document statistics from <collection_path>/detailed_doc_stats/,
    which stores everything needed for the top 10,000 documents retrieved
    for each query by the Dirichlet language-model method; re-score the
    top 1,000 of them per query with ``_callback``.
    """
    single_queries = Query(self.collection_path).get_queries_of_length(1)
    queries = {ele['num']: ele['title'] for ele in single_queries}
    doc_details = GenSqaDocDetails(self.collection_path)
    cs = CollectionStats(self.collection_path)
    avdl = cs.get_avdl()
    total_terms = cs.get_total_terms()
    res = {}
    for qid in queries:
        print queries[qid]
        res[qid] = []
        ctf = cs.get_term_collection_occur(queries[qid])
        idf = cs.get_term_logidf1(queries[qid])
        for idx, row in enumerate(doc_details.get_qid_details(qid)):
            if idx >= 1000:  # only re-score the top 1,000 documents per query
                break
            docid = row['docid']
            total_tf = float(row['total_tf'])
            doc_len = float(row['doc_len'])
            # Arguments for the scoring callback: the caller's parameters
            # followed by the term/document/collection statistics.
            localpara = copy.deepcopy(paras)
            localpara.extend([total_tf, doc_len, avdl, ctf, total_terms, idf])
            res[qid].append((docid, _callback(localpara)))
    self.output_results(res, method)
    self.eval(method)
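
# Hypothetical example callback (not part of the original code), assuming
# ``paras`` carries a single pivoted-normalization slope s, so that
# localpara unpacks as [s, total_tf, doc_len, avdl, ctf, total_terms, idf]:
def _pivoted_tf_idf(localpara):
    s, total_tf, doc_len, avdl, ctf, total_terms, idf = localpara
    # Pivoted document-length normalization of the raw term frequency,
    # weighted by the term's log IDF.
    return total_tf / (1.0 - s + s * doc_len / avdl) * idf

# Example use: self.gen_ranking_list('pivoted_s0.2', _pivoted_tf_idf, [0.2])
# ('pivoted_s0.2' is an illustrative method label, not one from the source).
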
def process(self, qid, method_name, method_paras, output_fn):
    """Learn the optimal parameter for one single-term query and write
    {'map', 'para', 'eta'} to output_fn as JSON."""
    cs = CollectionStats(self.collection_path)
    single_queries = Query(self.collection_path).get_queries_of_length(1)
    queries = {ele['num']: ele['title'] for ele in single_queries}
    self.rel_docs = Judgment(self.collection_path).get_relevant_docs_of_some_queries([qid], 1, 'dict')
    avdl = cs.get_avdl()
    total_terms = cs.get_total_terms()
    data = {True: [], False: []}  # True: relevant docs, False: non-relevant docs
    ctf = cs.get_term_collection_occur(queries[qid])
    collection_para = {
        'avdl': avdl,
        'total_terms': total_terms,
        'ctf': ctf
    }
    for row in cs.get_qid_details(qid):
        docid = row['docid']
        total_tf = float(row['total_tf'])
        doc_len = float(row['doc_len'])
        rel = int(row['rel_score']) >= 1
        data[rel].append({
            'docid': docid,
            'tf': total_tf,
            'ln': doc_len
        })
    # Parse a parameter string such as 'eta:0.1' into {'eta': '0.1'}.
    method_para_dict = {ele.split(':')[0]: ele.split(':')[1]
                        for ele in method_paras.split(',')}
    max_map, max_para = self.learn(qid, data, collection_para,
                                   method_name, method_para_dict)
    with open(output_fn, 'w') as f:
        json.dump({'map': max_map, 'para': max_para,
                   'eta': method_para_dict['eta']}, f, indent=2)
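
# Hypothetical usage sketch (the qid, parameter string, and output path are
# illustrative, not from the source): learn the best parameter for one
# single-term query and dump the result as JSON.
#
#   self.process('301', 'okapi', 'eta:0.1', 'output/301-okapi.json')
#
# The output file then holds {"map": ..., "para": ..., "eta": "0.1"}.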