def print_statistics(self, methods):
    """Print collection statistics and per-method derived parameters.

    Only queries of length 1 (``Query.get_queries_of_length(1)``) are
    considered.  The method prints, in order:

    1. the value returned by ``CollectionStats.get_avdl()``;
    2. the mean, over those queries, of
       ``get_term_collection_occur(title) / get_total_terms()``;
    3. for every entry returned by
       ``Performances.gen_optimal_performances_queries``: its label
       (``entry[0]``) and, when the label contains ``'okapi'`` and/or
       ``'pivoted'``, the parameter value together with the derived
       ``beta`` and ``c2`` quantities.

    Args:
        methods: forwarded unchanged to
            ``gen_optimal_performances_queries``.

    Returns:
        None.  All results are written to stdout.
    """
    single_queries = Query(self.collection_path).get_queries_of_length(1)
    queries = {q['num']: q['title'] for q in single_queries}
    cs = CollectionStats(self.collection_path)
    performance = Performances(self.collection_path)
    res = performance.gen_optimal_performances_queries(methods, queries.keys())

    avdl = cs.get_avdl()
    total_terms = cs.get_total_terms()
    # ``* 1.0`` keeps this a true division even if both counts are ints
    # (this file uses Python 2 print statements elsewhere).
    collection_freq = [
        cs.get_term_collection_occur(title) * 1.0 / total_terms
        for title in queries.values()
    ]

    # Single-argument print(...) gives the same output whether it is parsed
    # as a Python 2 print statement or called as a function.
    print(avdl)
    print(np.mean(collection_freq))

    for entry in res:
        label = entry[0]
        # entry[2] is split on ':' and its second field parsed as the
        # tuned parameter -- presumably a "name:value" string; confirm
        # against gen_optimal_performances_queries.
        para = float(entry[2].split(':')[1])
        print(label)
        # Two independent checks: a label containing both substrings
        # prints both lines.
        if 'okapi' in label:
            print('b: %s beta: %s c2: %s'
                  % (para, 1.2 * para / avdl, 1.2 * (1 - para)))
        if 'pivoted' in label:
            print('s: %s beta: %s c2: %s' % (para, para / avdl, 1 - para))
def print_best_performances(self, methods=None):
    """Print the optimal performances of ``methods`` on length-1 queries.

    Args:
        methods: forwarded to
            ``Performances.gen_optimal_performances_queries``.  ``None``
            (the default) is replaced by a fresh empty list on every call,
            so no list object is shared between calls or exposed to
            mutation by the callee.

    Returns:
        None.  The result of ``gen_optimal_performances_queries`` is
        printed to stdout as-is.
    """
    if methods is None:
        methods = []
    single_queries = Query(self.collection_path).get_queries_of_length(1)
    # Keyed by query number; only the keys are passed on below.
    queries = {q['num']: q['title'] for q in single_queries}
    performance = Performances(self.collection_path)
    res = performance.gen_optimal_performances_queries(methods, queries.keys())
    # Single-argument print(...) behaves the same as a Python 2 print
    # statement and as the Python 3 function.
    print(res)
def gen_output_performances_batch():
    """Collect performance-output jobs for every collection and submit them.

    Reads ``collections.json`` from the current working directory.  For
    each entry it takes the ``'collection'`` name, makes sure the directory
    ``<output_root>/<name>`` exists, builds the index path
    ``<index_root>/lucene-index.<name>.cnt.1`` and gathers whatever
    ``Performances(index_path).gen_output_performances_paras`` returns for
    that output directory.  The combined list is then handed to
    ``gen_batch_framework`` together with ``output_performances_atom``.
    """
    batch_paras = []
    with open('collections.json') as collections_file:
        for entry in json.load(collections_file):
            name = entry['collection']
            out_dir = os.path.join(output_root, name)
            if not os.path.exists(out_dir):
                os.makedirs(out_dir)
            index_name = ''.join(('lucene-index.', name, '.cnt.1'))
            index_path = os.path.join(index_root, index_name)
            perf = Performances(index_path)
            batch_paras.extend(perf.gen_output_performances_paras(out_dir))
    gen_batch_framework(batch_paras, output_performances_atom)
def print_optimal_performances(metrics=None):
    """Print the optimal performance report for every collection.

    Reads ``collections.json`` from the current working directory.  For
    each entry it prints a blank line, the collection name and a 30-char
    ``=`` rule, then delegates to
    ``Performances(index_path).print_optimal_performance`` with the
    collection's output directory (``<output_root>/<name>``) and
    ``metrics``.

    Args:
        metrics: forwarded to ``print_optimal_performance``.  ``None``
            (the default) is replaced by a fresh ``['map']`` on every
            call, so the default list is never shared between calls.

    Returns:
        None.  Everything is written to stdout.
    """
    if metrics is None:
        metrics = ['map']

    # The file is only needed for parsing; close it before the reporting
    # loop runs.
    with open('collections.json') as cf:
        collections = json.load(cf)

    for c in collections:
        collection_name = c['collection']
        this_output_root = os.path.join(output_root, collection_name)
        index_path = os.path.join(
            index_root, 'lucene-index.' + collection_name + '.cnt.1')
        # Single-argument print(...) gives the same output under the
        # Python 2 print statement and the Python 3 function; print('')
        # emits just a newline.
        print('')
        print(collection_name)
        print('=' * 30)
        Performances(index_path).print_optimal_performance(
            this_output_root, metrics)
def output_performances_atom(para):
    """Run one performance-output job described by the sequence ``para``.

    ``para`` is read positionally: ``para[0]`` is the index path used to
    build the ``Performances`` object, ``para[1]`` is the output file name
    and ``para[2:]`` are the input file names, both passed to
    ``Performances.output_performances``.
    """
    index_path, output_fn, input_fns = para[0], para[1], para[2:]
    runner = Performances(index_path)
    runner.output_performances(output_fn, input_fns)