def __init__(self, corpus, queries_group, vars_quantile, **kwargs):
    """Load the query/result resources used by the reference comparisons.

    :param corpus: corpus identifier (e.g. 'ROBUST').
    :param queries_group: group of queries being predicted ('title' or a variants group).
    :param vars_quantile: quantile of the variations to load for the filtering method.
    :param kwargs: optional 'top_docs_overlap' (default 10), 'rbo_top' (default 100),
        and 'graphs'; when 'graphs' is given, 'n' (number of vars) is mandatory.
    """
    self.top_docs_overlap = kwargs.get('top_docs_overlap', 10)
    self.rbo_top = kwargs.get('rbo_top', 100)
    self.corpus = corpus
    self.queries_group = queries_group
    graphs = kwargs.get('graphs', None)
    if graphs:
        number_of_vars = kwargs.get('n', None)
        assert number_of_vars, 'Missing number of vars'
        self.__set_graph_paths(corpus, queries_group, graphs, number_of_vars)
    else:
        self.__set_paths(corpus, queries_group, vars_quantile)
    raw_results = dp.ResultsReader(self.results_file, 'trec')
    # Title queries have a dedicated results file; any other group predicts
    # directly on the raw (UQV) results.
    if queries_group == 'title':
        self.prediction_queries_res_data = dp.ResultsReader(self.title_res_file, 'trec')
    else:
        self.prediction_queries_res_data = raw_results
    self.queries_data = dp.QueriesTextParser(self.queries_full_file, 'uqv')
    self.topics_data = dp.QueriesTextParser(self.queries_topic_file)
    # These 2 DF used for the filtering method
    self.variations_data = dp.QueriesTextParser(self.queries_variations_file, 'uqv')
    self.quantile_variations_data = dp.QueriesTextParser(self.queries_quantile_vars, 'uqv')
    self.raw_res_data = raw_results
    self.fused_data = dp.ResultsReader(self.fused_results_file, 'trec')
    self.query_vars = self.queries_data.query_vars
def main(args):
    """Run the SMV predictor over the parsed result/score files.

    Computes predictions either for the single document count given in
    ``args.docs`` or, when absent, for every value in ``NUMBER_OF_DOCS``.
    """
    queries_obj = dp.QueriesXMLParser(args.queries)
    results_obj = dp.ResultsReader(args.results, 'trec')
    corpus_scores_obj = dp.ResultsReader(args.corpus_scores, 'predictions')
    predictor = SMV(queries_obj, results_obj, corpus_scores_obj)
    if args.docs:
        predictor.calc_results(args.docs)
    else:
        for n in NUMBER_OF_DOCS:
            predictor.calc_results(n)
def main(args):
    """Split the TREC results into prediction-query rows and variant rows.

    NOTE(review): the two resulting frames are neither returned nor written
    here — presumably downstream code was meant to consume them; confirm.
    """
    res_df = dp.ResultsReader(args.results, 'trec').data_df
    q2p = queries_to_predict(args.full_queries_file, args.queries_to_predict)
    pred_res_df, vars_res_df = split_prediction_queries(res_df, q2p)
def __init__(self, predictor, corpus, qgroup, vars_quantile, **kwargs):
    """Assemble the per-variant prediction scores, features and AP frames.

    :param predictor: name of the QPP predictor whose results are loaded.
    :param corpus: corpus identifier.
    :param qgroup: group of queries to predict ('title' or a variants group).
    :param vars_quantile: quantile of variations used for path resolution.
    :param kwargs: optional 'graphs'; when given, 'n' (number of vars) is mandatory.
    """
    graphs = kwargs.get('graphs', None)
    if graphs:
        num_vars = kwargs.get('n', None)
        assert num_vars, 'Missing number of vars'
        self.__set_graph_paths(corpus, predictor, qgroup, graphs, num_vars)
    else:
        self.__set_paths(corpus, predictor, qgroup, vars_quantile)
    q2p_obj = dp.QueriesTextParser(self.queries2predict_file, 'uqv')
    self.var_cv = InterTopicCrossValidation(
        folds_map_file=self.folds, predictions_dir=self.vars_results_dir)
    vars_results_df = self.var_cv.full_set
    # Initialize the base prediction results of the queries to be predicted:
    # title queries come from their own CV run, other groups are mapped back
    # from variant ids to query ids.
    if qgroup == 'title':
        base_cv = InterTopicCrossValidation(
            folds_map_file=self.folds, predictions_dir=self.base_results_dir)
        self.base_results_df = base_cv.full_set
    else:
        self.base_results_df = dp.convert_vid_to_qid(
            vars_results_df.loc[q2p_obj.queries_dict.keys()])
    self.base_results_df.rename_axis('topic', inplace=True)
    self.query_vars = dp.QueriesTextParser(self.query_vars_file, 'uqv')
    quantile_vars = dp.QueriesTextParser(self.quantile_vars_file, 'uqv')
    features_df = features_loader(self.features, corpus)
    self.features_df = self.__initialize_features_df(quantile_vars, features_df)
    self.var_scores_df = self.__initialize_var_scores_df(
        features_df.reset_index()[['topic', 'qid']], vars_results_df)
    self.geo_mean_df = self.__initialize_geo_scores_df(
        features_df.reset_index()[['topic', 'qid']],
        dp.ResultsReader(self.geo_mean_file, 'predictions').data_df)
    self.real_ap_df = self.__initialize_var_scores_df(
        features_df.reset_index()[['topic', 'qid']],
        dp.ResultsReader(self.real_ap_file, 'ap').data_df)
    self.geo_as_predictor()
def __init__(self, qpp_ref: QueryPredictionRef, corr_measure='pearson'):
    """Set up the learning-to-rank workspace from a QueryPredictionRef.

    :param qpp_ref: fully initialized reference object providing the features,
        per-variant scores, AP file, CV folds and the output directory root.
    :param corr_measure: correlation measure name used for evaluation.
    """
    self.corr_measure = corr_measure
    self.features_df = qpp_ref.features_df
    self.results_df = qpp_ref.var_scores_df
    self.ap_obj = dp.ResultsReader(qpp_ref.ap_file, 'ap')
    self.folds_df = qpp_ref.var_cv.data_sets_map.transpose()
    predictor_name = qpp_ref.predictor
    self.output_dir = f'{qpp_ref.output_dir}/ltr/{predictor_name}/'
    dp.ensure_dir(self.output_dir)
    self.calc_features_df = qpp_ref.calc_integrated
    self.feature_names = self.features_df.columns.tolist()
    # Leave one core free for the rest of the system.
    self.cpu_cores = mp.cpu_count() - 1
def __init__(self, corpus):
    """Load the queries (indexed by qid, with topic column) and QL results.

    :param corpus: corpus identifier used by ``__set_paths`` to resolve files.
    """
    self.corpus = corpus
    self.__set_paths()
    self.queries_obj = dp.QueriesTextParser(self.queries_txt_file, kind='uqv')
    self.queries_obj.queries_df = dp.add_topic_to_qdf(
        self.queries_obj.queries_df).set_index('qid')
    self.features_df = self.initialize_features_df()
    self.ql_results_obj = dp.ResultsReader(self.ql_results_file, 'trec')
def _build_full_set(predictions_dir, ap_file=None):
    """Assuming the predictions files are named : predictions-[*]

    Reads every predictions file in *predictions_dir* into one wide frame
    (one ``score_<param>`` column per file), optionally appending the AP
    column from *ap_file*.
    """
    candidate_files = glob.glob(predictions_dir + "/*predictions*")
    if 'uef' in predictions_dir:
        # Excluding all the 5 and 10 docs predictions
        if 'qf' in predictions_dir:
            def _excluded(path):
                base = os.path.basename(path)
                return base.endswith('-5+', 11, 14) or base.endswith('-10+', 11, 15)
        else:
            def _excluded(path):
                base = os.path.basename(path)
                return base.endswith('-5') or base.endswith('-10')
        candidate_files = [fn for fn in candidate_files if not _excluded(fn)]
    frames = []
    for path in candidate_files:
        # The parameter value is the suffix after the last '-' in the path.
        suffix = path.split('-')[-1]
        df = dp.ResultsReader(path, 'predictions').data_df
        frames.append(df.rename(columns={"score": f'score_{suffix}'}))
    if ap_file:
        frames.append(dp.ResultsReader(ap_file, 'ap').data_df)
    full_set = pd.concat(frames, axis=1, sort=True)
    assert not full_set.empty, f'The Full set DF is empty, make sure that {predictions_dir} is not empty'
    return full_set
def __init__(self, corpus, max_n=20, corr_measure='pearson', load_from_pkl=True, queries_group='title'):
    """Prepare the queries, raw AP results and the basic-results cache.

    :param corpus: corpus identifier.
    :param max_n: upper bound on the number of variants per topic; capped by
        the smallest count actually available in the queries file.
    :param corr_measure: correlation measure name.
    :param load_from_pkl: whether cached pickled results may be reused.
    :param queries_group: group of queries being evaluated.
    """
    self.group = queries_group
    self.corr_measure = corr_measure
    self.load_from_pkl = load_from_pkl
    self.__set_paths(corpus, queries_group)
    self.corpus = corpus
    self.queries_obj = dp.QueriesTextParser(self.queries_file)
    self.queries_obj.queries_df = add_topic_to_qdf(self.queries_obj.queries_df)
    self.raw_ap_obj = dp.ResultsReader(self.raw_ap_file, 'ap')
    # Never ask for more variants than the largest topic actually has.
    max_vars_in_topic = self.queries_obj.queries_df.groupby('topic').count().max()['qid']
    self.max_n = min(max_vars_in_topic, max_n)
    self.basic_results_dict = defaultdict(float)
    self.__initialize_basic_results_dict()
def main(args):
    """Filter/analyze UQV query variants by their AP performance.

    Depending on the CLI flags this either filters the variants to a
    performance group / quantile, prints corpus statistics, plots variant
    AP, or prints the top per-topic differences.

    NOTE(review): the nesting of the stats/plot/print tail was reconstructed
    from mangled formatting; all three need ``apdb`` so they are kept inside
    the ``if ap_file`` branch — confirm against history.
    """
    queries_txt_file = args.queries
    queries_to_remove = args.remove
    ap_file = args.ap
    queries_group = args.group
    quant_variants = args.quant
    stats = args.stats
    plot_vars = args.plot_vars
    filter_functions_dict = {
        'top': filter_top_queries,
        'low': filter_low_queries,
        'medl': filter_medl_queries,
        'medh': filter_medh_queries,
    }
    quantiles_dict = {'low': [0, 0.5], 'high': [0.5, 1]}
    corpus = 'ROBUST' if 'ROBUST' in queries_txt_file else 'ClueWeb12B'
    if queries_txt_file:
        qdb = dt.QueriesTextParser(queries_txt_file, 'uqv')
        # NOTE(review): this topic-annotated frame is built but never used
        # below — presumably leftover; verify before removing.
        df = add_topic_to_qdf(qdb.queries_df)
        qdb.queries_df = remove_duplicates(qdb)
        if queries_to_remove:
            qdb_rm = dt.QueriesTextParser(queries_to_remove)
            qdb.queries_df = remove_q1_from_q2(qdb_rm.queries_df, qdb)
        if ap_file:
            apdb = dt.ResultsReader(ap_file, 'ap')
            if queries_group != 'title':
                qdb.queries_df = filter_functions_dict[queries_group](
                    qdb.queries_df, apdb)
            elif quant_variants:
                qdb.queries_df = filter_quant_variants(
                    qdb.queries_df, apdb, quantiles_dict[quant_variants])
            if stats:
                title_queries_file = dt.ensure_file(
                    f'~/QppUqvProj/data/{corpus}/queries_{corpus}_title.txt')
                title_queries_df = dt.QueriesTextParser(title_queries_file).queries_df
                title_ap_file = dt.ensure_file(
                    f'~/QppUqvProj/Results/{corpus}/test/basic/QLmap1000')
                title_ap = dt.ResultsReader(title_ap_file, 'ap')
                calc_statistics(qdb.queries_df, apdb, title_queries_df, title_ap,
                                filter_functions_dict, quantiles_dict, corpus)
                return
            elif plot_vars:
                title_queries_file = dt.ensure_file(
                    f'~/QppUqvProj/data/{corpus}/queries_{corpus}_title.txt')
                title_queries_df = dt.QueriesTextParser(title_queries_file).queries_df
                title_ap_file = dt.ensure_file(
                    f'~/QppUqvProj/Results/{corpus}/test/basic/QLmap1000')
                title_ap = dt.ResultsReader(title_ap_file, 'ap')
                plot_variants_ap(qdb.queries_df, apdb, title_queries_df,
                                 title_ap, corpus)
                return
            print_top_differences(qdb.queries_df, apdb, corpus)