def __save_new_dictionary(self, corpus, predictor):
    """Pickle `self.dict_all_options_stochastic` under the predictor's pkl_files dir.

    NOTE(review): the `corpus` parameter is unused here — the target path is
    derived from `self.res_dir`; confirm callers expect that.
    """
    pkl_dir = dp.ensure_dir(f'{self.res_dir}/test/pageRank/pkl_files/{predictor}')
    out_file = f'{pkl_dir}/dict_all_options_stochastic.pkl'
    with open(out_file, 'wb') as fp:
        pickle.dump(self.dict_all_options_stochastic, fp, protocol=pickle.HIGHEST_PROTOCOL)
def main(args):
    """CLI entry point: generate PageRank query features, or load precomputed ones.

    `args` is an argparse namespace with `corpus`, `generate` and `load` attributes
    (other flags are currently commented out below).
    """
    corpus = args.corpus
    generate = args.generate
    # predict = args.predict
    # queries_group = args.group
    file_to_load = args.load
    # quantile = args.quantile

    # # Debugging
    # print('------------!!!!!!!---------- Debugging Mode ------------!!!!!!!----------')
    # testing_feat = QueryFeatureFactory('ROBUST')
    # norm_features_df = testing_feat.generate_features()
    # # norm_features_df.reset_index().to_json('query_features_{}_uqv.JSON'.format(corpus))

    # Leave one core free for the rest of the system.
    cores = mp.cpu_count() - 1
    if generate:
        # FIXME: test and fix the features creation to run in parallel.
        # NOTE(review): `norm_features_list` is never used — the sequential
        # QueryFeatureFactory call below recomputes the features and its
        # result is what gets written; confirm the parallel branch is still wanted.
        with mp.Pool(processes=cores) as pool:
            norm_features_list = pool.starmap(run_features_process,
                                              itertools.product({'ROBUST', 'ClueWeb12B'}, NUMBER_OF_DOCS))
        testing_feat = QueryFeatureFactory(corpus)
        norm_features_df = testing_feat.generate_features()
        # set_environment_paths()[0] is the results root (see __set_paths usage elsewhere in this file).
        _path = f'{dp.set_environment_paths()[0]}/{corpus}/test/pageRank'
        _path = dp.ensure_dir(_path)
        norm_features_df.reset_index().to_json(f'{_path}/PageRank_Features.JSON')
    elif file_to_load:
        # Load a specific, previously generated features file.
        features_df = features_loader(corpus, file_to_load)
        print(features_df)
    else:
        # Load the default features file for the corpus.
        features_df = features_loader(corpus)
        print(features_df)
def __set_paths(cls, corpus, predictor):
    """Set the class-level default file and directory paths for `corpus`/`predictor`.

    Assumes the standard naming convention of the project.
    """
    cls.predictor = predictor
    results_root, _ = dp.set_environment_paths()
    cls.res_dir = f'{results_root}/{corpus}'
    base_dir = f'{cls.res_dir}/uqvPredictions/'
    cls.vars_results_dir = dp.ensure_dir(f'{base_dir}/raw/{cls.predictor}/predictions/')
    cls.output_dir = dp.ensure_dir(f'{base_dir}/referenceLists/pageRank/')
    test_dir = f'{cls.res_dir}/test'
    # 2-fold / 30-repetition CV split file must already exist.
    cls.folds = dp.ensure_file(f'{test_dir}/2_folds_30_repetitions.json')
    # cls.ap_file = dp.ensure_file(f'{test_dir}/pageRank/QLmap1000')
    cls.features = dp.ensure_file(f'{test_dir}/pageRank/{corpus}_raw_PageRank_Features.pkl')
def _write_results(self, res_df: pd.Series, sim_func, pred_score, lambda_param):
    """Write a predictions series as a space-separated file under the output tree."""
    # Map to the canonical similarity name; fall back to the given name.
    sim_func = SIMILARITY_DICT.get(sim_func, sim_func)
    target_dir = dp.ensure_dir(f'{self.output_dir}/raw/{sim_func}/{self.predictor}/predictions/')
    out_path = f'{target_dir}/predictions-{pred_score}+lambda+{lambda_param}'
    res_df.to_csv(path_or_buf=out_path, index=True, sep=' ', float_format='%f', header=False)
def _calc_features(self):
    """Compute pairwise query-variant graph features for every topic.

    For each directed edge (src, dest) between query variants of a topic:
      - 'jac': Jaccard coefficient of the two query texts
      - 'Top_<k>_Docs_overlap': overlap of the two top-k retrieved document lists
      - 'RBO_EXT_<n>': extrapolated RBO similarity of the two result lists
      - 'RBO_FUSED_EXT_<n>': RBO similarity of the *dest* result list to the
        topic's fused result list
    Returns a DataFrame indexed by (topic, src, dest), which is also pickled
    into the corpus test directory.

    Fixes vs. the previous revision:
      - `_save_to_dict` appended to the key 'RBO_<n>' which was never
        initialized (the dict holds 'RBO_EXT_<n>'), so the first pair raised
        KeyError; the initialized key is now used.
      - The reversed edge (q2 -> q1) stored q2's fused-RBO score; per the
        edge definition above it now stores the destination's (q1's) score
        (previously `_q1_rbo_fused_ext_score` was computed but never used).
    """
    overlap_key = f'Top_{self.top_docs_overlap}_Docs_overlap'
    rbo_key = f'RBO_EXT_{self.rbo_top}'
    fused_key = f'RBO_FUSED_EXT_{self.rbo_top}'
    _dict = {'topic': [], 'src': [], 'dest': [], 'jac': [], overlap_key: [],
             rbo_key: [], fused_key: []}

    def _save_edge(src, dest, jc, docs_overlap, rbo_ext_score, fused_score):
        # Append one row for the directed edge (src -> dest).
        _dict['src'].append(src)
        _dict['dest'].append(dest)
        _dict['jac'].append(jc)
        _dict[overlap_key].append(docs_overlap)
        _dict[rbo_key].append(rbo_ext_score)
        # The RBO-F feature of edge (src, dest) is the RBO similarity of the
        # dest result list to the topic's fused list.
        _dict[fused_key].append(fused_score)

    for topic, pairs in self.features_index.items():
        # `pairs` holds combinations with replacement: n(n+1)/2 entries for n
        # variants. Self-pairs contribute one row, distinct pairs two, so the
        # total row count is 2 * len(pairs) - n (== n ** 2).
        _dict['topic'] += [topic] * (2 * len(pairs) - len(self.query_vars[topic]))
        fused_res_dict = self.fused_data.get_res_dict_by_qid(topic, top=100)
        for q1, q2 in pairs:
            txt1 = self.queries_data.get_qid_txt(q1)
            txt2 = self.queries_data.get_qid_txt(q2)
            jc = jaccard_coefficient(txt1, txt2)
            l1 = self.raw_res_data.get_docs_by_qid(q1, self.top_docs_overlap)
            l2 = self.raw_res_data.get_docs_by_qid(q2, self.top_docs_overlap)
            docs_overlap = list_overlap(l1, l2)
            # All RBO values are rounded to 10 decimal digits, to avoid float overflow
            q1_results_dict = self.raw_res_data.get_res_dict_by_qid(q1, top=self.rbo_top)
            q2_results_dict = self.raw_res_data.get_res_dict_by_qid(q2, top=self.rbo_top)
            _rbo_scores_dict = rbo_dict(q1_results_dict, q2_results_dict, p=0.95)
            rbo_ext_score = np.around(_rbo_scores_dict['ext'], 10)
            _q1_fused_rbo_scores_dict = rbo_dict(fused_res_dict, q1_results_dict, p=0.95)
            _q1_rbo_fused_ext_score = np.around(_q1_fused_rbo_scores_dict['ext'], 10)
            _q2_fused_rbo_scores_dict = rbo_dict(fused_res_dict, q2_results_dict, p=0.95)
            _q2_rbo_fused_ext_score = np.around(_q2_fused_rbo_scores_dict['ext'], 10)
            _save_edge(q1, q2, jc, docs_overlap, rbo_ext_score, _q2_rbo_fused_ext_score)
            if q1 != q2:
                # Reversed edge: dest is q1, so it takes q1's fused score.
                _save_edge(q2, q1, jc, docs_overlap, rbo_ext_score, _q1_rbo_fused_ext_score)

    _df = pd.DataFrame.from_dict(_dict)
    _df.sort_values(['topic', 'src', 'dest'], inplace=True)
    _df.set_index(['topic', 'src', 'dest'], inplace=True)
    _test_dir = dp.ensure_dir(f'{self.res_dir}/test/pageRank/')
    _df.to_pickle(f'{_test_dir}/{self.corpus}_raw_PageRank_Features.pkl')
    return _df
def main(args):
    """Convert raw UQV reference-list predictions into title-query predictions.

    For every (similarity, predictor) combination, read the raw prediction
    files, keep only the title query variants, map variant ids back to topic
    qids and write the result into the parallel 'title' directory.
    """
    corpus = args.corpus
    results_dir = dp.ensure_dir(f'~/QppUqvProj/Results/{corpus}/uqvPredictions/referenceLists/pageRank/raw')
    title_queries_file = dp.ensure_file(f'~/QppUqvProj/data/{corpus}/queries_{corpus}_title.txt')
    title_queries_obj = dp.QueriesTextParser(title_queries_file)
    full_queries_file = dp.ensure_file(f'~/QppUqvProj/data/{corpus}/queries_{corpus}_UQV_full.txt')
    full_queries_obj = dp.QueriesTextParser(full_queries_file, 'uqv')
    for sim in SIMILARITY_FUNCTIONS:
        for predictor in PREDICTORS:
            pred_dir = f'{results_dir}/{sim}/{predictor}/predictions'
            full_df = read_into_df(pred_dir)
            # Keep only the variants that correspond to title queries.
            vid_df = filter_title_queries(full_df, full_queries_obj, title_queries_obj)
            qid_df = convert_vid_to_qid(vid_df)
            write_basic_predictions(qid_df, pred_dir.replace('raw', 'title'), sim)
def __set_paths(cls, corpus):
    """Set the class-level default file and directory paths for `corpus`.

    Assumes the standard naming convention of the project.
    """
    # cls.predictor = predictor
    results_root, data_root = dp.set_environment_paths()
    cls.res_dir = f'{results_root}/{corpus}'
    cls.dat_dir = f'{data_root}/{corpus}'
    # Raw (all-variants) QL results.
    cls.results_file = os.path.normpath(f'{cls.res_dir}/test/raw/QL.res')
    dp.ensure_file(cls.results_file)
    # Title-queries-only QL results.
    cls.title_res_file = os.path.normpath(f'{cls.res_dir}/test/basic/QL.res')
    dp.ensure_file(cls.title_res_file)
    cls.queries_full_file = dp.ensure_file(f'{cls.dat_dir}/queries_{corpus}_UQV_full.stemmed.txt')
    cls.fused_results_file = dp.ensure_file(f'{cls.res_dir}/test/fusion/QL.res')
    cls.output_dir = dp.ensure_dir(f'{cls.res_dir}/test/raw/')
def write_basic_predictions(df: pd.DataFrame, output_dir, similarity: str, qgroup='title') -> None:
    """Save results in basic predictions format of a given queries set, one
    file per column, each row being 'qid score'.

    NOTE(review): `similarity` and `qgroup` are currently unused in this body —
    TODO confirm whether callers expect them to influence the output path.
    """
    target_dir = dp.ensure_dir(output_dir)
    for column in df.columns:
        out_file = f'{target_dir}/{column}'
        df[column].to_csv(out_file, sep=' ', header=False, index=True, float_format='%f')