def check_significance(corpus, predictor, alpha=0.05):
    """Compare the baseline (avg-aggregated) CV results of `predictor` against
    its LTR candidate results and test the difference for significance.

    :param corpus: corpus name used to locate the results directory tree
    :param predictor: QPP predictor whose results are compared
    :param alpha: significance level, forwarded to t_test
    :return: the result of t_test over the two correlation vectors
    """

    def _load_correlations(json_file):
        # Each JSON file maps a CV set id to its correlation result.
        with open(json_file) as json_data:
            data = json.load(json_data)
        return pd.DataFrame.from_dict(data, orient='index', columns=['correlation'], dtype=float)

    _base_dir = f'~/QppUqvProj/Results/{corpus}/uqvPredictions/aggregated/avg/'
    baseline_dir = dp.ensure_dir(f'{_base_dir}/{predictor}/evaluation/')
    baseline_file = dp.ensure_file(
        f'{baseline_dir}/simple_results_vector_for_2_folds_30_repetitions_avg.json'
    )
    baseline_sr = _load_correlations(baseline_file)
    candidate_dir = dp.ensure_dir(f'{_base_dir}/{predictor}/ltr/evaluation/')
    candidate_file = dp.ensure_file(
        f'{candidate_dir}/simple_results_vector_for_2_folds_30_repetitions_ltr.json'
    )
    candidate_sr = _load_correlations(candidate_file)
    # FIX: `baseline_sr.mean()[0]` relied on positional Series indexing with an
    # integer key, which is deprecated in modern pandas; select the column.
    print(f"baseline: {baseline_sr['correlation'].mean():.3f}")
    return t_test(baseline_sr, candidate_sr, alpha)
def run_svm_fine_tune(self):
    """Train and classify SVMrank models for every (C, train-set) pair.

    Leftover models/classifications from previous runs are removed first;
    outputs are written to the sibling 'models' and 'classifications' dirs.
    """
    svm_learn = '~/svmRank/svm_rank_learn'
    svm_classify = '~/svmRank/svm_rank_classify'
    models_dir = self.output_dir.replace('datasets', 'models')
    ensure_dir(models_dir)
    classification_dir = self.output_dir.replace('datasets', 'classifications')
    ensure_dir(classification_dir)
    # Clear artifacts of previous runs
    run(f'rm -rfv {models_dir}*', shell=True)
    run(f'rm -rfv {classification_dir}*', shell=True)
    train_sets = glob.glob(f'{self.output_dir}/train*')
    for c in C_list:
        for trainset in train_sets:
            testset = trainset.replace('train', 'test')
            # FIX: the original used trainset.strip('.dat'), which strips any
            # of the characters '.', 'd', 'a', 't' from BOTH ends of the name
            # (e.g. 'train_1_a.dat' -> 'rain_1_') instead of removing the
            # '.dat' extension. Remove the suffix explicitly.
            _base_name = trainset[:-len('.dat')] if trainset.endswith('.dat') else trainset
            _model_params = _base_name.split('_', 1)[-1]
            _model_path = f'{models_dir}model_{_model_params}_c_{c}'
            _cls_train_path = f'{classification_dir}train_{_model_params}_c_{c}.cls'
            _cls_test_path = f'{classification_dir}test_{_model_params}_c_{c}.cls'
            run('{0} -c {1} {2} {3}'.format(svm_learn, c, trainset, _model_path), shell=True)
            run('{0} {1} {2} {3}'.format(svm_classify, trainset, _model_path, _cls_train_path), shell=True)
            run('{0} {1} {2} {3}'.format(svm_classify, testset, _model_path, _cls_test_path), shell=True)
def write_basic_predictions(df: pd.DataFrame, corpus, qgroup, predictor):
    """The function is used to save results in basic predictions format of a given queries set

    Each column of df is written to its own predictions file; a column named
    'score_X' becomes the file 'predictions-X'.
    """
    # The target directory is loop-invariant: build and ensure it once
    # instead of once per column (the original re-ran this in the loop).
    _file_path = f'~/QppUqvProj/Results/{corpus}/basicPredictions/{qgroup}/{predictor}/predictions/'
    dp.ensure_dir(os.path.normpath(os.path.expanduser(_file_path)))
    for col in df.columns:
        _file_name = col.replace('score_', 'predictions-')
        file_name = f'{_file_path}{_file_name}'
        df[col].to_csv(file_name, sep=" ", header=False, index=True)
def _cp_result_file_to_dirs(self):
    """Copy raw prediction files into per-lambda destination directories.

    For every lambda value and every (similarity function, predictor) pair,
    the matching 'predictions-*+lambda+<lam>' files are copied from the raw
    results tree into the per-lambda results tree.

    :return: dict mapping (sim, pred, '<lam:.2f>') -> destination directory
    """
    destination_dirs = defaultdict(str)
    for lam in LAMBDA:
        for sim, pred in itertools.product(SIMILARITY_FUNCTIONS.values(), PREDICTORS):
            dest_dir = dp.ensure_dir(f'{self.results_dir}/{sim}/{pred}/lambda-{lam}/predictions')
            # Key uses the formatted lambda so look-ups are stable strings.
            destination_dirs[sim, pred, f'{lam:.2f}'] = dest_dir
            src_dir = dp.ensure_dir(f'{self.raw_res_base_dir}/{sim}/{pred}/predictions')
            # Only files produced for this specific lambda value are copied.
            prediction_files = glob(f'{src_dir}/predictions-*+lambda+{lam}')
            for _file in prediction_files:
                copy2(_file, dest_dir)
    return destination_dirs
def __set_paths(self):
    """This method sets the default paths of the files and the working directories, it assumes the standard naming convention of the project"""
    _corpus_res_dir = dp.ensure_dir(f'~/QppUqvProj/Results/{self.corpus}')
    _corpus_dat_dir = dp.ensure_dir(f'~/QppUqvProj/data/{self.corpus}')
    # Retrieval (QL) results file of the corpus
    self.ql_results_file = dp.ensure_file(
        f'{_corpus_res_dir}/test/raw/QL.res')
    # Stemmed UQV query-variations text file
    self.queries_txt_file = dp.ensure_file(
        f'{_corpus_dat_dir}/queries_{self.corpus}_UQV_full.stemmed.txt')
    # self.predictions_dir = dp.ensure_dir(f'{_corpus_res_dir}/uqvPredictions/raw/{self.predictor}')
    # Directory for cached pickle files
    self.pkl_dir = dp.ensure_dir(f'{_corpus_res_dir}/test/raw/pkl_files/')
def __set_paths(cls, corpus, predictor, qgroup, vars_quantile):
    """This method sets the default paths of the files and the working directories, it assumes the standard naming convention of the project"""
    cls.predictor = predictor
    _base_dir = f'~/QppUqvProj/Results/{corpus}/uqvPredictions/'
    # Raw predictions of all query variations for this predictor
    cls.vars_results_dir = dp.ensure_dir(
        f'{_base_dir}/raw/{predictor}/predictions/')
    if qgroup == 'title':
        # Basic (title-query) predictions are only defined for the title group
        _orig_dir = dp.ensure_dir(
            f'~/QppUqvProj/Results/{corpus}/basicPredictions/title')
        cls.base_results_dir = f'{_orig_dir}/{predictor}/predictions/'
    cls.output_dir = dp.ensure_dir(
        f'{_base_dir}/referenceLists/{qgroup}/{vars_quantile}_vars/')
    _test_dir = f'~/QppUqvProj/Results/{corpus}/test'
    # CV folds mapping and AP file for the cross validation process
    cls.folds = dp.ensure_file(f'{_test_dir}/2_folds_30_repetitions.json')
    cls.ap_file = dp.ensure_file(f'{_test_dir}/ref/QLmap1000-{qgroup}')
    # cls.features = '{}/raw/query_features_{}_uqv_legal.JSON'.format(_test_dir, corpus)
    # cls.features = f'{_test_dir}/ref/{qgroup}_query_features_{corpus}_uqv.JSON'
    cls.features = dp.ensure_file(
        f'{_test_dir}/ref/{qgroup}_query_{vars_quantile}_variations_features_{corpus}_uqv.JSON'
    )
    # Geometric-mean RM predictions (20000 docs)
    cls.geo_mean_file = dp.ensure_file(
        f'{_base_dir}/raw/geo/predictions/predictions-20000')
    # The variations file is used in the filter function - it consists of all the vars w/o the query at hand
    _query_vars = f'~/QppUqvProj/data/{corpus}/queries_{corpus}_UQV_wo_{qgroup}.txt'
    cls.query_vars_file = os.path.normpath(os.path.expanduser(_query_vars))
    dp.ensure_file(cls.query_vars_file)
    _queries2predict = f'~/QppUqvProj/data/{corpus}/queries_{corpus}_{qgroup}.txt'
    cls.queries2predict_file = dp.ensure_file(_queries2predict)
    if vars_quantile == 'all':
        # 'all' quantile means: use every variation, i.e. the same file
        cls.quantile_vars_file = cls.query_vars_file
    else:
        _quantile_vars = f'~/QppUqvProj/data/{corpus}/queries_{corpus}_UQV_{vars_quantile}_variants.txt'
        cls.quantile_vars_file = os.path.normpath(
            os.path.expanduser(_quantile_vars))
        dp.ensure_file(cls.quantile_vars_file)
    cls.real_ap_file = dp.ensure_file(
        f'~/QppUqvProj/Results/{corpus}/test/raw/QLmap1000')
    cls.geo_predictions_dir = dp.ensure_dir(
        f'{_base_dir}/referenceLists/{qgroup}/{vars_quantile}_vars/sim_as_pred/geo/predictions'
    )
def __init__(self, qpp_ref: QueryPredictionRef, corr_measure='pearson'):
    """Initialize the LTR pipeline state from a QueryPredictionRef object.

    :param qpp_ref: reference-list prediction object supplying features,
        scores, AP file, CV folds and output location
    :param corr_measure: correlation measure name used for evaluation
    """
    self.corr_measure = corr_measure
    _predictor = qpp_ref.predictor
    self.features_df = qpp_ref.features_df
    self.results_df = qpp_ref.var_scores_df
    _ap_file = qpp_ref.ap_file
    self.ap_obj = dp.ResultsReader(_ap_file, 'ap')
    # Transposed so CV set ids index the columns of the folds mapping
    self.folds_df = qpp_ref.var_cv.data_sets_map.transpose()
    self.output_dir = f'{qpp_ref.output_dir}/ltr/{_predictor}/'
    dp.ensure_dir(self.output_dir)
    self.calc_features_df = qpp_ref.calc_integrated
    self.feature_names = self.features_df.columns.tolist()
    # Leave one core free for the main process
    self.cpu_cores = mp.cpu_count() - 1
def main(args):
    """Compute geometric-mean RM predictions for a corpus's query variations
    and write them as prediction files.

    :param args: parsed CLI arguments; only args.corpus is used
    """
    corpus = args.corpus
    # corpus = 'ROBUST'
    if not corpus:
        return
    queries_file = dp.ensure_file(
        f'~/QppUqvProj/data/{corpus}/queries_{corpus}_UQV_full.txt')
    rm_probabilities_dir = dp.ensure_dir(
        f'~/QppUqvProj/Results/{corpus}/uqvPredictions/raw/RMprob')
    # queries_file = dp.ensure_file(f'~/QppUqvProj/data/{corpus}/queries.txt')
    # rm_probabilities_dir = dp.ensure_dir(f'~/QppUqvProj/Results/{corpus}/basicPredictions/title/RMprob')
    queries_obj = dp.QueriesTextParser(queries_file)
    rm_probabilities_df = dp.read_rm_prob_files(rm_probabilities_dir,
                                                number_of_docs=20000,
                                                clipping='*')
    # FIX: 'True if ... else False' was redundant - the membership test is
    # already a bool. Detect UQV input from the queries file name.
    uqv = 'uqv' in queries_file.split('/')[-1].lower()
    results_df = geo_mean(queries_obj, rm_probabilities_df)
    write_predictions(results_df, corpus, uqv)
def write_predictions(df, corpus, uqv):
    """Persist every column of df as its own predictions file.

    :param df: DataFrame of prediction scores, one column per setting
    :param corpus: corpus name used to locate the output tree
    :param uqv: True -> write under uqvPredictions, else basicPredictions
    """
    if uqv:
        target_dir = dp.ensure_dir(
            f'~/QppUqvProj/Results/{corpus}/uqvPredictions/raw/geo/predictions'
        )
    else:
        target_dir = dp.ensure_dir(
            f'~/QppUqvProj/Results/{corpus}/basicPredictions/title/geo/predictions'
        )
    for column in df:
        df[column].to_csv(f'{target_dir}/predictions-{column}',
                          sep=" ",
                          header=False,
                          index=True,
                          float_format='%f')
def _calc_general_model_result(self, direct, predictor, sim_func):
    """Collect CV results for one (direction, predictor, sim function) triple.

    Row 0 holds the single-query (0 variations) baseline; rows 1..max_n hold
    the cross-validated results with n query variations.

    :return: long-format pd.DataFrame with columns
        direction / predictor / sim_func / n_vars / result
    """
    print(f'\n---Generating {predictor}-{sim_func} {direct} results---\n')
    _dict = defaultdict(list)

    def append_to_full_results_dict(_mean, _n):
        # One row of the long-format results table
        _dict['direction'].append(direct)
        _dict['predictor'].append(predictor)
        _dict['sim_func'].append(sim_func)
        _dict['n_vars'].append(_n)
        _dict['result'].append(_mean)

    mean = self.basic_results_dict.get(predictor, None)
    # FIX: 'assert mean' also failed for a legitimate 0.0 correlation;
    # only a missing entry (None) should trip the assertion.
    assert mean is not None, f'self.basic_results_dict couldn\'t get {predictor}'
    append_to_full_results_dict(mean, 0)
    _dir = f'{self.results_dir}/{direct}'
    for n in range(1, self.max_n + 1):
        _predictions_dir = dp.ensure_dir(
            f'{_dir}/{n}_vars/general/{sim_func}/{predictor}/predictions')
        cv_obj = InterTopicCrossValidation(
            k=2, rep=30, folds_map_file=self.cv_map_file,
            predictions_dir=_predictions_dir, load=True,
            ap_file=self.query_ap_file, test=self.corr_measure)
        mean = cv_obj.calc_test_results()
        append_to_full_results_dict(mean, n)
    _df = pd.DataFrame.from_dict(_dict)
    return _df
def load_full_features_df(**kwargs):
    """
    :param kwargs: corpus, queries_group, quantile or features_factory_obj: QueryFeatureFactory() object
    :return: pd.DataFrame that contains all the features values

    Loads the per-n pickled RBO/TopDocs feature frames, normalizes the
    Top-n overlap columns by n, keeps a single Jac_coefficient column
    (from the last file successfully read), and concatenates everything.
    Missing pickle files are skipped with a warning.
    """
    corpus = kwargs.get('corpus', None)
    queries_group = kwargs.get('queries_group', None)
    quantile = kwargs.get('quantile', None)
    features_factory_obj = kwargs.get('features_factory_obj', None)
    if features_factory_obj:
        # An existing factory object overrides the scalar kwargs
        features_obj = features_factory_obj
        corpus = features_obj.corpus
        queries_group = features_obj.queries_group
    else:
        # NOTE(review): assert is stripped under python -O; consider raising
        # ValueError instead if this validation must always run.
        assert corpus and queries_group and quantile, f"Can't create a factory object from Corpus={corpus}, " \
            f"Queries group={queries_group}, Variations Quantile={quantile}"
        features_obj = RefQueryFeatureFactory(corpus, queries_group, quantile)
    pkl_dir = dp.ensure_dir(
        f'~/QppUqvProj/Results/{corpus}/test/ref/pkl_files/')
    _list = []
    last_df = pd.DataFrame()
    for n in NUMBER_OF_DOCS:
        _file = f'{pkl_dir}/{queries_group}_queries_{corpus}_RBO_{n}_TopDocs_{n}.pkl'
        try:
            # dp.ensure_file raises AssertionError when the file is missing,
            # which is caught below - presumably; verify against dp.
            dp.ensure_file(_file)
            _df = pd.read_pickle(_file).set_index(['topic', 'qid'])
            # Normalize the overlap count to a [0, 1] fraction of n docs
            _df[f'Top_{n}_Docs_overlap'] = _df[f'Top_{n}_Docs_overlap'] / n
            _list.append(_df.drop('Jac_coefficient', axis=1))
            # Jac_coefficient is identical across n; keep only one copy
            last_df = _df['Jac_coefficient']
        except AssertionError:
            print(f'!! Warning !! The file {_file} is missing')
    df = pd.concat(_list + [last_df], axis=1)
    return features_obj.divide_by_size(df)
def __init__(self, folds_map_file=None, k=2, rep=30, predictions_dir=None, test='pearson', ap_file=None, generate_folds=False, **kwargs):
    """Set up a k-fold, rep-repetition cross-validation over prediction files.

    :param folds_map_file: path to the JSON folds mapping (required)
    :param k: number of folds
    :param rep: number of repetitions
    :param predictions_dir: directory with per-query prediction files (required)
    :param test: correlation measure name used as the CV test
    :param ap_file: optional AP results file; its '-suffix' selects the AP function
    :param generate_folds: if True, generate a fresh folds file instead of loading
    """
    logging.debug("testing logger")
    self.k = k
    self.rep = rep
    self.test = test
    assert predictions_dir, 'Specify predictions dir'
    assert folds_map_file, 'Specify path for CV folds file'
    predictions_dir = os.path.abspath(os.path.normpath(os.path.expanduser(predictions_dir)))
    assert os.listdir(predictions_dir), f'{predictions_dir} is empty'
    # Evaluation output lives next to the predictions directory
    self.output_dir = dp.ensure_dir(predictions_dir.replace('predictions', 'evaluation'))
    if ap_file:
        self.full_set = self._build_full_set(predictions_dir, ap_file)
        if '-' in ap_file:
            # e.g. 'QLmap1000-title' -> ap_func 'title'
            self.ap_func = ap_file.split('-')[-1]
        else:
            self.ap_func = 'basic'
    else:
        self.full_set = self._build_full_set(predictions_dir)
    if generate_folds:
        self.index = self.full_set.index
        self.folds_file = self._generate_k_folds()
        self.__load_k_folds()
    else:
        try:
            self.folds_file = dp.ensure_file(folds_map_file)
        except FileExistsError:
            # NOTE(review): catching FileExistsError for a *missing* file looks
            # suspicious (FileNotFoundError expected?), and despite the message
            # no file is generated here - confirm dp.ensure_file's contract.
            print("The folds file specified doesn't exist, going to generate the file and save")
        self.__load_k_folds()
def cross_val(self):
    """Evaluate the LTR classifications with 2-fold / 30-repetition CV.

    For each CV set, correlates the SVMrank scores of both fold subsets with
    the true AP values, averages the pair, dumps the per-set vector to JSON
    and reports whether the improvement over the baseline is significant.
    """
    simple_results = {}
    classification_dir = self.output_dir.replace('datasets', 'classifications')
    eval_dir = ensure_dir(self.output_dir.replace('datasets', 'evaluation'))
    _list = []
    for set_id in range(1, 31):
        _pair = []
        for subset in ['a', 'b']:
            # SVMrank prediction scores for this fold subset (no header)
            _res_df = pd.read_csv(
                f'{classification_dir}/predictions_{set_id}_{subset}',
                header=None,
                names=['score'])
            # Topics held out for testing in this subset, as string qids
            _test_topics = np.array(
                self.folds_df[set_id][subset]['test']).astype(str)
            _res_df.insert(loc=0, column='qid', value=_test_topics)
            _res_df.set_index('qid', inplace=True)
            _ap_df = self.ap_obj.data_df.loc[_test_topics]
            _df = _res_df.merge(_ap_df, how='outer', on='qid')
            # Correlation measure (e.g. pearson) comes from the CV object
            _correlation = _df['score'].corr(_df['ap'], method=self.cv.test)
            _pair.append(_correlation)
        # Average the two fold subsets into one result per CV set
        avg_res = np.mean(_pair)
        _list.append(avg_res)
        simple_results['set {}'.format(set_id)] = avg_res
    simple_results_df = pd.Series(simple_results)
    simple_results_df.to_json(
        ('{}/simple_results_vector_for_2_folds_30_repetitions_ltr.json'.
         format(eval_dir)))
    print('mean: {:.3f}'.format(np.mean(_list)))
    if check_significance(self.corpus, self.predictor):
        print('significant!')
    else:
        print('Not significant!')
def __set_paths(cls, corpus, predictor, agg):
    """This method sets the default paths of the files and the working directories, it assumes the standard naming convention of the project"""
    _base_dir = os.path.normpath(
        os.path.expanduser(f'~/QppUqvProj/Results/{corpus}/uqvPredictions/'))
    # Aggregated CV parameters of the predictor
    cls.parameters = (f'{_base_dir}/aggregated/{agg}/{predictor}/evaluation/'
                      f'full_results_vector_for_2_folds_30_repetitions_{agg}.json')
    # Raw per-variation predictions
    cls.results_dir = f'{_base_dir}/raw/{predictor}/predictions/'
    # LTR datasets output location
    cls.output_dir = f'{_base_dir}/aggregated/{agg}/{predictor}/ltr/datasets/'
    ensure_dir(cls.output_dir)
    _test_dir = os.path.normpath(
        os.path.expanduser(f'~/QppUqvProj/Results/{corpus}/test/'))
    cls.folds = f'{_test_dir}/2_folds_30_repetitions.json'
    cls.features = f'{_test_dir}/raw/norm_features_{corpus}_uqv.JSON'
    cls.ap_file = f'{_test_dir}/aggregated/map1000-{agg}'
def __set_paths(cls, corpus):
    """This method sets the default paths of the files and the working directories, it assumes the standard naming convention of the project"""
    _corpus_test_dir = dp.ensure_dir(f'~/QppUqvProj/Results/{corpus}/test/')
    # AP file for the cross validation process
    cls.query_ap_file = dp.ensure_file(f'{_corpus_test_dir}/ref/QLmap1000-title')
    # CV folds mapping file
    cls.cv_map_file = dp.ensure_file(f'{_corpus_test_dir}/2_folds_30_repetitions.json')
    # The data dir for the Graphs
    cls.data_dir = dp.ensure_dir(f'~/QppUqvProj/Graphs/{corpus}/data')
    # The results base dir for the Graphs
    cls.results_dir = dp.ensure_dir(f'~/QppUqvProj/Graphs/{corpus}/referenceLists/title/all_vars/general')
    # Source results tree the graph results are built from
    cls.raw_res_base_dir = dp.ensure_dir(
        f'~/QppUqvProj/Results/{corpus}/uqvPredictions/referenceLists/title/all_vars/general')
    # True AP values of the basic (title) queries
    _ap_file = f'~/QppUqvProj/Results/{corpus}/test/basic/QLmap1000'
    cls.true_ap_file = dp.ensure_file(_ap_file)
def __set_graph_paths(cls, corpus, predictor, qgroup, direct, n):
    """This method sets the default paths of the files and the working directories, it assumes the standard naming convention of the project"""
    cls.predictor = predictor
    _corpus_res_dir = dp.ensure_dir(
        f'~/QppUqvProj/Results/{corpus}/uqvPredictions/')
    _corpus_dat_dir = dp.ensure_dir(f'~/QppUqvProj/data/{corpus}')
    _graphs_base_dir = dp.ensure_dir(f'~/QppUqvProj/Graphs/{corpus}')
    _graphs_dat_dir = dp.ensure_dir(f'{_graphs_base_dir}/data/{direct}')
    # Prediction results of all UQV query variants
    cls.vars_results_dir = dp.ensure_dir(
        f'{_corpus_res_dir}/raw/{predictor}/predictions/')
    # Prediction results of the queries to be predicted
    _orig_dir = dp.ensure_dir(
        f'~/QppUqvProj/Results/{corpus}/basicPredictions/{qgroup}')
    cls.base_results_dir = f'{_orig_dir}/{predictor}/predictions/'
    # The directory to save the new results
    cls.output_dir = dp.ensure_dir(
        f'{_graphs_base_dir}/referenceLists/{qgroup}/{direct}/{n}_vars')
    # The files for used for the LTR and CV
    _test_dir = f'~/QppUqvProj/Results/{corpus}/test'
    cls.folds = dp.ensure_file(f'{_test_dir}/2_folds_30_repetitions.json')
    cls.ap_file = dp.ensure_file(f'{_test_dir}/ref/QLmap1000-{qgroup}')
    # The features file used for prediction
    cls.features = dp.ensure_file(
        f'{_graphs_dat_dir}/features/{qgroup}_query_{n}_variations_features_{corpus}_uqv.JSON'
    )
    # FIX: the path was missing the leading '~/' used by every other path in
    # the project (cf. the sibling __set_paths), so it resolved relative to
    # the current working directory.
    cls.geo_mean_file = dp.ensure_file(
        f'~/QppUqvProj/Results/{corpus}/uqvPredictions/raw/geo/predictions/predictions-20000'
    )
    # The variations file is used in the filter function - it consists of all the vars w/o the query at hand
    cls.query_vars_file = dp.ensure_file(
        f'{_graphs_dat_dir}/queries/queries_wo_{qgroup}_{n}_vars.txt')
    cls.quantile_vars_file = cls.query_vars_file
    _queries2predict = f'~/QppUqvProj/data/{corpus}/queries_{corpus}_{qgroup}.txt'
    cls.queries2predict_file = dp.ensure_file(_queries2predict)
    cls.real_ap_file = dp.ensure_file(
        f'~/QppUqvProj/Results/{corpus}/test/raw/QLmap1000')
    cls.geo_predictions_dir = dp.ensure_dir(
        f'{_corpus_res_dir}/referenceLists/{qgroup}/all_vars/sim_as_pred/geo/predictions'
    )
def create_query_files(self, n):
    """Write per-direction query files with n variations per topic.

    'asce' keeps the n lowest-AP variations, 'desc' the n highest-AP ones.
    """
    direction_filters = {('asce', filter_n_low_queries),
                         ('desc', filter_n_top_queries)}
    for direction, query_filter in direction_filters:
        queries_dir = dp.ensure_dir(f'{self.data_dir}/{direction}/queries')
        out_file = f'{queries_dir}/queries_wo_{self.group}_{n}_vars.txt'
        filtered_df = query_filter(self.queries_obj.queries_df,
                                   self.raw_ap_obj, n)
        filtered_df[['qid', 'text']].to_csv(out_file,
                                            sep=":",
                                            header=False,
                                            index=False)
def main():
    """Split every raw RM probabilities file of the corpus into its
    clipped/unclipped tables."""
    # corpus = 'ROBUST'
    corpus = 'ClueWeb12B'
    # raw_dir = dp.ensure_dir(f'~/QppUqvProj/Results/{corpus}/uqvPredictions/raw/rsd/raw_data')
    raw_dir = dp.ensure_dir(
        f'~/QppUqvProj/Results/{corpus}/basicPredictions/title/rsd/raw_data')
    for probabilities_file in glob(f'{raw_dir}/probabilities-*'):
        separate_tables(probabilities_file)
def __set_paths(cls, corpus, qgroup, vars_quantile): """This method sets the default paths of the files and the working directories, it assumes the standard naming convention of the project""" # cls.predictor = predictor _corpus_res_dir = dp.ensure_dir(f'~/QppUqvProj/Results/{corpus}') _corpus_dat_dir = dp.ensure_dir(f'~/QppUqvProj/data/{corpus}') _results_file = f'{_corpus_res_dir}/test/raw/QL.res' cls.results_file = os.path.normpath(_results_file) dp.ensure_file(cls.results_file) _title_results_file = f'{_corpus_res_dir}/test/basic/QL.res' cls.title_res_file = os.path.normpath(_title_results_file) dp.ensure_file(cls.title_res_file) cls.queries_full_file = dp.ensure_file( f'{_corpus_dat_dir}/queries_{corpus}_UQV_full.stemmed.txt') # The variations file is used in the filter function - it consists of all the vars w/o the query at hand _queries_variations_file = f'{_corpus_dat_dir}/queries_{corpus}_UQV_wo_{qgroup}.txt' cls.queries_variations_file = dp.ensure_file(_queries_variations_file) # The vars quantile file is used in the filter function - it consists of the relevant vars quantile if vars_quantile == 'all': _queries_quantile_file = f'{_corpus_dat_dir}/queries_{corpus}_UQV_full.txt' else: _queries_quantile_file = f'{_corpus_dat_dir}/queries_{corpus}_UQV_{vars_quantile}_variants.txt' cls.queries_quantile_vars = dp.ensure_file(_queries_quantile_file) _queries_topic_file = f'{_corpus_dat_dir}/queries_{corpus}_{qgroup}.stemmed.txt' cls.queries_topic_file = dp.ensure_file(_queries_topic_file) _fused_results_file = f'{_corpus_res_dir}/test/fusion/QL.res' cls.fused_results_file = dp.ensure_file(_fused_results_file) # cls.output_dir = dp.ensure_dir(f'{_corpus_res_dir}/test/raw/') _predictions_out = f'{_corpus_res_dir}/uqvPredictions/referenceLists/{qgroup}/{vars_quantile}_vars/sim_as_pred/' cls.predictions_output_dir = dp.ensure_dir(_predictions_out) cls.pkl_dir = dp.ensure_dir(f'{_corpus_res_dir}/test/ref/pkl_files/')
def run_svm_fine_tune(self):
    """Run the SVMrank C-parameter grid over every train set, using a
    process pool when allowed and falling back to sequential execution
    inside daemonic processes (which may not spawn children)."""
    models_dir = f'{self.output_dir}models'
    dp.ensure_dir(models_dir)
    classification_dir = f'{self.output_dir}classifications'
    dp.ensure_dir(classification_dir)
    # Start from a clean slate
    dp.empty_dir(models_dir)
    dp.empty_dir(classification_dir)
    train_sets = glob.glob(f'{self.output_dir}datasets/train*')
    args_list = list(itertools.product(C_PARAMETERS, train_sets))
    sub_procedure = partial(svm_sub_procedure,
                            models_dir=models_dir,
                            classification_dir=classification_dir)
    if mp.current_process().daemon:
        # Daemonic workers cannot create a Pool - run the grid sequentially.
        for c_param, train_set in args_list:
            sub_procedure(c_param, train_set)
    else:
        with mp.Pool(processes=self.cpu_cores) as pool:
            pool.starmap(sub_procedure, args_list)
def save_predictions(self, df: pd.DataFrame):
    """Aggregate per-variation similarity scores to topic level and write
    one predictions file per similarity-as-predictor method.

    :param df: per-variation similarity features, indexed by topic/qid
    """
    # Keep only the relevant variations, average per topic, relabel to qids
    _df = self._filter_queries(df)
    _df = _df.groupby('topic').mean()
    _df = dp.convert_vid_to_qid(_df)
    _rboP_dir = dp.ensure_dir(
        f'{self.predictions_output_dir}/rboP/predictions')
    _FrboP_dir = dp.ensure_dir(
        f'{self.predictions_output_dir}/FrboP/predictions')
    _topDocsP_dir = dp.ensure_dir(
        f'{self.predictions_output_dir}/topDocsP/predictions')
    _jcP_dir = dp.ensure_dir(
        f'{self.predictions_output_dir}/jcP/predictions')
    _df[f'RBO_EXT_{self.rbo_top}'].to_csv(
        f'{_rboP_dir}/predictions-{self.rbo_top}', sep=' ')
    _df[f'RBO_FUSED_EXT_{self.rbo_top}'].to_csv(
        f'{_FrboP_dir}/predictions-{self.rbo_top}', sep=' ')
    _df[f'Top_{self.top_docs_overlap}_Docs_overlap'].to_csv(
        f'{_topDocsP_dir}/predictions-{self.top_docs_overlap}', sep=' ')
    # NOTE(review): the Jaccard file is suffixed with rbo_top even though the
    # score is parameter-free - presumably just a naming convention; confirm.
    _df['Jac_coefficient'].to_csv(f'{_jcP_dir}/predictions-{self.rbo_top}',
                                  sep=' ')
def generate_qpp_reference_predictions(self, predictor):
    """Run QueryPredictionRef for both directions and every variation count
    from 1 to max_n, materializing the reference-list predictions."""
    print(f'\n---Generating qpp ref predictions with {predictor}---\n')
    for direction in {'asce', 'desc'}:
        dp.ensure_dir(f'{self.data_dir}/{direction}')
        for n_vars in range(1, self.max_n + 1):
            ref = QueryPredictionRef(predictor,
                                     self.corpus,
                                     qgroup=self.group,
                                     vars_quantile='all',
                                     graphs=direction,
                                     n=n_vars)
            ref.calc_queries()
def run_svm(self):
    """Train an SVMrank model (fixed C=1) and classify the matching test set
    for each of the 30 CV repetitions and both fold subsets."""
    c = '1'
    # FIX: svm_rank_learn was addressed with a CWD-relative path while
    # svm_rank_classify used '~/...'; made both consistent with
    # run_svm_fine_tune so the call does not depend on the working directory.
    svm_learn = '~/svmRank/svm_rank_learn'
    svm_classify = '~/svmRank/svm_rank_classify'
    models_dir = self.output_dir.replace('datasets', 'models')
    dp.ensure_dir(models_dir)
    classification_dir = self.output_dir.replace('datasets', 'classifications')
    # FIX: ensure the classifications dir exists before wiping/writing to it
    # (it was previously created only after the rm commands ran).
    dp.ensure_dir(classification_dir)
    # Clear artifacts of previous runs
    run(f'rm -rfv {models_dir}*', shell=True)
    run(f'rm -rfv {classification_dir}*', shell=True)
    for set_id in range(1, 31):
        for subset in ['a', 'b']:
            run('{0} -c {1} {2}/train_{3}_{4}.dat {5}/model_{3}_{4}'.
                format(svm_learn, c, self.output_dir, set_id, subset,
                       models_dir),
                shell=True)
            run('{0} {1}/test_{2}_{3}.dat {4}/model_{2}_{3} {5}/predictions_{2}_{3}'
                .format(svm_classify, self.output_dir, set_id, subset,
                        models_dir, classification_dir),
                shell=True)
def generate_features(self, n):
    """Build the n-variations feature set for both directions and dump each
    as a JSON file under the direction's features directory."""
    print(f'\n---Generating Features for {n} vars---\n')
    for direction in {'asce', 'desc'}:
        features_dir = dp.ensure_dir(f'{self.data_dir}/{direction}/features')
        factory = RefQueryFeatureFactory(corpus=self.corpus,
                                         queries_group=self.group,
                                         vars_quantile='all',
                                         graphs=direction,
                                         n=n)
        features_df = load_full_features_df(features_factory_obj=factory)
        out_path = (f'{features_dir}/{self.group}_query_{n}'
                    f'_variations_features_{self.corpus}_uqv.JSON')
        features_df.reset_index().to_json(out_path)
def calc_single_query_result(self, predictor):
    """Cross-validate the basic (0-variations) predictions of `predictor`
    and cache the mean test result in basic_results_dict."""
    print(f'\n---Generating {predictor} 0 vars results---\n')
    predictions_dir = dp.ensure_dir(
        f'{self.basic_predictions_dir}/{predictor}/predictions')
    cv = InterTopicCrossValidation(k=2,
                                   rep=30,
                                   folds_map_file=self.cv_map_file,
                                   predictions_dir=predictions_dir,
                                   load=True,
                                   ap_file=self.query_ap_file,
                                   test=self.corr_measure)
    self.basic_results_dict[predictor] = cv.calc_test_results()
def write_results(self, df, column, lambda_param, oracle=False):
    """Write every score column of df as a predictions file, encoding the
    similarity function, its parameter (when it has one) and the lambda
    value in the file name.

    :param oracle: write under the 'oracle' subtree instead of 'general'
    """
    sim_func = get_simfunct(column)
    if sim_func not in ('jac', 'uni', 'geo'):
        # Parameterized similarity - the parameter is the digit token
        # embedded in the column name.
        digit_tokens = [s for s in column.split('_') if s.isdigit()]
        sim_param = digit_tokens[0]
    else:
        sim_param = None
    if oracle:
        output_dir = dp.ensure_dir(f'{self.output_dir}/oracle')
    else:
        output_dir = dp.ensure_dir(f'{self.output_dir}/general')
    for col in df.columns:
        _file_path = f'{output_dir}/{sim_func}/{self.predictor}/predictions/'
        dp.ensure_dir(_file_path)
        _file_name = col.replace('score_', 'predictions-')
        if sim_param:
            file_name = f'{_file_path}{_file_name}+{sim_func}+{sim_param}+lambda+{lambda_param}'
        else:
            file_name = f'{_file_path}{_file_name}+lambda+{lambda_param}'
        df[col].to_csv(file_name,
                       sep=" ",
                       header=False,
                       index=True,
                       float_format='%f')
def __set_paths(cls, corpus, group): _corpus_test_dir = dp.ensure_dir( f'~/QppUqvProj/Results/{corpus}/test/') # Basic predictions dir cls.basic_predictions_dir = dp.ensure_dir( f'~/QppUqvProj/Results/{corpus}/basicPredictions/{group}/') # AP file to pick variations according to AP cls.raw_ap_file = dp.ensure_file(f'{_corpus_test_dir}/raw/QLmap1000') # AP file for the cross validation process cls.query_ap_file = dp.ensure_file( f'{_corpus_test_dir}/ref/QLmap1000-{group}') # CV folds mapping file cls.cv_map_file = dp.ensure_file( f'{_corpus_test_dir}/2_folds_30_repetitions.json') # Queries file with all the variations except the ones to be predicted cls.queries_file = dp.ensure_file( f'~/QppUqvProj/data/{corpus}/queries_{corpus}_UQV_wo_{group}.txt') # The data dir for the Graphs cls.data_dir = dp.ensure_dir(f'~/QppUqvProj/Graphs/{corpus}/data') # The results base dir for the Graphs cls.results_dir = dp.ensure_dir( f'~/QppUqvProj/Graphs/{corpus}/referenceLists/{group}')
def generate_sim_predictions(self, k):
    """Generate similarity-as-predictor predictions with top-k documents for
    both directions and every variation count from 1 to max_n.

    The first factory run may compute from scratch (per load_from_pkl);
    all subsequent runs reuse the pickled intermediates."""
    print(f'\n---Generating sim predictions {k} docs---\n')
    use_pickle = self.load_from_pkl
    for direction in {'asce', 'desc'}:
        dp.ensure_dir(f'{self.data_dir}/{direction}')
        for n_vars in range(1, self.max_n + 1):
            factory = RefQueryFeatureFactory(self.corpus,
                                             queries_group=self.group,
                                             vars_quantile='all',
                                             rbo_top=k,
                                             top_docs_overlap=k,
                                             graphs=direction,
                                             n=n_vars)
            factory.generate_predictions(use_pickle)
            # After the first pass the pickles exist - always load them.
            use_pickle = True
def __set_graph_paths(cls, corpus, qgroup, direct, n): """This method sets the default paths of the files and the working directories, it assumes the standard naming convention of the project""" # cls.predictor = predictor _corpus_res_dir = dp.ensure_dir(f'~/QppUqvProj/Results/{corpus}') _corpus_dat_dir = dp.ensure_dir(f'~/QppUqvProj/data/{corpus}') _graphs_base_dir = dp.ensure_dir(f'~/QppUqvProj/Graphs/{corpus}') _graphs_res_dir = dp.ensure_dir( f'{_graphs_base_dir}/referenceLists/{qgroup}/{direct}/{n}_vars') _graphs_dat_dir = dp.ensure_dir(f'{_graphs_base_dir}/data') cls.number_of_vars = n _results_file = f'{_corpus_res_dir}/test/raw/QL.res' cls.results_file = os.path.normpath(_results_file) dp.ensure_file(cls.results_file) _title_results_file = f'{_corpus_res_dir}/test/basic/QL.res' cls.title_res_file = os.path.normpath(_title_results_file) dp.ensure_file(cls.title_res_file) _queries_full_file = f'{_corpus_dat_dir}/queries_{corpus}_UQV_full.stemmed.txt' cls.queries_full_file = dp.ensure_file(_queries_full_file) # The variations file is used in the filter function - it consists of all the vars w/o the query at hand _queries_variations_file = f'{_graphs_dat_dir}/{direct}/queries/queries_wo_{qgroup}_{n}_vars.txt' cls.queries_variations_file = dp.ensure_file(_queries_variations_file) cls.queries_quantile_vars = cls.queries_variations_file _queries_topic_file = f'{_corpus_dat_dir}/queries_{corpus}_{qgroup}.stemmed.txt' cls.queries_topic_file = dp.ensure_file(_queries_topic_file) _fused_results_file = f'{_corpus_res_dir}/test/fusion/QL.res' # _fused_results_file = f'{_corpus_res_dir}/test/fusion/all_wo_{qgroup}_fused_QL.res' cls.fused_results_file = dp.ensure_file(_fused_results_file) # cls.output_dir = dp.ensure_dir(f'{_graphs_res_dir}/test/raw/') cls.predictions_output_dir = dp.ensure_dir( f'{_graphs_res_dir}/sim_as_pred/') cls.pkl_dir = dp.ensure_dir(f'{_graphs_dat_dir}/pkl_files/features')
def separate_tables(raw_file):
    """Split a raw RM probabilities table into two files under the sibling
    'data' directory: the un-clipped table ('+c0') and the clipped one
    ('+c100'), distinguished by the presence of the 'clipping' marker."""
    raw_df = pd.read_table(raw_file,
                           names=['qid', 'term', 'probability', 'clipping'],
                           sep=' ')
    has_clipping = raw_df['clipping'].notna()
    clipped_df = raw_df.loc[has_clipping].drop('clipping', axis=1)
    full_rm_df = raw_df.loc[~has_clipping].drop('clipping', axis=1)
    path_parts = raw_file.rsplit('/', 1)
    _file_name = path_parts[-1]
    _dir = dp.ensure_dir(path_parts[0].replace('raw_data', 'data'))
    clipped_df.to_csv(f'{_dir}/{_file_name}+c100',
                      sep=" ",
                      header=False,
                      index=False,
                      float_format='%f')
    full_rm_df.to_csv(f'{_dir}/{_file_name}+c0',
                      sep=" ",
                      header=False,
                      index=False,
                      float_format='%f')