예제 #1
0
 def __save_new_dictionary(self, corpus, predictor):
     """Pickle ``self.dict_all_options_stochastic`` into the predictor's
     pkl_files directory under the corpus result tree.

     NOTE(review): *corpus* is currently unused here — the target path is
     derived from ``self.res_dir``; confirm against callers.
     """
     out_dir = dp.ensure_dir(
         f'{self.res_dir}/test/pageRank/pkl_files/{predictor}')
     out_file = f'{out_dir}/dict_all_options_stochastic.pkl'
     with open(out_file, 'wb') as fp:
         pickle.dump(self.dict_all_options_stochastic,
                     fp,
                     protocol=pickle.HIGHEST_PROTOCOL)
예제 #2
0
def main(args):
    """Entry point: generate PageRank features for a corpus, or load
    (and print) a precomputed features file.

    Expected attributes on *args*:
        corpus   -- corpus name (e.g. 'ROBUST', 'ClueWeb12B')
        generate -- truthy to (re)generate the features
        load     -- optional name/path of a precomputed features file
    """
    corpus = args.corpus
    generate = args.generate
    file_to_load = args.load

    # Leave one core free for the main process.
    cores = mp.cpu_count() - 1

    if generate:
        # FIXME: test and fix the features creation to run in parallel.
        # The pooled results are currently discarded; only the serial
        # QueryFeatureFactory run below produces the saved features.
        # A tuple (not a set) keeps the task submission order deterministic.
        with mp.Pool(processes=cores) as pool:
            pool.starmap(run_features_process,
                         itertools.product(('ROBUST', 'ClueWeb12B'),
                                           NUMBER_OF_DOCS))

        testing_feat = QueryFeatureFactory(corpus)
        norm_features_df = testing_feat.generate_features()

        _path = dp.ensure_dir(
            f'{dp.set_environment_paths()[0]}/{corpus}/test/pageRank')
        norm_features_df.reset_index().to_json(f'{_path}/PageRank_Features.JSON')

    elif file_to_load:
        features_df = features_loader(corpus, file_to_load)
        print(features_df)
    else:
        features_df = features_loader(corpus)
        print(features_df)
예제 #3
0
    def __set_paths(cls, corpus, predictor):
        """Configure the class-level file paths and working directories for
        *corpus*/*predictor*, assuming the project's standard naming
        convention."""
        cls.predictor = predictor

        results_root, _ = dp.set_environment_paths()
        cls.res_dir = f'{results_root}/{corpus}'

        predictions_root = f'{cls.res_dir}/uqvPredictions/'
        cls.vars_results_dir = dp.ensure_dir(
            f'{predictions_root}/raw/{cls.predictor}/predictions/')
        cls.output_dir = dp.ensure_dir(
            f'{predictions_root}/referenceLists/pageRank/')

        test_dir = f'{cls.res_dir}/test'
        cls.folds = dp.ensure_file(f'{test_dir}/2_folds_30_repetitions.json')
        cls.features = dp.ensure_file(
            f'{test_dir}/pageRank/{corpus}_raw_PageRank_Features.pkl')
예제 #4
0
 def _write_results(self, res_df: pd.Series, sim_func, pred_score,
                    lambda_param):
     """Write the predictions series to a space-separated file named by
     prediction score and lambda, under the similarity/predictor tree."""
     # Map the similarity key to its canonical name; fall back to the raw key.
     sim_name = SIMILARITY_DICT.get(sim_func, sim_func)
     out_dir = dp.ensure_dir(
         f'{self.output_dir}/raw/{sim_name}/{self.predictor}/predictions/')
     out_file = f'{out_dir}/predictions-{pred_score}+lambda+{lambda_param}'
     res_df.to_csv(path_or_buf=out_file,
                   index=True,
                   sep=' ',
                   float_format='%f',
                   header=False)
예제 #5
0
    def _calc_features(self):
        """Compute pairwise query-variant features per topic and persist them.

        For every topic and every (q1, q2) pair in ``self.features_index``
        the following features are computed: Jaccard coefficient of the
        query texts, top-k document overlap, RBO(ext) of the two result
        lists, and RBO(ext) of each result list against the fused list.
        Off-diagonal pairs are stored in both directions (src, dest) and
        (dest, src); a diagonal pair (q == q) is stored once.

        Returns the resulting DataFrame, indexed by (topic, src, dest),
        after pickling it to ``<res_dir>/test/pageRank``.
        """
        _dict = {'topic': [], 'src': [], 'dest': [], 'jac': [],
                 f'Top_{self.top_docs_overlap}_Docs_overlap': [], f'RBO_EXT_{self.rbo_top}': [],
                 f'RBO_FUSED_EXT_{self.rbo_top}': []}
        for topic, pairs in self.features_index.items():
            # pairs are combinations with replacement: n(n+1)/2 for n variants.
            # Each off-diagonal pair yields 2 rows, each diagonal pair 1 row,
            # so the total row count is 2*len(pairs) - n == n^2.
            _dict['topic'] += [topic] * (2 * len(pairs) - len(self.query_vars[topic]))
            fused_res_dict = self.fused_data.get_res_dict_by_qid(topic, top=100)
            for q1, q2 in pairs:
                txt1 = self.queries_data.get_qid_txt(q1)
                txt2 = self.queries_data.get_qid_txt(q2)
                jc = jaccard_coefficient(txt1, txt2)

                l1 = self.raw_res_data.get_docs_by_qid(q1, self.top_docs_overlap)
                l2 = self.raw_res_data.get_docs_by_qid(q2, self.top_docs_overlap)
                docs_overlap = list_overlap(l1, l2)

                # All RBO values are rounded to 10 decimal digits, to avoid float overflow
                q1_results_dict = self.raw_res_data.get_res_dict_by_qid(q1, top=self.rbo_top)
                q2_results_dict = self.raw_res_data.get_res_dict_by_qid(q2, top=self.rbo_top)
                _rbo_scores_dict = rbo_dict(q1_results_dict, q2_results_dict, p=0.95)
                rbo_ext_score = np.around(_rbo_scores_dict['ext'], 10)

                _q1_fused_rbo_scores_dict = rbo_dict(fused_res_dict, q1_results_dict, p=0.95)
                _q1_rbo_fused_ext_score = np.around(_q1_fused_rbo_scores_dict['ext'], 10)

                _q2_fused_rbo_scores_dict = rbo_dict(fused_res_dict, q2_results_dict, p=0.95)
                _q2_rbo_fused_ext_score = np.around(_q2_fused_rbo_scores_dict['ext'], 10)

                def _save_to_dict(q_1, q_2):
                    # Append one feature row for the directed edge (q_1 -> q_2).
                    _dict['src'] += [q_1]
                    _dict['dest'] += [q_2]
                    _dict['jac'] += [jc]
                    _dict[f'Top_{self.top_docs_overlap}_Docs_overlap'] += [docs_overlap]
                    # BUGFIX: key must match the initialization above
                    # ('RBO_EXT_...'); the previous 'RBO_...' key raised KeyError.
                    _dict[f'RBO_EXT_{self.rbo_top}'] += [rbo_ext_score]
                    # The RBO-F feature in that case for edge (q1, q2) will be the RBO similarity of q2 to fused list
                    _dict[f'RBO_FUSED_EXT_{self.rbo_top}'] += [_q2_rbo_fused_ext_score]

                if q1 == q2:
                    # Diagonal: store once.
                    _save_to_dict(q1, q2)
                else:
                    # Off-diagonal: store both directions.
                    _save_to_dict(q1, q2)
                    _save_to_dict(q2, q1)

        _df = pd.DataFrame.from_dict(_dict)
        _df.sort_values(['topic', 'src', 'dest'], inplace=True)
        _df.set_index(['topic', 'src', 'dest'], inplace=True)
        _test_dir = dp.ensure_dir(f'{self.res_dir}/test/pageRank/')
        _df.to_pickle(f'{_test_dir}/{self.corpus}_raw_PageRank_Features.pkl')
        return _df
예제 #6
0
def main(args):
    """Convert raw UQV PageRank predictions into title-query predictions
    for every (similarity function, predictor) combination."""
    corpus = args.corpus
    raw_root = dp.ensure_dir(
        f'~/QppUqvProj/Results/{corpus}/uqvPredictions/referenceLists/pageRank/raw')
    title_file = dp.ensure_file(
        f'~/QppUqvProj/data/{corpus}/queries_{corpus}_title.txt')
    title_queries = dp.QueriesTextParser(title_file)
    full_file = dp.ensure_file(
        f'~/QppUqvProj/data/{corpus}/queries_{corpus}_UQV_full.txt')
    full_queries = dp.QueriesTextParser(full_file, 'uqv')

    for sim in SIMILARITY_FUNCTIONS:
        for predictor in PREDICTORS:
            pred_dir = f'{raw_root}/{sim}/{predictor}/predictions'
            # Keep only the title query variants, then re-key them by topic id.
            all_predictions_df = read_into_df(pred_dir)
            title_by_vid = filter_title_queries(all_predictions_df,
                                                full_queries, title_queries)
            title_by_qid = convert_vid_to_qid(title_by_vid)
            write_basic_predictions(title_by_qid,
                                    pred_dir.replace('raw', 'title'), sim)
예제 #7
0
    def __set_paths(cls, corpus):
        """Resolve and validate the default file paths and working
        directories for *corpus*, assuming the project's standard naming
        convention."""
        results_root, data_root = dp.set_environment_paths()
        cls.res_dir = f'{results_root}/{corpus}'
        cls.dat_dir = f'{data_root}/{corpus}'

        # Raw (UQV) results file.
        cls.results_file = os.path.normpath(f'{cls.res_dir}/test/raw/QL.res')
        dp.ensure_file(cls.results_file)

        # Title-query results file.
        cls.title_res_file = os.path.normpath(f'{cls.res_dir}/test/basic/QL.res')
        dp.ensure_file(cls.title_res_file)

        # Full stemmed UQV queries file.
        cls.queries_full_file = dp.ensure_file(
            f'{cls.dat_dir}/queries_{corpus}_UQV_full.stemmed.txt')

        # Fused results file.
        cls.fused_results_file = dp.ensure_file(f'{cls.res_dir}/test/fusion/QL.res')

        cls.output_dir = dp.ensure_dir(f'{cls.res_dir}/test/raw/')
예제 #8
0
def write_basic_predictions(df: pd.DataFrame, output_dir, similarity: str, qgroup='title') -> None:
    """The function is used to save results in basic predictions format of a given queries set. e.g. 'qid': score

    Each column of *df* becomes one space-separated file named after the
    column, written under *output_dir*.

    NOTE(review): *similarity* and *qgroup* are currently unused in the
    body — kept for interface compatibility; confirm against callers.
    """
    target_dir = dp.ensure_dir(output_dir)
    for predictor_col in df.columns:
        out_path = f'{target_dir}/{predictor_col}'
        df[predictor_col].to_csv(out_path, sep=' ', header=False,
                                 index=True, float_format='%f')