예제 #1
0
def load_full_features_df(**kwargs):
    """Load every per-n features pkl and combine them into one DataFrame.

    :param kwargs: corpus, queries_group, quantile or features_factory_obj: QueryFeatureFactory() object
    :return: pd.DataFrame that contains all the features values
    """
    factory = kwargs.get('features_factory_obj', None)
    corpus = kwargs.get('corpus', None)
    queries_group = kwargs.get('queries_group', None)
    quantile = kwargs.get('quantile', None)
    if factory:
        features_obj = factory
        # Derive corpus/group from the supplied factory object
        corpus = features_obj.corpus
        queries_group = features_obj.queries_group
    else:
        assert corpus and queries_group and quantile, f"Can't create a factory object from Corpus={corpus}, " \
                                                      f"Queries group={queries_group}, Variations Quantile={quantile}"
        features_obj = RefQueryFeatureFactory(corpus, queries_group, quantile)
    pkl_dir = dp.ensure_dir(
        f'~/QppUqvProj/Results/{corpus}/test/ref/pkl_files/')
    frames = []
    jac_df = pd.DataFrame()
    for n in NUMBER_OF_DOCS:
        _file = f'{pkl_dir}/{queries_group}_queries_{corpus}_RBO_{n}_TopDocs_{n}.pkl'
        try:
            dp.ensure_file(_file)
            _df = pd.read_pickle(_file).set_index(['topic', 'qid'])
            # Turn the raw overlap count into a fraction of the top-n docs
            _df[f'Top_{n}_Docs_overlap'] = _df[f'Top_{n}_Docs_overlap'] / n
            frames.append(_df.drop('Jac_coefficient', axis=1))
            # Jac_coefficient is kept only once (from the last loaded file)
            jac_df = _df['Jac_coefficient']
        except AssertionError:
            print(f'!! Warning !! The file {_file} is missing')
    combined = pd.concat(frames + [jac_df], axis=1)
    return features_obj.divide_by_size(combined)
예제 #2
0
def check_significance(corpus, predictor, alpha=0.05):
    """Test whether the LTR candidate significantly differs from the baseline.

    :param corpus: corpus name, used to locate the results directories
    :param predictor: predictor name
    :param alpha: significance level forwarded to t_test
    :return: the result of t_test(baseline, candidate, alpha)
    """
    _base_dir = f'~/QppUqvProj/Results/{corpus}/uqvPredictions/aggregated/avg/'
    # The baseline and candidate files differ only in directory and suffix,
    # so both loads go through the same helper
    baseline_sr = _load_correlations_df(
        f'{_base_dir}/{predictor}/evaluation/',
        'simple_results_vector_for_2_folds_30_repetitions_avg.json')
    candidate_sr = _load_correlations_df(
        f'{_base_dir}/{predictor}/ltr/evaluation/',
        'simple_results_vector_for_2_folds_30_repetitions_ltr.json')
    print(f'baseline: {baseline_sr.mean()[0]:.3f}')
    return t_test(baseline_sr, candidate_sr, alpha)


def _load_correlations_df(results_dir, file_name):
    """Load a correlations JSON file into a single-column float DataFrame."""
    _dir = dp.ensure_dir(results_dir)
    _file = dp.ensure_file(f'{_dir}/{file_name}')
    with open(_file) as json_data:
        data = json.load(json_data)
    return pd.DataFrame.from_dict(data,
                                  orient='index',
                                  columns=['correlation'],
                                  dtype=float)
예제 #3
0
 def __set_paths(self):
     """Set the default file and working-directory paths.

     Assumes the standard naming convention of the project.
     """
     results_root = dp.ensure_dir(f'~/QppUqvProj/Results/{self.corpus}')
     data_root = dp.ensure_dir(f'~/QppUqvProj/data/{self.corpus}')
     # Raw QL retrieval results
     self.ql_results_file = dp.ensure_file(
         f'{results_root}/test/raw/QL.res')
     # Stemmed UQV query variations
     self.queries_txt_file = dp.ensure_file(
         f'{data_root}/queries_{self.corpus}_UQV_full.stemmed.txt')
     # self.predictions_dir = dp.ensure_dir(f'{results_root}/uqvPredictions/raw/{self.predictor}')
     self.pkl_dir = dp.ensure_dir(f'{results_root}/test/raw/pkl_files/')
예제 #4
0
    def __set_paths(cls, corpus, predictor, qgroup, vars_quantile):
        """Set the default file and working-directory paths on the class.

        Assumes the standard naming convention of the project.

        :param corpus: corpus name used to build all paths
        :param predictor: predictor name, stored on the class
        :param qgroup: queries group; 'title' additionally sets base_results_dir
        :param vars_quantile: variations quantile; 'all' reuses the full variations file
        """
        cls.predictor = predictor

        _base_dir = f'~/QppUqvProj/Results/{corpus}/uqvPredictions/'
        # Raw per-variation prediction results
        cls.vars_results_dir = dp.ensure_dir(
            f'{_base_dir}/raw/{predictor}/predictions/')

        # For the 'title' group the base results come from the basic predictions
        if qgroup == 'title':
            _orig_dir = dp.ensure_dir(
                f'~/QppUqvProj/Results/{corpus}/basicPredictions/title')
            cls.base_results_dir = f'{_orig_dir}/{predictor}/predictions/'

        # Directory for the newly generated reference-list results
        cls.output_dir = dp.ensure_dir(
            f'{_base_dir}/referenceLists/{qgroup}/{vars_quantile}_vars/')

        _test_dir = f'~/QppUqvProj/Results/{corpus}/test'
        # CV folds mapping file
        cls.folds = dp.ensure_file(f'{_test_dir}/2_folds_30_repetitions.json')

        # AP file for the cross validation process
        cls.ap_file = dp.ensure_file(f'{_test_dir}/ref/QLmap1000-{qgroup}')

        # cls.features = '{}/raw/query_features_{}_uqv_legal.JSON'.format(_test_dir, corpus)
        # cls.features = f'{_test_dir}/ref/{qgroup}_query_features_{corpus}_uqv.JSON'
        # Features file used for the LTR prediction
        cls.features = dp.ensure_file(
            f'{_test_dir}/ref/{qgroup}_query_{vars_quantile}_variations_features_{corpus}_uqv.JSON'
        )

        cls.geo_mean_file = dp.ensure_file(
            f'{_base_dir}/raw/geo/predictions/predictions-20000')

        # The variations file is used in the filter function - it consists of all the vars w/o the query at hand
        _query_vars = f'~/QppUqvProj/data/{corpus}/queries_{corpus}_UQV_wo_{qgroup}.txt'
        cls.query_vars_file = os.path.normpath(os.path.expanduser(_query_vars))
        dp.ensure_file(cls.query_vars_file)

        _queries2predict = f'~/QppUqvProj/data/{corpus}/queries_{corpus}_{qgroup}.txt'
        cls.queries2predict_file = dp.ensure_file(_queries2predict)

        if vars_quantile == 'all':
            # 'all' means no quantile filtering - reuse the full variations file
            cls.quantile_vars_file = cls.query_vars_file
        else:
            _quantile_vars = f'~/QppUqvProj/data/{corpus}/queries_{corpus}_UQV_{vars_quantile}_variants.txt'
            cls.quantile_vars_file = os.path.normpath(
                os.path.expanduser(_quantile_vars))
            dp.ensure_file(cls.quantile_vars_file)

        # True (raw) AP values file
        cls.real_ap_file = dp.ensure_file(
            f'~/QppUqvProj/Results/{corpus}/test/raw/QLmap1000')

        cls.geo_predictions_dir = dp.ensure_dir(
            f'{_base_dir}/referenceLists/{qgroup}/{vars_quantile}_vars/sim_as_pred/geo/predictions'
        )
예제 #5
0
def features_loader(file_to_load, corpus):
    """Load a query-features JSON file into a (topic, qid)-indexed DataFrame.

    :param file_to_load: explicit features file path, or None for the default
    :param corpus: corpus name used to build the default file name
    :return: pd.DataFrame indexed by (topic, qid), sorted by both levels
    """
    if file_to_load is None:
        features_file = dp.ensure_file('features_{}_uqv.JSON'.format(corpus))
    else:
        features_file = dp.ensure_file(file_to_load)

    features_df = pd.read_json(features_file, dtype={'topic': str, 'qid': str})
    features_df = (features_df.reset_index(drop=True)
                   .set_index(['topic', 'qid'])
                   # Strip any '-suffix' from the topic level of the index
                   .rename(index=lambda t: t.split('-')[0], level=0)
                   .sort_values(['topic', 'qid'], axis=0))
    return features_df
예제 #6
0
 def generate_results_df(self, cores=None, load_from_pkl=None):
     """Return the full results DataFrame, loading from pkl when possible.

     :param cores: number of worker processes for the multiprocessing pool
     :param load_from_pkl: when truthy, try loading the cached pkl first
     :return: pd.DataFrame with results for every (order, predictor, similarity)
     """
     _pkl_file = f'{self.data_dir}/pkl_files/full_results_df_{self.max_n}_{self.corpus}_{self.corr_measure}.pkl'

     def _compute_and_save():
         # Compute results for every (order, predictor, similarity) combination
         # in parallel, then cache the concatenated DF.  The previous version
         # duplicated this code in both branches and called pool.close() after
         # the 'with' block had already torn the pool down.
         with mp.Pool(processes=cores) as pool:
             result = pool.starmap(
                 self._calc_general_model_result,
                 itertools.product({'asce', 'desc'}, PREDICTORS,
                                   SIMILARITY_FUNCTIONS.values()))
         _df = pd.concat(result, axis=0)
         _df.to_pickle(_pkl_file)
         return _df

     if load_from_pkl:
         try:
             file_to_load = dp.ensure_file(_pkl_file)
             full_results_df = pd.read_pickle(file_to_load)
         except AssertionError:
             print(f'\nFailed to load {_pkl_file}')
             print(f'Will generate {_pkl_file} and save')
             full_results_df = _compute_and_save()
     else:
         full_results_df = _compute_and_save()
     return full_results_df
예제 #7
0
def main(args):
    """Compute geometric-mean RM predictions for the given corpus."""
    corpus = args.corpus

    # corpus = 'ROBUST'

    # Nothing to do without a corpus
    if not corpus:
        return

    queries_file = dp.ensure_file(
        f'~/QppUqvProj/data/{corpus}/queries_{corpus}_UQV_full.txt')
    rm_probabilities_dir = dp.ensure_dir(
        f'~/QppUqvProj/Results/{corpus}/uqvPredictions/raw/RMprob')

    # queries_file = dp.ensure_file(f'~/QppUqvProj/data/{corpus}/queries.txt')
    # rm_probabilities_dir = dp.ensure_dir(f'~/QppUqvProj/Results/{corpus}/basicPredictions/title/RMprob')

    queries_obj = dp.QueriesTextParser(queries_file)
    rm_probabilities_df = dp.read_rm_prob_files(rm_probabilities_dir,
                                                number_of_docs=20000,
                                                clipping='*')

    # A 'uqv' marker in the file name flags a UQV queries file
    uqv = 'uqv' in queries_file.split('/')[-1].lower()

    results_df = geo_mean(queries_obj, rm_probabilities_df)
    write_predictions(results_df, corpus, uqv)
예제 #8
0
 def __init__(self, folds_map_file=None, k=2, rep=30, predictions_dir=None, test='pearson', ap_file=None,
              generate_folds=False, **kwargs):
     """Initialize the cross-validation object.

     :param folds_map_file: path to the CV folds mapping file (required)
     :param k: number of folds
     :param rep: number of repetitions
     :param predictions_dir: directory with the prediction files (required)
     :param test: correlation measure name
     :param ap_file: AP results file; its '-suffix' (if any) selects the AP function
     :param generate_folds: when truthy, always (re)generate the folds file
     """
     logging.debug("testing logger")
     self.k = k
     self.rep = rep
     self.test = test
     assert predictions_dir, 'Specify predictions dir'
     assert folds_map_file, 'Specify path for CV folds file'
     predictions_dir = os.path.abspath(os.path.normpath(os.path.expanduser(predictions_dir)))
     assert os.listdir(predictions_dir), f'{predictions_dir} is empty'
     self.output_dir = dp.ensure_dir(predictions_dir.replace('predictions', 'evaluation'))
     if ap_file:
         self.full_set = self._build_full_set(predictions_dir, ap_file)
         if '-' in ap_file:
             self.ap_func = ap_file.split('-')[-1]
         else:
             self.ap_func = 'basic'
     else:
         self.full_set = self._build_full_set(predictions_dir)
     if generate_folds:
         self.index = self.full_set.index
         self.folds_file = self._generate_k_folds()
         self.__load_k_folds()
     else:
         try:
             self.folds_file = dp.ensure_file(folds_map_file)
         except AssertionError:
             # dp.ensure_file raises AssertionError on a missing file (it was
             # previously caught as FileExistsError, which never fires, and
             # the promised generation never happened - leaving folds_file
             # unset before __load_k_folds)
             print("The folds file specified doesn't exist, going to generate the file and save")
             self.index = self.full_set.index
             self.folds_file = self._generate_k_folds()
         self.__load_k_folds()
예제 #9
0
    def __set_paths(cls, corpus):
        """Set the default file and working-directory paths on the class.

        Assumes the standard naming convention of the project.
        """
        test_dir = dp.ensure_dir(f'~/QppUqvProj/Results/{corpus}/test/')

        # AP file for the cross validation process
        cls.query_ap_file = dp.ensure_file(f'{test_dir}/ref/QLmap1000-title')
        # CV folds mapping file
        cls.cv_map_file = dp.ensure_file(f'{test_dir}/2_folds_30_repetitions.json')
        # The data dir for the Graphs
        cls.data_dir = dp.ensure_dir(f'~/QppUqvProj/Graphs/{corpus}/data')
        # The results base dir for the Graphs
        cls.results_dir = dp.ensure_dir(f'~/QppUqvProj/Graphs/{corpus}/referenceLists/title/all_vars/general')
        cls.raw_res_base_dir = dp.ensure_dir(
            f'~/QppUqvProj/Results/{corpus}/uqvPredictions/referenceLists/title/all_vars/general')

        # True AP values file
        cls.true_ap_file = dp.ensure_file(
            f'~/QppUqvProj/Results/{corpus}/test/basic/QLmap1000')
예제 #10
0
 def load_per_topic_df(self):
     """Load the per-topic correlations DataFrame from its pkl file.

     :return: pd.DataFrame, or None when the pkl file does not exist
     """
     # Build the path once - it was previously duplicated in the try and in
     # the warning message
     _pkl_file = (f'{self.output_dir}/per_topic_correlations_for_{self.k}'
                  f'_folds_{self.rep}_repetitions_pageRank.pkl')
     try:
         inter_res_file = dp.ensure_file(_pkl_file)
     except AssertionError:
         logging.warning(f"File {_pkl_file} doesn't exist")
         return None
     return pd.read_pickle(inter_res_file)
예제 #11
0
    def __set_graph_paths(cls, corpus, predictor, qgroup, direct, n):
        """Set the default file and working-directory paths on the class.

        Assumes the standard naming convention of the project.

        :param corpus: corpus name used to build all paths
        :param predictor: predictor name, stored on the class
        :param qgroup: queries group
        :param direct: graph direction component of the data dir
        :param n: number of variations
        """
        cls.predictor = predictor

        _corpus_res_dir = dp.ensure_dir(
            f'~/QppUqvProj/Results/{corpus}/uqvPredictions/')
        _corpus_dat_dir = dp.ensure_dir(f'~/QppUqvProj/data/{corpus}')

        _graphs_base_dir = dp.ensure_dir(f'~/QppUqvProj/Graphs/{corpus}')
        _graphs_dat_dir = dp.ensure_dir(f'{_graphs_base_dir}/data/{direct}')

        # Prediction results of all UQV query variants
        cls.vars_results_dir = dp.ensure_dir(
            f'{_corpus_res_dir}/raw/{predictor}/predictions/')

        # Prediction results of the queries to be predicted
        _orig_dir = dp.ensure_dir(
            f'~/QppUqvProj/Results/{corpus}/basicPredictions/{qgroup}')
        cls.base_results_dir = f'{_orig_dir}/{predictor}/predictions/'

        # The directory to save the new results
        cls.output_dir = dp.ensure_dir(
            f'{_graphs_base_dir}/referenceLists/{qgroup}/{direct}/{n}_vars')

        # The files for used for the LTR and CV
        _test_dir = f'~/QppUqvProj/Results/{corpus}/test'
        cls.folds = dp.ensure_file(f'{_test_dir}/2_folds_30_repetitions.json')
        cls.ap_file = dp.ensure_file(f'{_test_dir}/ref/QLmap1000-{qgroup}')

        # The features file used for prediction
        cls.features = dp.ensure_file(
            f'{_graphs_dat_dir}/features/{qgroup}_query_{n}_variations_features_{corpus}_uqv.JSON'
        )

        # Fixed: the path previously started with 'QppUqvProj/...' (no home
        # prefix), making it relative to the CWD; it is now built from
        # _corpus_res_dir like the rest of the project's paths
        cls.geo_mean_file = dp.ensure_file(
            f'{_corpus_res_dir}/raw/geo/predictions/predictions-20000'
        )

        # The variations file is used in the filter function - it consists of all the vars w/o the query at hand
        cls.query_vars_file = dp.ensure_file(
            f'{_graphs_dat_dir}/queries/queries_wo_{qgroup}_{n}_vars.txt')
        cls.quantile_vars_file = cls.query_vars_file

        _queries2predict = f'~/QppUqvProj/data/{corpus}/queries_{corpus}_{qgroup}.txt'
        cls.queries2predict_file = dp.ensure_file(_queries2predict)

        # True (raw) AP values file
        cls.real_ap_file = dp.ensure_file(
            f'~/QppUqvProj/Results/{corpus}/test/raw/QLmap1000')

        cls.geo_predictions_dir = dp.ensure_dir(
            f'{_corpus_res_dir}/referenceLists/{qgroup}/all_vars/sim_as_pred/geo/predictions'
        )
예제 #12
0
    def __set_paths(cls, corpus, group):
        """Set the default file and working-directory paths on the class."""
        test_dir = dp.ensure_dir(
            f'~/QppUqvProj/Results/{corpus}/test/')

        # Basic predictions dir
        cls.basic_predictions_dir = dp.ensure_dir(
            f'~/QppUqvProj/Results/{corpus}/basicPredictions/{group}/')
        # AP file to pick variations according to AP
        cls.raw_ap_file = dp.ensure_file(f'{test_dir}/raw/QLmap1000')
        # AP file for the cross validation process
        cls.query_ap_file = dp.ensure_file(
            f'{test_dir}/ref/QLmap1000-{group}')
        # CV folds mapping file
        cls.cv_map_file = dp.ensure_file(
            f'{test_dir}/2_folds_30_repetitions.json')
        # Queries file with all the variations except the ones to be predicted
        cls.queries_file = dp.ensure_file(
            f'~/QppUqvProj/data/{corpus}/queries_{corpus}_UQV_wo_{group}.txt')
        # The data dir for the Graphs
        cls.data_dir = dp.ensure_dir(f'~/QppUqvProj/Graphs/{corpus}/data')
        # The results base dir for the Graphs
        cls.results_dir = dp.ensure_dir(
            f'~/QppUqvProj/Graphs/{corpus}/referenceLists/{group}')
예제 #13
0
    def __set_paths(cls, corpus, qgroup, vars_quantile):
        """Set the default file and working-directory paths on the class.

        Assumes the standard naming convention of the project.

        :param corpus: corpus name used to build all paths
        :param qgroup: queries group
        :param vars_quantile: variations quantile; 'all' selects the full file
        """
        # cls.predictor = predictor
        _corpus_res_dir = dp.ensure_dir(f'~/QppUqvProj/Results/{corpus}')
        _corpus_dat_dir = dp.ensure_dir(f'~/QppUqvProj/data/{corpus}')

        # expanduser added: normpath alone would keep a literal '~' in the
        # stored path if the base dir is not already expanded (matches the
        # project's normpath(expanduser(...)) pattern used elsewhere)
        _results_file = f'{_corpus_res_dir}/test/raw/QL.res'
        cls.results_file = os.path.normpath(os.path.expanduser(_results_file))
        dp.ensure_file(cls.results_file)

        _title_results_file = f'{_corpus_res_dir}/test/basic/QL.res'
        cls.title_res_file = os.path.normpath(
            os.path.expanduser(_title_results_file))
        dp.ensure_file(cls.title_res_file)

        cls.queries_full_file = dp.ensure_file(
            f'{_corpus_dat_dir}/queries_{corpus}_UQV_full.stemmed.txt')

        # The variations file is used in the filter function - it consists of all the vars w/o the query at hand
        _queries_variations_file = f'{_corpus_dat_dir}/queries_{corpus}_UQV_wo_{qgroup}.txt'
        cls.queries_variations_file = dp.ensure_file(_queries_variations_file)

        # The vars quantile file is used in the filter function - it consists of the relevant vars quantile
        if vars_quantile == 'all':
            _queries_quantile_file = f'{_corpus_dat_dir}/queries_{corpus}_UQV_full.txt'
        else:
            _queries_quantile_file = f'{_corpus_dat_dir}/queries_{corpus}_UQV_{vars_quantile}_variants.txt'
        cls.queries_quantile_vars = dp.ensure_file(_queries_quantile_file)

        _queries_topic_file = f'{_corpus_dat_dir}/queries_{corpus}_{qgroup}.stemmed.txt'
        cls.queries_topic_file = dp.ensure_file(_queries_topic_file)

        _fused_results_file = f'{_corpus_res_dir}/test/fusion/QL.res'
        cls.fused_results_file = dp.ensure_file(_fused_results_file)

        # cls.output_dir = dp.ensure_dir(f'{_corpus_res_dir}/test/raw/')

        _predictions_out = f'{_corpus_res_dir}/uqvPredictions/referenceLists/{qgroup}/{vars_quantile}_vars/sim_as_pred/'
        cls.predictions_output_dir = dp.ensure_dir(_predictions_out)

        cls.pkl_dir = dp.ensure_dir(f'{_corpus_res_dir}/test/ref/pkl_files/')
예제 #14
0
 def load_similarity_features_df(self):
     """
     Try loading the features df from a file, if fails will generate a new one
     :return: pandas DF with the similarity features
     """
     sim_features_file = f'{self.pkl_dir}/similarity_features_df.pkl'
     try:
         # Happy path: the pkl exists and can be read directly
         return pd.read_pickle(dp.ensure_file(sim_features_file))
     except AssertionError:
         print(
             f'-- Failed loading {sim_features_file}, will generate and save --'
         )
         result = self.calc_features_parallel()
         result.to_pickle(sim_features_file)
         return result
예제 #15
0
 def __load_features_df(self, _file_name):
     """Load the features DF from a pkl file.

     If loading fails, generate a new DF and save it to the same file.
     """
     try:
         # Will try loading a DF, if fails will generate and save a new one
         _df = pd.read_pickle(dp.ensure_file(_file_name))
     except AssertionError:
         print(f'\nFailed to load {_file_name}')
         print(
             f'Will generate {self.pkl_dir.rsplit("/")[-1]} vars {self.queries_group}_query_features '
             f'features and save')
         _df = self._calc_features()
         _df.to_pickle(_file_name)
     n = self.top_docs_overlap
     # Normalize the raw overlap count to a fraction of the top-n docs
     _df[f'Top_{n}_Docs_overlap'] = _df[f'Top_{n}_Docs_overlap'] / n
     return _df
예제 #16
0
    def __set_graph_paths(cls, corpus, qgroup, direct, n):
        """Set the default file and working-directory paths on the class.

        Assumes the standard naming convention of the project.

        :param corpus: corpus name used to build all paths
        :param qgroup: queries group
        :param direct: graph direction component of the data dir
        :param n: number of variations (also stored as cls.number_of_vars)
        """
        # cls.predictor = predictor
        _corpus_res_dir = dp.ensure_dir(f'~/QppUqvProj/Results/{corpus}')
        _corpus_dat_dir = dp.ensure_dir(f'~/QppUqvProj/data/{corpus}')

        _graphs_base_dir = dp.ensure_dir(f'~/QppUqvProj/Graphs/{corpus}')
        _graphs_res_dir = dp.ensure_dir(
            f'{_graphs_base_dir}/referenceLists/{qgroup}/{direct}/{n}_vars')
        _graphs_dat_dir = dp.ensure_dir(f'{_graphs_base_dir}/data')

        cls.number_of_vars = n

        # expanduser added: normpath alone would keep a literal '~' in the
        # stored path if the base dir is not already expanded (matches the
        # project's normpath(expanduser(...)) pattern used elsewhere)
        _results_file = f'{_corpus_res_dir}/test/raw/QL.res'
        cls.results_file = os.path.normpath(os.path.expanduser(_results_file))
        dp.ensure_file(cls.results_file)

        _title_results_file = f'{_corpus_res_dir}/test/basic/QL.res'
        cls.title_res_file = os.path.normpath(
            os.path.expanduser(_title_results_file))
        dp.ensure_file(cls.title_res_file)

        _queries_full_file = f'{_corpus_dat_dir}/queries_{corpus}_UQV_full.stemmed.txt'
        cls.queries_full_file = dp.ensure_file(_queries_full_file)

        # The variations file is used in the filter function - it consists of all the vars w/o the query at hand
        _queries_variations_file = f'{_graphs_dat_dir}/{direct}/queries/queries_wo_{qgroup}_{n}_vars.txt'
        cls.queries_variations_file = dp.ensure_file(_queries_variations_file)
        cls.queries_quantile_vars = cls.queries_variations_file

        _queries_topic_file = f'{_corpus_dat_dir}/queries_{corpus}_{qgroup}.stemmed.txt'
        cls.queries_topic_file = dp.ensure_file(_queries_topic_file)

        _fused_results_file = f'{_corpus_res_dir}/test/fusion/QL.res'
        # _fused_results_file = f'{_corpus_res_dir}/test/fusion/all_wo_{qgroup}_fused_QL.res'
        cls.fused_results_file = dp.ensure_file(_fused_results_file)

        # cls.output_dir = dp.ensure_dir(f'{_graphs_res_dir}/test/raw/')

        cls.predictions_output_dir = dp.ensure_dir(
            f'{_graphs_res_dir}/sim_as_pred/')

        cls.pkl_dir = dp.ensure_dir(f'{_graphs_dat_dir}/pkl_files/features')
예제 #17
0
 def __initialize_basic_results_dict(self):
     """Populate self.basic_results_dict, loading it from pkl when possible.

     Falls back to recomputing the per-predictor results and pickling them
     when loading fails or self.load_from_pkl is false.
     """
     _pkl_file = f'{self.data_dir}/pkl_files/basic_results_dict_{self.corpus}_{self.corr_measure}.pkl'

     def _compute_and_save():
         # Compute the result for every predictor, then pickle the dict.
         # This code was previously duplicated in both branches below.
         for predictor in PREDICTORS:
             self.calc_single_query_result(predictor)
         with open(_pkl_file, 'wb') as handle:
             pickle.dump(self.basic_results_dict,
                         handle,
                         protocol=pickle.HIGHEST_PROTOCOL)

     if self.load_from_pkl:
         try:
             file_to_load = dp.ensure_file(_pkl_file)
             with open(file_to_load, 'rb') as handle:
                 self.basic_results_dict = pickle.load(handle)
         except AssertionError:
             print(f'\nFailed to load {_pkl_file}')
             print(f'Will generate {_pkl_file} and save')
             _compute_and_save()
     else:
         _compute_and_save()
예제 #18
0
 def generate_results_df(self, cores=4):
     """Return the lambda full-results DataFrame, loading from pkl when possible.

     :param cores: number of worker processes for the multiprocessing pool
     :return: pd.DataFrame with results for every (similarity, predictor) pair
     """
     _pkl_file = f'{self.data_dir}/pkl_files/lambda_full_results_df_{self.corpus}_{self.corr_measure}.pkl'

     def _compute_and_save():
         # Compute results for every (similarity, predictor) pair in parallel,
         # then cache the concatenated DF.  The previous version duplicated
         # this code in both branches and called pool.close() after the
         # 'with' block had already torn the pool down.
         with mp.Pool(processes=cores) as pool:
             result = pool.starmap(self.generate_graph_df,
                                   itertools.product(SIMILARITY_FUNCTIONS.values(), PREDICTORS))
         _df = pd.concat(result, axis=0)
         _df.to_pickle(_pkl_file)
         return _df

     if self.load_from_pkl:
         try:
             full_results_df = pd.read_pickle(dp.ensure_file(_pkl_file))
         except AssertionError:
             print(f'\nFailed to load {_pkl_file}')
             print(f'Will generate {_pkl_file} and save')
             full_results_df = _compute_and_save()
     else:
         full_results_df = _compute_and_save()
     return full_results_df
예제 #19
0
def main(args):
    """Filter/partition UQV query variations by AP groups and report results.

    :param args: parsed CLI arguments with the attributes
        queries, remove, ap, group, quant, stats, plot_vars
    """
    queries_txt_file = args.queries
    queries_to_remove = args.remove
    ap_file = args.ap
    queries_group = args.group
    quant_variants = args.quant
    stats = args.stats
    plot_vars = args.plot_vars

    # Guard clause: without a queries file there is nothing to parse.  This
    # also prevents a TypeError in the corpus sniffing below ('in None') and
    # the NameError the old trailing print raised on the unbound qdb/apdb.
    if not queries_txt_file:
        return

    filter_functions_dict = {
        'top': filter_top_queries,
        'low': filter_low_queries,
        'medl': filter_medl_queries,
        'medh': filter_medh_queries
    }
    # quantiles_dict = {'low': [0, 0.33], 'med': [0.33, 0.66], 'top': [0.66, 1]}
    quantiles_dict = {'low': [0, 0.5], 'high': [0.5, 1]}

    corpus = 'ROBUST' if 'ROBUST' in queries_txt_file else 'ClueWeb12B'

    def _load_title_data():
        # The title queries and their AP values are needed by both the stats
        # and the plotting branches; load them once here.
        title_queries_file = dt.ensure_file(
            f'~/QppUqvProj/data/{corpus}/queries_{corpus}_title.txt')
        title_queries_df = dt.QueriesTextParser(
            title_queries_file).queries_df
        title_ap_file = dt.ensure_file(
            f'~/QppUqvProj/Results/{corpus}/test/basic/QLmap1000')
        title_ap = dt.ResultsReader(title_ap_file, 'ap')
        return title_queries_df, title_ap

    qdb = dt.QueriesTextParser(queries_txt_file, 'uqv')
    df = add_topic_to_qdf(qdb.queries_df)  # NOTE(review): result unused - confirm side effects
    qdb.queries_df = remove_duplicates(qdb)
    if queries_to_remove:
        qdb_rm = dt.QueriesTextParser(queries_to_remove)
        qdb.queries_df = remove_q1_from_q2(qdb_rm.queries_df, qdb)
    if ap_file:
        apdb = dt.ResultsReader(ap_file, 'ap')
        if queries_group != 'title':
            qdb.queries_df = filter_functions_dict[queries_group](
                qdb.queries_df, apdb)
        elif quant_variants:
            qdb.queries_df = filter_quant_variants(
                qdb.queries_df, apdb, quantiles_dict[quant_variants])
        if stats:
            title_queries_df, title_ap = _load_title_data()
            calc_statistics(qdb.queries_df, apdb, title_queries_df,
                            title_ap, filter_functions_dict,
                            quantiles_dict, corpus)
            return
        elif plot_vars:
            title_queries_df, title_ap = _load_title_data()
            plot_variants_ap(qdb.queries_df, apdb, title_queries_df,
                             title_ap, corpus)
            return
        # Moved inside the ap_file guard: it needs apdb, which is only bound
        # when an AP file was supplied
        print_top_differences(qdb.queries_df, apdb, corpus)