Example #1
    def per_epoch_summary_step2(self, id_str, fold_k,
                                list_fold_k_train_eval_track,
                                list_fold_k_test_eval_track, do_vali,
                                list_fold_k_vali_eval_track):
        sy_prefix = '_'.join(['Fold', str(fold_k)])

        fold_k_train_eval = np.vstack(list_fold_k_train_eval_track)
        fold_k_test_eval = np.vstack(list_fold_k_test_eval_track)
        pickle_save(fold_k_train_eval,
                    file=self.dir_run +
                    '_'.join([sy_prefix, id_str, 'train_eval.np']))
        pickle_save(fold_k_test_eval,
                    file=self.dir_run +
                    '_'.join([sy_prefix, id_str, 'test_eval.np']))
        '''
        fold_k_epoch_loss = np.hstack(list_epoch_loss)
        pickle_save((fold_k_epoch_loss, train_data.__len__()),
                    file=self.dir_run + '_'.join([sy_prefix, id_str, 'epoch_loss.np']))
        '''

        if do_vali:
            fold_k_vali_eval = np.hstack(list_fold_k_vali_eval_track)
            pickle_save(fold_k_vali_eval,
                        file=self.dir_run +
                        '_'.join([sy_prefix, id_str, 'vali_eval.np']))
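A minimal, self-contained sketch of the stack-and-buffer pattern used above. `pickle_save` is not shown in this example, so it is assumed here to be a thin wrapper around the standard `pickle` module; the tracking list holds dummy per-epoch nDCG arrays, just to illustrate the shapes being stacked.

import pickle
import numpy as np

def pickle_save(target, file):
    # assumed behaviour of the helper: serialize the object to the given path
    with open(file, 'wb') as f:
        pickle.dump(target, f)

# three epochs of nDCG@{1, 3, 5, 10} tracked on the training split (dummy values)
list_fold_k_train_eval_track = [np.random.rand(4) for _ in range(3)]
fold_k_train_eval = np.vstack(list_fold_k_train_eval_track)  # shape: [3, 4]
pickle_save(fold_k_train_eval, file='_'.join(['Fold_1', 'demo', 'train_eval.np']))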
Example #2
    def fold_summary(self, fold_k, dir_run, train_data_length):
        sy_prefix = '_'.join(['Fold', str(fold_k)])

        if self.do_validation:
            fold_k_vali_eval = np.hstack(self.list_fold_k_vali_track)
            pickle_save(fold_k_vali_eval,
                        file=dir_run + '_'.join([sy_prefix, 'vali_eval.np']))

        fold_k_train_eval = np.vstack(self.list_fold_k_train_track)
        fold_k_test_eval = np.vstack(self.list_fold_k_test_track)
        pickle_save(fold_k_train_eval,
                    file=dir_run + '_'.join([sy_prefix, 'train_eval.np']))
        pickle_save(fold_k_test_eval,
                    file=dir_run + '_'.join([sy_prefix, 'test_eval.np']))

        fold_k_epoch_loss = np.hstack(self.list_epoch_loss)
        pickle_save((fold_k_epoch_loss, train_data_length),
                    file=dir_run + '_'.join([sy_prefix, 'epoch_loss.np']))
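Reading the buffered objects back is symmetric. A sketch of saving and reloading the `(epoch_loss, train_data_length)` tuple written above, using the standard `pickle` module directly under the assumption that `pickle_save`/`pickle_load` are thin wrappers around it; the per-epoch average divides by the training-set size on the assumption that each entry is a loss summed over the training data.

import pickle
import numpy as np

fold_k_epoch_loss = np.array([120.5, 96.3, 88.1])  # dummy summed loss per epoch
train_data_length = 800                            # dummy number of training queries

with open('Fold_1_epoch_loss.np', 'wb') as f:      # what pickle_save is assumed to do
    pickle.dump((fold_k_epoch_loss, train_data_length), f)

with open('Fold_1_epoch_loss.np', 'rb') as f:      # what pickle_load is assumed to do
    loaded_loss, loaded_length = pickle.load(f)

print(loaded_loss / loaded_length)                 # per-query average loss per epoch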
Example #3
    def kfold_cv_eval(self,
                      data_dict=None,
                      eval_dict=None,
                      model_para_dict=None):
        """
        Evaluation based on k-fold cross validation if multiple folds exist
        :param data_dict:
        :param eval_dict:
        :param model_para_dict:
        :return:
        """
        self.display_information(data_dict=data_dict)
        self.setup_eval(data_dict=data_dict, eval_dict=eval_dict)
        model_id, data_id = self.model_parameter.model_id, data_dict['data_id']

        fold_num = data_dict['fold_num']  # updated due to the debug mode
        cutoffs, do_validation = eval_dict['cutoffs'], eval_dict[
            'do_validation']

        tree_ranker = globals()[model_id](model_para_dict)

        time_begin = datetime.datetime.now()  # timing
        l2r_cv_avg_ndcg_scores = np.zeros(len(cutoffs))  # fold average
        l2r_cv_avg_nerr_scores = np.zeros(len(cutoffs))  # fold average
        l2r_cv_avg_ap_scores = np.zeros(len(cutoffs))  # fold average
        l2r_cv_avg_p_scores = np.zeros(len(cutoffs))  # fold average

        list_all_fold_ndcg_at_ks_per_q = []
        list_all_fold_err_at_ks_per_q = []
        list_all_fold_ap_at_ks_per_q = []
        list_all_fold_p_at_ks_per_q = []

        for fold_k in range(1, fold_num + 1):
            # determine the file paths
            file_train, file_vali, file_test = self.determine_files(
                data_dict=data_dict, fold_k=fold_k)

            self.update_save_model_dir(data_dict=data_dict, fold_k=fold_k)

            y_test, group_test, y_pred = tree_ranker.run(
                fold_k=fold_k,
                file_train=file_train,
                file_vali=file_vali,
                file_test=file_test,
                data_dict=data_dict,
                eval_dict=eval_dict,
                save_model_dir=self.save_model_dir)

            fold_avg_ndcg_at_ks, fold_avg_nerr_at_ks, fold_avg_ap_at_ks, fold_avg_p_at_ks, \
            list_ndcg_at_ks_per_q, list_err_at_ks_per_q, list_ap_at_ks_per_q, list_p_at_ks_per_q = \
                self.cal_metric_at_ks(model_id=model_id, all_std_labels=y_test, all_preds=y_pred,
                                      group=group_test, ks=cutoffs)

            performance_list = [model_id] if data_id in YAHOO_LTR or data_id in ISTELLA_LTR \
                else [model_id + ' Fold-' + str(fold_k)]

            for i, co in enumerate(cutoffs):
                performance_list.append('\nnDCG@{}:{:.4f}'.format(
                    co, fold_avg_ndcg_at_ks[i]))
            for i, co in enumerate(cutoffs):
                performance_list.append('\nnERR@{}:{:.4f}'.format(
                    co, fold_avg_nerr_at_ks[i]))
            for i, co in enumerate(cutoffs):
                performance_list.append('\nMAP@{}:{:.4f}'.format(
                    co, fold_avg_ap_at_ks[i]))
            for i, co in enumerate(cutoffs):
                performance_list.append('\nP@{}:{:.4f}'.format(
                    co, fold_avg_p_at_ks[i]))

            performance_str = '\t'.join(performance_list)
            print('\n\t', performance_str)

            l2r_cv_avg_ndcg_scores = np.add(
                l2r_cv_avg_ndcg_scores,
                fold_avg_ndcg_at_ks)  # sum for later cv-performance
            l2r_cv_avg_nerr_scores = np.add(
                l2r_cv_avg_nerr_scores,
                fold_avg_nerr_at_ks)  # sum for later cv-performance
            l2r_cv_avg_ap_scores = np.add(
                l2r_cv_avg_ap_scores,
                fold_avg_ap_at_ks)  # sum for later cv-performance
            l2r_cv_avg_p_scores = np.add(
                l2r_cv_avg_p_scores,
                fold_avg_p_at_ks)  # sum for later cv-performance

            list_all_fold_ndcg_at_ks_per_q.extend(list_ndcg_at_ks_per_q)
            list_all_fold_err_at_ks_per_q.extend(list_err_at_ks_per_q)
            list_all_fold_ap_at_ks_per_q.extend(list_ap_at_ks_per_q)
            list_all_fold_p_at_ks_per_q.extend(list_p_at_ks_per_q)

        time_end = datetime.datetime.now()  # overall timing
        elapsed_time_str = str(time_end - time_begin)
        print('Elapsed time:\t', elapsed_time_str + "\n")

        print()  # begin to print either cv or average performance
        l2r_cv_avg_ndcg_scores = np.divide(l2r_cv_avg_ndcg_scores, fold_num)
        l2r_cv_avg_nerr_scores = np.divide(l2r_cv_avg_nerr_scores, fold_num)
        l2r_cv_avg_ap_scores = np.divide(l2r_cv_avg_ap_scores, fold_num)
        l2r_cv_avg_p_scores = np.divide(l2r_cv_avg_p_scores, fold_num)

        if do_validation:
            eval_prefix = str(fold_num) + '-fold cross validation scores:'
        else:
            eval_prefix = str(fold_num) + '-fold average scores:'

        print(
            model_id, eval_prefix,
            self.result_to_str(list_scores=l2r_cv_avg_ndcg_scores,
                               list_cutoffs=cutoffs,
                               metric_str='nDCG'))
        print(
            model_id, eval_prefix,
            self.result_to_str(list_scores=l2r_cv_avg_nerr_scores,
                               list_cutoffs=cutoffs,
                               metric_str='nERR'))
        print(
            model_id, eval_prefix,
            self.result_to_str(list_scores=l2r_cv_avg_ap_scores,
                               list_cutoffs=cutoffs,
                               metric_str='MAP'))
        print(
            model_id, eval_prefix,
            self.result_to_str(list_scores=l2r_cv_avg_p_scores,
                               list_cutoffs=cutoffs,
                               metric_str='P'))

        all_fold_ndcg_at_ks_per_q = np.vstack(list_all_fold_ndcg_at_ks_per_q)
        all_fold_err_at_ks_per_q = np.vstack(list_all_fold_err_at_ks_per_q)
        all_fold_ap_at_ks_per_q = np.vstack(list_all_fold_ap_at_ks_per_q)
        all_fold_p_at_ks_per_q = np.vstack(list_all_fold_p_at_ks_per_q)

        pickle_save(
            all_fold_ndcg_at_ks_per_q,
            file=self.output_root +
            '_'.join([data_id, model_id, 'all_fold_ndcg_at_ks_per_q.np']))
        pickle_save(
            all_fold_err_at_ks_per_q,
            file=self.output_root +
            '_'.join([data_id, model_id, 'all_fold_err_at_ks_per_q.np']))
        pickle_save(
            all_fold_ap_at_ks_per_q,
            file=self.output_root +
            '_'.join([data_id, model_id, 'all_fold_ap_at_ks_per_q.np']))
        pickle_save(all_fold_p_at_ks_per_q,
                    file=self.output_root +
                    '_'.join([data_id, model_id, 'all_fold_p_at_ks_per_q.np']))

        return l2r_cv_avg_ndcg_scores, l2r_cv_avg_nerr_scores, l2r_cv_avg_ap_scores, l2r_cv_avg_p_scores
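The cross-validation averaging in `kfold_cv_eval` is a running sum over folds divided by the fold count. A self-contained sketch of that arithmetic, with made-up per-fold metric values standing in for the output of `cal_metric_at_ks`.

import numpy as np

cutoffs = [1, 3, 5, 10]
fold_num = 5
l2r_cv_avg_ndcg_scores = np.zeros(len(cutoffs))  # fold average

for fold_k in range(1, fold_num + 1):
    fold_avg_ndcg_at_ks = np.random.rand(len(cutoffs))  # stand-in per-fold nDCG@k values
    l2r_cv_avg_ndcg_scores = np.add(l2r_cv_avg_ndcg_scores, fold_avg_ndcg_at_ks)

l2r_cv_avg_ndcg_scores = np.divide(l2r_cv_avg_ndcg_scores, fold_num)
print('\t'.join('nDCG@{}:{:.4f}'.format(co, s) for co, s in zip(cutoffs, l2r_cv_avg_ndcg_scores)))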
Example #4
    def kfold_cv_eval(self,
                      data_dict=None,
                      eval_dict=None,
                      sf_para_dict=None,
                      model_para_dict=None):
        """
        Evaluation of learning-to-rank methods via k-fold cross validation if there are k folds, otherwise a single fold.
        :param data_dict:       settings w.r.t. data
        :param eval_dict:       settings w.r.t. evaluation
        :param sf_para_dict:    settings w.r.t. scoring function
        :param model_para_dict: settings w.r.t. the ltr_adhoc model
        :return:
        """
        self.display_information(data_dict, model_para_dict)
        self.check_consistency(data_dict, eval_dict, sf_para_dict)
        self.setup_eval(data_dict, eval_dict, sf_para_dict, model_para_dict)

        model_id = model_para_dict['model_id']
        fold_num = data_dict['fold_num']
        # for quick access of common evaluation settings
        epochs, loss_guided = eval_dict['epochs'], eval_dict['loss_guided']
        vali_k, log_step, cutoffs = eval_dict['vali_k'], eval_dict[
            'log_step'], eval_dict['cutoffs']
        do_vali, do_summary = eval_dict['do_validation'], eval_dict[
            'do_summary']

        ranker = self.load_ranker(model_para_dict=model_para_dict,
                                  sf_para_dict=sf_para_dict)

        time_begin = datetime.datetime.now()  # timing
        l2r_cv_avg_scores = np.zeros(len(cutoffs))  # fold average

        for fold_k in range(1, fold_num + 1):  # evaluation over k-fold data
            ranker.reset_parameters()  # reset with the same random initialization

            train_data, test_data, vali_data = self.load_data(
                eval_dict, data_dict, fold_k)

            if do_vali:
                fold_optimal_ndcgk = 0.0
            if do_summary:
                list_epoch_loss, list_fold_k_train_eval_track, \
                list_fold_k_test_eval_track, list_fold_k_vali_eval_track = [], [], [], []
            if not do_vali and loss_guided:
                first_round = True
                threshold_epoch_loss = torch.cuda.FloatTensor([
                    10000000.0
                ]) if self.gpu else torch.FloatTensor([10000000.0])

            for epoch_k in range(1, epochs + 1):
                torch_fold_k_epoch_k_loss, stop_training = self.train_ranker(
                    ranker=ranker,
                    train_data=train_data,
                    model_para_dict=model_para_dict,
                    epoch_k=epoch_k)

                ranker.scheduler.step()  # adaptive learning rate with step_size=40, gamma=0.5

                if stop_training:
                    print('training failed!')
                    break

                if (do_summary
                        or do_vali) and (epoch_k % log_step == 0
                                         or epoch_k == 1):  # stepwise check
                    if do_vali:  # per-step validation score
                        vali_eval_tmp = ndcg_at_k(ranker=ranker,
                                                  test_data=vali_data,
                                                  k=vali_k,
                                                  gpu=self.gpu,
                                                  device=self.device,
                                                  label_type=self.data_setting.
                                                  data_dict['label_type'])
                        vali_eval_v = vali_eval_tmp.data.numpy()
                        if epoch_k > 1:  # further validation comparison
                            curr_vali_ndcg = vali_eval_v
                            if (curr_vali_ndcg > fold_optimal_ndcgk) or (
                                    epoch_k == epochs
                                    and curr_vali_ndcg == fold_optimal_ndcgk
                            ):  # we need at least a reference, in case all zero
                                print('\t', epoch_k,
                                      '- nDCG@{} - '.format(vali_k),
                                      curr_vali_ndcg)
                                fold_optimal_ndcgk = curr_vali_ndcg
                                fold_optimal_checkpoint = '-'.join(
                                    ['Fold', str(fold_k)])
                                fold_optimal_epoch_val = epoch_k
                                ranker.save(
                                    dir=self.dir_run +
                                    fold_optimal_checkpoint + '/',
                                    name='_'.join(
                                        ['net_params_epoch',
                                         str(epoch_k)]) +
                                    '.pkl')  # buffer currently optimal model
                            else:
                                print('\t\t', epoch_k,
                                      '- nDCG@{} - '.format(vali_k),
                                      curr_vali_ndcg)

                    if do_summary:  # summarize per-step performance w.r.t. train, test
                        fold_k_epoch_k_train_ndcg_ks = ndcg_at_ks(
                            ranker=ranker,
                            test_data=train_data,
                            ks=cutoffs,
                            gpu=self.gpu,
                            device=self.device,
                            label_type=self.data_setting.
                            data_dict['label_type'])
                        np_fold_k_epoch_k_train_ndcg_ks = fold_k_epoch_k_train_ndcg_ks.cpu().numpy() \
                            if self.gpu else fold_k_epoch_k_train_ndcg_ks.data.numpy()
                        list_fold_k_train_eval_track.append(
                            np_fold_k_epoch_k_train_ndcg_ks)

                        fold_k_epoch_k_test_ndcg_ks = ndcg_at_ks(
                            ranker=ranker,
                            test_data=test_data,
                            ks=cutoffs,
                            gpu=self.gpu,
                            device=self.device,
                            label_type=self.data_setting.
                            data_dict['label_type'])
                        np_fold_k_epoch_k_test_ndcg_ks = fold_k_epoch_k_test_ndcg_ks.cpu().numpy() \
                            if self.gpu else fold_k_epoch_k_test_ndcg_ks.data.numpy()
                        list_fold_k_test_eval_track.append(
                            np_fold_k_epoch_k_test_ndcg_ks)

                        fold_k_epoch_k_loss = torch_fold_k_epoch_k_loss.cpu().numpy() \
                            if self.gpu else torch_fold_k_epoch_k_loss.data.numpy()
                        list_epoch_loss.append(fold_k_epoch_k_loss)

                        if do_vali:
                            list_fold_k_vali_eval_track.append(vali_eval_v)

                elif loss_guided:  # stopping check via epoch-loss
                    if first_round and torch_fold_k_epoch_k_loss >= threshold_epoch_loss:
                        print('Bad threshold: ', torch_fold_k_epoch_k_loss,
                              threshold_epoch_loss)

                    if torch_fold_k_epoch_k_loss < threshold_epoch_loss:
                        first_round = False
                        print('\tFold-', str(fold_k), ' Epoch-', str(epoch_k),
                              'Loss: ', torch_fold_k_epoch_k_loss)
                        threshold_epoch_loss = torch_fold_k_epoch_k_loss
                    else:
                        print('\tStopped according to epoch-loss!',
                              torch_fold_k_epoch_k_loss, threshold_epoch_loss)
                        break

            if do_summary:  # track
                sy_prefix = '_'.join(['Fold', str(fold_k)])
                fold_k_train_eval = np.vstack(list_fold_k_train_eval_track)
                fold_k_test_eval = np.vstack(list_fold_k_test_eval_track)
                pickle_save(fold_k_train_eval,
                            file=self.dir_run +
                            '_'.join([sy_prefix, 'train_eval.np']))
                pickle_save(fold_k_test_eval,
                            file=self.dir_run +
                            '_'.join([sy_prefix, 'test_eval.np']))

                fold_k_epoch_loss = np.hstack(list_epoch_loss)
                pickle_save(
                    (fold_k_epoch_loss, train_data.__len__()),
                    file=self.dir_run + '_'.join([sy_prefix, 'epoch_loss.np']))
                if do_vali:
                    fold_k_vali_eval = np.hstack(list_fold_k_vali_eval_track)
                    pickle_save(fold_k_vali_eval,
                                file=self.dir_run +
                                '_'.join([sy_prefix, 'vali_eval.np']))

            if do_vali:  # using the fold-wise optimal model for later testing based on validation data
                buffered_model = '_'.join(
                    ['net_params_epoch',
                     str(fold_optimal_epoch_val)]) + '.pkl'
                ranker.load(self.dir_run + fold_optimal_checkpoint + '/' +
                            buffered_model)
                fold_optimal_ranker = ranker
            else:  # buffer the model after a fixed number of training epochs if no validation is deployed
                fold_optimal_checkpoint = '-'.join(['Fold', str(fold_k)])
                ranker.save(dir=self.dir_run + fold_optimal_checkpoint + '/',
                            name='_'.join(['net_params_epoch',
                                           str(epoch_k)]) + '.pkl')
                fold_optimal_ranker = ranker

            torch_fold_ndcg_ks = ndcg_at_ks(
                ranker=fold_optimal_ranker,
                test_data=test_data,
                ks=cutoffs,
                gpu=self.gpu,
                device=self.device,
                label_type=self.data_setting.data_dict['label_type'])
            fold_ndcg_ks = torch_fold_ndcg_ks.data.numpy()

            performance_list = [model_id + ' Fold-' + str(fold_k)]  # fold-wise performance
            for i, co in enumerate(cutoffs):
                performance_list.append('nDCG@{}:{:.4f}'.format(
                    co, fold_ndcg_ks[i]))
            performance_str = '\t'.join(performance_list)
            print('\t', performance_str)

            l2r_cv_avg_scores = np.add(
                l2r_cv_avg_scores,
                fold_ndcg_ks)  # sum for later cv-performance

        time_end = datetime.datetime.now()  # overall timing
        elapsed_time_str = str(time_end - time_begin)
        print('Elapsed time:\t', elapsed_time_str + "\n\n")

        l2r_cv_avg_scores = np.divide(l2r_cv_avg_scores, fold_num)
        eval_prefix = str(fold_num) + '-fold cross validation scores:' if do_vali \
            else str(fold_num) + '-fold average scores:'
        print(model_id, eval_prefix,
              metric_results_to_string(list_scores=l2r_cv_avg_scores,
                                       list_cutoffs=cutoffs)
              )  # print either cv or average performance

        return l2r_cv_avg_scores
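The loss-guided branch above keeps training only while the epoch loss keeps improving on the best value seen so far. A minimal sketch of that control flow with plain floats standing in for the loss tensors; the sequence of losses is invented for illustration.

threshold_epoch_loss = 10000000.0  # stands in for the initial FloatTensor threshold
first_round = True

for epoch_k, epoch_loss in enumerate([5.0, 3.2, 2.8, 2.9], start=1):
    if first_round and epoch_loss >= threshold_epoch_loss:
        print('Bad threshold: ', epoch_loss, threshold_epoch_loss)

    if epoch_loss < threshold_epoch_loss:
        first_round = False
        print('\tEpoch-', str(epoch_k), 'Loss: ', epoch_loss)
        threshold_epoch_loss = epoch_loss
    else:
        print('\tStopped according to epoch-loss!', epoch_loss, threshold_epoch_loss)
        break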
Example #5
    def get_cv_performance(self):
        time_end = datetime.datetime.now()  # overall timing
        elapsed_time_str = str(time_end - self.time_begin)

        ndcg_cv_avg_scores = np.divide(self.ndcg_cv_avg_scores, self.fold_num)
        nerr_cv_avg_scores = np.divide(self.nerr_cv_avg_scores, self.fold_num)
        ap_cv_avg_scores = np.divide(self.ap_cv_avg_scores, self.fold_num)
        p_cv_avg_scores = np.divide(self.p_cv_avg_scores, self.fold_num)

        eval_prefix = str(self.fold_num) + '-fold cross validation scores:' if self.do_validation \
                      else str(self.fold_num) + '-fold average scores:'

        list_metric_strs = []
        list_metric_strs.append(
            metric_results_to_string(list_scores=ndcg_cv_avg_scores,
                                     list_cutoffs=self.cutoffs,
                                     metric='nDCG'))
        list_metric_strs.append(
            metric_results_to_string(list_scores=nerr_cv_avg_scores,
                                     list_cutoffs=self.cutoffs,
                                     metric='nERR'))
        list_metric_strs.append(
            metric_results_to_string(list_scores=ap_cv_avg_scores,
                                     list_cutoffs=self.cutoffs,
                                     metric='AP'))
        list_metric_strs.append(
            metric_results_to_string(list_scores=p_cv_avg_scores,
                                     list_cutoffs=self.cutoffs,
                                     metric='P'))
        metric_string = '\n'.join(list_metric_strs)
        print("\n{} {}\n{}".format(self.model_id, eval_prefix, metric_string))
        print('Elapsed time:\t', elapsed_time_str + "\n\n")

        if self.reproduce:
            torch_mat_per_q_p = torch.cat(self.list_per_q_p, dim=0)
            torch_mat_per_q_ap = torch.cat(self.list_per_q_ap, dim=0)
            torch_mat_per_q_nerr = torch.cat(self.list_per_q_nerr, dim=0)
            torch_mat_per_q_ndcg = torch.cat(self.list_per_q_ndcg, dim=0)
            #print('torch_mat_per_q_ndcg', torch_mat_per_q_ndcg.size())
            mat_per_q_p = torch_mat_per_q_p.data.numpy()
            mat_per_q_ap = torch_mat_per_q_ap.data.numpy()
            mat_per_q_nerr = torch_mat_per_q_nerr.data.numpy()
            mat_per_q_ndcg = torch_mat_per_q_ndcg.data.numpy()

            pickle_save(target=mat_per_q_p,
                        file=self.dir_run +
                        '_'.join([self.model_id, 'all_fold_p_at_ks_per_q.np']))
            pickle_save(
                target=mat_per_q_ap,
                file=self.dir_run +
                '_'.join([self.model_id, 'all_fold_ap_at_ks_per_q.np']))
            pickle_save(
                target=mat_per_q_nerr,
                file=self.dir_run +
                '_'.join([self.model_id, 'all_fold_nerr_at_ks_per_q.np']))
            pickle_save(
                target=mat_per_q_ndcg,
                file=self.dir_run +
                '_'.join([self.model_id, 'all_fold_ndcg_at_ks_per_q.np']))

        return ndcg_cv_avg_scores
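When `reproduce` is enabled, the per-query metric rows collected across folds are concatenated into a single matrix before being pickled. A short sketch of that torch.cat pattern with dummy per-fold tensors.

import torch

# dummy per-query nDCG@{1, 3, 5, 10} rows, one tensor per fold
list_per_q_ndcg = [torch.rand(10, 4), torch.rand(8, 4), torch.rand(12, 4)]

torch_mat_per_q_ndcg = torch.cat(list_per_q_ndcg, dim=0)  # shape: [30, 4]
mat_per_q_ndcg = torch_mat_per_q_ndcg.data.numpy()        # numpy copy, ready for pickle_save
print(mat_per_q_ndcg.shape)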
Example #6
    def __init__(self, split_type, list_as_file, data_id=None, data_dict=None, fold_dir=None, presort=True, alpha=0.5,
                 dictQueryRepresentation=None, dictDocumentRepresentation=None, dictQueryPermutaion=None,
                 dictQueryDocumentSubtopics=None, buffer=True, add_noise=False, std_delta=1.0):
        self.presort = presort
        self.add_noise = add_noise
        ''' split-specific settings '''
        self.split_type = split_type
        self.data_id = data_dict['data_id']
        assert presort is True # since it is time-consuming to generate the ideal diversified ranking dynamically.

        if data_dict['data_id'] in TREC_DIV: # supported datasets
            torch_buffer_file = fold_dir.replace('folder', 'Bufferedfolder') + split_type.name
            if self.presort:
                torch_buffer_file = '_'.join([torch_buffer_file, 'presort', '{:,g}'.format(alpha)])
            if self.add_noise:
                torch_buffer_file = '_'.join([torch_buffer_file, 'gaussian', '{:,g}'.format(std_delta)])

            torch_buffer_file += '.torch'

            if os.path.exists(torch_buffer_file):
                print('loading buffered file ...')
                self.list_torch_Qs = pickle_load(torch_buffer_file)
            else:
                self.list_torch_Qs = []
                for qid in list_as_file:
                    np_q_repr = dictQueryRepresentation[str(qid)] # [1, 100]
                    alphaDCG = dictQueryPermutaion[str(qid)]['alphaDCG']
                    q_doc_subtopics = dictQueryDocumentSubtopics[str(qid)]
                    perm_docs = dictQueryPermutaion[str(qid)]['permutation']
                    if self.presort:
                        # print('json-alphaDCG', alphaDCG) # TODO the meaning of json-alphaDCG needs to be confirmed
                        ''' the following comparison shows that the provided permutation of docs is the ideal ranking '''
                        #print('personal-computation for json', alpha_DCG_at_k(sorted_docs=perm_docs, q_doc_subtopics=q_doc_subtopics, k=4, alpha=0.5))
                        perm_docs = get_div_ideal_ranking(pool_docs=perm_docs, q_doc_subtopics=q_doc_subtopics, alpha=alpha)
                        #print('personal-computation for ideal', alpha_DCG_at_k(sorted_docs=perm_docs, q_doc_subtopics=q_doc_subtopics, k=4, alpha=0.5))
                        #print('===')

                    list_doc_reprs = []
                    for doc in perm_docs:
                        list_doc_reprs.append(dictDocumentRepresentation[doc]) # [1, 100]
                    np_doc_reprs = np.vstack(list_doc_reprs) # [permutation_size, 100]

                    q_repr = torch.from_numpy(np_q_repr).type(torch.FloatTensor)
                    doc_reprs = torch.from_numpy(np_doc_reprs).type(torch.FloatTensor)

                    if self.add_noise: # add gaussian noise
                        q_noise = torch.normal(mean=torch.zeros_like(q_repr), std=std_delta)
                        doc_noise = torch.normal(mean=torch.zeros_like(doc_reprs), std=std_delta)
                        q_repr = torch.add(q_repr, q_noise)
                        doc_reprs = torch.add(doc_reprs, doc_noise)

                    np_rele_mat = to_matrix(perm_docs=perm_docs, q_doc_subtopics=q_doc_subtopics)
                    q_doc_rele_mat = torch.from_numpy(np_rele_mat).type(torch.FloatTensor)
                    self.list_torch_Qs.append((qid, q_repr, perm_docs, doc_reprs, alphaDCG, q_doc_subtopics, q_doc_rele_mat))

                #print('Num of q:', len(self.list_torch_Qs))
                if buffer:
                    parent_dir = Path(torch_buffer_file).parent
                    if not os.path.exists(parent_dir):
                        os.makedirs(parent_dir)
                    pickle_save(self.list_torch_Qs, torch_buffer_file)
        else:
            raise NotImplementedError
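The `add_noise` branch perturbs the query and document representations with zero-mean Gaussian noise. A self-contained sketch of the same torch.normal usage with dummy tensors of the shapes noted in the comments above.

import torch

std_delta = 1.0
q_repr = torch.rand(1, 100)      # query representation, [1, 100]
doc_reprs = torch.rand(20, 100)  # document representations, [permutation_size, 100]

q_noise = torch.normal(mean=torch.zeros_like(q_repr), std=std_delta)
doc_noise = torch.normal(mean=torch.zeros_like(doc_reprs), std=std_delta)
q_repr = torch.add(q_repr, q_noise)
doc_reprs = torch.add(doc_reprs, doc_noise)
print(q_repr.shape, doc_reprs.shape)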
Example #7
    def __init__(self,
                 train,
                 file,
                 data_id=None,
                 data_dict=None,
                 sample_rankings_per_q=1,
                 shuffle=True,
                 hot=False,
                 eval_dict=None,
                 buffer=True,
                 given_scaler=None):

        assert data_id is not None or data_dict is not None
        if data_dict is None:
            data_dict = self.get_default_data_dict(data_id=data_id)

        self.train = train

        if data_dict['data_id'] in MSLETOR or data_dict['data_id'] in MSLRWEB \
                or data_dict['data_id'] in YAHOO_LTR or data_dict['data_id'] in YAHOO_LTR_5Fold \
                or data_dict['data_id'] in ISTELLA_LTR \
                or data_dict['data_id'] == 'IRGAN_MQ2008_Semi': # supported datasets

            self.check_load_setting(data_dict, eval_dict)

            perquery_file = get_buffer_file_name(data_id=data_id,
                                                 file=file,
                                                 data_dict=data_dict)

            if sample_rankings_per_q > 1:
                if hot:
                    torch_perquery_file = perquery_file.replace(
                        '.np', '_'.join([
                            'SP',
                            str(sample_rankings_per_q), 'Hot', '.torch'
                        ]))
                else:
                    torch_perquery_file = perquery_file.replace(
                        '.np',
                        '_'.join(['SP',
                                  str(sample_rankings_per_q), '.torch']))
            else:
                if hot:
                    torch_perquery_file = perquery_file.replace(
                        '.np', '_Hot.torch')
                else:
                    torch_perquery_file = perquery_file.replace(
                        '.np', '.torch')

            if eval_dict is not None:
                mask_label, mask_ratio, mask_type = \
                    eval_dict['mask_label'], eval_dict['mask_ratio'], eval_dict['mask_type']
                print(eval_dict)
                if mask_label:
                    mask_label_str = '_'.join(
                        [mask_type, 'Ratio', '{:,g}'.format(mask_ratio)])
                    torch_perquery_file = torch_perquery_file.replace(
                        '.torch', '_' + mask_label_str + '.torch')
            else:
                mask_label = False

            if os.path.exists(torch_perquery_file):
                print('loading buffered file ...')
                self.list_torch_Qs = pickle_load(torch_perquery_file)
            else:
                self.list_torch_Qs = []

                scale_data = data_dict['scale_data']
                scaler_id = data_dict[
                    'scaler_id'] if 'scaler_id' in data_dict else None
                list_Qs = iter_queries(in_file=file,
                                       data_dict=data_dict,
                                       scale_data=scale_data,
                                       scaler_id=scaler_id,
                                       perquery_file=perquery_file,
                                       buffer=buffer)

                list_inds = list(range(len(list_Qs)))
                for ind in list_inds:
                    qid, doc_reprs, doc_labels = list_Qs[ind]

                    if sample_rankings_per_q > 1:
                        assert mask_label is not True  # not supported since it is rarely used.

                        list_ranking = []
                        list_labels = []
                        for _ in range(sample_rankings_per_q):
                            des_inds = np_arg_shuffle_ties(
                                doc_labels,
                                descending=True)  # sampling by shuffling ties
                            list_ranking.append(doc_reprs[des_inds])
                            list_labels.append(doc_labels[des_inds])

                        batch_rankings = np.stack(list_ranking, axis=0)
                        batch_std_labels = np.stack(list_labels, axis=0)

                        torch_batch_rankings = torch.from_numpy(
                            batch_rankings).type(torch.FloatTensor)
                        torch_batch_std_labels = torch.from_numpy(
                            batch_std_labels).type(torch.FloatTensor)
                    else:
                        torch_batch_rankings = torch.from_numpy(
                            doc_reprs).type(torch.FloatTensor)
                        torch_batch_rankings = torch.unsqueeze(
                            torch_batch_rankings,
                            dim=0)  # a consistent batch dimension of size 1

                        torch_batch_std_labels = torch.from_numpy(
                            doc_labels).type(torch.FloatTensor)
                        torch_batch_std_labels = torch.unsqueeze(
                            torch_batch_std_labels, dim=0)

                        if mask_label:  # masking
                            if mask_type == 'rand_mask_rele':
                                torch_batch_rankings, torch_batch_std_labels = random_mask_rele_labels(
                                    batch_ranking=torch_batch_rankings,
                                    batch_label=torch_batch_std_labels,
                                    mask_ratio=mask_ratio,
                                    mask_value=0,
                                    presort=data_dict['presort'])

                            elif mask_type == 'rand_mask_all':
                                masked_res = random_mask_all_labels(
                                    batch_ranking=torch_batch_rankings,
                                    batch_label=torch_batch_std_labels,
                                    mask_ratio=mask_ratio,
                                    mask_value=0,
                                    presort=data_dict['presort'])
                                if masked_res is not None:
                                    torch_batch_rankings, torch_batch_std_labels = masked_res
                                else:
                                    continue
                            else:
                                raise NotImplementedError
                    if hot:
                        assert mask_label is not True  # not supported since it is rarely used.
                        max_rele_level = data_dict['max_rele_level']
                        assert max_rele_level is not None

                        torch_batch_std_hot_labels = get_one_hot_reprs(
                            torch_batch_std_labels)
                        batch_cnts = batch_count(
                            batch_std_labels=torch_batch_std_labels,
                            max_rele_grade=max_rele_level,
                            descending=True)

                        self.list_torch_Qs.append(
                            (qid, torch_batch_rankings, torch_batch_std_labels,
                             torch_batch_std_hot_labels, batch_cnts))
                    else:
                        self.list_torch_Qs.append((qid, torch_batch_rankings,
                                                   torch_batch_std_labels))
                #buffer
                #print('Num of q:', len(self.list_torch_Qs))
                if buffer:
                    parent_dir = Path(torch_perquery_file).parent
                    if not os.path.exists(parent_dir):
                        os.makedirs(parent_dir)
                    pickle_save(self.list_torch_Qs, torch_perquery_file)
        else:
            raise NotImplementedError

        self.hot = hot
        self.shuffle = shuffle
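`np_arg_shuffle_ties` is a library helper that is not shown in this example. Below is a minimal re-implementation of the idea its name suggests: order documents by descending label while randomly permuting documents that share the same label, so repeated calls yield different but equally valid ideal rankings. The helper's actual behaviour may differ in detail.

import numpy as np

def arg_shuffle_ties(labels, descending=True):
    # labels is the primary sort key; a random permutation breaks ties
    tie_breaker = np.random.permutation(len(labels))
    order = np.lexsort((tie_breaker, labels))  # ascending by label, random within ties
    return order[::-1] if descending else order

doc_labels = np.array([2., 0., 1., 2., 0.])
doc_reprs = np.random.rand(5, 46)  # five documents, 46 features each

des_inds = arg_shuffle_ties(doc_labels, descending=True)
print(doc_labels[des_inds])        # e.g. [2. 2. 1. 0. 0.]; order within ties varies per call
sampled_ranking = doc_reprs[des_inds]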
Example #8
def iter_queries(in_file,
                 data_dict=None,
                 scale_data=None,
                 scaler_id=None,
                 perquery_file=None,
                 buffer=True):
    '''
    Transforms an iterator of rows into an iterator of queries (i.e., a unit of all the documents and labels
    associated with the same query). Each query is represented by a (qid, feature_mat, std_label_vec) tuple.
    :param in_file: input file in LETOR format
    :param data_dict: settings w.r.t. data, including presort, min_docs, min_rele, unknown_as_zero (if not labeled,
                      regard the relevance degree as zero), binary_rele and has_comment
    :param scale_data: perform query-level scaling, say normalization
    :param scaler_id: the scaler to use, e.g., MinMaxScaler | RobustScaler
    :param perquery_file: buffer file for the parsed per-query objects
    :param buffer: whether to save the parsed per-query objects to perquery_file
    :return: a list of (qid, feature_mat, std_label_vec) tuples
    '''
    if os.path.exists(perquery_file): return pickle_load(perquery_file)

    if scale_data: scaler = get_scaler(scaler_id=scaler_id)
    presort, min_docs, min_rele = data_dict['presort'], data_dict['min_docs'], data_dict['min_rele']
    unknown_as_zero, binary_rele, has_comment = \
        data_dict['unknown_as_zero'], data_dict['binary_rele'], data_dict['has_comment']

    clip_query = False
    if min_rele is not None and min_rele > 0:
        clip_query = True
    if min_docs is not None and min_docs > 0:
        clip_query = True

    list_Qs = []
    with open(in_file, encoding='iso-8859-1') as file_obj:
        dict_data = dict()
        if has_comment:
            all_features_mat, all_labels_vec, qids, docids = parse_letor(
                file_obj.readlines(), has_comment=True)

            for i in range(len(qids)):
                f_vec = all_features_mat[i, :]
                std_s = all_labels_vec[i]
                qid = qids[i]
                docid = docids[i]

                if qid in dict_data:
                    dict_data[qid].append((std_s, docid, f_vec))
                else:
                    dict_data[qid] = [(std_s, docid, f_vec)]

            del all_features_mat
            # unique qids
            seen = set()
            seen_add = seen.add
            # sequential unique id
            qids_unique = [x for x in qids if not (x in seen or seen_add(x))]

            for qid in qids_unique:
                tmp = list(zip(*dict_data[qid]))

                list_labels_per_q = tmp[0]
                if data_dict['data_id'] in MSLETOR_LIST:
                    ''' convert the original rank-position into grade-labels '''
                    ranking_size = len(list_labels_per_q)
                    list_labels_per_q = [
                        ranking_size - r for r in list_labels_per_q
                    ]

                #list_docids_per_q = tmp[1]
                list_features_per_q = tmp[2]
                feature_mat = np.vstack(list_features_per_q)

                if scale_data:
                    if data_dict['data_id'] in ISTELLA_LTR:
                        # due to the possible extremely large features, e.g., 1.79769313486e+308
                        feature_mat = scaler.fit_transform(
                            np.clip(feature_mat, a_min=None,
                                    a_max=ISTELLA_MAX))
                    else:
                        feature_mat = scaler.fit_transform(feature_mat)

                Q = clip_query_data(qid=qid,
                                    feature_mat=feature_mat,
                                    std_label_vec=np.array(list_labels_per_q),
                                    binary_rele=binary_rele,
                                    unknown_as_zero=unknown_as_zero,
                                    clip_query=clip_query,
                                    min_docs=min_docs,
                                    min_rele=min_rele,
                                    presort=presort)
                if Q is not None:
                    list_Qs.append(Q)
        else:
            all_features_mat, all_labels_vec, qids = parse_letor(
                file_obj.readlines(), has_comment=False)

            for i in range(len(qids)):
                f_vec = all_features_mat[i, :]
                std_s = all_labels_vec[i]
                qid = qids[i]

                if qid in dict_data:
                    dict_data[qid].append((std_s, f_vec))
                else:
                    dict_data[qid] = [(std_s, f_vec)]

            del all_features_mat
            # unique qids
            seen = set()
            seen_add = seen.add
            # sequential unique id
            qids_unique = [x for x in qids if not (x in seen or seen_add(x))]

            for qid in qids_unique:
                tmp = list(zip(*dict_data[qid]))
                list_labels_per_q = tmp[0]
                if data_dict['data_id'] in MSLETOR_LIST:
                    ''' convert the original rank-position into grade-labels '''
                    ranking_size = len(list_labels_per_q)
                    list_labels_per_q = [
                        ranking_size - r for r in list_labels_per_q
                    ]

                list_features_per_q = tmp[1]
                feature_mat = np.vstack(list_features_per_q)

                if scale_data:  # guard mirrors the has_comment branch; scaler is only defined when scale_data is set
                    if data_dict['data_id'] in ISTELLA_LTR:
                        # due to the possible extremely large features, e.g., 1.79769313486e+308
                        feature_mat = scaler.fit_transform(
                            np.clip(feature_mat, a_min=None, a_max=ISTELLA_MAX))
                    else:
                        feature_mat = scaler.fit_transform(feature_mat)

                Q = clip_query_data(qid=qid,
                                    feature_mat=feature_mat,
                                    std_label_vec=np.array(list_labels_per_q),
                                    binary_rele=binary_rele,
                                    unknown_as_zero=unknown_as_zero,
                                    clip_query=clip_query,
                                    min_docs=min_docs,
                                    min_rele=min_rele,
                                    presort=presort)
                if Q is not None:
                    list_Qs.append(Q)

    if buffer:
        assert perquery_file is not None
        parent_dir = Path(perquery_file).parent
        if not os.path.exists(parent_dir):
            os.makedirs(parent_dir)

        pickle_save(list_Qs, file=perquery_file)

    return list_Qs
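The per-query grouping and order-preserving de-duplication of qids in `iter_queries` is the core of the function. A self-contained sketch of that pattern, with a few invented rows standing in for the output of `parse_letor`.

import numpy as np

# flat per-document rows (qid, label, feature vector), as parse_letor would yield them
rows = [('q1', 2.0, np.random.rand(46)),
        ('q2', 0.0, np.random.rand(46)),
        ('q1', 1.0, np.random.rand(46))]

dict_data = dict()
qids = []
for qid, std_s, f_vec in rows:
    dict_data.setdefault(qid, []).append((std_s, f_vec))
    qids.append(qid)

# sequential unique qids, first occurrence wins
seen = set()
seen_add = seen.add
qids_unique = [x for x in qids if not (x in seen or seen_add(x))]

list_Qs = []
for qid in qids_unique:
    list_labels_per_q, list_features_per_q = zip(*dict_data[qid])
    list_Qs.append((qid, np.vstack(list_features_per_q), np.array(list_labels_per_q)))

print([(qid, mat.shape, vec.shape) for qid, mat, vec in list_Qs])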