Example #1
    def __init__(self,
                 train,
                 file,
                 data_id=None,
                 data_dict=None,
                 sample_rankings_per_q=1,
                 shuffle=True,
                 hot=False,
                 eval_dict=None,
                 buffer=True,
                 given_scaler=None):

        assert data_id is not None or data_dict is not None
        if data_dict is None:
            data_dict = self.get_default_data_dict(data_id=data_id)

        self.train = train

        if data_dict['data_id'] in MSLETOR or data_dict['data_id'] in MSLRWEB \
                or data_dict['data_id'] in YAHOO_LTR or data_dict['data_id'] in YAHOO_LTR_5Fold \
                or data_dict['data_id'] in ISTELLA_LTR \
                or data_dict['data_id'] == 'IRGAN_MQ2008_Semi': # supported datasets

            self.check_load_setting(data_dict, eval_dict)

            perquery_file = get_buffer_file_name(data_id=data_id,
                                                 file=file,
                                                 data_dict=data_dict)

            # derive the buffer file name; the suffix encodes the sampling and one-hot settings
            if sample_rankings_per_q > 1:
                parts = ['SP', str(sample_rankings_per_q)] + (['Hot'] if hot else [])
                torch_perquery_file = perquery_file.replace('.np', '_'.join(parts + ['.torch']))
            else:
                torch_perquery_file = perquery_file.replace('.np', '_Hot.torch' if hot else '.torch')

            if eval_dict is not None:
                mask_label = eval_dict['mask_label']
                mask_ratio = eval_dict['mask_ratio']
                mask_type = eval_dict['mask_type']
                print(eval_dict)
                if mask_label:
                    mask_label_str = '_'.join([mask_type, 'Ratio', '{:,g}'.format(mask_ratio)])
                    torch_perquery_file = torch_perquery_file.replace('.torch', '_' + mask_label_str + '.torch')
            else:
                mask_label = False

            if os.path.exists(torch_perquery_file):
                print('loading buffered file ...')
                self.list_torch_Qs = pickle_load(torch_perquery_file)
            else:
                self.list_torch_Qs = []

                scale_data = data_dict['scale_data']
                scaler_id = data_dict.get('scaler_id', None)
                list_Qs = iter_queries(in_file=file,
                                       data_dict=data_dict,
                                       scale_data=scale_data,
                                       scaler_id=scaler_id,
                                       perquery_file=perquery_file,
                                       buffer=buffer)

                for qid, doc_reprs, doc_labels in list_Qs:

                    if sample_rankings_per_q > 1:
                        assert mask_label is not True  # not supported since it is rarely used.

                        list_ranking = []
                        list_labels = []
                        for _ in range(sample_rankings_per_q):
                            des_inds = np_arg_shuffle_ties(
                                doc_labels,
                                descending=True)  # sampling by shuffling ties
                            list_ranking.append(doc_reprs[des_inds])
                            list_labels.append(doc_labels[des_inds])

                        batch_rankings = np.stack(list_ranking, axis=0)
                        batch_std_labels = np.stack(list_labels, axis=0)

                        torch_batch_rankings = torch.from_numpy(
                            batch_rankings).type(torch.FloatTensor)
                        torch_batch_std_labels = torch.from_numpy(
                            batch_std_labels).type(torch.FloatTensor)
                    else:
                        torch_batch_rankings = torch.from_numpy(
                            doc_reprs).type(torch.FloatTensor)
                        torch_batch_rankings = torch.unsqueeze(
                            torch_batch_rankings,
                            dim=0)  # a consistent batch dimension of size 1

                        torch_batch_std_labels = torch.from_numpy(
                            doc_labels).type(torch.FloatTensor)
                        torch_batch_std_labels = torch.unsqueeze(
                            torch_batch_std_labels, dim=0)

                        if mask_label:  # masking
                            if mask_type == 'rand_mask_rele':
                                torch_batch_rankings, torch_batch_std_labels = random_mask_rele_labels(
                                    batch_ranking=torch_batch_rankings,
                                    batch_label=torch_batch_std_labels,
                                    mask_ratio=mask_ratio,
                                    mask_value=0,
                                    presort=data_dict['presort'])

                            elif mask_type == 'rand_mask_all':
                                masked_res = random_mask_all_labels(
                                    batch_ranking=torch_batch_rankings,
                                    batch_label=torch_batch_std_labels,
                                    mask_ratio=mask_ratio,
                                    mask_value=0,
                                    presort=data_dict['presort'])
                                if masked_res is not None:
                                    torch_batch_rankings, torch_batch_std_labels = masked_res
                                else:
                                    continue
                            else:
                                raise NotImplementedError
                    if hot:
                        assert mask_label is not True  # not supported since it is rarely used.
                        max_rele_level = data_dict['max_rele_level']
                        assert max_rele_level is not None

                        torch_batch_std_hot_labels = get_one_hot_reprs(
                            torch_batch_std_labels)
                        batch_cnts = batch_count(
                            batch_std_labels=torch_batch_std_labels,
                            max_rele_grade=max_rele_level,
                            descending=True)

                        self.list_torch_Qs.append(
                            (qid, torch_batch_rankings, torch_batch_std_labels,
                             torch_batch_std_hot_labels, batch_cnts))
                    else:
                        self.list_torch_Qs.append((qid, torch_batch_rankings,
                                                   torch_batch_std_labels))
                # buffer the converted per-query tensors so later runs can skip the conversion
                if buffer:
                    parent_dir = Path(torch_perquery_file).parent
                    os.makedirs(parent_dir, exist_ok=True)
                    pickle_save(self.list_torch_Qs, torch_perquery_file)
        else:
            raise NotImplementedError

        self.hot = hot
        self.shuffle = shuffle
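The constructor above follows a load-or-build pattern: if a buffered .torch file already exists it is unpickled, otherwise the per-query tensors are built once and pickled for later runs. A minimal, self-contained sketch of that pattern with toy data; the file name, shapes, and labels are illustrative only, not taken from the listing:

import os
import pickle
from pathlib import Path

import numpy as np
import torch

buffer_file = 'buffer/toy_queries.torch'

if os.path.exists(buffer_file):
    with open(buffer_file, 'rb') as fh:
        list_torch_Qs = pickle.load(fh)  # reuse the cached per-query tensors
else:
    list_torch_Qs = []
    for qid in ('q1', 'q2'):
        doc_reprs = np.random.rand(5, 46).astype(np.float32)             # 5 docs, 46 features
        doc_labels = np.random.randint(0, 3, size=5).astype(np.float32)  # graded relevance labels
        # unsqueeze to keep a consistent batch dimension of size 1, as in the constructor
        batch_rankings = torch.from_numpy(doc_reprs).unsqueeze(dim=0)
        batch_labels = torch.from_numpy(doc_labels).unsqueeze(dim=0)
        list_torch_Qs.append((qid, batch_rankings, batch_labels))
    Path(buffer_file).parent.mkdir(parents=True, exist_ok=True)
    with open(buffer_file, 'wb') as fh:
        pickle.dump(list_torch_Qs, fh)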
Example #2
    def __init__(self, split_type, list_as_file, data_id=None, data_dict=None, fold_dir=None, presort=True, alpha=0.5,
                 dictQueryRepresentation=None, dictDocumentRepresentation=None, dictQueryPermutaion=None,
                 dictQueryDocumentSubtopics=None, buffer=True, add_noise=False, std_delta=1.0):
        self.presort = presort
        self.add_noise = add_noise
        ''' split-specific settings '''
        self.split_type = split_type
        self.data_id = data_dict['data_id']
        assert presort is True # since it is time-consuming to generate the ideal diversified ranking dynamically.

        if data_dict['data_id'] in TREC_DIV: # supported datasets
            torch_buffer_file = fold_dir.replace('folder', 'Bufferedfolder') + split_type.name
            if self.presort:
                torch_buffer_file = '_'.join([torch_buffer_file, 'presort', '{:,g}'.format(alpha)])
            if self.add_noise:
                torch_buffer_file = '_'.join([torch_buffer_file, 'gaussian', '{:,g}'.format(std_delta)])

            torch_buffer_file += '.torch'

            if os.path.exists(torch_buffer_file):
                print('loading buffered file ...')
                self.list_torch_Qs = pickle_load(torch_buffer_file)
            else:
                self.list_torch_Qs = []
                for qid in list_as_file:
                    np_q_repr = dictQueryRepresentation[str(qid)] # [1, 100]
                    alphaDCG = dictQueryPermutaion[str(qid)]['alphaDCG']
                    q_doc_subtopics = dictQueryDocumentSubtopics[str(qid)]
                    perm_docs = dictQueryPermutaion[str(qid)]['permutation']
                    if self.presort:
                        # print('json-alphaDCG', alphaDCG) # TODO the meaning of json-alphaDCG needs to be confirmed
                        ''' the following comparison shows that the provided permutation of docs is the ideal ranking '''
                        #print('personal-computation for json', alpha_DCG_at_k(sorted_docs=perm_docs, q_doc_subtopics=q_doc_subtopics, k=4, alpha=0.5))
                        perm_docs = get_div_ideal_ranking(pool_docs=perm_docs, q_doc_subtopics=q_doc_subtopics, alpha=alpha)
                        #print('personal-computation for ideal', alpha_DCG_at_k(sorted_docs=perm_docs, q_doc_subtopics=q_doc_subtopics, k=4, alpha=0.5))
                        #print('===')

                    # stack the document representations following the (ideal) permutation
                    list_doc_reprs = [dictDocumentRepresentation[doc] for doc in perm_docs] # each [1, 100]
                    np_doc_reprs = np.vstack(list_doc_reprs) # [permutation_size, 100]

                    q_repr = torch.from_numpy(np_q_repr).type(torch.FloatTensor)
                    doc_reprs = torch.from_numpy(np_doc_reprs).type(torch.FloatTensor)

                    if self.add_noise: # add gaussian noise
                        q_noise = torch.normal(mean=torch.zeros_like(q_repr), std=std_delta)
                        doc_noise = torch.normal(mean=torch.zeros_like(doc_reprs), std=std_delta)
                        q_repr = torch.add(q_repr, q_noise)
                        doc_reprs = torch.add(doc_reprs, doc_noise)

                    np_rele_mat = to_matrix(perm_docs=perm_docs, q_doc_subtopics=q_doc_subtopics)
                    q_doc_rele_mat = torch.from_numpy(np_rele_mat).type(torch.FloatTensor)
                    self.list_torch_Qs.append((qid, q_repr, perm_docs, doc_reprs, alphaDCG, q_doc_subtopics, q_doc_rele_mat))

                # buffer the converted per-query tensors so later runs can skip the conversion
                if buffer:
                    parent_dir = Path(torch_buffer_file).parent
                    os.makedirs(parent_dir, exist_ok=True)
                    pickle_save(self.list_torch_Qs, torch_buffer_file)
        else:
            raise NotImplementedError
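When add_noise is set, Example #2 perturbs the query and document representations with element-wise Gaussian noise drawn by torch.normal around a zero-mean tensor. A self-contained sketch of just that augmentation step; the shapes and the std_delta value are illustrative:

import torch

std_delta = 1.0
q_repr = torch.rand(1, 100)     # query representation, [1, 100]
doc_reprs = torch.rand(8, 100)  # 8 document representations, [8, 100]

# torch.normal draws one sample per element of the mean tensor, with the given std
q_noise = torch.normal(mean=torch.zeros_like(q_repr), std=std_delta)
doc_noise = torch.normal(mean=torch.zeros_like(doc_reprs), std=std_delta)

q_repr = torch.add(q_repr, q_noise)
doc_reprs = torch.add(doc_reprs, doc_noise)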
Example #3
def iter_queries(in_file,
                 data_dict=None,
                 scale_data=None,
                 scaler_id=None,
                 perquery_file=None,
                 buffer=True):
    '''
    Transforms an iterator of rows into an iterator of queries (i.e., the unit of all documents and labels associated
    with the same query). Each query is represented by a (qid, feature_mat, std_label_vec) tuple.
    :param in_file: the input file with one document per line
    :param data_dict: loading settings, e.g., presort, min_docs, min_rele, unknown_as_zero, binary_rele, has_comment
    :param scale_data: whether to perform query-level feature scaling, say normalization
    :param scaler_id: the type of scaler to use, e.g., MinMaxScaler | RobustScaler
    :param perquery_file: the buffer file caching the per-query representation
    :param buffer: whether to save the per-query representation for reuse
    :return: a list of (qid, feature_mat, std_label_vec) tuples
    '''
    if perquery_file is not None and os.path.exists(perquery_file): return pickle_load(perquery_file)

    scaler = get_scaler(scaler_id=scaler_id) if scale_data else None
    presort, min_docs, min_rele = data_dict['presort'], data_dict['min_docs'], data_dict['min_rele']
    unknown_as_zero, binary_rele, has_comment = data_dict['unknown_as_zero'], data_dict['binary_rele'], data_dict['has_comment']

    # queries are clipped when a minimum number of documents or of relevant documents is required
    clip_query = (min_docs is not None and min_docs > 0) or (min_rele is not None and min_rele > 0)

    list_Qs = []
    with open(in_file, encoding='iso-8859-1') as file_obj:
        dict_data = dict()
        if has_comment:
            all_features_mat, all_labels_vec, qids, docids = parse_letor(
                file_obj.readlines(), has_comment=True)

            for i in range(len(qids)):
                f_vec = all_features_mat[i, :]
                std_s = all_labels_vec[i]
                qid = qids[i]
                docid = docids[i]

                if qid in dict_data:
                    dict_data[qid].append((std_s, docid, f_vec))
                else:
                    dict_data[qid] = [(std_s, docid, f_vec)]

            del all_features_mat
            # keep unique qids in their first-seen (sequential) order
            seen = set()
            seen_add = seen.add
            qids_unique = [x for x in qids if not (x in seen or seen_add(x))]

            for qid in qids_unique:
                tmp = list(zip(*dict_data[qid]))

                list_labels_per_q = tmp[0]
                if data_dict['data_id'] in MSLETOR_LIST:
                    ''' convert the original rank-position into grade-labels '''
                    ranking_size = len(list_labels_per_q)
                    list_labels_per_q = [
                        ranking_size - r for r in list_labels_per_q
                    ]

                #list_docids_per_q = tmp[1]
                list_features_per_q = tmp[2]
                feature_mat = np.vstack(list_features_per_q)

                if scale_data:
                    if data_dict['data_id'] in ISTELLA_LTR:
                        # clip possible extremely large feature values, e.g., 1.79769313486e+308
                        feature_mat = scaler.fit_transform(
                            np.clip(feature_mat, a_min=None,
                                    a_max=ISTELLA_MAX))
                    else:
                        feature_mat = scaler.fit_transform(feature_mat)

                Q = clip_query_data(qid=qid,
                                    feature_mat=feature_mat,
                                    std_label_vec=np.array(list_labels_per_q),
                                    binary_rele=binary_rele,
                                    unknown_as_zero=unknown_as_zero,
                                    clip_query=clip_query,
                                    min_docs=min_docs,
                                    min_rele=min_rele,
                                    presort=presort)
                if Q is not None:
                    list_Qs.append(Q)
        else:
            all_features_mat, all_labels_vec, qids = parse_letor(
                file_obj.readlines(), has_comment=False)

            for i in range(len(qids)):
                f_vec = all_features_mat[i, :]
                std_s = all_labels_vec[i]
                qid = qids[i]

                if qid in dict_data:
                    dict_data[qid].append((std_s, f_vec))
                else:
                    dict_data[qid] = [(std_s, f_vec)]

            del all_features_mat
            # keep unique qids in their first-seen (sequential) order
            seen = set()
            seen_add = seen.add
            qids_unique = [x for x in qids if not (x in seen or seen_add(x))]

            for qid in qids_unique:
                tmp = list(zip(*dict_data[qid]))
                list_labels_per_q = tmp[0]
                if data_dict['data_id'] in MSLETOR_LIST:
                    ''' convert the original rank-position into grade-labels '''
                    ranking_size = len(list_labels_per_q)
                    list_labels_per_q = [
                        ranking_size - r for r in list_labels_per_q
                    ]

                list_features_per_q = tmp[1]
                feature_mat = np.vstack(list_features_per_q)

                if scale_data: # the scaler is only created when scaling is requested
                    if data_dict['data_id'] in ISTELLA_LTR:
                        # clip possible extremely large feature values, e.g., 1.79769313486e+308
                        feature_mat = scaler.fit_transform(
                            np.clip(feature_mat, a_min=None, a_max=ISTELLA_MAX))
                    else:
                        feature_mat = scaler.fit_transform(feature_mat)

                Q = clip_query_data(qid=qid,
                                    feature_mat=feature_mat,
                                    std_label_vec=np.array(list_labels_per_q),
                                    binary_rele=binary_rele,
                                    unknown_as_zero=unknown_as_zero,
                                    clip_query=clip_query,
                                    min_docs=min_docs,
                                    min_rele=min_rele,
                                    presort=presort)
                if Q is not None:
                    list_Qs.append(Q)

    if buffer:
        assert perquery_file is not None
        parent_dir = Path(perquery_file).parent
        os.makedirs(parent_dir, exist_ok=True)
        pickle_save(list_Qs, file=perquery_file)

    return list_Qs
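Both branches of iter_queries recover the unique query ids in first-seen order with the same set-based list comprehension: since Python's `or` short-circuits and `set.add` returns None, each qid passes the filter only the first time it appears. A toy illustration with made-up qids:

seen = set()
seen_add = seen.add  # local alias avoids a repeated attribute lookup inside the comprehension
qids = ['q1', 'q1', 'q2', 'q1', 'q3', 'q2']
qids_unique = [x for x in qids if not (x in seen or seen_add(x))]
print(qids_unique)   # ['q1', 'q2', 'q3']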