Пример #1
0
def load_idlist(id_list_nm='id_list_dict_max_len_200_all',
                zero_pre_post='pre'):
    """Reload the cached id sequences and build the model input dictionary.

    The cache is expected to hold, for every feature ``col`` in
    ``EMB_keys2do``, an entry ``f"{col}_list"`` with an ``'id_list'`` array
    (padded sequences) and a ``'key2index'`` vocabulary.

    Args:
        id_list_nm: Cache name handed to ``Cache.reload_cache``.
        zero_pre_post: ``"pre"`` means the zero padding sits at the start of
            each sequence, ``"post"`` means it sits at the end. Only
            consulted when ``USE_SEQ_LENGTH < 200`` (i.e. when truncating).

    Returns:
        A tuple ``(input_dict_all, KEY2INDEX_DICT)``: the per-feature id
        arrays plus a ``'click_times'`` mask, and the per-feature
        vocabularies.

    Raises:
        NotImplementedError: If truncation is needed and ``zero_pre_post``
            is neither ``"pre"`` nor ``"post"``.
        RuntimeError: If the features do not all share one sequence length.
    """
    # id_list_dict: padded sequence features together with their vocabularies
    id_list_dict = Cache.reload_cache(file_nm=id_list_nm,
                                      base_dir=INPUT_DATA_BASE_DIR,
                                      pure_nm=True)

    # Truncate. NOTE(review): 200 is presumably the padded length of the
    # cached sequences (see the default cache name) - confirm.
    if USE_SEQ_LENGTH < 200:
        if zero_pre_post == 'pre':
            # zeros are in front, so keep the trailing USE_SEQ_LENGTH ids
            window = slice(-USE_SEQ_LENGTH, None)
        elif zero_pre_post == 'post':
            # zeros are at the end, so keep the leading USE_SEQ_LENGTH ids
            window = slice(0, USE_SEQ_LENGTH)
        else:
            raise NotImplementedError(
                f"unsupported zero_pre_post: {zero_pre_post!r}")
        for col in EMB_keys2do:
            entry = id_list_dict[col + "_list"]
            entry['id_list'] = entry['id_list'][:, window]

    KEY2INDEX_DICT = {}  # vocabulary of every sequence feature
    SEQ_LENTH_DICT = {}  # sequence length of every feature, normally equal

    for key in EMB_keys2do:
        entry = id_list_dict[f'{key}_list']
        KEY2INDEX_DICT[key] = entry['key2index']
        SEQ_LENTH_DICT[key] = entry['id_list'].shape[-1]

    distinct_lengths = set(SEQ_LENTH_DICT.values())
    if len(distinct_lengths) != 1:
        # A bare ``raise`` here would only yield "No active exception to
        # reraise"; raise the same exception type with a useful message.
        raise RuntimeError(
            "GlobalSeqLength is Not Unique!!! "
            f"Per-feature sequence lengths: {SEQ_LENTH_DICT}")
    print("GlobalSeqLength:", next(iter(distinct_lengths)))

    # Build the mask (1 where the industry id is 0) and store it under
    # click_times_list. The comparison already allocates a new array, so no
    # explicit copy of the source array is needed.
    mask = (id_list_dict['industry_list']['id_list'] == 0).astype(np.int32)
    id_list_dict['click_times_list'] = {'id_list': mask}
    del mask
    gc.collect()

    input_dict_all = {}
    for col in EMB_keys2do:
        input_dict_all[col] = id_list_dict[col + '_list']['id_list']
    # add the mask under the 'click_times' key
    input_dict_all['click_times'] = id_list_dict['click_times_list'][
        'id_list']
    return input_dict_all, KEY2INDEX_DICT
Пример #2
0
def load_datalabel():
    """Reload the cached label table and prepare training data.

    Labels in ``age`` and ``gender`` are shifted to start at 0 when their
    minimum is 1. Rows with a non-missing ``age`` form the training set,
    the remaining rows the test set. A combined ``age_gender`` label
    (``gender * 10 + age``) is added to the training set. Unless ``isTEST``
    is set, the user-id tables are written to the result directories.

    :return: ``(traindata, model_prob)`` - the training frame and a zeroed
        float32 array of shape ``(n_train + n_test, NUM_CLASSES, N_FOLDS)``
        used to store model probabilities.
    """
    label_df = Cache.reload_cache(file_nm='datalabel_with_seq_length',
                                  base_dir=INPUT_DATA_BASE_DIR,
                                  pure_nm=True)

    # Make both label columns zero-based, then verify that they are.
    for target in ('age', 'gender'):
        if label_df[target].min() == 1:
            label_df[target] = label_df[target] - 1
    for target in ('age', 'gender'):
        assert label_df[target].min() == 0

    label_df = label_df[['user_id', 'gender', 'age']]
    age_missing = label_df['age'].isna()
    train_df = label_df.loc[~age_missing].reset_index(drop=True)
    test_df = label_df.loc[age_missing].copy().reset_index(drop=True)

    for target in ('age', 'gender'):
        train_df[target] = train_df[target].astype(np.int8)
    # gender = 0 -> age_gender equals age; gender = 1 -> age_gender is age + 10
    train_df['age_gender'] = train_df['gender'] * 10 + train_df['age']
    print(
        f"traindata['age_gender'].unique(): {sorted(train_df['age_gender'].unique())}"
    )
    print(train_df.shape, test_df.shape)

    # Buffer that stores the out-of-fold and model probabilities.
    n_train, n_test = train_df.shape[0], test_df.shape[0]
    model_prob = np.zeros((n_train + n_test, NUM_CLASSES, N_FOLDS),
                          dtype='float32')

    all_uid_df = label_df[['user_id']].copy()  # row ids for model_prob
    train_uid_df = train_df[['user_id']].copy()  # row ids for the oof prob

    if isTEST:
        return train_df, model_prob

    os.makedirs(f"../../05_RESULT/META/{TRAIN_MARKER}", exist_ok=True)
    os.makedirs("../../05_RESULT/SUB", exist_ok=True)
    all_uid_df.to_csv(
        f"../../05_RESULT/META/{TRAIN_MARKER}/SAVE_all_uid_df.csv",
        index=False)
    train_uid_df.to_csv(
        f"../../05_RESULT/META/{TRAIN_MARKER}/SAVE_train_uid_df.csv",
        index=False)
    return train_df, model_prob
 def random_get_embedding_fun(self, id_list_dict):
     """Pick embedding files per column and build their embedding matrices.

     For every column in ``self.use_cols`` the files listed in
     ``self.spec_emb_dict[col]`` are always used. Further files whose name
     contains ``'user_id_' + col`` are drawn at random from the directories
     in ``self.path_list`` until ``self.max_nums[col]`` files are selected
     (or no candidates remain).

     Args:
         id_list_dict: Passed through unchanged to
             ``self.get_batch_emb_matrix`` (supplied by the caller).

     Returns:
         Dict mapping each column name to whatever
         ``self.get_batch_emb_matrix`` returns for that column.
     """
     emb_matrix_dict = {}
     for col in self.use_cols:
         sepc_embs = self.spec_emb_dict[col]  # mandatory files, always used
         # Candidate files are named like user_id_<col>...
         marker = 'user_id_' + col
         col_file_names = [
             pathi + filei
             for pathi in self.path_list
             for filei in os.listdir(pathi)
             if marker in filei
         ]
         if sepc_embs:
             # exclude the mandatory files from the random candidates
             col_file_names = list(
                 set(col_file_names).difference(set(sepc_embs)))
         random.shuffle(col_file_names)
         # Number of extra files to draw. Clamp at 0: a negative value
         # (more mandatory files than max_nums allows) would otherwise turn
         # the slice below into "all but the last k" instead of "none".
         select_nums = max(
             0,
             min(len(col_file_names),
                 self.max_nums[col] - len(sepc_embs)))
         file_to_load = sepc_embs + col_file_names[:select_nums]
         emblist = []
         for filei in file_to_load:
             try:
                 emb_i = Cache.reload_cache(file_nm=filei,
                                            base_dir='',
                                            pure_nm=False)['word_emb_dict']
             except Exception:
                 # Best effort: skip files that cannot be loaded, but let
                 # KeyboardInterrupt / SystemExit propagate.
                 print('missing! ', filei)
             else:
                 emblist.append(emb_i)
         print('processing {} shape {}'.format(col, len(emblist)))
         print(file_to_load)  # the selected files
         # NOTE(review): if a file failed to load, file_to_load is longer
         # than emblist - confirm get_batch_emb_matrix tolerates that.
         emb_matrix_all = self.get_batch_emb_matrix(
             file_to_load,
             emblist,
             id_list_dict,
             col + '_list',
             max_embs=self.max_embs[col])
         emb_matrix_dict[col] = emb_matrix_all
         del emb_matrix_all, emblist
         gc.collect()
     # key: column name; value: result of get_batch_emb_matrix for it
     return emb_matrix_dict
# Training hyper-parameters.
BATCH_SIZE = 512
SEQ_LENGTH = 150
DROPOUT = 0.3
NUM_CLASS = 20
EPOCHS = 30
LR = 1e-3

# Use the first GPU when CUDA is available, otherwise fall back to the CPU
# so the script can still run on a machine without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

##############################
###### Load embeddings #######
##############################

seq_length_creative_id = 150  # all sequences were padded to 150
id_list_dict = Cache.reload_cache(
    file_nm='../../cached_data/CACHE_id_list_dict_150_normal.pkl',
    base_dir='',
    pure_nm=False)

# Input columns that need an embedding
cols_to_emb = [
    'creative_id', 'ad_id', 'advertiser_id', 'product_id', 'product_category',
    'industry', 'time'
]
# Directories that hold the embedding files
path_list = ['../../cached_data/']
# 定义最大emb_size
max_embs = {
    'creative_id': 2000,
    'ad_id': 2000,
    'advertiser_id': 2000,
    'product_id': 2000,
def _load_merged_emb(emb_lst):
    all_emb_dict = {}
    for i, nm in enumerate(emb_lst, 1):
        all_emb_dict[f"emb_{i}"] = Cache.reload_cache(nm)
    return all_emb_dict
Пример #6
0
        return id_list, key2index

    id_list_dict = {}
    for col in tqdm(sequence_features):
        id_list, key2index = get_sequence(datalabel, col, max_len=150)
        # dict ,id_list as key index sequence key2index as words -> key index
        id_list_dict[col] = {'id_list': id_list, 'key2index': key2index}

    Cache.cache_data(id_list_dict, nm_marker='id_list_dict_150_normal')

    # ##################################################################################################################
    # get time embedding
    import datetime
    # during 2019-09-01 to 2019-11-30
    id_list_dict = Cache.reload_cache(file_nm=data_path +
                                      'CACHE_id_list_dict_150_normal.pkl',
                                      base_dir='',
                                      pure_nm=False)

    class strTimeEmb(object):
        '''
    	# time 中一些特征做onehot encoding
        周x
        是否是周末
        月
        月第x周
        教师节 中秋节 16日 9.29调休 10.1假期 10.7重阳节 10.12调休 10.28寒衣节 11.8立冬 11.17学生日 11.28感恩节
        '''
        def __init__(self, daynow):
            self.daynow = int(daynow)
            self.month = 0
            self.day = 0