Пример #1
0
    def run_w2v(sentence_id, word_id, emb_size=256):
        """Train a large-window word2vec embedding and cache the result.

        :param sentence_id: sentence groupby key
        :param word_id: column used as the word
        :param emb_size: output embedding size used in w2v
        :return: None; the result of ``w2v_pro_normal`` is handed to
            ``Cache.cache_data`` under a name derived from the arguments
        """
        window = 150  # large window embedding
        # NOTE(review): training below runs with epoch=5 while the cache name
        # advertises 10 epochs -- confirm which value is intended before
        # changing either one (downstream loaders may rely on the name).
        name_epoch = 10
        embeddings = w2v_pro_normal(datalog,
                                    sentence_id=sentence_id,
                                    word_id=word_id,
                                    window=window,
                                    emb_size=emb_size,
                                    dropna=False,
                                    n_jobs=12,
                                    epoch=5)
        # Layout: EMB_DICT_<author>_<method>_<marker>_<window>WINDOW_<epoch>EPOCH_<sentence>_<word>
        cache_name = ('EMB_DICT_AZ_CBOW_TXBASE_'
                      f'{window}WINDOW_{name_epoch}EPOCH_'
                      f'{sentence_id}_{word_id}')
        Cache.cache_data(embeddings, nm_marker=cache_name)
        # Drop the local reference and force a collection pass.
        del embeddings
        gc.collect()
Пример #2
0
def run_d2v(sentence_id, word_id, marker, epoch=10, window=30, emb_size=128):
    """Train an embedding with ``d2v_pro`` and cache it under a derived name.

    :param sentence_id: sentence groupby key forwarded to ``d2v_pro``
    :param word_id: column used as the word; ``'industry'`` trains for 8 epochs
    :param marker: free-form tag embedded in the cache name
    :param epoch: number of epochs (also embedded in the cache name)
    :param window: window size forwarded to ``d2v_pro``
    :param emb_size: embedding size forwarded to ``d2v_pro``
    :return: None; the result is handed to ``Cache.cache_data``
    """
    # The name is fixed BEFORE the per-column epoch override below, so for
    # 'industry' it still carries the caller-supplied epoch value.
    # NOTE(review): confirm this name/training mismatch is intended.
    emb_name = (f'EMB_DICT_ZQ_D2V_{marker}_{window}WINDOW_'
                f'{epoch}EPOCH_{sentence_id}_{word_id}')
    print(emb_name)

    train_epoch = 8 if word_id == 'industry' else epoch
    d2v_options = dict(sentence_id=sentence_id,
                       word_id=word_id,
                       emb_size=emb_size,
                       dropna=False,
                       n_jobs=48,
                       hs=1,
                       window=window,
                       negative=10,
                       epoch=train_epoch,
                       return_model=False)
    embeddings = d2v_pro(datalog, **d2v_options)

    Cache.cache_data(embeddings, nm_marker=emb_name)
Пример #3
0
def load_idlist(id_list_nm='id_list_dict_max_len_200_all',
                zero_pre_post='pre'):
    """Load the cached padded id sequences and assemble the model inputs.

    :param id_list_nm: cache name passed to ``Cache.reload_cache``
    :param zero_pre_post: ``"pre"`` means the sequences were zero-padded at
        the start (so truncation keeps the last ``USE_SEQ_LENGTH`` steps);
        ``"post"`` means zero-padded at the end (truncation keeps the first
        ``USE_SEQ_LENGTH`` steps)
    :return: ``(input_dict_all, KEY2INDEX_DICT)``. ``input_dict_all`` maps
        every column in ``EMB_keys2do`` to its id array plus ``'click_times'``
        to an int32 mask that is 1 where the ``industry`` id equals 0.
        ``KEY2INDEX_DICT`` maps every column to its ``key2index`` vocabulary.
    :raises NotImplementedError: truncation is needed and ``zero_pre_post``
        is neither ``"pre"`` nor ``"post"``
    :raises RuntimeError: the columns do not share a single sequence length
    """
    # id_list_dict: per-column dicts holding the padded sequences
    # ('id_list') and the vocabulary ('key2index').
    id_list_dict = Cache.reload_cache(file_nm=id_list_nm,
                                      base_dir=INPUT_DATA_BASE_DIR,
                                      pure_nm=True)

    # Truncate along the sequence axis when a shorter length is requested.
    if USE_SEQ_LENGTH < 200:
        if zero_pre_post == 'pre':
            # zeros sit in front: keep the tail of every sequence
            keep = slice(-USE_SEQ_LENGTH, None)
        elif zero_pre_post == 'post':
            # zeros sit at the end: keep the head of every sequence
            keep = slice(0, USE_SEQ_LENGTH)
        else:
            raise NotImplementedError(
                f"unsupported zero_pre_post: {zero_pre_post!r}")
        for col in EMB_keys2do:
            entry = id_list_dict[col + "_list"]
            entry['id_list'] = entry['id_list'][:, keep]

    # Vocabulary of every sequence feature.
    KEY2INDEX_DICT = {
        key: id_list_dict[f'{key}_list']['key2index']
        for key in EMB_keys2do
    }
    # Sequence length of every feature; they are expected to be identical.
    seq_length_dict = {
        key: id_list_dict[f'{key}_list']['id_list'].shape[-1]
        for key in EMB_keys2do
    }

    unique_lengths = set(seq_length_dict.values())
    if len(unique_lengths) != 1:
        # Explicit RuntimeError: a bare `raise` outside an `except` block
        # raises the same type but with an unhelpful message.
        raise RuntimeError(
            f"GlobalSeqLength is Not Unique!!! lengths: {seq_length_dict}")
    print("GlobalSeqLength:", next(iter(unique_lengths)))

    # Padding mask (1 where the industry id is 0); exposed as 'click_times'.
    # The comparison already allocates a new array, so no copy is needed.
    padding_mask = (id_list_dict['industry_list']['id_list'] == 0).astype(
        np.int32)

    input_dict_all = {
        col: id_list_dict[col + '_list']['id_list']
        for col in EMB_keys2do
    }
    input_dict_all['click_times'] = padding_mask
    return input_dict_all, KEY2INDEX_DICT
Пример #4
0
    def run_w2v(sentence_id, word_id, emb_size=256):
        """Train a window-60 word2vec embedding and cache the result.

        :param sentence_id: sentence groupby key forwarded to ``w2v_pro_normal``
        :param word_id: column used as the word
        :param emb_size: output embedding size used in w2v
        :return: None; the result is handed to ``Cache.cache_data``
        """
        window, epoch = 60, 10
        embeddings = w2v_pro_normal(datalog,
                                    sentence_id=sentence_id,
                                    word_id=word_id,
                                    window=window,
                                    emb_size=emb_size,
                                    dropna=False,
                                    n_jobs=24,
                                    epoch=epoch)

        # Layout: EMB_DICT_<author>_<method>_<marker>_<window>WINDOW_<epoch>EPOCH_<sentence>_<word>
        name_parts = (
            'EMB_DICT',
            'AZ',
            'CBOW',
            'CLICK_TIMES_INCREASED',
            f'{window}WINDOW',
            f'{epoch}EPOCH',
            f'{sentence_id}',
            f'{word_id}',
        )
        Cache.cache_data(embeddings, nm_marker='_'.join(name_parts))
        # Drop the local reference and force a collection pass.
        del embeddings
        gc.collect()
Пример #5
0
 def run_w2v(sentence_id, word_id, emb_size=256, epoch=10):
     """Train embeddings with ``w2v_pro_item`` and cache the first result.

     :param sentence_id: sentence groupby key forwarded to ``w2v_pro_item``
     :param word_id: indexable pair; element 0 names the cached embedding and
         element 1 is appended to the ``CONCAT_`` marker
     :param emb_size: output embedding size
     :param epoch: number of training epochs (also embedded in the cache name)
     :return: None; only the first returned dict is handed to
         ``Cache.cache_data``
     """
     window = 60
     primary_emb, secondary_emb = w2v_pro_item(datalog,
                                               sentence_id=sentence_id,
                                               word_id=word_id,
                                               window=window,
                                               emb_size=emb_size,
                                               dropna=False,
                                               n_jobs=12,
                                               epoch=epoch)
     tag = 'CONCAT_' + word_id[1]
     cache_name = (f'EMB_DICT_AZ_cbow_{tag}_{window}WINDOW_'
                   f'{epoch}EPOCH_{sentence_id}_{word_id[0]}')
     Cache.cache_data(primary_emb, nm_marker=cache_name)
     # The second dict (the category embedding) is deliberately not cached.
     del primary_emb, secondary_emb
     gc.collect()
Пример #6
0
def load_datalabel():
    """Load the user label table and allocate the prediction buffer.

    Labels that start at 1 are shifted to start at 0, rows are split on
    whether ``age`` is present, and a combined ``age_gender`` class is added
    to the training rows. Unless ``isTEST`` is set, the user-id frames are
    also written to the result directory.

    :return: ``(traindata, model_prob)`` where ``model_prob`` is a float32
        zero array of shape ``(n_train + n_test, NUM_CLASSES, N_FOLDS)``
    """
    datalabel = Cache.reload_cache(file_nm='datalabel_with_seq_length',
                                   base_dir=INPUT_DATA_BASE_DIR,
                                   pure_nm=True)
    label_cols = ('age', 'gender')
    # Shift 1-based labels so that both columns start at 0.
    for label in label_cols:
        if datalabel[label].min() == 1:
            datalabel[label] = datalabel[label] - 1
    for label in label_cols:
        assert datalabel[label].min() == 0

    datalabel = datalabel[['user_id', 'gender', 'age']]
    # Rows without an age label form the test set.
    missing_age = datalabel['age'].isna()
    traindata = datalabel.loc[~missing_age].reset_index(drop=True)
    testdata = datalabel.loc[missing_age].copy().reset_index(drop=True)

    for label in label_cols:
        traindata[label] = traindata[label].astype(np.int8)
    # gender = 0 -> classes 0~9, gender = 1 -> classes 10~19
    traindata['age_gender'] = traindata['gender'] * 10 + traindata['age']
    print(
        f"traindata['age_gender'].unique(): {sorted(traindata['age_gender'].unique())}"
    )
    print(traindata.shape, testdata.shape)

    # Buffer that stores the per-fold class probabilities for all users.
    total_rows = traindata.shape[0] + testdata.shape[0]
    model_prob = np.zeros((total_rows, NUM_CLASSES, N_FOLDS), dtype='float32')

    if isTEST:
        return traindata, model_prob

    # Persist the user ids so the saved probabilities can be re-aligned later.
    meta_dir = f"../../05_RESULT/META/{TRAIN_MARKER}"
    os.makedirs(meta_dir, exist_ok=True)
    os.makedirs("../../05_RESULT/SUB", exist_ok=True)
    all_uid_df = datalabel[['user_id']].copy()  # rows of model_prob
    train_uid_df = traindata[['user_id']].copy()  # rows of the oof prob
    all_uid_df.to_csv(f"{meta_dir}/SAVE_all_uid_df.csv", index=False)
    train_uid_df.to_csv(f"{meta_dir}/SAVE_train_uid_df.csv", index=False)
    return traindata, model_prob
 def random_get_embedding_fun(self, id_list_dict):
     """Pick embedding files per column and build their embedding matrices.

     For every column in ``self.use_cols`` the mandatory files from
     ``self.spec_emb_dict[col]`` are always used. Further files whose name
     contains ``'user_id_' + col`` are drawn at random from
     ``self.path_list`` until ``self.max_nums[col]`` files are selected.

     :param id_list_dict: id-sequence dict supplied by the caller and passed
         through to ``self.get_batch_emb_matrix``
     :return: dict mapping each column name to whatever
         ``self.get_batch_emb_matrix`` returns for it
     """
     emb_matrix_dict = {}
     for col in self.use_cols:
         spec_embs = self.spec_emb_dict[col]  # mandatory embeddings
         # Candidate files are named like user_id_<col>...
         pattern = 'user_id_' + col
         candidates = [
             pathi + filei
             for pathi in self.path_list
             for filei in os.listdir(pathi)
             if pattern in filei
         ]
         if len(spec_embs) > 0:
             # exclude the mandatory files from the random pool
             candidates = list(set(candidates).difference(spec_embs))
         random.shuffle(candidates)
         # Number of extra files to draw. Clamp at zero: a negative count
         # would turn the slice below into "all but the last k" instead of
         # "none" when the mandatory list already exceeds max_nums.
         extra = max(
             0, min(len(candidates), self.max_nums[col] - len(spec_embs)))
         file_to_load = spec_embs + candidates[:extra]

         emblist = []
         for filei in file_to_load:
             try:
                 loaded = Cache.reload_cache(file_nm=filei,
                                             base_dir='',
                                             pure_nm=False)
                 emblist.append(loaded['word_emb_dict'])
             except Exception:
                 # Best effort: skip files that cannot be loaded, but let
                 # KeyboardInterrupt / SystemExit propagate.
                 print('missing! ', filei)
         print('processing {} shape {}'.format(col, len(emblist)))
         print(file_to_load)  # the selected files
         # NOTE(review): when a file fails to load, emblist is shorter than
         # file_to_load -- confirm get_batch_emb_matrix tolerates that.
         emb_matrix_dict[col] = self.get_batch_emb_matrix(
             file_to_load,
             emblist,
             id_list_dict,
             col + '_list',
             max_embs=self.max_embs[col])
         del emblist
         gc.collect()
     # key: column name; value: the embedding matrices built for that column
     return emb_matrix_dict
# Module-level training constants.
# NOTE(review): meanings are inferred from the names only -- confirm
# against the code that consumes them.
BATCH_SIZE = 512
SEQ_LENGTH = 150
DROPOUT = 0.3
NUM_CLASS = 20
EPOCHS = 30
LR = 1e-3

# Hard-coded to the first CUDA device.
device = torch.device("cuda:0")

##############################
######## load embeddings #####
##############################

seq_length_creative_id = 150  # all sequences were padded to length 150
# Reload the cached id-list dict from an explicit file path.
id_list_dict = Cache.reload_cache(
    file_nm='../../cached_data/CACHE_id_list_dict_150_normal.pkl',
    base_dir='',
    pure_nm=False)

# Columns that are used as model inputs
cols_to_emb = [
    'creative_id', 'ad_id', 'advertiser_id', 'product_id', 'product_category',
    'industry', 'time'
]
# Directories that are searched for embedding files
path_list = ['../../cached_data/']
# 定义最大emb_size
max_embs = {
    'creative_id': 2000,
    'ad_id': 2000,
    'advertiser_id': 2000,
    'product_id': 2000,
def _load_merged_emb(emb_lst):
    all_emb_dict = {}
    for i, nm in enumerate(emb_lst, 1):
        all_emb_dict[f"emb_{i}"] = Cache.reload_cache(nm)
    return all_emb_dict
Пример #10
0
        if max_len is None:
            max_len = int(np.percentile(id_list_length, 99))
        # pre padding , 0 before sequence
        id_list = pad_sequences(id_list,
                                maxlen=max_len,
                                padding='pre',
                                truncating='pre')
        return id_list, key2index

    # Build the padded id sequence and vocabulary for every sequence feature.
    id_list_dict = {}
    for col in tqdm(sequence_features):
        id_list, key2index = get_sequence(datalabel, col, max_len=150)
        # Per column: 'id_list' holds the padded index sequences and
        # 'key2index' the mapping that get_sequence returned with them.
        id_list_dict[col] = {'id_list': id_list, 'key2index': key2index}

    Cache.cache_data(id_list_dict, nm_marker='id_list_dict_150_normal')

    # ##################################################################################################################
    # get time embedding
    import datetime
    # during 2019-09-01 to 2019-11-30
    # Reload the id-list dict from an explicit file path under data_path.
    # NOTE(review): presumably this is the file written by the
    # Cache.cache_data call above -- confirm the naming convention.
    id_list_dict = Cache.reload_cache(file_nm=data_path +
                                      'CACHE_id_list_dict_150_normal.pkl',
                                      base_dir='',
                                      pure_nm=False)

    class strTimeEmb(object):
        '''
    	# time 中一些特征做onehot encoding
        周x
        是否是周末