Example #1
def get_embedding(f1_f2, f1):
    path = f1_f2 + '_word2vec_kv.kv'
    wv = KeyedVectors.load(path, mmap='r')
    list_df = Cache.reload_cache('CACHE_list_df_' + f1_f2 + '.pkl')
    list_df.columns = ['list', f1]
    f = open(f1_f2 + '.txt', 'r')
    ind = 0
    buf = []
    for i in f:
        tokens = i.strip().split(' ')
        buf_ = np.zeros(64)
        for j in tokens:
            buf_ = buf_ + wv[j]
        buf_ = buf_ / len(tokens)  # average the word vectors over the token count
        buf_f1 = list_df.at[ind, f1]
        buf__ = []
        buf_ = buf_.tolist()
        buf__.append(buf_)
        buf__.append(buf_f1)
        buf.append(buf__)
        ind = ind + 1
    df_f1_list = pd.DataFrame(buf)
    Cache.cache_data(df_f1_list, nm_marker='list_df_avg_' + f1_f2)
    return 0
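
# A hedged usage sketch (assumptions, not from the source): get_embedding expects numpy,
# pandas, gensim's KeyedVectors and the project's own Cache helper to be importable, plus
# three artifacts named '<f1_f2>_word2vec_kv.kv', 'CACHE_list_df_<f1_f2>.pkl' and
# '<f1_f2>.txt' on disk. The 'uid_adv_id' / 'uid' pair below is purely illustrative.
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors

get_embedding('uid_adv_id', 'uid')  # hypothetical feature pair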
Example #2
def run_w2v(df,
            sentence_id,
            word_id,
            emb_size=256,
            window=10,
            slid_window=1,
            embedding_type='w2v'):
    res_dict = get_embedding_pro(df,
                                 sentence_id=sentence_id,
                                 word_id=word_id,
                                 window=window,
                                 slide_window=slid_window,
                                 emb_size=emb_size,
                                 dropna=False,
                                 n_jobs=n_jobs,
                                 return_model=False,
                                 epoch=10,
                                 embedding_type=embedding_type)
    Cache.cache_data(
        res_dict,
        nm_marker=
        f'EMB_DICT_{window}_{slid_window}_{emb_size}_{sentence_id}_{word_id}_{embedding_type}'
    )
    return res_dict["sentence_emb_df"]
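
# Note (assumption): get_embedding_pro and n_jobs are resolved from the surrounding module
# in the original repo; neither is defined in this snippet. A call mirroring Example #6
# below would look like:
#     fe = run_w2v(df, 'uid', 'task_id', emb_size=16, window=8, embedding_type='w2v')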
Example #3
def gen_list_df(feature):
    print(f'{feature} start!')
    data = Cache.reload_cache(
        'CACHE_data_sampling_pos1_neg5.pkl')  # build sequences directly on the sampled data
    if feature == 'label':
        data.loc[data['pt_d'] >= 8, 'label'] = -1  # mask the labels of the test rows
        data['label'] = data['label'].astype(np.int8)
        data['label'] = data['label'] + 1  # shift by 1 because 0 is reserved for padding
    data = data[['uid', feature, 'pt_d']]
    gc.collect()
    print(data.shape)
    data_group = data.groupby(['uid'])
    gc.collect()
    index_list = []
    feature_list = []
    print('index_list start')
    for name, group in tqdm(data_group):
        index_list.append(name)
    print('feature_list start')
    for i in tqdm(index_list):
        index_get_group = data_group.get_group(i)
        ptd_set = set(index_get_group['pt_d'].values.flatten().tolist())
        for j in ptd_set:
            feature_list_ = []
            buf_list = []
            buf_list = index_get_group.query('pt_d < @j')[
                feature].values.flatten().tolist()  # click behaviour sequence before this row's sample
            buf_list.append(0)  # padding 0
            feature_list_.append(buf_list)  # behaviour sequence
            feature_list_.append(j)  # pt_d
            feature_list_.append(i)  # uid
            feature_list.append(feature_list_)

    list_df = pd.DataFrame(feature_list)
    del index_list, feature_list, feature_list_, data_group, index_get_group, ptd_set
    gc.collect()
    list_df.columns = ['list', 'pt_d', 'uid']
    list_df['list'] = list_df['list'].map(lambda x: [str(i) for i in x])  # convert to str
    list_df = list_df.drop_duplicates(subset=['pt_d', 'uid'])
    list_df = data.merge(list_df, how='left',
                         on=('uid', 'pt_d'))  # keep the row order of data
    # append the current day's own value to the sequence (not done for label)
    if feature != 'label':
        list_df['list'] = list_df[feature].map(
            lambda x: [str(x)]) + list_df['list']
    print('w2v start!')
    emb_size = 32  # pretrained embedding dim
    model = Word2Vec(list_df['list'].values.tolist(),
                     size=emb_size,
                     window=5,
                     workers=5,
                     min_count=1,
                     sg=0,
                     hs=0,
                     negative=5,
                     iter=5,
                     seed=0)
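    # Note (assumption about library versions): the Word2Vec call above uses gensim 3.x
    # keyword names; under gensim >= 4.0 the equivalents are vector_size=emb_size and
    # epochs=5, and the vocab below would be iterated via model.wv.key_to_index instead
    # of model.wv.vocab.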
    # 1. get id sequences and the key2index mapping for the 'list' column
    id_list, key2index = get_sequence(list_df, 'list', max_len=40)
    # 2. build the word -> embedding-vector dict
    emb_dict = {}
    for word_i in list(model.wv.vocab.keys()):
        if word_i in model.wv:
            emb_dict[word_i] = model.wv[word_i]
        else:
            emb_dict[word_i] = np.zeros(emb_size)
    # 3. save
    id_list_dict = {}
    id_list_dict['id_list'] = id_list
    id_list_dict['key2index'] = key2index
    id_list_dict['emb'] = emb_dict
    Cache.cache_data(id_list_dict, nm_marker=f'EMB_INPUTSEQ_stage2_{feature}')
    print(f'{feature} done!')
Example #4
                        np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(
        start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df
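
# Only the tail of the reduce_mem utility above survives in this snippet; a minimal,
# self-contained sketch of the same down-casting idea under the usual
# reduce_mem(df, use_float16) signature (an approximation, not the author's exact code):
import numpy as np

def reduce_mem_sketch(df, use_float16=False):
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type == object:
            continue
        c_min, c_max = df[col].min(), df[col].max()
        if str(col_type)[:3] == 'int':
            # pick the narrowest integer type that can hold the column's range
            for t in (np.int8, np.int16, np.int32, np.int64):
                if c_min > np.iinfo(t).min and c_max < np.iinfo(t).max:
                    df[col] = df[col].astype(t)
                    break
        else:
            if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(
                    np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(
        start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df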


# In[2]:

# import ipdb
# ipdb.set
train = pd.read_csv(r'./data/train_data.csv', sep='|', dtype=str)
Cache.cache_data(train, nm_marker='train_raw')

# In[3]:

test_A = pd.read_csv(r'./data/test_data_A.csv', sep='|', dtype=str)
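# add a placeholder label column (value 2) so the test rows carry a dummy value distinct
# from the train labels 0/1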
test_A.insert(0, 'label', np.ones([1000000]))
test_A['label'] = 2
Cache.cache_data(test_A, nm_marker='test_A_raw')

# In[4]:

test_B = pd.read_csv(r'./data/test_data_B.csv', sep='|', dtype=str)
test_B.insert(0, 'label', np.ones([1000000]))
test_B['label'] = 2
Cache.cache_data(test_B, nm_marker='test_B_raw')
Example #5
    mode_num = data[var].mode()[0]
    # shape_null = data.query('{}==-1'.format(var))  # .shape[0]
    # print('process sparse int: ', var, 'fillna: ', mode_num, 'fillna_shape: ', shape_null)
    data.loc[data[var] == -1, var] = mode_num
    data[var] = data[var].astype(int)

for var in dense_features:
    mode_num = int(data[var].mean())
    shape_null = data.query('{}==-1'.format(var)).shape[0]
    print('process dense int: ', var, 'fillna: ', mode_num, 'fillna_shape: ',
          shape_null)
    if shape_null > 0:
        data.loc[data[var] == -1, var] = mode_num
        data[var] = data[var].astype(int)
data = reduce_mem(data, use_float16=True)
Cache.cache_data(data, nm_marker=f'data_step_1_feature_0924_r{sample_rate}')

# ######################################################################################
# base feature

# extract relative count features
## column-level parallelism

from multiprocessing import Pool

cate_cols = [
    'task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id', 'dev_id',
    'inter_type_cd', 'slot_id', 'spread_app_id', 'tags', 'app_first_class',
    'app_second_class', 'city', 'device_name', 'career', 'gender', 'net_type',
    'residence', 'emui_dev', 'indu_name', 'cmr_0', 'cmr_1', 'cmr_2', 'cmr_3',
    'cmr_4', 'cmr_5', 'cmr_6', 'cmr_7', 'cmr_8', 'cmr_9', 'cmr_10', 'cmr_11',
Example #6
                                     n_jobs=n_jobs,
                                     return_model=False,
                                     epoch=10,
                                     embedding_type=embedding_type)
        Cache.cache_data(
            res_dict,
            nm_marker=
            f'EMB_DICT_{window}_{slid_window}_{emb_size}_{sentence_id}_{word_id}_{embedding_type}'
        )
        return res_dict["sentence_emb_df"]

    for var in sparse_features:
        fe = run_w2v(df,
                     'uid',
                     var,
                     emb_size=16,
                     window=8,
                     embedding_type='w2v')
        df = pd.concat([df, fe], axis=1)
        fe = run_w2v(df,
                     'uid',
                     var,
                     emb_size=16,
                     window=8,
                     embedding_type='fasttext')
        df = pd.concat([df, fe], axis=1)

    cols_to_save = [i for i in df.columns if i.find('EMB_') > -1]
    df = df[['index'] + cols_to_save]
    Cache.cache_data(df, nm_marker='EMB_feature0912')
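
    # Hedged usage sketch (assumption, not from the source): downstream code can reload the
    # cached frame and join it back on 'index', e.g.
    #     emb_df = Cache.reload_cache('CACHE_EMB_feature0912.pkl')
    #     full_df = base_df.merge(emb_df, on='index', how='left')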
Example #7
columns_str = datatest.columns[0]
dflisttst = []
for i in tqdm(range(datatest.shape[0])):
    dflisttst.append([
        int(j) if index != 32 else j
        for index, j in enumerate(datatest[columns_str].iloc[i].split('|'))
    ])
del datatest
gc.collect()
dflisttst = pd.DataFrame(dflisttst, columns=columns_str.split('|'))
dflist['id'] = -1  # set every train id to -1
dataall = pd.concat([dflist, dflisttst], ignore_index=True)
del dflist, dflisttst
gc.collect()
dataall = reduce_mem(dataall, use_float16=False)
Cache.cache_data(dataall, nm_marker='dataall0816')

##############################################################################################################
# quite slow!
route = []
for i in tqdm(range(dataall.shape[0])):
    route.append(dataall['communication_onlinerate'].iloc[i].split('^'))
route = pd.DataFrame(route)
route = route.fillna(-1).astype(int)
routes = []
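# np.eye(25)[ids] one-hot encodes every id in 0..24 and the row-wise sum gives a multi-hot
# count vector; missing values were filled with -1, which indexes the last row (cmr_24)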
for i in tqdm(range(route.shape[0])):
    routes.append(np.sum(np.eye(25)[route.iloc[i, :]], axis=0))
del route
gc.collect()
routes = pd.DataFrame(routes,
                      columns=['cmr_' + str(i)
Example #8
datatraintestA = Cache.reload_cache('CACHE_dataall0816.pkl')
datatest = pd.read_csv('./data/test_data_B.csv')
columns_str = datatest.columns[0]
dflisttst = []
for i in tqdm(range(datatest.shape[0])):
    dflisttst.append([
        int(j) if index != 32 else j
        for index, j in enumerate(datatest[columns_str].iloc[i].split('|'))
    ])
del datatest
gc.collect()
dflisttst = pd.DataFrame(dflisttst, columns=columns_str.split('|'))
dataall = pd.concat([datatraintestA, dflisttst], ignore_index=True)
dataall = reduce_mem(dataall, use_float16=False)
Cache.cache_data(
    dataall,
    nm_marker='dataall_stage2_0924')  # base features + id; hoping the test A and test B ids do not overlap

##############################################################################################################
# quite slow!
datatraintestA = Cache.reload_cache('CACHE_cmr0816.pkl')
route = []
for i in tqdm(range(dflisttst.shape[0])):
    route.append(dflisttst['communication_onlinerate'].iloc[i].split('^'))
route = pd.DataFrame(route)
route = route.fillna(-1).astype(int)
routes = []
for i in tqdm(range(route.shape[0])):
    routes.append(np.sum(np.eye(25)[route.iloc[i, :]], axis=0))
del route
gc.collect()
Example #9
def get_emb_matrix(col):
    """
    inputs:    
    col 需要做成预训练emb_matrix的列

    cross_emb_dict 结构:
    (embvari,embvarj)
    embvari:{key:word in dataframe,value:embvec} 就是字典

    data[col].unique() 需要转化的字典 不在原字典里的给-1 在的按大小顺序从1开始排

    得出id_list_dict + emb_matrix
    """
    vari, varj = col.split('__')
    key_to_represent_rare = -1
    words_vari = list(cross_emb_dict[col][0].keys())
    words_varj = list(cross_emb_dict[col][1].keys())
    emb_size_vari = cross_emb_dict[col][0][words_vari[0]].shape[0]
    emb_size_varj = cross_emb_dict[col][1][words_varj[0]].shape[0]
    voc_size_vari = len(words_vari)
    voc_size_varj = len(words_varj)
    list_df_vari = list(data[vari].unique())
    list_df_varj = list(data[varj].unique())
    # the emb dict must contain -1, the key that represents rare / unseen words
    if -1 not in cross_emb_dict[col][0].keys():
        # no -1 in the emb: it covers the full vocab, so compute a mean vector
        # ourselves to represent low-frequency / unseen words for embi
        vector_low_frequency_words = np.zeros((emb_size_vari, ))
        for w in words_vari:
            vector_low_frequency_words += cross_emb_dict[col][0][w]
        vector_low_frequency_words = vector_low_frequency_words / voc_size_vari
        # add the new key / value to the emb dict
        cross_emb_dict[col][0][
            key_to_represent_rare] = vector_low_frequency_words
        voc_size_vari += 1
        # print(f'{col} file has no key_to_represent_rare add low frequency words and fill vector as:', vector_low_frequency_words)
    if -1 not in cross_emb_dict[col][1].keys():
        # no -1 in the emb: it covers the full vocab, so compute a mean vector
        # ourselves to represent low-frequency / unseen words for embj
        vector_low_frequency_words = np.zeros((emb_size_varj, ))
        for w in words_varj:
            vector_low_frequency_words += cross_emb_dict[col][1][w]
        vector_low_frequency_words = vector_low_frequency_words / voc_size_varj  # average over varj's vocab
        # add the new key / value to the emb dict
        cross_emb_dict[col][1][
            key_to_represent_rare] = vector_low_frequency_words
        voc_size_varj += 1
        # print(f'{col} file has no key_to_represent_rare add low frequency words and fill vector as:', vector_low_frequency_words)

    # build the emb matrices following list_df_vari / list_df_varj
    emb_matrix_vari = np.zeros((voc_size_vari + 1, emb_size_vari))  # row 0 is padding
    emb_matrix_varj = np.zeros((voc_size_varj + 1, emb_size_varj))  # row 0 is padding
    key2index_vari = {}  # mapping to apply to data[vari]
    key2index_varj = {}  # mapping to apply to data[varj]
    indexi = 2  # index 1 is reserved for -1 (rare / unseen)
    for k, idx in enumerate(list_df_vari):
        if idx in cross_emb_dict[col][0].keys():
            # the word has a pretrained vector
            emb_matrix_vari[indexi, :] = cross_emb_dict[col][0][idx]
            key2index_vari[idx] = indexi
            indexi += 1
        else:
            # unseen word: map it to index 1 (-1)
            key2index_vari[idx] = 1
    indexi = 2  # index 1 is reserved for -1 (rare / unseen)
    for k, idx in enumerate(list_df_varj):
        if idx in cross_emb_dict[col][1].keys():
            # the word has a pretrained vector
            emb_matrix_varj[indexi, :] = cross_emb_dict[col][1][idx]
            key2index_varj[idx] = indexi
            indexi += 1
        else:
            # unseen word: map it to index 1 (-1)
            key2index_varj[idx] = 1
    emb_matrix_vari = np.float32(emb_matrix_vari)
    emb_matrix_varj = np.float32(emb_matrix_varj)
    # build the model inputs
    id_list_dict_vari = []  # input vari
    id_list_dict_varj = []  # input varj
    for valuei in tqdm(list(data[vari])):
        id_list_dict_vari.append(np.array([key2index_vari[valuei]]))
    for valuej in tqdm(list(data[varj])):
        id_list_dict_varj.append(np.array([key2index_varj[valuej]]))
    Cache.cache_data([(id_list_dict_vari, emb_matrix_vari),
                      (id_list_dict_varj, emb_matrix_varj)],
                     nm_marker=f'CROSSEMB__{col}')
Example #10
                                             '_target_enc'].values / skf.n_splits

del trn_x, val_x, enc_df, test_x
gc.collect()
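
# Only the tail of the out-of-fold target-encoding block survives above. A minimal sketch
# of the usual pattern it appears to implement (the names trn_x / val_x / enc_df / test_x
# and skf come from the del statement above; everything else is an assumption):
def target_encode_sketch(train_df, test_df, cols, n_splits=5):
    from sklearn.model_selection import StratifiedKFold
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    for var in cols:
        train_df[f'{var}_target_enc'] = 0.0
        test_df[f'{var}_target_enc'] = 0.0
        for trn_idx, val_idx in skf.split(train_df, train_df['label']):
            trn_x = train_df.iloc[trn_idx]
            # mean label of each category, computed on the training folds only
            enc_df = trn_x.groupby(var, as_index=False)['label'].mean()
            enc_df.columns = [var, f'{var}_target_enc']
            # out-of-fold encoding for the validation fold
            val_x = train_df.iloc[val_idx][[var]].merge(enc_df, on=var, how='left')
            train_df.loc[train_df.index[val_idx],
                         f'{var}_target_enc'] = val_x[f'{var}_target_enc'].values
            # test rows get the average of the per-fold encodings
            test_x = test_df[[var]].merge(enc_df, on=var, how='left')
            test_df[f'{var}_target_enc'] += test_x[
                f'{var}_target_enc'].values / skf.n_splits
    return train_df, test_df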
# all features
df_fe = pd.concat([train_df, test_df])
del train_df, test_df
df_fe = df_fe.sort_values('index').reset_index(drop=True)
df_fe = reduce_mem(df_fe, use_float16=False)

droplist = []
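# drop features that are (near-)constant or (near-)empty on the test window (pt_d >= 8),
# since they carry no usable signal at inference time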
set_test = df_fe.query('pt_d>=8')
for var in df_fe.columns:
    if var not in ['id', 'index', 'label', 'pt_d']:
        if set_test[var].nunique() < 2 or set_test[var].count() < 2:
            droplist.append(var)
print('drop list:', droplist)
df_fe = df_fe.drop(droplist, axis=1)


# ## data merge

# In[5]:


df_fe = df_fe.drop(columns=['index'])
Cache.cache_data(df_fe, nm_marker='sampling_pro_feature')


# In[ ]:
Example #11
data_test_B = pd.read_csv(
    './data/test_data_B.csv',
    sep='|',
    dtype=str,
    nrows=100000,
)
# datatraintestA = Cache.reload_cache('CACHE_dataall0816.pkl')
# datatest = pd.read_csv('./data/test_data_B.csv')
# columns_str = datatest.columns[0]
# dflisttst = []
# for i in tqdm(range(datatest.shape[0])):
#     dflisttst.append([int(j) if index != 32 else j
#                       for index, j in enumerate(datatest[columns_str].iloc[i].split('|'))])
# del datatest
# gc.collect()
# dflisttst = pd.DataFrame(dflisttst, columns=columns_str.split('|'))
dataall = pd.concat([data_train, data_test_A, data_test_B], ignore_index=True)
dataall = reduce_mem(dataall, use_float16=False)
# base features + id; hoping the test A and test B ids do not overlap
Cache.cache_data(dataall, nm_marker='dataall_stage2_0924_debug')

# %%
##############################################################################################################
# quite slow!
# datatraintestA = Cache.reload_cache('CACHE_cmr0816.pkl')
# route = []
# for i in tqdm(range(dataall.shape[0])):
#     route.append(dataall['communication_onlinerate'].iloc[i].split('^'))
# route = pd.DataFrame(route)
# # %%
# route = route.fillna(-1).astype(int)
# routes = []
# for i in tqdm(range(route.shape[0])):
#     routes.append(np.sum(np.eye(25)[route.iloc[i, :]], axis=0))
# del route
Example #12
train_ptd_1 = get_sample(train, 1)
train_ptd_2 = get_sample(train, 2)
train_ptd_3 = get_sample(train, 3)
train_ptd_4 = get_sample(train, 4)
train_ptd_5 = get_sample(train, 5)
train_ptd_6 = get_sample(train, 6)
train_ptd_7 = get_sample(train, 7)
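
# get_sample is a project helper that is not shown in these snippets; judging from the
# 'pos1_neg5' cache marker it keeps one day's positive samples plus roughly five times as
# many negatives. A purely hypothetical sketch (signature and ratio are assumptions):
#     def get_sample(df, day, neg_ratio=5, seed=0):
#         day_df = df[df['pt_d'] == day]
#         pos = day_df[day_df['label'] == 1]
#         neg = day_df[day_df['label'] == 0].sample(
#             n=min(len(pos) * neg_ratio, (day_df['label'] == 0).sum()), random_state=seed)
#         return pd.concat([pos, neg], ignore_index=True)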

train_sampling = pd.concat(
    [train_ptd_1, train_ptd_2, train_ptd_3, train_ptd_4, train_ptd_5, train_ptd_6, train_ptd_7],
    ignore_index=True)

Cache.cache_data(train_sampling, nm_marker='train_sampling_pos1_neg5')

sampling_data = pd.concat([train_sampling, test_B], ignore_index=True)
Cache.cache_data(sampling_data, nm_marker='data_sampling_pos1_neg5')

# ## Filling missing values was also tried as a sub-model; the fill scheme is as follows:

# In[ ]:

## fix some abnormal values to make the model more robust
data = train_sampling
# fix missing values
sparse_features = [
    'task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id', 'dev_id',
    'inter_type_cd', 'slot_id', 'spread_app_id', 'tags', 'app_first_class',
    'app_second_class', 'city', 'device_name', 'career', 'gender', 'net_type',