def get_embedding(f1_f2, f1):
    path = f1_f2 + '_word2vec_kv.kv'
    wv = KeyedVectors.load(path, mmap='r')
    list_df = Cache.reload_cache('CACHE_list_df_' + f1_f2 + '.pkl')
    list_df.columns = ['list', f1]
    f = open(f1_f2 + '.txt', 'r')
    ind = 0
    buf = []
    for i in f:
        tokens = i.strip().split(' ')
        buf_ = np.zeros(64)
        for j in tokens:
            buf_ = buf_ + wv[j]
        buf_ = buf_ / len(tokens)  # average the word vectors of this sentence
        buf_f1 = list_df.at[ind, f1]
        buf__ = []
        buf_ = buf_.tolist()
        buf__.append(buf_)
        buf__.append(buf_f1)
        buf.append(buf__)
        ind = ind + 1
    f.close()
    df_f1_list = pd.DataFrame(buf)
    Cache.cache_data(df_f1_list, nm_marker='list_df_avg_' + f1_f2)
    return 0
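# Hedged usage sketch (not part of the original script): the pair name and key
# feature below are hypothetical. get_embedding expects '<pair>_word2vec_kv.kv',
# '<pair>.txt' and 'CACHE_list_df_<pair>.pkl' to exist from an upstream step,
# with 64-dimensional vectors to match np.zeros(64) above.
# get_embedding('uid_task_id', 'uid')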
def run_w2v(df, sentence_id, word_id, emb_size=256, window=10,
            slid_window=1, embedding_type='w2v'):
    res_dict = get_embedding_pro(df,
                                 sentence_id=sentence_id,
                                 word_id=word_id,
                                 window=window,
                                 slide_window=slid_window,
                                 emb_size=emb_size,
                                 dropna=False,
                                 n_jobs=n_jobs,
                                 return_model=False,
                                 epoch=10,
                                 embedding_type=embedding_type)
    Cache.cache_data(
        res_dict,
        nm_marker=
        f'EMB_DICT_{window}_{slid_window}_{emb_size}_{sentence_id}_{word_id}_{embedding_type}'
    )
    return res_dict["sentence_emb_df"]
def gen_list_df(feature):
    print(f'{feature} start!')
    data = Cache.reload_cache(
        'CACHE_data_sampling_pos1_neg5.pkl')  # build sequences directly on the sampled data
    if feature == 'label':
        data.loc[data['pt_d'] >= 8, 'label'] = -1  # mask the label for test rows
        data['label'] = data['label'].astype(np.int8)
        data['label'] = data['label'] + 1  # shift by 1 because 0 is reserved for padding
    data = data[['uid', feature, 'pt_d']]
    gc.collect()
    print(data.shape)
    data_group = data.groupby(['uid'])
    gc.collect()
    index_list = []
    feature_list = []
    print('index_list start')
    for name, group in tqdm(data_group):
        index_list.append(name)
    print('feature_list start')
    for i in tqdm(index_list):
        index_get_group = data_group.get_group(i)
        ptd_set = set(index_get_group['pt_d'].values.flatten().tolist())
        for j in ptd_set:
            feature_list_ = []
            buf_list = []
            buf_list = index_get_group.query('pt_d < @j')[
                feature].values.flatten().tolist()  # click sequence before this row's day
            buf_list.append(0)  # padding 0
            feature_list_.append(buf_list)  # behavior sequence
            feature_list_.append(j)  # pt_d
            feature_list_.append(i)  # uid
            feature_list.append(feature_list_)
    list_df = pd.DataFrame(feature_list)
    del index_list, feature_list, feature_list_, data_group, index_get_group, ptd_set
    gc.collect()
    list_df.columns = ['list', 'pt_d', 'uid']
    list_df['list'] = list_df['list'].map(lambda x: [str(i) for i in x])  # cast to str
    list_df = list_df.drop_duplicates(subset=['pt_d', 'uid'])
    list_df = data.merge(list_df, how='left', on=('uid', 'pt_d'))  # keep data's row order
    # prepend the current sample's own value; not done for label
    if feature != 'label':
        list_df['list'] = list_df[feature].map(
            lambda x: [str(x)]) + list_df['list']
    print('w2v start!')
    emb_size = 32  # pretrained embedding dim
    model = Word2Vec(list_df['list'].values.tolist(),
                     size=emb_size,
                     window=5,
                     workers=5,
                     min_count=1,
                     sg=0,
                     hs=0,
                     negative=5,
                     iter=5,
                     seed=0)
    # 1. get padded id sequences
    id_list, key2index = get_sequence(list_df, 'list', max_len=40)
    # 2. build key -> embedding dict
    emb_dict = {}
    for word_i in list(model.wv.vocab.keys()):
        if word_i in model.wv:
            emb_dict[word_i] = model.wv[word_i]
        else:
            emb_dict[word_i] = np.zeros(emb_size)
    # 3. save
    id_list_dict = {}
    id_list_dict['id_list'] = id_list
    id_list_dict['key2index'] = key2index
    id_list_dict['emb'] = emb_dict
    Cache.cache_data(id_list_dict, nm_marker=f'EMB_INPUTSEQ_stage2_{feature}')
    print(f'{feature} done!')
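# Hedged driver sketch (not from the original script): the feature list below is
# hypothetical; the real pipeline decides which columns get behavior sequences.
# for fe in ['label', 'task_id', 'adv_id']:
#     gen_list_df(fe)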
                np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(
        start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


# In[2]:

# import ipdb
# ipdb.set

train = pd.read_csv(r'./data/train_data.csv', sep='|', dtype=str)
Cache.cache_data(train, nm_marker='train_raw')

# In[3]:

test_A = pd.read_csv(r'./data/test_data_A.csv', sep='|', dtype=str)
test_A.insert(0, 'label', np.ones([1000000]))
test_A['label'] = 2
Cache.cache_data(test_A, nm_marker='test_A_raw')

# In[4]:

test_B = pd.read_csv(r'./data/test_data_B.csv', sep='|', dtype=str)
test_B.insert(0, 'label', np.ones([1000000]))
test_B['label'] = 2
Cache.cache_data(test_B, nm_marker='test_B_raw')
    mode_num = data[var].mode()[0]
    # shape_null = data.query('{}==-1'.format(var))  # .shape[0]
    # print('process sparse int: ', var, 'fillna: ', mode_num, 'fillna_shape: ', shape_null)
    data.loc[data[var] == -1, var] = mode_num
    data[var] = data[var].astype(int)

for var in dense_features:
    mode_num = int(data[var].mean())
    shape_null = data.query('{}==-1'.format(var)).shape[0]
    print('process dense int: ', var, 'fillna: ', mode_num, 'fillna_shape: ',
          shape_null)
    if shape_null > 0:
        data.loc[data[var] == -1, var] = mode_num
    data[var] = data[var].astype(int)

data = reduce_mem(data, use_float16=True)
Cache.cache_data(data, nm_marker=f'data_step_1_feature_0924_r{sample_rate}')

# ######################################################################################
# base feature
# extract relative count features
# column-level parallelism
from multiprocessing import Pool

cate_cols = [
    'task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id', 'dev_id',
    'inter_type_cd', 'slot_id', 'spread_app_id', 'tags', 'app_first_class',
    'app_second_class', 'city', 'device_name', 'career', 'gender', 'net_type',
    'residence', 'emui_dev', 'indu_name', 'cmr_0', 'cmr_1', 'cmr_2', 'cmr_3',
    'cmr_4', 'cmr_5', 'cmr_6', 'cmr_7', 'cmr_8', 'cmr_9', 'cmr_10', 'cmr_11',
                                 n_jobs=n_jobs,
                                 return_model=False,
                                 epoch=10,
                                 embedding_type=embedding_type)
    Cache.cache_data(
        res_dict,
        nm_marker=
        f'EMB_DICT_{window}_{slid_window}_{emb_size}_{sentence_id}_{word_id}_{embedding_type}'
    )
    return res_dict["sentence_emb_df"]


for var in sparse_features:
    fe = run_w2v(df, 'uid', var, emb_size=16, window=8, embedding_type='w2v')
    df = pd.concat([df, fe], axis=1)
    fe = run_w2v(df, 'uid', var, emb_size=16, window=8,
                 embedding_type='fasttext')
    df = pd.concat([df, fe], axis=1)

cols_to_save = [i for i in df.columns if i.find('EMB_') > -1]
df = df[['index'] + cols_to_save]
Cache.cache_data(df, nm_marker='EMB_feature0912')
columns_str = datatest.columns[0]
dflisttst = []
for i in tqdm(range(datatest.shape[0])):
    dflisttst.append([
        int(j) if index != 32 else j
        for index, j in enumerate(datatest[columns_str].iloc[i].split('|'))
    ])
del datatest
gc.collect()
dflisttst = pd.DataFrame(dflisttst, columns=columns_str.split('|'))
dflist['id'] = -1  # set every train id to -1
dataall = pd.concat([dflist, dflisttst], ignore_index=True)
del dflist, dflisttst
gc.collect()
dataall = reduce_mem(dataall, use_float16=False)
Cache.cache_data(dataall, nm_marker='dataall0816')

##############################################################################################################
# quite slow!
route = []
for i in tqdm(range(dataall.shape[0])):
    route.append(dataall['communication_onlinerate'].iloc[i].split('^'))
route = pd.DataFrame(route)
route = route.fillna(-1).astype(int)
routes = []
for i in tqdm(range(route.shape[0])):
    routes.append(np.sum(np.eye(25)[route.iloc[i, :]], axis=0))
del route
gc.collect()
routes = pd.DataFrame(routes, columns=['cmr_' + str(i)
datatraintestA = Cache.reload_cache('CACHE_dataall0816.pkl')
datatest = pd.read_csv('./data/test_data_B.csv')
columns_str = datatest.columns[0]
dflisttst = []
for i in tqdm(range(datatest.shape[0])):
    dflisttst.append([
        int(j) if index != 32 else j
        for index, j in enumerate(datatest[columns_str].iloc[i].split('|'))
    ])
del datatest
gc.collect()
dflisttst = pd.DataFrame(dflisttst, columns=columns_str.split('|'))
dataall = pd.concat([datatraintestA, dflisttst], ignore_index=True)
dataall = reduce_mem(dataall, use_float16=False)
Cache.cache_data(
    dataall,
    nm_marker='dataall_stage2_0924')  # base features + id; test A / test B ids should not collide

##############################################################################################################
# quite slow!
datatraintestA = Cache.reload_cache('CACHE_cmr0816.pkl')
route = []
for i in tqdm(range(dflisttst.shape[0])):
    route.append(dflisttst['communication_onlinerate'].iloc[i].split('^'))
route = pd.DataFrame(route)
route = route.fillna(-1).astype(int)
routes = []
for i in tqdm(range(route.shape[0])):
    routes.append(np.sum(np.eye(25)[route.iloc[i, :]], axis=0))
del route
gc.collect()
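# Vectorized alternative sketch (an assumption, not the original pipeline): pandas'
# Series.str.get_dummies can split on '^' and one-hot encode in a single pass,
# avoiding the Python-level loops above; note it only creates columns for hour
# values actually present in the data, so the resulting cmr_* column set may differ.
# cmr_onehot = dflisttst['communication_onlinerate'].str.get_dummies(sep='^')
# cmr_onehot.columns = ['cmr_' + c for c in cmr_onehot.columns]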
def get_emb_matrix(col):
    """
    inputs: col -- the cross column to turn into a pretrained emb_matrix
    cross_emb_dict structure: (embvari, embvarj), where embvari is a plain dict
        {key: word appearing in the dataframe, value: embedding vector}
    data[col].unique(): vocabulary to be mapped; words missing from the original
        dict are mapped to -1, the rest are indexed from 1 upward in order
    produces id_list_dict + emb_matrix
    """
    vari, varj = col.split('__')
    key_to_represent_rare = -1
    words_vari = list(cross_emb_dict[col][0].keys())
    words_varj = list(cross_emb_dict[col][1].keys())
    emb_size_vari = cross_emb_dict[col][0][words_vari[0]].shape[0]
    emb_size_varj = cross_emb_dict[col][1][words_varj[0]].shape[0]
    voc_size_vari = len(words_vari)
    voc_size_varj = len(words_varj)
    list_df_vari = list(data[vari].unique())
    list_df_varj = list(data[varj].unique())
    # the emb dict must contain -1, reserved for rare/unseen words (index 1; 0 is padding)
    if -1 not in cross_emb_dict[col][0].keys():
        # no -1 in emb: full-vocabulary data, so compute a mean emb vec ourselves
        # add one extra embedding for embi: the average of all word vectors
        vector_low_frequency_words = np.zeros((emb_size_vari, ))
        for w in words_vari:
            vector_low_frequency_words += cross_emb_dict[col][0][w]
        vector_low_frequency_words = vector_low_frequency_words / voc_size_vari
        # add the key/value to the emb dict
        cross_emb_dict[col][0][
            key_to_represent_rare] = vector_low_frequency_words
        voc_size_vari += 1
        # print(f'{col} file has no key_to_represent_rare add low frequency words and fill vector as:', vector_low_frequency_words)
    if -1 not in cross_emb_dict[col][1].keys():
        # no -1 in emb: full-vocabulary data, so compute a mean emb vec ourselves
        # add one extra embedding for embj: the average of all word vectors
        vector_low_frequency_words = np.zeros((emb_size_varj, ))
        for w in words_varj:
            vector_low_frequency_words += cross_emb_dict[col][1][w]
        vector_low_frequency_words = vector_low_frequency_words / voc_size_varj
        # add the key/value to the emb dict
        cross_emb_dict[col][1][
            key_to_represent_rare] = vector_low_frequency_words
        voc_size_varj += 1
        # print(f'{col} file has no key_to_represent_rare add low frequency words and fill vector as:', vector_low_frequency_words)
    # build the emb matrices from list_df_vari / list_df_varj
    emb_matrix_vari = np.zeros((voc_size_vari + 1, emb_size_vari))  # 0 is padding
    emb_matrix_varj = np.zeros((voc_size_varj + 1, emb_size_varj))  # 0 is padding
    key2index_vari = {}  # mapping to apply to data[vari]
    key2index_varj = {}  # mapping to apply to data[varj]
    indexi = 2  # index 1 is reserved for -1 (unseen)
    for k, idx in enumerate(list_df_vari):
        if idx in cross_emb_dict[col][0].keys():
            # seen word
            emb_matrix_vari[indexi, :] = cross_emb_dict[col][0][idx]
            key2index_vari[idx] = indexi
            indexi += 1
        else:
            # unseen word, treat as -1
            key2index_vari[idx] = 1
    indexi = 2  # index 1 is reserved for -1 (unseen)
    for k, idx in enumerate(list_df_varj):
        if idx in cross_emb_dict[col][1].keys():
            # seen word
            emb_matrix_varj[indexi, :] = cross_emb_dict[col][1][idx]
            key2index_varj[idx] = indexi
            indexi += 1
        else:
            # unseen word, treat as -1
            key2index_varj[idx] = 1
    emb_matrix_vari = np.float32(emb_matrix_vari)
    emb_matrix_varj = np.float32(emb_matrix_varj)
    # build model inputs
    id_list_dict_vari = []  # input for vari
    id_list_dict_varj = []  # input for varj
    for valuei in tqdm(list(data[vari])):
        id_list_dict_vari.append(np.array([key2index_vari[valuei]]))
    for valuej in tqdm(list(data[varj])):
        id_list_dict_varj.append(np.array([key2index_varj[valuej]]))
    Cache.cache_data([(id_list_dict_vari, emb_matrix_vari),
                      (id_list_dict_varj, emb_matrix_varj)],
                     nm_marker=f'CROSSEMB__{col}')
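# Hedged driver sketch (not from the original script): assumes cross_emb_dict is
# keyed by '<vari>__<varj>' cross-column names, as the docstring above describes.
# for cross_col in cross_emb_dict.keys():
#     get_emb_matrix(cross_col)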
        '_target_enc'].values / skf.n_splits
del trn_x, val_x, enc_df, test_x
gc.collect()

# all features
df_fe = pd.concat([train_df, test_df])
del train_df, test_df
df_fe = df_fe.sort_values('index').reset_index(drop=True)
df_fe = reduce_mem(df_fe, use_float16=False)
droplist = []
set_test = df_fe.query('pt_d>=8')
for var in df_fe.columns:
    if var not in ['id', 'index', 'label', 'pt_d']:
        if set_test[var].nunique() < 2 or set_test[var].count() < 2:
            droplist.append(var)
print('drop list:', droplist)
df_fe = df_fe.drop(droplist, axis=1)

# ## data merge

# In[5]:

df_fe = df_fe.drop(columns=['index'])
Cache.cache_data(df_fe, nm_marker='sampling_pro_feature')

# In[ ]:
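# A minimal, self-contained sketch (an assumption, not the original code) of the
# fold-averaged target encoding that the truncated "_target_enc'].values / skf.n_splits"
# line above appears to belong to; column names and fold settings are hypothetical.
import numpy as np
from sklearn.model_selection import StratifiedKFold


def kfold_target_encode(train_df, test_df, col, target='label', n_splits=5, seed=0):
    """Out-of-fold target encoding: each validation fold is encoded with means
    computed on the other folds; the test column is averaged over all folds."""
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof = np.zeros(len(train_df))
    test_enc = np.zeros(len(test_df))
    global_mean = train_df[target].mean()
    for trn_idx, val_idx in skf.split(train_df, train_df[target]):
        fold_means = train_df.iloc[trn_idx].groupby(col)[target].mean()
        oof[val_idx] = train_df[col].iloc[val_idx].map(fold_means).fillna(global_mean).values
        # each fold contributes 1/n_splits of the test encoding, as in the snippet above
        test_enc += test_df[col].map(fold_means).fillna(global_mean).values / skf.n_splits
    train_df[col + '_target_enc'] = oof
    test_df[col + '_target_enc'] = test_enc
    return train_df, test_df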
data_test_B = pd.read_csv(
    './data/test_data_B.csv',
    sep='|',
    dtype=str,
    nrows=100000,
)

# datatraintestA = Cache.reload_cache('CACHE_dataall0816.pkl')
# datatest = pd.read_csv('./data/test_data_B.csv')
# columns_str = datatest.columns[0]
# Leftover from the non-debug script; its inputs (datatest, columns_str) are
# commented out above, so the loop is disabled here as well.
# dflisttst = []
# for i in tqdm(range(datatest.shape[0])):
#     dflisttst.append([
#         int(j) if index != 32 else j
#         for index, j in enumerate(datatest[columns_str].iloc[i].split('|'))
#     ])
# del datatest
# gc.collect()
# dflisttst = pd.DataFrame(dflisttst, columns=columns_str.split('|'))

dataall = pd.concat([data_train, data_test_A, data_test_B], ignore_index=True)
dataall = reduce_mem(dataall, use_float16=False)
# base features + id; test A / test B ids should not collide
Cache.cache_data(dataall, nm_marker='dataall_stage2_0924_debug')

# %%
##############################################################################################################
# quite slow!
# datatraintestA = Cache.reload_cache('CACHE_cmr0816.pkl')
# route = []
# for i in tqdm(range(dataall.shape[0])):
#     route.append(dataall['communication_onlinerate'].iloc[i].split('^'))
# route = pd.DataFrame(route)
# # %%
# route = route.fillna(-1).astype(int)
# routes = []
# for i in tqdm(range(route.shape[0])):
#     routes.append(np.sum(np.eye(25)[route.iloc[i, :]], axis=0))
# del route
train_ptd_1 = get_sample(train, 1)
train_ptd_2 = get_sample(train, 2)
train_ptd_3 = get_sample(train, 3)
train_ptd_4 = get_sample(train, 4)
train_ptd_5 = get_sample(train, 5)
train_ptd_6 = get_sample(train, 6)
train_ptd_7 = get_sample(train, 7)
train_sampling = pd.concat([
    train_ptd_1, train_ptd_2, train_ptd_3, train_ptd_4, train_ptd_5,
    train_ptd_6, train_ptd_7
], ignore_index=True)
Cache.cache_data(train_sampling, nm_marker='train_sampling_pos1_neg5')
sampling_data = pd.concat([train_sampling, test_B], ignore_index=True)
Cache.cache_data(sampling_data, nm_marker='data_sampling_pos1_neg5')

# ## Filling missing values was also tried as a sub-model; the filling scheme is as follows:

# In[ ]:

## fix some abnormal values to make the model more robust
data = train_sampling
# fix missing values
sparse_features = [
    'task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id', 'dev_id',
    'inter_type_cd', 'slot_id', 'spread_app_id', 'tags', 'app_first_class',
    'app_second_class', 'city', 'device_name', 'career', 'gender', 'net_type',