def make_features(p):
    """Build summed and averaged word-vector features and pickle them.

    Parameters
    ----------
    p : int
        ``0`` selects the module-level ``train`` frame; any other value
        selects ``test``.

    The ``q_stop`` and ``a_stop`` columns are mapped through
    ``get_vector_with_words_sum`` and ``get_vector_with_words_mean``; every
    result is expanded into a prefixed DataFrame, the four frames are
    concatenated column-wise and written to ``../data/202_{name}``.
    """
    if p == 0:
        df = train
        name = 'train'
    else:
        df = test
        name = 'test'

    # get vec
    # Elapsed time is "now - start". The operands used to be swapped, which
    # logged a negative duration.
    # NOTE(review): assumes st_time holds the script's start timestamp.
    print(name, 'sum', round(time() - st_time, 4))
    question_vec_sum = pd.DataFrame(
        list(df['q_stop'].map(get_vector_with_words_sum))).add_prefix(
            'q_stop_vec_sum_')
    answer_vec_sum = pd.DataFrame(
        list(df['a_stop'].map(get_vector_with_words_sum))).add_prefix(
            'a_stop_vec_sum_')

    print(name, 'mean', round(time() - st_time, 4))
    question_vec_mean = pd.DataFrame(
        list(df['q_stop'].map(get_vector_with_words_mean))).add_prefix(
            'q_stop_vec_mean_')
    answer_vec_mean = pd.DataFrame(
        list(df['a_stop'].map(get_vector_with_words_mean))).add_prefix(
            'a_stop_vec_mean_')

    result = pd.concat(
        [question_vec_sum, answer_vec_sum, question_vec_mean, answer_vec_mean],
        axis=1)
    utils.to_pickles(result, f'../data/202_{name}', utils.SPLIT_SIZE)
def pivot(cat):
    """Pivot the numeric columns of ``prev`` by ``cat`` and save the result.

    Three pivot tables (mean, sum, std) are built with ``KEY`` as index and
    the values of ``cat`` as columns, renamed to
    ``{PREF}_{value}-{category}_{stat}``, concatenated, left-joined onto
    ``train`` and ``test`` and written to ``../data/110_{cat}_train`` and
    ``../data/110_{cat}_test``.

    Parameters
    ----------
    cat : str
        Name of the categorical column of ``prev`` to pivot on.
    """
    # (extra pivot_table kwargs, column suffix). The first entry relies on
    # pivot_table's default aggregation (mean).
    specs = (
        ({}, 'mean'),
        ({'aggfunc': np.sum}, 'sum'),
        ({'aggfunc': np.std, 'fill_value': -1}, 'std'),
    )

    li = []
    for kwargs, suffix in specs:
        pt = pd.pivot_table(prev, index=KEY, columns=cat, values=col_num,
                            **kwargs)
        pt.columns = [f'{PREF}_{c[0]}-{c[1]}_{suffix}' for c in pt.columns]
        li.append(pt)

    # A single reset_index() turns KEY back into a column. Resetting a second
    # time added a meaningless 'index' column that ended up in the saved
    # feature files.
    base = pd.concat(li, axis=1).reset_index()
    del li, pt
    gc.collect()

    df = pd.merge(train, base, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_pickles(df, f'../data/110_{cat}_train', utils.SPLIT_SIZE)
    gc.collect()

    df = pd.merge(test, base, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_pickles(df, f'../data/110_{cat}_test', utils.SPLIT_SIZE)
    gc.collect()
def multi(c1):
    """Save per-``KEY`` counts and row-normalised shares of column ``c1``.

    Two cross tabulations of ``prev[KEY]`` against ``prev[c1]`` are built
    (raw counts and ``normalize='index'``), joined onto ``base``, missing
    values in the new columns are filled with -1, and the result is
    left-joined onto ``train`` / ``test`` and written to
    ``../data/110_{c1}_train`` / ``../data/110_{c1}_test``.

    Parameters
    ----------
    c1 : str
        Column of ``prev`` to cross-tabulate.
    """
    print(c1)

    df_sum = pd.crosstab(prev[KEY], prev[c1])
    df_sum.columns = [
        f'{PREF}_{c1}_{str(c2).replace(" ", "-")}_sum'
        for c2 in df_sum.columns
    ]

    df_norm = pd.crosstab(prev[KEY], prev[c1], normalize='index')
    df_norm.columns = [
        f'{PREF}_{c1}_{str(c2).replace(" ", "-")}_norm'
        for c2 in df_norm.columns
    ]

    df = pd.concat([df_sum, df_norm], axis=1)
    col = df.columns.tolist()

    # Work on a local frame. The previous version rebound and reset the
    # module-level `base`, so a second call in the same process concatenated
    # against a frame that already carried the earlier columns and had lost
    # its KEY index.
    # NOTE(review): assumes the module-level `base` is indexed by KEY.
    feat = pd.concat([base, df], axis=1)
    feat[col] = feat[col].fillna(-1)
    feat = feat.reset_index()

    df = pd.merge(train, feat, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_pickles(df, f'../data/110_{c1}_train', utils.SPLIT_SIZE)
    gc.collect()

    df = pd.merge(test, feat, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_pickles(df, f'../data/110_{c1}_test', utils.SPLIT_SIZE)
    gc.collect()
def multi(k):
    """Attach the hour-of-day distribution of column ``k`` to every row.

    Example: ``k = 'app'``.

    For each value of ``trte[k]`` the share of rows per ``trte.hour`` is
    computed, the shares are joined back onto ``trte`` and only the new
    columns are written out, split at ``utils.TRAIN_SHAPE`` into train and
    test parts.
    """
    gc.collect()
    print(k)

    hour_share = pd.crosstab(trte[k], trte.hour, normalize='index')
    hour_share = hour_share.add_prefix(f'histHourNorm_{k}_')
    utils.reduce_memory(hour_share)
    feature_cols = hour_share.columns.tolist()

    merged = pd.merge(trte, hour_share.reset_index(), on=k, how='left')
    gc.collect()

    # NOTE(review): the output folders do not depend on `k`, so every call
    # writes to the same two locations - confirm this is intended.
    n_train = utils.TRAIN_SHAPE
    train_part = merged.iloc[0:n_train][feature_cols].reset_index(drop=True)
    utils.to_pickles(train_part, '../data/114_train', utils.SPLIT_SIZE)
    del train_part
    gc.collect()

    test_part = merged.iloc[n_train:][feature_cols].reset_index(drop=True)
    utils.to_pickles(test_part, '../data/114_test', utils.SPLIT_SIZE)
def make_features(p):
    """Create length, time and upper-case-word features and pickle them.

    Parameters
    ----------
    p : int
        ``0`` selects the module-level ``train`` frame; any other value
        selects ``test``.

    Only the newly created columns (plus the ``subreddit`` dummies) are kept
    and written to ``../data/701_{name}``.
    """
    if p == 0:
        df = train
        name = 'train'
    else:
        df = test
        name = 'test'

    init_col = df.columns.tolist()
    print(init_col)

    def count_upper_words(text):
        # The built-in sum replaces np.sum(<generator>), which NumPy has
        # deprecated; the resulting count is the same.
        return sum(w.isupper() for w in splittext(text) if len(w) > 1)

    df['qlenchar'] = df.question_text.apply(len)
    df['qlenword'] = df.question_text.apply(lambda x: len(splittext(x)))
    df['alenchar'] = df.answer_text.apply(len)
    df['alenword'] = df.answer_text.apply(lambda x: len(splittext(x)))

    df['difflenchar'] = df.qlenchar - df.alenchar
    df['difflenword'] = df.qlenword - df.alenword
    df['divlenchar'] = df.qlenchar / df.alenchar
    df['divlenword'] = df.qlenword / df.alenword
    df['idivlenchar'] = df.alenchar / df.qlenchar
    df['idivlenword'] = df.alenword / df.qlenword

    # get_dummies consumes 'subreddit', so it must not be dropped again below.
    df = pd.get_dummies(df, columns=['subreddit'])
    init_col.remove('subreddit')

    # Parse each epoch column once instead of once per derived feature.
    question_dt = pd.to_datetime(df.question_utc, origin='unix', unit='s')
    answer_dt = pd.to_datetime(df.answer_utc, origin='unix', unit='s')
    df['qdt_dow'] = question_dt.dt.dayofweek
    df['qdt_hour'] = question_dt.dt.hour
    df['adt_dow'] = answer_dt.dt.dayofweek
    df['adt_hour'] = answer_dt.dt.hour

    df['qboldwords'] = df.question_text.apply(count_upper_words)
    df['aboldwords'] = df.answer_text.apply(count_upper_words)

    df.drop(init_col, axis=1, inplace=True)
    print(name, df.columns.tolist())
    utils.to_pickles(df, f'../data/701_{name}', utils.SPLIT_SIZE)
def multi(p):
    """Load one raw click log, sort it and store it as split pickles.

    ``p == 0`` processes the two test files; any other value processes the
    training file.
    """
    if p != 0:
        # =====================================================================
        # train
        # =====================================================================
        print('loading train...')
        train = pd.read_csv('../input/train.csv.zip', dtype=dtypes,
                            parse_dates=['click_time', 'attributed_time'])
        # be sure to sort by this keys
        train = train.sort_values(utils.sort_keys)
        print('finish loading!')

        print('train.shape', train.shape)
        utils.to_pickles(train, '../data/train', utils.SPLIT_SIZE)
        utils.to_pickles(train.is_attributed, '../data/is_attributed',
                         utils.SPLIT_SIZE)

        del train
        gc.collect()
        return

    # =========================================================================
    # test
    # =========================================================================
    print('loading test_old...')
    test_old = pd.read_csv('../input/test_old.csv.gz', dtype=dtypes,
                           parse_dates=['click_time'])
    # be sure to sort by this keys
    test_old = test_old.sort_values(utils.sort_keys)

    print('loading test...')
    test = pd.read_csv('../input/test.csv.zip', dtype=dtypes,
                       parse_dates=['click_time'])
    test = test.sort_values(utils.sort_keys).reset_index(drop=True)
    print('finish loading!')

    # Replace test_old's click_id with the one found in test for the same
    # click attributes; rows without a match get NaN.
    merge_key = ['ip', 'app', 'device', 'os', 'channel', 'click_time']
    test_old.drop('click_id', axis=1, inplace=True)
    id_lookup = test[merge_key + ['click_id']]
    test_old = pd.merge(test_old, id_lookup, on=merge_key, how='left')
    del id_lookup

    utils.to_pickles(test_old, '../data/test_old', utils.SPLIT_SIZE)
    utils.to_pickles(test, '../data/test', utils.SPLIT_SIZE)

    del test_old, test
    gc.collect()
def read_pickle(folder, usecols):
    """Write ``*_filtered`` copies of a train folder and its test twin.

    The first chunk (``000.p``) of ``folder`` decides which of ``usecols``
    exist. If any do, those columns are read from ``folder`` and from the
    folder obtained by replacing ``'_train'`` with ``'_test'``, and each is
    written to ``<folder>_filtered``. Otherwise a message is printed.
    """
    first_chunk = pd.read_pickle(folder + '/000.p')
    col = list(set(usecols) & set(first_chunk.columns))
    print(folder, len(col))

    if not len(col) > 0:
        print(f'{folder} doesnt have valid features')
        return

    # train
    selected = utils.read_pickles(folder, col)
    utils.to_pickles(selected, folder + '_filtered', utils.SPLIT_SIZE)
    del first_chunk, selected
    gc.collect()

    # test
    test_folder = folder.replace('_train', '_test')
    selected = utils.read_pickles(test_folder, col)
    utils.to_pickles(selected, test_folder + '_filtered', utils.SPLIT_SIZE)
def make_features(p):
    """Store the two similarity scores returned by ``get_sim`` per row.

    ``p == 0`` works on ``train``, anything else on ``test``. ``get_sim`` is
    applied row-wise; element 0 of its result becomes ``cosine_sim_stop`` and
    element 1 ``cosine_sim_mean_stop``. All pre-existing columns are dropped
    before writing to ``../data/204_{name}``.
    """
    df, name = (train, 'train') if p == 0 else (test, 'test')

    original_cols = df.columns.tolist()

    sims = df.apply(get_sim, axis=1)
    df['cosine_sim_stop'] = sims.apply(lambda pair: pair[0])
    df['cosine_sim_mean_stop'] = sims.apply(lambda pair: pair[1])

    df.drop(original_cols, axis=1, inplace=True)
    utils.to_pickles(df, f'../data/204_{name}', utils.SPLIT_SIZE)
def multi(p):
    """Derive day / hour / timestamp features from ``click_time`` and save them.

    Parameters
    ----------
    p : int
        ``0`` processes ``train`` (written to ``../data/001_train``),
        ``1`` processes ``test`` (written to ``../data/001_test``).
        Any other value is a no-op.
    """
    if p == 0:
        df, path = train, '../data/001_train'
    elif p == 1:
        df, path = test, '../data/001_test'
    else:
        return

    df['day'] = df.click_time.dt.day
    df['hour'] = df.click_time.dt.hour
    df['hour_min'] = df['hour'] + (df.click_time.dt.minute / 60)
    # total_seconds() includes the day component of the timedelta.
    # `.dt.seconds` only returns the seconds-within-a-day part (0..86399), so
    # the value wrapped around for clicks more than 24 hours after min_time.
    # The column is now float.
    df['timestamp'] = (df.click_time - min_time).dt.total_seconds()

    col = ['day', 'hour', 'hour_min', 'timestamp']
    utils.to_pickles(df[col], path, utils.SPLIT_SIZE)
def concat_pred_item(T, dryrun=False):
    """Join the f317 user-product features onto the item predictions.

    ``T == -1`` selects the ``test`` set, any other value ``trainT-<T>``.
    Returns the merged frame when ``dryrun`` equals ``True``; otherwise the
    frame is written to ``../feature/<name>/all_apdx`` and nothing is
    returned.
    """
    name = 'test' if T == -1 else 'trainT-' + str(T)

    df = utils.load_pred_item(name)
    df = pd.merge(
        df,
        pd.read_pickle('../feature/{}/f317_user-product.p'.format(name)),
        on=['user_id', 'product_id'],
        how='left',
    )
    gc.collect()

    #==============================================================================
    print('output')
    #==============================================================================
    # Equality test kept on purpose: only values equal to True skip writing.
    if dryrun == True:  # noqa: E712
        return df

    utils.to_pickles(df, '../feature/{}/all_apdx'.format(name), 20,
                     inplace=True)
def pivot(cat):
    """Pivot ``bureau``'s numeric columns by ``cat`` and save train/test features.

    Mean, sum and std pivot tables (index ``KEY``, columns ``cat``) are
    concatenated, left-joined onto ``train`` and ``test``, and written to
    ``../data/tmp_504_{cat}_train`` / ``../data/tmp_504_{cat}_test``.
    Spaces in generated column names are replaced by ``-``.
    """
    # (extra pivot_table kwargs, column suffix); an empty dict means the
    # pivot_table default aggregation is used.
    recipes = [
        ({}, 'mean'),
        ({'aggfunc': np.sum}, 'sum'),
        ({'aggfunc': np.std, 'fill_value': -1}, 'std'),
    ]

    tables = []
    for extra, suffix in recipes:
        table = pd.pivot_table(bureau, index=KEY, columns=cat,
                               values=col_num, **extra)
        table.columns = [
            f'{PREF}_{cat}_{c[0]}-{c[1]}_{suffix}'.replace(' ', '-')
            for c in table.columns
        ]
        tables.append(table)

    feat = pd.concat(tables, axis=1).reset_index()
    del tables, table
    gc.collect()

    for frame, split in ((train, 'train'), (test, 'test')):
        df = pd.merge(frame, feat, on=KEY, how='left').drop(KEY, axis=1)
        utils.to_pickles(df, f'../data/tmp_504_{cat}_{split}',
                         utils.SPLIT_SIZE)
        gc.collect()
def make_features(p):
    """Broadcast per-``question_id`` statistics of ``USECOLS`` to every row.

    ``p == 0`` works on ``train``, anything else on ``test``. For each column
    the group min, max, max-min, mean, std and ``nunique`` are added; the
    original columns are then dropped and the rest is written to
    ``../data/102_{name}``.
    """
    df, name = (train, 'train') if p == 0 else (test, 'test')

    original_cols = df.columns.tolist()
    gr = df.groupby('question_id')

    for c in USECOLS:
        print(name, c)
        per_question = gr[c]
        df[f'{c}_min'] = per_question.transform(np.min)
        df[f'{c}_max'] = per_question.transform(np.max)
        df[f'{c}_max-min'] = df[f'{c}_max'] - df[f'{c}_min']
        df[f'{c}_mean'] = per_question.transform(np.mean)
        df[f'{c}_std'] = per_question.transform(np.std)
        df[f'{c}_nunique'] = per_question.transform(nunique)

    df.drop(original_cols, axis=1, inplace=True)
    utils.to_pickles(df, f'../data/102_{name}', utils.SPLIT_SIZE)
# Release the train frame before the test data is loaded.
del train
gc.collect()

# =============================================================================
# test
# =============================================================================
test = utils.read_pickles('../data/dtest')
#X_head = pd.read_pickle('X_head.p')

# For every categorical feature, write a copy of the test set with that one
# column removed into its own folder.
for c in categorical_feature:
    col = c
    filepath = f'../data/dtest_drop_{c}'
    # Categorical features that remain once `c` is excluded; only printed here.
    categorical_feature_ = list(set(categorical_feature) - set([c]))
    # Remove whatever a previous run left at this path.
    system(f'rm -rf {filepath}')
    print(f'categorical_feature {categorical_feature_}')
    print(f'writing {filepath}...')
    utils.to_pickles(test.drop(col, axis=1), filepath, utils.SPLIT_SIZE)
    gc.collect()

del test
gc.collect()

#==============================================================================
# Create / refresh the SUCCESS_804 file in the working directory.
system('touch SUCCESS_804')
utils.end(__file__)
gc.collect() # ============================================================================= # concat pt1 # ============================================================================= gc.collect() pool = Pool(proc) callback = pool.map(multi, utils.comb[:10]) pool.close() # train df = pd.concat( [pd.read_pickle(f) for f in sorted(glob('../data/111__*_train.p'))], axis=1).reset_index(drop=True) utils.to_pickles(df, '../data/111-1_train', utils.SPLIT_SIZE) gc.collect() # test df = pd.concat( [pd.read_pickle(f) for f in sorted(glob('../data/111__*_test.p'))], axis=1).reset_index(drop=True) utils.to_pickles(df, '../data/111-1_test', utils.SPLIT_SIZE) os.system('rm -rf ../data/111__*.p') # ============================================================================= # concat pt2 # ============================================================================= gc.collect()
# log #============================================================================== log = pd.concat([ pd.read_csv('../input/order_products__prior.csv.gz'), pd.read_csv('../input/order_products__train.csv.gz') ], ignore_index=1) log.sort_values(['order_id', 'add_to_cart_order'], inplace=True) log.reset_index(drop=1, inplace=True) log = pd.merge(log, goods, on='product_id', how='left') log = pd.merge(log, orders, on='order_id', how='left') log['order_number_rev'] = log.groupby('user_id').order_number.transform( np.max) - log.order_number utils.to_pickles(log, '../input/mk/log', 20) gc.collect() #============================================================================== # order_tbl #============================================================================== order_product = log.groupby('order_id').product_name.apply(list).reset_index() order_tbl = pd.merge(orders, order_product, on='order_id', how='left') order_tbl.sort_values(['user_id', 'order_number'], inplace=True) order_tbl.reset_index(drop=1, inplace=True) order_tbl = pd.merge(order_tbl, log[['order_id', 'order_number_rev']].drop_duplicates(), on='order_id', how='left') order_tbl.order_number_rev = order_tbl.order_number_rev.fillna(-1).astype(int)
def main(is_eval=False):
    """Build the long-format sales table and pickle it to ``../feats/sales``.

    Parameters
    ----------
    is_eval : bool, default False
        When true, ``sales_train_evaluation.csv`` is read and only the
        ``_evaluation`` rows of the sample submission are appended as extra
        day columns. Otherwise ``sales_train_validation.csv`` is read and
        both the ``_validation`` and ``_evaluation`` rows are appended.
    """
    # load csv
    if is_eval:
        df = pd.read_csv('../input/sales_train_evaluation.csv')
    else:
        df = pd.read_csv('../input/sales_train_validation.csv')

    sub = pd.read_csv('../input/sample_submission.csv')

    # split test data
    # Each half is copied so that the renames / id rewrite below act on an
    # independent frame rather than on a slice of `sub`; no helper flag
    # columns have to be added and dropped again.
    test1 = sub[sub['id'].apply(lambda x: '_validation' in x)].copy()
    test2 = sub[sub['id'].apply(lambda x: '_evaluation' in x)].copy()

    del sub
    gc.collect()

    # change column name
    test1.columns = ['id'] + COLS_TEST1
    test2.columns = ['id'] + COLS_TEST2

    # change id so the evaluation rows join onto the same ids as df
    test2['id'] = test2['id'].str.replace('_evaluation', '_validation')

    # merge
    if not is_eval:
        df = df.merge(test1, on='id', how='left')

    df = df.merge(test2, on='id', how='left')

    del test1, test2
    gc.collect()

    # reduce memory usage
    df = reduce_mem_usage(df)

    # melt sales data
    print('Melting sales data...')
    id_vars = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    df = pd.melt(df, id_vars=id_vars, var_name='d', value_name='demand')

    print('Melted sales train validation has {} rows and {} columns'.format(
        df.shape[0], df.shape[1]))

    # add numeric date (strip the two-character prefix of the day label)
    df['d_numeric'] = df['d'].apply(lambda x: int(x[2:]))

    # drop old data (~2012/12/31)
    print('drop old data...')
    df = df[df['d_numeric'] >= 704]

    # drop christmas data
    print('drop christmas data...')
    christmas_days = [
        331,   # 2011-12-25
        697,   # 2012-12-25
        1062,  # 2013-12-25
        1427,  # 2014-12-25
        1792,  # 2015-12-25
    ]
    # One filter instead of five successive copies of the frame.
    df = df[~df['d_numeric'].isin(christmas_days)]

    # add is zero flag
    df['is_zero'] = (df['demand'] == 0).astype(int)

    # save pkl
    to_pickles(df, '../feats/sales', split_size=3)

    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
# Process utils.comb in slices of `limit` entries; `st` (set before this
# fragment) and `end` delimit the current slice.
end = 0
limit = 10
for pt in range(1, 10):
    end += limit
    print(st, end)

    gc.collect()
    pool = Pool(proc)
    callback = pool.map(multi, utils.comb[st:end])
    pool.close()

    # The next slice starts where this one ended.
    st = end

    # train
    # Column-wise concatenation of the intermediate pickles in sorted order.
    df = pd.concat(
        [pd.read_pickle(f) for f in sorted(glob('../data/110__*_train.p'))],
        axis=1).reset_index(drop=True)
    utils.to_pickles(df, '../data/110-{}_train'.format(pt), utils.SPLIT_SIZE)
    del df
    gc.collect()

    # test
    df = pd.concat(
        [pd.read_pickle(f) for f in sorted(glob('../data/110__*_test.p'))],
        axis=1).reset_index(drop=True)
    utils.to_pickles(df, '../data/110-{}_test'.format(pt), utils.SPLIT_SIZE)
    # Remove the intermediate pickles so the next part only sees its own files.
    os.system('rm -rf ../data/110__*.p')

    # Stop once every entry of utils.comb has been covered.
    if end >= len(utils.comb):
        break
# Only click_id is needed from test_old to build the submission skeleton.
sub = utils.read_pickles('../data/test_old', ['click_id'])

# Every '*_test/' folder plus test_old, each paired with its position in the
# list, is handed to multi_test.
load_folders = sorted(glob('../data/*_test/')) + ['../data/test_old/']
args = list(zip(load_folders, range(len(load_folders))))
pool = Pool(14)
pool.map(multi_test, args)
pool.close()

print('concat test')
# Column-wise concatenation of the '804_tmp*.p' pickles in sorted order.
load_files = sorted(glob('../data/804_tmp*.p'))
X = pd.concat([pd.read_pickle(f) for f in load_files], axis=1)
# Select columns in the order of X_head (defined outside this fragment).
print('test.shape should be 18790469:', X[X_head.columns].shape)
print('X.isnull().sum().sum():', X.isnull().sum().sum())

utils.to_pickles(X[X_head.columns], '../data/dtest', 10)

del X; gc.collect()

# Keep rows that have a click_id and one row per click_id (the last one).
sub = sub[~sub.click_id.isnull()].reset_index(drop=True)
sub.drop_duplicates('click_id', keep='last', inplace=True) # last?
sub['click_id'] = sub['click_id'].map(int)
sub.reset_index(drop=True, inplace=True)
sub.to_pickle('../data/sub.p')

# Remove the temporary pickles and create / refresh the SUCCESS_804 file.
system('rm ../data/804_tmp*.p')
system('touch SUCCESS_804')

#==============================================================================
# `pool` and `args` are created before this fragment.
pool.map(multi_train, args)
pool.close()

print('concat train')
# Column-wise concatenation of the '805_tmp*.p' pickles in sorted order.
load_files = sorted(glob('../data/805_tmp*.p'))
X = pd.concat([pd.read_pickle(f) for f in load_files], axis=1)
print('X.isnull().sum().sum():', X.isnull().sum().sum())

# Remove outputs of a previous run before writing new ones.
system('rm ../data/dtrain.mt')
system('rm -rf ../data/dtrain')

y = utils.read_pickles('../data/is_attributed')

# Store the training matrix both as a LightGBM binary dataset and as pickles.
lgb.Dataset(
    X, label=y,
    categorical_feature=categorical_feature).save_binary('../data/dtrain.mt')
utils.to_pickles(X, '../data/dtrain', utils.SPLIT_SIZE)

# Keep the first rows of X in 'X_head.p' so its columns can be read back later.
X_head = X.head()
X_head.to_pickle('X_head.p')
del X, y
gc.collect()

system('rm ../data/805_tmp*.p')
"""
X_head = pd.read_pickle('X_head.p')
"""

# =============================================================================
# # test
# Pair every folder in load_folders (defined before this fragment) with its
# position in the list and hand the pairs to multi_train.
args = list(zip(load_folders, range(len(load_folders))))
pool = Pool(10)
pool.map(multi_train, args)
pool.close()

print('concat train')
# Column-wise concatenation of the '803_tmp*.p' pickles in sorted order.
load_files = sorted(glob('../data/803_tmp*.p'))
X = pd.concat([pd.read_pickle(f) for f in load_files], axis=1)
print('X.isnull().sum().sum():', X.isnull().sum().sum())

# Remove the binary dataset of a previous run before writing a new one.
system('rm ../data/dtrain_429-2.mt')

# The label column is part of X here, so it is split off for LightGBM.
lgb.Dataset(X.drop('is_attributed', axis=1),
            label=X.is_attributed,
            categorical_feature=categorical_feature).save_binary('../data/dtrain_429-2.mt')
# The pickled copy keeps the is_attributed column.
utils.to_pickles(X, '../data/dtrain_429-2', utils.SPLIT_SIZE)

# Keep the first rows (without the label) so the feature columns can be read
# back later.
X_head = X.head().drop('is_attributed', axis=1)
X_head.to_pickle('X_head_429-2.p')
del X; gc.collect()

system('rm ../data/803_tmp*.p')
"""
X_head = pd.read_pickle('X_head_429-2.p')
"""

# =============================================================================
# # test
def concat_pred_None(T, W, dryrun=False):
    """Assemble the order-level feature table for the "None" prediction.

    Parameters
    ----------
    T : int
        ``-1`` selects the ``test`` set; any other value selects
        ``trainT-<T>``. The value is also stored in the output column ``T``.
    W : int
        Window size: selects ``X_base_t{W}.p`` and the number of previous
        orders (``t-1`` .. ``t-W``) whose department features are joined.
    dryrun : bool, default False
        When truthy, a 9999-row sample is processed and the frame is
        returned instead of being written.

    Returns
    -------
    pandas.DataFrame or None
        The feature frame when ``dryrun`` is truthy; otherwise ``None`` after
        writing to ``../feature/<name>/all_None_w<W>``.
    """
    if T == -1:
        name = 'test'
    else:
        name = 'trainT-' + str(T)

    #==============================================================================
    print('load label')
    #==============================================================================
    # NOTE: order_id is label
    print('load t{}'.format(W))
    X_base = pd.read_pickle('../feature/X_base_t{}.p'.format(W))

    label = pd.read_pickle('../input/mk/order_None.p').rename(
        columns={'is_None': 'y'})
    order_tag = pd.read_pickle(
        '../feature/{}/label_reordered.p'.format(name)).order_id.unique()
    label = label[label.order_id.isin(order_tag)].reset_index(drop=True)

    # 'inner' for removing t-n_order_id == NaN
    if 'train' in name:
        df = pd.merge(X_base[X_base.is_train == 1], label[['order_id', 'y']],
                      on='order_id', how='inner')
    elif name == 'test':
        df = X_base[X_base.is_train == 0]

    if dryrun:
        print('dryrun')
        df = df.sample(9999)

    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('user feature')
    #==============================================================================
    df = user_feature(df, name)

    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('item')
    #==============================================================================
    def compress(df, key):
        """Aggregate every numeric non-id column of ``df`` per ``key``.

        key: str

        For each such column the group min, mean, median, max and std are
        computed; aggregated columns with zero variance are dropped. The
        result has one row per ``key`` value, with ``key`` as a column.
        """
        df_ = df.drop_duplicates(key)[[key]].set_index(key)
        dtypes = df.dtypes
        col = dtypes[dtypes != 'O'].index
        col = [c for c in col if '_id' not in c]
        gr = df.groupby(key)
        for c in col:
            df_[c + '-min'] = gr[c].min()
            df_[c + '-mean'] = gr[c].mean()
            df_[c + '-median'] = gr[c].median()
            df_[c + '-max'] = gr[c].max()
            df_[c + '-std'] = gr[c].std()

        var = df_.var()
        col = var[var == 0].index
        df_.drop(col, axis=1, inplace=True)
        gc.collect()

        return df_.reset_index()

    order_prod = pd.read_pickle('../feature/{}/label_reordered.p'.format(name))
    order_prod = pd.merge(
        df[['order_id', 'order_hour_of_day', 'order_dow', 'timezone']],
        order_prod[['order_id', 'product_id']],
        how='left', on='order_id')

    order_prod = item_feature(order_prod, name)
    order_prod.drop(
        ['order_hour_of_day', 'order_dow', 'timezone', 'product_id'],
        axis=1, inplace=True)

    key = 'order_id'
    feature = compress(order_prod, key)
    df = pd.merge(df, feature, on=key, how='left')

    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(df)
    ix_end = df.shape[1]

    #==============================================================================
    print('user x item')
    #==============================================================================
    # (group key, pickle stem), joined in exactly this order. Replaces 25
    # copy-pasted read / compress / merge stanzas.
    grouped_features = [
        ('order_id', 'f301_order-product'),
        ('order_id', 'f301_order-product_n5'),
        ('order_id', 'f302_order-product_all'),
        ('order_id', 'f303_order-product'),
        ('order_id', 'f304-1_order-product'),
        ('order_id', 'f304-2_order-product'),
        ('order_id', 'f304-3_order-product'),
        ('order_id', 'f305_order-product'),
        ('user_id', 'f306_user-product'),
        ('user_id', 'f306_user-product_n5'),
        ('user_id', 'f307_user-product-timezone'),
        ('user_id', 'f308_user-product-timezone'),
        ('user_id', 'f308_user-product-dow'),
        ('user_id', 'f309_user-product'),
        ('user_id', 'f309_user-product_n5'),
        ('user_id', 'f310_user-product'),
        ('user_id', 'f312_user_product'),
        ('user_id', 'f312_user_product_n5'),
        ('user_id', 'f313_user_aisle'),
        ('user_id', 'f313_user_dep'),
        ('user_id', 'f314_user-product'),
        ('order_id', 'f315-1_order-product'),
        ('order_id', 'f315-2_order-product'),
        ('order_id', 'f315-3_order-product'),
        ('order_id', 'f316_order_product'),
    ]
    for key, stem in grouped_features:
        feature = compress(
            pd.read_pickle('../feature/{}/{}.p'.format(name, stem)), key)
        df = pd.merge(df, feature, on=key, how='left')

    gc.collect()
    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(df, ix_end)
    ix_end = df.shape[1]

    #==============================================================================
    print('daytime')
    #==============================================================================
    df = daytime_feature(df, name)

    print('{}.shape:{}\n'.format(name, df.shape))

#    #==============================================================================
#    print('aisle')
#    #==============================================================================
#    order_aisdep = pd.read_pickle('../input/mk/order_aisle-department.p')
#    col = [c for c in order_aisdep.columns if 'department_' in c]
#    order_aisdep.drop(col, axis=1, inplace=1)
#
#    df = pd.merge(df, order_aisdep.add_prefix('t-1_'), on='t-1_order_id', how='left')
#    df = pd.merge(df, order_aisdep.add_prefix('t-2_'), on='t-2_order_id', how='left')
#    df = pd.merge(df, order_aisdep.add_prefix('t-3_'), on='t-3_order_id', how='left')
#
#    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('department')
    #==============================================================================
    order_aisdep = pd.read_pickle('../input/mk/order_aisle-department.p')
    col = [c for c in order_aisdep.columns if 'aisle_' in c]
    # inplace must be a real bool: pandas validates it and rejects `1`.
    order_aisdep.drop(col, axis=1, inplace=True)

    for t in range(1, W + 1):
        df = pd.merge(df, order_aisdep.add_prefix('t-{}_'.format(t)),
                      on='t-{}_order_id'.format(t), how='left')

    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('department cumsum')
    #==============================================================================
    order_aisdep = pd.read_pickle(
        '../input/mk/order_aisle-department_cumsum.p')
    col = [c for c in order_aisdep.columns if 'aisle_' in c]
    order_aisdep.drop(col, axis=1, inplace=True)

    # Only the most recent previous order (t-1) gets the cumulative features.
    df = pd.merge(df, order_aisdep.add_prefix('t-{}_'.format(1)),
                  on='t-{}_order_id'.format(1), how='left')

    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('feature engineering')
    #==============================================================================
    df = pd.get_dummies(df, columns=['timezone'])
    df = pd.get_dummies(df, columns=['order_dow'])
    df = pd.get_dummies(df, columns=['order_hour_of_day'])

    # NOTE(review): these columns reference t-1..t-3 regardless of W -
    # confirm the inputs always carry them.
    df['t-1_product_unq_len_diffByT-2'] = df['t-1_product_unq_len'] - df['t-2_product_unq_len']
    df['t-1_product_unq_len_diffByT-3'] = df['t-1_product_unq_len'] - df['t-3_product_unq_len']
    df['t-2_product_unq_len_diffByT-3'] = df['t-2_product_unq_len'] - df['t-3_product_unq_len']

    df['t-1_product_unq_len_ratioByT-2'] = df['t-1_product_unq_len'] / df['t-2_product_unq_len']
    df['t-1_product_unq_len_ratioByT-3'] = df['t-1_product_unq_len'] / df['t-3_product_unq_len']
    df['t-2_product_unq_len_ratioByT-3'] = df['t-2_product_unq_len'] / df['t-3_product_unq_len']

    df['T'] = T

    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(df, ix_end)

    #==============================================================================
    print('output')
    #==============================================================================
    # Same truthiness test as the sampling branch above, so a sampled frame
    # can never be written to disk.
    if dryrun:
        return df

    utils.to_pickles(df, '../feature/{}/all_None_w{}'.format(name, W), 20,
                     inplace=True)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 19 14:18:40 2018

@author: kazuki.onodera

Store the ``answer_score`` column of the training data under
``../data/label``.
"""

import utils

utils.start(__file__)

# Double brackets keep the label as a one-column DataFrame.
y = utils.load_train()[['answer_score']]
utils.to_pickles(y, '../data/label', utils.SPLIT_SIZE)

#==============================================================================
utils.end(__file__)
def multi_gr2(k):
    """Save two-level group statistics of ``prev`` for categorical column ``k``.

    ``prev`` is grouped by ``[KEY, k]``; the group size and, for every column
    in ``col_num``, the inner min / max / mean / std / sum are computed and
    then summarised again per ``KEY``. The resulting columns are left-joined
    onto ``train`` and ``test`` and written to ``../data/102_{k}_train`` /
    ``../data/102_{k}_test``.

    Parameters
    ----------
    k : str
        Column of ``prev`` used as the second grouping level.
    """
    gr2 = prev.groupby([KEY, k])
    gc.collect()
    print(k)
    keyname = 'gby-' + '-'.join([KEY, k])

    # Work on a private copy. The previous version added the columns to the
    # module-level `base` and reset its index in place, so a second call in
    # the same process started from a frame that already held the earlier
    # columns and no longer had KEY as its index.
    # NOTE(review): assumes the module-level `base` is indexed by KEY.
    feat = base.copy()

    def add_stats(gr1, name, stats):
        # Append the requested outer statistics of `gr1` as `{name}_{stat}`
        # columns, in the given order.
        for stat in stats:
            if stat == 'max-min':
                feat[f'{name}_max-min'] = (
                    feat[f'{name}_max'] - feat[f'{name}_min'])
            elif stat == 'nunique':
                feat[f'{name}_nunique'] = gr1.apply(nunique)
            else:
                feat[f'{name}_{stat}'] = getattr(gr1, stat)()

    # size
    gr1 = gr2.size().groupby(KEY)
    name = f'{PREF}_{keyname}_size'
    add_stats(gr1, name, ('min', 'max', 'max-min', 'mean', 'std', 'sum'))
    # For the size table the number of (KEY, k) groups per KEY is stored
    # under the `_nunique` name.
    feat[f'{name}_nunique'] = gr1.size()

    # Outer statistics kept for each inner aggregation; every row omits the
    # combinations the original feature set did not contain (e.g. min-of-min).
    full = ('min', 'max', 'max-min', 'mean', 'std', 'sum', 'nunique')
    outer_stats = {
        'min': ('max', 'mean', 'std', 'sum', 'nunique'),
        'max': ('min', 'mean', 'std', 'sum', 'nunique'),
        'mean': full,
        'std': full,
        'sum': ('min', 'max', 'max-min', 'mean', 'std', 'nunique'),
    }

    for v in col_num:
        for inner in ('min', 'max', 'mean', 'std', 'sum'):
            gr1 = getattr(gr2[v], inner)().groupby(KEY)
            name = f'{PREF}_{keyname}_{v}_{inner}'
            add_stats(gr1, name, outer_stats[inner])

    feat = feat.reset_index()

    df = pd.merge(train, feat, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_pickles(df, f'../data/102_{k}_train', utils.SPLIT_SIZE)

    df = pd.merge(test, feat, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_pickles(df, f'../data/102_{k}_test', utils.SPLIT_SIZE)
    print(f'finish {k}')
    return
def remve_stop(s):
    """Return the lower-cased whitespace tokens of ``s`` not in ``stopwords``."""
    lowered = (token.lower() for token in s.split())
    return [w for w in lowered if w not in stopwords]


def make_features(df):
    """Replace the columns of ``df`` in place by ``q_stop`` and ``a_stop``.

    Both new columns hold token lists produced by ``remve_stop`` from
    ``question_text`` and ``answer_text``; every column present on entry is
    dropped.
    """
    original_cols = df.columns.tolist()

    for target, source in (('q_stop', 'question_text'),
                           ('a_stop', 'answer_text')):
        df[target] = df[source].map(remve_stop)

    df.drop(original_cols, axis=1, inplace=True)


# =============================================================================
# main
# =============================================================================
for frame in (train, test):
    make_features(frame)

utils.to_pickles(train, '../data/train_stop', utils.SPLIT_SIZE)
utils.to_pickles(test, '../data/test_stop', utils.SPLIT_SIZE)

#==============================================================================
utils.end(__file__)
# =============================================================================
# Process `comb` in slices of `limit` entries; each slice is written as one
# numbered part (109-1, 109-2, ...).
st = 0
end = 0
limit = 10
for pt in range(1, 10):
    end += limit
    print(st, end)

    gc.collect()
    pool = Pool(proc)
    callback = pool.map(multi, comb[st:end])
    pool.close()

    # The next slice starts where this one ended.
    st = end

    # train
    df = pd.concat([pd.read_pickle(f) for f in sorted(glob('../data/109__*_train.p'))], axis=1).reset_index(drop=True)
    utils.to_pickles(df, '../data/109-{}_train'.format(pt), 10)
    gc.collect()

    # test
    df = pd.concat([pd.read_pickle(f) for f in sorted(glob('../data/109__*_test.p'))], axis=1).reset_index(drop=True)
    utils.to_pickles(df, '../data/109-{}_test'.format(pt), 10)
    # Remove the intermediate pickles so the next part only sees its own files.
    os.system('rm -rf ../data/109__*.p')

    # Stop once `comb` is exhausted: a further pass would find no
    # intermediate pickles and pd.concat would raise on the empty list.
    if end >= len(comb):
        break

#==============================================================================
utils.end(__file__)
# (key column, pickle file name) pairs whose per-key summaries are merged onto
# the frame, in merge order.  The grouping mirrors where gc.collect() is
# called between batches.
_COMPRESSED_FEATURE_GROUPS = (
    (
        ('order_id', 'f301_order-product.p'),
        ('order_id', 'f301_order-product_n5.p'),
        ('order_id', 'f302_order-product_all.p'),
        ('order_id', 'f303_order-product.p'),
        ('order_id', 'f304-1_order-product.p'),
        ('order_id', 'f304-2_order-product.p'),
        ('order_id', 'f304-3_order-product.p'),
        ('order_id', 'f305_order-product.p'),
    ),
    (
        ('user_id', 'f306_user-product.p'),
        ('user_id', 'f306_user-product_n5.p'),
        ('user_id', 'f307_user-product-timezone.p'),
        ('user_id', 'f308_user-product-timezone.p'),
        ('user_id', 'f308_user-product-dow.p'),
        ('user_id', 'f309_user-product.p'),
        ('user_id', 'f309_user-product_n5.p'),
        ('user_id', 'f310_user-product.p'),
        ('user_id', 'f312_user_product.p'),
        ('user_id', 'f312_user_product_n5.p'),
    ),
    (
        ('user_id', 'f313_user_aisle.p'),
        ('user_id', 'f313_user_dep.p'),
        ('user_id', 'f314_user-product.p'),
        ('order_id', 'f315-1_order-product.p'),
        ('order_id', 'f315-2_order-product.p'),
        ('order_id', 'f315-3_order-product.p'),
        ('order_id', 'f316_order_product.p'),
    ),
)


def _compress(df, key):
    """Summarise `df` to one row per `key` value.

    For every non-object column whose name does not contain '_id', adds
    `<col>-min`, `-mean`, `-median`, `-max` and `-std` columns computed per
    `key` group.  Summary columns with zero variance are dropped.

    key: str
    """
    df_ = df.drop_duplicates(key)[[key]].set_index(key)

    dtypes = df.dtypes
    col = [c for c in dtypes[dtypes != 'O'].index if '_id' not in c]

    gr = df.groupby(key)
    for c in col:
        df_[c + '-min'] = gr[c].min()
        df_[c + '-mean'] = gr[c].mean()
        df_[c + '-median'] = gr[c].median()
        df_[c + '-max'] = gr[c].max()
        df_[c + '-std'] = gr[c].std()

    # Constant columns carry no information; remove them.
    var = df_.var()
    df_.drop(var[var == 0].index, axis=1, inplace=True)
    gc.collect()

    return df_.reset_index()


def concat_pred_item(T, dryrun=False):
    """Assemble the full feature table for one split and save or return it.

    Parameters
    ----------
    T : int
        ``-1`` selects the split named 'test'; any other value selects
        'trainT-<T>'.  The value is also stored in the output column 'T'.
    dryrun : bool, default False
        When truthy, work on a random sample of at most 9999 rows and return
        the resulting frame instead of writing it to disk.

    Returns
    -------
    pandas.DataFrame or None
        The feature frame when `dryrun` is truthy; otherwise None, after the
        frame has been written to ``../feature/<name>/all``.
    """
    name = 'test' if T == -1 else 'trainT-' + str(T)

    #==============================================================================
    print('load label')
    #==============================================================================
    # NOTE: order_id is label
    print('load t3')
    X_base = pd.read_pickle('../feature/X_base_t3.p')
    label = pd.read_pickle('../feature/{}/label_reordered.p'.format(name))

    # 'inner' for removing t-n_order_id == NaN
    is_train = 0 if name == 'test' else 1
    df = pd.merge(X_base[X_base.is_train == is_train], label,
                  on='order_id', how='inner')

    if dryrun:
        print('dryrun')
        # Cap the sample size: DataFrame.sample raises when asked for more
        # rows than the frame holds.
        df = df.sample(min(9999, len(df)))

    goods = pd.read_pickle('../input/mk/goods.p')
    df = pd.merge(df, goods[['product_id', 'aisle_id', 'department_id']],
                  on='product_id', how='left')

    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('user feature')
    #==============================================================================
    df = user_feature(df, name)
    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('item feature')
    #==============================================================================
    df = item_feature(df, name)
    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(df)
    ix_end = df.shape[1]

    #==============================================================================
    print('user x item')
    #==============================================================================
    df = user_item_feature(df, name)
    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('user x item')
    #==============================================================================
    for group in _COMPRESSED_FEATURE_GROUPS:
        for key, filename in group:
            feature = _compress(
                pd.read_pickle('../feature/{}/{}'.format(name, filename)),
                key)
            df = pd.merge(df, feature, on=key, how='left')
        gc.collect()

    #==============================================================================
    print('reduce memory')
    #==============================================================================
    # Only the columns added since the previous reduce_memory call.
    utils.reduce_memory(df, ix_end)
    ix_end = df.shape[1]

    #==============================================================================
    print('daytime')
    #==============================================================================
    df = daytime_feature(df, name)
    print('{}.shape:{}\n'.format(name, df.shape))

#    #==============================================================================
#    print('aisle')
#    #==============================================================================
#    order_aisdep = pd.read_pickle('../input/mk/order_aisle-department.p')
#    col = [c for c in order_aisdep.columns if 'department_' in c]
#    order_aisdep.drop(col, axis=1, inplace=1)
#
#    df = pd.merge(df, order_aisdep.add_prefix('t-1_'), on='t-1_order_id', how='left')
#    df = pd.merge(df, order_aisdep.add_prefix('t-2_'), on='t-2_order_id', how='left')
#
#    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('feature engineering')
    #==============================================================================
    # One call per column keeps the dummy columns in the original order.
    for categorical in ('timezone', 'order_dow', 'order_hour_of_day'):
        df = pd.get_dummies(df, columns=[categorical])

    df['days_near_order_cycle'] = (
        df.days_since_last_order_this_item - df.item_order_days_mean).abs()
    df['days_last_order-min'] = (
        df.days_since_last_order_this_item - df.useritem_order_days_min)
    df['days_last_order-max'] = (
        df.days_since_last_order_this_item - df.useritem_order_days_max)

    df['pos_cart_diff'] = df.item_mean_pos_cart - df.useritem_mean_pos_cart

    # Differences first, then ratios, to keep the original column order.
    lag_pairs = ((1, 2), (1, 3), (2, 3))
    for a, b in lag_pairs:
        df['t-{}_product_unq_len_diffByT-{}'.format(a, b)] = (
            df['t-{}_product_unq_len'.format(a)]
            - df['t-{}_product_unq_len'.format(b)])
    for a, b in lag_pairs:
        df['t-{}_product_unq_len_ratioByT-{}'.format(a, b)] = (
            df['t-{}_product_unq_len'.format(a)]
            / df['t-{}_product_unq_len'.format(b)])

    df['T'] = T

    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(df, ix_end)

    #==============================================================================
    print('output')
    #==============================================================================
    if dryrun:
        return df
    utils.to_pickles(df, '../feature/{}/all'.format(name), 20, inplace=True)
goods.to_pickle('../input/mk/goods.p')
gc.collect()

#==============================================================================
# log
#==============================================================================
# Stack the prior and train order lines into one purchase log.
log = pd.concat(
    [
        pd.read_csv('../input/order_products__prior.csv.gz'),
        pd.read_csv('../input/order_products__train.csv.gz'),
    ],
    ignore_index=True,
)

log.sort_values(['order_id', 'add_to_cart_order'], inplace=True)
log.reset_index(drop=True, inplace=True)
log = pd.merge(log, goods, on='product_id', how='left')
log = pd.merge(log, orders, on='order_id', how='left')

# Distance from each user's highest order_number present in the log
# (0 for that order).  The string alias 'max' is used instead of np.max,
# which pandas deprecates as a groupby/transform argument.
log['order_number_rev'] = (
    log.groupby('user_id').order_number.transform('max') - log.order_number)

utils.to_pickles(log, '../input/mk/log', 20)
gc.collect()

#==============================================================================
# order_tbl
#==============================================================================
# One row per order carrying the list of product names in that order.
order_product = log.groupby('order_id').product_name.apply(list).reset_index()
order_tbl = pd.merge(orders, order_product, on='order_id', how='left')

order_tbl.sort_values(['user_id', 'order_number'], inplace=True)
order_tbl.reset_index(drop=True, inplace=True)

order_tbl = pd.merge(
    order_tbl,
    log[['order_id', 'order_number_rev']].drop_duplicates(),
    on='order_id',
    how='left',
)
# Orders with no rows in `log` come out of the left merge as NaN; mark as -1.
order_tbl.order_number_rev = order_tbl.order_number_rev.fillna(-1).astype(int)
#order_tbl['order_number_rev'] = order_tbl.groupby('user_id').order_number.transform(np.max) - order_tbl.order_number

# Running total of days_since_prior_order within each user, in row order.
order_tbl['days_since_first_order'] = (
    order_tbl.groupby('user_id').days_since_prior_order.cumsum())
# =============================================================================
# Recreate the output folders from scratch.
folders = ['../feature_bureau', '../feature_bureau_unused']
for fol in folders:
    os.system(f'rm -rf {fol}')
    os.system(f'mkdir {fol}')

train = utils.load_train(['SK_ID_CURR', 'TARGET'])
test = utils.load_test(['SK_ID_CURR'])

bureau = utils.read_pickles('../data/bureau')

# Keep only the bureau rows whose SK_ID_CURR appears in each split.
bureau_train = bureau.merge(train, on='SK_ID_CURR', how='inner')
bureau_test = bureau.merge(test, on='SK_ID_CURR', how='inner')

for frame, path in (
        (bureau_train, '../data/bureau_train'),
        (bureau_test, '../data/bureau_test'),
        (bureau_train[['TARGET']], '../data/bureau_label'),
):
    utils.to_pickles(frame, path, utils.SPLIT_SIZE)

"""
bureau_train = utils.read_pickles('../data/bureau_train')
bureau_test  = utils.read_pickles('../data/bureau_test')
"""

#==============================================================================
utils.end(__file__)
# Map `multi` over every entry of `utils.comb` in a worker pool.
pool = Pool(nthread)
callback = pool.map(multi, utils.comb)
pool.close()

del trte
gc.collect()

# =============================================================================
# concat
# =============================================================================
# train: join the '../data/005__*_train.p' pickles column-wise
df = pd.concat(
    list(map(pd.read_pickle, sorted(glob('../data/005__*_train.p')))),
    axis=1)
df = df.reset_index(drop=True)
utils.to_pickles(df, '../data/005_train', 10)

print(df.columns.tolist())

# test: same for the '../data/005__*_test.p' pickles
df = pd.concat(
    list(map(pd.read_pickle, sorted(glob('../data/005__*_test.p')))),
    axis=1)
df = df.reset_index(drop=True)
utils.to_pickles(df, '../data/005_test', 10)

# Remove the intermediate per-feature pickles.
os.system('rm -rf ../data/005__*.p')

#==============================================================================
utils.end(__file__)
'../data/002__{}_test.p'.format(count_keys_))
# NOTE(review): the line above is the tail of a statement that begins outside
# this excerpt; it is reproduced unchanged.

# Map `multi` over every entry of `utils.comb` in a worker pool; Pool.map
# blocks until all results are back.
pool = Pool(nthread)
callback = pool.map(multi, utils.comb)
pool.close()

# Drop the module-level `trte` reference before the concat step.
del trte
gc.collect()

# =============================================================================
# concat
# =============================================================================
# train: join every '../data/002__*_train.p' pickle column-wise, in sorted
# file-name order, and save the result in split form.
df = pd.concat(
    [pd.read_pickle(f) for f in sorted(glob('../data/002__*_train.p'))],
    axis=1).reset_index(drop=True)
utils.to_pickles(df, '../data/002_train', utils.SPLIT_SIZE)

# test: same for the '../data/002__*_test.p' pickles.
df = pd.concat(
    [pd.read_pickle(f) for f in sorted(glob('../data/002__*_test.p'))],
    axis=1).reset_index(drop=True)
utils.to_pickles(df, '../data/002_test', utils.SPLIT_SIZE)

# Remove the intermediate per-feature pickles.
os.system('rm -rf ../data/002__*.p')

#==============================================================================
utils.end(__file__)
# gr2
# =============================================================================
# Map `multi_gr2` over every column in `col_group` in a worker pool.
pool = Pool(NTHREAD)
callback = pool.map(multi_gr2, col_group)
pool.close()

# =============================================================================
# merge
# =============================================================================
# Join the temporary '../data/tmp_202_{PREF}*' pickles column-wise and attach
# them to `base`.
df = pd.concat(
    list(map(pd.read_pickle, sorted(glob(f'../data/tmp_202_{PREF}*.p')))),
    axis=1)
base = pd.concat([base, df], axis=1)
base.reset_index(inplace=True)
del df
gc.collect()

train = utils.load_train([KEY])
train = train.merge(base, on=KEY, how='left').drop(columns=KEY)

test = utils.load_test([KEY])
test = test.merge(base, on=KEY, how='left').drop(columns=KEY)

for frame, path in ((train, '../data/202_train'),
                    (test, '../data/202_test')):
    utils.to_pickles(frame, path, utils.SPLIT_SIZE)

# Remove the temporary pickles.
os.system('rm ../data/tmp_202_*.p')

#==============================================================================
utils.end(__file__)
# (key column, pickle file name) pairs whose per-key summaries are merged onto
# the frame, in merge order.  The grouping mirrors where gc.collect() is
# called between batches.
_COMPRESSED_FEATURE_GROUPS = (
    (
        ('order_id', 'f301_order-product.p'),
        ('order_id', 'f301_order-product_n5.p'),
        ('order_id', 'f302_order-product_all.p'),
        ('order_id', 'f303_order-product.p'),
        ('order_id', 'f304-1_order-product.p'),
        ('order_id', 'f304-2_order-product.p'),
        ('order_id', 'f304-3_order-product.p'),
        ('order_id', 'f305_order-product.p'),
    ),
    (
        ('user_id', 'f306_user-product.p'),
        ('user_id', 'f306_user-product_n5.p'),
        ('user_id', 'f307_user-product-timezone.p'),
        ('user_id', 'f308_user-product-timezone.p'),
        ('user_id', 'f308_user-product-dow.p'),
        ('user_id', 'f309_user-product.p'),
        ('user_id', 'f309_user-product_n5.p'),
        ('user_id', 'f310_user-product.p'),
        ('user_id', 'f312_user_product.p'),
        ('user_id', 'f312_user_product_n5.p'),
    ),
    (
        ('user_id', 'f313_user_aisle.p'),
        ('user_id', 'f313_user_dep.p'),
        ('user_id', 'f314_user-product.p'),
        ('order_id', 'f315-1_order-product.p'),
        ('order_id', 'f315-2_order-product.p'),
        ('order_id', 'f315-3_order-product.p'),
        ('order_id', 'f316_order_product.p'),
    ),
)


def _compress(df, key):
    """Summarise `df` to one row per `key` value.

    For every non-object column whose name does not contain '_id', adds
    `<col>-min`, `-mean`, `-median`, `-max` and `-std` columns computed per
    `key` group.  Summary columns with zero variance are dropped.

    key: str
    """
    df_ = df.drop_duplicates(key)[[key]].set_index(key)

    dtypes = df.dtypes
    col = [c for c in dtypes[dtypes != 'O'].index if '_id' not in c]

    gr = df.groupby(key)
    for c in col:
        df_[c + '-min'] = gr[c].min()
        df_[c + '-mean'] = gr[c].mean()
        df_[c + '-median'] = gr[c].median()
        df_[c + '-max'] = gr[c].max()
        df_[c + '-std'] = gr[c].std()

    # Constant columns carry no information; remove them.
    var = df_.var()
    df_.drop(var[var == 0].index, axis=1, inplace=True)
    gc.collect()

    return df_.reset_index()


def concat_pred_item(T, dryrun=False):
    """Assemble the full feature table for one split and save or return it.

    Parameters
    ----------
    T : int
        ``-1`` selects the split named 'test'; any other value selects
        'trainT-<T>'.  The value is also stored in the output column 'T'.
    dryrun : bool, default False
        When truthy, work on a random sample of at most 9999 rows and return
        the resulting frame instead of writing it to disk.

    Returns
    -------
    pandas.DataFrame or None
        The feature frame when `dryrun` is truthy; otherwise None, after the
        frame has been written to ``../feature/<name>/all``.
    """
    name = 'test' if T == -1 else 'trainT-' + str(T)

    #==============================================================================
    print('load label')
    #==============================================================================
    # NOTE: order_id is label
    print('load t3')
    X_base = pd.read_pickle('../feature/X_base_t3.p')
    label = pd.read_pickle('../feature/{}/label_reordered.p'.format(name))

    # 'inner' for removing t-n_order_id == NaN
    is_train = 0 if name == 'test' else 1
    df = pd.merge(X_base[X_base.is_train == is_train], label,
                  on='order_id', how='inner')

    if dryrun:
        print('dryrun')
        # Cap the sample size: DataFrame.sample raises when asked for more
        # rows than the frame holds.
        df = df.sample(min(9999, len(df)))

    goods = pd.read_pickle('../input/mk/goods.p')
    df = pd.merge(df, goods[['product_id', 'aisle_id', 'department_id']],
                  on='product_id', how='left')

    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('user feature')
    #==============================================================================
    df = user_feature(df, name)
    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('item feature')
    #==============================================================================
    df = item_feature(df, name)
    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(df)
    ix_end = df.shape[1]

    #==============================================================================
    print('user x item')
    #==============================================================================
    df = user_item_feature(df, name)
    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('user x item')
    #==============================================================================
    for group in _COMPRESSED_FEATURE_GROUPS:
        for key, filename in group:
            feature = _compress(
                pd.read_pickle('../feature/{}/{}'.format(name, filename)),
                key)
            df = pd.merge(df, feature, on=key, how='left')
        gc.collect()

    #==============================================================================
    print('reduce memory')
    #==============================================================================
    # Only the columns added since the previous reduce_memory call.
    utils.reduce_memory(df, ix_end)
    ix_end = df.shape[1]

    #==============================================================================
    print('daytime')
    #==============================================================================
    df = daytime_feature(df, name)
    print('{}.shape:{}\n'.format(name, df.shape))

#    #==============================================================================
#    print('aisle')
#    #==============================================================================
#    order_aisdep = pd.read_pickle('../input/mk/order_aisle-department.p')
#    col = [c for c in order_aisdep.columns if 'department_' in c]
#    order_aisdep.drop(col, axis=1, inplace=1)
#
#    df = pd.merge(df, order_aisdep.add_prefix('t-1_'), on='t-1_order_id', how='left')
#    df = pd.merge(df, order_aisdep.add_prefix('t-2_'), on='t-2_order_id', how='left')
#
#    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('feature engineering')
    #==============================================================================
    # One call per column keeps the dummy columns in the original order.
    for categorical in ('timezone', 'order_dow', 'order_hour_of_day'):
        df = pd.get_dummies(df, columns=[categorical])

    df['days_near_order_cycle'] = (
        df.days_since_last_order_this_item - df.item_order_days_mean).abs()
    df['days_last_order-min'] = (
        df.days_since_last_order_this_item - df.useritem_order_days_min)
    df['days_last_order-max'] = (
        df.days_since_last_order_this_item - df.useritem_order_days_max)

    df['pos_cart_diff'] = df.item_mean_pos_cart - df.useritem_mean_pos_cart

    # Differences first, then ratios, to keep the original column order.
    lag_pairs = ((1, 2), (1, 3), (2, 3))
    for a, b in lag_pairs:
        df['t-{}_product_unq_len_diffByT-{}'.format(a, b)] = (
            df['t-{}_product_unq_len'.format(a)]
            - df['t-{}_product_unq_len'.format(b)])
    for a, b in lag_pairs:
        df['t-{}_product_unq_len_ratioByT-{}'.format(a, b)] = (
            df['t-{}_product_unq_len'.format(a)]
            / df['t-{}_product_unq_len'.format(b)])

    df['T'] = T

    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(df, ix_end)

    #==============================================================================
    print('output')
    #==============================================================================
    if dryrun:
        return df
    utils.to_pickles(df, '../feature/{}/all'.format(name), 20, inplace=True)