def multi(keys):
    """Feature 111: dense-ranked per-group variance of `hour` and `day`.

    Groups the global `trte` frame by `keys`, rank-encodes the variance of
    the hour and day columns, merges the result back onto `trte`, and
    pickles the train / test slices separately.
    """
    joined = '-'.join(keys)
    print(keys)
    grouped = trte.groupby(keys)

    hour_var = grouped['hour'].var().rank(method='dense')
    hour_var.name = 'hourvar_' + joined
    day_var = grouped['day'].var().rank(method='dense')
    day_var.name = 'dayvar_' + joined

    feat = pd.concat([hour_var, day_var], axis=1)
    del hour_var, day_var
    gc.collect()

    utils.reduce_memory(feat)
    feature_cols = feat.columns.tolist()
    feat.reset_index(inplace=True)

    merged = pd.merge(trte, feat, on=keys, how='left')
    merged.iloc[0:utils.TRAIN_SHAPE][feature_cols].to_pickle(
        '../data/111__{}_train.p'.format(joined))
    merged.iloc[utils.TRAIN_SHAPE:][feature_cols].to_pickle(
        '../data/111__{}_test.p'.format(joined))
    gc.collect()
def multi(k):
    """Feature 114: row-normalized per-hour click histogram for column `k`.

    Builds a crosstab of `k` against `hour` (each row sums to 1), joins it
    onto the global `trte` frame, and writes the train / test slices with
    utils.to_pickles.

    k = 'app'
    """
    gc.collect()
    print(k)

    hist = pd.crosstab(trte[k], trte.hour, normalize='index')
    hist = hist.add_prefix(f'histHourNorm_{k}_')
    utils.reduce_memory(hist)
    feature_cols = hist.columns.tolist()

    merged = pd.merge(trte, hist.reset_index(), on=k, how='left')
    gc.collect()

    train_part = merged.iloc[0:utils.TRAIN_SHAPE][feature_cols].reset_index(
        drop=True)
    utils.to_pickles(train_part, '../data/114_train', utils.SPLIT_SIZE)
    gc.collect()
    test_part = merged.iloc[utils.TRAIN_SHAPE:][feature_cols].reset_index(
        drop=True)
    utils.to_pickles(test_part, '../data/114_test', utils.SPLIT_SIZE)
def multi(keys):
    """Feature 110: dense-ranked timestamp distribution stats per group.

    For each group defined by `keys`, computes dense-ranked min / max /
    min-max range / mean / median / mean-median gap / variance / skew of
    `timestamp`, merges them back onto the global `trte` frame, and
    pickles the train and test slices.

    Fixes:
    - the output filenames used ``format(keys)`` (the raw list, yielding
      names like ``110__['app', 'device']_train.p``) instead of the joined
      ``keys_`` string used by every sibling writer (111/112/115);
    - ``df_median`` was missing from the ``del`` list, keeping one extra
      Series alive until function exit.
    """
    keys_ = '-'.join(keys)
    print(keys)
    gr = trte.groupby(keys)

    df_min = gr['timestamp'].min()
    df_min.name = 'timemin_' + keys_
    df_max = gr['timestamp'].max()
    df_max.name = 'timemax_' + keys_
    df_diff1 = df_max - df_min
    df_diff1.name = 'timediff-minmax_' + keys_
    # rank only after the raw difference is taken
    df_diff1 = df_diff1.rank(method='dense')
    df_max = df_max.rank(method='dense')
    df_min = df_min.rank(method='dense')
    gc.collect()

    df_mean = gr['timestamp'].mean()
    df_mean.name = 'timemean_' + keys_
    df_median = gr['timestamp'].median()
    df_median.name = 'timemedian_' + keys_
    df_diff2 = df_mean - df_median
    # NOTE: the 'meadian' typo is preserved on purpose — downstream
    # consumers key on the existing column name.
    df_diff2.name = 'timediff-meadian_' + keys_
    df_diff2 = df_diff2.rank(method='dense')
    df_median = df_median.rank(method='dense')
    df_mean = df_mean.rank(method='dense')
    gc.collect()

    df_var = gr['timestamp'].var().rank(method='dense')
    df_var.name = 'timevar_' + keys_
    df_skew = gr['timestamp'].skew().rank(method='dense')
    df_skew.name = 'timeskew_' + keys_

    df = pd.concat([
        df_min, df_max, df_diff1, df_mean, df_median, df_diff2, df_var,
        df_skew
    ], axis=1)
    del df_min, df_max, df_diff1, df_mean, df_median, df_diff2, df_var, df_skew
    gc.collect()

    utils.reduce_memory(df)
    col = df.columns.tolist()
    df.reset_index(inplace=True)

    result = pd.merge(trte, df, on=keys, how='left')
    result.iloc[0:utils.TRAIN_SHAPE][col].to_pickle(
        '../data/110__{}_train.p'.format(keys_))
    result.iloc[utils.TRAIN_SHAPE:][col].to_pickle(
        '../data/110__{}_test.p'.format(keys_))
    gc.collect()
def make(T):
    """Build per-user listening-habit features from the compressed user logs.

    T selects a training window (folder 'trainW-<T>'); T == -1 selects the
    test set, whose logs are extended with user_logs_v2.csv (March data).
    Writes the frame as multiple CSVs under
    ../feature/<folder>/user_logs_listening_habit.

    T = 0
    folder = 'trainW-0'
    """
    if T == -1:
        folder = 'test'
        user_logs = utils.read_multiple_csv(
            '../feature/{}/compressed_user_logs'.format(folder), input_col)
        user_logs = pd.concat([
            user_logs,
            pd.read_csv('../input/user_logs_v2.csv',
                        parse_dates=['date'])[input_col]
        ], ignore_index=True)
        # user_logs_v2.csv: only inclue data of March, it's for testing set.
        #user_logs.sort_values(by = ['msno', 'date'], inplace = True)
        # Memory spikes and speed drops here because of the concat and
        # sort_values; the open question is whether sort_values is needed
        # at all — with groupby it is not.
    else:
        folder = 'trainW-' + str(T)
        user_logs = utils.read_multiple_csv(
            '../feature/{}/compressed_user_logs'.format(folder), input_col)
    #user_logs = user_logs[user_logs.msno == 'Pz51LVoS9ENG1kNHQyrJ3gG8A163pyHi+gyvN2p+1nM=']
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(user_logs)
    gc.collect()
    print('shape1:', user_logs.shape)

    # core
    # fraction of the day (24 * 60 * 60 seconds) spent listening
    user_logs['total_secs_percentage'] = user_logs.total_secs.apply(
        lambda x: x / (24 * 60 * 60))
    #user_logs['listening_habit_zone'] = user_logs.total_secs_percentage.apply(habit_discrimination)
    # NOTE(review): despite the name, this is a running SUM of total_secs
    # per user, not a login count — confirm intent.
    user_logs['num_of_time_the_user_has_logged_in'] = user_logs.groupby(
        'msno').total_secs.cumsum()  # make this line faster
    user_logs.drop('total_secs', axis=1, inplace=True)
    # make_order_number (defined elsewhere) adds a per-user 'order_number'
    user_logs = user_logs.groupby('msno').apply(
        make_order_number)  # make this line faster
    # cumulative seconds divided by the row's position within the user
    user_logs[
        'num_of_time_the_user_has_logged_in_ratio'] = user_logs.num_of_time_the_user_has_logged_in / user_logs.order_number
    user_logs.drop('order_number', axis=1, inplace=True)
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(user_logs)
    print('shape2:', user_logs.shape)
    ##################################################
    # write
    ##################################################
    path = '../feature/{}/user_logs_listening_habit'.format(folder)
    gc.collect()
    utils.to_multiple_csv(user_logs, path, split_size=8)
    del user_logs
    gc.collect()
    print('{0} done'.format(T))
def make(T):
    """Build static membership / demographic features for window T.

    T in {0, 1, 2} selects a training window ('trainW-<T>'); T == -1 is
    the test set. Membership tenure is measured from a per-window
    reference date. Writes ../feature/<folder>/membership_stat.csv.

    T = 0
    folder = 'trainW-0'
    """
    if T == -1:
        folder = 'test'
        train = pd.read_csv(
            '../input/sample_submission_v2.csv')  # here 'train' actually holds the test-set users
    else:
        folder = 'trainW-' + str(T)
        train = pd.read_csv(
            '../input/preprocessed_data/trainW-{0}.csv'.format(T))[[
                'msno'
            ]]  # we do not need is_churn
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(train)
    df = pd.merge(train, demographics, on='msno', how='left')

    # reference date used to measure tenure, one per window
    if T == 0:
        now_time = datetime.strptime('2017-03-01', '%Y-%m-%d')
    elif T == 1:
        now_time = datetime.strptime('2017-02-01', '%Y-%m-%d')
    elif T == 2:
        now_time = datetime.strptime('2017-01-01', '%Y-%m-%d')
    else:
        now_time = datetime.strptime('2017-04-01', '%Y-%m-%d')

    # assumes registration_init_time is datetime64[ns]: .tolist() yields the
    # raw nanosecond timestamp, hence / 1e9 — TODO confirm dtype upstream.
    # Missing registrations are marked with -1.
    df['how_long_has_benn_a_memmbership_of_kkbox_days'] = [
        (now_time - datetime.utcfromtimestamp(r_i_t.tolist() / 1e9)).days
        if pd.notnull(r_i_t) else -1
        for r_i_t in df.registration_init_time.values
    ]
    # 360-day "years"; the -1 missing marker propagates unchanged
    df['how_long_has_benn_a_memmbership_of_kkbox_years'] = [
        h_days / 360 if h_days != -1 else -1
        for h_days in df.how_long_has_benn_a_memmbership_of_kkbox_days.values
    ]
    df.drop('registration_init_time', axis=1, inplace=True)
    #==============================================================================
    print('one-hot encoding for dummy varaiables')
    #==============================================================================
    df = pd.get_dummies(df, columns=['city'])
    df = pd.get_dummies(df, columns=['gender'])
    df = pd.get_dummies(df, columns=['registered_via'])
    # the following's value is meaningful, so it do not need one-hot encoding
    # df = pd.get_dummies(df, columns=['city_zone'])
    # df = pd.get_dummies(df, columns=['bd_zone'])
    # df = pd.get_dummies(df, columns=['registered_via_zone'])
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(df)

    # write
    df.to_csv('../feature/{}/membership_stat.csv'.format(folder), index=False)
def make(T):
    """Build per-row listening-behavior features from the user logs.

    Derives completed / incompleted song counts, a completion ratio, a
    binary satisfaction flag, and a repeated-songs ratio, then drops the
    raw num_* columns. Writes to
    ../feature/<folder>/user_logs_listening_behavior.

    T = 0
    folder = 'trainW-0'
    """
    if T == -1:
        folder = 'test'
        user_logs = utils.read_multiple_csv(
            '../feature/{}/compressed_user_logs'.format(folder), input_col)
        user_logs = pd.concat([
            user_logs,
            pd.read_csv('../input/user_logs_v2.csv',
                        parse_dates=['date'])[input_col]
        ], ignore_index=True)
        #user_logs.sort_values(by = ['msno', 'date'],inplace = True)
    else:
        folder = 'trainW-' + str(T)
        user_logs = utils.read_multiple_csv(
            '../feature/{}/compressed_user_logs'.format(folder), input_col)
    #user_logs = user_logs[user_logs.msno == 'Pz51LVoS9ENG1kNHQyrJ3gG8A163pyHi+gyvN2p+1nM=']
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(user_logs)
    print('shape1:', user_logs.shape)
    gc.collect()

    #incompleted vs completed
    # num_100 and num_985 are treated as completed plays, num_25/50/75 as
    # incompleted (presumably fractions of track length — confirm schema)
    user_logs['num_completed_songs'] = user_logs.num_100 + user_logs.num_985
    user_logs[
        'num_incompleted_songs'] = user_logs.num_25 + user_logs.num_50 + user_logs.num_75
    user_logs['completed_songs_ratio'] = user_logs.num_completed_songs / (
        user_logs.num_incompleted_songs + user_logs.num_completed_songs)
    # satisfied = more than half of the plays were completed
    user_logs['is_satisfied'] = user_logs.completed_songs_ratio.apply(
        lambda x: 1 if x > 0.5 else 0)

    #num_repeated_songs
    # near-complete plays per unique song; > 1 implies repeats
    user_logs['num_repeated_songs'] = (user_logs.num_100 + user_logs.num_985 +
                                       user_logs.num_75) / user_logs.num_unq
    user_logs.drop(
        ['num_25', 'num_50', 'num_75', 'num_985', 'num_100', 'num_unq'],
        axis=1,
        inplace=True)
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(user_logs)
    print('shape2:', user_logs.shape)
    gc.collect()
    ##################################################
    # write
    ##################################################
    path = '../feature/{}/user_logs_listening_behavior'.format(folder)
    gc.collect()
    utils.to_multiple_csv(user_logs, path, split_size=10)
    print('{0} done'.format(T))
def multi(keys):
    """Feature 103: dense-ranked count of distinct `keys1` combinations
    observed under each `keys2` combination of the global `trte` frame.

    `keys` is a pair (keys1, keys2) with keys2 a subset of keys1.
    """
    gc.collect()
    print(keys)
    inner_keys, outer_keys = keys

    per_inner = trte.groupby(inner_keys).size()
    feature = per_inner.groupby(outer_keys).size().rank(method='dense')
    name = 'nunique_' + '-'.join(inner_keys) + '_' + '-'.join(outer_keys)
    feature.name = name

    tbl = feature.reset_index()
    utils.reduce_memory(tbl, ix_start=-1)

    merged = pd.merge(trte, tbl, on=outer_keys, how='left')
    merged.iloc[0:utils.TRAIN_SHAPE][name].to_pickle(
        '../data/103__{}_train.p'.format(name))
    merged.iloc[utils.TRAIN_SHAPE:][name].to_pickle(
        '../data/103__{}_test.p'.format(name))
    gc.collect()
def multi(keys):
    """Feature 101: total click count per `keys` combination in `trte`."""
    gc.collect()
    print(keys)
    joined = '-'.join(keys)
    name = 'totalcount_' + joined

    # df = trte.groupby(keys).size().rank(method='dense')
    counts = trte.groupby(keys).size()
    counts.name = name
    tbl = counts.reset_index()
    utils.reduce_memory(tbl, ix_start=-1)

    merged = pd.merge(trte, tbl, on=keys, how='left')
    merged.iloc[0:utils.TRAIN_SHAPE][name].to_pickle(
        '../data/101__{}_train.p'.format(joined))
    merged.iloc[utils.TRAIN_SHAPE:][name].to_pickle(
        '../data/101__{}_test.p'.format(joined))
    gc.collect()
def make(T):
    """Turn the raw num_25..num_100 play counts into within-row ratios.

    T selects a training window ('trainW-<T>'); T == -1 is the test set,
    whose logs are extended with user_logs_v2.csv. Writes the frame as
    multiple CSVs under ../feature/<folder>/user_logs_listening_freq.

    T = 0
    folder = 'trainW-0'
    """
    if T == -1:
        folder = 'test'
        user_logs = utils.read_multiple_csv(
            '../feature/{}/compressed_user_logs'.format(folder), input_col)
        user_logs = pd.concat([
            user_logs,
            pd.read_csv('../input/user_logs_v2.csv',
                        parse_dates=['date'])[input_col]
        ], ignore_index=True)
    else:
        folder = 'trainW-' + str(T)
        user_logs = utils.read_multiple_csv(
            '../feature/{}/compressed_user_logs'.format(folder), input_col)
    #user_logs = user_logs[user_logs.msno == 'Pz51LVoS9ENG1kNHQyrJ3gG8A163pyHi+gyvN2p+1nM=']
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(user_logs)
    gc.collect()
    print('shape1:', user_logs.shape)

    #get_ratio
    # divide each num_* column by the row total so the five columns sum to 1
    user_logs.loc[:, "num_25":"num_100"] = user_logs.loc[
        :, "num_25":"num_100"].div(
            user_logs.loc[:, "num_25":"num_100"].sum(axis=1), axis=0)
    user_logs.rename(columns={
        'num_25': 'num_25_ratio',
        'num_50': 'num_50_ratio',
        'num_75': 'num_75_ratio',
        'num_985': 'num_985_ratio',
        'num_100': 'num_100_ratio'
    }, inplace=True)
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(user_logs)
    gc.collect()
    ##################################################
    # write
    ##################################################
    path = '../feature/{}/user_logs_listening_freq'.format(folder)
    gc.collect()
    utils.to_multiple_csv(user_logs, path, split_size=10)
    print('{0} done'.format(T))
def multi(keys):
    """Feature 112: dense-ranked max burst size — the largest number of
    clicks sharing a single exact click_time within each `keys` group.
    """
    gc.collect()
    print(keys)
    key_list = list(keys)
    joined = '-'.join(key_list)
    name = 'sameClickTimeCount_' + joined

    per_instant = trte.groupby(key_list + ['click_time']).size()
    feature = per_instant.groupby(key_list).max().rank(method='dense')
    feature.name = name

    tbl = feature.reset_index()
    utils.reduce_memory(tbl, ix_start=-1)
    gc.collect()

    merged = pd.merge(trte, tbl, on=key_list, how='left')
    gc.collect()
    merged.iloc[0:utils.TRAIN_SHAPE][name].to_pickle(
        '../data/112__{}_train.p'.format(joined))
    merged.iloc[utils.TRAIN_SHAPE:][name].to_pickle(
        '../data/112__{}_test.p'.format(joined))
    gc.collect()
def multi(keys):
    """Feature 115: per-hour click count for each `keys` combination, plus
    its ratio to the overall per-hour traffic from `day_tbl`.
    """
    gc.collect()
    print(keys)
    base_keys = list(keys)
    joined = '-'.join(base_keys)
    count_col = 'totalCountByHour_' + joined
    ratio_col = 'totalRatioByHour_' + joined

    group_keys = base_keys + ['hour']
    counts = trte.groupby(group_keys).size()
    counts.name = count_col

    tbl = pd.merge(counts.reset_index(), day_tbl, on='hour', how='left')
    tbl[ratio_col] = tbl[count_col] / tbl['hour_freq']
    del tbl['hour_freq']
    utils.reduce_memory(tbl, ix_start=-2)

    merged = pd.merge(trte, tbl, on=group_keys, how='left')
    merged.iloc[0:utils.TRAIN_SHAPE][[count_col, ratio_col]].to_pickle(
        '../data/115__{}_train.p'.format(joined))
    merged.iloc[utils.TRAIN_SHAPE:][[count_col, ratio_col]].to_pickle(
        '../data/115__{}_test.p'.format(joined))
    gc.collect()
def concat_pred_item(T, dryrun=False):
    """Assemble the full feature matrix for one prediction window.

    T == -1 builds the test matrix (name 'test'); any other T builds
    'trainT-<T>'. Joins the X_base frame with the reorder label, then
    user / item / user-x-item features, many compressed order- and
    user-level feature files, and daytime features; derives interaction
    columns and writes the result with utils.to_pickles. When dryrun is
    True, a 9999-row sample is processed and the frame is returned.
    """
    if T == -1:
        name = 'test'
    else:
        name = 'trainT-' + str(T)
    #==============================================================================
    print('load label')
    #==============================================================================
    # NOTE: order_id is label
    print('load t3')
    X_base = pd.read_pickle('../feature/X_base_t3.p')

    label = pd.read_pickle('../feature/{}/label_reordered.p'.format(name))

    # 'inner' for removing t-n_order_id == NaN
    if 'train' in name:
        df = pd.merge(X_base[X_base.is_train == 1],
                      label,
                      on='order_id',
                      how='inner')
    elif name == 'test':
        df = pd.merge(X_base[X_base.is_train == 0],
                      label,
                      on='order_id',
                      how='inner')

    if dryrun:
        print('dryrun')
        df = df.sample(9999)

    df = pd.merge(df,
                  pd.read_pickle('../input/mk/goods.p')[[
                      'product_id', 'aisle_id', 'department_id'
                  ]],
                  on='product_id',
                  how='left')

    print('{}.shape:{}\n'.format(name, df.shape))
    #==============================================================================
    print('user feature')
    #==============================================================================
    df = user_feature(df, name)
    print('{}.shape:{}\n'.format(name, df.shape))
    #==============================================================================
    print('item feature')
    #==============================================================================
    df = item_feature(df, name)
    print('{}.shape:{}\n'.format(name, df.shape))
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(df)
    # remember where the already-reduced columns end so later
    # reduce_memory calls only touch the newly appended ones
    ix_end = df.shape[1]
    #==============================================================================
    print('user x item')
    #==============================================================================
    df = user_item_feature(df, name)
    print('{}.shape:{}\n'.format(name, df.shape))
    #==============================================================================
    print('user x item')
    #==============================================================================
    def compress(df, key):
        """Collapse a feature frame to one row per `key`.

        key: str
        Aggregates every non-object column without '_id' in its name with
        min/mean/median/max/std, then drops zero-variance aggregates.
        """
        df_ = df.drop_duplicates(key)[[key]].set_index(key)
        dtypes = df.dtypes
        col = dtypes[dtypes != 'O'].index
        col = [c for c in col if '_id' not in c]
        gr = df.groupby(key)
        for c in col:
            df_[c + '-min'] = gr[c].min()
            df_[c + '-mean'] = gr[c].mean()
            df_[c + '-median'] = gr[c].median()
            df_[c + '-max'] = gr[c].max()
            df_[c + '-std'] = gr[c].std()
        var = df_.var()
        col = var[var == 0].index
        df_.drop(col, axis=1, inplace=True)
        gc.collect()
        return df_.reset_index()

    # order-level compressed feature files
    key = 'order_id'
    feature = compress(
        pd.read_pickle('../feature/{}/f301_order-product.p'.format(name)), key)
    df = pd.merge(df, feature, on=key, how='left')
    feature = compress(
        pd.read_pickle('../feature/{}/f301_order-product_n5.p'.format(name)),
        key)
    df = pd.merge(df, feature, on=key, how='left')

    key = 'order_id'
    feature = compress(
        pd.read_pickle('../feature/{}/f302_order-product_all.p'.format(name)),
        key)
    df = pd.merge(df, feature, on=key, how='left')

    key = 'order_id'
    feature = compress(
        pd.read_pickle('../feature/{}/f303_order-product.p'.format(name)), key)
    df = pd.merge(df, feature, on=key, how='left')

    key = 'order_id'
    feature = compress(
        pd.read_pickle('../feature/{}/f304-1_order-product.p'.format(name)),
        key)
    df = pd.merge(df, feature, on=key, how='left')

    key = 'order_id'
    feature = compress(
        pd.read_pickle('../feature/{}/f304-2_order-product.p'.format(name)),
        key)
    df = pd.merge(df, feature, on=key, how='left')

    key = 'order_id'
    feature = compress(
        pd.read_pickle('../feature/{}/f304-3_order-product.p'.format(name)),
        key)
    df = pd.merge(df, feature, on=key, how='left')

    key = 'order_id'
    feature = compress(
        pd.read_pickle('../feature/{}/f305_order-product.p'.format(name)), key)
    df = pd.merge(df, feature, on=key, how='left')
    gc.collect()

    # user-level compressed feature files
    key = 'user_id'
    feature = compress(
        pd.read_pickle('../feature/{}/f306_user-product.p'.format(name)), key)
    df = pd.merge(df, feature, on=key, how='left')
    feature = compress(
        pd.read_pickle('../feature/{}/f306_user-product_n5.p'.format(name)),
        key)
    df = pd.merge(df, feature, on=key, how='left')

    key = 'user_id'
    feature = compress(
        pd.read_pickle(
            '../feature/{}/f307_user-product-timezone.p'.format(name)), key)
    df = pd.merge(df, feature, on=key, how='left')

    key = 'user_id'
    feature = compress(
        pd.read_pickle(
            '../feature/{}/f308_user-product-timezone.p'.format(name)), key)
    df = pd.merge(df, feature, on=key, how='left')

    key = 'user_id'
    feature = compress(
        pd.read_pickle('../feature/{}/f308_user-product-dow.p'.format(name)),
        key)
    df = pd.merge(df, feature, on=key, how='left')

    key = 'user_id'
    feature = compress(
        pd.read_pickle('../feature/{}/f309_user-product.p'.format(name)), key)
    df = pd.merge(df, feature, on=key, how='left')
    feature = compress(
        pd.read_pickle('../feature/{}/f309_user-product_n5.p'.format(name)),
        key)
    df = pd.merge(df, feature, on=key, how='left')

    key = 'user_id'
    feature = compress(
        pd.read_pickle('../feature/{}/f310_user-product.p'.format(name)), key)
    df = pd.merge(df, feature, on=key, how='left')

    key = 'user_id'
    feature = compress(
        pd.read_pickle('../feature/{}/f312_user_product.p'.format(name)), key)
    df = pd.merge(df, feature, on=key, how='left')
    feature = compress(
        pd.read_pickle('../feature/{}/f312_user_product_n5.p'.format(name)),
        key)
    df = pd.merge(df, feature, on=key, how='left')
    gc.collect()

    key = 'user_id'
    feature = compress(
        pd.read_pickle('../feature/{}/f313_user_aisle.p'.format(name)), key)
    df = pd.merge(df, feature, on=key, how='left')

    key = 'user_id'
    feature = compress(
        pd.read_pickle('../feature/{}/f313_user_dep.p'.format(name)), key)
    df = pd.merge(df, feature, on=key, how='left')

    key = 'user_id'
    feature = compress(
        pd.read_pickle('../feature/{}/f314_user-product.p'.format(name)), key)
    df = pd.merge(df, feature, on=key, how='left')

    key = 'order_id'
    feature = compress(
        pd.read_pickle('../feature/{}/f315-1_order-product.p'.format(name)),
        key)
    df = pd.merge(df, feature, on=key, how='left')

    key = 'order_id'
    feature = compress(
        pd.read_pickle('../feature/{}/f315-2_order-product.p'.format(name)),
        key)
    df = pd.merge(df, feature, on=key, how='left')

    key = 'order_id'
    feature = compress(
        pd.read_pickle('../feature/{}/f315-3_order-product.p'.format(name)),
        key)
    df = pd.merge(df, feature, on=key, how='left')

    key = 'order_id'
    feature = compress(
        pd.read_pickle('../feature/{}/f316_order_product.p'.format(name)),
        key)
    df = pd.merge(df, feature, on=key, how='left')
    gc.collect()

    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(df, ix_end)
    ix_end = df.shape[1]
    #==============================================================================
    print('daytime')
    #==============================================================================
    df = daytime_feature(df, name)
    print('{}.shape:{}\n'.format(name, df.shape))

    # #==============================================================================
    # print('aisle')
    # #==============================================================================
    # order_aisdep = pd.read_pickle('../input/mk/order_aisle-department.p')
    # col = [c for c in order_aisdep.columns if 'department_' in c]
    # order_aisdep.drop(col, axis=1, inplace=1)
    #
    # df = pd.merge(df, order_aisdep.add_prefix('t-1_'), on='t-1_order_id', how='left')
    # df = pd.merge(df, order_aisdep.add_prefix('t-2_'), on='t-2_order_id', how='left')
    #
    # print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('feature engineering')
    #==============================================================================
    df = pd.get_dummies(df, columns=['timezone'])
    df = pd.get_dummies(df, columns=['order_dow'])
    df = pd.get_dummies(df, columns=['order_hour_of_day'])

    # distance between the current gap and this item's typical order cycle
    df['days_near_order_cycle'] = (df.days_since_last_order_this_item -
                                   df.item_order_days_mean).abs()
    df['days_last_order-min'] = df.days_since_last_order_this_item - df.useritem_order_days_min
    df['days_last_order-max'] = df.days_since_last_order_this_item - df.useritem_order_days_max
    df['pos_cart_diff'] = (df.item_mean_pos_cart - df.useritem_mean_pos_cart)

    # basket-size deltas and ratios between the last three orders
    df['t-1_product_unq_len_diffByT-2'] = df['t-1_product_unq_len'] - df[
        't-2_product_unq_len']
    df['t-1_product_unq_len_diffByT-3'] = df['t-1_product_unq_len'] - df[
        't-3_product_unq_len']
    df['t-2_product_unq_len_diffByT-3'] = df['t-2_product_unq_len'] - df[
        't-3_product_unq_len']
    df['t-1_product_unq_len_ratioByT-2'] = df['t-1_product_unq_len'] / df[
        't-2_product_unq_len']
    df['t-1_product_unq_len_ratioByT-3'] = df['t-1_product_unq_len'] / df[
        't-3_product_unq_len']
    df['t-2_product_unq_len_ratioByT-3'] = df['t-2_product_unq_len'] / df[
        't-3_product_unq_len']

    df['T'] = T
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(df, ix_end)
    #==============================================================================
    print('output')
    #==============================================================================
    if dryrun == True:
        return df
    else:
        utils.to_pickles(df, '../feature/{}/all'.format(name), 20,
                         inplace=True)
def concat_pred_features(T):
    """Concatenate the three per-category files and attach all features.

    T == -1 builds the test set from test_*.csv.gz; any other T builds
    the train set from train_*.csv.gz. The result is written to
    ../feature/<name>/all_features.csv.gz.
    """
    if T == -1:
        name = 'test'
        # concat
        df0 = pd.read_csv('../input/test_mobile.csv.gz', compression='gzip')
        df1 = pd.read_csv('../input/test_female_fashion.csv.gz',
                          compression='gzip')
        df2 = pd.read_csv('../input/test_male_fashion.csv.gz',
                          compression='gzip')
        df = pd.concat([df0, df1, df2], ignore_index=True)
    else:
        name = 'train'
        #==============================================================================
        print('load label')
        #==============================================================================
        # concat
        df0 = pd.read_csv('../input/train_mobile.csv.gz', compression='gzip')
        df1 = pd.read_csv('../input/train_female_fashion.csv.gz',
                          compression='gzip')
        df2 = pd.read_csv('../input/train_male_fashion.csv.gz',
                          compression='gzip')
        df = pd.concat([df0, df1, df2], ignore_index=True)
    #==============================================================================
    print('word_given_product_name feature')
    #==============================================================================
    df = word_given_product_name_feature(df, name)
    print('{}.shape: {}'.format(name, df.shape))
    #==============================================================================
    print('semantic_feature')
    #==============================================================================
    df = semantic_feature(df, name)
    print('{}.shape: {}'.format(name, df.shape))
    #==============================================================================
    print('hot search feature')
    #==============================================================================
    df = hot_search_count(df, name)
    print('{}.shape: {}'.format(name, df.shape))
    #==============================================================================
    print('feature engineering')
    #==============================================================================
    df = pd.get_dummies(df, columns=['Category'])
    df.drop_duplicates(['Product Name', 'words'], inplace=True)
    print('{}.shape: {}'.format(name, df.shape))

    # some features with largest, we perform log transformation to them
    # (log1p-style transform for heavy-tailed count columns; the raw
    # column is dropped afterwards)
    for col in df.columns:
        if 'count' in col and df[col].max() > 100:
            df['log_{}'.format(col)] = np.log(df[col] + 1)  # smoothing
            df.drop(col, axis=1, inplace=True)

    if name == 'train':
        #==============================================================================
        print('reduce memory')
        #==============================================================================
        utils.reduce_memory(df)
    # NOTE(review): reduce_memory runs for the train set only — confirm
    # whether the test frame should be reduced as well.
    #==============================================================================
    print('output')
    #==============================================================================
    df.to_csv('../feature/{}/all_features.csv.gz'.format(name),
              index=False,
              compression='gzip')
def run_pipeline(use_pickled_features=False, debug=False):
    """Run the complete pipeline.

    Each data source is either rebuilt from the raw csv files or loaded
    from a previously pickled feature frame, merged onto the main
    application frame on SK_ID_CURR, and finally fed to LightGBM.

    Arguments:
        use_pickled_features: Use features saved as pickle files (boolean, default: False).
        debug: Run pipeline with a subset of data (boolean, default: False)
    """
    num_rows = 30000 if debug else None  # Subset of data for debugging
    # Preprocess and extract features from each csv file
    with utils.timer("Application data"):
        if use_pickled_features:
            df = pd.read_pickle(
                os.path.join(config.PICKLED_DATA_DIRECTORY, 'application.pkl'))
        else:
            df = application_pipeline.get_train_test(config.DATA_DIRECTORY,
                                                     num_rows=num_rows)
    with utils.timer("Bureau data"):
        if use_pickled_features:
            bureau_df = pd.read_pickle(
                os.path.join(config.PICKLED_DATA_DIRECTORY,
                             'bureau_and_balance.pkl'))
        else:
            bureau_df = bureau_pipeline.get_bureau(config.DATA_DIRECTORY,
                                                   num_rows=num_rows)
        df = pd.merge(df, bureau_df, on='SK_ID_CURR', how='left')
        del bureau_df
        gc.collect()
    with utils.timer("Previous application data"):
        if use_pickled_features:
            prev_df = pd.read_pickle(
                os.path.join(config.PICKLED_DATA_DIRECTORY, 'previous.pkl'))
        else:
            prev_df = previous_pipeline.get_previous_applications(
                config.DATA_DIRECTORY, num_rows)
        df = pd.merge(df, prev_df, on='SK_ID_CURR', how='left')
        del prev_df
        gc.collect()
    with utils.timer("Previous balance data"):
        # POS/cash balance, installment payments and credit card balance
        # are all merged within this one timed section
        if use_pickled_features:
            pos = pd.read_pickle(
                os.path.join(config.PICKLED_DATA_DIRECTORY, 'pos_cash.pkl'))
        else:
            pos = previous_balance_pipeline.get_pos_cash(
                config.DATA_DIRECTORY, num_rows)
        df = pd.merge(df, pos, on='SK_ID_CURR', how='left')
        del pos
        gc.collect()
        if use_pickled_features:
            ins = pd.read_pickle(
                os.path.join(config.PICKLED_DATA_DIRECTORY, 'payments.pkl'))
        else:
            ins = previous_balance_pipeline.get_installment_payments(
                config.DATA_DIRECTORY, num_rows)
        df = pd.merge(df, ins, on='SK_ID_CURR', how='left')
        del ins
        gc.collect()
        if use_pickled_features:
            cc = pd.read_pickle(
                os.path.join(config.PICKLED_DATA_DIRECTORY, 'credit_card.pkl'))
        else:
            cc = previous_balance_pipeline.get_credit_card(
                config.DATA_DIRECTORY, num_rows)
        df = pd.merge(df, cc, on='SK_ID_CURR', how='left')
        del cc
        gc.collect()
    # Add ratios and groupby between different tables
    with utils.timer('Add extra features'):
        df = other_features.add_ratio_features(df)
        df = other_features.add_groupby_features(df)
    # Reduce memory usage
    df = utils.reduce_memory(df)
    # List categorical features for LightGBM partitioning mechanism (Fisher 1958)
    lgbm_categorical_feat = [
        'CODE_GENDER', 'FLAG_OWN_CAR', 'NAME_CONTRACT_TYPE',
        'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE',
        'NAME_INCOME_TYPE', 'OCCUPATION_TYPE', 'ORGANIZATION_TYPE',
        'WEEKDAY_APPR_PROCESS_START'
    ]
    with utils.timer("Run LightGBM"):
        model.kfold_lightgbm_sklearn(df, lgbm_categorical_feat)
def _pickle_file(df, file_name):
    """Shrink `df`'s memory footprint, pickle it into the configured
    pickle directory under `file_name`, and log the saved shape."""
    df = utils.reduce_memory(df)
    target = os.path.join(config.PICKLED_DATA_DIRECTORY, file_name)
    df.to_pickle(target)
    print("Saved as {} - frame shape: {}".format(file_name, df.shape))
def make(T):
    """Write per-user transaction price/plan statistics for window T.

    T = 0
    folder = 'trainW-0'

    T in {0, 1, 2} selects a training window whose history is every
    transaction strictly before that window's cutoff date; T == -1 is the
    test set (history before 2017-04-01).  For the full history, the last
    5 orders ('_n5') and the last order only ('_n1'), writes per-user
    mean/min/max/median/std of discount, amt_per_day and cp_value plus an
    is_discount count/ratio table as CSVs under ../feature/<folder>/.

    Refactored: the original repeated the identical
    groupby -> stats -> reduce_memory -> to_csv sequence twelve times
    (3 history slices x 4 tables); it is now a single helper applied to
    the three slices.  Output file names and column names are unchanged.
    """
    if T == -1:
        folder = 'test'
        # this 'train' frame actually holds the test-set users
        train = pd.read_csv('../input/sample_submission_v2.csv')
    else:
        folder = 'trainW-' + str(T)
        train = pd.read_csv(
            '../input/preprocessed_data/trainW-{0}.csv'.format(T))[['msno']]

    # history cutoff per window: w = 1 uses data before February,
    # w = 2 before January, the test window before April
    cutoffs = {0: '2017-03-01', 1: '2017-02-01', 2: '2017-01-01',
               -1: '2017-04-01'}
    cutoff = datetime.strptime(cutoffs[T], '%Y-%m-%d')
    df = pd.merge(
        train,
        transactions_price_plan_days[
            transactions_price_plan_days.transaction_date < cutoff],
        on='msno',
        how='left')
    del train
    gc.collect()

    def _write_features(d, suffix):
        """Write the four per-user feature tables for one history slice.

        suffix: '' (all history), '_n5' (last 5 orders) or '_n1' (last
        order only); appended to both column names and file names.
        """
        # distribution stats of each numeric transaction column
        for c in ['discount', 'amt_per_day', 'cp_value']:
            gr = d.groupby('msno')[c]
            tbl = gr.mean().to_frame()
            tbl.columns = ['{}-mean{}'.format(c, suffix)]
            tbl['{}-min{}'.format(c, suffix)] = gr.min()
            tbl['{}-max{}'.format(c, suffix)] = gr.max()
            tbl['{}-median{}'.format(c, suffix)] = gr.median()
            tbl['{}-std{}'.format(c, suffix)] = gr.std()
            tbl.reset_index(inplace=True)
            #==============================================================================
            print('reduce memory')
            #==============================================================================
            utils.reduce_memory(tbl)
            # write
            tbl.to_csv('../feature/{}/{}{}.csv'.format(folder, c, suffix),
                       index=False)
        # how often (count and ratio) the user bought at a discount
        gr = d.groupby('msno').is_discount
        tbl = gr.sum().to_frame()
        tbl.columns = ['is_discount_total_count' + suffix]
        tbl['is_discount_total_count_ratio' + suffix] = gr.mean()
        tbl.reset_index(inplace=True)
        #==============================================================================
        print('reduce memory')
        #==============================================================================
        utils.reduce_memory(tbl)
        # write
        tbl.to_csv('../feature/{}/is_discount{}.csv'.format(folder, suffix),
                   index=False)

    ##################################################
    # All history
    ##################################################
    _write_features(df, '')

    ##################################################
    # near 5
    ##################################################
    df_ = df.groupby('msno').apply(near, 5).reset_index(drop=True)
    _write_features(df_, '_n5')
    del df_
    gc.collect()

    ##################################################
    # only one previous order
    ##################################################
    df_ = df.groupby('msno').apply(near, 1).reset_index(drop=True)
    _write_features(df_, '_n1')
    del df_
    gc.collect()
def _loyalty_count_ratio(df, col, folder, suffix=''):
    """Write per-user count/ratio aggregates of a binary flag column.

    Produces columns '<col>_count<suffix>' (sum) and '<col>_ratio<suffix>'
    (mean) grouped by msno, and writes them to
    '../feature/<folder>/<col><suffix>.csv'.
    """
    gr = df.groupby('msno')[col]  # group once, reuse for both aggregates
    tbl = gr.sum().to_frame()
    tbl.columns = ['{}_count{}'.format(col, suffix)]
    tbl['{}_ratio{}'.format(col, suffix)] = gr.mean()
    tbl.reset_index(inplace=True)
    #==========================================================================
    print('reduce memory')
    #==========================================================================
    utils.reduce_memory(tbl)
    # write
    tbl.to_csv('../feature/{}/{}{}.csv'.format(folder, col, suffix),
               index=False)


def _loyalty_stats(df, col, folder, suffix=''):
    """Write per-user mean/min/max/median/std aggregates of a numeric column.

    Produces columns '<col>-<stat><suffix>' grouped by msno and writes them
    to '../feature/<folder>/<col><suffix>.csv'.
    """
    gr = df.groupby('msno')[col]  # group once, reuse for all five stats
    tbl = gr.mean().to_frame()
    tbl.columns = ['{}-mean{}'.format(col, suffix)]
    tbl['{}-min{}'.format(col, suffix)] = gr.min()
    tbl['{}-max{}'.format(col, suffix)] = gr.max()
    tbl['{}-median{}'.format(col, suffix)] = gr.median()
    tbl['{}-std{}'.format(col, suffix)] = gr.std()
    tbl.reset_index(inplace=True)
    #==========================================================================
    print('reduce memory')
    #==========================================================================
    utils.reduce_memory(tbl)
    # write
    tbl.to_csv('../feature/{}/{}{}.csv'.format(folder, col, suffix),
               index=False)


def make(T):
    """Build membership-loyalty behaviour features for time window T.

    T == -1 selects the test set (folder 'test', users taken from the
    sample submission); otherwise folder 'trainW-<T>' and users come from
    '../input/preprocessed_data/trainW-<T>.csv'.  For each feature one CSV
    is written for the full history and for the last 5 ('_n5') and last 1
    ('_n1') transactions per user.
    """
    if T == -1:
        folder = 'test'
        # this "train" frame actually holds the test users
        train = pd.read_csv('../input/sample_submission_v2.csv')
        train['w'] = T
        membership_loyalty = utils.read_multiple_csv(
            '../feature/{}/days_since_the_last_transactions'.format(folder),
            input_col)
    else:
        folder = 'trainW-' + str(T)
        membership_loyalty = utils.read_multiple_csv(
            '../feature/{}/days_since_the_last_transactions'.format(folder),
            input_col)
        # we do not need is_churn
        train = pd.read_csv(
            '../input/preprocessed_data/trainW-{0}.csv'.format(T))[[
                'msno', 'w'
            ]]
    #==========================================================================
    print('reduce memory')
    #==========================================================================
    utils.reduce_memory(membership_loyalty)

    # attach each user's loyalty rows to the user/window list
    df = pd.merge(train, membership_loyalty, on=['msno', 'w'], how='left')

    ##################################################
    # All history
    ##################################################
    _loyalty_count_ratio(df, 'is_subscribe_early', folder)
    _loyalty_count_ratio(df, 'do_change_payment_method', folder)
    _loyalty_stats(df, 'do_spend_more_money', folder)
    _loyalty_stats(df, 'do_extend_payment_days', folder)
    _loyalty_count_ratio(df, 'do_paid_more', folder)

    ##################################################
    # near 5
    ##################################################
    df_ = df.groupby('msno').apply(near, 5).reset_index(drop=True)
    _loyalty_count_ratio(df_, 'is_subscribe_early', folder, '_n5')
    _loyalty_count_ratio(df_, 'do_change_payment_method', folder, '_n5')
    _loyalty_stats(df_, 'do_spend_more_money', folder, '_n5')
    _loyalty_stats(df_, 'do_extend_payment_days', folder, '_n5')
    _loyalty_count_ratio(df_, 'do_paid_more', folder, '_n5')
    del df_
    gc.collect()

    ##################################################
    # only one previous order
    ##################################################
    df_ = df.groupby('msno').apply(near, 1).reset_index(drop=True)
    _loyalty_count_ratio(df_, 'is_subscribe_early', folder, '_n1')
    _loyalty_count_ratio(df_, 'do_change_payment_method', folder, '_n1')
    _loyalty_stats(df_, 'do_spend_more_money', folder, '_n1')
    _loyalty_stats(df_, 'do_extend_payment_days', folder, '_n1')
    _loyalty_count_ratio(df_, 'do_paid_more', folder, '_n1')
    del df_
    gc.collect()
def _days_since_stats(df, col, folder, suffix=''):
    """Write per-user mean/min/max/median/std aggregates of a numeric column.

    Produces columns '<col>-<stat><suffix>' grouped by msno and writes them
    to '../feature/<folder>/<col><suffix>.csv'.
    """
    gr = df.groupby('msno')[col]  # group once, reuse for all five stats
    tbl = gr.mean().to_frame()
    tbl.columns = ['{}-mean{}'.format(col, suffix)]
    tbl['{}-min{}'.format(col, suffix)] = gr.min()
    tbl['{}-max{}'.format(col, suffix)] = gr.max()
    tbl['{}-median{}'.format(col, suffix)] = gr.median()
    tbl['{}-std{}'.format(col, suffix)] = gr.std()
    tbl.reset_index(inplace=True)
    #==========================================================================
    print('reduce memory')
    #==========================================================================
    utils.reduce_memory(tbl)
    # write
    tbl.to_csv('../feature/{}/{}{}.csv'.format(folder, col, suffix),
               index=False)


def make(T):
    """Build days-since-* loyalty features for time window T.

    T == -1 selects the test set (folder 'test', users taken from the
    sample submission); otherwise folder 'trainW-<T>' and users come from
    '../input/preprocessed_data/trainW-<T>.csv'.  For each of six
    days-since columns one stats CSV is written for the full history and
    for the last 5 ('_n5') and last 1 ('_n1') transactions per user.
    """
    if T == -1:
        folder = 'test'
        # this "train" frame actually holds the test users
        train = pd.read_csv('../input/sample_submission_v2.csv')
        train['w'] = T
        membership_loyalty = utils.read_multiple_csv(
            '../feature/{}/days_since_the_last_transactions'.format(folder),
            input_col)
    else:
        folder = 'trainW-' + str(T)
        membership_loyalty = utils.read_multiple_csv(
            '../feature/{}/days_since_the_last_transactions'.format(folder),
            input_col)
        # we do not need is_churn
        train = pd.read_csv(
            '../input/preprocessed_data/trainW-{0}.csv'.format(T))[[
                'msno', 'w'
            ]]
    #==========================================================================
    print('reduce memory')
    #==========================================================================
    utils.reduce_memory(membership_loyalty)

    # attach each user's loyalty rows to the user/window list
    df = pd.merge(train, membership_loyalty, on=['msno', 'w'], how='left')

    # one stats CSV per column; order matches the original core1..core6
    cols = [
        'days_since_the_last_expiration',
        'days_since_the_last_subscription',
        'days_since_the_last_expiration-cumsum',
        'days_since_the_last_expiration_ratio',
        'days_since_the_last_subscription_ratio',
        'days_since_the_first_subscription',
    ]

    ##################################################
    # All history
    ##################################################
    for c in cols:
        _days_since_stats(df, c, folder)

    ##################################################
    # near 5
    ##################################################
    df_ = df.groupby('msno').apply(near, 5).reset_index(drop=True)
    for c in cols:
        _days_since_stats(df_, c, folder, '_n5')
    del df_

    ##################################################
    # only one previous order
    ##################################################
    df_ = df.groupby('msno').apply(near, 1).reset_index(drop=True)
    for c in cols:
        _days_since_stats(df_, c, folder, '_n1')
    del df_
input_col = ['msno', 'transaction_date', 'is_auto_renew'] transactions = utils.read_multiple_csv( '../input/preprocessed_data/transactions', input_col) # 20,000,000 #transactions = transactions.head(n = 5000) ################################################## # Convert string to datetime format ################################################## transactions['transaction_date'] = transactions.transaction_date.apply( lambda x: datetime.strptime(x, '%Y-%m-%d')) #============================================================================== print('reduce memory') #============================================================================== utils.reduce_memory(transactions) def near(x, keep=5): return x.tail(keep) def make_order_number(x): x['order_number'] = [i + 1 for i in range(x.shape[0])] return x #============================================================================== # def #============================================================================== def make(T):
def concat_pred_item(T, dryrun=False):
    """Assemble the full feature matrix for fold T and dump it as pickles.

    T == -1 builds the test set, otherwise the fold named 'trainT-<T>'.
    With dryrun=True a 9999-row sample is built and the DataFrame is
    returned instead of being written to ../feature/<name>/all.
    """
    if T == -1:
        name = 'test'
    else:
        name = 'trainT-' + str(T)
    #==========================================================================
    print('load label')
    #==========================================================================
    # NOTE: order_id is label
    print('load t3')
    X_base = pd.read_pickle('../feature/X_base_t3.p')
    label = pd.read_pickle('../feature/{}/label_reordered.p'.format(name))

    # 'inner' for removing t-n_order_id == NaN
    if 'train' in name:
        df = pd.merge(X_base[X_base.is_train == 1], label,
                      on='order_id', how='inner')
    elif name == 'test':
        df = pd.merge(X_base[X_base.is_train == 0], label,
                      on='order_id', how='inner')

    if dryrun:
        print('dryrun')
        df = df.sample(9999)

    df = pd.merge(df,
                  pd.read_pickle('../input/mk/goods.p')[
                      ['product_id', 'aisle_id', 'department_id']],
                  on='product_id', how='left')
    print('{}.shape:{}\n'.format(name, df.shape))

    #==========================================================================
    print('user feature')
    #==========================================================================
    df = user_feature(df, name)
    print('{}.shape:{}\n'.format(name, df.shape))

    #==========================================================================
    print('item feature')
    #==========================================================================
    df = item_feature(df, name)
    print('{}.shape:{}\n'.format(name, df.shape))

    #==========================================================================
    print('reduce memory')
    #==========================================================================
    utils.reduce_memory(df)
    ix_end = df.shape[1]

    #==========================================================================
    print('user x item')
    #==========================================================================
    df = user_item_feature(df, name)
    print('{}.shape:{}\n'.format(name, df.shape))

    def compress(df, key):
        """Aggregate every non-object, non-id column of df per `key`
        (min/mean/median/max/std), dropping zero-variance aggregates.

        key : str
        """
        df_ = df.drop_duplicates(key)[[key]].set_index(key)
        dtypes = df.dtypes
        col = dtypes[dtypes != 'O'].index
        col = [c for c in col if '_id' not in c]
        gr = df.groupby(key)
        for c in col:
            df_[c + '-min'] = gr[c].min()
            df_[c + '-mean'] = gr[c].mean()
            df_[c + '-median'] = gr[c].median()
            df_[c + '-max'] = gr[c].max()
            df_[c + '-std'] = gr[c].std()
        # constant aggregates carry no information — drop them
        var = df_.var()
        col = var[var == 0].index
        df_.drop(col, axis=1, inplace=True)
        gc.collect()
        return df_.reset_index()

    # (merge key, pickle stem) pairs, merged in the original order.
    feature_files = [
        ('order_id', 'f301_order-product'),
        ('order_id', 'f301_order-product_n5'),
        ('order_id', 'f302_order-product_all'),
        ('order_id', 'f303_order-product'),
        ('order_id', 'f304-1_order-product'),
        ('order_id', 'f304-2_order-product'),
        ('order_id', 'f304-3_order-product'),
        ('order_id', 'f305_order-product'),
        ('user_id', 'f306_user-product'),
        ('user_id', 'f306_user-product_n5'),
        ('user_id', 'f307_user-product-timezone'),
        ('user_id', 'f308_user-product-timezone'),
        ('user_id', 'f308_user-product-dow'),
        ('user_id', 'f309_user-product'),
        ('user_id', 'f309_user-product_n5'),
        ('user_id', 'f310_user-product'),
        ('user_id', 'f312_user_product'),
        ('user_id', 'f312_user_product_n5'),
        ('user_id', 'f313_user_aisle'),
        ('user_id', 'f313_user_dep'),
        ('user_id', 'f314_user-product'),
        ('order_id', 'f315-1_order-product'),
        ('order_id', 'f315-2_order-product'),
        ('order_id', 'f315-3_order-product'),
        ('order_id', 'f316_order_product'),
    ]
    for key, stem in feature_files:
        feature = compress(
            pd.read_pickle('../feature/{}/{}.p'.format(name, stem)), key)
        df = pd.merge(df, feature, on=key, how='left')
        gc.collect()

    #==========================================================================
    print('reduce memory')
    #==========================================================================
    utils.reduce_memory(df, ix_end)
    ix_end = df.shape[1]

    #==========================================================================
    print('daytime')
    #==========================================================================
    df = daytime_feature(df, name)
    print('{}.shape:{}\n'.format(name, df.shape))

    #==========================================================================
    print('feature engineering')
    #==========================================================================
    df = pd.get_dummies(df, columns=['timezone'])
    df = pd.get_dummies(df, columns=['order_dow'])
    df = pd.get_dummies(df, columns=['order_hour_of_day'])
    # distance of the current gap from the item's typical reorder cycle
    df['days_near_order_cycle'] = (df.days_since_last_order_this_item -
                                   df.item_order_days_mean).abs()
    df['days_last_order-min'] = (df.days_since_last_order_this_item -
                                 df.useritem_order_days_min)
    df['days_last_order-max'] = (df.days_since_last_order_this_item -
                                 df.useritem_order_days_max)
    df['pos_cart_diff'] = (df.item_mean_pos_cart - df.useritem_mean_pos_cart)
    # basket-size comparisons between the last three orders
    # (all diffs first, then all ratios, to preserve the original column order)
    pairs = [('1', '2'), ('1', '3'), ('2', '3')]
    for a, b in pairs:
        df['t-{}_product_unq_len_diffByT-{}'.format(a, b)] = (
            df['t-{}_product_unq_len'.format(a)] -
            df['t-{}_product_unq_len'.format(b)])
    for a, b in pairs:
        df['t-{}_product_unq_len_ratioByT-{}'.format(a, b)] = (
            df['t-{}_product_unq_len'.format(a)] /
            df['t-{}_product_unq_len'.format(b)])
    df['T'] = T

    #==========================================================================
    print('reduce memory')
    #==========================================================================
    utils.reduce_memory(df, ix_end)

    #==========================================================================
    print('output')
    #==========================================================================
    if dryrun:
        return df
    utils.to_pickles(df, '../feature/{}/all'.format(name), 20, inplace=True)
def concat_pred_features(T): if T == -1: name = 'test' train = pd.read_csv( '../input/sample_submission_v2.csv') # 此train代表的是test的user else: #============================================================================== print('load label') #============================================================================== name = 'trainW-' + str(T) train = pd.read_csv( '../input/preprocessed_data/trainW-{0}.csv'.format(T))[[ 'msno', 'is_churn' ]] #train = train.head( n = 500) #============================================================================== print('transactions feature') #============================================================================== df = transactions_feature(train, name) print('{}.shape:{}\n'.format(name, df.shape)) #============================================================================== print('reduce memory') #============================================================================== utils.reduce_memory(df) ix_end = df.shape[1] #============================================================================== print('members feature') #============================================================================== df = members_feature(df, name) print('{}.shape:{}\n'.format(name, df.shape)) #============================================================================== print('user_logs feature') #============================================================================== df = user_logs_feature(df, name) df.replace( np.inf, 0, inplace=True) # It may destroy feature but forget it. 
just noise print('{}.shape:{}\n'.format(name, df.shape)) #============================================================================== print('reduce memory') #============================================================================== utils.reduce_memory(df, ix_end) ix_end = df.shape[1] #============================================================================== print('feature engineering') #============================================================================== # delta:反應這個值隨者時間的變化量(future - history) # delta1: time difference = 1 # delta2: time difference = 2 # bynum: start from the num #### #num_100_ratio-mean #### #delta1 df['num_100_ratio_delta1_by7'] = df['num_100_ratio_during_t_7-mean'] - df[ 'num_100_ratio_during_t_14-mean'] df['num_100_ratio_delta1_by14'] = df[ 'num_100_ratio_during_t_14-mean'] - df['num_100_ratio_during_t_30-mean'] df['num_100_ratio_delta1_by30'] = df[ 'num_100_ratio_during_t_30-mean'] - df['num_100_ratio_during_t_60-mean'] df['num_100_ratio_delta1_by60'] = df[ 'num_100_ratio_during_t_60-mean'] - df['num_100_ratio_during_t_90-mean'] df['num_100_ratio_delta1_by90'] = df[ 'num_100_ratio_during_t_90-mean'] - df['num_100_ratio-mean'] #delta2 df['num_100_ratio_delta2_by7'] = df['num_100_ratio_during_t_7-mean'] - df[ 'num_100_ratio_during_t_30-mean'] df['num_100_ratio_delta2_by14'] = df[ 'num_100_ratio_during_t_14-mean'] - df['num_100_ratio_during_t_60-mean'] df['num_100_ratio_delta2_by30'] = df[ 'num_100_ratio_during_t_30-mean'] - df['num_100_ratio_during_t_90-mean'] df['num_100_ratio_delta2_by60'] = df[ 'num_100_ratio_during_t_60-mean'] - df['num_100_ratio-mean'] #### #num_25_ratio-mean #### #delta1 df['num_25_ratio_delta1_by7'] = df['num_25_ratio_during_t_7-mean'] - df[ 'num_25_ratio_during_t_14-mean'] df['num_25_ratio_delta1_by14'] = df['num_25_ratio_during_t_14-mean'] - df[ 'num_25_ratio_during_t_30-mean'] df['num_25_ratio_delta1_by30'] = df['num_25_ratio_during_t_30-mean'] - df[ 'num_25_ratio_during_t_60-mean'] 
df['num_25_ratio_delta1_by60'] = df['num_25_ratio_during_t_60-mean'] - df[ 'num_25_ratio_during_t_90-mean'] df['num_25_ratio_delta1_by90'] = df['num_25_ratio_during_t_90-mean'] - df[ 'num_25_ratio-mean'] #delta2 df['num_25_ratio_delta2_by7'] = df['num_25_ratio_during_t_7-mean'] - df[ 'num_25_ratio_during_t_30-mean'] df['num_25_ratio_delta2_by14'] = df['num_25_ratio_during_t_14-mean'] - df[ 'num_25_ratio_during_t_60-mean'] df['num_25_ratio_delta2_by30'] = df['num_25_ratio_during_t_30-mean'] - df[ 'num_25_ratio_during_t_90-mean'] df['num_25_ratio_delta2_by60'] = df['num_25_ratio_during_t_60-mean'] - df[ 'num_25_ratio-mean'] #### #num_repeated_songs-mean #### #delta1 df['num_repeated_songs_delta1_by7'] = df[ 'num_repeated_songs_during_t_7-mean'] - df[ 'num_repeated_songs_during_t_14-mean'] df['num_repeated_songs_delta1_by14'] = df[ 'num_repeated_songs_during_t_14-mean'] - df[ 'num_repeated_songs_during_t_30-mean'] df['num_repeated_songs_delta1_by30'] = df[ 'num_repeated_songs_during_t_30-mean'] - df[ 'num_repeated_songs_during_t_60-mean'] df['num_repeated_songs_delta1_by60'] = df[ 'num_repeated_songs_during_t_60-mean'] - df[ 'num_repeated_songs_during_t_90-mean'] df['num_repeated_songs_delta1_by90'] = df[ 'num_repeated_songs_during_t_90-mean'] - df['num_repeated_songs-mean'] #delta2 df['num_repeated_songs_delta2_by7'] = df[ 'num_repeated_songs_during_t_7-mean'] - df[ 'num_repeated_songs_during_t_30-mean'] df['num_repeated_songs_delta2_by14'] = df[ 'num_repeated_songs_during_t_14-mean'] - df[ 'num_repeated_songs_during_t_60-mean'] df['num_repeated_songs_delta2_by30'] = df[ 'num_repeated_songs_during_t_30-mean'] - df[ 'num_repeated_songs_during_t_90-mean'] df['num_repeated_songs_delta2_by60'] = df[ 'num_repeated_songs_during_t_60-mean'] - df['num_repeated_songs-mean'] #### #completed_songs_ratio #### #delta1 df['completed_songs_ratio_delta1_by7'] = df[ 'completed_songs_ratio_during_t_7-mean'] - df[ 'completed_songs_ratio_during_t_14-mean'] 
df['completed_songs_ratio_delta1_by14'] = df[ 'completed_songs_ratio_during_t_14-mean'] - df[ 'completed_songs_ratio_during_t_30-mean'] df['completed_songs_ratio_delta1_by30'] = df[ 'completed_songs_ratio_during_t_30-mean'] - df[ 'completed_songs_ratio_during_t_60-mean'] df['completed_songs_ratio_delta1_by60'] = df[ 'completed_songs_ratio_during_t_60-mean'] - df[ 'completed_songs_ratio_during_t_90-mean'] df['completed_songs_ratio_delta1_by90'] = df[ 'completed_songs_ratio_during_t_90-mean'] - df[ 'completed_songs_ratio-mean'] #delta2 df['completed_songs_ratio_delta2_by7'] = df[ 'completed_songs_ratio_during_t_7-mean'] - df[ 'completed_songs_ratio_during_t_30-mean'] df['completed_songs_ratio_delta2_by14'] = df[ 'completed_songs_ratio_during_t_14-mean'] - df[ 'completed_songs_ratio_during_t_60-mean'] df['completed_songs_ratio_delta2_by30'] = df[ 'completed_songs_ratio_during_t_30-mean'] - df[ 'completed_songs_ratio_during_t_90-mean'] df['completed_songs_ratio_delta2_by60'] = df[ 'completed_songs_ratio_during_t_60-mean'] - df[ 'completed_songs_ratio-mean'] #### #listen_music_in_a_row_count #### #delta1 df['listen_music_in_a_row_count_delta1_by7'] = df[ 'listen_music_in_a_row_count_during_t_7'] - df[ 'listen_music_in_a_row_count_during_t_14'] df['listen_music_in_a_row_count_delta1_by14'] = df[ 'listen_music_in_a_row_count_during_t_14'] - df[ 'listen_music_in_a_row_count_during_t_30'] df['listen_music_in_a_row_count_delta1_by30'] = df[ 'listen_music_in_a_row_count_during_t_30'] - df[ 'listen_music_in_a_row_count_during_t_60'] df['listen_music_in_a_row_count_delta1_by60'] = df[ 'listen_music_in_a_row_count_during_t_60'] - df[ 'listen_music_in_a_row_count_during_t_90'] #delta2 df['listen_music_in_a_row_count_delta2_by7'] = df[ 'listen_music_in_a_row_count_during_t_7'] - df[ 'listen_music_in_a_row_count_during_t_30'] df['listen_music_in_a_row_count_delta2_by14'] = df[ 'listen_music_in_a_row_count_during_t_14'] - df[ 'listen_music_in_a_row_count_during_t_60'] 
df['listen_music_in_a_row_count_delta2_by30'] = df[ 'listen_music_in_a_row_count_during_t_30'] - df[ 'listen_music_in_a_row_count_during_t_90'] #### #listen_music_in_a_row_ratio #### #delta1 df['listen_music_in_a_row_ratio_delta1_by7'] = df[ 'listen_music_in_a_row_ratio_during_t_7'] - df[ 'listen_music_in_a_row_ratio_during_t_14'] df['listen_music_in_a_row_ratio_delta1_by14'] = df[ 'listen_music_in_a_row_ratio_during_t_14'] - df[ 'listen_music_in_a_row_ratio_during_t_30'] df['listen_music_in_a_row_ratio_delta1_by30'] = df[ 'listen_music_in_a_row_ratio_during_t_30'] - df[ 'listen_music_in_a_row_ratio_during_t_60'] df['listen_music_in_a_row_ratio_delta1_by60'] = df[ 'listen_music_in_a_row_ratio_during_t_60'] - df[ 'listen_music_in_a_row_ratio_during_t_90'] #delta2 df['listen_music_in_a_row_ratio_delta2_by7'] = df[ 'listen_music_in_a_row_ratio_during_t_7'] - df[ 'listen_music_in_a_row_ratio_during_t_30'] df['listen_music_in_a_row_ratio_delta2_by14'] = df[ 'listen_music_in_a_row_ratio_during_t_14'] - df[ 'listen_music_in_a_row_ratio_during_t_60'] df['listen_music_in_a_row_ratio_delta2_by30'] = df[ 'listen_music_in_a_row_ratio_during_t_30'] - df[ 'listen_music_in_a_row_ratio_during_t_90'] #### #date_diff-mean #### #delta1 df['date_diff_delta1_by7'] = df['date_diff_during_t_7-mean'] - df[ 'date_diff_during_t_14-mean'] df['date_diff_delta1_by14'] = df['date_diff_during_t_14-mean'] - df[ 'date_diff_during_t_30-mean'] df['date_diff_delta1_by30'] = df['date_diff_during_t_30-mean'] - df[ 'date_diff_during_t_60-mean'] df['date_diff_delta1_by60'] = df['date_diff_during_t_60-mean'] - df[ 'date_diff_during_t_90-mean'] df['date_diff_delta1_by90'] = df['date_diff_during_t_90-mean'] - df[ 'date_diff-mean'] #delta2 df['date_diff_delta2_by7'] = df['date_diff_during_t_7-mean'] - df[ 'completed_songs_ratio_during_t_30-mean'] df['date_diff_delta2_by14'] = df['date_diff_during_t_14-mean'] - df[ 'date_diff_during_t_60-mean'] df['date_diff_delta2_by30'] = df['date_diff_during_t_30-mean'] - 
df[ 'date_diff_during_t_90-mean'] df['date_diff_delta2_by60'] = df['date_diff_during_t_60-mean'] - df[ 'date_diff-mean'] #### #num_log_in #### #delta1 df['num_log_in_delta1_by7'] = df['num_log_in_during_t_7'] - df[ 'num_log_in_during_t_14'] df['num_log_in_delta1_by14'] = df['num_log_in_during_t_14'] - df[ 'num_log_in_during_t_30'] df['num_log_in_delta1_by30'] = df['num_log_in_during_t_30'] - df[ 'num_log_in_during_t_60'] df['num_log_in_delta1_by60'] = df['num_log_in_during_t_60'] - df[ 'num_log_in_during_t_90'] df['num_log_in_delta1_by90'] = df['num_log_in_during_t_90'] - df[ 'num_log_in'] #delta2 df['num_log_in_delta2_by7'] = df['num_log_in_during_t_7'] - df[ 'num_log_in_during_t_30'] df['num_log_in_delta2_by14'] = df['num_log_in_during_t_14'] - df[ 'num_log_in_during_t_60'] df['num_log_in_delta2_by30'] = df['num_log_in_during_t_30'] - df[ 'num_log_in_during_t_90'] df['num_log_in_delta2_by60'] = df['num_log_in_during_t_60'] - df[ 'num_log_in'] #### #log_in_ratio #### #delta1 df['log_in_ratio_delta1_by7'] = df['log_in_ratio_during_t_7'] - df[ 'log_in_ratio_during_t_14'] df['log_in_ratio_delta1_by14'] = df['log_in_ratio_during_t_14'] - df[ 'log_in_ratio_during_t_30'] df['log_in_ratio_delta1_by30'] = df['log_in_ratio_during_t_30'] - df[ 'log_in_ratio_during_t_60'] df['log_in_ratio_delta1_by60'] = df['log_in_ratio_during_t_60'] - df[ 'log_in_ratio_during_t_90'] df['log_in_ratio_delta1_by90'] = df['log_in_ratio_during_t_90'] - df[ 'log_in_ratio'] #delta2 df['log_in_ratio_delta2_by7'] = df['log_in_ratio_during_t_7'] - df[ 'log_in_ratio_during_t_30'] df['log_in_ratio_delta2_by14'] = df['log_in_ratio_during_t_14'] - df[ 'log_in_ratio_during_t_60'] df['log_in_ratio_delta2_by30'] = df['log_in_ratio_during_t_30'] - df[ 'log_in_ratio_during_t_90'] df['log_in_ratio_delta2_by60'] = df['log_in_ratio_during_t_60'] - df[ 'log_in_ratio'] print('{}.shape:{}\n'.format(name, df.shape)) #============================================================================== print('reduce 
memory') #============================================================================== utils.reduce_memory(df, ix_end) #============================================================================== print('output') #============================================================================== utils.to_multiple_csv(df, '../feature/{}/all'.format(name), 20) # 存一個all_sampling_for_developing
def make(T):
    """Build num_25/num_100 listening-ratio features for window T.

    T = -1  -> folder = 'test'
    T = 0   -> folder = 'trainW-0'
    Writes ../feature/<folder>/num_100_ratio_during_t_7.csv.
    """
    # for speed, only considering two extremes, num_25 and num_100
    input_col = ['msno','date','num_25','num_100']
    if T == -1:
        folder = 'test'
        #label — this `train` holds the test users
        train = pd.read_csv('../input/sample_submission_v2.csv')[['msno']]
        #file1
        user_logs = utils.read_multiple_csv('../feature/{}/compressed_user_logs'.format(folder), input_col, parse_dates = ['date'])
        # append the v2 logs that cover the test period
        user_logs = pd.concat([user_logs,pd.read_csv('../input/user_logs_v2.csv',parse_dates = ['date'])[input_col]], ignore_index = True)
        #user_logs.sort_values(by = ['msno', 'date'],inplace = True)
    else:
        folder = 'trainW-'+ str(T)
        #label
        train = pd.read_csv('../input/preprocessed_data/trainW-{0}.csv'.format(T))[['msno']]
        #file1
        user_logs = utils.read_multiple_csv('../feature/{}/compressed_user_logs'.format(folder), input_col, parse_dates = ['date'] )
    ##################################################
    # basic procedure
    ##################################################
    # get_ratio: row-normalise so num_25/num_100 become shares of their row sum
    user_logs.loc[:,"num_25":"num_100"] = user_logs.loc[:,"num_25":"num_100"].div(user_logs.loc[:,"num_25":"num_100"].sum(axis=1), axis=0)
    user_logs.rename(columns = {'num_25':'num_25_ratio', 'num_100':'num_100_ratio'}, inplace =True)
    # 0/0 rows become NaN above, so drop them
    user_logs.dropna(inplace = True)
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(user_logs)
    df = pd.merge(train,user_logs, on = 'msno', how = 'left')
    del user_logs
    gc.collect()
    print ('shape of df:', df.shape)
    # (removed commented-out "All history" section: per-msno
    #  mean/min/max/median/std of num_25_ratio and num_100_ratio, written to
    #  num_25_ratio.csv / num_100_ratio.csv — disabled for speed)
    ##################################################
    # n = 7
    ##################################################
    # keep only each user's log rows within the last 7 days of window T
    df_ = df.groupby('msno').apply(within_n_days,T, n = 7).reset_index(drop = True)
    # (removed commented-out core1: num_25_ratio_during_t_7 aggregates)
    #core2:num_100_ratio
    print ('core2')
    tbl = df_.groupby('msno').num_100_ratio.mean().to_frame()
    # NOTE(original author): this column name was a typo — it was originally
    # num_repeated_songs_during_t_7 — but there was no time to rerun
    tbl.columns = ['num_100_ratio_during_t_7-mean']
    tbl['num_100_ratio_during_t_7-min'] = df_.groupby('msno').num_100_ratio.min()
    tbl['num_100_ratio_during_t_7-max'] = df_.groupby('msno').num_100_ratio.max()
    tbl['num_100_ratio_during_t_7-median'] = df_.groupby('msno').num_100_ratio.median()
    tbl['num_100_ratio_during_t_7-std'] = df_.groupby('msno').num_100_ratio.std()
    tbl.reset_index(inplace = True)
    del df_
    gc.collect()
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(tbl)
    # write
    tbl.to_csv('../feature/{}/num_100_ratio_during_t_7.csv'.format(folder), index = False)
    del tbl
    gc.collect()
def _in_a_row_block(df, T, folder, n):
    """Count per-user consecutive-day listening within the last `n` days of
    window T, derive the ratio, and write the table to csv.

    `df` must already carry a per-user `date_diff` column (days between
    consecutive log-ins).
    """
    df_ = df.groupby('msno').apply(within_n_days, T,
                                   n=n).reset_index(drop=True)
    count_col = 'listen_music_in_a_row_count_during_t_{}'.format(n)
    ratio_col = 'listen_music_in_a_row_ratio_during_t_{}'.format(n)
    #core
    # date_diff == 1 means the user also listened the day before ("in a row")
    tbl = df_[df_.date_diff == 1].groupby('msno').date_diff.size().to_frame()
    tbl.columns = [count_col]
    tbl[ratio_col] = tbl[count_col] / df_.groupby('msno').date_diff.apply(len)
    tbl.reset_index(inplace=True)
    del df_
    gc.collect()
    #==========================================================================
    print('reduce memory')
    #==========================================================================
    utils.reduce_memory(tbl)
    # write
    tbl.to_csv(
        '../feature/{}/listen_music_in_a_row_count_during_t_{}.csv'.format(
            folder, n),
        index=False)
    del tbl
    gc.collect()


def make(T):
    """Build listen-in-a-row count/ratio features for every window size.

    T = 0 -> folder = 'trainW-0'; T = -1 -> folder = 'test'.
    Writes one csv per n in {7, 14, 30, 60, 90}.
    """
    input_col = ['msno', 'date']
    if T == -1:
        folder = 'test'
        #label — this `train` holds the test users
        train = pd.read_csv('../input/sample_submission_v2.csv')[['msno']]
        #file1
        user_logs = utils.read_multiple_csv(
            '../feature/{}/compressed_user_logs'.format(folder),
            input_col,
            parse_dates=['date'])
        # append the v2 logs that cover the test period
        user_logs = pd.concat([
            user_logs,
            pd.read_csv('../input/user_logs_v2.csv',
                        parse_dates=['date'])[input_col]
        ], ignore_index=True)
    else:
        folder = 'trainW-' + str(T)
        #label
        train = pd.read_csv(
            '../input/preprocessed_data/trainW-{0}.csv'.format(T))[['msno']]
        #file1
        user_logs = utils.read_multiple_csv(
            '../feature/{}/compressed_user_logs'.format(folder),
            input_col,
            parse_dates=['date'])
    ##################################################
    # basic procedure
    ##################################################
    #==========================================================================
    print('reduce memory')
    #==========================================================================
    utils.reduce_memory(user_logs)
    df = pd.merge(train, user_logs, on='msno', how='left')
    del user_logs
    gc.collect()
    # have to sort before computing the consecutive-day gap below
    df.sort_values(by=['msno', 'date'], inplace=True)
    # days between consecutive rows (vectorized; original used a Python
    # list comprehension over Timedelta objects with identical semantics)
    df['date_diff'] = (df.date - df['date'].shift(1)).dt.days
    print('shape of df:', df.shape)
    # each user's first row has no previous date, so drop it
    df = df.groupby('msno').apply(drop_first_columns)
    df.reset_index(drop=True, inplace=True)
    # one pass per window size — the original repeated this section verbatim
    # for n = 7, 14, 30, 60, 90
    for n in (7, 14, 30, 60, 90):
        _in_a_row_block(df, T, folder, n)
################################################## # Load members and ################################################## demographics = pd.read_csv('../input/preprocessed_data/demographics.csv') ################################################## # Convert string to datetime format ################################################## demographics[ 'registration_init_time'] = demographics.registration_init_time.apply( lambda x: datetime.strptime(str(x), '%Y%m%d')) #============================================================================== print('reduce memory') #============================================================================== utils.reduce_memory(demographics) #============================================================================== # def #============================================================================== def make(T): """ T = 0 folder = 'trainW-0' """ if T == -1: folder = 'test' train = pd.read_csv( '../input/sample_submission_v2.csv') # 此train代表的是test的user
def make(T):
    """Write per-user login-frequency features for one training window.

    For each user (msno): the number of log rows (`num_log_in`) and its ratio
    to the value produced by the module-level helper `listening_longevity`.
    The same pair is then recomputed restricted to recent activity windows
    (n = 7, 14, 30, 60, 90 via the module-level helper `within_n_days`).
    Each table is written to ../feature/<folder>/num_log_in*.csv.

    T = -1 selects the test set (folder 'test'); otherwise folder is
    'trainW-<T>'.
    """
    input_col = ['msno', 'date']
    #output_col = ['msno','num_log_in','listening_longevity','log_in_ratio']
    if T == -1:
        folder = 'test'
        # label: this `train` holds the test-set users
        train = pd.read_csv('../input/sample_submission_v2.csv')[['msno']]
        user_logs = utils.read_multiple_csv(
            '../feature/{}/compressed_user_logs'.format(folder),
            input_col,
            parse_dates=['date'])
        # user_logs_v2.csv carries the extra (March) logs used for the test window
        user_logs = pd.concat([
            user_logs,
            pd.read_csv('../input/user_logs_v2.csv',
                        parse_dates=['date'])[input_col]
        ],
                              ignore_index=True)
        #user_logs.sort_values(by = ['msno', 'date'],inplace = True)
    else:
        folder = 'trainW-' + str(T)
        # label
        train = pd.read_csv(
            '../input/preprocessed_data/trainW-{0}.csv'.format(T))[['msno']]
        user_logs = utils.read_multiple_csv(
            '../feature/{}/compressed_user_logs'.format(folder),
            input_col,
            parse_dates=['date'])
    ##################################################
    # basic procedure
    ##################################################
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(user_logs)
    # left-join keeps every labelled user, even those without any log rows
    df = pd.merge(train, user_logs, on='msno', how='left')
    del user_logs
    gc.collect()
    print('shape of df:', df.shape)
    ##################################################
    # All history
    ##################################################
    # count of log rows per user
    tbl = df.groupby('msno').date.size().to_frame()
    tbl.columns = ['num_log_in']
    tbl.reset_index(inplace=True)
    # for computing log_in_ratio
    user_logs_copy = df.groupby('msno').apply(listening_longevity)
    user_logs_copy.drop_duplicates('msno', inplace=True)
    tbl = pd.merge(tbl, user_logs_copy, on='msno', how='left')
    del user_logs_copy
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(tbl)
    gc.collect()
    # log_in_ratio = num_log_in / listening_longevity (helper defined elsewhere)
    tbl['log_in_ratio'] = tbl.num_log_in / tbl.listening_longevity
    tbl.drop(['date', 'listening_longevity'], axis=1, inplace=True)
    # write
    tbl.to_csv('../feature/{}/num_log_in.csv'.format(folder), index=False)
    del tbl
    gc.collect()
    ##################################################
    # n = 7
    ##################################################
    df_ = df.groupby('msno').apply(within_n_days, T,
                                   n=7).reset_index(drop=True)
    tbl = df_.groupby('msno').date.size().to_frame()
    tbl.columns = ['num_log_in_during_t_7']
    tbl.reset_index(inplace=True)
    # for computing log_in_ratio
    user_logs_copy = df_.groupby('msno').apply(listening_longevity)
    user_logs_copy.drop_duplicates('msno', inplace=True)
    tbl = pd.merge(tbl, user_logs_copy, on='msno', how='left')
    del user_logs_copy
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(tbl)
    gc.collect()
    # log_in_ratio
    tbl['log_in_ratio_during_t_7'] = tbl.num_log_in_during_t_7 / tbl.listening_longevity
    tbl.drop(['date', 'listening_longevity'], axis=1, inplace=True)
    # write
    tbl.to_csv('../feature/{}/num_log_in_during_t_7.csv'.format(folder),
               index=False)
    del tbl
    gc.collect()
    ##################################################
    # n = 14
    ##################################################
    # NOTE(review): unlike the all-history and n=7 sections, the n>=14
    # sections do NOT call utils.reduce_memory(tbl) before writing —
    # presumably an oversight; confirm whether output dtypes are expected
    # to match across the CSVs before changing it.
    df_ = df.groupby('msno').apply(within_n_days, T,
                                   n=14).reset_index(drop=True)
    tbl = df_.groupby('msno').date.size().to_frame()
    tbl.columns = ['num_log_in_during_t_14']
    tbl.reset_index(inplace=True)
    # for computing log_in_ratio
    user_logs_copy = df_.groupby('msno').apply(listening_longevity)
    user_logs_copy.drop_duplicates('msno', inplace=True)
    tbl = pd.merge(tbl, user_logs_copy, on='msno', how='left')
    del user_logs_copy
    gc.collect()
    # log_in_ratio
    tbl['log_in_ratio_during_t_14'] = tbl.num_log_in_during_t_14 / tbl.listening_longevity
    tbl.drop(['date', 'listening_longevity'], axis=1, inplace=True)
    # write
    tbl.to_csv('../feature/{}/num_log_in_during_t_14.csv'.format(folder),
               index=False)
    del tbl
    gc.collect()
    ##################################################
    # n = 30
    ##################################################
    df_ = df.groupby('msno').apply(within_n_days, T,
                                   n=30).reset_index(drop=True)
    tbl = df_.groupby('msno').date.size().to_frame()
    tbl.columns = ['num_log_in_during_t_30']
    tbl.reset_index(inplace=True)
    # for computing log_in_ratio
    user_logs_copy = df_.groupby('msno').apply(listening_longevity)
    user_logs_copy.drop_duplicates('msno', inplace=True)
    tbl = pd.merge(tbl, user_logs_copy, on='msno', how='left')
    del user_logs_copy
    gc.collect()
    # log_in_ratio
    tbl['log_in_ratio_during_t_30'] = tbl.num_log_in_during_t_30 / tbl.listening_longevity
    tbl.drop(['date', 'listening_longevity'], axis=1, inplace=True)
    # write
    tbl.to_csv('../feature/{}/num_log_in_during_t_30.csv'.format(folder),
               index=False)
    del tbl
    gc.collect()
    ##################################################
    # n = 60
    ##################################################
    df_ = df.groupby('msno').apply(within_n_days, T,
                                   n=60).reset_index(drop=True)
    tbl = df_.groupby('msno').date.size().to_frame()
    tbl.columns = ['num_log_in_during_t_60']
    tbl.reset_index(inplace=True)
    # for computing log_in_ratio
    user_logs_copy = df_.groupby('msno').apply(listening_longevity)
    user_logs_copy.drop_duplicates('msno', inplace=True)
    tbl = pd.merge(tbl, user_logs_copy, on='msno', how='left')
    del user_logs_copy
    gc.collect()
    # log_in_ratio
    tbl['log_in_ratio_during_t_60'] = tbl.num_log_in_during_t_60 / tbl.listening_longevity
    tbl.drop(['date', 'listening_longevity'], axis=1, inplace=True)
    # write
    tbl.to_csv('../feature/{}/num_log_in_during_t_60.csv'.format(folder),
               index=False)
    del tbl
    gc.collect()
    ##################################################
    # n = 90
    ##################################################
    df_ = df.groupby('msno').apply(within_n_days, T,
                                   n=90).reset_index(drop=True)
    tbl = df_.groupby('msno').date.size().to_frame()
    tbl.columns = ['num_log_in_during_t_90']
    tbl.reset_index(inplace=True)
    # for computing log_in_ratio
    user_logs_copy = df_.groupby('msno').apply(listening_longevity)
    user_logs_copy.drop_duplicates('msno', inplace=True)
    tbl = pd.merge(tbl, user_logs_copy, on='msno', how='left')
    del user_logs_copy
    gc.collect()
    # log_in_ratio
    tbl['log_in_ratio_during_t_90'] = tbl.num_log_in_during_t_90 / tbl.listening_longevity
    tbl.drop(['date', 'listening_longevity'], axis=1, inplace=True)
    # write
    tbl.to_csv('../feature/{}/num_log_in_during_t_90.csv'.format(folder),
               index=False)
    del tbl
    gc.collect()
def make(T):
    """Write per-user summary statistics of the daily listening counts.

    For every user (``msno``) and every log column (num_25 / num_50 / num_75 /
    num_985 / num_100 / num_unq) this computes mean, min, max, median and std
    over three scopes: the whole history, the last 5 log rows (``_n5``) and
    the last log row only (``_n1``, via the module-level helper ``near``).
    One CSV per column and scope is written under ../feature/<folder>/.

    Parameters
    ----------
    T : int
        Training-window index; ``-1`` selects the test set (folder 'test'),
        otherwise folder is 'trainW-<T>'.
    """
    if T == -1:
        folder = 'test'
        user_logs = utils.read_multiple_csv(
            '../feature/{}/compressed_user_logs'.format(folder), input_col)
        # user_logs_v2.csv: includes the March data needed for the test set.
        user_logs = pd.concat(
            [user_logs, pd.read_csv('../input/user_logs_v2.csv')],
            ignore_index=True)
        user_logs.sort_values(by=['msno', 'date'], inplace=True)
        # this `train` holds the test-set users
        train = pd.read_csv('../input/sample_submission_v2.csv')
    else:
        folder = 'trainW-' + str(T)
        user_logs = utils.read_multiple_csv(
            '../feature/{}/compressed_user_logs'.format(folder), input_col)
        # we do not need is_churn
        train = pd.read_csv(
            '../input/preprocessed_data/trainW-{0}.csv'.format(T))[['msno']]
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(user_logs)
    utils.reduce_memory(train)
    # left-join keeps every labelled user, even those without any log rows
    df = pd.merge(train, user_logs, on='msno', how='left')
    del user_logs
    gc.collect()
    ##################################################
    # All history
    ##################################################
    _write_listen_stats(df, '', folder)
    gc.collect()
    ##################################################
    # near 5: last 5 log rows per user
    ##################################################
    df_ = df.groupby('msno').apply(near, 5).reset_index(drop=True)
    _write_listen_stats(df_, '_n5', folder)
    del df_
    gc.collect()
    ##################################################
    # only one previous order: single most recent log row per user
    ##################################################
    df_ = df.groupby('msno').apply(near, 1).reset_index(drop=True)
    _write_listen_stats(df_, '_n1', folder)
    del df_
    gc.collect()


def _write_listen_stats(df, suffix, folder):
    """For each log column, write per-msno mean/min/max/median/std to CSV.

    Replaces the original 18 copy-pasted "core" sections; output column names
    and file names are byte-identical to the hand-written version (e.g. column
    'num_25-mean_n5' in file '../feature/<folder>/num_25_n5.csv').  The
    groupby is computed once per column instead of once per statistic.
    """
    for c in ['num_25', 'num_50', 'num_75', 'num_985', 'num_100', 'num_unq']:
        gr = df.groupby('msno')[c]
        tbl = gr.mean().to_frame()
        tbl.columns = ['{}-mean{}'.format(c, suffix)]
        # Series assignments align on the shared msno index.
        tbl['{}-min{}'.format(c, suffix)] = gr.min()
        tbl['{}-max{}'.format(c, suffix)] = gr.max()
        tbl['{}-median{}'.format(c, suffix)] = gr.median()
        tbl['{}-std{}'.format(c, suffix)] = gr.std()
        tbl.reset_index(inplace=True)
        #==============================================================================
        print('reduce memory')
        #==============================================================================
        utils.reduce_memory(tbl)
        # write
        tbl.to_csv('../feature/{}/{}{}.csv'.format(folder, c, suffix),
                   index=False)
        del tbl
################################################## input_col = ['msno','transaction_date','is_membership_duration_equal_to_plan_days', 'is_membership_duration_longer_than_plan_days','is_early_expiration'] membership_loyalty = utils.read_multiple_csv('../input/preprocessed_data/transactions_date_base',input_col) # 20,000,000 #membership_loyalty = membership_loyalty.head(n = 500) ################################################## # Convert string to datetime format ################################################## membership_loyalty['transaction_date'] = membership_loyalty.transaction_date.apply(lambda x: datetime.strptime(x, '%Y-%m-%d')) #============================================================================== print('reduce memory') #============================================================================== utils.reduce_memory(membership_loyalty) def near(x, keep = 5): return x.tail(keep) #============================================================================== # def #============================================================================== def make(T): """ T = 0 folder = 'trainW-0' """ if T ==-1: folder = 'test'
################################################## # Load ################################################## input_col = ['msno', 'transaction_date', 'discount', 'is_discount', 'amt_per_day', 'cp_value'] transactions_price_plan_days = utils.read_multiple_csv('../input/preprocessed_data/transaction_price_and_play_days_base') # 20,000,000 #transactions_price_plan_days = transactions_price_plan_days.head( n = 1000) ################################################## # Convert string to datetime format ################################################## transactions_price_plan_days['transaction_date'] = transactions_price_plan_days.transaction_date.apply(lambda x: datetime.strptime(str(x), '%Y-%m-%d')) #============================================================================== print('reduce memory') #============================================================================== utils.reduce_memory(transactions_price_plan_days) def near(x, keep = 5): return x.tail(keep) #============================================================================== # def #============================================================================== def make(T): """ T = 0 folder = 'trainW-0' """
def make(T):
    """Write membership-loyalty features for one training window.

    For every user (``msno``) this sums (``_cnt``) and averages (``_ratio``)
    three per-transaction flags (is_membership_duration_equal_to_plan_days,
    is_membership_duration_longer_than_plan_days, is_early_expiration) over
    the whole visible history, the last 5 transactions (``_n5``) and the last
    transaction only (``_n1``), writing regular_membership*.csv under
    ../feature/<folder>/.

    Parameters
    ----------
    T : int
        Training-window index; ``-1`` selects the test set.  Only
        transactions dated strictly before the window's cutoff are used.
    """
    if T == -1:
        folder = 'test'
        # this `train` holds the test-set users
        train = pd.read_csv('../input/sample_submission_v2.csv')
    else:
        folder = 'trainW-' + str(T)
        train = pd.read_csv(
            '../input/preprocessed_data/trainW-{0}.csv'.format(T))[[
                'msno', 'is_churn'
            ]]
    # History cutoff per window (replaces four copy-pasted if/elif merges
    # that differed only in this date).  An unsupported T raises KeyError
    # here, where the original raised NameError further down.
    cutoff = {
        0: '2017-03-01',   # w = 0: history before March 2017
        1: '2017-02-01',   # w = 1: history before February 2017
        2: '2017-01-01',   # w = 2: history before January 2017
        -1: '2017-04-01',  # test:  history before April 2017
    }[T]
    df = pd.merge(
        train,
        membership_loyalty[membership_loyalty.transaction_date <
                           datetime.strptime(cutoff, '%Y-%m-%d')],
        on='msno',
        how='left')
    del train
    ##################################################
    # All history
    ##################################################
    _write_loyalty(df, '', folder)
    ##################################################
    # near 5: last 5 transactions per user
    ##################################################
    df_ = df.groupby('msno').apply(near, 5).reset_index(drop=True)
    _write_loyalty(df_, '_n5', folder)
    del df_
    ##################################################
    # only one previous order
    ##################################################
    df_ = df.groupby('msno').apply(near, 1).reset_index(drop=True)
    _write_loyalty(df_, '_n1', folder)
    del df_
    gc.collect()


def _write_loyalty(df, suffix, folder):
    """Per-msno sum (_cnt) and mean (_ratio) of each loyalty flag → one CSV.

    Column names and file names are identical to the original copy-pasted
    sections (e.g. 'is_early_expiration_cnt_n5' in regular_membership_n5.csv).
    """
    flags = [
        'is_membership_duration_equal_to_plan_days',
        'is_membership_duration_longer_than_plan_days',
        'is_early_expiration',
    ]
    gr = df.groupby('msno')
    tbl = gr[flags[0]].sum().to_frame()
    tbl.columns = [flags[0] + '_cnt' + suffix]
    tbl[flags[0] + '_ratio' + suffix] = gr[flags[0]].mean()
    for f in flags[1:]:
        # Series assignments align on the shared msno index.
        tbl[f + '_cnt' + suffix] = gr[f].sum()
        tbl[f + '_ratio' + suffix] = gr[f].mean()
    tbl.reset_index(inplace=True)
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(tbl)
    # write
    tbl.to_csv(
        '../feature/{}/regular_membership{}.csv'.format(folder, suffix),
        index=False)
    del tbl
    gc.collect()
def make(T):
    """Build per-user auto-renew features for one window and write them to CSV.

    Parameters
    ----------
    T : int
        Window index. ``T == -1`` builds features for the test users
        (folder ``'test'``); ``T in {0, 1, 2}`` builds them for training
        window ``'trainW-<T>'``. Any other value leaves ``df`` unassigned
        and the function raises.

    Uses the module-level ``transactions`` table plus the helpers
    ``make_order_number`` and ``near`` defined elsewhere in this file.
    Writes ``../feature/<folder>/is_auto_renew.csv``, ``..._n5.csv`` and
    ``..._n1.csv``; returns None.

    Example: ``T = 0`` -> ``folder = 'trainW-0'``.
    """
    if T == -1:
        folder = 'test'
        # here `train` actually holds the test-set users
        train = pd.read_csv(
            '../input/sample_submission_v2.csv')
    else:
        folder = 'trainW-' + str(T)
        train = pd.read_csv(
            '../input/preprocessed_data/trainW-{0}.csv'.format(T))[['msno']]

    # the following style is silly, but it's all for saving memory
    if T == 0:
        # W = 0: use transactions before 2017-03-01 as history
        df = pd.merge(
            train,
            transactions[(transactions.transaction_date < datetime.strptime(
                '2017-03-01', '%Y-%m-%d'))],
            on=['msno'],
            how='left')
        del train
    elif T == 1:
        # W = 1: use transactions before 2017-02-01 as history
        df = pd.merge(
            train,
            transactions[(transactions.transaction_date < datetime.strptime(
                '2017-02-01', '%Y-%m-%d'))],
            on=['msno'],
            how='left')
        del train
    elif T == 2:
        # W = 2: use transactions before 2017-01-01 as history
        df = pd.merge(
            train,
            transactions[(transactions.transaction_date < datetime.strptime(
                '2017-01-01', '%Y-%m-%d'))],
            on=['msno'],
            how='left')
        del train
    elif T == -1:
        # W = -1 (test): use transactions before 2017-04-01 as history
        # NOTE(review): on='msno' here vs on=['msno'] above — equivalent for merge
        df = pd.merge(
            train,
            transactions[(transactions.transaction_date < datetime.strptime(
                '2017-04-01', '%Y-%m-%d'))],
            on='msno',
            how='left')
        del train

    ##################################################
    # All history
    ##################################################
    #df = df.dropna()
    # make_order_number presumably adds a per-user transaction sequence number
    # as column `order_number` — confirm against its definition
    df_ = df.groupby('msno').apply(make_order_number)

    # count: number of transactions per (user, is_auto_renew) value
    cnt = df_.groupby(['msno', 'is_auto_renew']).size()
    cnt.name = 'cnt'
    cnt = cnt.reset_index()

    # chance: orders "available" since the first order with this
    # is_auto_renew value = onb_max - onb_min + 1
    user_onb_max = df_.groupby('msno').order_number.max().reset_index()
    user_onb_max.columns = ['msno', 'onb_max']
    user_is_auto_renew_min = df_.groupby(['msno', 'is_auto_renew'
                                          ]).order_number.min().reset_index()
    user_is_auto_renew_min.columns = ['msno', 'is_auto_renew', 'onb_min']
    chance = pd.merge(user_is_auto_renew_min, user_onb_max, on='msno', how='left')
    chance['is_auto_renew_chance'] = chance.onb_max - chance.onb_min + 1
    tbl = pd.merge(cnt, chance, on=['msno', 'is_auto_renew'], how='left')
    tbl['auto_renew_ratio_by_chance'] = tbl.cnt / tbl.is_auto_renew_chance

    # total_count: overall auto-renew count / ratio per user
    tbl_ = df_.groupby('msno').is_auto_renew.sum().to_frame()
    tbl_.columns = ['auto_renew_total_count']
    tbl_['auto_renew_total_count_ratio'] = df_.groupby(
        'msno').is_auto_renew.mean()
    tbl_.reset_index(inplace=True)
    tbl = pd.merge(tbl, tbl_, on='msno')
    col = [
        'msno', 'is_auto_renew_chance', 'auto_renew_ratio_by_chance',
        'auto_renew_total_count', 'auto_renew_total_count_ratio'
    ]
    tbl = tbl[col]
    # keep one row per msno — per original comment, only the
    # is_auto_renew == 0 row is wanted; presumably that row sorts first
    # after the merges. TODO confirm the row ordering guarantees this.
    tbl.drop_duplicates('msno', keep='first', inplace=True)

    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(tbl)

    # write: full-history variant
    tbl.to_csv('../feature/{}/is_auto_renew.csv'.format(folder), index=False)
    del tbl
    del tbl_
    del df_
    gc.collect()

    ##################################################
    # near 5
    ##################################################
    # same pipeline restricted via near(..., 5) — presumably each user's
    # 5 most recent orders; feature names carry the _n5 suffix
    df_ = df.groupby('msno').apply(near, 5).reset_index(drop=True)
    df_ = df_.groupby('msno').apply(make_order_number)

    # count
    cnt = df_.groupby(['msno', 'is_auto_renew']).size()
    cnt.name = 'cnt'
    cnt = cnt.reset_index()

    # chance
    user_onb_max = df_.groupby('msno').order_number.max().reset_index()
    user_onb_max.columns = ['msno', 'onb_max']
    # NOTE(review): variable is named user_is_cancel_min but it groups on
    # is_auto_renew, mirroring user_is_auto_renew_min above
    user_is_cancel_min = df_.groupby(['msno', 'is_auto_renew'
                                      ]).order_number.min().reset_index()
    user_is_cancel_min.columns = ['msno', 'is_auto_renew', 'onb_min']
    chance = pd.merge(user_is_cancel_min, user_onb_max, on='msno', how='left')
    chance['is_auto_renew_chance_n5'] = chance.onb_max - chance.onb_min + 1
    tbl = pd.merge(cnt, chance, on=['msno', 'is_auto_renew'], how='left')
    tbl['auto_renew_ratio_by_chance_n5'] = tbl.cnt / tbl.is_auto_renew_chance_n5

    # total_count
    # NOTE(review): the _n5 column names below ('auto_renew_chance_n5_total_
    # count_n5') don't match the full-history/_n1 pattern — kept as-is since
    # downstream code may load them by these exact names
    tbl_ = df_.groupby('msno').is_auto_renew.sum().to_frame()
    tbl_.columns = ['auto_renew_chance_n5_total_count_n5']
    tbl_['auto_renew_chance_n5_total_count_ratio_n5'] = df_.groupby(
        'msno').is_auto_renew.mean()
    tbl_.reset_index(inplace=True)
    tbl = pd.merge(tbl, tbl_, on='msno')
    col = [
        'msno', 'is_auto_renew_chance_n5', 'auto_renew_ratio_by_chance_n5',
        'auto_renew_chance_n5_total_count_n5',
        'auto_renew_chance_n5_total_count_ratio_n5'
    ]
    tbl = tbl[col]
    tbl.drop_duplicates('msno', keep='first', inplace=True)

    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(tbl)

    # write: near-5 variant
    tbl.to_csv('../feature/{}/is_auto_renew_n5.csv'.format(folder), index=False)
    del tbl
    del tbl_
    del df_
    gc.collect()

    ##################################################
    # only one previous order
    ##################################################
    # same pipeline restricted via near(..., 1) — the single most recent order
    df_ = df.groupby('msno').apply(near, 1).reset_index(drop=True)
    df_ = df_.groupby('msno').apply(make_order_number)

    # count
    cnt = df_.groupby(['msno', 'is_auto_renew']).size()
    cnt.name = 'cnt'
    cnt = cnt.reset_index()

    # chance
    user_onb_max = df_.groupby('msno').order_number.max().reset_index()
    user_onb_max.columns = ['msno', 'onb_max']
    user_is_cancel_min = df_.groupby(['msno', 'is_auto_renew'
                                      ]).order_number.min().reset_index()
    user_is_cancel_min.columns = ['msno', 'is_auto_renew', 'onb_min']
    chance = pd.merge(user_is_cancel_min, user_onb_max, on='msno', how='left')
    chance['is_auto_renew_chance_n1'] = chance.onb_max - chance.onb_min + 1
    tbl = pd.merge(cnt, chance, on=['msno', 'is_auto_renew'], how='left')
    tbl['auto_renew_chance_n1_ratio_by_chance_n1'] = tbl.cnt / tbl.is_auto_renew_chance_n1

    # total_count
    tbl_ = df_.groupby('msno').is_auto_renew.sum().to_frame()
    tbl_.columns = ['auto_renew_total_count_n1']
    tbl_['auto_renew_total_count_ratio_n1'] = df_.groupby(
        'msno').is_auto_renew.mean()
    tbl_.reset_index(inplace=True)
    tbl = pd.merge(tbl, tbl_, on='msno')
    col = [
        'msno', 'is_auto_renew_chance_n1',
        'auto_renew_chance_n1_ratio_by_chance_n1', 'auto_renew_total_count_n1',
        'auto_renew_total_count_ratio_n1'
    ]
    tbl = tbl[col]
    tbl.drop_duplicates('msno', keep='first', inplace=True)

    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(tbl)

    # write: near-1 variant (note: no `del tbl` here, unlike the sections above)
    tbl.to_csv('../feature/{}/is_auto_renew_n1.csv'.format(folder), index=False)
    gc.collect()
else: return 4 # 最容易流失的city_zone def registered_via_zone(x): if x in set([4,3]): return 1 # 最容易流失的registered_via elif x in set([7,2]): return 2 # 最不容易流失的registered_via else: return 3 members['bd_zone'] = members.bd.apply(bd_zone) members['city_zone'] = members.city.apply(city_zone) members['registered_via_zone'] = members.city.apply(registered_via_zone) #============================================================================== print('reduce memory') #============================================================================== utils.reduce_memory(members) gc.collect() # write path = '../input/preprocessed_data/demographics.csv' members.to_csv(path, index = False)