def make(T):
    """Build the "listening habit" features for one window and write them out.

    Reads the compressed user logs of the window selected by ``T``, adds
    ``total_secs_percentage``, ``num_of_time_the_user_has_logged_in`` and
    ``num_of_time_the_user_has_logged_in_ratio``, drops the intermediate
    ``total_secs`` / ``order_number`` columns and writes the frame to
    ``../feature/<folder>/user_logs_listening_habit`` in 8 splits.

    Relies on the module-level names ``input_col``, ``utils``, ``pd``, ``gc``
    and ``make_order_number``.

    Parameters
    ----------
    T : int
        Window id.  ``-1`` selects the ``test`` folder and additionally
        appends ``../input/user_logs_v2.csv``; any other value selects
        ``trainW-<T>`` (e.g. ``T = 0`` -> ``folder = 'trainW-0'``).
    """
    is_test = T == -1
    folder = 'test' if is_test else 'trainW-' + str(T)

    user_logs = utils.read_multiple_csv(
        '../feature/{}/compressed_user_logs'.format(folder), input_col)
    if is_test:
        # user_logs_v2.csv only includes the data of March; it is for the
        # testing set.
        extra_logs = pd.read_csv('../input/user_logs_v2.csv',
                                 parse_dates=['date'])[input_col]
        user_logs = pd.concat([user_logs, extra_logs], ignore_index=True)
        # The concatenated frame is deliberately NOT sorted by
        # ['msno', 'date']: per the original author, concat + sort_values
        # makes memory spike and slows the run down.
        # NOTE(review): the cumulative sum below depends on the row order
        # inside each msno group - confirm the inputs are already
        # chronological.

    #==========================================================================
    print('reduce memory')
    #==========================================================================
    utils.reduce_memory(user_logs)
    gc.collect()
    print('shape1:', user_logs.shape)

    # core
    # Share of a day (24 * 60 * 60 seconds) covered by total_secs.
    # Vectorised; the explicit float64 cast keeps the values identical to the
    # previous per-row Python division regardless of the reduced dtype.
    user_logs['total_secs_percentage'] = (
        user_logs.total_secs.astype('float64') / (24 * 60 * 60))
    # Running sum of total_secs per user (the column name is historical).
    user_logs['num_of_time_the_user_has_logged_in'] = user_logs.groupby(
        'msno').total_secs.cumsum()
    user_logs.drop('total_secs', axis=1, inplace=True)
    # presumably make_order_number adds the `order_number` column used
    # below - defined elsewhere in this file, TODO confirm.
    user_logs = user_logs.groupby('msno').apply(make_order_number)
    user_logs['num_of_time_the_user_has_logged_in_ratio'] = (
        user_logs.num_of_time_the_user_has_logged_in / user_logs.order_number)
    user_logs.drop('order_number', axis=1, inplace=True)

    #==========================================================================
    print('reduce memory')
    #==========================================================================
    utils.reduce_memory(user_logs)
    print('shape2:', user_logs.shape)

    ##################################################
    # write
    ##################################################
    path = '../feature/{}/user_logs_listening_habit'.format(folder)
    gc.collect()
    utils.to_multiple_csv(user_logs, path, split_size=8)
    del user_logs
    gc.collect()
    print('{0} done'.format(T))
def make(T):
    """Build the "listening behavior" features for one window and write them out.

    From the ``num_25`` ... ``num_100`` / ``num_unq`` counters of the
    compressed user logs this derives ``num_completed_songs``,
    ``num_incompleted_songs``, ``completed_songs_ratio``, ``is_satisfied``
    and ``num_repeated_songs``, drops the raw counters and writes the frame to
    ``../feature/<folder>/user_logs_listening_behavior`` in 10 splits.

    Relies on the module-level names ``input_col``, ``utils``, ``pd`` and
    ``gc``.

    Parameters
    ----------
    T : int
        Window id.  ``-1`` selects the ``test`` folder and additionally
        appends ``../input/user_logs_v2.csv``; any other value selects
        ``trainW-<T>`` (e.g. ``T = 0`` -> ``folder = 'trainW-0'``).
    """
    is_test = T == -1
    folder = 'test' if is_test else 'trainW-' + str(T)

    user_logs = utils.read_multiple_csv(
        '../feature/{}/compressed_user_logs'.format(folder), input_col)
    if is_test:
        extra_logs = pd.read_csv('../input/user_logs_v2.csv',
                                 parse_dates=['date'])[input_col]
        user_logs = pd.concat([user_logs, extra_logs], ignore_index=True)

    #==========================================================================
    print('reduce memory')
    #==========================================================================
    utils.reduce_memory(user_logs)
    print('shape1:', user_logs.shape)
    gc.collect()

    # incompleted vs completed
    # NOTE(review): if utils.reduce_memory narrows the counters to small
    # integer dtypes these sums could overflow - confirm against utils.
    user_logs['num_completed_songs'] = user_logs.num_100 + user_logs.num_985
    user_logs['num_incompleted_songs'] = (
        user_logs.num_25 + user_logs.num_50 + user_logs.num_75)
    user_logs['completed_songs_ratio'] = user_logs.num_completed_songs / (
        user_logs.num_incompleted_songs + user_logs.num_completed_songs)
    # 1 when more than half of the plays were completed, else 0.  A NaN ratio
    # (0 / 0) compares False and therefore maps to 0, exactly as the former
    # per-row lambda did.
    user_logs['is_satisfied'] = (
        user_logs.completed_songs_ratio > 0.5).astype('int64')

    # num_repeated_songs
    user_logs['num_repeated_songs'] = (
        user_logs.num_100 + user_logs.num_985 + user_logs.num_75
    ) / user_logs.num_unq

    user_logs.drop(
        ['num_25', 'num_50', 'num_75', 'num_985', 'num_100', 'num_unq'],
        axis=1, inplace=True)

    #==========================================================================
    print('reduce memory')
    #==========================================================================
    utils.reduce_memory(user_logs)
    print('shape2:', user_logs.shape)
    gc.collect()

    ##################################################
    # write
    ##################################################
    path = '../feature/{}/user_logs_listening_behavior'.format(folder)
    gc.collect()
    utils.to_multiple_csv(user_logs, path, split_size=10)
    print('{0} done'.format(T))
def make(T):
    """Turn the play-count columns into per-row ratios and write them out.

    Every column from ``num_25`` through ``num_100`` (label slice, so it
    follows the column order given by ``input_col``) is divided by the row
    sum of those columns; the five counters are then renamed to
    ``*_ratio`` and the frame is written to
    ``../feature/<folder>/user_logs_listening_freq`` in 10 splits.

    Relies on the module-level names ``input_col``, ``utils``, ``pd`` and
    ``gc``.

    Parameters
    ----------
    T : int
        Window id.  ``-1`` selects the ``test`` folder and additionally
        appends ``../input/user_logs_v2.csv``; any other value selects
        ``trainW-<T>`` (e.g. ``T = 0`` -> ``folder = 'trainW-0'``).
    """
    is_test = T == -1
    folder = 'test' if is_test else 'trainW-' + str(T)

    user_logs = utils.read_multiple_csv(
        '../feature/{}/compressed_user_logs'.format(folder), input_col)
    if is_test:
        extra_logs = pd.read_csv('../input/user_logs_v2.csv',
                                 parse_dates=['date'])[input_col]
        user_logs = pd.concat([user_logs, extra_logs], ignore_index=True)

    #==========================================================================
    print('reduce memory')
    #==========================================================================
    utils.reduce_memory(user_logs)
    gc.collect()
    print('shape1:', user_logs.shape)

    # get_ratio
    # Take the label slice once instead of three times, then REPLACE the
    # columns (df[cols] = ...) rather than writing floats into the existing,
    # possibly integer, columns through .loc - newer pandas deprecates that
    # silent upcast.  Rows whose counters sum to 0 yield NaN (0 / 0).
    counts = user_logs.loc[:, "num_25":"num_100"]
    count_cols = list(counts.columns)
    user_logs[count_cols] = counts.div(counts.sum(axis=1), axis=0)
    del counts

    user_logs.rename(columns={'num_25': 'num_25_ratio',
                              'num_50': 'num_50_ratio',
                              'num_75': 'num_75_ratio',
                              'num_985': 'num_985_ratio',
                              'num_100': 'num_100_ratio'},
                     inplace=True)

    #==========================================================================
    print('reduce memory')
    #==========================================================================
    utils.reduce_memory(user_logs)
    gc.collect()

    ##################################################
    # write
    ##################################################
    path = '../feature/{}/user_logs_listening_freq'.format(folder)
    gc.collect()
    utils.to_multiple_csv(user_logs, path, split_size=10)
    print('{0} done'.format(T))
import pandas as pd  # needed for the vectorised date parsing below
import numpy as np
from tqdm import tqdm
import os
import utils  # written by author
from glob import glob
from datetime import datetime, timedelta
import multiprocessing as mp
import gc  # for automatic releasing memory
from collections import Counter

##################################################
# Load
##################################################
input_col = ['msno', 'transaction_date', 'discount', 'is_discount',
             'amt_per_day', 'cp_value']
# Original author's note: 20,000,000 (presumably the row count).
# NOTE(review): input_col is defined but not passed to the reader here, so
# every column is loaded - confirm whether that is intended.
transactions_price_plan_days = utils.read_multiple_csv(
    '../input/preprocessed_data/transaction_price_and_play_days_base')

##################################################
# Convert string to datetime format
##################################################
# Vectorised replacement for a per-row datetime.strptime(str(x), '%Y-%m-%d');
# the values are stringified first, as before, then parsed in one call.
transactions_price_plan_days['transaction_date'] = pd.to_datetime(
    transactions_price_plan_days.transaction_date.astype(str),
    format='%Y-%m-%d')

#==============================================================================
print('reduce memory')
#==============================================================================
utils.reduce_memory(transactions_price_plan_days)


def near(x, keep=5):
    """Return the last ``keep`` rows of ``x`` (via ``x.tail``)."""
    return x.tail(keep)
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import utils  # written by author
from glob import glob
from datetime import datetime, timedelta
import multiprocessing as mp
import gc  # for automatic releasing memory

##################################################
# Load
##################################################
input_col = ['msno', 'transaction_date',
             'is_membership_duration_equal_to_plan_days',
             'is_membership_duration_longer_than_plan_days',
             'is_early_expiration']
# Original author's note: 20,000,000 (presumably the row count).
membership_loyalty = utils.read_multiple_csv(
    '../input/preprocessed_data/transactions_date_base', input_col)

##################################################
# Convert string to datetime format
##################################################
# One vectorised parse instead of calling datetime.strptime once per row.
membership_loyalty['transaction_date'] = pd.to_datetime(
    membership_loyalty.transaction_date, format='%Y-%m-%d')

#==============================================================================
print('reduce memory')
#==============================================================================
utils.reduce_memory(membership_loyalty)


def near(x, keep=5):
    """Return the last ``keep`` rows of ``x`` (via ``x.tail``)."""
    return x.tail(keep)
import time
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import utils  # written by author
from glob import glob
from datetime import datetime
from collections import Counter
from collections import defaultdict

##################################################
# Load transaction
##################################################
transactions = utils.read_multiple_csv('../input/preprocessed_data/transactions')

##################################################
# payment_method_id
##################################################
# (payment_method_id, count) pairs, most frequent first.
payment_method_id_count = Counter(transactions['payment_method_id']).most_common()

# Hard-coded count thresholds taken over unchanged from the original script.
# NOTE(review): they look tuned to one specific snapshot of the transactions
# data - re-derive them if the input changes.
_TOP_3_MIN_COUNT = 1819465
_BETWEEN_3_TO_5_MIN_COUNT = 182139
_OUT_OF_10_MAX_COUNT = 247463

# core
payment_method = pd.DataFrame({
    'payment_method_id': [method_id for method_id, _ in payment_method_id_count],
    'count': [count for _, count in payment_method_id_count],
})[['payment_method_id', 'count']]

_counts = payment_method['count']
# Share of all transactions that used each payment method.
payment_method['method_ratio'] = _counts / _counts.sum()
# Vectorised 0/1 flags; same values as the former list comprehensions.
payment_method['top_3_payment_method'] = (
    _counts >= _TOP_3_MIN_COUNT).astype('int64')
payment_method['between_3_to_5_payment_method'] = (
    (_counts > _BETWEEN_3_TO_5_MIN_COUNT) & (_counts < _TOP_3_MIN_COUNT)
).astype('int64')
payment_method['out_of_10_payment_method'] = (
    _counts < _OUT_OF_10_MAX_COUNT).astype('int64')
# Statistics computed per user, in output-column order.
_GROUP_STATS = ('mean', 'min', 'max', 'median', 'std')

# Columns of the merged frame that get summarised, in write order
# (core1 ... core6 of the original script).
_DAYS_SINCE_COLUMNS = (
    'days_since_the_last_expiration',
    'days_since_the_last_subscription',
    'days_since_the_last_expiration-cumsum',
    'days_since_the_last_expiration_ratio',
    'days_since_the_last_subscription_ratio',
    'days_since_the_first_subscription',
)


def _write_group_stats(frame, column, suffix, folder):
    """Write the per-msno mean/min/max/median/std of ``column`` as one CSV.

    Output columns are named ``<column>-<stat><suffix>`` and the file is
    ``../feature/<folder>/<column><suffix>.csv``.
    """
    # A single groupby/agg pass replaces five separate groupby calls.
    tbl = frame.groupby('msno')[column].agg(list(_GROUP_STATS))
    tbl.columns = ['{}-{}{}'.format(column, stat, suffix)
                   for stat in _GROUP_STATS]
    tbl.reset_index(inplace=True)
    #==========================================================================
    print('reduce memory')
    #==========================================================================
    utils.reduce_memory(tbl)
    # write
    tbl.to_csv('../feature/{}/{}{}.csv'.format(folder, column, suffix),
               index=False)


def make(T):
    """Write per-user summary statistics of the ``days_since_*`` features.

    The users of window ``T`` are left-merged (on ``msno`` and ``w``) with
    ``../feature/<folder>/days_since_the_last_transactions``.  For each of
    six ``days_since_*`` columns the mean, min, max, median and std per
    ``msno`` are written to ``../feature/<folder>/<column><suffix>.csv``
    three times:

    * over the whole history (no suffix),
    * over the rows selected by ``near(group, 5)`` (suffix ``_n5``),
    * over the rows selected by ``near(group, 1)`` (suffix ``_n1``).

    Relies on the module-level names ``input_col``, ``near``, ``utils`` and
    ``pd``.

    Parameters
    ----------
    T : int
        Window id.  ``-1`` selects the ``test`` folder, with users taken
        from ``../input/sample_submission_v2.csv``; any other value selects
        ``trainW-<T>`` (e.g. ``T = 0`` -> ``folder = 'trainW-0'``).
    """
    if T == -1:
        folder = 'test'
        # Here `train` actually holds the users of the test set.
        train = pd.read_csv('../input/sample_submission_v2.csv')
        train['w'] = T
    else:
        folder = 'trainW-' + str(T)
        # is_churn is not needed here
        train = pd.read_csv(
            '../input/preprocessed_data/trainW-{0}.csv'.format(T))[['msno', 'w']]

    membership_loyalty = utils.read_multiple_csv(
        '../feature/{}/days_since_the_last_transactions'.format(folder),
        input_col)

    #==========================================================================
    print('reduce memory')
    #==========================================================================
    utils.reduce_memory(membership_loyalty)

    # merge
    df = pd.merge(train, membership_loyalty, on=['msno', 'w'], how='left')

    # (rows to keep per user, suffix): all history, near 5, and only the one
    # previous order.
    for keep, suffix in ((None, ''), (5, '_n5'), (1, '_n1')):
        if keep is None:
            subset = df
        else:
            subset = df.groupby('msno').apply(near, keep).reset_index(drop=True)
        for column in _DAYS_SINCE_COLUMNS:
            _write_group_stats(subset, column, suffix, folder)
        # Drop the reference to the per-window copy before building the next.
        del subset
''' import warnings warnings.filterwarnings("ignore") import pandas as pd import numpy as np import utils # made by author for efficiently dealing with data import datetime # datetime preprocessing import gc # reduce memory ################################## # loading data ################################## train = pd.read_csv('../Raw_Data/training-set.csv', header=None) test = pd.read_csv('../Raw_Data/testing-set.csv', header=None) query_log = utils.read_multiple_csv( '../Raw_Data/query_log') # 從2017年三月到2017年五月的可疑檔案紀錄 test.columns = ['id', 'label'] train.columns = ['id', 'label'] query_log.columns = ['id', 'CustomerID', 'QueryTS', 'ProductID'] # CustomerID: 使用者裝置ID,用來識別同一個裝置 # QueryTS: 該筆資料發生的時間 # ProductID: 該使用者裝置的產品代碼 def unixtimestamp_to_datetime(unixtimestamp): ''' Convert Unix timestamp to datetime in Python Parameters ---------- unixtimestamp : str
import time
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import utils  # written by author
from glob import glob
from datetime import datetime, timedelta
import multiprocessing as mp
import gc  # for automatic releasing memory

##################################################
# Load
##################################################
input_col = ['msno', 'transaction_date', 'is_auto_renew']
# Original author's note: 20,000,000 (presumably the row count).
transactions = utils.read_multiple_csv(
    '../input/preprocessed_data/transactions', input_col)

##################################################
# Convert string to datetime format
##################################################
# One vectorised parse instead of calling datetime.strptime once per row.
transactions['transaction_date'] = pd.to_datetime(
    transactions.transaction_date, format='%Y-%m-%d')

#==============================================================================
print('reduce memory')
#==============================================================================
utils.reduce_memory(transactions)
def make(T):
    """Write the 7-day ``num_100_ratio`` statistics for one data window.

    ``T`` selects the window: ``-1`` is the test set (folder ``'test'``);
    any other value maps to ``'trainW-<T>'`` (e.g. ``T = 0`` ->
    ``'trainW-0'``).

    For every log row, ``num_25`` and ``num_100`` are replaced by their
    share of ``num_25 + num_100``; rows that end up containing NaN (such
    as 0/0) are dropped.  The logs are left-joined onto the users of the
    window, filtered through ``within_n_days(group, T, n=7)`` (defined
    elsewhere -- presumably keeps the last 7 days, confirm), and the
    per-user mean/min/max/median/std of ``num_100_ratio`` is written to
    ``../feature/<folder>/num_100_ratio_during_t_7.csv``.

    The all-history statistics and the ``num_25_ratio`` statistics are
    intentionally not produced by this function.
    """
    # For speed only the two extremes, num_25 and num_100, are loaded.
    input_col = ['msno', 'date', 'num_25', 'num_100']
    is_test = T == -1
    folder = 'test' if is_test else 'trainW-' + str(T)

    # Users to build features for (label file).
    if is_test:
        # For the test window, `train` actually holds the test-set users.
        train = pd.read_csv('../input/sample_submission_v2.csv')[['msno']]
    else:
        train = pd.read_csv(
            '../input/preprocessed_data/trainW-{0}.csv'.format(T))[['msno']]

    # Listening logs of the window.
    user_logs = utils.read_multiple_csv(
        '../feature/{}/compressed_user_logs'.format(folder),
        input_col,
        parse_dates=['date'])
    if is_test:
        # The test window additionally gets the rows of user_logs_v2.csv.
        extra_logs = pd.read_csv('../input/user_logs_v2.csv',
                                 parse_dates=['date'])[input_col]
        user_logs = pd.concat([user_logs, extra_logs], ignore_index=True)

    ##################################################
    # basic procedure
    ##################################################
    # Turn the two counts into ratios of their row-wise sum.
    counts = user_logs.loc[:, "num_25":"num_100"]
    row_totals = counts.sum(axis=1)
    user_logs.loc[:, "num_25":"num_100"] = counts.div(row_totals, axis=0)
    user_logs.rename(
        columns={'num_25': 'num_25_ratio', 'num_100': 'num_100_ratio'},
        inplace=True)
    # 0/0 yields NaN, which is problematic; drop those rows.
    user_logs.dropna(inplace=True)

    print('reduce memory')
    utils.reduce_memory(user_logs)

    df = pd.merge(train, user_logs, on='msno', how='left')
    del user_logs
    gc.collect()
    print ('shape of df:', df.shape)

    ##################################################
    # n = 7
    ##################################################
    df_ = df.groupby('msno').apply(within_n_days, T, n=7)
    df_ = df_.reset_index(drop=True)

    # core2: num_100_ratio
    print ('core2')
    per_user = df_.groupby('msno').num_100_ratio
    tbl = per_user.mean().to_frame()
    # NOTE(review): an earlier author comment says this name was once
    # mistyped as 'num_repeated_songs_during_t_7' and the features were not
    # regenerated -- already-written feature files may carry the old name.
    tbl.columns = ['num_100_ratio_during_t_7-mean']
    tbl['num_100_ratio_during_t_7-min'] = per_user.min()
    tbl['num_100_ratio_during_t_7-max'] = per_user.max()
    tbl['num_100_ratio_during_t_7-median'] = per_user.median()
    tbl['num_100_ratio_during_t_7-std'] = per_user.std()
    tbl.reset_index(inplace=True)
    del per_user, df_
    gc.collect()

    print('reduce memory')
    utils.reduce_memory(tbl)

    # write
    out_path = '../feature/{}/num_100_ratio_during_t_7.csv'.format(folder)
    tbl.to_csv(out_path, index=False)
    del tbl
    gc.collect()
def make(T):
    """Write "listened on consecutive days" features for one data window.

    ``T`` selects the window: ``-1`` is the test set (folder ``'test'``);
    any other value maps to ``'trainW-<T>'`` (e.g. ``T = 0`` ->
    ``'trainW-0'``).

    The logs are left-joined onto the users of the window and sorted by
    user and date; ``date_diff`` is the number of days between a row and
    the row before it.  ``drop_first_columns`` (defined elsewhere --
    presumably removes each user's first row, whose difference would be
    taken against the previous user; confirm) is then applied per user.

    For each ``n`` in 7, 14, 30, 60, 90 the rows kept by
    ``within_n_days(group, T, n=n)`` (defined elsewhere) are used to
    compute, per user:

    * ``listen_music_in_a_row_count_during_t_<n>`` -- number of rows with
      ``date_diff == 1``;
    * ``listen_music_in_a_row_ratio_during_t_<n>`` -- that count divided by
      the user's number of rows in the window.

    Users without any ``date_diff == 1`` row do not appear in the output.
    Each table is written to
    ``../feature/<folder>/listen_music_in_a_row_count_during_t_<n>.csv``.
    The all-history variant is intentionally not produced (for speed).
    """
    input_col = ['msno', 'date']
    if T == -1:
        folder = 'test'
        # label: for the test window, `train` actually holds the test users.
        train = pd.read_csv('../input/sample_submission_v2.csv')[['msno']]
        # file1
        user_logs = utils.read_multiple_csv(
            '../feature/{}/compressed_user_logs'.format(folder),
            input_col,
            parse_dates=['date'])
        user_logs = pd.concat([
            user_logs,
            pd.read_csv('../input/user_logs_v2.csv',
                        parse_dates=['date'])[input_col]
        ], ignore_index=True)
    else:
        folder = 'trainW-' + str(T)
        # label
        train = pd.read_csv(
            '../input/preprocessed_data/trainW-{0}.csv'.format(T))[['msno']]
        # file1
        user_logs = utils.read_multiple_csv(
            '../feature/{}/compressed_user_logs'.format(folder),
            input_col,
            parse_dates=['date'])

    ##################################################
    # basic procedure
    ##################################################
    print('reduce memory')
    utils.reduce_memory(user_logs)

    df = pd.merge(train, user_logs, on='msno', how='left')
    del user_logs
    gc.collect()

    # The day difference below compares each row with the previous one, so
    # rows must be ordered by user and then by date first.
    df.sort_values(by=['msno', 'date'], inplace=True)
    # Vectorized day difference; missing dates (NaT) give NaN.
    df['date_diff'] = df.date.diff().dt.days
    print('shape of df:', df.shape)
    # Each user's first row is not needed.
    df = df.groupby('msno').apply(drop_first_columns)
    df.reset_index(drop=True, inplace=True)

    ##################################################
    # n = 7, 14, 30, 60, 90
    ##################################################
    for n in (7, 14, 30, 60, 90):
        count_col = 'listen_music_in_a_row_count_during_t_{}'.format(n)
        ratio_col = 'listen_music_in_a_row_ratio_during_t_{}'.format(n)

        df_ = df.groupby('msno').apply(within_n_days, T,
                                       n=n).reset_index(drop=True)

        # core: date_diff == 1 means the user listened on consecutive days.
        tbl = df_[df_.date_diff == 1].groupby('msno').date_diff.size().to_frame()
        tbl.columns = [count_col]
        # Division aligns on msno; the denominator is rows per user.
        tbl[ratio_col] = tbl[count_col] / df_.groupby('msno').date_diff.size()
        tbl.reset_index(inplace=True)
        del df_
        gc.collect()

        print('reduce memory')
        utils.reduce_memory(tbl)

        # write
        tbl.to_csv('../feature/{}/{}.csv'.format(folder, count_col),
                   index=False)
        del tbl
        gc.collect()
def make(T):
    """Write log-in count and log-in ratio features for one data window.

    ``T`` selects the window: ``-1`` is the test set (folder ``'test'``);
    any other value maps to ``'trainW-<T>'`` (e.g. ``T = 0`` ->
    ``'trainW-0'``).

    The logs are left-joined onto the users of the window.  Features are
    produced for the whole history and for each ``n`` in 7, 14, 30, 60, 90
    (rows kept by ``within_n_days(group, T, n=n)``, defined elsewhere):

    * ``num_log_in[_during_t_<n>]`` -- number of rows per user.  Because of
      the left join, a user without logs still has one row.
    * ``log_in_ratio[_during_t_<n>]`` -- that count divided by the
      ``listening_longevity`` column produced by the
      ``listening_longevity`` helper (defined elsewhere; presumably the
      span between first and last listening day -- confirm).

    Each table is written to ``../feature/<folder>/num_log_in.csv`` or
    ``../feature/<folder>/num_log_in_during_t_<n>.csv``.
    """
    input_col = ['msno', 'date']
    if T == -1:
        folder = 'test'
        # label: for the test window, `train` actually holds the test users.
        train = pd.read_csv('../input/sample_submission_v2.csv')[['msno']]
        user_logs = utils.read_multiple_csv(
            '../feature/{}/compressed_user_logs'.format(folder),
            input_col,
            parse_dates=['date'])
        user_logs = pd.concat([
            user_logs,
            pd.read_csv('../input/user_logs_v2.csv',
                        parse_dates=['date'])[input_col]
        ], ignore_index=True)
    else:
        folder = 'trainW-' + str(T)
        # label
        train = pd.read_csv(
            '../input/preprocessed_data/trainW-{0}.csv'.format(T))[['msno']]
        user_logs = utils.read_multiple_csv(
            '../feature/{}/compressed_user_logs'.format(folder),
            input_col,
            parse_dates=['date'])

    ##################################################
    # basic procedure
    ##################################################
    print('reduce memory')
    utils.reduce_memory(user_logs)

    df = pd.merge(train, user_logs, on='msno', how='left')
    del user_logs
    gc.collect()
    print('shape of df:', df.shape)

    ##################################################
    # All history (n is None), then n = 7, 14, 30, 60, 90
    ##################################################
    for n in (None, 7, 14, 30, 60, 90):
        if n is None:
            frame = df
            suffix = ''
        else:
            frame = df.groupby('msno').apply(within_n_days, T,
                                             n=n).reset_index(drop=True)
            suffix = '_during_t_{}'.format(n)
        count_col = 'num_log_in' + suffix
        ratio_col = 'log_in_ratio' + suffix

        # count
        tbl = frame.groupby('msno').date.size().to_frame()
        tbl.columns = [count_col]
        tbl.reset_index(inplace=True)

        # for computing log_in_ratio: one longevity row per user
        longevity = frame.groupby('msno').apply(listening_longevity)
        longevity.drop_duplicates('msno', inplace=True)
        tbl = pd.merge(tbl, longevity, on='msno', how='left')
        del longevity, frame

        # Previously only the all-history and n = 7 sections reduced memory
        # here; it is now done for every window so they are treated alike.
        # NOTE(review): assumes utils.reduce_memory only downcasts without
        # changing values -- confirm.
        print('reduce memory')
        utils.reduce_memory(tbl)
        gc.collect()

        # log_in_ratio
        tbl[ratio_col] = tbl[count_col] / tbl.listening_longevity
        # 'date' and 'listening_longevity' came in through the merge and are
        # not part of the written features.
        tbl.drop(['date', 'listening_longevity'], axis=1, inplace=True)

        # write
        tbl.to_csv('../feature/{}/{}.csv'.format(folder, count_col),
                   index=False)
        del tbl
        gc.collect()
def make(T):
    """Write per-user statistics of the play-count columns for one window.

    ``T`` selects the window: ``-1`` is the test set (folder ``'test'``);
    any other value maps to ``'trainW-<T>'`` (e.g. ``T = 0`` ->
    ``'trainW-0'``).

    The logs are left-joined onto the users of the window.  For each of
    ``num_25``, ``num_50``, ``num_75``, ``num_985``, ``num_100`` and
    ``num_unq`` the per-user mean/min/max/median/std is computed over three
    scopes and written to ``../feature/<folder>/``:

    * all history            -> ``<col>.csv``    (columns ``<col>-<stat>``)
    * ``near(group, 5)``     -> ``<col>_n5.csv`` (columns ``<col>-<stat>_n5``)
    * ``near(group, 1)``     -> ``<col>_n1.csv`` (columns ``<col>-<stat>_n1``)

    ``near`` is defined elsewhere (presumably keeps each user's most recent
    n rows -- confirm).  ``input_col`` is a module-level name defined
    outside this function; it must contain ``msno``, ``date`` and the six
    columns above.
    """
    if T == -1:
        folder = 'test'
        user_logs = utils.read_multiple_csv(
            '../feature/{}/compressed_user_logs'.format(folder), input_col)
        # user_logs_v2.csv includes the data of March; it is for the testing
        # set.  Only input_col is kept, as for the compressed logs.
        user_logs = pd.concat(
            [user_logs, pd.read_csv('../input/user_logs_v2.csv')[input_col]],
            ignore_index=True)
        # The appended rows break the ordering, so sort by user and date.
        user_logs.sort_values(by=['msno', 'date'], inplace=True)
        # For the test window, `train` actually holds the test-set users.
        # Only the key is needed (is_churn is not), as in the train branch.
        train = pd.read_csv('../input/sample_submission_v2.csv')[['msno']]
    else:
        folder = 'trainW-' + str(T)
        user_logs = utils.read_multiple_csv(
            '../feature/{}/compressed_user_logs'.format(folder), input_col)
        # we do not need is_churn
        train = pd.read_csv(
            '../input/preprocessed_data/trainW-{0}.csv'.format(T))[['msno']]

    print('reduce memory')
    utils.reduce_memory(user_logs)
    utils.reduce_memory(train)

    # merge
    df = pd.merge(train, user_logs, on='msno', how='left')
    del user_logs
    gc.collect()

    stat_cols = ('num_25', 'num_50', 'num_75', 'num_985', 'num_100',
                 'num_unq')
    # (rows passed to `near`, suffix used in column and file names);
    # None means the full history.
    scopes = ((None, ''), (5, '_n5'), (1, '_n1'))

    for last_n, suffix in scopes:
        if last_n is None:
            frame = df
        else:
            frame = df.groupby('msno').apply(near,
                                             last_n).reset_index(drop=True)
        grouped = frame.groupby('msno')

        for col in stat_cols:
            per_user = grouped[col]
            tbl = per_user.mean().to_frame()
            tbl.columns = ['{}-mean{}'.format(col, suffix)]
            for stat in ('min', 'max', 'median', 'std'):
                tbl['{}-{}{}'.format(col, stat, suffix)] = getattr(
                    per_user, stat)()
            tbl.reset_index(inplace=True)

            print('reduce memory')
            utils.reduce_memory(tbl)

            # write
            tbl.to_csv('../feature/{}/{}{}.csv'.format(folder, col, suffix),
                       index=False)
            del tbl

        # Release the windowed copy before building the next one.
        del grouped, frame
        gc.collect()
# (column-name label, GroupBy method) pairs for features that are written as
# a per-user count (sum) and ratio (mean).
_LOYALTY_COUNT_RATIO = (('_count', 'sum'), ('_ratio', 'mean'))

# (column-name label, GroupBy method) pairs for features that are written as
# five per-user summary statistics.
_LOYALTY_SUMMARY = (
    ('-mean', 'mean'),
    ('-min', 'min'),
    ('-max', 'max'),
    ('-median', 'median'),
    ('-std', 'std'),
)

# Feature column -> statistics exported for it, in the order the files are
# written.
_LOYALTY_FEATURES = (
    ('is_subscribe_early', _LOYALTY_COUNT_RATIO),
    ('do_change_payment_method', _LOYALTY_COUNT_RATIO),
    ('do_spend_more_money', _LOYALTY_SUMMARY),
    ('do_extend_payment_days', _LOYALTY_SUMMARY),
    ('do_paid_more', _LOYALTY_COUNT_RATIO),
)


def _write_loyalty_stats(frame, folder, suffix=''):
    """Write one per-msno statistics CSV for every loyalty feature.

    For each ``(feature, stats)`` entry of ``_LOYALTY_FEATURES`` the rows of
    *frame* are grouped by ``msno`` and the listed statistics of the feature
    column are collected into a table whose columns are named
    ``<feature><label><suffix>``.  The table is passed through
    ``utils.reduce_memory`` and written to
    ``../feature/<folder>/<feature><suffix>.csv`` without the index.

    Args:
        frame: DataFrame holding ``msno`` and every feature column.
        folder: Name of the sub-directory of ``../feature`` to write into.
        suffix: Text appended to every column name and file stem
            (e.g. ``'_n5'``); empty for the all-history tables.
    """
    for feature, stats in _LOYALTY_FEATURES:
        # Build the grouping once per feature and reuse it for every
        # statistic instead of regrouping the frame each time.
        grouped = frame.groupby('msno')[feature]

        tbl = None
        for label, method in stats:
            values = getattr(grouped, method)()
            column = feature + label + suffix
            if tbl is None:
                # The first statistic creates the table (index = msno).
                tbl = values.to_frame()
                tbl.columns = [column]
            else:
                tbl[column] = values
        tbl.reset_index(inplace=True)

        print('reduce memory')
        utils.reduce_memory(tbl)

        tbl.to_csv(
            '../feature/{}/{}{}.csv'.format(folder, feature, suffix),
            index=False)


def make(T):
    """Build the membership-loyalty feature files for one data window.

    Reads the user list for window *T* and the
    ``days_since_the_last_transactions`` feature files of the matching
    folder, left-merges them on ``['msno', 'w']`` and writes per-user
    statistics of the loyalty columns three times: over all merged rows,
    over the rows returned by ``near(group, 5)`` (suffix ``_n5``) and over
    the rows returned by ``near(group, 1)`` (suffix ``_n1``).

    Relies on the module-level names ``pd``, ``gc``, ``utils``, ``near`` and
    ``input_col``.

    Args:
        T: Window index.  ``-1`` selects the test set (folder ``'test'``,
            users from ``sample_submission_v2.csv``); any other value
            selects folder ``'trainW-<T>'`` and ``trainW-<T>.csv``.

    Returns:
        None.  All results are written as CSV files under
        ``../feature/<folder>/``.
    """
    if T == -1:
        folder = 'test'
        # Here `train` actually holds the test-set users.
        train = pd.read_csv('../input/sample_submission_v2.csv')
        train['w'] = T
    else:
        folder = 'trainW-' + str(T)
        # Only the merge keys are kept; is_churn is not needed here.
        train = pd.read_csv(
            '../input/preprocessed_data/trainW-{0}.csv'.format(T))[[
                'msno', 'w'
            ]]

    membership_loyalty = utils.read_multiple_csv(
        '../feature/{}/days_since_the_last_transactions'.format(folder),
        input_col)

    print('reduce memory')
    utils.reduce_memory(membership_loyalty)

    # Left merge: every user of `train` is kept even without loyalty rows.
    df = pd.merge(train, membership_loyalty, on=['msno', 'w'], how='left')

    ##################################################
    # All history
    ##################################################
    _write_loyalty_stats(df, folder)

    ##################################################
    # near 5, then only one previous order
    ##################################################
    # `near` is defined elsewhere in the file; presumably it keeps each
    # user's n most recent rows -- TODO confirm against its definition.
    for n_orders in (5, 1):
        df_ = df.groupby('msno').apply(near, n_orders).reset_index(drop=True)
        _write_loyalty_stats(df_, folder, '_n{}'.format(n_orders))
        del df_
        gc.collect()
# Ratio columns exported by make(), in the order they are written
# (printed as core1, core2).
_LISTENING_RATIO_COLUMNS = ('completed_songs_ratio', 'num_repeated_songs')


def _write_listening_ratio_stats(frame, folder, suffix=''):
    """Write per-msno summary statistics of both listening-ratio columns.

    For each column of ``_LISTENING_RATIO_COLUMNS`` the rows of *frame* are
    grouped by ``msno`` and the mean, min, max, median and std are stored in
    columns named ``<column><suffix>-<stat>``.  The table is passed through
    ``utils.reduce_memory`` and written to
    ``../feature/<folder>/<column><suffix>.csv`` without the index.

    Args:
        frame: DataFrame holding ``msno`` and both ratio columns.
        folder: Name of the sub-directory of ``../feature`` to write into.
        suffix: Text inserted after the column name in every output column
            and file stem (e.g. ``'_during_t_7'``); empty for all history.
    """
    for position, column in enumerate(_LISTENING_RATIO_COLUMNS, 1):
        print('core{}'.format(position))
        name = column + suffix

        # Build the grouping once and reuse it for all five statistics.
        grouped = frame.groupby('msno')[column]
        tbl = grouped.mean().to_frame()
        tbl.columns = [name + '-mean']
        for stat in ('min', 'max', 'median', 'std'):
            tbl['{}-{}'.format(name, stat)] = getattr(grouped, stat)()
        tbl.reset_index(inplace=True)

        print('reduce memory')
        utils.reduce_memory(tbl)

        tbl.to_csv('../feature/{}/{}.csv'.format(folder, name), index=False)
        del tbl
        gc.collect()


def make(T):
    """Build the listening-ratio feature files for one data window.

    Loads the user list and the compressed user logs for window *T*, derives
    two per-log-row ratios and writes per-user statistics of them over the
    whole history and over the rows selected by
    ``within_n_days(group, T, n=...)`` for n = 7, 14, 30, 60 and 90.

    Derived columns:
        completed_songs_ratio:
            (num_100 + num_985) / (num_25 + num_50 + num_75 + num_100 + num_985)
        num_repeated_songs:
            (num_100 + num_985 + num_75) / num_unq

    Relies on the module-level names ``pd``, ``gc``, ``utils`` and
    ``within_n_days``.

    Args:
        T: Window index.  ``-1`` selects the test set (folder ``'test'``,
            users from ``sample_submission_v2.csv``, logs extended with
            ``user_logs_v2.csv``); any other value selects folder
            ``'trainW-<T>'`` and ``trainW-<T>.csv``.

    Returns:
        None.  All results are written as CSV files under
        ``../feature/<folder>/``.
    """
    input_col = [
        'msno', 'date', 'num_25', 'num_50', 'num_75', 'num_985', 'num_100',
        'num_unq'
    ]

    is_test = T == -1
    if is_test:
        folder = 'test'
        # For the test set the "train" users come from the sample submission.
        label_path = '../input/sample_submission_v2.csv'
    else:
        folder = 'trainW-' + str(T)
        label_path = '../input/preprocessed_data/trainW-{0}.csv'.format(T)

    # label
    train = pd.read_csv(label_path)[['msno']]

    # file1
    user_logs = utils.read_multiple_csv(
        '../feature/{}/compressed_user_logs'.format(folder),
        input_col,
        parse_dates=['date'])
    if is_test:
        # The test window additionally uses the rows of user_logs_v2.csv.
        user_logs = pd.concat([
            user_logs,
            pd.read_csv('../input/user_logs_v2.csv',
                        parse_dates=['date'])[input_col]
        ],
                              ignore_index=True)

    ##################################################
    # basic procedure
    ##################################################
    # incompleted vs completed
    user_logs['completed_songs_ratio'] = (
        user_logs.num_100 + user_logs.num_985) / (
            user_logs.num_25 + user_logs.num_50 + user_logs.num_75 +
            user_logs.num_100 + user_logs.num_985)
    # num_repeated_songs
    user_logs['num_repeated_songs'] = (user_logs.num_100 + user_logs.num_985 +
                                       user_logs.num_75) / user_logs.num_unq
    user_logs.drop(
        ['num_25', 'num_50', 'num_75', 'num_985', 'num_100', 'num_unq'],
        axis=1,
        inplace=True)
    # Drop rows where a ratio is 0/0 (NaN).
    # NOTE(review): a non-zero numerator over num_unq == 0 yields inf, which
    # dropna() keeps -- confirm such rows cannot occur in the logs.
    user_logs.dropna(inplace=True)
    gc.collect()  # release memory first

    print('reduce memory')
    utils.reduce_memory(user_logs)

    # Left merge: users without any remaining log row are kept with NaN.
    df = pd.merge(train, user_logs, on='msno', how='left')
    del user_logs
    gc.collect()
    print('shape of df:', df.shape)

    ##################################################
    # All history
    ##################################################
    _write_listening_ratio_stats(df, folder)

    ##################################################
    # n = 7, 14, 30, 60, 90
    ##################################################
    # `within_n_days` is defined elsewhere in the file; presumably it keeps
    # each user's rows from the last n days of window T -- TODO confirm.
    for n_days in (7, 14, 30, 60, 90):
        df_ = df.groupby('msno').apply(
            within_n_days, T, n=n_days).reset_index(drop=True)
        _write_listening_ratio_stats(df_, folder,
                                     '_during_t_{}'.format(n_days))
        del df_
        gc.collect()