Пример #1
0
def multi(keys):
    """Create dense-ranked per-group variance features for ``hour`` and ``day``.

    The module-level ``trte`` frame is grouped by ``keys``; the variance of
    ``hour`` and of ``day`` inside each group is dense-ranked, joined back
    onto ``trte`` and written out as two pickles: the first
    ``utils.TRAIN_SHAPE`` rows go to the train file, the remaining rows to
    the test file.

    Args:
        keys: Iterable of ``trte`` column names to group by. The names are
            also joined with '-' to build the feature and file names.
    """
    keys_ = '-'.join(keys)
    print(keys)

    grouped = trte.groupby(keys)

    # One ranked variance series per source column, in output column order.
    ranked = []
    for source, prefix in (('hour', 'hourvar_'), ('day', 'dayvar_')):
        feature = grouped[source].var().rank(method='dense')
        feature.name = prefix + keys_
        ranked.append(feature)

    df = pd.concat(ranked, axis=1)
    del ranked, feature
    gc.collect()

    # The return value is ignored, so reduce_memory presumably shrinks the
    # dtypes in place -- TODO confirm against utils.
    utils.reduce_memory(df)
    col = df.columns.tolist()
    df.reset_index(inplace=True)

    result = pd.merge(trte, df, on=keys, how='left')

    n_train = utils.TRAIN_SHAPE
    train_path = '../data/111__{}_train.p'.format(keys_)
    test_path = '../data/111__{}_test.p'.format(keys_)
    result.iloc[0:n_train][col].to_pickle(train_path)
    result.iloc[n_train:][col].to_pickle(test_path)
    gc.collect()
def multi(k):
    """Add the row-normalised hour histogram of one key column as features.

    Cross-tabulates ``trte[k]`` against ``trte.hour`` with
    ``normalize='index'``, prefixes the resulting columns with
    ``histHourNorm_<k>_``, joins them back onto the module-level ``trte``
    frame and hands the train / test slices to ``utils.to_pickles``.

    Args:
        k: Name of a ``trte`` column, e.g. ``'app'``.
    """
    gc.collect()
    print(k)

    hist = pd.crosstab(trte[k], trte.hour, normalize='index')
    hist = hist.add_prefix(f'histHourNorm_{k}_')

    utils.reduce_memory(hist)
    feature_cols = hist.columns.tolist()

    joined = pd.merge(trte, hist.reset_index(), on=k, how='left')
    gc.collect()

    # NOTE(review): the output locations below do not contain ``k``, so calls
    # for different keys target the same paths -- confirm this is intended.
    n_train = utils.TRAIN_SHAPE

    train_part = joined.iloc[0:n_train][feature_cols].reset_index(drop=True)
    utils.to_pickles(train_part, '../data/114_train', utils.SPLIT_SIZE)
    del train_part
    gc.collect()

    test_part = joined.iloc[n_train:][feature_cols].reset_index(drop=True)
    utils.to_pickles(test_part, '../data/114_test', utils.SPLIT_SIZE)
Пример #3
0
def multi(keys):
    """Create dense-ranked per-group ``timestamp`` statistics as features.

    Groups the module-level ``trte`` frame by ``keys`` and, for every group,
    computes min, max, (max - min), mean, median, (mean - median), variance
    and skew of ``timestamp``. Each statistic is dense-ranked across groups,
    joined back onto ``trte`` and pickled: the first ``utils.TRAIN_SHAPE``
    rows go to the train file, the remaining rows to the test file.

    The output file names are built from the '-'-joined key string, the same
    string used in the feature names. Formatting the raw ``keys`` object
    instead would put its ``repr`` (brackets, quotes, spaces) into the path.

    Args:
        keys: Iterable of ``trte`` column names to group by.
    """
    keys_ = '-'.join(keys)
    print(keys)

    timestamps = trte.groupby(keys)['timestamp']

    def ranked(series, prefix):
        # Dense-rank one per-group statistic and give it its feature name.
        out = series.rank(method='dense')
        out.name = prefix + keys_
        return out

    # Range statistics.
    t_min = timestamps.min()
    t_max = timestamps.max()
    features = [
        ranked(t_min, 'timemin_'),
        ranked(t_max, 'timemax_'),
        ranked(t_max - t_min, 'timediff-minmax_'),
    ]
    del t_min, t_max
    gc.collect()

    # Central-tendency statistics.
    t_mean = timestamps.mean()
    t_median = timestamps.median()
    features += [
        ranked(t_mean, 'timemean_'),
        ranked(t_median, 'timemedian_'),
        # Existing feature name (including its spelling) is kept unchanged so
        # downstream consumers keep working.
        ranked(t_mean - t_median, 'timediff-meadian_'),
    ]
    del t_mean, t_median
    gc.collect()

    # Spread / shape statistics.
    features.append(ranked(timestamps.var(), 'timevar_'))
    features.append(ranked(timestamps.skew(), 'timeskew_'))

    df = pd.concat(features, axis=1)
    del features
    gc.collect()

    # The return value is ignored, so reduce_memory presumably works in
    # place -- TODO confirm against utils.
    utils.reduce_memory(df)
    col = df.columns.tolist()
    df.reset_index(inplace=True)

    result = pd.merge(trte, df, on=keys, how='left')

    n_train = utils.TRAIN_SHAPE
    result.iloc[0:n_train][col].to_pickle(
        '../data/110__{}_train.p'.format(keys_))
    result.iloc[n_train:][col].to_pickle(
        '../data/110__{}_test.p'.format(keys_))
    gc.collect()
def make(T):
    """Build the "listening habit" user-log features for one window.

    Reads the compressed user logs of the selected folder, derives
    ``total_secs_percentage`` and the cumulative / ratio columns below, and
    writes the result with ``utils.to_multiple_csv``.

    Args:
        T: Window index. ``-1`` selects the 'test' folder and additionally
            appends ``../input/user_logs_v2.csv``; any other value selects
            'trainW-<T>' (e.g. ``T = 0`` -> folder 'trainW-0').
    """
    is_test = T == -1
    folder = 'test' if is_test else 'trainW-' + str(T)

    user_logs = utils.read_multiple_csv(
        '../feature/{}/compressed_user_logs'.format(folder), input_col)
    if is_test:
        # user_logs_v2.csv only includes the data of March; it is for the
        # testing set.
        latest = pd.read_csv('../input/user_logs_v2.csv',
                             parse_dates=['date'])[input_col]
        user_logs = pd.concat([user_logs, latest], ignore_index=True)
        del latest
        # Sorting by ['msno', 'date'] was disabled here: concat + sort_values
        # made memory spike and slowed things down, and the author judged the
        # sort unnecessary because the later steps use groupby.

    print('reduce memory')
    utils.reduce_memory(user_logs)
    gc.collect()
    print('shape1:', user_logs.shape)

    # core
    seconds_per_day = 24 * 60 * 60
    user_logs['total_secs_percentage'] = user_logs.total_secs.apply(
        lambda secs: secs / seconds_per_day)

    # Despite its name, this column holds the running sum of total_secs
    # within each msno.
    per_user = user_logs.groupby('msno')
    user_logs['num_of_time_the_user_has_logged_in'] = per_user.total_secs.cumsum()
    user_logs.drop('total_secs', axis=1, inplace=True)

    # make_order_number is defined elsewhere; it presumably adds the
    # 'order_number' column consumed right below -- verify against its source.
    user_logs = user_logs.groupby('msno').apply(make_order_number)
    cumulative = user_logs.num_of_time_the_user_has_logged_in
    user_logs['num_of_time_the_user_has_logged_in_ratio'] = (
        cumulative / user_logs.order_number)
    user_logs.drop('order_number', axis=1, inplace=True)

    print('reduce memory')
    utils.reduce_memory(user_logs)
    print('shape2:', user_logs.shape)

    # write
    path = '../feature/{}/user_logs_listening_habit'.format(folder)
    gc.collect()
    utils.to_multiple_csv(user_logs, path, split_size=8)
    del user_logs
    gc.collect()
    print('{0} done'.format(T))
def make(T):
    """Build the membership (demographics) feature table for one window.

    Joins the window's users with the module-level ``demographics`` frame,
    derives membership-age features relative to a window-specific reference
    date, one-hot encodes the categorical columns and writes
    ``membership_stat.csv`` into the window's feature folder.

    Args:
        T: Window index. ``-1`` selects the 'test' folder (users come from
            the sample submission file); any other value selects
            'trainW-<T>' (e.g. ``T = 0`` -> folder 'trainW-0').
    """
    if T == -1:
        folder = 'test'
        # Here ``train`` actually holds the users of the test set.
        train = pd.read_csv('../input/sample_submission_v2.csv')
    else:
        folder = 'trainW-' + str(T)
        source = '../input/preprocessed_data/trainW-{0}.csv'.format(T)
        train = pd.read_csv(source)[['msno']]  # we do not need is_churn

    print('reduce memory')
    utils.reduce_memory(train)

    df = pd.merge(train, demographics, on='msno', how='left')

    # Reference "now" per window; every T not listed uses 2017-04-01.
    reference_dates = {0: '2017-03-01', 1: '2017-02-01', 2: '2017-01-01'}
    now_time = datetime.strptime(reference_dates.get(T, '2017-04-01'),
                                 '%Y-%m-%d')

    def membership_days(reg_time):
        # -1 marks a missing registration time.
        if pd.notnull(reg_time):
            registered = datetime.utcfromtimestamp(reg_time.tolist() / 1e9)
            return (now_time - registered).days
        return -1

    def membership_years(days):
        # A year is counted as 360 days here; -1 is passed through unchanged.
        if days != -1:
            return days / 360
        return -1

    df['how_long_has_benn_a_memmbership_of_kkbox_days'] = [
        membership_days(reg_time)
        for reg_time in df.registration_init_time.values
    ]
    df['how_long_has_benn_a_memmbership_of_kkbox_years'] = [
        membership_years(days)
        for days in df.how_long_has_benn_a_memmbership_of_kkbox_days.values
    ]
    df.drop('registration_init_time', axis=1, inplace=True)

    print('one-hot encoding for dummy varaiables')
    # city_zone / bd_zone / registered_via_zone are deliberately left as they
    # are: their values are meaningful, so they do not need one-hot encoding.
    for categorical in ('city', 'gender', 'registered_via'):
        df = pd.get_dummies(df, columns=[categorical])

    print('reduce memory')
    utils.reduce_memory(df)

    # write
    df.to_csv('../feature/{}/membership_stat.csv'.format(folder), index=False)
def make(T):
    """Build the "listening behavior" user-log features for one window.

    Reads the compressed user logs of the selected folder, derives completed
    / incompleted song counts, their ratio, a satisfaction flag and a
    repeat-listening ratio, drops the raw ``num_*`` columns and writes the
    result with ``utils.to_multiple_csv``.

    Args:
        T: Window index. ``-1`` selects the 'test' folder and additionally
            appends ``../input/user_logs_v2.csv``; any other value selects
            'trainW-<T>' (e.g. ``T = 0`` -> folder 'trainW-0').
    """
    is_test = T == -1
    folder = 'test' if is_test else 'trainW-' + str(T)

    user_logs = utils.read_multiple_csv(
        '../feature/{}/compressed_user_logs'.format(folder), input_col)
    if is_test:
        latest = pd.read_csv('../input/user_logs_v2.csv',
                             parse_dates=['date'])[input_col]
        user_logs = pd.concat([user_logs, latest], ignore_index=True)
        del latest

    print('reduce memory')
    utils.reduce_memory(user_logs)
    print('shape1:', user_logs.shape)
    gc.collect()

    # incompleted vs completed
    completed = user_logs.num_100 + user_logs.num_985
    incompleted = user_logs.num_25 + user_logs.num_50 + user_logs.num_75
    user_logs['num_completed_songs'] = completed
    user_logs['num_incompleted_songs'] = incompleted
    user_logs['completed_songs_ratio'] = completed / (incompleted + completed)

    def satisfied(ratio):
        # 1 when more than half of the plays were completed, otherwise 0
        # (a NaN ratio also yields 0).
        if ratio > 0.5:
            return 1
        return 0

    user_logs['is_satisfied'] = user_logs.completed_songs_ratio.apply(satisfied)

    # num_repeated_songs
    mostly_played = completed + user_logs.num_75
    user_logs['num_repeated_songs'] = mostly_played / user_logs.num_unq

    raw_columns = ['num_25', 'num_50', 'num_75', 'num_985', 'num_100',
                   'num_unq']
    user_logs.drop(raw_columns, axis=1, inplace=True)
    del completed, incompleted, mostly_played

    print('reduce memory')
    utils.reduce_memory(user_logs)
    print('shape2:', user_logs.shape)
    gc.collect()

    # write
    path = '../feature/{}/user_logs_listening_behavior'.format(folder)
    gc.collect()
    utils.to_multiple_csv(user_logs, path, split_size=10)
    print('{0} done'.format(T))
def multi(keys):
    """Create a dense-ranked "number of distinct combinations" feature.

    ``trte`` is first reduced to one row per distinct ``keys1`` combination;
    those rows are then counted per ``keys2`` group and the counts are
    dense-ranked. The feature is joined back onto ``trte`` on ``keys2`` and
    pickled as train / test parts split at ``utils.TRAIN_SHAPE``.

    Args:
        keys: Pair ``(keys1, keys2)`` of column-name lists. ``keys2`` is
            used to group the result of the ``keys1`` grouping, so it
            presumably has to be a subset of ``keys1`` -- verify against the
            caller.
    """
    gc.collect()
    print(keys)
    inner_keys, outer_keys = keys

    combos = trte.groupby(inner_keys).size()
    counts = combos.groupby(outer_keys).size().rank(method='dense')

    c = 'nunique_' + '-'.join(inner_keys) + '_' + '-'.join(outer_keys)
    counts.name = c
    table = counts.reset_index()
    utils.reduce_memory(table, ix_start=-1)

    result = pd.merge(trte, table, on=outer_keys, how='left')

    n_train = utils.TRAIN_SHAPE
    train_path = '../data/103__{}_train.p'.format(c)
    test_path = '../data/103__{}_test.p'.format(c)
    result.iloc[0:n_train][c].to_pickle(train_path)
    result.iloc[n_train:][c].to_pickle(test_path)
    gc.collect()
def multi(keys):
    """Create the per-group row count feature ``totalcount_<keys>``.

    Counts the rows of the module-level ``trte`` frame per ``keys`` group
    (raw counts, not ranked), joins the count back onto ``trte`` and pickles
    the train / test parts split at ``utils.TRAIN_SHAPE``.

    Args:
        keys: Iterable of ``trte`` column names to group by.
    """
    gc.collect()
    print(keys)

    keys_ = '-'.join(keys)
    feature = 'totalcount_' + keys_

    counts = trte.groupby(keys).size()
    counts.name = feature
    table = counts.reset_index()
    utils.reduce_memory(table, ix_start=-1)

    result = pd.merge(trte, table, on=keys, how='left')

    n_train = utils.TRAIN_SHAPE
    train_path = '../data/101__{}_train.p'.format(keys_)
    test_path = '../data/101__{}_test.p'.format(keys_)
    result.iloc[0:n_train][feature].to_pickle(train_path)
    result.iloc[n_train:][feature].to_pickle(test_path)
    gc.collect()
Пример #9
0
def make(T):
    """Build the "listening frequency" ratio features for one window.

    Reads the compressed user logs of the selected folder, converts the
    columns from ``num_25`` through ``num_100`` into per-row shares of their
    row sum, renames them to ``*_ratio`` and writes the result with
    ``utils.to_multiple_csv``.

    Args:
        T: Window index. ``-1`` selects the 'test' folder and additionally
            appends ``../input/user_logs_v2.csv``; any other value selects
            'trainW-<T>' (e.g. ``T = 0`` -> folder 'trainW-0').
    """
    is_test = T == -1
    folder = 'test' if is_test else 'trainW-' + str(T)

    user_logs = utils.read_multiple_csv(
        '../feature/{}/compressed_user_logs'.format(folder), input_col)
    if is_test:
        latest = pd.read_csv('../input/user_logs_v2.csv',
                             parse_dates=['date'])[input_col]
        user_logs = pd.concat([user_logs, latest], ignore_index=True)
        del latest

    print('reduce memory')
    utils.reduce_memory(user_logs)
    gc.collect()
    print('shape1:', user_logs.shape)

    # get_ratio
    # The label slice "num_25":"num_100" relies on the column order of the
    # loaded frame: every column positioned between the two is included.
    play_counts = user_logs.loc[:, "num_25":"num_100"]
    shares = play_counts.div(play_counts.sum(axis=1), axis=0)
    user_logs.loc[:, "num_25":"num_100"] = shares
    del play_counts, shares

    ratio_names = {
        'num_25': 'num_25_ratio',
        'num_50': 'num_50_ratio',
        'num_75': 'num_75_ratio',
        'num_985': 'num_985_ratio',
        'num_100': 'num_100_ratio',
    }
    user_logs.rename(columns=ratio_names, inplace=True)

    print('reduce memory')
    utils.reduce_memory(user_logs)
    gc.collect()

    # write
    path = '../feature/{}/user_logs_listening_freq'.format(folder)
    gc.collect()
    utils.to_multiple_csv(user_logs, path, split_size=10)
    print('{0} done'.format(T))
def multi(keys):
    """Create the dense-ranked ``sameClickTimeCount_<keys>`` feature.

    For every ``keys`` group of the module-level ``trte`` frame this takes
    the largest number of rows sharing one ``click_time``, dense-ranks that
    maximum across groups, joins it back onto ``trte`` and pickles the
    train / test parts split at ``utils.TRAIN_SHAPE``.

    Args:
        keys: Iterable of ``trte`` column names to group by.
    """
    gc.collect()
    print(keys)
    group_keys = list(keys)

    keys_ = '-'.join(group_keys)
    c = 'sameClickTimeCount_' + keys_

    # Rows per (keys, click_time), then the maximum of those counts per keys.
    per_click = trte.groupby(group_keys + ['click_time']).size()
    busiest = per_click.groupby(group_keys).max().rank(method='dense')
    busiest.name = c
    table = busiest.reset_index()
    del per_click, busiest
    utils.reduce_memory(table, ix_start=-1)
    gc.collect()

    result = pd.merge(trte, table, on=group_keys, how='left')
    gc.collect()

    n_train = utils.TRAIN_SHAPE
    train_path = '../data/112__{}_train.p'.format(keys_)
    test_path = '../data/112__{}_test.p'.format(keys_)
    result.iloc[0:n_train][c].to_pickle(train_path)
    result.iloc[n_train:][c].to_pickle(test_path)
    gc.collect()
def multi(keys):
    """Create per-(keys, hour) count and ratio features.

    Counts the rows of the module-level ``trte`` frame per ``keys`` + 'hour'
    group (``totalCountByHour_<keys>``) and divides that count by the
    ``hour_freq`` column looked up in the module-level ``day_tbl`` frame
    (``totalRatioByHour_<keys>``). Both columns are joined back onto
    ``trte`` and pickled as train / test parts split at
    ``utils.TRAIN_SHAPE``.

    Args:
        keys: Iterable of ``trte`` column names to group by ('hour' is
            appended internally; the file name uses only the given keys).
    """
    gc.collect()
    print(keys)
    base_keys = list(keys)

    keys_ = '-'.join(base_keys)
    c1 = 'totalCountByHour_' + keys_
    c2 = 'totalRatioByHour_' + keys_

    group_keys = base_keys + ['hour']
    counts = trte.groupby(group_keys).size()
    counts.name = c1

    table = pd.merge(counts.reset_index(), day_tbl, on='hour', how='left')
    table[c2] = table[c1] / table['hour_freq']
    table.drop('hour_freq', axis=1, inplace=True)

    utils.reduce_memory(table, ix_start=-2)

    result = pd.merge(trte, table, on=group_keys, how='left')

    n_train = utils.TRAIN_SHAPE
    wanted = [c1, c2]
    train_path = '../data/115__{}_train.p'.format(keys_)
    test_path = '../data/115__{}_test.p'.format(keys_)
    result.iloc[0:n_train][wanted].to_pickle(train_path)
    result.iloc[n_train:][wanted].to_pickle(test_path)
    gc.collect()
Пример #12
0
def concat_pred_item(T, dryrun=False):
    """Assemble the complete item-level feature table for one time window.

    Starting from the base table joined with the window's labels, this adds
    the user, item, user x item and daytime features (via helpers defined
    elsewhere in the module), merges per-key aggregates of a fixed list of
    feature pickles, derives a few hand-made features and finally either
    returns the frame or writes it with ``utils.to_pickles``.

    Args:
        T: Window index. ``-1`` selects the 'test' feature folder, any other
            value selects 'trainT-<T>'.
        dryrun: When truthy, continue with a random sample of at most 9999
            rows and return the frame instead of writing it.

    Returns:
        The assembled DataFrame when ``dryrun`` is truthy; otherwise ``None``
        (the frame is handed to ``utils.to_pickles``).
    """
    name = 'test' if T == -1 else 'trainT-' + str(T)

    #==============================================================================
    print('load label')
    #==============================================================================
    # NOTE: order_id is label
    print('load t3')
    X_base = pd.read_pickle('../feature/X_base_t3.p')

    label = pd.read_pickle('../feature/{}/label_reordered.p'.format(name))

    # 'inner' for removing t-n_order_id == NaN
    is_train = 1 if 'train' in name else 0
    df = pd.merge(X_base[X_base.is_train == is_train],
                  label,
                  on='order_id',
                  how='inner')

    if dryrun:
        print('dryrun')
        # Cap the sample size at the frame length: DataFrame.sample raises
        # when asked for more rows than exist (without replacement).
        df = df.sample(min(9999, len(df)))

    goods = pd.read_pickle('../input/mk/goods.p')[[
        'product_id', 'aisle_id', 'department_id'
    ]]
    df = pd.merge(df, goods, on='product_id', how='left')
    del goods

    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('user feature')
    #==============================================================================
    df = user_feature(df, name)
    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('item feature')
    #==============================================================================
    df = item_feature(df, name)
    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(df)
    ix_end = df.shape[1]

    #==============================================================================
    print('user x item')
    #==============================================================================
    df = user_item_feature(df, name)
    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    # Aggregated (per order / per user) versions of the user x item features.
    # The banner text is kept exactly as it was.
    print('user x item')

    #==============================================================================
    def compress(frame, key):
        """Aggregate every numeric non-id column of ``frame`` per ``key``.

        key: str

        Produces min / mean / median / max / std per column, drops the
        aggregates whose variance across keys is zero and returns the
        result with ``key`` as a regular column.
        """
        out = frame.drop_duplicates(key)[[key]].set_index(key)
        dtypes = frame.dtypes
        numeric = dtypes[dtypes != 'O'].index
        numeric = [c for c in numeric if '_id' not in c]
        grouped = frame.groupby(key)
        for c in numeric:
            for stat in ('min', 'mean', 'median', 'max', 'std'):
                out[c + '-' + stat] = getattr(grouped[c], stat)()

        variance = out.var()
        constant = variance[variance == 0].index
        out.drop(constant, axis=1, inplace=True)
        gc.collect()

        return out.reset_index()

    # (merge key, pickle path template) in merge order; the order determines
    # the column order of the final frame. Each inner tuple is followed by a
    # garbage-collection pass.
    aggregate_sources = (
        (
            ('order_id', '../feature/{}/f301_order-product.p'),
            ('order_id', '../feature/{}/f301_order-product_n5.p'),
            ('order_id', '../feature/{}/f302_order-product_all.p'),
            ('order_id', '../feature/{}/f303_order-product.p'),
            ('order_id', '../feature/{}/f304-1_order-product.p'),
            ('order_id', '../feature/{}/f304-2_order-product.p'),
            ('order_id', '../feature/{}/f304-3_order-product.p'),
            ('order_id', '../feature/{}/f305_order-product.p'),
        ),
        (
            ('user_id', '../feature/{}/f306_user-product.p'),
            ('user_id', '../feature/{}/f306_user-product_n5.p'),
            ('user_id', '../feature/{}/f307_user-product-timezone.p'),
            ('user_id', '../feature/{}/f308_user-product-timezone.p'),
            ('user_id', '../feature/{}/f308_user-product-dow.p'),
            ('user_id', '../feature/{}/f309_user-product.p'),
            ('user_id', '../feature/{}/f309_user-product_n5.p'),
            ('user_id', '../feature/{}/f310_user-product.p'),
            ('user_id', '../feature/{}/f312_user_product.p'),
            ('user_id', '../feature/{}/f312_user_product_n5.p'),
        ),
        (
            ('user_id', '../feature/{}/f313_user_aisle.p'),
            ('user_id', '../feature/{}/f313_user_dep.p'),
            ('user_id', '../feature/{}/f314_user-product.p'),
            ('order_id', '../feature/{}/f315-1_order-product.p'),
            ('order_id', '../feature/{}/f315-2_order-product.p'),
            ('order_id', '../feature/{}/f315-3_order-product.p'),
            ('order_id', '../feature/{}/f316_order_product.p'),
        ),
    )
    for batch in aggregate_sources:
        for key, template in batch:
            feature = compress(pd.read_pickle(template.format(name)), key)
            df = pd.merge(df, feature, on=key, how='left')
        gc.collect()

    #==============================================================================
    print('reduce memory')
    #==============================================================================
    # Second positional argument is the column count recorded after the
    # previous reduction; presumably only later columns are processed --
    # TODO confirm against utils.reduce_memory.
    utils.reduce_memory(df, ix_end)
    ix_end = df.shape[1]

    #==============================================================================
    print('daytime')
    #==============================================================================
    df = daytime_feature(df, name)
    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('feature engineering')
    #==============================================================================
    for categorical in ('timezone', 'order_dow', 'order_hour_of_day'):
        df = pd.get_dummies(df, columns=[categorical])

    df['days_near_order_cycle'] = (df.days_since_last_order_this_item -
                                   df.item_order_days_mean).abs()
    df['days_last_order-min'] = (df.days_since_last_order_this_item -
                                 df.useritem_order_days_min)
    df['days_last_order-max'] = (df.days_since_last_order_this_item -
                                 df.useritem_order_days_max)

    df['pos_cart_diff'] = (df.item_mean_pos_cart - df.useritem_mean_pos_cart)

    # Differences and ratios of the unique-product counts of the three most
    # recent prior orders.
    t1_len = df['t-1_product_unq_len']
    t2_len = df['t-2_product_unq_len']
    t3_len = df['t-3_product_unq_len']

    df['t-1_product_unq_len_diffByT-2'] = t1_len - t2_len
    df['t-1_product_unq_len_diffByT-3'] = t1_len - t3_len
    df['t-2_product_unq_len_diffByT-3'] = t2_len - t3_len

    df['t-1_product_unq_len_ratioByT-2'] = t1_len / t2_len
    df['t-1_product_unq_len_ratioByT-3'] = t1_len / t3_len
    df['t-2_product_unq_len_ratioByT-3'] = t2_len / t3_len
    del t1_len, t2_len, t3_len

    df['T'] = T

    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(df, ix_end)

    #==============================================================================
    print('output')
    #==============================================================================
    if dryrun:
        return df

    utils.to_pickles(df,
                     '../feature/{}/all'.format(name),
                     20,
                     inplace=True)
Пример #13
0
def concat_pred_features(T):
    """Assemble the feature table for the train or the test product set.

    Concatenates the mobile / female fashion / male fashion csv files of the
    selected split, runs the feature builders defined elsewhere in the
    module, one-hot encodes ``Category``, de-duplicates, log-transforms the
    large count columns and writes ``all_features.csv.gz``.

    Args:
        T: ``-1`` selects the test split; any other value selects the train
            split.
    """
    if T == -1:
        name = 'test'
        sources = (
            '../input/test_mobile.csv.gz',
            '../input/test_female_fashion.csv.gz',
            '../input/test_male_fashion.csv.gz',
        )
    else:
        name = 'train'
        print('load label')
        sources = (
            '../input/train_mobile.csv.gz',
            '../input/train_female_fashion.csv.gz',
            '../input/train_male_fashion.csv.gz',
        )

    # concat
    parts = [pd.read_csv(path, compression='gzip') for path in sources]
    df = pd.concat(parts, ignore_index=True)

    # Each stage prints its banner, extends the frame and reports the shape.
    stages = (
        ('word_given_product_name feature', word_given_product_name_feature),
        ('semantic_feature', semantic_feature),
        ('hot search feature', hot_search_count),
    )
    for banner, add_features in stages:
        print(banner)
        df = add_features(df, name)
        print('{}.shape: {}'.format(name, df.shape))

    print('feature engineering')
    df = pd.get_dummies(df, columns=['Category'])
    df.drop_duplicates(['Product Name', 'words'], inplace=True)
    print('{}.shape: {}'.format(name, df.shape))

    # Count columns whose maximum exceeds 100 are replaced by log(x + 1).
    # Iterate over a snapshot because columns are added and dropped below.
    for col in list(df.columns):
        if 'count' not in col or not df[col].max() > 100:
            continue
        df['log_{}'.format(col)] = np.log(df[col] + 1)  # smoothing
        df.drop(col, axis=1, inplace=True)

    # Memory reduction is only applied to the train split.
    if name == 'train':
        print('reduce memory')
        utils.reduce_memory(df)

    print('output')
    df.to_csv('../feature/{}/all_features.csv.gz'.format(name),
              index=False,
              compression='gzip')
Пример #14
0
def run_pipeline(use_pickled_features=False, debug=False):
    """Run the complete pipeline.

    Loads (or rebuilds) the feature table of every source, left-joins them
    onto the application table on ``SK_ID_CURR``, adds the cross-table
    features, reduces memory usage and trains the LightGBM model.

    Arguments:
        use_pickled_features: Use features saved as pickle
        files (boolean, default: False).
        debug: Run pipeline with a subset of data (boolean, default: False)
    """
    num_rows = 30000 if debug else None  # Subset of data for debugging

    def load(pickle_name, build):
        # Read the cached pickle, or build the features from the raw data.
        if use_pickled_features:
            return pd.read_pickle(
                os.path.join(config.PICKLED_DATA_DIRECTORY, pickle_name))
        return build()

    def attach(base, pickle_name, build):
        # Left-join one feature table onto ``base`` and release it afterwards.
        extra = load(pickle_name, build)
        merged = pd.merge(base, extra, on='SK_ID_CURR', how='left')
        del extra
        gc.collect()
        return merged

    # Preprocess and extract features from each csv file
    with utils.timer("Application data"):
        df = load(
            'application.pkl',
            lambda: application_pipeline.get_train_test(
                config.DATA_DIRECTORY, num_rows=num_rows))

    with utils.timer("Bureau data"):
        df = attach(
            df, 'bureau_and_balance.pkl',
            lambda: bureau_pipeline.get_bureau(
                config.DATA_DIRECTORY, num_rows=num_rows))

    with utils.timer("Previous application data"):
        df = attach(
            df, 'previous.pkl',
            lambda: previous_pipeline.get_previous_applications(
                config.DATA_DIRECTORY, num_rows))

    with utils.timer("Previous balance data"):
        df = attach(
            df, 'pos_cash.pkl',
            lambda: previous_balance_pipeline.get_pos_cash(
                config.DATA_DIRECTORY, num_rows))
        df = attach(
            df, 'payments.pkl',
            lambda: previous_balance_pipeline.get_installment_payments(
                config.DATA_DIRECTORY, num_rows))
        df = attach(
            df, 'credit_card.pkl',
            lambda: previous_balance_pipeline.get_credit_card(
                config.DATA_DIRECTORY, num_rows))

    # Add ratios and groupby between different tables
    with utils.timer('Add extra features'):
        df = other_features.add_ratio_features(df)
        df = other_features.add_groupby_features(df)

    # Reduce memory usage
    df = utils.reduce_memory(df)

    # List categorical features for LightGBM partitioning mechanism (Fisher 1958)
    lgbm_categorical_feat = [
        'CODE_GENDER', 'FLAG_OWN_CAR', 'NAME_CONTRACT_TYPE',
        'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE',
        'NAME_INCOME_TYPE', 'OCCUPATION_TYPE', 'ORGANIZATION_TYPE',
        'WEEKDAY_APPR_PROCESS_START'
    ]

    with utils.timer("Run LightGBM"):
        model.kfold_lightgbm_sklearn(df, lgbm_categorical_feat)
Пример #15
0
def _pickle_file(df, file_name):
    """Shrink ``df`` with ``utils.reduce_memory`` and pickle the result.

    The frame is written to ``config.PICKLED_DATA_DIRECTORY/file_name`` and a
    one-line confirmation with the saved frame's shape is printed.

    Args:
        df: Frame to persist; it is passed through ``utils.reduce_memory``
            first and the returned object is what gets saved.
        file_name: File name (not a full path) inside the pickle directory.
    """
    reduced = utils.reduce_memory(df)
    destination = os.path.join(config.PICKLED_DATA_DIRECTORY, file_name)
    reduced.to_pickle(destination)
    print("Saved as {} - frame shape: {}".format(file_name, reduced.shape))
Пример #16
0
def make(T):
    """
    Write per-user (``msno``) aggregates of transaction price features as CSV.

    Example: ``T = 0`` writes into ``../feature/trainW-0/``.

    ``T`` selects both the user list and the history cutoff (only rows of the
    module-level ``transactions_price_plan_days`` with ``transaction_date``
    strictly before the cutoff are used):

    * ``-1`` -> folder ``test``, users from ``sample_submission_v2.csv``,
      cutoff 2017-04-01
    * ``0``  -> folder ``trainW-0``, cutoff 2017-03-01
    * ``1``  -> folder ``trainW-1``, cutoff 2017-02-01
    * ``2``  -> folder ``trainW-2``, cutoff 2017-01-01

    Three passes are made over the merged frame: the full history (no file
    suffix), then the rows returned by ``near(group, 5)`` (suffix ``_n5``) and
    ``near(group, 1)`` (suffix ``_n1``).  ``near`` is defined elsewhere;
    presumably it keeps the n most recent transactions per user - confirm
    against its definition.

    Each pass writes ``discount``, ``amt_per_day`` and ``cp_value``
    (mean/min/max/median/std per user) and ``is_discount`` (sum and mean per
    user), one CSV per feature.

    Args:
        T: Window identifier, one of ``-1, 0, 1, 2``.

    Raises:
        ValueError: If ``T`` is not a supported window.  Previously such a
            value only surfaced later as a ``NameError`` on ``df``.
    """
    # History cutoff for each supported window.
    cutoff_by_window = {
        0: '2017-03-01',
        1: '2017-02-01',
        2: '2017-01-01',
        -1: '2017-04-01',
    }
    if T not in cutoff_by_window:
        raise ValueError('unsupported T={!r}; expected one of {}'.format(
            T, sorted(cutoff_by_window)))

    if T == -1:
        folder = 'test'
        # Here ``train`` actually holds the test-set users.
        train = pd.read_csv('../input/sample_submission_v2.csv')
    else:
        folder = 'trainW-' + str(T)
        train = pd.read_csv(
            '../input/preprocessed_data/trainW-{0}.csv'.format(T))[['msno']]

    cutoff = datetime.strptime(cutoff_by_window[T], '%Y-%m-%d')
    # The filtered history is passed inline on purpose: no named reference
    # keeps the intermediate frame alive after the merge (memory saving).
    df = pd.merge(
        train,
        transactions_price_plan_days[
            transactions_price_plan_days.transaction_date < cutoff],
        on=['msno'],
        how='left')

    del train
    gc.collect()

    def save(tbl, stem):
        """Flatten the index, shrink dtypes and write ``<stem>.csv``."""
        tbl.reset_index(inplace=True)
        print('reduce memory')
        # Return value intentionally ignored, as in the original code;
        # assumes reduce_memory works in place - TODO confirm.
        utils.reduce_memory(tbl)
        tbl.to_csv('../feature/{}/{}.csv'.format(folder, stem), index=False)

    def write_stats(frame, column, suffix):
        """Write mean/min/max/median/std of ``column`` per user."""
        grouped = frame.groupby('msno')[column]  # built once, reused below
        tbl = grouped.mean().to_frame()
        tbl.columns = ['{}-mean{}'.format(column, suffix)]
        for stat in ('min', 'max', 'median', 'std'):
            tbl['{}-{}{}'.format(column, stat, suffix)] = getattr(
                grouped, stat)()
        save(tbl, column + suffix)

    def write_counts(frame, column, suffix):
        """Write the per-user sum and mean of the flag ``column``."""
        grouped = frame.groupby('msno')[column]
        tbl = grouped.sum().to_frame()
        tbl.columns = ['{}_total_count{}'.format(column, suffix)]
        tbl['{}_total_count_ratio{}'.format(column, suffix)] = grouped.mean()
        save(tbl, column + suffix)

    def write_all(frame, suffix):
        """Write every feature file for one pass, in the original order."""
        for column in ('discount', 'amt_per_day', 'cp_value'):
            write_stats(frame, column, suffix)
        write_counts(frame, 'is_discount', suffix)

    ##################################################
    # All history
    ##################################################
    write_all(df, '')

    ##################################################
    # near 5, then only the one previous order
    ##################################################
    for n_recent in (5, 1):
        df_ = df.groupby('msno').apply(near, n_recent).reset_index(drop=True)
        write_all(df_, '_n{}'.format(n_recent))
        del df_
        gc.collect()
Пример #17
0
def make(T):
    """
    Write per-user (``msno``) aggregates of membership-loyalty flags as CSV.

    Example: ``T = 0`` writes into ``../feature/trainW-0/``.

    For ``T == -1`` the users come from ``sample_submission_v2.csv`` (with a
    ``w`` column set to ``T``) and the folder is ``test``; otherwise the users
    are the ``msno``/``w`` columns of ``trainW-<T>.csv`` and the folder is
    ``trainW-<T>``.  The users are left-joined on ``['msno', 'w']`` with the
    frame loaded by ``utils.read_multiple_csv`` from
    ``../feature/<folder>/days_since_the_last_transactions`` using the
    module-level ``input_col``.

    Three passes are made over the merged frame: the full history (no file
    suffix), then the rows returned by ``near(group, 5)`` (suffix ``_n5``) and
    ``near(group, 1)`` (suffix ``_n1``).  ``near`` is defined elsewhere;
    presumably it keeps the n most recent rows per user - confirm against its
    definition.

    Each pass writes one CSV per feature:

    * ``is_subscribe_early``, ``do_change_payment_method``, ``do_paid_more``:
      per-user sum (``*_count``) and mean (``*_ratio``)
    * ``do_spend_more_money``, ``do_extend_payment_days``: per-user
      mean/min/max/median/std

    Args:
        T: Window identifier; ``-1`` selects the test users.
    """
    if T == -1:
        folder = 'test'
        # Here ``train`` actually holds the test-set users.
        train = pd.read_csv('../input/sample_submission_v2.csv')
        train['w'] = T
    else:
        folder = 'trainW-' + str(T)
        # is_churn is not needed here, only the join keys.
        train = pd.read_csv(
            '../input/preprocessed_data/trainW-{0}.csv'.format(T))[[
                'msno', 'w'
            ]]

    membership_loyalty = utils.read_multiple_csv(
        '../feature/{}/days_since_the_last_transactions'.format(folder),
        input_col)

    print('reduce memory')
    # Return value intentionally ignored, as in the original code;
    # assumes reduce_memory works in place - TODO confirm.
    utils.reduce_memory(membership_loyalty)

    df = pd.merge(train, membership_loyalty, on=['msno', 'w'], how='left')

    def save(tbl, stem):
        """Flatten the index, shrink dtypes and write ``<stem>.csv``."""
        tbl.reset_index(inplace=True)
        print('reduce memory')
        utils.reduce_memory(tbl)
        tbl.to_csv('../feature/{}/{}.csv'.format(folder, stem), index=False)

    def write_counts(frame, column, suffix):
        """Write the per-user sum and mean of the flag ``column``."""
        grouped = frame.groupby('msno')[column]  # built once, reused below
        tbl = grouped.sum().to_frame()
        tbl.columns = ['{}_count{}'.format(column, suffix)]
        tbl['{}_ratio{}'.format(column, suffix)] = grouped.mean()
        save(tbl, column + suffix)

    def write_stats(frame, column, suffix):
        """Write mean/min/max/median/std of ``column`` per user."""
        grouped = frame.groupby('msno')[column]
        tbl = grouped.mean().to_frame()
        tbl.columns = ['{}-mean{}'.format(column, suffix)]
        for stat in ('min', 'max', 'median', 'std'):
            tbl['{}-{}{}'.format(column, stat, suffix)] = getattr(
                grouped, stat)()
        save(tbl, column + suffix)

    # Features in the order the files were originally written (core1..core5).
    features = (
        ('is_subscribe_early', write_counts),
        ('do_change_payment_method', write_counts),
        ('do_spend_more_money', write_stats),
        ('do_extend_payment_days', write_stats),
        ('do_paid_more', write_counts),
    )

    def write_all(frame, suffix):
        """Write every feature file for one pass."""
        for column, writer in features:
            writer(frame, column, suffix)

    ##################################################
    # All history
    ##################################################
    write_all(df, '')

    ##################################################
    # near 5, then only the one previous order
    ##################################################
    for n_recent in (5, 1):
        df_ = df.groupby('msno').apply(near, n_recent).reset_index(drop=True)
        write_all(df_, '_n{}'.format(n_recent))
        del df_
        gc.collect()
def make(T):
    """
	T = 0
	folder = 'trainW-0'
	"""

    if T == -1:
        folder = 'test'
        train = pd.read_csv(
            '../input/sample_submission_v2.csv')  # 此train代表的是test的user
        train['w'] = T
        membership_loyalty = utils.read_multiple_csv(
            '../feature/{}/days_since_the_last_transactions'.format(folder),
            input_col)

    else:
        folder = 'trainW-' + str(T)
        membership_loyalty = utils.read_multiple_csv(
            '../feature/{}/days_since_the_last_transactions'.format(folder),
            input_col)
        train = pd.read_csv(
            '../input/preprocessed_data/trainW-{0}.csv'.format(T))[[
                'msno', 'w'
            ]]  # we do not need is_churn
        #==============================================================================
        print('reduce memory')
        #==============================================================================
        utils.reduce_memory(membership_loyalty)
    ##################################################
    # All history
    ##################################################
    # merge
    df = pd.merge(train, membership_loyalty, on=['msno', 'w'], how='left')
    ########
    # core1
    ########
    tbl = df.groupby('msno').days_since_the_last_expiration.mean().to_frame()
    tbl.columns = ['days_since_the_last_expiration-mean']
    tbl['days_since_the_last_expiration-min'] = df.groupby(
        'msno').days_since_the_last_expiration.min()
    tbl['days_since_the_last_expiration-max'] = df.groupby(
        'msno').days_since_the_last_expiration.max()
    tbl['days_since_the_last_expiration-median'] = df.groupby(
        'msno').days_since_the_last_expiration.median()
    tbl['days_since_the_last_expiration-std'] = df.groupby(
        'msno').days_since_the_last_expiration.std()
    tbl.reset_index(inplace=True)
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(tbl)
    # write
    tbl.to_csv(
        '../feature/{}/days_since_the_last_expiration.csv'.format(folder),
        index=False)
    ########
    # core2
    ########
    tbl = df.groupby('msno').days_since_the_last_subscription.mean().to_frame()
    tbl.columns = ['days_since_the_last_subscription-mean']
    tbl['days_since_the_last_subscription-min'] = df.groupby(
        'msno').days_since_the_last_subscription.min()
    tbl['days_since_the_last_subscription-max'] = df.groupby(
        'msno').days_since_the_last_subscription.max()
    tbl['days_since_the_last_subscription-median'] = df.groupby(
        'msno').days_since_the_last_subscription.median()
    tbl['days_since_the_last_subscription-std'] = df.groupby(
        'msno').days_since_the_last_subscription.std()
    tbl.reset_index(inplace=True)
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(tbl)
    # write
    tbl.to_csv(
        '../feature/{}/days_since_the_last_subscription.csv'.format(folder),
        index=False)
    #########
    # core3
    #########
    tbl = df.groupby(
        'msno')['days_since_the_last_expiration-cumsum'].mean().to_frame()
    tbl.columns = ['days_since_the_last_expiration-cumsum-mean']
    tbl['days_since_the_last_expiration-cumsum-min'] = df.groupby(
        'msno')['days_since_the_last_expiration-cumsum'].min()
    tbl['days_since_the_last_expiration-cumsum-max'] = df.groupby(
        'msno')['days_since_the_last_expiration-cumsum'].max()
    tbl['days_since_the_last_expiration-cumsum-median'] = df.groupby(
        'msno')['days_since_the_last_expiration-cumsum'].median()
    tbl['days_since_the_last_expiration-cumsum-std'] = df.groupby(
        'msno')['days_since_the_last_expiration-cumsum'].std()
    tbl.reset_index(inplace=True)
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(tbl)
    # write
    tbl.to_csv(
        '../feature/{}/days_since_the_last_expiration-cumsum.csv'.format(
            folder),
        index=False)
    ########
    # core4
    ########
    tbl = df.groupby(
        'msno').days_since_the_last_expiration_ratio.mean().to_frame()
    tbl.columns = ['days_since_the_last_expiration_ratio-mean']
    tbl['days_since_the_last_expiration_ratio-min'] = df.groupby(
        'msno').days_since_the_last_expiration_ratio.min()
    tbl['days_since_the_last_expiration_ratio-max'] = df.groupby(
        'msno').days_since_the_last_expiration_ratio.max()
    tbl['days_since_the_last_expiration_ratio-median'] = df.groupby(
        'msno').days_since_the_last_expiration_ratio.median()
    tbl['days_since_the_last_expiration_ratio-std'] = df.groupby(
        'msno').days_since_the_last_expiration_ratio.std()
    tbl.reset_index(inplace=True)
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(tbl)
    # write
    tbl.to_csv('../feature/{}/days_since_the_last_expiration_ratio.csv'.format(
        folder),
               index=False)
    ########
    # core5
    ########
    tbl = df.groupby(
        'msno').days_since_the_last_subscription_ratio.mean().to_frame()
    tbl.columns = ['days_since_the_last_subscription_ratio-mean']
    tbl['days_since_the_last_subscription_ratio-min'] = df.groupby(
        'msno').days_since_the_last_subscription_ratio.min()
    tbl['days_since_the_last_subscription_ratio-max'] = df.groupby(
        'msno').days_since_the_last_subscription_ratio.max()
    tbl['days_since_the_last_subscription_ratio-median'] = df.groupby(
        'msno').days_since_the_last_subscription_ratio.median()
    tbl['days_since_the_last_subscription_ratio-std'] = df.groupby(
        'msno').days_since_the_last_subscription_ratio.std()
    tbl.reset_index(inplace=True)
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(tbl)
    # write
    tbl.to_csv(
        '../feature/{}/days_since_the_last_subscription_ratio.csv'.format(
            folder),
        index=False)
    ########
    # core6
    ########
    tbl = df.groupby(
        'msno').days_since_the_first_subscription.mean().to_frame()
    tbl.columns = ['days_since_the_first_subscription-mean']
    tbl['days_since_the_first_subscription-min'] = df.groupby(
        'msno').days_since_the_first_subscription.min()
    tbl['days_since_the_first_subscription-max'] = df.groupby(
        'msno').days_since_the_first_subscription.max()
    tbl['days_since_the_first_subscription-median'] = df.groupby(
        'msno').days_since_the_first_subscription.median()
    tbl['days_since_the_first_subscription-std'] = df.groupby(
        'msno').days_since_the_first_subscription.std()
    tbl.reset_index(inplace=True)
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(tbl)
    # write
    tbl.to_csv(
        '../feature/{}/days_since_the_first_subscription.csv'.format(folder),
        index=False)

    ##################################################
    # near 5
    ##################################################
    df_ = df.groupby('msno').apply(near, 5).reset_index(drop=True)
    ########
    # core1
    ########
    tbl = df_.groupby('msno').days_since_the_last_expiration.mean().to_frame()
    tbl.columns = ['days_since_the_last_expiration-mean_n5']
    tbl['days_since_the_last_expiration-min_n5'] = df_.groupby(
        'msno').days_since_the_last_expiration.min()
    tbl['days_since_the_last_expiration-max_n5'] = df_.groupby(
        'msno').days_since_the_last_expiration.max()
    tbl['days_since_the_last_expiration-median_n5'] = df_.groupby(
        'msno').days_since_the_last_expiration.median()
    tbl['days_since_the_last_expiration-std_n5'] = df_.groupby(
        'msno').days_since_the_last_expiration.std()
    tbl.reset_index(inplace=True)
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(tbl)

    # write
    tbl.to_csv(
        '../feature/{}/days_since_the_last_expiration_n5.csv'.format(folder),
        index=False)
    ########
    # core2
    ########
    tbl = df_.groupby(
        'msno').days_since_the_last_subscription.mean().to_frame()
    tbl.columns = ['days_since_the_last_subscription-mean_n5']
    tbl['days_since_the_last_subscription-min_n5'] = df_.groupby(
        'msno').days_since_the_last_subscription.min()
    tbl['days_since_the_last_subscription-max_n5'] = df_.groupby(
        'msno').days_since_the_last_subscription.max()
    tbl['days_since_the_last_subscription-median_n5'] = df_.groupby(
        'msno').days_since_the_last_subscription.median()
    tbl['days_since_the_last_subscription-std_n5'] = df_.groupby(
        'msno').days_since_the_last_subscription.std()
    tbl.reset_index(inplace=True)
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(tbl)
    # write
    tbl.to_csv(
        '../feature/{}/days_since_the_last_subscription_n5.csv'.format(folder),
        index=False)
    #########
    # core3
    #########
    tbl = df_.groupby(
        'msno')['days_since_the_last_expiration-cumsum'].mean().to_frame()
    tbl.columns = ['days_since_the_last_expiration-cumsum-mean_n5']
    tbl['days_since_the_last_expiration-cumsum-min_n5'] = df_.groupby(
        'msno')['days_since_the_last_expiration-cumsum'].min()
    tbl['days_since_the_last_expiration-cumsum-max_n5'] = df_.groupby(
        'msno')['days_since_the_last_expiration-cumsum'].max()
    tbl['days_since_the_last_expiration-cumsum-median_n5'] = df_.groupby(
        'msno')['days_since_the_last_expiration-cumsum'].median()
    tbl['days_since_the_last_expiration-cumsum-std_n5'] = df_.groupby(
        'msno')['days_since_the_last_expiration-cumsum'].std()
    tbl.reset_index(inplace=True)
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(tbl)
    # write
    tbl.to_csv(
        '../feature/{}/days_since_the_last_expiration-cumsum_n5.csv'.format(
            folder),
        index=False)
    ########
    # core4
    ########
    tbl = df_.groupby(
        'msno').days_since_the_last_expiration_ratio.mean().to_frame()
    tbl.columns = ['days_since_the_last_expiration_ratio-mean_n5']
    tbl['days_since_the_last_expiration_ratio-min_n5'] = df_.groupby(
        'msno').days_since_the_last_expiration_ratio.min()
    tbl['days_since_the_last_expiration_ratio-max_n5'] = df_.groupby(
        'msno').days_since_the_last_expiration_ratio.max()
    tbl['days_since_the_last_expiration_ratio-median_n5'] = df_.groupby(
        'msno').days_since_the_last_expiration_ratio.median()
    tbl['days_since_the_last_expiration_ratio-std_n5'] = df_.groupby(
        'msno').days_since_the_last_expiration_ratio.std()
    tbl.reset_index(inplace=True)
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(tbl)
    # write
    tbl.to_csv(
        '../feature/{}/days_since_the_last_expiration_ratio_n5.csv'.format(
            folder),
        index=False)
    ########
    # core5
    ########
    tbl = df_.groupby(
        'msno').days_since_the_last_subscription_ratio.mean().to_frame()
    tbl.columns = ['days_since_the_last_subscription_ratio-mean_n5']
    tbl['days_since_the_last_subscription_ratio-min_n5'] = df_.groupby(
        'msno').days_since_the_last_subscription_ratio.min()
    tbl['days_since_the_last_subscription_ratio-max_n5'] = df_.groupby(
        'msno').days_since_the_last_subscription_ratio.max()
    tbl['days_since_the_last_subscription_ratio-median_n5'] = df_.groupby(
        'msno').days_since_the_last_subscription_ratio.median()
    tbl['days_since_the_last_subscription_ratio-std_n5'] = df_.groupby(
        'msno').days_since_the_last_subscription_ratio.std()
    tbl.reset_index(inplace=True)
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(tbl)
    # write
    tbl.to_csv(
        '../feature/{}/days_since_the_last_subscription_ratio_n5.csv'.format(
            folder),
        index=False)
    ########
    # core6
    ########
    tbl = df_.groupby(
        'msno').days_since_the_first_subscription.mean().to_frame()
    tbl.columns = ['days_since_the_first_subscription-mean_n5']
    tbl['days_since_the_first_subscription-min_n5'] = df_.groupby(
        'msno').days_since_the_first_subscription.min()
    tbl['days_since_the_first_subscription-max_n5'] = df_.groupby(
        'msno').days_since_the_first_subscription.max()
    tbl['days_since_the_first_subscription-median_n5'] = df_.groupby(
        'msno').days_since_the_first_subscription.median()
    tbl['days_since_the_first_subscription-std_n5'] = df_.groupby(
        'msno').days_since_the_first_subscription.std()
    tbl.reset_index(inplace=True)
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(tbl)
    # write
    tbl.to_csv('../feature/{}/days_since_the_first_subscription_n5.csv'.format(
        folder),
               index=False)

    del df_
    ##################################################
    # only one previous order
    ##################################################
    df_ = df.groupby('msno').apply(near, 1).reset_index(drop=True)
    ########
    # core1
    ########
    tbl = df_.groupby('msno').days_since_the_last_expiration.mean().to_frame()
    tbl.columns = ['days_since_the_last_expiration-mean_n1']
    tbl['days_since_the_last_expiration-min_n1'] = df_.groupby(
        'msno').days_since_the_last_expiration.min()
    tbl['days_since_the_last_expiration-max_n1'] = df_.groupby(
        'msno').days_since_the_last_expiration.max()
    tbl['days_since_the_last_expiration-median_n1'] = df_.groupby(
        'msno').days_since_the_last_expiration.median()
    tbl['days_since_the_last_expiration-std_n1'] = df_.groupby(
        'msno').days_since_the_last_expiration.std()
    tbl.reset_index(inplace=True)
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(tbl)
    # write
    tbl.to_csv(
        '../feature/{}/days_since_the_last_expiration_n1.csv'.format(folder),
        index=False)
    ########
    # core2
    ########
    tbl = df_.groupby(
        'msno').days_since_the_last_subscription.mean().to_frame()
    tbl.columns = ['days_since_the_last_subscription-mean_n1']
    tbl['days_since_the_last_subscription-min_n1'] = df_.groupby(
        'msno').days_since_the_last_subscription.min()
    tbl['days_since_the_last_subscription-max_n1'] = df_.groupby(
        'msno').days_since_the_last_subscription.max()
    tbl['days_since_the_last_subscription-median_n1'] = df_.groupby(
        'msno').days_since_the_last_subscription.median()
    tbl['days_since_the_last_subscription-std_n1'] = df_.groupby(
        'msno').days_since_the_last_subscription.std()
    tbl.reset_index(inplace=True)
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(tbl)
    # write
    tbl.to_csv(
        '../feature/{}/days_since_the_last_subscription_n1.csv'.format(folder),
        index=False)
    #########
    # core3
    #########
    tbl = df_.groupby(
        'msno')['days_since_the_last_expiration-cumsum'].mean().to_frame()
    tbl.columns = ['days_since_the_last_expiration-cumsum-mean_n1']
    tbl['days_since_the_last_expiration-cumsum-min_n1'] = df_.groupby(
        'msno')['days_since_the_last_expiration-cumsum'].min()
    tbl['days_since_the_last_expiration-cumsum-max_n1'] = df_.groupby(
        'msno')['days_since_the_last_expiration-cumsum'].max()
    tbl['days_since_the_last_expiration-cumsum-median_n1'] = df_.groupby(
        'msno')['days_since_the_last_expiration-cumsum'].median()
    tbl['days_since_the_last_expiration-cumsum-std_n1'] = df_.groupby(
        'msno')['days_since_the_last_expiration-cumsum'].std()
    tbl.reset_index(inplace=True)
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(tbl)
    # write
    tbl.to_csv(
        '../feature/{}/days_since_the_last_expiration-cumsum_n1.csv'.format(
            folder),
        index=False)
    ########
    # core4
    ########
    tbl = df_.groupby(
        'msno').days_since_the_last_expiration_ratio.mean().to_frame()
    tbl.columns = ['days_since_the_last_expiration_ratio-mean_n1']
    tbl['days_since_the_last_expiration_ratio-min_n1'] = df_.groupby(
        'msno').days_since_the_last_expiration_ratio.min()
    tbl['days_since_the_last_expiration_ratio-max_n1'] = df_.groupby(
        'msno').days_since_the_last_expiration_ratio.max()
    tbl['days_since_the_last_expiration_ratio-median_n1'] = df_.groupby(
        'msno').days_since_the_last_expiration_ratio.median()
    tbl['days_since_the_last_expiration_ratio-std_n1'] = df_.groupby(
        'msno').days_since_the_last_expiration_ratio.std()
    tbl.reset_index(inplace=True)
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(tbl)
    # write
    tbl.to_csv(
        '../feature/{}/days_since_the_last_expiration_ratio_n1.csv'.format(
            folder),
        index=False)
    ########
    # core5
    ########
    tbl = df_.groupby(
        'msno').days_since_the_last_subscription_ratio.mean().to_frame()
    tbl.columns = ['days_since_the_last_subscription_ratio-mean_n1']
    tbl['days_since_the_last_subscription_ratio-min_n1'] = df_.groupby(
        'msno').days_since_the_last_subscription_ratio.min()
    tbl['days_since_the_last_subscription_ratio-max_n1'] = df_.groupby(
        'msno').days_since_the_last_subscription_ratio.max()
    tbl['days_since_the_last_subscription_ratio-median_n1'] = df_.groupby(
        'msno').days_since_the_last_subscription_ratio.median()
    tbl['days_since_the_last_subscription_ratio-std_n1'] = df_.groupby(
        'msno').days_since_the_last_subscription_ratio.std()
    tbl.reset_index(inplace=True)
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(tbl)
    # write
    tbl.to_csv(
        '../feature/{}/days_since_the_last_subscription_ratio_n1.csv'.format(
            folder),
        index=False)
    ########
    # core6
    ########
    tbl = df_.groupby(
        'msno').days_since_the_first_subscription.mean().to_frame()
    tbl.columns = ['days_since_the_first_subscription-mean_n1']
    tbl['days_since_the_first_subscription-min_n1'] = df_.groupby(
        'msno').days_since_the_first_subscription.min()
    tbl['days_since_the_first_subscription-max_n1'] = df_.groupby(
        'msno').days_since_the_first_subscription.max()
    tbl['days_since_the_first_subscription-median_n1'] = df_.groupby(
        'msno').days_since_the_first_subscription.median()
    tbl['days_since_the_first_subscription-std_n1'] = df_.groupby(
        'msno').days_since_the_first_subscription.std()
    tbl.reset_index(inplace=True)
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(tbl)
    # write
    tbl.to_csv('../feature/{}/days_since_the_first_subscription_n1.csv'.format(
        folder),
               index=False)

    del df_
Пример #19
0
# Only these columns are requested from the preprocessed transaction files.
input_col = ['msno', 'transaction_date', 'is_auto_renew']
transactions = utils.read_multiple_csv(
    '../input/preprocessed_data/transactions',
    input_col,
)  # 20,000,000 (presumably the approximate row count -- original note)

# Debugging aid: uncomment to work on a small subset.
#transactions = transactions.head(n = 5000)

##################################################
# Convert string to datetime format
##################################################
# Each value is parsed with the '%Y-%m-%d' pattern.
transactions['transaction_date'] = transactions['transaction_date'].apply(
    lambda raw: datetime.strptime(raw, '%Y-%m-%d'))

#==============================================================================
print('reduce memory')
#==============================================================================
# NOTE(review): reduce_memory is a project helper; presumably it changes
# dtypes in place, since its return value is ignored here -- confirm.
utils.reduce_memory(transactions)


def near(x, keep=5):
    """Return the last ``keep`` entries of ``x`` (whatever ``x.tail`` yields).

    ``x`` only needs to expose a ``tail`` method; ``keep`` defaults to 5.
    """
    most_recent = x.tail(keep)
    return most_recent


def make_order_number(x):
    """Add a 1-based ``order_number`` column to ``x`` and return ``x``.

    The numbering follows the current row order: the first row gets 1 and
    the last row gets ``x.shape[0]``. ``x`` is modified in place.
    """
    row_count = x.shape[0]
    x['order_number'] = list(range(1, row_count + 1))
    return x


#==============================================================================
# def
#==============================================================================
def make(T):
Пример #20
0
def concat_pred_item(T, dryrun=False):
    """Build the merged feature table for one split and save or return it.

    Parameters
    ----------
    T : int
        ``-1`` selects the ``'test'`` split; any other value selects the
        ``'trainT-<T>'`` split. The value is also stored in the ``T`` column.
    dryrun : bool, default False
        When truthy, continue with a random sample of at most 9999 rows and
        return the resulting frame instead of writing it to disk.

    Returns
    -------
    pandas.DataFrame or None
        The feature frame when ``dryrun`` is truthy; otherwise ``None`` after
        the frame has been passed to ``utils.to_pickles``.
    """
    if T == -1:
        name = 'test'
    else:
        name = 'trainT-' + str(T)
    #==============================================================================
    print('load label')
    #==============================================================================
    # NOTE: order_id is label
    print('load t3')
    X_base = pd.read_pickle('../feature/X_base_t3.p')

    label = pd.read_pickle('../feature/{}/label_reordered.p'.format(name))

    # 'inner' for removing t-n_order_id == NaN
    # `name` is either 'test' or 'trainT-<T>', so this flag picks the
    # matching rows of X_base.
    is_train = 1 if 'train' in name else 0
    df = pd.merge(X_base[X_base.is_train == is_train], label,
                  on='order_id', how='inner')

    if dryrun:
        print('dryrun')
        # Clamp the sample size: DataFrame.sample raises when asked for more
        # rows than exist (sampling is without replacement by default).
        df = df.sample(min(9999, len(df)))

    goods = pd.read_pickle('../input/mk/goods.p')
    df = pd.merge(df, goods[['product_id', 'aisle_id', 'department_id']],
                  on='product_id', how='left')

    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('user feature')
    #==============================================================================

    df = user_feature(df, name)

    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('item feature')
    #==============================================================================

    df = item_feature(df, name)

    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(df)
    # Remember how many columns exist now; the value is handed to the next
    # reduce_memory call. NOTE(review): presumably so that only the columns
    # added afterwards are processed -- confirm in utils.
    ix_end = df.shape[1]

    #==============================================================================
    print('user x item')
    #==============================================================================

    df = user_item_feature(df, name)

    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('user x item')
    #==============================================================================
    def compress(df, key):
        """Collapse ``df`` to one row per ``key`` with summary statistics.

        key: str

        For every non-object column whose name does not contain ``'_id'``,
        the min / mean / median / max / std per ``key`` group is computed.
        Aggregated columns whose variance across groups is exactly zero are
        dropped. Returns a frame with ``key`` as a regular column.
        """
        df_ = df.drop_duplicates(key)[[key]].set_index(key)
        dtypes = df.dtypes
        col = dtypes[dtypes != 'O'].index
        col = [c for c in col if '_id' not in c]
        gr = df.groupby(key)
        for c in col:
            # Same statistics and column order as before: min, mean, median,
            # max, std.
            for stat in ('min', 'mean', 'median', 'max', 'std'):
                df_[c + '-' + stat] = getattr(gr[c], stat)()

        var = df_.var()
        col = var[var == 0].index
        df_.drop(col, axis=1, inplace=True)
        gc.collect()

        return df_.reset_index()

    # (merge key, pickle base name) pairs, in the order they must be merged
    # so that the resulting column order stays the same.
    compressed_sources = [
        ('order_id', 'f301_order-product'),
        ('order_id', 'f301_order-product_n5'),
        ('order_id', 'f302_order-product_all'),
        ('order_id', 'f303_order-product'),
        ('order_id', 'f304-1_order-product'),
        ('order_id', 'f304-2_order-product'),
        ('order_id', 'f304-3_order-product'),
        ('order_id', 'f305_order-product'),
        ('user_id', 'f306_user-product'),
        ('user_id', 'f306_user-product_n5'),
        ('user_id', 'f307_user-product-timezone'),
        ('user_id', 'f308_user-product-timezone'),
        ('user_id', 'f308_user-product-dow'),
        ('user_id', 'f309_user-product'),
        ('user_id', 'f309_user-product_n5'),
        ('user_id', 'f310_user-product'),
        ('user_id', 'f312_user_product'),
        ('user_id', 'f312_user_product_n5'),
        ('user_id', 'f313_user_aisle'),
        ('user_id', 'f313_user_dep'),
        ('user_id', 'f314_user-product'),
        ('order_id', 'f315-1_order-product'),
        ('order_id', 'f315-2_order-product'),
        ('order_id', 'f315-3_order-product'),
        ('order_id', 'f316_order_product'),
    ]
    for key, fname in compressed_sources:
        feature = compress(
            pd.read_pickle('../feature/{}/{}.p'.format(name, fname)), key)
        df = pd.merge(df, feature, on=key, how='left')

    gc.collect()

    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(df, ix_end)
    ix_end = df.shape[1]

    #==============================================================================
    print('daytime')
    #==============================================================================

    df = daytime_feature(df, name)

    print('{}.shape:{}\n'.format(name, df.shape))

#    #==============================================================================
#    print('aisle')
#    #==============================================================================
#    order_aisdep = pd.read_pickle('../input/mk/order_aisle-department.p')
#    col = [c for c in order_aisdep.columns if 'department_' in c]
#    order_aisdep.drop(col, axis=1, inplace=1)
#    
#    df = pd.merge(df, order_aisdep.add_prefix('t-1_'), on='t-1_order_id', how='left')
#    df = pd.merge(df, order_aisdep.add_prefix('t-2_'), on='t-2_order_id', how='left')
#    
#    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('feature engineering')
    #==============================================================================
    # One-hot encode one column at a time, in the original order, so the
    # dummy columns are appended in the same sequence.
    for categorical in ('timezone', 'order_dow', 'order_hour_of_day'):
        df = pd.get_dummies(df, columns=[categorical])

    df['days_near_order_cycle'] = (df.days_since_last_order_this_item - df.item_order_days_mean).abs()
    df['days_last_order-min'] = df.days_since_last_order_this_item - df.useritem_order_days_min
    df['days_last_order-max'] = df.days_since_last_order_this_item - df.useritem_order_days_max

    df['pos_cart_diff'] = (df.item_mean_pos_cart - df.useritem_mean_pos_cart)

    # Differences of 't-N_product_unq_len' between the t-1 / t-2 / t-3 columns.
    df['t-1_product_unq_len_diffByT-2'] = df['t-1_product_unq_len'] - df['t-2_product_unq_len']
    df['t-1_product_unq_len_diffByT-3'] = df['t-1_product_unq_len'] - df['t-3_product_unq_len']
    df['t-2_product_unq_len_diffByT-3'] = df['t-2_product_unq_len'] - df['t-3_product_unq_len']

    # Ratios of the same column pairs.
    df['t-1_product_unq_len_ratioByT-2'] = df['t-1_product_unq_len'] / df['t-2_product_unq_len']
    df['t-1_product_unq_len_ratioByT-3'] = df['t-1_product_unq_len'] / df['t-3_product_unq_len']
    df['t-2_product_unq_len_ratioByT-3'] = df['t-2_product_unq_len'] / df['t-3_product_unq_len']

    df['T'] = T

    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(df, ix_end)

    #==============================================================================
    print('output')
    #==============================================================================
    # Use plain truthiness, matching the sampling check above, so a sampled
    # frame is never written to disk by a truthy non-True flag.
    if dryrun:
        return df
    utils.to_pickles(df, '../feature/{}/all'.format(name), 20, inplace=True)
Пример #21
0
def concat_pred_features(T):
    if T == -1:
        name = 'test'
        train = pd.read_csv(
            '../input/sample_submission_v2.csv')  # 此train代表的是test的user
    else:
        #==============================================================================
        print('load label')
        #==============================================================================
        name = 'trainW-' + str(T)
        train = pd.read_csv(
            '../input/preprocessed_data/trainW-{0}.csv'.format(T))[[
                'msno', 'is_churn'
            ]]

    #train = train.head( n = 500)
    #==============================================================================
    print('transactions feature')
    #==============================================================================
    df = transactions_feature(train, name)

    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(df)
    ix_end = df.shape[1]

    #==============================================================================
    print('members feature')
    #==============================================================================
    df = members_feature(df, name)
    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('user_logs feature')
    #==============================================================================
    df = user_logs_feature(df, name)
    df.replace(
        np.inf, 0,
        inplace=True)  # It may destroy feature but forget it. just noise
    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(df, ix_end)
    ix_end = df.shape[1]

    #==============================================================================
    print('feature engineering')
    #==============================================================================
    # delta: how much this value changes over time (future - history)
    # delta1: time difference = 1
    # delta2: time difference = 2
    # bynum: start from the num
    ####
    #num_100_ratio-mean
    ####
    #delta1
    df['num_100_ratio_delta1_by7'] = df['num_100_ratio_during_t_7-mean'] - df[
        'num_100_ratio_during_t_14-mean']
    df['num_100_ratio_delta1_by14'] = df[
        'num_100_ratio_during_t_14-mean'] - df['num_100_ratio_during_t_30-mean']
    df['num_100_ratio_delta1_by30'] = df[
        'num_100_ratio_during_t_30-mean'] - df['num_100_ratio_during_t_60-mean']
    df['num_100_ratio_delta1_by60'] = df[
        'num_100_ratio_during_t_60-mean'] - df['num_100_ratio_during_t_90-mean']
    df['num_100_ratio_delta1_by90'] = df[
        'num_100_ratio_during_t_90-mean'] - df['num_100_ratio-mean']
    #delta2
    df['num_100_ratio_delta2_by7'] = df['num_100_ratio_during_t_7-mean'] - df[
        'num_100_ratio_during_t_30-mean']
    df['num_100_ratio_delta2_by14'] = df[
        'num_100_ratio_during_t_14-mean'] - df['num_100_ratio_during_t_60-mean']
    df['num_100_ratio_delta2_by30'] = df[
        'num_100_ratio_during_t_30-mean'] - df['num_100_ratio_during_t_90-mean']
    df['num_100_ratio_delta2_by60'] = df[
        'num_100_ratio_during_t_60-mean'] - df['num_100_ratio-mean']
    ####
    #num_25_ratio-mean
    ####
    #delta1
    df['num_25_ratio_delta1_by7'] = df['num_25_ratio_during_t_7-mean'] - df[
        'num_25_ratio_during_t_14-mean']
    df['num_25_ratio_delta1_by14'] = df['num_25_ratio_during_t_14-mean'] - df[
        'num_25_ratio_during_t_30-mean']
    df['num_25_ratio_delta1_by30'] = df['num_25_ratio_during_t_30-mean'] - df[
        'num_25_ratio_during_t_60-mean']
    df['num_25_ratio_delta1_by60'] = df['num_25_ratio_during_t_60-mean'] - df[
        'num_25_ratio_during_t_90-mean']
    df['num_25_ratio_delta1_by90'] = df['num_25_ratio_during_t_90-mean'] - df[
        'num_25_ratio-mean']
    #delta2
    df['num_25_ratio_delta2_by7'] = df['num_25_ratio_during_t_7-mean'] - df[
        'num_25_ratio_during_t_30-mean']
    df['num_25_ratio_delta2_by14'] = df['num_25_ratio_during_t_14-mean'] - df[
        'num_25_ratio_during_t_60-mean']
    df['num_25_ratio_delta2_by30'] = df['num_25_ratio_during_t_30-mean'] - df[
        'num_25_ratio_during_t_90-mean']
    df['num_25_ratio_delta2_by60'] = df['num_25_ratio_during_t_60-mean'] - df[
        'num_25_ratio-mean']
    ####
    #num_repeated_songs-mean
    ####
    #delta1
    df['num_repeated_songs_delta1_by7'] = df[
        'num_repeated_songs_during_t_7-mean'] - df[
            'num_repeated_songs_during_t_14-mean']
    df['num_repeated_songs_delta1_by14'] = df[
        'num_repeated_songs_during_t_14-mean'] - df[
            'num_repeated_songs_during_t_30-mean']
    df['num_repeated_songs_delta1_by30'] = df[
        'num_repeated_songs_during_t_30-mean'] - df[
            'num_repeated_songs_during_t_60-mean']
    df['num_repeated_songs_delta1_by60'] = df[
        'num_repeated_songs_during_t_60-mean'] - df[
            'num_repeated_songs_during_t_90-mean']
    df['num_repeated_songs_delta1_by90'] = df[
        'num_repeated_songs_during_t_90-mean'] - df['num_repeated_songs-mean']
    #delta2
    df['num_repeated_songs_delta2_by7'] = df[
        'num_repeated_songs_during_t_7-mean'] - df[
            'num_repeated_songs_during_t_30-mean']
    df['num_repeated_songs_delta2_by14'] = df[
        'num_repeated_songs_during_t_14-mean'] - df[
            'num_repeated_songs_during_t_60-mean']
    df['num_repeated_songs_delta2_by30'] = df[
        'num_repeated_songs_during_t_30-mean'] - df[
            'num_repeated_songs_during_t_90-mean']
    df['num_repeated_songs_delta2_by60'] = df[
        'num_repeated_songs_during_t_60-mean'] - df['num_repeated_songs-mean']
    ####
    #completed_songs_ratio
    ####
    #delta1
    df['completed_songs_ratio_delta1_by7'] = df[
        'completed_songs_ratio_during_t_7-mean'] - df[
            'completed_songs_ratio_during_t_14-mean']
    df['completed_songs_ratio_delta1_by14'] = df[
        'completed_songs_ratio_during_t_14-mean'] - df[
            'completed_songs_ratio_during_t_30-mean']
    df['completed_songs_ratio_delta1_by30'] = df[
        'completed_songs_ratio_during_t_30-mean'] - df[
            'completed_songs_ratio_during_t_60-mean']
    df['completed_songs_ratio_delta1_by60'] = df[
        'completed_songs_ratio_during_t_60-mean'] - df[
            'completed_songs_ratio_during_t_90-mean']
    df['completed_songs_ratio_delta1_by90'] = df[
        'completed_songs_ratio_during_t_90-mean'] - df[
            'completed_songs_ratio-mean']
    #delta2
    df['completed_songs_ratio_delta2_by7'] = df[
        'completed_songs_ratio_during_t_7-mean'] - df[
            'completed_songs_ratio_during_t_30-mean']
    df['completed_songs_ratio_delta2_by14'] = df[
        'completed_songs_ratio_during_t_14-mean'] - df[
            'completed_songs_ratio_during_t_60-mean']
    df['completed_songs_ratio_delta2_by30'] = df[
        'completed_songs_ratio_during_t_30-mean'] - df[
            'completed_songs_ratio_during_t_90-mean']
    df['completed_songs_ratio_delta2_by60'] = df[
        'completed_songs_ratio_during_t_60-mean'] - df[
            'completed_songs_ratio-mean']
    ####
    #listen_music_in_a_row_count
    ####
    #delta1
    df['listen_music_in_a_row_count_delta1_by7'] = df[
        'listen_music_in_a_row_count_during_t_7'] - df[
            'listen_music_in_a_row_count_during_t_14']
    df['listen_music_in_a_row_count_delta1_by14'] = df[
        'listen_music_in_a_row_count_during_t_14'] - df[
            'listen_music_in_a_row_count_during_t_30']
    df['listen_music_in_a_row_count_delta1_by30'] = df[
        'listen_music_in_a_row_count_during_t_30'] - df[
            'listen_music_in_a_row_count_during_t_60']
    df['listen_music_in_a_row_count_delta1_by60'] = df[
        'listen_music_in_a_row_count_during_t_60'] - df[
            'listen_music_in_a_row_count_during_t_90']
    #delta2
    df['listen_music_in_a_row_count_delta2_by7'] = df[
        'listen_music_in_a_row_count_during_t_7'] - df[
            'listen_music_in_a_row_count_during_t_30']
    df['listen_music_in_a_row_count_delta2_by14'] = df[
        'listen_music_in_a_row_count_during_t_14'] - df[
            'listen_music_in_a_row_count_during_t_60']
    df['listen_music_in_a_row_count_delta2_by30'] = df[
        'listen_music_in_a_row_count_during_t_30'] - df[
            'listen_music_in_a_row_count_during_t_90']

    ####
    #listen_music_in_a_row_ratio
    ####
    #delta1
    df['listen_music_in_a_row_ratio_delta1_by7'] = df[
        'listen_music_in_a_row_ratio_during_t_7'] - df[
            'listen_music_in_a_row_ratio_during_t_14']
    df['listen_music_in_a_row_ratio_delta1_by14'] = df[
        'listen_music_in_a_row_ratio_during_t_14'] - df[
            'listen_music_in_a_row_ratio_during_t_30']
    df['listen_music_in_a_row_ratio_delta1_by30'] = df[
        'listen_music_in_a_row_ratio_during_t_30'] - df[
            'listen_music_in_a_row_ratio_during_t_60']
    df['listen_music_in_a_row_ratio_delta1_by60'] = df[
        'listen_music_in_a_row_ratio_during_t_60'] - df[
            'listen_music_in_a_row_ratio_during_t_90']
    #delta2
    df['listen_music_in_a_row_ratio_delta2_by7'] = df[
        'listen_music_in_a_row_ratio_during_t_7'] - df[
            'listen_music_in_a_row_ratio_during_t_30']
    df['listen_music_in_a_row_ratio_delta2_by14'] = df[
        'listen_music_in_a_row_ratio_during_t_14'] - df[
            'listen_music_in_a_row_ratio_during_t_60']
    df['listen_music_in_a_row_ratio_delta2_by30'] = df[
        'listen_music_in_a_row_ratio_during_t_30'] - df[
            'listen_music_in_a_row_ratio_during_t_90']
    ####
    #date_diff-mean
    ####
    #delta1
    df['date_diff_delta1_by7'] = df['date_diff_during_t_7-mean'] - df[
        'date_diff_during_t_14-mean']
    df['date_diff_delta1_by14'] = df['date_diff_during_t_14-mean'] - df[
        'date_diff_during_t_30-mean']
    df['date_diff_delta1_by30'] = df['date_diff_during_t_30-mean'] - df[
        'date_diff_during_t_60-mean']
    df['date_diff_delta1_by60'] = df['date_diff_during_t_60-mean'] - df[
        'date_diff_during_t_90-mean']
    df['date_diff_delta1_by90'] = df['date_diff_during_t_90-mean'] - df[
        'date_diff-mean']
    #delta2
    df['date_diff_delta2_by7'] = df['date_diff_during_t_7-mean'] - df[
        'completed_songs_ratio_during_t_30-mean']
    df['date_diff_delta2_by14'] = df['date_diff_during_t_14-mean'] - df[
        'date_diff_during_t_60-mean']
    df['date_diff_delta2_by30'] = df['date_diff_during_t_30-mean'] - df[
        'date_diff_during_t_90-mean']
    df['date_diff_delta2_by60'] = df['date_diff_during_t_60-mean'] - df[
        'date_diff-mean']
    ####
    #num_log_in
    ####
    #delta1
    df['num_log_in_delta1_by7'] = df['num_log_in_during_t_7'] - df[
        'num_log_in_during_t_14']
    df['num_log_in_delta1_by14'] = df['num_log_in_during_t_14'] - df[
        'num_log_in_during_t_30']
    df['num_log_in_delta1_by30'] = df['num_log_in_during_t_30'] - df[
        'num_log_in_during_t_60']
    df['num_log_in_delta1_by60'] = df['num_log_in_during_t_60'] - df[
        'num_log_in_during_t_90']
    df['num_log_in_delta1_by90'] = df['num_log_in_during_t_90'] - df[
        'num_log_in']
    #delta2
    df['num_log_in_delta2_by7'] = df['num_log_in_during_t_7'] - df[
        'num_log_in_during_t_30']
    df['num_log_in_delta2_by14'] = df['num_log_in_during_t_14'] - df[
        'num_log_in_during_t_60']
    df['num_log_in_delta2_by30'] = df['num_log_in_during_t_30'] - df[
        'num_log_in_during_t_90']
    df['num_log_in_delta2_by60'] = df['num_log_in_during_t_60'] - df[
        'num_log_in']
    ####
    #log_in_ratio
    ####
    #delta1
    df['log_in_ratio_delta1_by7'] = df['log_in_ratio_during_t_7'] - df[
        'log_in_ratio_during_t_14']
    df['log_in_ratio_delta1_by14'] = df['log_in_ratio_during_t_14'] - df[
        'log_in_ratio_during_t_30']
    df['log_in_ratio_delta1_by30'] = df['log_in_ratio_during_t_30'] - df[
        'log_in_ratio_during_t_60']
    df['log_in_ratio_delta1_by60'] = df['log_in_ratio_during_t_60'] - df[
        'log_in_ratio_during_t_90']
    df['log_in_ratio_delta1_by90'] = df['log_in_ratio_during_t_90'] - df[
        'log_in_ratio']
    #delta2
    df['log_in_ratio_delta2_by7'] = df['log_in_ratio_during_t_7'] - df[
        'log_in_ratio_during_t_30']
    df['log_in_ratio_delta2_by14'] = df['log_in_ratio_during_t_14'] - df[
        'log_in_ratio_during_t_60']
    df['log_in_ratio_delta2_by30'] = df['log_in_ratio_during_t_30'] - df[
        'log_in_ratio_during_t_90']
    df['log_in_ratio_delta2_by60'] = df['log_in_ratio_during_t_60'] - df[
        'log_in_ratio']

    print('{}.shape:{}\n'.format(name, df.shape))
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(df, ix_end)

    #==============================================================================
    print('output')
    #==============================================================================

    utils.to_multiple_csv(df, '../feature/{}/all'.format(name),
                          20)  # 存一個all_sampling_for_developing
Пример #22
0
def make(T):
    """
    Build the 7-day ``num_100_ratio`` summary features for one data split.

    ``T == -1`` selects the test split: folder ``'test'``, users read from
    ``sample_submission_v2.csv`` and the compressed logs extended with
    ``user_logs_v2.csv``.  Any other value selects the training window
    ``'trainW-<T>'``.

    Per log row, ``num_25`` and ``num_100`` are converted to their share of
    ``num_25 + num_100``; rows where that share is undefined (0/0) are
    dropped.  The rows kept by ``within_n_days(..., n=7)`` are then
    summarised per ``msno`` (mean/min/max/median/std of ``num_100_ratio``)
    and written to ``../feature/<folder>/num_100_ratio_during_t_7.csv``.

    Example:
        T = 0
        folder = 'trainW-0'
    """
    # For speed, only the two extremes (num_25 and num_100) are considered.
    ratio_cols = ['num_25', 'num_100']
    input_col = ['msno', 'date'] + ratio_cols

    if T == -1:
        folder = 'test'
        # label: in this branch `train` actually holds the test-set users
        train = pd.read_csv('../input/sample_submission_v2.csv')[['msno']]
        # file1
        user_logs = utils.read_multiple_csv(
            '../feature/{}/compressed_user_logs'.format(folder),
            input_col,
            parse_dates=['date'])
        latest_logs = pd.read_csv('../input/user_logs_v2.csv',
                                  parse_dates=['date'])[input_col]
        user_logs = pd.concat([user_logs, latest_logs], ignore_index=True)
    else:
        folder = 'trainW-' + str(T)
        # label
        train = pd.read_csv(
            '../input/preprocessed_data/trainW-{0}.csv'.format(T))[['msno']]
        # file1
        user_logs = utils.read_multiple_csv(
            '../feature/{}/compressed_user_logs'.format(folder),
            input_col,
            parse_dates=['date'])

    ##################################################
    # basic procedure
    ##################################################
    # Turn the two counts into row-wise shares.  The columns are named
    # explicitly so the result does not depend on their position in the frame.
    user_logs[ratio_cols] = user_logs[ratio_cols].div(
        user_logs[ratio_cols].sum(axis=1), axis=0)
    user_logs.rename(
        columns={'num_25': 'num_25_ratio', 'num_100': 'num_100_ratio'},
        inplace=True)
    # 0/0 yields NaN, which would be a problem downstream, so drop those rows.
    user_logs.dropna(inplace=True)
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(user_logs)

    df = pd.merge(train, user_logs, on='msno', how='left')
    del user_logs
    gc.collect()
    print('shape of df:', df.shape)

    # NOTE: the all-history aggregates (num_25_ratio / num_100_ratio) and the
    # 7-day num_25_ratio aggregates were commented out in the original script
    # and are not produced here; only the 7-day num_100_ratio table is written.

    ##################################################
    # n = 7
    ##################################################
    df_ = df.groupby('msno').apply(within_n_days, T, n=7).reset_index(drop=True)

    # core2: num_100_ratio
    print('core2')
    # One grouped pass computes all five statistics instead of five separate
    # groupby calls; column order matches the original output.
    stats = ['mean', 'min', 'max', 'median', 'std']
    tbl = df_.groupby('msno').num_100_ratio.agg(stats)
    # Original author's note: there was a typo in this column name and no time
    # to re-run; it was originally 'num_repeated_songs_during_t_7'.
    tbl.columns = ['num_100_ratio_during_t_7-{}'.format(s) for s in stats]
    tbl.reset_index(inplace=True)
    del df_
    gc.collect()
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(tbl)
    # write
    tbl.to_csv('../feature/{}/num_100_ratio_during_t_7.csv'.format(folder),
               index=False)
    del tbl
    gc.collect()
def make(T):
    """
    Build the "listened on consecutive days" features for one data split.

    ``T == -1`` selects the test split: folder ``'test'``, users read from
    ``sample_submission_v2.csv`` and the compressed logs extended with
    ``user_logs_v2.csv``.  Any other value selects the training window
    ``'trainW-<T>'``.

    For every window ``n`` in 7, 14, 30, 60 and 90 days the function writes
    ``../feature/<folder>/listen_music_in_a_row_count_during_t_<n>.csv``
    containing, per ``msno``:

    * ``listen_music_in_a_row_count_during_t_<n>`` - number of rows whose
      ``date_diff`` (days since the previous log row) equals 1;
    * ``listen_music_in_a_row_ratio_during_t_<n>`` - that count divided by
      the user's total number of rows in the window.

    Users with no ``date_diff == 1`` row in a window do not appear in that
    window's file.

    Example:
        T = 0
        folder = 'trainW-0'
    """
    input_col = ['msno', 'date']
    if T == -1:
        folder = 'test'
        # label: in this branch `train` actually holds the test-set users
        train = pd.read_csv('../input/sample_submission_v2.csv')[['msno']]
        # file1
        user_logs = utils.read_multiple_csv(
            '../feature/{}/compressed_user_logs'.format(folder),
            input_col,
            parse_dates=['date'])
        latest_logs = pd.read_csv('../input/user_logs_v2.csv',
                                  parse_dates=['date'])[input_col]
        user_logs = pd.concat([user_logs, latest_logs], ignore_index=True)
    else:
        folder = 'trainW-' + str(T)
        # label
        train = pd.read_csv(
            '../input/preprocessed_data/trainW-{0}.csv'.format(T))[['msno']]
        # file1
        user_logs = utils.read_multiple_csv(
            '../feature/{}/compressed_user_logs'.format(folder),
            input_col,
            parse_dates=['date'])

    ##################################################
    # basic procedure
    ##################################################
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(user_logs)
    df = pd.merge(train, user_logs, on='msno', how='left')
    del user_logs
    gc.collect()

    # The sort is required: date_diff below is the gap to the previous row.
    df.sort_values(by=['msno', 'date'], inplace=True)
    # Vectorised day difference; missing dates give NaN.
    df['date_diff'] = (df['date'] - df['date'].shift(1)).dt.days
    print('shape of df:', df.shape)

    # The shift above runs over the whole frame, so each user's first row is
    # compared with the previous user's last row; that first row is not
    # wanted (presumably what drop_first_columns removes - verify the helper).
    df = df.groupby('msno').apply(drop_first_columns)
    df.reset_index(drop=True, inplace=True)

    # NOTE: the all-history variant (listen_music_in_a_row_count.csv) was
    # commented out "for speed" in the original script and is not produced.

    ##################################################
    # n = 7, 14, 30, 60, 90
    ##################################################
    for n in (7, 14, 30, 60, 90):
        count_col = 'listen_music_in_a_row_count_during_t_{}'.format(n)
        ratio_col = 'listen_music_in_a_row_ratio_during_t_{}'.format(n)

        df_ = df.groupby('msno').apply(within_n_days, T,
                                       n=n).reset_index(drop=True)
        # core: date_diff == 1 means the user also listened the day before
        tbl = df_[df_.date_diff == 1].groupby('msno').date_diff.size().to_frame()
        tbl.columns = [count_col]
        # Divide by the user's total rows in the window (aligned on msno).
        tbl[ratio_col] = tbl[count_col] / df_.groupby('msno').date_diff.size()
        tbl.reset_index(inplace=True)
        del df_
        gc.collect()
        #==============================================================================
        print('reduce memory')
        #==============================================================================
        utils.reduce_memory(tbl)
        # write
        tbl.to_csv('../feature/{}/{}.csv'.format(folder, count_col),
                   index=False)
        del tbl
        gc.collect()
##################################################
# Load the preprocessed demographics table
##################################################
demographics = pd.read_csv('../input/preprocessed_data/demographics.csv')

##################################################
# Convert registration_init_time from YYYYMMDD to datetime
##################################################
# Parse the whole column in one vectorised call instead of running
# datetime.strptime once per row.
demographics['registration_init_time'] = pd.to_datetime(
    demographics['registration_init_time'].astype(str), format='%Y%m%d')

#==============================================================================
print('reduce memory')
#==============================================================================
utils.reduce_memory(demographics)


#==============================================================================
# def
#==============================================================================
def make(T):
    """
    Select the data split to process from ``T``.

    ``T == -1`` selects the test split: ``folder`` becomes ``'test'`` and the
    users are read from ``sample_submission_v2.csv``.

    NOTE(review): the definition appears to be cut off in this listing - only
    the start of the ``T == -1`` branch is visible, so the remaining
    behaviour cannot be documented from here.

    Example:
        T = 0
        folder = 'trainW-0'
    """

    if T == -1:
        folder = 'test'
        train = pd.read_csv(
            '../input/sample_submission_v2.csv')  # here `train` actually holds the test-set users
Пример #25
0
def make(T):
    """
    Build the log-in count and log-in ratio features for one data split.

    ``T == -1`` selects the test split: folder ``'test'``, users read from
    ``sample_submission_v2.csv`` and the compressed logs extended with
    ``user_logs_v2.csv``.  Any other value selects the training window
    ``'trainW-<T>'``.

    One CSV is written for the full history (``num_log_in.csv``) and one per
    window ``n`` in 7, 14, 30, 60 and 90 days
    (``num_log_in_during_t_<n>.csv``), all under ``../feature/<folder>/``.
    Each file holds, per ``msno``:

    * ``num_log_in[_during_t_<n>]`` - number of log rows for the user;
    * ``log_in_ratio[_during_t_<n>]`` - that count divided by the
      ``listening_longevity`` column returned by the ``listening_longevity``
      helper (defined elsewhere in the file).

    Example:
        T = 0
        folder = 'trainW-0'
    """
    input_col = ['msno', 'date']

    if T == -1:
        folder = 'test'
        # label: in this branch `train` actually holds the test-set users
        train = pd.read_csv('../input/sample_submission_v2.csv')[['msno']]
        user_logs = utils.read_multiple_csv(
            '../feature/{}/compressed_user_logs'.format(folder),
            input_col,
            parse_dates=['date'])
        latest_logs = pd.read_csv('../input/user_logs_v2.csv',
                                  parse_dates=['date'])[input_col]
        user_logs = pd.concat([user_logs, latest_logs], ignore_index=True)
    else:
        folder = 'trainW-' + str(T)
        # label
        train = pd.read_csv(
            '../input/preprocessed_data/trainW-{0}.csv'.format(T))[['msno']]
        user_logs = utils.read_multiple_csv(
            '../feature/{}/compressed_user_logs'.format(folder),
            input_col,
            parse_dates=['date'])

    ##################################################
    # basic procedure
    ##################################################
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(user_logs)

    df = pd.merge(train, user_logs, on='msno', how='left')
    del user_logs
    gc.collect()
    print('shape of df:', df.shape)

    ##################################################
    # All history (n is None), then n = 7, 14, 30, 60, 90
    ##################################################
    for n in (None, 7, 14, 30, 60, 90):
        if n is None:
            df_ = df
            suffix = ''
        else:
            df_ = df.groupby('msno').apply(within_n_days, T,
                                           n=n).reset_index(drop=True)
            suffix = '_during_t_{}'.format(n)
        count_col = 'num_log_in' + suffix
        ratio_col = 'log_in_ratio' + suffix

        # count
        tbl = df_.groupby('msno').date.size().to_frame()
        tbl.columns = [count_col]
        tbl.reset_index(inplace=True)
        # for computing log_in_ratio: keep one longevity row per user
        user_logs_copy = df_.groupby('msno').apply(listening_longevity)
        user_logs_copy.drop_duplicates('msno', inplace=True)

        tbl = pd.merge(tbl, user_logs_copy, on='msno', how='left')

        del user_logs_copy
        # NOTE(review): the original only downcast the all-history and 7-day
        # tables before computing the ratio; that asymmetry is kept so the
        # written CSVs stay identical - confirm whether it was intended.
        if n is None or n == 7:
            #==============================================================================
            print('reduce memory')
            #==============================================================================
            utils.reduce_memory(tbl)
        gc.collect()
        # log_in_ratio
        tbl[ratio_col] = tbl[count_col] / tbl['listening_longevity']
        tbl.drop(['date', 'listening_longevity'], axis=1, inplace=True)
        # write
        tbl.to_csv('../feature/{}/{}.csv'.format(folder, count_col),
                   index=False)

        del tbl
        gc.collect()
Пример #26
0
def _write_listening_stats(frame, folder, suffix):
    """Write per-user summary statistics of the listening counters to CSV.

    For every counter column (``num_25``, ``num_50``, ``num_75``,
    ``num_985``, ``num_100``, ``num_unq``) the rows of ``frame`` are grouped
    by ``msno`` and the mean, min, max, median and std are written to
    ``../feature/<folder>/<column><suffix>.csv``.  The statistic columns are
    named ``<column>-<stat><suffix>``.

    Args:
        frame: DataFrame holding ``msno`` and the counter columns.
        folder: Name of the feature sub-folder to write into.
        suffix: Tag appended to column and file names
            ('' for all history, '_n5' or '_n1' for the recent windows).
    """
    counters = ('num_25', 'num_50', 'num_75', 'num_985', 'num_100', 'num_unq')
    other_stats = ('min', 'max', 'median', 'std')

    # One groupby object is shared by every column and statistic instead of
    # being rebuilt for each of them.
    by_user = frame.groupby('msno')
    for column in counters:
        per_user = by_user[column]
        tbl = per_user.mean().to_frame()
        tbl.columns = ['{}-mean{}'.format(column, suffix)]
        for stat in other_stats:
            tbl['{}-{}{}'.format(column, stat, suffix)] = getattr(per_user, stat)()
        tbl.reset_index(inplace=True)
        #==============================================================================
        print('reduce memory')
        #==============================================================================
        utils.reduce_memory(tbl)

        # write
        tbl.to_csv('../feature/{}/{}{}.csv'.format(folder, column, suffix), index=False)
        del tbl


def make(T):
    """Build listening-counter statistics for one window and write them to CSV.

    Three families of files are produced under ``../feature/<folder>/``:
    statistics over all merged rows, over the rows selected by
    ``near(group, 5)`` and over the rows selected by ``near(group, 1)``.

    Args:
        T: Window selector.  ``-1`` selects the folder ``test``; any other
            value selects ``trainW-<T>`` (e.g. ``T = 0`` -> ``trainW-0``).
    """
    if T == -1:
        folder = 'test'
        user_logs = utils.read_multiple_csv(
            '../feature/{}/compressed_user_logs'.format(folder), input_col)
        # user_logs_v2.csv: includes the data of March; it is for the testing set.
        user_logs = pd.concat(
            [user_logs, pd.read_csv('../input/user_logs_v2.csv')],
            ignore_index=True)
        user_logs.sort_values(by=['msno', 'date'], inplace=True)
        # Here `train` actually holds the users of the test set.
        train = pd.read_csv('../input/sample_submission_v2.csv')
    else:
        folder = 'trainW-' + str(T)
        user_logs = utils.read_multiple_csv(
            '../feature/{}/compressed_user_logs'.format(folder), input_col)
        # we do not need is_churn
        train = pd.read_csv(
            '../input/preprocessed_data/trainW-{0}.csv'.format(T))[['msno']]
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(user_logs)
    utils.reduce_memory(train)

    # Left merge keeps every user of `train`, even without any log rows.
    df = pd.merge(train, user_logs, on='msno', how='left')
    del user_logs
    gc.collect()

    ##################################################
    # All history
    ##################################################
    _write_listening_stats(df, folder, '')
    gc.collect()

    ##################################################
    # near 5, then only one previous order
    ##################################################
    for keep, suffix in ((5, '_n5'), (1, '_n1')):
        df_ = df.groupby('msno').apply(near, keep).reset_index(drop=True)
        _write_listening_stats(df_, folder, suffix)
        del df_
        gc.collect()
Пример #27
0
##################################################
# Load
##################################################
# Column list handed to utils.read_multiple_csv (presumably the columns to load).
input_col = [
    'msno',
    'transaction_date',
    'is_membership_duration_equal_to_plan_days',
    'is_membership_duration_longer_than_plan_days',
    'is_early_expiration',
]
membership_loyalty = utils.read_multiple_csv(
    '../input/preprocessed_data/transactions_date_base',
    input_col,
)  # original note: 20,000,000 (presumably the row count)

# Debug aid: uncomment to work on a small sample.
#membership_loyalty = membership_loyalty.head(n = 500)

##################################################
# Convert string to datetime format
##################################################
membership_loyalty['transaction_date'] = membership_loyalty['transaction_date'].apply(
    lambda raw: datetime.strptime(raw, '%Y-%m-%d'))

#==============================================================================
print('reduce memory')
#==============================================================================
utils.reduce_memory(membership_loyalty)

def near(x, keep = 5):
    """Return the last ``keep`` rows of ``x``, as given by ``x.tail(keep)``."""
    most_recent = x.tail(keep)
    return most_recent

#==============================================================================
# def
#==============================================================================
def make(T):
	"""Pick the feature folder name for selector ``T`` (only the start of the body is visible in this excerpt).

	Example from the original note: T = 0 goes with folder = 'trainW-0'.
	"""

	if T ==-1:
	    # T == -1 selects the 'test' folder name.
	    folder = 'test'
Пример #28
0
##################################################
# Load
##################################################
input_col = ['msno', 'transaction_date', 'discount', 'is_discount', 'amt_per_day',
       'cp_value']
# input_col used to be defined but never handed to the loader; it is now
# passed, matching how the sibling scripts call utils.read_multiple_csv.
# NOTE(review): confirm every listed column exists in the source files.
transactions_price_plan_days = utils.read_multiple_csv('../input/preprocessed_data/transaction_price_and_play_days_base', input_col) # 20,000,000
#transactions_price_plan_days = transactions_price_plan_days.head( n = 1000)
##################################################
# Convert string to datetime format
##################################################
# str(x) guards against values that were not parsed as strings.
transactions_price_plan_days['transaction_date']  = transactions_price_plan_days.transaction_date.apply(lambda x: datetime.strptime(str(x), '%Y-%m-%d'))

#==============================================================================
print('reduce memory')
#==============================================================================
utils.reduce_memory(transactions_price_plan_days)



def near(x, keep = 5):
    """Keep only the trailing ``keep`` rows of ``x`` (delegates to ``x.tail``)."""
    trailing_rows = x.tail(keep)
    return trailing_rows

#==============================================================================
# def
#==============================================================================
def make(T):
	"""Entry point taking a selector ``T`` (the body is not visible in this excerpt).

	Example from the original note: T = 0 goes with folder = 'trainW-0'.
	"""
Пример #29
0
def _write_membership_regularity(frame, folder, suffix):
    """Aggregate the membership-regularity flags per user and write one CSV.

    For each of the three flag columns the per-``msno`` sum is stored as
    ``<flag>_cnt<suffix>`` and the per-``msno`` mean as
    ``<flag>_ratio<suffix>``.  The table is written to
    ``../feature/<folder>/regular_membership<suffix>.csv``.

    Args:
        frame: DataFrame holding ``msno`` and the three flag columns.
        folder: Name of the feature sub-folder to write into.
        suffix: Tag appended to column and file names ('', '_n5' or '_n1').
    """
    flags = (
        'is_membership_duration_equal_to_plan_days',
        'is_membership_duration_longer_than_plan_days',
        'is_early_expiration',
    )
    by_user = frame.groupby('msno')
    tbl = None
    for flag in flags:
        per_user = by_user[flag]
        cnt_name = '{}_cnt{}'.format(flag, suffix)
        total = per_user.sum()
        if tbl is None:
            # The first flag seeds the table (and its msno index).
            tbl = total.to_frame()
            tbl.columns = [cnt_name]
        else:
            tbl[cnt_name] = total
        tbl['{}_ratio{}'.format(flag, suffix)] = per_user.mean()
    tbl.reset_index(inplace=True)
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(tbl)
    #write
    tbl.to_csv('../feature/{}/regular_membership{}.csv'.format(folder, suffix), index=False)


def make(T):
    """Build membership-regularity features for one window and write them to CSV.

    Transactions strictly before the window's cutoff date are left-merged
    onto the window's users; the flags are then aggregated over all merged
    rows, over ``near(group, 5)`` and over ``near(group, 1)``.

    Args:
        T: Window selector.  ``0``, ``1`` and ``2`` select ``trainW-<T>``
            with cutoffs 2017-03-01, 2017-02-01 and 2017-01-01; ``-1``
            selects ``test`` with cutoff 2017-04-01.

    Raises:
        ValueError: If ``T`` is not one of -1, 0, 1, 2.  Previously such a
            value left ``df`` unbound and failed later with a NameError.
    """
    # History cutoff per window: only transactions strictly before it are used.
    cutoffs = {
        0: '2017-03-01',
        1: '2017-02-01',   # w = 1: use the data before February as history
        2: '2017-01-01',   # w = 2: use the data before January as history
        -1: '2017-04-01',  # w = -1: use the data before April as history
    }
    if T not in cutoffs:
        raise ValueError(
            'unsupported T: {!r}; expected one of -1, 0, 1, 2'.format(T))

    if T == -1:
        folder = 'test'
        # Here `train` actually holds the users of the test set.
        train = pd.read_csv('../input/sample_submission_v2.csv')
    else:
        folder = 'trainW-' + str(T)
        train = pd.read_csv(
            '../input/preprocessed_data/trainW-{0}.csv'.format(T))[['msno', 'is_churn']]

    cutoff = datetime.strptime(cutoffs[T], '%Y-%m-%d')
    # Filter before merging so the full transaction table is never merged.
    df = pd.merge(
        train,
        membership_loyalty[membership_loyalty.transaction_date < cutoff],
        on='msno',
        how='left')
    del train

    ##################################################
    # All history
    ##################################################
    _write_membership_regularity(df, folder, '')
    gc.collect()

    ##################################################
    # near 5, then only one previous order
    ##################################################
    for keep, suffix in ((5, '_n5'), (1, '_n1')):
        df_ = df.groupby('msno').apply(near, keep).reset_index(drop=True)
        _write_membership_regularity(df_, folder, suffix)
        del df_
        gc.collect()
Пример #30
0
def _auto_renew_table(orders, chance_col, ratio_col, total_col, total_ratio_col):
    """Build the per-user auto-renew feature table from numbered orders.

    Args:
        orders: DataFrame with ``msno``, ``is_auto_renew`` and
            ``order_number`` columns.
        chance_col: Output name for ``onb_max - onb_min + 1``, where
            ``onb_max`` is the user's largest order number and ``onb_min``
            the smallest order number of the (msno, is_auto_renew) group.
        ratio_col: Output name for group size divided by ``chance_col``.
        total_col: Output name for the per-user sum of ``is_auto_renew``.
        total_ratio_col: Output name for the per-user mean of
            ``is_auto_renew``.

    Returns:
        DataFrame with ``msno`` and the four named columns, one row per
        ``msno`` (the first row of each user is kept).
    """
    by_user = orders.groupby('msno')
    by_user_and_flag = orders.groupby(['msno', 'is_auto_renew'])

    #count
    cnt = by_user_and_flag.size()
    cnt.name = 'cnt'
    cnt = cnt.reset_index()

    # chance
    user_onb_max = by_user.order_number.max().reset_index()
    user_onb_max.columns = ['msno', 'onb_max']
    first_order = by_user_and_flag.order_number.min().reset_index()
    first_order.columns = ['msno', 'is_auto_renew', 'onb_min']

    chance = pd.merge(first_order, user_onb_max, on='msno', how='left')
    chance[chance_col] = chance.onb_max - chance.onb_min + 1

    tbl = pd.merge(cnt, chance, on=['msno', 'is_auto_renew'], how='left')
    tbl[ratio_col] = tbl.cnt / tbl[chance_col]

    # total_count
    totals = by_user.is_auto_renew.sum().to_frame()
    totals.columns = [total_col]
    totals[total_ratio_col] = by_user.is_auto_renew.mean()
    totals.reset_index(inplace=True)
    tbl = pd.merge(tbl, totals, on='msno')

    col = ['msno', chance_col, ratio_col, total_col, total_ratio_col]
    # Original note: only the is_auto_renew == 0 row is wanted; keep='first'
    # keeps the first row per msno.  The non-inplace form avoids mutating a
    # column slice of the merged frame.
    return tbl[col].drop_duplicates('msno', keep='first')


def make(T):
    """Build auto-renew features for one window and write them to CSV.

    Transactions strictly before the window's cutoff date are left-merged
    onto the window's users.  Three files are written under
    ``../feature/<folder>/``: ``is_auto_renew.csv`` (all merged rows),
    ``is_auto_renew_n5.csv`` (rows from ``near(group, 5)``) and
    ``is_auto_renew_n1.csv`` (rows from ``near(group, 1)``).

    Args:
        T: Window selector.  ``0``, ``1`` and ``2`` select ``trainW-<T>``
            with cutoffs 2017-03-01, 2017-02-01 and 2017-01-01; ``-1``
            selects ``test`` with cutoff 2017-04-01.

    Raises:
        ValueError: If ``T`` is not one of -1, 0, 1, 2.  Previously such a
            value left ``df`` unbound and failed later with a NameError.
    """
    # History cutoff per window: only transactions strictly before it are used.
    cutoffs = {
        0: '2017-03-01',
        1: '2017-02-01',   # w = 1: use the data before February as history
        2: '2017-01-01',   # w = 2: use the data before January as history
        -1: '2017-04-01',  # w = -1: use the data before April as history
    }
    if T not in cutoffs:
        raise ValueError(
            'unsupported T: {!r}; expected one of -1, 0, 1, 2'.format(T))

    if T == -1:
        folder = 'test'
        # Here `train` actually holds the users of the test set.
        train = pd.read_csv('../input/sample_submission_v2.csv')
    else:
        folder = 'trainW-' + str(T)
        train = pd.read_csv(
            '../input/preprocessed_data/trainW-{0}.csv'.format(T))[['msno']]

    cutoff = datetime.strptime(cutoffs[T], '%Y-%m-%d')
    # Filter before merging so the full transaction table is never merged.
    df = pd.merge(
        train,
        transactions[transactions.transaction_date < cutoff],
        on='msno',
        how='left')
    del train

    # (rows kept per user or None for all history, file suffix, output column
    # names).  The irregular n5 / n1 names are kept exactly as they were,
    # because files already written with them may be read elsewhere.
    windows = (
        (None, '',
         ('is_auto_renew_chance', 'auto_renew_ratio_by_chance',
          'auto_renew_total_count', 'auto_renew_total_count_ratio')),
        (5, '_n5',
         ('is_auto_renew_chance_n5', 'auto_renew_ratio_by_chance_n5',
          'auto_renew_chance_n5_total_count_n5',
          'auto_renew_chance_n5_total_count_ratio_n5')),
        (1, '_n1',
         ('is_auto_renew_chance_n1', 'auto_renew_chance_n1_ratio_by_chance_n1',
          'auto_renew_total_count_n1', 'auto_renew_total_count_ratio_n1')),
    )
    for keep, suffix, names in windows:
        if keep is None:
            # All history
            df_ = df.groupby('msno').apply(make_order_number)
        else:
            # near 5 / only one previous order
            df_ = df.groupby('msno').apply(near, keep).reset_index(drop=True)
            df_ = df_.groupby('msno').apply(make_order_number)

        tbl = _auto_renew_table(df_, *names)
        #==============================================================================
        print('reduce memory')
        #==============================================================================
        utils.reduce_memory(tbl)
        #write
        tbl.to_csv('../feature/{}/is_auto_renew{}.csv'.format(folder, suffix),
                   index=False)

        del tbl
        del df_
        gc.collect()
Пример #31
0
	else:
		return 4 # 最容易流失的city_zone
def registered_via_zone(x):
    """Map a ``registered_via`` code to a zone label (1, 2 or 3).

    Codes 4 and 3 map to 1 (noted by the author as the most churn-prone
    channels), codes 7 and 2 map to 2 (noted as the least churn-prone) and
    every other value maps to 3.
    """
    zone_by_code = {
        4: 1, 3: 1,  # most churn-prone registered_via
        7: 2, 2: 2,  # least churn-prone registered_via
    }
    return zone_by_code.get(x, 3)
# Derive the bucketed ("zone") versions of the demographic columns.
members['bd_zone'] = members.bd.apply(bd_zone)
members['city_zone'] = members.city.apply(city_zone)
# Bug fix: the zone must be computed from registered_via; the city column
# was being fed to registered_via_zone here (copy-paste slip).
members['registered_via_zone'] = members.registered_via.apply(registered_via_zone)

#==============================================================================
print('reduce memory')
#==============================================================================
utils.reduce_memory(members)
gc.collect()

# write
path = '../input/preprocessed_data/demographics.csv'


members.to_csv(path, index = False)