Example #1
def aggregate(args):
    path, cont_type, pref = args

    df = utils.read_pickles(path, [KEY, 'DAYS_ENTRY_PAYMENT'])
    df = df[df['DAYS_ENTRY_PAYMENT'].between(day_start, day_end)].sort_values(
        [KEY, 'DAYS_ENTRY_PAYMENT'])
    df = pd.merge(df, prev, on=KEY, how='left')
    gc.collect()

    if cont_type == 'NA':
        df = df[df['NAME_CONTRACT_TYPE'].isnull()]
    else:
        df = df[df['NAME_CONTRACT_TYPE'] == cont_type]

    df['DEP_diff'] = df.groupby(KEY).DAYS_ENTRY_PAYMENT.diff()
    feature = df.groupby(KEY).agg(
        {'DEP_diff': ['min', 'mean', 'max', 'var', 'nunique']})
    feature.columns = pd.Index(
        [e[0] + "_" + e[1] for e in feature.columns.tolist()])
    feature.reset_index(inplace=True)

    utils.remove_feature(feature, var_limit=0, sample_size=19999)

    tmp = pd.merge(train, feature, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF + pref), '../feature_prev/train')

    tmp = pd.merge(test, feature, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF + pref), '../feature_prev/test')

    return
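
All of these examples lean on a shared utils module (read_pickles, to_feature, remove_feature) plus module-level constants such as KEY, PREF, train, and test, none of which are shown on this page. A minimal sketch of plausible stand-ins, assuming pickled DataFrame chunks on disk; the real helpers in the source repository may differ:

import glob
import os

import pandas as pd

KEY = 'SK_ID_CURR'   # assumed join key (Home Credit data)
PREF = 'f001_'       # assumed per-script feature prefix


def read_pickles(path, columns=None):
    # Concatenate every pickled chunk under `path`, optionally
    # keeping only a subset of columns.
    files = sorted(glob.glob(os.path.join(path, '*.pkl')))
    df = pd.concat([pd.read_pickle(f) for f in files], ignore_index=True)
    return df if columns is None else df[columns]


def to_feature(df, path):
    # Write each column to its own pickle so features can be
    # loaded selectively later.
    os.makedirs(path, exist_ok=True)
    for c in df.columns:
        df[[c]].to_pickle(os.path.join(path, f'{c}.pkl'))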
Example #2
def multi(p):

    if p == 0:
        feature = gr.size()
        feature.name = f'{PREF}_{KEY}_size'
        feature = feature.reset_index()
    elif p == 1:
        feature = gr.min().add_prefix(f'{PREF}_').add_suffix('_min').reset_index()
    elif p == 2:
        feature = gr.max().add_prefix(f'{PREF}_').add_suffix('_max').reset_index()
    elif p == 3:
        feature = gr.mean().add_prefix(f'{PREF}_').add_suffix('_mean').reset_index()
    elif p == 4:
        feature = gr.std().add_prefix(f'{PREF}_').add_suffix('_std').reset_index()
    elif p == 5:
        feature = gr.sum().add_prefix(f'{PREF}_').add_suffix('_sum').reset_index()
    elif p == 6:
        feature = gr.quantile(0.25).add_prefix(f'{PREF}_').add_suffix('_q25').reset_index()
    elif p == 7:
        feature = gr.quantile(0.50).add_prefix(f'{PREF}_').add_suffix('_q50').reset_index()
    elif p == 8:
        feature = gr.quantile(0.75).add_prefix(f'{PREF}_').add_suffix('_q75').reset_index()
    else:
        return
    
    train_ = pd.merge(train, feature, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(train_, '../feature/train')
    
    test_ = pd.merge(test, feature, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(test_,  '../feature/test')
    
    return
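
Since multi only dispatches on an integer, it is presumably fanned out over a process pool, with gr defined at module level so each forked worker inherits it. A hypothetical driver:

from multiprocessing import Pool

# gr = df.groupby(KEY)  # assumed module-level groupby, shared via fork
if __name__ == '__main__':
    with Pool(9) as pool:
        pool.map(multi, range(9))  # p = 0..8 selects one statistic each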
Example #3
def aggregate(args):

    path, pref = args
    df = utils.read_pickles(path)
    df = df[df['DAYS_INSTALMENT'].between(day_start, day_end)]
    df_key = df[[KEY, 'SK_ID_PREV']].drop_duplicates()

    df_agg = df.groupby('SK_ID_PREV').agg({**utils_agg.ins_num_aggregations})
    df_agg.columns = pd.Index(
        [e[0] + "_" + e[1] for e in df_agg.columns.tolist()])

    df_agg['INS_COUNT'] = df.groupby('SK_ID_PREV').size()
    df_agg = df_agg.add_prefix(pref).reset_index()

    utils.remove_feature(df_agg, var_limit=0, sample_size=19999)

    df_agg = pd.merge(df_agg, df_key, on='SK_ID_PREV',
                      how='left').drop('SK_ID_PREV', axis=1)

    df_agg2 = df_agg.groupby(KEY).agg(['mean', 'var'])
    df_agg2.columns = pd.Index(
        [e[0] + "_" + e[1] for e in df_agg2.columns.tolist()])
    df_agg2.reset_index(inplace=True)

    tmp = pd.merge(train, df_agg2, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')

    tmp = pd.merge(test, df_agg2, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/test')

    return
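
utils_agg.ins_num_aggregations is not shown; given how it is splatted into .agg(), it is presumably a plain {column: [stat, ...]} mapping over the installment columns, for example (hypothetical contents):

ins_num_aggregations = {
    'AMT_INSTALMENT': ['min', 'max', 'mean', 'sum', 'std'],
    'AMT_PAYMENT': ['min', 'max', 'mean', 'sum', 'std'],
    'DAYS_ENTRY_PAYMENT': ['min', 'max', 'mean'],
}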
Example #4
def aggregate(args):
    print(args)
    k, v, prefix = args
    
    df = bb[bb[k] == v]
    
    df_agg = df.groupby(KEY).agg(utils_agg.bb_num_aggregations)
    df_agg.columns = pd.Index([prefix + '_' + e[0] + "_" + e[1] for e in df_agg.columns.tolist()])
    
    df_agg[f'{prefix}_BURE_COUNT'] = df.groupby(KEY).size()
    
#    gr2size = df.groupby([KEY, 'SK_ID_BUREAU']).size()
#    gr2size.name = 'CURR-BUREAU_cnt'
#    gr1 = gr2size.groupby(KEY)
#    gr1size = gr1.agg({**{'CURR-BUREAU_cnt': ['min', 'max', 'mean', 'sum', 'var', 'size']}})
#    gr1size.columns = pd.Index([prefix + '_' + e[0] + "_" + e[1] for e in gr1size.columns.tolist()])
#    
#    df_agg = pd.concat([df_agg, gr1size], axis=1)
    
    # merge
    df_agg.reset_index(inplace=True)
    tmp = pd.merge(train, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature_bureau/train')
    
    tmp = pd.merge(test, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF),  '../feature_bureau/test')
    
    return
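
The caller has to supply (k, v, prefix) tuples, so it presumably enumerates the values of one categorical column of bb and fans aggregate out over a pool; a hypothetical driver:

from multiprocessing import Pool

k = 'CREDIT_ACTIVE'  # hypothetical categorical column of bb
argss = [(k, v, f'{k}-{v}') for v in bb[k].dropna().unique()]
with Pool(len(argss)) as pool:
    pool.map(aggregate, argss)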
Example #5
def aggregate(args):
    path, pref = args

    df = utils.read_pickles(path, [KEY, 'SK_ID_PREV', 'month'] + COL)
    #    df = df[df['DAYS_INSTALMENT'].between(day_start, day_end)]
    #    del df['SK_ID_PREV']

    df = df.groupby([KEY, 'SK_ID_PREV', 'month'])[COL].sum().reset_index()

    df_agg = df.groupby(KEY).agg({**num_agg})
    df_agg.columns = pd.Index(
        [e[0] + "_" + e[1] for e in df_agg.columns.tolist()])

    df_agg['INS_COUNT'] = df.groupby(KEY).size()
    df_agg = df_agg.add_prefix(pref).reset_index()

    utils.remove_feature(df_agg, var_limit=0, sample_size=19999)

    tmp = pd.merge(train, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')

    tmp = pd.merge(test, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/test')

    return
Example #6
def aggregate():

    df = utils.get_dummies(cre)

    li = []
    for c1 in df.columns:
        for c2 in col_cat:
            if c1.startswith(c2 + '_'):
                li.append(c1)
                break

    cat_aggregations = {}
    for cat in li:
        cat_aggregations[cat] = ['mean', 'sum']

    df_agg = df.groupby(KEY).agg({
        **utils_agg.cre_num_aggregations,
        **cat_aggregations
    })
    df_agg.columns = pd.Index(
        [e[0] + "_" + e[1] for e in df_agg.columns.tolist()])

    df_agg['CRE_COUNT'] = df.groupby(KEY).size()
    df_agg.reset_index(inplace=True)

    tmp = pd.merge(train, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')

    tmp = pd.merge(test, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/test')

    return
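
The recurring pd.Index([e[0] + "_" + e[1] ...]) idiom flattens the two-level columns that groupby().agg() produces into plain strings; a self-contained illustration:

import pandas as pd

df = pd.DataFrame({'SK_ID_CURR': [1, 1, 2], 'AMT': [10.0, 20.0, 30.0]})
agg = df.groupby('SK_ID_CURR').agg({'AMT': ['mean', 'sum']})
print(agg.columns.tolist())  # [('AMT', 'mean'), ('AMT', 'sum')]
agg.columns = pd.Index([e[0] + '_' + e[1] for e in agg.columns.tolist()])
print(agg.columns.tolist())  # ['AMT_mean', 'AMT_sum']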
Example #7
def multi_agg(args):
    path, pref, cont_type, cont_type_pref = args
    print(args)
    
    ins = utils.read_pickles(path)
    ins = ins[ins['DAYS_INSTALMENT'].between(day_start, day_end)]
    ins = pd.merge(ins, prev, on='SK_ID_PREV', how='left')
    gc.collect()
    del ins['SK_ID_CURR']

    if cont_type == 'NA':
        df = ins[ins['NAME_CONTRACT_TYPE'].isnull()]
    else:
        df = ins[ins['NAME_CONTRACT_TYPE'] == cont_type]
    
    df_agg = df.groupby(KEY).agg({**utils_agg.ins_num_aggregations})
    
    df_agg.columns = pd.Index([e[0] + "_" + e[1] for e in df_agg.columns.tolist()])
    
    df_agg['INS_COUNT'] = df.groupby(KEY).size()
    df_agg = df_agg.add_prefix(pref+cont_type_pref).reset_index()
    
    utils.remove_feature(df_agg, var_limit=0, sample_size=19999)
    
    tmp = pd.merge(train, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature_prev/train')
    
    tmp = pd.merge(test, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF),  '../feature_prev/test')
    
    return
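
The 'NA' branch catches installments whose left join against prev found no NAME_CONTRACT_TYPE; the work list therefore presumably pairs each contract type with its own prefix, along the lines of (hypothetical paths and prefixes):

argss = [
    ('../data/installments_train', 'ins_', 'Cash loans', 'cas_'),
    ('../data/installments_train', 'ins_', 'Consumer loans', 'con_'),
    ('../data/installments_train', 'ins_', 'Revolving loans', 'rev_'),
    ('../data/installments_train', 'ins_', 'NA', 'nan_'),
]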
Example #8
def main():
    # load pkls
    df = read_pickles('../feats/sales_diff')
    df_calendar = loadpkl('../feats/calendar.pkl')
    df_sell_prices = loadpkl('../feats/sell_prices.pkl')

    # merge
    df = df.merge(df_calendar, on='d',how='left')
    df = df.merge(df_sell_prices, on=['store_id','item_id','wm_yr_wk'],how='left')

    del df_calendar, df_sell_prices
    gc.collect()

    # drop pre-release rows
    df = df[df['wm_yr_wk']>=df['release']]

    # make lag features
    df = make_lags(df,28)

    # label encoding
    cols_string = ['item_id','dept_id','cat_id','store_id','state_id']
    for c in cols_string:
        df[c], _ = pd.factorize(df[c])
        df[c] = df[c].replace(-1, np.nan)  # pd.factorize encodes NaN as -1

    # add price features
    df_grouped = df[['id','sell_price']].groupby('id')['sell_price']
    df['shift_price_t1'] = df_grouped.transform(lambda x: x.shift(1))
    df['price_change_t1'] = (df['shift_price_t1'] - df['sell_price']) / (df['shift_price_t1'])
    df['rolling_price_max_t365'] = df_grouped.transform(lambda x: x.shift(1).rolling(365).max())
    df['price_change_t365'] = (df['rolling_price_max_t365'] - df['sell_price']) / (df['rolling_price_max_t365'])
    df['rolling_price_std_t7'] = df_grouped.transform(lambda x: x.rolling(7).std())
    df['rolling_price_std_t30'] = df_grouped.transform(lambda x: x.rolling(30).std())

    # features release date
    df['release'] = df['release'] - df['release'].min()

    # price momentum by month & year
    df['price_momentum_m'] = df['sell_price']/df.groupby(['store_id','item_id','month'])['sell_price'].transform('mean')
    df['price_momentum_y'] = df['sell_price']/df.groupby(['store_id','item_id','year'])['sell_price'].transform('mean')

    # days for CustomTimeSeriesSplitter
    df['d_numeric'] = df['d'].apply(lambda x: str(x)[2:]).astype(int)

    # reduce memory usage
    df = reduce_mem_usage(df)

    # save as feather
    to_feature(df, '../feats/f105')

    # save feature name list
    features_json = {'features':df.columns.tolist()}
    to_json(features_json,'../configs/105_all_features_diff.json')

    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
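
make_lags is defined elsewhere; a plausible sketch, assuming it builds per-id shifted demand plus lag-aligned rolling means on a 'demand' column (hypothetical feature names):

def make_lags(df, lag=28):
    grouped = df.groupby('id')['demand']
    for shift in [lag, lag + 7, lag + 14]:
        df[f'demand_lag_{shift}'] = grouped.shift(shift)
    for window in [7, 30]:
        df[f'demand_rolling_mean_{window}'] = grouped.transform(
            lambda x: x.shift(lag).rolling(window).mean())
    return df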
Example #9
def aggregate():

    df = utils.get_dummies(pos)

    li = []
    for c1 in df.columns:
        for c2 in col_cat:
            if c1.startswith(c2 + '_'):
                li.append(c1)
                break

    cat_aggregations = {}
    for cat in li:
        cat_aggregations[cat] = ['mean', 'sum']

    df_agg = df.groupby(KEY).agg({
        **utils_agg.pos_num_aggregations,
        **cat_aggregations
    })
    df_agg.columns = pd.Index(
        [e[0] + "_" + e[1] for e in df_agg.columns.tolist()])

    # std / mean
    col_std = [c for c in df_agg.columns if c.endswith('_std')]
    for c in col_std:
        df_agg[f'{c}-d-mean'] = df_agg[c] / df_agg[c.replace('_std', '_mean')]

    # max / min
    col_max = [c for c in df_agg.columns if c.endswith('_max')]
    for c in col_max:
        try:
            df_agg[f'{c}-d-min'] = df_agg[c] / df_agg[c.replace(
                '_max', '_min')]
        except KeyError:  # no matching *_min column
            pass

    df_agg['POS_COUNT'] = df.groupby(KEY).size()
    df_agg.reset_index(inplace=True)

    utils.remove_feature(df_agg,
                         var_limit=0,
                         corr_limit=0.98,
                         sample_size=19999)

    tmp = pd.merge(train, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')

    tmp = pd.merge(test, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/test')

    return
Example #10
    def create_feature(self, random_state=None, devmode=False):
        trn_dir, tst_dir = self.get_feature_dir(random_state)

        if os.path.exists(trn_dir) and os.path.exists(
                tst_dir) and devmode is False:
            print(
                "There are cache dir for feature [{}] (train_cache_dir=[{}], test_cache_dir=[{}])"
                .format(self.__class__.__name__, trn_dir, tst_dir))
            trn_feature_files = list(Path(trn_dir).glob('*.f'))
            tst_feature_files = list(Path(tst_dir).glob('*.f'))

            return trn_feature_files, tst_feature_files

        print(
            "Start computing feature [{}] (train_cache_dir=[{}], test_cache_dir=[{}])"
            .format(self.__class__.__name__, trn_dir, tst_dir))

        if isinstance(self.fin, list):
            # If the input is a list of files, pass a list of DataFrames
            df_list = [pd.read_feather(f) for f in self.fin]
            feat = self.create_feature_impl(df_list, random_state)
            del df_list
            gc.collect()
        else:
            df = pd.read_feather(self.fin)
            feat = self.create_feature_impl(df, random_state)
            del df
            gc.collect()

        feat = utils.reduce_mem_usage(feat)
        trn = self.trn_base.merge(feat, on=CONST.KEY,
                                  how='left').drop(columns=CONST.KEY)
        tst = self.tst_base.merge(feat, on=CONST.KEY,
                                  how='left').drop(columns=CONST.KEY)

        trn = trn.add_prefix(self.pref)
        tst = tst.add_prefix(self.pref)

        # Save ...
        if not devmode:
            os.makedirs(trn_dir)
            os.makedirs(tst_dir)
            utils.to_feature(trn, trn_dir)
            utils.to_feature(tst, tst_dir)
            trn_feature_files = list(Path(trn_dir).glob('*.f'))
            tst_feature_files = list(Path(tst_dir).glob('*.f'))

            return trn_feature_files, tst_feature_files

        else:
            return trn, tst
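
Concrete features presumably subclass whatever base class defines create_feature above and implement create_feature_impl; a minimal hypothetical subclass:

class AppAmtFeature(Feature):  # `Feature` = the base class shown above
    fin = '../data/application.ftr'
    pref = 'app_amt_'

    def create_feature_impl(self, df, random_state):
        # One row per CONST.KEY with simple numeric aggregates.
        feat = df.groupby(CONST.KEY)['AMT_CREDIT'].agg(['mean', 'max'])
        feat.columns = ['AMT_CREDIT_mean', 'AMT_CREDIT_max']
        return feat.reset_index()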
Example #11
def aggregate(args):
    print(args)
    k, v, prefix = args

    df = utils.get_dummies(bure[bure[k] == v])

    li = []
    for c1 in df.columns:
        for c2 in col_cat:
            if c1.startswith(c2 + '_'):
                li.append(c1)
                break

    cat_aggregations = {}
    for cat in li:
        cat_aggregations[cat] = ['mean', 'sum']

    df_agg = df.groupby(KEY).agg({
        **utils_agg.bure_num_aggregations,
        **cat_aggregations
    })
    df_agg.columns = pd.Index(
        [prefix + e[0] + "_" + e[1] for e in df_agg.columns.tolist()])

    # std / mean
    col_std = [c for c in df_agg.columns if c.endswith('_std')]
    for c in col_std:
        df_agg[f'{c}-d-mean'] = df_agg[c] / df_agg[c.replace('_std', '_mean')]

    # max / min
    col_max = [c for c in df_agg.columns if c.endswith('_max')]
    for c in col_max:
        try:
            df_agg[f'{c}-d-min'] = df_agg[c] / df_agg[c.replace(
                '_max', '_min')]
            df_agg[f'{c}-m-min'] = df_agg[c] - df_agg[c.replace(
                '_max', '_min')]
        except KeyError:  # no matching *_min column
            pass

    df_agg[f'{prefix}BURE_COUNT'] = df.groupby(KEY).size()
    df_agg.reset_index(inplace=True)

    tmp = pd.merge(train, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')

    tmp = pd.merge(test, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/test')

    return
Example #12
def multi(c):
    train[c+'_ta'] = 0
    for train_index, test_index in skf.split(train, train.TARGET):
        enc = train.iloc[train_index].groupby(c)['TARGET'].mean()
        train.set_index(c, inplace=True)
        train.iloc[test_index, -1] = enc
        train.reset_index(inplace=True)
    enc = train.groupby(c)['TARGET'].mean()
    test[c+'_ta'] = 0
    test.set_index(c, inplace=True)
    test.iloc[:,-1] = enc
    test.reset_index(inplace=True)
    
    utils.to_feature(train[[c+'_ta']].add_prefix(PREF), '../feature/train')
    utils.to_feature(test[[c+'_ta']].add_prefix(PREF),  '../feature/test')
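
This is out-of-fold target encoding: each fold's rows receive the category-to-TARGET mean learned on the other folds, so a row's own label never leaks into its encoding, while test rows get the full-train mean. The module-level splitter is assumed to be something like:

from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=71)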
Example #13
def aggregate():

    df_agg = df.groupby(KEY).agg({**num_aggregations})
    df_agg.columns = pd.Index([e[0] + "_" + e[1] for e in df_agg.columns.tolist()])
    
    df_agg.reset_index(inplace=True)
    
#    utils.remove_feature(df_agg, var_limit=0, corr_limit=0.98, sample_size=19999)
    
    tmp = pd.merge(train, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')
    
    tmp = pd.merge(test, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF),  '../feature/test')
    
    return
Example #14
def multi_agg(args):
    path, pref, cont_type, cont_type_pref = args
    print(args)

    ins = utils.read_pickles(path)
    ins = ins[ins['DAYS_INSTALMENT'].between(day_start, day_end)]
    ins = pd.merge(ins, prev, on='SK_ID_PREV', how='left')
    gc.collect()
    del ins['SK_ID_PREV']

    if cont_type == 'NA':
        df = ins[ins['NAME_CONTRACT_TYPE'].isnull()]
    else:
        df = ins[ins['NAME_CONTRACT_TYPE'] == cont_type]

    df_agg = df.groupby(KEY).agg({**utils_agg.ins_num_aggregations})
    df_agg.columns = pd.Index(
        [e[0] + "_" + e[1] for e in df_agg.columns.tolist()])

    # std / mean
    col_std = [c for c in df_agg.columns if c.endswith('_std')]
    for c in col_std:
        df_agg[f'{c}-d-mean'] = df_agg[c] / df_agg[c.replace('_std', '_mean')]

    # max / min
    col_max = [c for c in df_agg.columns if c.endswith('_max')]
    for c in col_max:
        try:
            df_agg[f'{c}-d-min'] = df_agg[c] / df_agg[c.replace(
                '_max', '_min')]
            df_agg[f'{c}-m-min'] = df_agg[c] - df_agg[c.replace(
                '_max', '_min')]
        except KeyError:  # no matching *_min column
            pass

    df_agg['INS_COUNT'] = df.groupby(KEY).size()
    df_agg = df_agg.add_prefix(pref + cont_type_pref).reset_index()

    utils.remove_feature(df_agg, var_limit=0, sample_size=19999)

    tmp = pd.merge(train, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')

    tmp = pd.merge(test, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/test')

    return
Example #15
def aggregate(args):
    print(args)
    k, v, prefix = args

    df = bb[bb[k] == v]

    df_agg = df.groupby(KEY).agg(utils_agg.bb_num_aggregations)
    df_agg.columns = pd.Index(
        [prefix + '_' + e[0] + "_" + e[1] for e in df_agg.columns.tolist()])

    # std / mean
    col_std = [c for c in df_agg.columns if c.endswith('_std')]
    for c in col_std:
        df_agg[f'{c}-d-mean'] = df_agg[c] / df_agg[c.replace('_std', '_mean')]

    # max / min
    col_max = [c for c in df_agg.columns if c.endswith('_max')]
    for c in col_max:
        df_agg[f'{c}-d-min'] = df_agg[c] / df_agg[c.replace('_max', '_min')]

    df_agg[f'{prefix}_BURE_COUNT'] = df.groupby(KEY).size()

    gr2size = df.groupby([KEY, 'SK_ID_BUREAU']).size()
    gr2size.name = 'CURR-BUREAU_cnt'
    gr1 = gr2size.groupby(KEY)
    gr1size = gr1.agg(
        {**{
            'CURR-BUREAU_cnt': ['min', 'max', 'mean', 'sum', 'var', 'size']
        }})
    gr1size.columns = pd.Index(
        [prefix + '_' + e[0] + "_" + e[1] for e in gr1size.columns.tolist()])

    df_agg = pd.concat([df_agg, gr1size], axis=1)

    # merge
    df_agg.reset_index(inplace=True)
    tmp = pd.merge(train, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')

    tmp = pd.merge(test, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/test')

    return
Example #16
def aggregate():

    df_agg = df.groupby(KEY).agg({**num_aggregations})
    df_agg.columns = pd.Index(
        [e[0] + "_" + e[1] for e in df_agg.columns.tolist()])

    col_std = [c for c in df_agg.columns if c.endswith('_std')]
    for c in col_std:
        df_agg[f'{c}-d-mean'] = df_agg[c] / df_agg[c.replace('_std', '_mean')]

    df_agg['INSCC_COUNT'] = df.groupby(KEY).size()
    df_agg.reset_index(inplace=True)

    tmp = pd.merge(train, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')

    tmp = pd.merge(test, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/test')

    return
Example #17
def aggregate(args):
    path, pref = args

    df = utils.read_pickles(path)
    df = df[df['DAYS_INSTALMENT'].between(day_start, day_end)]
    del df['SK_ID_PREV']

    df_agg = df.groupby(KEY).agg({**utils_agg.ins_num_aggregations})
    df_agg.columns = pd.Index(
        [e[0] + "_" + e[1] for e in df_agg.columns.tolist()])

    # std / mean
    col_std = [c for c in df_agg.columns if c.endswith('_std')]
    for c in col_std:
        df_agg[f'{c}-d-mean'] = df_agg[c] / df_agg[c.replace('_std', '_mean')]

    # max / min
    col_max = [c for c in df_agg.columns if c.endswith('_max')]
    for c in col_max:
        try:
            df_agg[f'{c}-d-min'] = df_agg[c] / df_agg[c.replace(
                '_max', '_min')]
            df_agg[f'{c}-m-min'] = df_agg[c] - df_agg[c.replace(
                '_max', '_min')]
        except KeyError:  # no matching *_min column
            pass

    df_agg['INS_COUNT'] = df.groupby(KEY).size()
    df_agg = df_agg.add_prefix(pref).reset_index()

    utils.remove_feature(df_agg, var_limit=0, sample_size=19999)

    tmp = pd.merge(train, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')

    tmp = pd.merge(test, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/test')

    return
Example #18
# =============================================================================
# NAME_CONTRACT_STATUS
# =============================================================================

ct1 = pd.crosstab(pos[KEY], pos['NAME_CONTRACT_STATUS']).add_suffix('_cnt')
ct2 = pd.crosstab(pos[KEY], pos['NAME_CONTRACT_STATUS'],
                  normalize='index').add_suffix('_nrm')

base = pd.concat([base, ct1, ct2], axis=1)

# TODO: DPD

# =============================================================================
# merge
# =============================================================================
base.reset_index(inplace=True)

train = utils.load_train([KEY])

test = utils.load_test([KEY])

train_ = pd.merge(train, base, on=KEY, how='left').drop(KEY, axis=1)
utils.to_feature(train_.add_prefix(PREF), '../feature/train')

test_ = pd.merge(test, base, on=KEY, how='left').drop(KEY, axis=1)
utils.to_feature(test_.add_prefix(PREF), '../feature/test')

#==============================================================================
utils.end(__file__)
Example #19
    dvalid = lgb.Dataset(X_train.iloc[valid_index],
                         y.iloc[valid_index],
                         categorical_feature=CAT)

    model = lgb.train(
        params=param,
        train_set=dtrain,
        num_boost_round=9999,
        valid_sets=[dtrain, dvalid],
        valid_names=['train', 'valid'],
        early_stopping_rounds=100,
        #evals_result=evals_result,
        verbose_eval=50)

    sub_train.iloc[valid_index, -1] = model.predict(X_train.iloc[valid_index])
    sub_test['y_pred'] += model.predict(X_test)

sub_test['y_pred'] /= NFOLD

print('train:', sub_train.y_pred.describe())
print('test:', sub_test.y_pred.describe())

# =============================================================================
# save
# =============================================================================

utils.to_feature(sub_train.add_prefix(PREF), '../feature/train')
utils.to_feature(sub_test.add_prefix(PREF), '../feature/test')

#==============================================================================
utils.end(__file__)
Example #20
ret = lgb.cv(params,
             dtrain,
             99999,
             nfold=FOLD,
             stratified=False,
             early_stopping_rounds=100,
             verbose_eval=50,
             seed=SEED)

# =============================================================================
#
# =============================================================================
NROUND = int(len(ret['rmse-mean']) * 1.3)  # 12234
print(f'NROUND: {NROUND}')

dtrain = lgb.Dataset(X_test_train, y_test_train, categorical_feature=CAT)

model = lgb.train(params, dtrain, NROUND)

train = pd.DataFrame(model.predict(X_train), columns=[feature1])

train[feature2] = y_train - train[feature1]

# =============================================================================
# output
# =============================================================================
utils.to_feature(train.add_prefix(PREF), '../feature/train')
#utils.to_feature(sub_test.add_prefix(PREF),  '../feature/test')

#==============================================================================
utils.end(__file__)
Example #21
imp = ex.getImp(models)

# =============================================================================
# predict
# =============================================================================

for i, model in enumerate(models):
    y_pred_train_ = model.predict(X_train)
    y_pred_test_ = model.predict(X_test)
    if i == 0:
        y_pred_train = y_pred_train_
        y_pred_test = y_pred_test_
    else:
        y_pred_train += y_pred_train_
        y_pred_test += y_pred_test_

y_pred_train /= len(models)
y_pred_test /= len(models)

train = pd.DataFrame(y_pred_train).add_prefix(PREF)
test = pd.DataFrame(y_pred_test).add_prefix(PREF)

# =============================================================================
# output
# =============================================================================
utils.to_feature(train, '../feature/train')
utils.to_feature(test, '../feature/test')

#==============================================================================
utils.end(__file__)
Example #22
def main(num_rows=None):
    # load pkls
    df = read_pickles('../features/plans')
    queries = loadpkl('../features/queries.pkl')
    profiles = loadpkl('../features/profiles.pkl')
    queries_pred = loadpkl('../features/queries_pred.pkl')
    queries_profiles_pred = loadpkl('../features/queries_profiles_pred.pkl')

    # merge
    df = pd.merge(df, queries, on=['sid', 'click_mode'], how='left')
    df = pd.merge(df, profiles, on='pid', how='left')
    df = pd.merge(df, queries_pred, on='sid', how='left')
    df = pd.merge(df, queries_profiles_pred, on='sid', how='left')

    del queries, profiles, queries_pred, queries_profiles_pred
    gc.collect()

    # reduce memory usage
    df = reduce_mem_usage(df)

    # count features
    df['pid_count'] = df['pid'].map(df['pid'].value_counts())

    # time diff
    df['plan_req_time_diff'] = (df['plan_time'] - df['req_time']).astype(int)

    # distance ratio
    cols_plan_distance = ['plan_{}_distance'.format(i) for i in range(0, 7)]

    for i, c in enumerate(cols_plan_distance):
        df['plan_queries_distance_ratio{}'.format(
            i)] = df[c] / df['queries_distance']
        df['plan_queries_distance_diff{}'.format(
            i)] = df[c] - df['queries_distance']

    # stats features for preds
    cols_pred_queries = ['pred_queries{}'.format(i) for i in range(0, 12)]
    cols_pred_queries_profiles = [
        'pred_queries_profiles{}'.format(i) for i in range(0, 12)
    ]

    df['pred_queries_mean'] = df[cols_pred_queries].mean(axis=1)
    df['pred_queries_sum'] = df[cols_pred_queries].sum(axis=1)
    df['pred_queries_max'] = df[cols_pred_queries].max(axis=1)
    df['pred_queries_min'] = df[cols_pred_queries].min(axis=1)
    df['pred_queries_var'] = df[cols_pred_queries].var(axis=1)
    df['pred_queries_skew'] = df[cols_pred_queries].skew(axis=1)

    df['pred_queries_profiles_mean'] = df[cols_pred_queries_profiles].mean(
        axis=1)
    df['pred_queries_profiles_sum'] = df[cols_pred_queries_profiles].sum(
        axis=1)
    df['pred_queries_profiles_max'] = df[cols_pred_queries_profiles].max(
        axis=1)
    df['pred_queries_profiles_min'] = df[cols_pred_queries_profiles].min(
        axis=1)
    df['pred_queries_profiles_var'] = df[cols_pred_queries_profiles].var(
        axis=1)
    df['pred_queries_profiles_skew'] = df[cols_pred_queries_profiles].skew(
        axis=1)

    # stats features for each classes
    print('stats features...')
    for i in tqdm(range(0, 12)):
        cols = [
            'pred_queries{}'.format(i), 'pred_queries_profiles{}'.format(i)
        ]
        df['pred_mean{}'.format(i)] = df[cols].mean(axis=1)
        df['pred_sum{}'.format(i)] = df[cols].sum(axis=1)
        df['pred_max{}'.format(i)] = df[cols].max(axis=1)
        df['pred_min{}'.format(i)] = df[cols].min(axis=1)
        df['pred_var{}'.format(i)] = df[cols].var(axis=1)
        df['pred_skew{}'.format(i)] = df[cols].skew(axis=1)

        cols_target = [c for c in df.columns if '_target_{}'.format(i) in c]
        df['target_mean{}'.format(i)] = df[cols_target].mean(axis=1)
        df['target_sum{}'.format(i)] = df[cols_target].sum(axis=1)
        df['target_max{}'.format(i)] = df[cols_target].max(axis=1)
        df['target_min{}'.format(i)] = df[cols_target].min(axis=1)
        df['target_var{}'.format(i)] = df[cols_target].var(axis=1)
        df['target_skew{}'.format(i)] = df[cols_target].skew(axis=1)

    # post processing
    cols_transport_mode = [
        'plan_{}_transport_mode'.format(i) for i in range(0, 7)
    ]
    print('post processing...')
    for i in tqdm(range(1, 12)):
        tmp = np.zeros(len(df))
        for c in cols_transport_mode:
            tmp += (df[c] == i).astype(int)

        cols_target = [c for c in df.columns if '_target_{}'.format(i) in c]
        for c in cols_target + [
                'pred_queries{}'.format(i), 'pred_queries_profiles{}'.format(i)
        ]:
            df[c] = df[c] * (tmp > 0)

    # reduce memory usage
    df = reduce_mem_usage(df)

    # split data by city
    df1 = df[df['y_o'] > 37.5]
    df2 = df[df['y_o'] < 27.5]
    df3 = df[df['x_o'] > 120.0]

    del df
    gc.collect()

    # cols for target encoding
    cols_target_encoding = [
        'plan_weekday', 'plan_hour', 'plan_is_holiday', 'plan_weekday_hour',
        'plan_is_holiday_hour', 'plan_num_plans', 'plan_num_free_plans',
        'x_o_round', 'y_o_round', 'x_d_round', 'y_d_round',
        'queries_distance_round'
    ]

    cols_ratio_plan = [
        'plan_price_distance_ratio_max_plan',
        'plan_price_distance_ratio_min_plan', 'plan_price_eta_ratio_max_plan',
        'plan_price_eta_ratio_min_plan', 'plan_distance_eta_ratio_max_plan',
        'plan_distance_eta_ratio_min_plan',
        'plan_price_distance_prod_max_plan', 'plan_price_eta_prod_max_plan',
        'plan_price_distance_prod_min_plan', 'plan_price_eta_prod_min_plan',
        'plan_distance_eta_prod_max_plan', 'plan_distance_eta_prod_min_plan',
        'plan_price_distance_eta_prod_max_plan',
        'plan_price_distance_eta_prod_min_plan',
        'plan_distance_ratio_0_max_plan', 'plan_distance_ratio_0_min_plan',
        'plan_price_ratio_0_max_plan', 'plan_price_ratio_0_min_plan',
        'plan_eta_ratio_0_max_plan', 'plan_eta_ratio_0_min_plan',
        'plan_price_distance_prod_ratio_0_max_plan',
        'plan_price_distance_prod_ratio_0_min_plan',
        'plan_price_eta_prod_ratio_0_max_plan',
        'plan_price_eta_prod_ratio_0_min_plan',
        'plan_distance_eta_prod_ratio_0_max_plan',
        'plan_distance_eta_prod_ratio_0_min_plan',
        'plan_price_distance_eta_prod_ratio_0_max_plan',
        'plan_price_distance_eta_prod_ratio_0_min_plan'
    ]

    cols_min_max_plan = [
        'plan_distance_max_plan', 'plan_distance_min_plan',
        'plan_price_max_plan', 'plan_price_min_plan', 'plan_eta_max_plan',
        'plan_eta_min_plan'
    ]

    cols_transport_mode = [
        'plan_{}_transport_mode'.format(i) for i in range(0, 7)
    ]

    cols_target_encoding = cols_target_encoding + cols_ratio_plan + cols_min_max_plan + cols_transport_mode + [
        'profile_k_means'
    ]

    # target encoding for each cities
    print('target encoding...')
    for i, df in tqdm(enumerate([df1, df2, df3])):

        # target encoding
        df = targetEncodingMultiClass(df, 'click_mode', cols_target_encoding)

        # change dtype
        for col in df.columns.tolist():
            if df[col].dtypes == 'float16':
                df[col] = df[col].astype(np.float32)

        # remove missing variables
        col_missing = removeMissingVariables(df, 0.75)
        df.drop(col_missing, axis=1, inplace=True)

        # remove correlated variables
        col_drop = removeCorrelatedVariables(df, 0.95)
        df.drop(col_drop, axis=1, inplace=True)

        # save as feather
        to_feature(df, '../features/feats{}'.format(i + 1))

        # save feature name list
        features_json = {'features': df.columns.tolist()}
        to_json(features_json,
                '../features/00{}_all_features.json'.format(i + 1))

        del df
        gc.collect()

    line_notify('{} finished.'.format(sys.argv[0]))
Example #23

train = utils.read_pickles('../data/prev_train').drop(
    ['SK_ID_CURR', 'SK_ID_PREV', 'TARGET'], axis=1)
test = utils.read_pickles('../data/prev_test').drop(
    ['SK_ID_CURR', 'SK_ID_PREV'], axis=1)

categorical_features = [
    'NAME_CONTRACT_TYPE', 'WEEKDAY_APPR_PROCESS_START',
    'NAME_CASH_LOAN_PURPOSE', 'NAME_CONTRACT_STATUS', 'NAME_PAYMENT_TYPE',
    'CODE_REJECT_REASON', 'NAME_TYPE_SUITE', 'NAME_CLIENT_TYPE',
    'NAME_GOODS_CATEGORY', 'NAME_PORTFOLIO', 'NAME_PRODUCT_TYPE',
    'CHANNEL_TYPE', 'NAME_SELLER_INDUSTRY', 'NAME_YIELD_GROUP',
    'PRODUCT_COMBINATION'
]

le = LabelEncoder()
for c in categorical_features:
    train[c].fillna('na dayo', inplace=True)
    test[c].fillna('na dayo', inplace=True)
    le.fit(pd.concat([train[c], test[c]]))  # Series.append was removed in pandas 2.0
    train[c] = le.transform(train[c])
    test[c] = le.transform(test[c])

utils.to_feature(train.add_prefix(PREF), '../feature_prev/train')
utils.to_feature(test.add_prefix(PREF), '../feature_prev/test')

#==============================================================================
utils.end(__file__)
Example #24
def aggregate(args):
    path, cont_type, pref = args

    df = utils.read_pickles(path,
                            [KEY, 'SK_ID_PREV', 'DAYS_ENTRY_PAYMENT'] + COL)
    #    df = df[df['DAYS_ENTRY_PAYMENT'].between(day_start, day_end)].sort_values([KEY, 'DAYS_ENTRY_PAYMENT'])
    df = pd.merge(df, prev, on='SK_ID_PREV', how='left')
    gc.collect()

    if cont_type == 'NA':
        df = df[df['NAME_CONTRACT_TYPE'].isnull()]
    else:
        df = df[df['NAME_CONTRACT_TYPE'] == cont_type]

    df.sort_values(['SK_ID_PREV', 'DAYS_ENTRY_PAYMENT'], inplace=True)
    df.reset_index(drop=True, inplace=True)

    li = []
    for c in COL:
        ret_diff = []
        ret_pctchng = []
        key_bk = x_bk = None
        for key, x in df[['SK_ID_PREV', c]].values:

            if key_bk is None:
                ret_diff.append(None)
                ret_pctchng.append(None)
            else:
                if key_bk == key:
                    ret_diff.append(x - x_bk)
                    ret_pctchng.append((x_bk - x) / x_bk)
                else:
                    ret_diff.append(None)
                    ret_pctchng.append(None)
            key_bk = key
            x_bk = x

        ret_diff = pd.Series(ret_diff, name=f'{c}_diff')
        ret_pctchng = pd.Series(ret_pctchng, name=f'{c}_pctchange')
        ret = pd.concat([ret_diff, ret_pctchng], axis=1)
        li.append(ret)
    callback = pd.concat(li, axis=1)
    col_ = callback.columns.tolist()
    callback[KEY] = df[KEY]

    num_agg = {}
    for c in col_:
        num_agg[c] = ['min', 'mean', 'max', 'var']

    feature = callback.groupby(KEY).agg(num_agg)
    feature.columns = pd.Index(
        [e[0] + "_" + e[1] for e in feature.columns.tolist()])
    feature.reset_index(inplace=True)

    utils.remove_feature(feature, var_limit=0, sample_size=19999)

    tmp = pd.merge(train, feature, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF + pref), '../feature/train')

    tmp = pd.merge(test, feature, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF + pref), '../feature/test')

    return
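
The explicit loop over df[['SK_ID_PREV', c]].values builds a per-SK_ID_PREV first difference and a (previous - current) / previous ratio, resetting at every new SK_ID_PREV. Because the frame is already sorted by ['SK_ID_PREV', 'DAYS_ENTRY_PAYMENT'], a vectorized groupby sketch yields the same columns:

g = df.groupby('SK_ID_PREV')
for c in COL:
    prev_val = g[c].shift(1)  # NaN at the start of each SK_ID_PREV
    df[f'{c}_diff'] = df[c] - prev_val
    df[f'{c}_pctchange'] = (prev_val - df[c]) / prev_val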