def aggregate(args):
    """Aggregate per-customer gaps between consecutive DAYS_ENTRY_PAYMENT
    values for one contract type.

    args is a (path, cont_type, pref) tuple; resulting features are written
    under ../feature_prev with PREF + pref prepended to every column.
    """
    path, cont_type, pref = args

    pay = utils.read_pickles(path, [KEY, 'DAYS_ENTRY_PAYMENT'])
    in_window = pay['DAYS_ENTRY_PAYMENT'].between(day_start, day_end)
    pay = pay[in_window].sort_values([KEY, 'DAYS_ENTRY_PAYMENT'])
    pay = pd.merge(pay, prev, on=KEY, how='left')
    gc.collect()

    # 'NA' selects rows whose contract type is missing.
    if cont_type == 'NA':
        pay = pay[pay['NAME_CONTRACT_TYPE'].isnull()]
    else:
        pay = pay[pay['NAME_CONTRACT_TYPE'] == cont_type]

    # Gap between consecutive payments of the same customer.
    pay['DEP_diff'] = pay.groupby(KEY).DAYS_ENTRY_PAYMENT.diff()
    feat = pay.groupby(KEY).agg(
        {'DEP_diff': ['min', 'mean', 'max', 'var', 'nunique']})
    feat.columns = pd.Index(['_'.join(c) for c in feat.columns.tolist()])
    feat.reset_index(inplace=True)

    utils.remove_feature(feat, var_limit=0, sample_size=19999)

    tmp = pd.merge(train, feat, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF + pref), '../feature_prev/train')
    tmp = pd.merge(test, feat, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF + pref), '../feature_prev/test')
    return
def multi(p):
    """Compute one aggregate statistic over the global groupby `gr`,
    selected by the integer code p (0=size, 1..8 = min/max/mean/std/sum/q25/
    q50/q75), and dump the merged train/test feature files."""
    # Codes 1..8 share the same shape: apply a stat, tag columns with a suffix.
    stat_by_code = {
        1: (lambda g: g.min(), '_min'),
        2: (lambda g: g.max(), '_max'),
        3: (lambda g: g.mean(), '_mean'),
        4: (lambda g: g.std(), '_std'),
        5: (lambda g: g.sum(), '_sum'),
        6: (lambda g: g.quantile(0.25), '_q25'),
        7: (lambda g: g.quantile(0.50), '_q50'),
        8: (lambda g: g.quantile(0.75), '_q75'),
    }

    if p == 0:
        feature = gr.size()
        feature.name = f'{PREF}_{KEY}_size'
        feature = feature.reset_index()
    elif p in stat_by_code:
        stat, suffix = stat_by_code[p]
        feature = stat(gr).add_prefix(f'{PREF}_').add_suffix(suffix).reset_index()
    else:
        # Unknown code: nothing to do.
        return

    train_ = pd.merge(train, feature, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(train_, '../feature/train')
    test_ = pd.merge(test, feature, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(test_, '../feature/test')
    return
def aggregate(args):
    """Two-stage installment aggregation: stats per previous loan
    (SK_ID_PREV) first, then mean/var of those stats per customer (KEY)."""
    path, pref = args

    ins = utils.read_pickles(path)
    ins = ins[ins['DAYS_INSTALMENT'].between(day_start, day_end)]

    # Keep the prev-loan -> customer mapping before aggregating it away.
    key_map = ins[[KEY, 'SK_ID_PREV']].drop_duplicates()

    prev_agg = ins.groupby('SK_ID_PREV').agg({**utils_agg.ins_num_aggregations})
    prev_agg.columns = pd.Index(['_'.join(c) for c in prev_agg.columns.tolist()])
    prev_agg['INS_COUNT'] = ins.groupby('SK_ID_PREV').size()
    prev_agg = prev_agg.add_prefix(pref).reset_index()
    utils.remove_feature(prev_agg, var_limit=0, sample_size=19999)

    prev_agg = pd.merge(prev_agg, key_map, on='SK_ID_PREV',
                        how='left').drop('SK_ID_PREV', axis=1)

    # Second stage: distribution of per-loan stats within each customer.
    cust_agg = prev_agg.groupby(KEY).agg(['mean', 'var'])
    cust_agg.columns = pd.Index(['_'.join(c) for c in cust_agg.columns.tolist()])
    cust_agg.reset_index(inplace=True)

    tmp = pd.merge(train, cust_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')
    tmp = pd.merge(test, cust_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/test')
    return
def aggregate(args):
    """Aggregate bureau-balance rows where column k equals v and dump
    per-customer features named with `prefix`."""
    print(args)
    k, v, prefix = args

    sub = bb[bb[k] == v]

    agg = sub.groupby(KEY).agg(utils_agg.bb_num_aggregations)
    agg.columns = pd.Index([f'{prefix}_{c[0]}_{c[1]}' for c in agg.columns.tolist()])
    agg[f'{prefix}_BURE_COUNT'] = sub.groupby(KEY).size()
    agg.reset_index(inplace=True)

    tmp = pd.merge(train, agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature_bureau/train')
    tmp = pd.merge(test, agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature_bureau/test')
    return
def aggregate(args):
    """Sum COL per (customer, prev loan, month), then aggregate the monthly
    sums to customer level and dump features."""
    path, pref = args

    raw = utils.read_pickles(path, [KEY, 'SK_ID_PREV', 'month'] + COL)
    monthly = raw.groupby([KEY, 'SK_ID_PREV', 'month'])[COL].sum().reset_index()

    agg = monthly.groupby(KEY).agg({**num_agg})
    agg.columns = pd.Index(['_'.join(c) for c in agg.columns.tolist()])
    agg['INS_COUNT'] = monthly.groupby(KEY).size()
    agg = agg.add_prefix(pref).reset_index()
    utils.remove_feature(agg, var_limit=0, sample_size=19999)

    tmp = pd.merge(train, agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')
    tmp = pd.merge(test, agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/test')
    return
def aggregate():
    """Aggregate credit-card rows per customer: numeric stats plus mean/sum
    of every dummy column generated for the categorical features."""
    df = utils.get_dummies(cre)

    # Dummy columns produced by get_dummies for the categoricals in col_cat.
    dummy_cols = [c1 for c1 in df.columns
                  if any(c1.startswith(c2 + '_') for c2 in col_cat)]
    cat_aggregations = {c: ['mean', 'sum'] for c in dummy_cols}

    agg = df.groupby(KEY).agg({**utils_agg.cre_num_aggregations,
                               **cat_aggregations})
    agg.columns = pd.Index(['_'.join(c) for c in agg.columns.tolist()])
    agg['CRE_COUNT'] = df.groupby(KEY).size()
    agg.reset_index(inplace=True)

    tmp = pd.merge(train, agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')
    tmp = pd.merge(test, agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/test')
    return
def multi_agg(args):
    """Aggregate installments of one contract type ('NA' = missing type) to
    customer level; args is (path, pref, cont_type, cont_type_pref)."""
    path, pref, cont_type, cont_type_pref = args
    print(args)

    ins = utils.read_pickles(path)
    ins = ins[ins['DAYS_INSTALMENT'].between(day_start, day_end)]
    ins = pd.merge(ins, prev, on='SK_ID_PREV', how='left')
    gc.collect()
    # NOTE(review): drops 'SK_ID_CURR' yet still groups by KEY below --
    # presumably KEY comes from the merged `prev` frame; confirm.
    del ins['SK_ID_CURR']

    if cont_type == 'NA':
        sub = ins[ins['NAME_CONTRACT_TYPE'].isnull()]
    else:
        sub = ins[ins['NAME_CONTRACT_TYPE'] == cont_type]

    agg = sub.groupby(KEY).agg({**utils_agg.ins_num_aggregations})
    agg.columns = pd.Index(['_'.join(c) for c in agg.columns.tolist()])
    agg['INS_COUNT'] = sub.groupby(KEY).size()
    agg = agg.add_prefix(pref + cont_type_pref).reset_index()
    utils.remove_feature(agg, var_limit=0, sample_size=19999)

    tmp = pd.merge(train, agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature_prev/train')
    tmp = pd.merge(test, agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature_prev/test')
    return
def main():
    """Build the f105 diff-feature table: merge calendar/price data onto the
    sales diffs, add lag/price/momentum features, and save as feather."""
    # Load pickled inputs.
    df = read_pickles('../feats/sales_diff')
    df_calendar = loadpkl('../feats/calendar.pkl')
    df_sell_prices = loadpkl('../feats/sell_prices.pkl')

    # Attach calendar and price information.
    df = df.merge(df_calendar, on='d', how='left')
    df = df.merge(df_sell_prices,
                  on=['store_id', 'item_id', 'wm_yr_wk'], how='left')
    del df_calendar, df_sell_prices
    gc.collect()

    # Rows before the item's release carry no sales signal.
    df = df[df['wm_yr_wk'] >= df['release']]

    # Lag features.
    df = make_lags(df, 28)

    # Label-encode id-like strings; factorize's -1 (missing) becomes NaN.
    for c in ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']:
        df[c], _ = pd.factorize(df[c])
        df[c].replace(-1, np.nan, inplace=True)

    # Price features, computed per item id.
    price_grp = df[['id', 'sell_price']].groupby('id')['sell_price']
    df['shift_price_t1'] = price_grp.transform(lambda x: x.shift(1))
    df['price_change_t1'] = (df['shift_price_t1'] - df['sell_price']) / (df['shift_price_t1'])
    df['rolling_price_max_t365'] = price_grp.transform(
        lambda x: x.shift(1).rolling(365).max())
    df['price_change_t365'] = (df['rolling_price_max_t365'] - df['sell_price']) / (df['rolling_price_max_t365'])
    df['rolling_price_std_t7'] = price_grp.transform(lambda x: x.rolling(7).std())
    df['rolling_price_std_t30'] = price_grp.transform(lambda x: x.rolling(30).std())

    # Release date relative to the earliest release.
    df['release'] = df['release'] - df['release'].min()

    # Price momentum vs. the item's monthly / yearly mean price per store.
    df['price_momentum_m'] = df['sell_price'] / df.groupby(
        ['store_id', 'item_id', 'month'])['sell_price'].transform('mean')
    df['price_momentum_y'] = df['sell_price'] / df.groupby(
        ['store_id', 'item_id', 'year'])['sell_price'].transform('mean')

    # Numeric day index ('d_123' -> 123) for CustomTimeSeriesSplitter.
    df['d_numeric'] = df['d'].apply(lambda x: str(x)[2:]).astype(int)

    # Shrink dtypes before writing.
    df = reduce_mem_usage(df)

    # Save as feather.
    to_feature(df, '../feats/f105')

    # Persist the feature name list.
    features_json = {'features': df.columns.tolist()}
    to_json(features_json, '../configs/105_all_features_diff.json')

    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
def aggregate():
    """Aggregate POS-cash rows per customer: numeric stats, dummy-category
    mean/sum, std/mean ratios and max/min ratios; dump train/test features.

    Fix: the max/min loop used a bare ``except:`` which also swallowed
    KeyboardInterrupt/SystemExit and genuine bugs; only a missing partner
    '_min' column (KeyError) is expected here.
    """
    df = utils.get_dummies(pos)

    # Dummy columns produced by get_dummies for the categoricals in col_cat.
    li = []
    for c1 in df.columns:
        for c2 in col_cat:
            if c1.startswith(c2 + '_'):
                li.append(c1)
                break

    cat_aggregations = {}
    for cat in li:
        cat_aggregations[cat] = ['mean', 'sum']

    df_agg = df.groupby(KEY).agg({
        **utils_agg.pos_num_aggregations,
        **cat_aggregations
    })
    df_agg.columns = pd.Index(
        [e[0] + "_" + e[1] for e in df_agg.columns.tolist()])

    # std / mean (coefficient-of-variation style ratio)
    col_std = [c for c in df_agg.columns if c.endswith('_std')]
    for c in col_std:
        df_agg[f'{c}-d-mean'] = df_agg[c] / df_agg[c.replace('_std', '_mean')]

    # max / min; tolerate only a missing matching '_min' column.
    col_max = [c for c in df_agg.columns if c.endswith('_max')]
    for c in col_max:
        try:
            df_agg[f'{c}-d-min'] = df_agg[c] / df_agg[c.replace('_max', '_min')]
        except KeyError:
            pass

    df_agg['POS_COUNT'] = df.groupby(KEY).size()
    df_agg.reset_index(inplace=True)

    utils.remove_feature(df_agg, var_limit=0, corr_limit=0.98,
                         sample_size=19999)

    tmp = pd.merge(train, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')
    tmp = pd.merge(test, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/test')
    return
def create_feature(self, random_state=None, devmode=False):
    """Compute (or load from cache) this feature class's train/test files.

    Returns lists of cached ``*.f`` files, or the raw (train, test) frames
    when devmode is True.
    """
    trn_dir, tst_dir = self.get_feature_dir(random_state)

    # Serve straight from cache when both directories already exist.
    if os.path.exists(trn_dir) and os.path.exists(tst_dir) and devmode is False:
        print(
            "There are cache dir for feature [{}] (train_cache_dir=[{}], test_cache_dir=[{}])"
            .format(self.__class__.__name__, trn_dir, tst_dir))
        return (list(Path(trn_dir).glob('*.f')),
                list(Path(tst_dir).glob('*.f')))

    print(
        "Start computing feature [{}] (train_cache_dir=[{}], test_cache_dir=[{}])"
        .format(self.__class__.__name__, trn_dir, tst_dir))

    # A list input means the implementation expects a list of DataFrames.
    if isinstance(self.fin, list):
        source = [pd.read_feather(f) for f in self.fin]
    else:
        source = pd.read_feather(self.fin)
    feat = self.create_feature_impl(source, random_state)
    del source
    gc.collect()

    feat = utils.reduce_mem_usage(feat)
    trn = self.trn_base.merge(feat, on=CONST.KEY,
                              how='left').drop(columns=CONST.KEY)
    tst = self.tst_base.merge(feat, on=CONST.KEY,
                              how='left').drop(columns=CONST.KEY)
    trn = trn.add_prefix(self.pref)
    tst = tst.add_prefix(self.pref)

    # In devmode nothing is cached; hand back the frames directly.
    if devmode:
        return trn, tst

    os.makedirs(trn_dir)
    os.makedirs(tst_dir)
    utils.to_feature(trn, trn_dir)
    utils.to_feature(tst, tst_dir)
    return (list(Path(trn_dir).glob('*.f')),
            list(Path(tst_dir).glob('*.f')))
def aggregate(args):
    """Aggregate bureau rows where ``bure[k] == v``: numeric stats, dummy
    mean/sum, std/mean ratios, max/min ratio and difference columns; all
    output names carry ``prefix``.

    Fix: the max/min loop used a bare ``except:`` which also swallowed
    KeyboardInterrupt/SystemExit and genuine bugs; only a missing partner
    '_min' column (KeyError) is expected here.
    """
    print(args)
    k, v, prefix = args

    df = utils.get_dummies(bure[bure[k] == v])

    # Dummy columns produced by get_dummies for the categoricals in col_cat.
    li = []
    for c1 in df.columns:
        for c2 in col_cat:
            if c1.startswith(c2 + '_'):
                li.append(c1)
                break

    cat_aggregations = {}
    for cat in li:
        cat_aggregations[cat] = ['mean', 'sum']

    df_agg = df.groupby(KEY).agg({
        **utils_agg.bure_num_aggregations,
        **cat_aggregations
    })
    df_agg.columns = pd.Index(
        [prefix + e[0] + "_" + e[1] for e in df_agg.columns.tolist()])

    # std / mean
    col_std = [c for c in df_agg.columns if c.endswith('_std')]
    for c in col_std:
        df_agg[f'{c}-d-mean'] = df_agg[c] / df_agg[c.replace('_std', '_mean')]

    # max / min; tolerate only a missing matching '_min' column.
    col_max = [c for c in df_agg.columns if c.endswith('_max')]
    for c in col_max:
        try:
            df_agg[f'{c}-d-min'] = df_agg[c] / df_agg[c.replace('_max', '_min')]
            df_agg[f'{c}-m-min'] = df_agg[c] - df_agg[c.replace('_max', '_min')]
        except KeyError:
            pass

    df_agg[f'{prefix}BURE_COUNT'] = df.groupby(KEY).size()
    df_agg.reset_index(inplace=True)

    tmp = pd.merge(train, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')
    tmp = pd.merge(test, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/test')
    return
def multi(c):
    """Out-of-fold target encoding of column c into a new '<c>_ta' column
    for train and test, then dump both as features."""
    # Train: each fold's rows get the mean TARGET computed on the remaining
    # folds only (out-of-fold to avoid leakage).
    train[c + '_ta'] = 0
    for fit_idx, enc_idx in skf.split(train, train.TARGET):
        fold_means = train.iloc[fit_idx].groupby(c)['TARGET'].mean()
        # Index by c so the Series assignment aligns category -> mean.
        train.set_index(c, inplace=True)
        train.iloc[enc_idx, -1] = fold_means
        train.reset_index(inplace=True)

    # Test: encoded with the full-train per-category means.
    full_means = train.groupby(c)['TARGET'].mean()
    test[c + '_ta'] = 0
    test.set_index(c, inplace=True)
    test.iloc[:, -1] = full_means
    test.reset_index(inplace=True)

    utils.to_feature(train[[c + '_ta']].add_prefix(PREF), '../feature/train')
    utils.to_feature(test[[c + '_ta']].add_prefix(PREF), '../feature/test')
def aggregate():
    """Aggregate the global df per customer with num_aggregations and dump
    the merged train/test feature files."""
    agg = df.groupby(KEY).agg({**num_aggregations})
    agg.columns = pd.Index(['_'.join(c) for c in agg.columns.tolist()])
    agg.reset_index(inplace=True)

    tmp = pd.merge(train, agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')
    tmp = pd.merge(test, agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/test')
    return
def multi_agg(args):
    """Aggregate installments of one contract type ('NA' = missing type) to
    customer level, with std/mean and max/min derived columns; args is
    (path, pref, cont_type, cont_type_pref).

    Fix: the max/min loop used a bare ``except:`` which also swallowed
    KeyboardInterrupt/SystemExit and genuine bugs; only a missing partner
    '_min' column (KeyError) is expected here.
    """
    path, pref, cont_type, cont_type_pref = args
    print(args)

    ins = utils.read_pickles(path)
    ins = ins[ins['DAYS_INSTALMENT'].between(day_start, day_end)]
    ins = pd.merge(ins, prev, on='SK_ID_PREV', how='left')
    gc.collect()
    del ins['SK_ID_PREV']

    # 'NA' selects rows with a missing contract type.
    if cont_type == 'NA':
        df = ins[ins['NAME_CONTRACT_TYPE'].isnull()]
    else:
        df = ins[ins['NAME_CONTRACT_TYPE'] == cont_type]

    df_agg = df.groupby(KEY).agg({**utils_agg.ins_num_aggregations})
    df_agg.columns = pd.Index(
        [e[0] + "_" + e[1] for e in df_agg.columns.tolist()])

    # std / mean
    col_std = [c for c in df_agg.columns if c.endswith('_std')]
    for c in col_std:
        df_agg[f'{c}-d-mean'] = df_agg[c] / df_agg[c.replace('_std', '_mean')]

    # max / min; tolerate only a missing matching '_min' column.
    col_max = [c for c in df_agg.columns if c.endswith('_max')]
    for c in col_max:
        try:
            df_agg[f'{c}-d-min'] = df_agg[c] / df_agg[c.replace('_max', '_min')]
            df_agg[f'{c}-m-min'] = df_agg[c] - df_agg[c.replace('_max', '_min')]
        except KeyError:
            pass

    df_agg['INS_COUNT'] = df.groupby(KEY).size()
    df_agg = df_agg.add_prefix(pref + cont_type_pref).reset_index()
    utils.remove_feature(df_agg, var_limit=0, sample_size=19999)

    tmp = pd.merge(train, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')
    tmp = pd.merge(test, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/test')
    return
def aggregate(args):
    """Aggregate bureau-balance rows where ``bb[k] == v``: numeric stats,
    std/mean and max/min ratios, plus per-(customer, bureau-loan) row-count
    statistics; every output column carries ``prefix``."""
    print(args)
    k, v, prefix = args

    sub = bb[bb[k] == v]

    agg = sub.groupby(KEY).agg(utils_agg.bb_num_aggregations)
    agg.columns = pd.Index(
        [f'{prefix}_{c[0]}_{c[1]}' for c in agg.columns.tolist()])

    # std / mean
    for c in [c for c in agg.columns if c.endswith('_std')]:
        agg[f'{c}-d-mean'] = agg[c] / agg[c.replace('_std', '_mean')]

    # max / min
    for c in [c for c in agg.columns if c.endswith('_max')]:
        agg[f'{c}-d-min'] = agg[c] / agg[c.replace('_max', '_min')]

    agg[f'{prefix}_BURE_COUNT'] = sub.groupby(KEY).size()

    # Distribution of the row count per (customer, bureau loan) pair.
    pair_size = sub.groupby([KEY, 'SK_ID_BUREAU']).size()
    pair_size.name = 'CURR-BUREAU_cnt'
    size_agg = pair_size.groupby(KEY).agg(
        {**{
            'CURR-BUREAU_cnt': ['min', 'max', 'mean', 'sum', 'var', 'size']
        }})
    size_agg.columns = pd.Index(
        [f'{prefix}_{c[0]}_{c[1]}' for c in size_agg.columns.tolist()])

    agg = pd.concat([agg, size_agg], axis=1)  # merge on the shared KEY index
    agg.reset_index(inplace=True)

    tmp = pd.merge(train, agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')
    tmp = pd.merge(test, agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/test')
    return
def aggregate():
    """Aggregate the global df per customer, add std/mean ratio columns and
    a row count, then dump train/test features."""
    agg = df.groupby(KEY).agg({**num_aggregations})
    agg.columns = pd.Index(['_'.join(c) for c in agg.columns.tolist()])

    # Coefficient-of-variation style ratio for every *_std column.
    for c in [c for c in agg.columns if c.endswith('_std')]:
        agg[f'{c}-d-mean'] = agg[c] / agg[c.replace('_std', '_mean')]

    agg['INSCC_COUNT'] = df.groupby(KEY).size()
    agg.reset_index(inplace=True)

    tmp = pd.merge(train, agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')
    tmp = pd.merge(test, agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/test')
    return
def aggregate(args):
    """Customer-level installment aggregation for one (path, pref) slice,
    with std/mean and max/min derived columns.

    Fix: the max/min loop used a bare ``except:`` which also swallowed
    KeyboardInterrupt/SystemExit and genuine bugs; only a missing partner
    '_min' column (KeyError) is expected here.
    """
    path, pref = args

    df = utils.read_pickles(path)
    df = df[df['DAYS_INSTALMENT'].between(day_start, day_end)]
    del df['SK_ID_PREV']

    df_agg = df.groupby(KEY).agg({**utils_agg.ins_num_aggregations})
    df_agg.columns = pd.Index(
        [e[0] + "_" + e[1] for e in df_agg.columns.tolist()])

    # std / mean
    col_std = [c for c in df_agg.columns if c.endswith('_std')]
    for c in col_std:
        df_agg[f'{c}-d-mean'] = df_agg[c] / df_agg[c.replace('_std', '_mean')]

    # max / min; tolerate only a missing matching '_min' column.
    col_max = [c for c in df_agg.columns if c.endswith('_max')]
    for c in col_max:
        try:
            df_agg[f'{c}-d-min'] = df_agg[c] / df_agg[c.replace('_max', '_min')]
            df_agg[f'{c}-m-min'] = df_agg[c] - df_agg[c.replace('_max', '_min')]
        except KeyError:
            pass

    df_agg['INS_COUNT'] = df.groupby(KEY).size()
    df_agg = df_agg.add_prefix(pref).reset_index()
    utils.remove_feature(df_agg, var_limit=0, sample_size=19999)

    tmp = pd.merge(train, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')
    tmp = pd.merge(test, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/test')
    return
# =============================================================================
# NAME_CONTRACT_STATUS
# =============================================================================
# Per-customer contract-status counts and row-normalized shares.
status_cnt = pd.crosstab(pos[KEY], pos['NAME_CONTRACT_STATUS']).add_suffix('_cnt')
status_nrm = pd.crosstab(pos[KEY], pos['NAME_CONTRACT_STATUS'],
                         normalize='index').add_suffix('_nrm')
base = pd.concat([base, status_cnt, status_nrm], axis=1)

# TODO: DPD

# =============================================================================
# merge
# =============================================================================
base.reset_index(inplace=True)

train = utils.load_train([KEY])
test = utils.load_test([KEY])

train_ = pd.merge(train, base, on=KEY, how='left').drop(KEY, axis=1)
utils.to_feature(train_.add_prefix(PREF), '../feature/train')

test_ = pd.merge(test, base, on=KEY, how='left').drop(KEY, axis=1)
utils.to_feature(test_.add_prefix(PREF), '../feature/test')

#==============================================================================
utils.end(__file__)
y.iloc[valid_index], categorical_feature=CAT) model = lgb.train( params=param, train_set=dtrain, num_boost_round=9999, valid_sets=[dtrain, dvalid], valid_names=['train', 'valid'], early_stopping_rounds=100, #evals_result=evals_result, verbose_eval=50) sub_train.iloc[valid_index, -1] = model.predict(X_train.iloc[valid_index]) sub_test['y_pred'] += model.predict(X_test) sub_test['y_pred'] /= NFOLD print('train:', sub_train.y_pred.describe()) print('test:', sub_test.y_pred.describe()) # ============================================================================= # save # ============================================================================= utils.to_feature(sub_train.add_prefix(PREF), '../feature/train') utils.to_feature(sub_test.add_prefix(PREF), '../feature/test') #============================================================================== utils.end(__file__)
dtrain, 99999, nfold=FOLD, stratified=False, early_stopping_rounds=100, verbose_eval=50, seed=SEED) # ============================================================================= # # ============================================================================= NROUND = int(len(ret['rmse-mean']) * 1.3) # 12234 print(f'NROUND: {NROUND}') dtrain = lgb.Dataset(X_test_train, y_test_train, categorical_feature=CAT) model = lgb.train(params, dtrain, NROUND) train = pd.DataFrame(model.predict(X_train), columns=[feature1]) train[feature2] = y_train - train[feature1] # ============================================================================= # otuput # ============================================================================= utils.to_feature(train.add_prefix(PREF), '../feature/train') #utils.to_feature(sub_test.add_prefix(PREF), '../feature/test') #============================================================================== utils.end(__file__)
imp = ex.getImp(models)

# =============================================================================
# predict
# =============================================================================
# Average the fold models' predictions on train and test.
y_pred_train = sum(m.predict(X_train) for m in models) / len(models)
y_pred_test = sum(m.predict(X_test) for m in models) / len(models)

train = pd.DataFrame(y_pred_train).add_prefix(PREF)
test = pd.DataFrame(y_pred_test).add_prefix(PREF)

# =============================================================================
# output
# =============================================================================
utils.to_feature(train, '../feature/train')
utils.to_feature(test, '../feature/test')

#==============================================================================
utils.end(__file__)
def main(num_rows=None):
    """Build per-city feature tables from the plan/query/profile pickles.

    Merges the intermediate pickles, engineers count/time/distance and
    prediction-stats features, splits the data into three city regions,
    target-encodes each region, prunes missing/correlated columns and saves
    one feather file plus a feature-list json per region.

    num_rows is unused here -- presumably a row-limit hook; confirm callers.
    """
    # load pkls
    df = read_pickles('../features/plans')
    queries = loadpkl('../features/queries.pkl')
    profiles = loadpkl('../features/profiles.pkl')
    queries_pred = loadpkl('../features/queries_pred.pkl')
    queries_profiles_pred = loadpkl('../features/queries_profiles_pred.pkl')

    # merge
    df = pd.merge(df, queries, on=['sid', 'click_mode'], how='left')
    df = pd.merge(df, profiles, on='pid', how='left')
    df = pd.merge(df, queries_pred, on='sid', how='left')
    df = pd.merge(df, queries_profiles_pred, on='sid', how='left')

    del queries, profiles, queries_pred, queries_profiles_pred
    gc.collect()

    # reduce memory usage
    df = reduce_mem_usage(df)

    # count features: how often each pid appears overall.
    df['pid_count'] = df['pid'].map(df['pid'].value_counts())

    # time diff between plan creation and the originating request.
    df['plan_req_time_diff'] = (df['plan_time'] - df['req_time']).astype(int)

    # distance ratio of each of the 7 plan slots vs. the queried distance.
    cols_plan_distance = ['plan_{}_distance'.format(i) for i in range(0, 7)]
    for i, c in enumerate(cols_plan_distance):
        df['plan_queries_distance_ratio{}'.format(
            i)] = df[c] / df['queries_distance']
        df['plan_queries_distance_diff{}'.format(
            i)] = df[c] - df['queries_distance']

    # stats features for preds (row-wise over the 12 class predictions)
    cols_pred_queries = ['pred_queries{}'.format(i) for i in range(0, 12)]
    cols_pred_queries_profiles = [
        'pred_queries_profiles{}'.format(i) for i in range(0, 12)
    ]
    df['pred_queries_mean'] = df[cols_pred_queries].mean(axis=1)
    df['pred_queries_sum'] = df[cols_pred_queries].sum(axis=1)
    df['pred_queries_max'] = df[cols_pred_queries].max(axis=1)
    df['pred_queries_min'] = df[cols_pred_queries].min(axis=1)
    df['pred_queries_var'] = df[cols_pred_queries].var(axis=1)
    df['pred_queries_skew'] = df[cols_pred_queries].skew(axis=1)
    df['pred_queries_profiles_mean'] = df[cols_pred_queries_profiles].mean(
        axis=1)
    df['pred_queries_profiles_sum'] = df[cols_pred_queries_profiles].sum(
        axis=1)
    df['pred_queries_profiles_max'] = df[cols_pred_queries_profiles].max(
        axis=1)
    df['pred_queries_profiles_min'] = df[cols_pred_queries_profiles].min(
        axis=1)
    df['pred_queries_profiles_var'] = df[cols_pred_queries_profiles].var(
        axis=1)
    df['pred_queries_profiles_skew'] = df[cols_pred_queries_profiles].skew(
        axis=1)

    # stats features for each classes (combine the two prediction sources)
    print('stats features...')
    for i in tqdm(range(0, 12)):
        cols = [
            'pred_queries{}'.format(i), 'pred_queries_profiles{}'.format(i)
        ]
        df['pred_mean{}'.format(i)] = df[cols].mean(axis=1)
        df['pred_sum{}'.format(i)] = df[cols].sum(axis=1)
        df['pred_max{}'.format(i)] = df[cols].max(axis=1)
        df['pred_min{}'.format(i)] = df[cols].min(axis=1)
        df['pred_var{}'.format(i)] = df[cols].var(axis=1)
        df['pred_skew{}'.format(i)] = df[cols].skew(axis=1)

        cols_target = [c for c in df.columns if '_target_{}'.format(i) in c]
        df['target_mean{}'.format(i)] = df[cols_target].mean(axis=1)
        df['target_sum{}'.format(i)] = df[cols_target].sum(axis=1)
        df['target_max{}'.format(i)] = df[cols_target].max(axis=1)
        df['target_min{}'.format(i)] = df[cols_target].min(axis=1)
        df['target_var{}'.format(i)] = df[cols_target].var(axis=1)
        df['target_skew{}'.format(i)] = df[cols_target].skew(axis=1)

    # post processing: zero out class predictions/targets for transport modes
    # that do not occur in any of the row's 7 plan slots.
    cols_transport_mode = [
        'plan_{}_transport_mode'.format(i) for i in range(0, 7)
    ]
    print('post processing...')
    for i in tqdm(range(1, 12)):
        tmp = np.zeros(len(df))
        for c in cols_transport_mode:
            tmp += (df[c] == i).astype(int)
        cols_target = [c for c in df.columns if '_target_{}'.format(i) in c]
        for c in cols_target + [
                'pred_queries{}'.format(i), 'pred_queries_profiles{}'.format(i)
        ]:
            df[c] = df[c] * (tmp > 0)

    # reduce memory usage
    df = reduce_mem_usage(df)

    # split data by city (origin coordinates pick the region).
    df1 = df[df['y_o'] > 37.5]
    df2 = df[df['y_o'] < 27.5]
    df3 = df[df['x_o'] > 120.0]
    del df
    gc.collect()

    # cols for target encoding
    cols_target_encoding = [
        'plan_weekday', 'plan_hour', 'plan_is_holiday', 'plan_weekday_hour',
        'plan_is_holiday_hour', 'plan_num_plans', 'plan_num_free_plans',
        'x_o_round', 'y_o_round', 'x_d_round', 'y_d_round',
        'queries_distance_round'
    ]
    cols_ratio_plan = [
        'plan_price_distance_ratio_max_plan',
        'plan_price_distance_ratio_min_plan', 'plan_price_eta_ratio_max_plan',
        'plan_price_eta_ratio_min_plan', 'plan_distance_eta_ratio_max_plan',
        'plan_distance_eta_ratio_min_plan',
        'plan_price_distance_prod_max_plan', 'plan_price_eta_prod_max_plan',
        'plan_price_distance_prod_min_plan', 'plan_price_eta_prod_min_plan',
        'plan_distance_eta_prod_max_plan', 'plan_distance_eta_prod_min_plan',
        'plan_price_distance_eta_prod_max_plan',
        'plan_price_distance_eta_prod_min_plan',
        'plan_distance_ratio_0_max_plan', 'plan_distance_ratio_0_min_plan',
        'plan_price_ratio_0_max_plan', 'plan_price_ratio_0_min_plan',
        'plan_eta_ratio_0_max_plan', 'plan_eta_ratio_0_min_plan',
        'plan_price_distance_prod_ratio_0_max_plan',
        'plan_price_distance_prod_ratio_0_min_plan',
        'plan_price_eta_prod_ratio_0_max_plan',
        'plan_price_eta_prod_ratio_0_min_plan',
        'plan_distance_eta_prod_ratio_0_max_plan',
        'plan_distance_eta_prod_ratio_0_min_plan',
        'plan_price_distance_eta_prod_ratio_0_max_plan',
        'plan_price_distance_eta_prod_ratio_0_min_plan'
    ]
    cols_min_max_plan = [
        'plan_distance_max_plan', 'plan_distance_min_plan',
        'plan_price_max_plan', 'plan_price_min_plan', 'plan_eta_max_plan',
        'plan_eta_min_plan'
    ]
    cols_transport_mode = [
        'plan_{}_transport_mode'.format(i) for i in range(0, 7)
    ]
    cols_target_encoding = cols_target_encoding + cols_ratio_plan + cols_min_max_plan + cols_transport_mode + [
        'profile_k_means'
    ]

    # target encoding for each cities
    print('traget encoding...')
    for i, df in tqdm(enumerate([df1, df2, df3])):
        # target encoding
        df = targetEncodingMultiClass(df, 'click_mode', cols_target_encoding)

        # change dtype (feather cannot hold float16 columns reliably --
        # presumably why they are widened here; confirm)
        for col in df.columns.tolist():
            if df[col].dtypes == 'float16':
                df[col] = df[col].astype(np.float32)

        # remove missing variables
        col_missing = removeMissingVariables(df, 0.75)
        df.drop(col_missing, axis=1, inplace=True)

        # remove correlated variables
        col_drop = removeCorrelatedVariables(df, 0.95)
        df.drop(col_drop, axis=1, inplace=True)

        # save as feather
        to_feature(df, '../features/feats{}'.format(i + 1))

        # save feature name list
        features_json = {'features': df.columns.tolist()}
        to_json(features_json,
                '../features/00{}_all_features.json'.format(i + 1))
        del df
        gc.collect()

    line_notify('{} finished.'.format(sys.argv[0]))
# #
# =============================================================================
# Label-encode the categorical columns of prev_train / prev_test and dump
# them as features under ../feature_prev.
train = utils.read_pickles('../data/prev_train').drop(
    ['SK_ID_CURR', 'SK_ID_PREV', 'TARGET'], axis=1)
test = utils.read_pickles('../data/prev_test').drop(
    ['SK_ID_CURR', 'SK_ID_PREV'], axis=1)

categorical_features = [
    'NAME_CONTRACT_TYPE', 'WEEKDAY_APPR_PROCESS_START',
    'NAME_CASH_LOAN_PURPOSE', 'NAME_CONTRACT_STATUS', 'NAME_PAYMENT_TYPE',
    'CODE_REJECT_REASON', 'NAME_TYPE_SUITE', 'NAME_CLIENT_TYPE',
    'NAME_GOODS_CATEGORY', 'NAME_PORTFOLIO', 'NAME_PRODUCT_TYPE',
    'CHANNEL_TYPE', 'NAME_SELLER_INDUSTRY', 'NAME_YIELD_GROUP',
    'PRODUCT_COMBINATION'
]

le = LabelEncoder()
for c in categorical_features:
    # Replace NaN with an explicit token so LabelEncoder can handle it.
    train[c].fillna('na dayo', inplace=True)
    test[c].fillna('na dayo', inplace=True)
    # Fit on train+test together so both share one label space.
    # (Series.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the supported equivalent.)
    le.fit(pd.concat([train[c], test[c]]))
    train[c] = le.transform(train[c])
    test[c] = le.transform(test[c])

utils.to_feature(train.add_prefix(PREF), '../feature_prev/train')
utils.to_feature(test.add_prefix(PREF), '../feature_prev/test')

#==============================================================================
utils.end(__file__)
def aggregate(args):
    """Sequential per-previous-loan diff and reverse-pct-change of each COL
    column, aggregated to customer level for one contract type.

    args is (path, cont_type, pref); cont_type 'NA' selects rows whose
    contract type is missing.
    """
    path, cont_type, pref = args

    df = utils.read_pickles(path,
                            [KEY, 'SK_ID_PREV', 'DAYS_ENTRY_PAYMENT'] + COL)
    df = pd.merge(df, prev, on='SK_ID_PREV', how='left')
    gc.collect()

    if cont_type == 'NA':
        df = df[df['NAME_CONTRACT_TYPE'].isnull()]
    else:
        df = df[df['NAME_CONTRACT_TYPE'] == cont_type]

    # Rows must be ordered within each previous loan before differencing.
    df.sort_values(['SK_ID_PREV', 'DAYS_ENTRY_PAYMENT'], inplace=True)
    df.reset_index(drop=True, inplace=True)

    pieces = []
    for c in COL:
        diffs = []
        drops = []
        last_key = last_val = None
        for key, val in df[['SK_ID_PREV', c]].values:
            if last_key is not None and last_key == key:
                diffs.append(val - last_val)
                # NOTE: (old - new) / old, i.e. the NEGATIVE of pandas
                # pct_change -- kept deliberately to preserve behavior.
                drops.append((last_val - val) / last_val)
            else:
                # First row overall, or first row of a new previous loan.
                diffs.append(None)
                drops.append(None)
            last_key, last_val = key, val
        pieces.append(pd.concat([
            pd.Series(diffs, name=f'{c}_diff'),
            pd.Series(drops, name=f'{c}_pctchange'),
        ], axis=1))

    callback = pd.concat(pieces, axis=1)
    value_cols = callback.columns.tolist()
    callback[KEY] = df[KEY]

    num_agg = {c: ['min', 'mean', 'max', 'var'] for c in value_cols}
    feature = callback.groupby(KEY).agg(num_agg)
    feature.columns = pd.Index(['_'.join(c) for c in feature.columns.tolist()])
    feature.reset_index(inplace=True)
    utils.remove_feature(feature, var_limit=0, sample_size=19999)

    tmp = pd.merge(train, feature, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF + pref), '../feature/train')
    tmp = pd.merge(test, feature, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF + pref), '../feature/test')
    return