def aggregate(args):
    """Sum COL per (user, previous loan, month), aggregate the monthly sums
    to KEY level, and write the train/test feature files.

    args: (path, pref) — pickle directory and feature-name prefix.
    """
    path, pref = args
    df = utils.read_pickles(path, [KEY, 'SK_ID_PREV', 'month'] + COL)
    monthly = df.groupby([KEY, 'SK_ID_PREV', 'month'])[COL].sum().reset_index()

    agg = monthly.groupby(KEY).agg({**num_agg})
    agg.columns = pd.Index([f'{a}_{b}' for a, b in agg.columns.tolist()])
    # Added before add_prefix so the count column is prefixed as well.
    agg['INS_COUNT'] = monthly.groupby(KEY).size()
    agg = agg.add_prefix(pref).reset_index()
    utils.remove_feature(agg, var_limit=0, sample_size=19999)

    for base, out_dir in ((train, '../feature/train'),
                          (test, '../feature/test')):
        merged = pd.merge(base, agg, on=KEY, how='left').drop(KEY, axis=1)
        utils.to_feature(merged.add_prefix(PREF), out_dir)
    return
def multi_agg(args):
    """Aggregate installments due inside [day_start, day_end] per KEY,
    restricted to one previous-application contract type, and write the
    train/test feature files under ../feature_prev.

    args: (path, pref, cont_type, cont_type_pref) — pickle directory,
    feature prefix, contract type ('NA' selects null types), type prefix.
    """
    path, pref, cont_type, cont_type_pref = args
    print(args)
    ins = utils.read_pickles(path)
    # Keep only installments whose due date falls inside the window.
    ins = ins[ins['DAYS_INSTALMENT'].between(day_start, day_end)]
    # Attach previous-application attributes (NAME_CONTRACT_TYPE).
    ins = pd.merge(ins, prev, on='SK_ID_PREV', how='left'); gc.collect()
    # NOTE(review): a sibling multi_agg deletes 'SK_ID_PREV' at this point
    # instead of 'SK_ID_CURR'. Deleting SK_ID_CURR only works if KEY is not
    # 'SK_ID_CURR' — confirm which column was meant to be dropped.
    del ins['SK_ID_CURR']
    if cont_type=='NA':
        df = ins[ins['NAME_CONTRACT_TYPE'].isnull()]
    else:
        df = ins[ins['NAME_CONTRACT_TYPE']==cont_type]
    df_agg = df.groupby(KEY).agg({**utils_agg.ins_num_aggregations})
    # Flatten the (column, statistic) MultiIndex into 'col_stat' names.
    df_agg.columns = pd.Index([e[0] + "_" + e[1] for e in df_agg.columns.tolist()])
    # Row count per key; added before add_prefix so it is prefixed too.
    df_agg['INS_COUNT'] = df.groupby(KEY).size()
    df_agg = df_agg.add_prefix(pref+cont_type_pref).reset_index()
    # Presumably drops constant columns (var_limit=0) using a sample —
    # exact semantics live in utils.remove_feature.
    utils.remove_feature(df_agg, var_limit=0, sample_size=19999)
    tmp = pd.merge(train, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature_prev/train')
    tmp = pd.merge(test, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature_prev/test')
    return
def aggregate(args):
    """Per-KEY statistics of the gap between consecutive payment dates,
    restricted to one contract type, written to ../feature_prev.

    args: (path, cont_type, pref) — pickle directory, contract type
    ('NA' selects null types), feature-name prefix.
    """
    path, cont_type, pref = args
    df = utils.read_pickles(path, [KEY, 'DAYS_ENTRY_PAYMENT'])
    df = df[df['DAYS_ENTRY_PAYMENT'].between(day_start, day_end)].sort_values(
        [KEY, 'DAYS_ENTRY_PAYMENT'])
    df = pd.merge(df, prev, on=KEY, how='left')
    gc.collect()

    # Restrict to the requested contract type ('NA' == missing type).
    if cont_type == 'NA':
        mask = df['NAME_CONTRACT_TYPE'].isnull()
    else:
        mask = df['NAME_CONTRACT_TYPE'] == cont_type
    df = df[mask]

    # Day gap between consecutive payments of the same key.
    df['DEP_diff'] = df.groupby(KEY).DAYS_ENTRY_PAYMENT.diff()
    feature = df.groupby(KEY).agg(
        {'DEP_diff': ['min', 'mean', 'max', 'var', 'nunique']})
    feature.columns = pd.Index([f'{a}_{b}' for a, b in feature.columns.tolist()])
    feature.reset_index(inplace=True)
    utils.remove_feature(feature, var_limit=0, sample_size=19999)

    for base, out_dir in ((train, '../feature_prev/train'),
                          (test, '../feature_prev/test')):
        merged = pd.merge(base, feature, on=KEY, how='left').drop(KEY, axis=1)
        utils.to_feature(merged.add_prefix(PREF + pref), out_dir)
    return
def plot_accuracy(X_train, Y_train, X_test, Y_test, model, title):
    """Plot the accuracy obtained when each single feature is excluded,
    against a horizontal baseline for the full feature set.

    Fits `model` on the full data first (printing its classification report
    and confusion matrix), then refits once per excluded feature index.
    """
    feature_num = X_train.shape[1]
    fig, ax = plt.subplots(figsize=(7, 6))
    ax.set_title(title)

    # Baseline: every feature included.
    model.fit(X_train, Y_train)
    baseline_pred = model.predict(X_test)
    acc_for_all = accuracy_score(Y_test, baseline_pred)
    print(classification_report(Y_test, baseline_pred))
    print(confusion_matrix(Y_test, baseline_pred))
    ax.hlines(acc_for_all * 100, 0, feature_num - 1,
              label='Full dataset accuracy = %.2f%%' % (acc_for_all * 100))

    # Refit with one feature removed at a time.
    acc = []
    for idx in range(feature_num):
        reduced_train = utils.remove_feature(X_train, idx)
        reduced_test = utils.remove_feature(X_test, idx)
        model.fit(reduced_train, Y_train)
        acc.append(accuracy_score(Y_test, model.predict(reduced_test)) * 100)

    points = list(range(feature_num))
    xticks = [p + 1 for p in points]  # 1-based labels for readability
    plt.xticks(points, xticks)
    ax.plot(range(feature_num), acc)
    ax.set_xlabel('Excluded feature number')
    ax.set_ylabel('Accuracy')
    plt.legend(loc='upper right')
    plt.show()
def aggregate():
    """One-hot encode `pos`, aggregate numeric and dummy columns per KEY,
    and write the train/test feature files under ../feature_prev."""
    df = utils.get_dummies(pos)

    # Dummy columns produced from the categorical columns listed in col_cat.
    dummy_cols = [c for c in df.columns
                  if any(c.startswith(base + '_') for base in col_cat)]
    cat_aggregations = {c: ['mean', 'sum'] for c in dummy_cols}

    df_agg = df.groupby(KEY).agg({**utils_agg.pos_num_aggregations,
                                  **cat_aggregations})
    df_agg.columns = pd.Index([f'{a}_{b}' for a, b in df_agg.columns.tolist()])
    df_agg['POS_COUNT'] = df.groupby(KEY).size()
    df_agg.reset_index(inplace=True)
    utils.remove_feature(df_agg, var_limit=0, corr_limit=0.98,
                         sample_size=19999)

    for base_df, out_dir in ((train, '../feature_prev/train'),
                             (test, '../feature_prev/test')):
        merged = pd.merge(base_df, df_agg, on=KEY, how='left').drop(KEY, axis=1)
        utils.to_feature(merged.add_prefix(PREF), out_dir)
    return
def aggregate(args):
    """Two-stage aggregation: statistics per previous loan (SK_ID_PREV),
    then mean/var of those statistics per KEY; writes train/test features.

    args: (path, pref) — pickle directory and feature-name prefix.
    """
    path, pref = args
    df = utils.read_pickles(path)
    df = df[df['DAYS_INSTALMENT'].between(day_start, day_end)]
    # Mapping from previous loan to user, needed after the first stage.
    key_map = df[[KEY, 'SK_ID_PREV']].drop_duplicates()

    # Stage 1: aggregate per previous loan.
    per_prev = df.groupby('SK_ID_PREV').agg({**utils_agg.ins_num_aggregations})
    per_prev.columns = pd.Index([f'{a}_{b}' for a, b in per_prev.columns.tolist()])
    per_prev['INS_COUNT'] = df.groupby('SK_ID_PREV').size()
    per_prev = per_prev.add_prefix(pref).reset_index()
    utils.remove_feature(per_prev, var_limit=0, sample_size=19999)
    per_prev = pd.merge(per_prev, key_map, on='SK_ID_PREV',
                        how='left').drop('SK_ID_PREV', axis=1)

    # Stage 2: summarise the per-loan statistics per user.
    per_user = per_prev.groupby(KEY).agg(['mean', 'var'])
    per_user.columns = pd.Index([f'{a}_{b}' for a, b in per_user.columns.tolist()])
    per_user.reset_index(inplace=True)

    for base, out_dir in ((train, '../feature/train'),
                          (test, '../feature/test')):
        merged = pd.merge(base, per_user, on=KEY, how='left').drop(KEY, axis=1)
        utils.to_feature(merged.add_prefix(PREF), out_dir)
    return
def aggregate(args):
    """Aggregate previous applications where prev[k] == v per SK_ID_CURR,
    adding std/mean and max/min derived columns, and write train/test files.

    args: (k, v, prefix) — filter column, filter value, column prefix.
    """
    print(args)
    k, v, prefix = args
    df = utils.get_dummies(prev[prev[k] == v])

    # Dummy columns generated from the categorical columns in col_cat.
    dummy_cols = [c for c in df.columns
                  if any(c.startswith(base + '_') for base in col_cat)]
    cat_aggregations = {c: ['mean', 'sum'] for c in dummy_cols}

    df_agg = df.groupby('SK_ID_CURR').agg({
        **utils_agg.prev_num_aggregations,
        **cat_aggregations
    })
    df_agg.columns = pd.Index(
        [prefix + f'{a}_{b}' for a, b in df_agg.columns.tolist()])

    # Derived column: std relative to mean.
    for c in [c for c in df_agg.columns if c.endswith('_std')]:
        df_agg[f'{c}-d-mean'] = df_agg[c] / df_agg[c.replace('_std', '_mean')]
    # Derived column: max relative to min.
    for c in [c for c in df_agg.columns if c.endswith('_max')]:
        df_agg[f'{c}-d-min'] = df_agg[c] / df_agg[c.replace('_max', '_min')]

    df_agg[prefix + 'PREV_COUNT'] = df.groupby('SK_ID_CURR').size()
    df_agg.reset_index(inplace=True)
    utils.remove_feature(df_agg, var_limit=0, corr_limit=0.98,
                         sample_size=19999)

    for base, out_dir in ((train, '../feature/train'),
                          (test, '../feature/test')):
        merged = pd.merge(base, df_agg, on=KEY, how='left').drop(KEY, axis=1)
        utils.to_feature(merged.add_prefix(PREF), out_dir)
    return
def multi_agg(args):
    """Aggregate installments due inside [day_start, day_end] per KEY,
    restricted to one contract type, with std/mean and max/min derived
    columns; writes train/test feature files.

    args: (path, pref, cont_type, cont_type_pref) — pickle directory,
    feature prefix, contract type ('NA' selects null types), type prefix.
    """
    path, pref, cont_type, cont_type_pref = args
    print(args)
    ins = utils.read_pickles(path)
    ins = ins[ins['DAYS_INSTALMENT'].between(day_start, day_end)]
    ins = pd.merge(ins, prev, on='SK_ID_PREV', how='left')
    gc.collect()
    del ins['SK_ID_PREV']
    if cont_type == 'NA':
        df = ins[ins['NAME_CONTRACT_TYPE'].isnull()]
    else:
        df = ins[ins['NAME_CONTRACT_TYPE'] == cont_type]
    df_agg = df.groupby(KEY).agg({**utils_agg.ins_num_aggregations})
    df_agg.columns = pd.Index(
        [e[0] + "_" + e[1] for e in df_agg.columns.tolist()])
    # std / mean
    col_std = [c for c in df_agg.columns if c.endswith('_std')]
    for c in col_std:
        df_agg[f'{c}-d-mean'] = df_agg[c] / df_agg[c.replace('_std', '_mean')]
    # max / min — guard only against a missing matching '_min' column.
    # The previous bare `except: pass` swallowed every exception
    # (including KeyboardInterrupt/SystemExit) and could hide real bugs.
    col_max = [c for c in df_agg.columns if c.endswith('_max')]
    for c in col_max:
        try:
            df_agg[f'{c}-d-min'] = df_agg[c] / df_agg[c.replace(
                '_max', '_min')]
            df_agg[f'{c}-m-min'] = df_agg[c] - df_agg[c.replace(
                '_max', '_min')]
        except KeyError:
            pass
    # Row count per key; added before add_prefix so it is prefixed too.
    df_agg['INS_COUNT'] = df.groupby(KEY).size()
    df_agg = df_agg.add_prefix(pref + cont_type_pref).reset_index()
    utils.remove_feature(df_agg, var_limit=0, sample_size=19999)
    tmp = pd.merge(train, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')
    tmp = pd.merge(test, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/test')
    return
def aggregate(args):
    """Aggregate installments due inside [day_start, day_end] per KEY,
    with std/mean and max/min derived columns; writes train/test files.

    args: (path, pref) — pickle directory and feature-name prefix.
    """
    path, pref = args
    df = utils.read_pickles(path)
    df = df[df['DAYS_INSTALMENT'].between(day_start, day_end)]
    del df['SK_ID_PREV']
    df_agg = df.groupby(KEY).agg({**utils_agg.ins_num_aggregations})
    df_agg.columns = pd.Index(
        [e[0] + "_" + e[1] for e in df_agg.columns.tolist()])
    # std / mean
    col_std = [c for c in df_agg.columns if c.endswith('_std')]
    for c in col_std:
        df_agg[f'{c}-d-mean'] = df_agg[c] / df_agg[c.replace('_std', '_mean')]
    # max / min — guard only against a missing matching '_min' column.
    # The previous bare `except: pass` swallowed every exception
    # (including KeyboardInterrupt/SystemExit) and could hide real bugs.
    col_max = [c for c in df_agg.columns if c.endswith('_max')]
    for c in col_max:
        try:
            df_agg[f'{c}-d-min'] = df_agg[c] / df_agg[c.replace(
                '_max', '_min')]
            df_agg[f'{c}-m-min'] = df_agg[c] - df_agg[c.replace(
                '_max', '_min')]
        except KeyError:
            pass
    # Row count per key; added before add_prefix so it is prefixed too.
    df_agg['INS_COUNT'] = df.groupby(KEY).size()
    df_agg = df_agg.add_prefix(pref).reset_index()
    utils.remove_feature(df_agg, var_limit=0, sample_size=19999)
    tmp = pd.merge(train, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')
    tmp = pd.merge(test, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/test')
    return
feature[f'{c}_ratio'] = (dup[c] / gr[c].shift(1)).values feature[f'{c}_min'] = pd.concat([dup[c], gr[c].shift(1), gr[c].shift(2)], axis=1).min(1).values feature[f'{c}_max'] = pd.concat([dup[c], gr[c].shift(1), gr[c].shift(2)], axis=1).max(1).values feature[f'{c}_mean'] = pd.concat([dup[c], gr[c].shift(1), gr[c].shift(2)], axis=1).mean(1).values # feature[f'{c}_diff_r'] = gr[c].diff(-1).values # feature[f'{c}_ratio_r'] = ( dup[c] / gr[c].shift(-1) ).values # feature[f'{c}_min_r'] = pd.concat([ dup[c], gr[c].shift(-1), gr[c].shift(-2)], axis=1).min(1).values # feature[f'{c}_max_r'] = pd.concat([ dup[c], gr[c].shift(-1), gr[c].shift(-2)], axis=1).max(1).values # feature[f'{c}_mean_r'] = pd.concat([ dup[c], gr[c].shift(-1), gr[c].shift(-2)], axis=1).mean(1).values feature.dropna(how='all', inplace=True) utils.remove_feature(feature, var_limit=0, corr_limit=0.98, sample_size=19999) train = utils.load_train([KEY]) test = utils.load_test([KEY]) feature.reset_index(inplace=True) feature = pd.merge(feature, user_id, on=KEY, how='left') tmp = pd.merge(train, feature, on=KEY, how='left').drop(KEY, axis=1) utils.to_feature(tmp.add_prefix(PREF), '../feature/train') tmp = pd.merge(test, feature, on=KEY, how='left').drop(KEY, axis=1) utils.to_feature(tmp.add_prefix(PREF), '../feature/test') # ============================================================================= # drop old user
def test_fetures(X, Y):
    """Leave-one-feature-out evaluation across several sklearn classifiers.

    For each classifier, prints the full-dataset accuracy followed by the
    accuracy obtained with each single feature removed.

    Bug fixed: the original computed the baselines for SVM / Naive Bayes /
    Decision Tree / Random Forest on `tmp` — the last feature-reduced
    matrix left over from the preceding loop — instead of the full `X`,
    so every baseline after K-Neighbors was wrong.
    """
    def _evaluate(header, make_model):
        # One full-dataset baseline, then one refit per excluded feature.
        print(header)
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.2, random_state=42)
        model = make_model()
        model.fit(X_train, Y_train)
        pred = model.predict(X_test)
        print("%.2f%%" % (accuracy_score(Y_test, pred) * 100))
        for i in range(X.shape[1]):
            reduced = utils.remove_feature(X, i)
            X_train, X_test, Y_train, Y_test = train_test_split(
                reduced, Y, test_size=0.2, random_state=42)
            model = make_model()
            model.fit(X_train, Y_train)
            pred = model.predict(X_test)
            print("%.2f%%" % (accuracy_score(Y_test, pred) * 100), end=' ')

    _evaluate('K-Neighbors', lambda: KNeighborsClassifier(n_neighbors=5))
    _evaluate('\nSVM', lambda: SVC())
    _evaluate('\nNaive Bayes Classifier', lambda: GaussianNB())
    _evaluate('\nDecision Tree', lambda: DecisionTreeClassifier())
    _evaluate('\nRandom Forest', lambda: RandomForestClassifier())
def prepare_dataset(X_train, X_test):
    """Return copies of both splits with feature columns 17..6 removed.

    Indices are removed in descending order so that earlier removals do
    not shift the indices still to be removed.
    """
    tmp_X_train = X_train
    tmp_X_test = X_test
    # Indices 5..0 were removed in an earlier experiment; intentionally
    # kept in the dataset now.
    for idx in range(17, 5, -1):
        tmp_X_train = utils.remove_feature(tmp_X_train, idx)
    for idx in range(17, 5, -1):
        tmp_X_test = utils.remove_feature(tmp_X_test, idx)
    return tmp_X_train, tmp_X_test
# gr2.sum().add_suffix('_sum'), ], axis=1) return feature col_list = [ col_delayed_day, col_delayed_money, col_delayed_money_ratio, col_not_delayed_day, col_not_delayed_money, col_not_delayed_money_ratio ] feature = [mk_feature(col) for col in col_list] feature = pd.concat(feature, axis=1) utils.remove_feature(feature) feature.reset_index(inplace=True) # ============================================================================= # merge # ============================================================================= train = utils.load_train([KEY]) test = utils.load_test([KEY]) train = pd.merge(train, feature, on=KEY, how='left').drop(KEY, axis=1) utils.to_feature(train.add_prefix(PREF), '../feature/train') test = pd.merge(test, feature, on=KEY, how='left').drop(KEY, axis=1) utils.to_feature(test.add_prefix(PREF), '../feature/test')
from keras.models import Model
from keras.layers import Dense, Dropout, Activation, BatchNormalization, Input
from keras.layers import Average, Add, Maximum
from keras.callbacks import TensorBoard
from keras.utils import plot_model
from sklearn.model_selection import train_test_split
import numpy as np
import utils
import os

X, Y = utils.get_dataset('dataset_threshold_100_shift_05_2.txt')

# Drop feature columns 17..6; descending order keeps the remaining
# indices valid while removing.
for feature_idx in range(17, 5, -1):
    X = utils.remove_feature(X, feature_idx)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size=0.2,
                                                    random_state=42)

input_shape = X[0].shape
output_num = Y.shape[1]
def aggregate(args):
    """Consecutive-payment differences and percent changes of each COL
    column within one previous loan, aggregated to KEY level and written
    to the train/test feature files.

    args: (path, cont_type, pref); cont_type 'NA' selects rows whose
    NAME_CONTRACT_TYPE is null after the merge with `prev`.
    """
    path, cont_type, pref = args
    df = utils.read_pickles(path, [KEY, 'SK_ID_PREV', 'DAYS_ENTRY_PAYMENT'] + COL)
    # df = df[df['DAYS_ENTRY_PAYMENT'].between(day_start, day_end)].sort_values([KEY, 'DAYS_ENTRY_PAYMENT'])
    df = pd.merge(df, prev, on='SK_ID_PREV', how='left')
    gc.collect()
    if cont_type == 'NA':
        df = df[df['NAME_CONTRACT_TYPE'].isnull()]
    else:
        df = df[df['NAME_CONTRACT_TYPE'] == cont_type]
    # Order rows so consecutive rows of one SK_ID_PREV are adjacent in time.
    df.sort_values(['SK_ID_PREV', 'DAYS_ENTRY_PAYMENT'], inplace=True)
    df.reset_index(drop=True, inplace=True)
    li = []
    for c in COL:
        ret_diff = []
        ret_pctchng = []
        key_bk = x_bk = None  # previous row's loan id / value
        for key, x in df[['SK_ID_PREV', c]].values:
            if key_bk is None:
                # Very first row: no predecessor to diff against.
                ret_diff.append(None)
                ret_pctchng.append(None)
            else:
                if key_bk == key:
                    ret_diff.append(x - x_bk)
                    # NOTE(review): sign is inverted relative to pandas
                    # pct_change ((x - x_bk) / x_bk), and x_bk == 0 yields
                    # inf/nan — confirm both are intended.
                    ret_pctchng.append((x_bk - x) / x_bk)
                else:
                    # Boundary between two different previous loans.
                    ret_diff.append(None)
                    ret_pctchng.append(None)
            key_bk = key
            x_bk = x
        ret_diff = pd.Series(ret_diff, name=f'{c}_diff')
        ret_pctchng = pd.Series(ret_pctchng, name=f'{c}_pctchange')
        ret = pd.concat([ret_diff, ret_pctchng], axis=1)
        li.append(ret)
    # Positional alignment with df is valid because df's index was reset.
    callback = pd.concat(li, axis=1)
    col_ = callback.columns.tolist()
    callback[KEY] = df[KEY]
    num_agg = {}
    for c in col_:
        num_agg[c] = ['min', 'mean', 'max', 'var']
    feature = callback.groupby(KEY).agg(num_agg)
    # Flatten the (column, statistic) MultiIndex into 'col_stat' names.
    feature.columns = pd.Index(
        [e[0] + "_" + e[1] for e in feature.columns.tolist()])
    feature.reset_index(inplace=True)
    utils.remove_feature(feature, var_limit=0, sample_size=19999)
    tmp = pd.merge(train, feature, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF + pref), '../feature/train')
    tmp = pd.merge(test, feature, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF + pref), '../feature/test')
    return
def get_part(X, n):
    """Return X with a part-specific set of feature columns removed.

    n == 1 drops indices 17..6, n == 2 drops 17..12 and 5..0, any other
    value drops 11..0. Indices are removed in descending order so earlier
    removals do not shift the ones still pending.
    """
    if n == 1:
        drop_indices = list(range(17, 5, -1))
    elif n == 2:
        drop_indices = list(range(17, 11, -1)) + list(range(5, -1, -1))
    else:
        drop_indices = list(range(11, -1, -1))

    tmp = X
    for idx in drop_indices:
        tmp = utils.remove_feature(tmp, idx)
    return tmp
print(id_curr, c) raise tmp = pd.DataFrame.from_dict(di, orient='index').T tmp['SK_ID_CURR'] = id_curr return tmp.set_index('SK_ID_CURR') # ============================================================================= # main # ============================================================================= pool = Pool(NTHREAD) callback = pool.map(multi, ids) pool.close() base = pd.concat(callback) utils.remove_feature(base) # ============================================================================= # merge # ============================================================================= base.reset_index(inplace=True) if base.columns.duplicated().sum() != 0: raise Exception(base.columns[base.columns.duplicated()]) train = utils.load_train([KEY]) train = pd.merge(train, base, on=KEY, how='left').drop(KEY, axis=1) utils.to_feature(train.add_prefix(PREF), '../feature/train') test = utils.load_test([KEY]) test = pd.merge(test, base, on=KEY, how='left').drop(KEY, axis=1) utils.to_feature(test.add_prefix(PREF), '../feature/test')