def Statistic_features(train, test, target_features, group_features):
    """Build stacked projection features for every target/group column pair.

    For each combination of a column from ``target_features`` and a column
    from ``group_features`` whose names differ, ``proj_num_on_cat`` (defined
    elsewhere in this file) is called and its two results are collected.

    Args:
        train: Training data, passed through to ``proj_num_on_cat``.
        test: Test data, passed through to ``proj_num_on_cat``.
        target_features: Column names used as ``target_column``.
        group_features: Column names used as ``group_column``.

    Returns:
        A ``(train_block, test_block)`` tuple: the per-pair train results and
        the per-pair test results, each combined with ``np.hstack``.

    Raises:
        ValueError: Raised by ``np.hstack`` when no pair with differing
            names exists, because there is nothing to stack.
    """
    # Enumerate the pairs in the same order a nested target -> group loop
    # would visit them; a column is never projected onto itself.
    column_pairs = [
        (target, group)
        for target in target_features
        for group in group_features
        if target != group
    ]

    train_parts = []
    test_parts = []
    for target, group in column_pairs:
        train_part, test_part = proj_num_on_cat(
            train, test, target_column=target, group_column=group
        )
        train_parts.append(train_part)
        test_parts.append(test_part)

    stacked_train = np.hstack(train_parts)
    stacked_test = np.hstack(test_parts)
    return stacked_train, stacked_test
'ps_ind_01' ] g_fea = [ 'ps_car_13', 'ps_ind_03', 'ps_reg_03', 'ps_ind_15', 'ps_reg_01', 'ps_ind_01', 'ps_ind_05_cat' ] + cat_fea t_fea = list(set(t_fea)) g_fea = list(set(g_fea)) #proj for t in t_fea: for g in g_fea: if t != g: s_train, s_test = proj_num_on_cat(train, test, target_column=t, group_column=g) train_list.append(s_train) test_list.append(s_test) X = sparse.hstack(train_list).tocsr() X_test = sparse.hstack(test_list).tocsr() #X = train_num #X_test = test_num all_data = np.vstack([X.toarray(), X_test.toarray()]) #all_data = np.vstack([X, X_test]) scaler = StandardScaler() scaler.fit(all_data) X = scaler.transform(X.toarray()) X_test = scaler.transform(X_test.toarray()) #X = scaler.transform(X)
# Frequency-encode each categorical column: the new '<col>_count' column holds
# how often the row's value occurs in train and test combined.
for c in cat_fea + ['new_ind', 'new_reg', 'new_car']:
    count_col = '%s_count' % c
    d = pd.concat([train[c], test[c]]).value_counts().to_dict()
    # value_counts() drops NaN by default, so a missing value is absent from
    # the dict and falls back to a count of 0.
    train[count_col] = train[c].apply(lambda x: d.get(x, 0))
    test[count_col] = test[c].apply(lambda x: d.get(x, 0))
    cat_count_features.append(count_col)

print(train_num.dtypes)

# Base feature blocks: the numeric frame with inf / -inf / NaN replaced by 0,
# followed by the count-encoded columns created above.
train_list = [
    train_num.replace([np.inf, -np.inf, np.nan], 0),
    train[cat_count_features],
]
test_list = [
    test_num.replace([np.inf, -np.inf, np.nan], 0),
    test[cat_count_features],
]

# Projection features: every target column is projected onto every other
# grouping column via proj_num_on_cat (defined elsewhere in this file).
proj_targets = [
    'ps_car_13', 'ps_ind_03', 'ps_reg_03',
    'ps_ind_15', 'ps_reg_01', 'ps_ind_01',
]
# Grouping columns are the same numeric columns plus one categorical column.
proj_groups = proj_targets + ['ps_ind_05_cat']
for t in proj_targets:
    for g in proj_groups:
        if t != g:
            s_train, s_test = proj_num_on_cat(
                train, test, target_column=t, group_column=g
            )
            train_list.append(s_train)
            test_list.append(s_test)

# Stack all blocks column-wise and densify exactly once; the dense arrays are
# reused for both fitting and transforming instead of calling .toarray()
# a second time on each matrix.
X = sparse.hstack(train_list).toarray()
X_test = sparse.hstack(test_list).toarray()

# Fit the scaler on train and test rows together so that both sets are
# standardized with the same per-column statistics.
all_data = np.vstack([X, X_test])
scaler = StandardScaler()
scaler.fit(all_data)
X = scaler.transform(X)
X_test = scaler.transform(X_test)