def Statistic_features(train, test, target_features, group_features):
    """Build stacked projection features for every target/group column pair.

    For each ``t`` in ``target_features`` and each ``g`` in
    ``group_features`` with ``t != g``, ``proj_num_on_cat`` (defined
    elsewhere) is called with ``target_column=t`` and ``group_column=g``.
    It is expected to return a ``(train_part, test_part)`` pair. The train
    parts and the test parts are each combined with ``np.hstack``.

    Args:
        train: Training data, passed through to ``proj_num_on_cat``.
        test: Test data, passed through to ``proj_num_on_cat``.
        target_features: Iterable of target column identifiers.
        group_features: Iterable of grouping column identifiers.

    Returns:
        A 2-tuple ``(stacked_train, stacked_test)``.

    Raises:
        ValueError: From ``np.hstack`` when no pair with ``t != g`` exists,
            because there is nothing to stack.
    """
    train_blocks, test_blocks = [], []

    # Lazy pair stream: targets vary slowest, groups fastest, and a column
    # is never paired with itself.
    pairs = (
        (target, group)
        for target in target_features
        for group in group_features
        if target != group
    )
    for target, group in pairs:
        train_part, test_part = proj_num_on_cat(
            train, test, target_column=target, group_column=group
        )
        train_blocks.append(train_part)
        test_blocks.append(test_part)

    return np.hstack(train_blocks), np.hstack(test_blocks)
예제 #2
0
    'ps_ind_01'
]
# Grouping columns used for the projection features. `cat_fea` comes from
# outside this excerpt and is appended as additional grouping columns.
g_fea = [
    'ps_car_13', 'ps_ind_03', 'ps_reg_03', 'ps_ind_15', 'ps_reg_01',
    'ps_ind_01', 'ps_ind_05_cat'
] + cat_fea

# De-duplicate while keeping first-seen order. Going through set() made the
# feature order (and therefore the column order of X / X_test) depend on
# string hashing, which can differ from one interpreter run to the next.
t_fea = list(dict.fromkeys(t_fea))
g_fea = list(dict.fromkeys(g_fea))

# Projection features: one (train, test) block per ordered pair of distinct
# target / group columns, appended to the existing feature block lists.
for t in t_fea:
    for g in g_fea:
        if t != g:
            s_train, s_test = proj_num_on_cat(train,
                                              test,
                                              target_column=t,
                                              group_column=g)
            train_list.append(s_train)
            test_list.append(s_test)

# Stack all blocks side by side and densify each matrix exactly once; the
# dense arrays are reused for both fitting and transforming below.
X = sparse.hstack(train_list).toarray()
X_test = sparse.hstack(test_list).toarray()
all_data = np.vstack([X, X_test])

# NOTE(review): the scaler statistics are computed over train and test rows
# together, as in the original code.
scaler = StandardScaler()
scaler.fit(all_data)
X = scaler.transform(X)
X_test = scaler.transform(X_test)
예제 #3
0
# Count encoding: for each listed column, add a "<column>_count" column to
# both frames holding how often the row's value occurs in train and test
# combined (0 when the value has no entry in the count table), and record
# the new column name in cat_count_features.
for col in cat_fea + ['new_ind', 'new_reg', 'new_car']:
    count_col = '%s_count' % col
    combined = pd.concat([train[col], test[col]])
    freq = combined.value_counts().to_dict()
    # Train is processed before test, matching the original statement order.
    for frame in (train, test):
        frame[count_col] = frame[col].apply(lambda value: freq.get(value, 0))
    cat_count_features.append(count_col)


print(train_num.dtypes)

# Base feature blocks: the numeric frame with +/-inf and NaN replaced by 0,
# followed by the count-encoded columns listed in cat_count_features.
train_list = [train_num.replace([np.inf, -np.inf, np.nan], 0), train[cat_count_features]]
test_list = [test_num.replace([np.inf, -np.inf, np.nan], 0), test[cat_count_features]]

# Projection features. The group columns are the target columns plus one
# extra column, so the shared list is written once and extended.
proj_targets = ['ps_car_13', 'ps_ind_03', 'ps_reg_03', 'ps_ind_15', 'ps_reg_01', 'ps_ind_01']
proj_groups = proj_targets + ['ps_ind_05_cat']
for t in proj_targets:
    for g in proj_groups:
        if t != g:
            s_train, s_test = proj_num_on_cat(train, test, target_column=t, group_column=g)
            train_list.append(s_train)
            test_list.append(s_test)

# Stack all blocks side by side and densify each matrix exactly once; the
# dense arrays are reused for both fitting and transforming below.
X = sparse.hstack(train_list).toarray()
X_test = sparse.hstack(test_list).toarray()
all_data = np.vstack([X, X_test])

# NOTE(review): the scaler statistics are computed over train and test rows
# together, as in the original code.
scaler = StandardScaler()
scaler.fit(all_data)
X = scaler.transform(X)
X_test = scaler.transform(X_test)