def Multiply_Divide(train, test, features):
    """Build interaction features for every unordered pair in ``features``.

    Each pair, together with its running index, is passed to
    ``interaction_features``. ``train`` and ``test`` are rebound to whatever
    that call returns, and the feature names it reports are collected in
    call order.

    Pairs are produced by ``combinations`` (presumably
    ``itertools.combinations``), for example:
        combinations(['A', 'B', 'C'], 2) returns AB AC BC
        combinations(range(4), 3) --> 012 013 023 123

    Returns:
        A ``(train, test, feature_names)`` tuple holding the last values
        returned for ``train`` and ``test`` plus the list of collected names.
        With fewer than two features no pair exists, so the inputs are
        returned unchanged with an empty list.
    """
    collected_names = []
    for pair_index, (first, second) in enumerate(combinations(features, 2)):
        train, test, new_names = interaction_features(
            train, test, first, second, pair_index)
        collected_names.extend(new_names)
    return train, test, collected_names
Exemplo n.º 2
0
# Keep the test identifiers aside and remove them from the feature frame.
test_id = test['id']
del test['id']

# Column groups selected by substring match on the column name.
cat_fea = [x for x in list(train) if 'cat' in x]
bin_fea = [x for x in list(train) if 'bin' in x]

# Per-row count of cells equal to -1, stored as float.
# NOTE(review): -1 presumably marks a missing value in this dataset - confirm.
train['missing'] = (train == -1).sum(axis=1).astype(float)
test['missing'] = (test == -1).sum(axis=1).astype(float)

# include interactions
# Every unordered pair of the listed columns is passed, with its running
# index, to interaction_features, which returns the updated train/test.
for e, (x, y) in enumerate(
        combinations([
            'ps_car_13', 'ps_ind_03', 'ps_reg_03', 'ps_ind_15', 'ps_reg_01',
            'ps_ind_01'
        ], 2)):
    train, test = interaction_features(train, test, x, y, e)

num_features = [c for c in list(train) if ('cat' not in c and 'calc' not in c)]
# NOTE(review): 'missing' was assigned as a train column above, so unless
# interaction_features drops it, it is already in num_features and this
# append adds a second entry. Kept as-is because code outside this view may
# depend on the list's exact contents; the membership test below is
# unaffected by the duplicate.
num_features.append('missing')
inter_fea = [x for x in list(train) if 'inter' in x]
#train['cat_sum'] = train[cat_fea].sum(axis=1)
#test['cat_sum'] = test[cat_fea].sum(axis=1)

#X = train.as_matrix()
#X_test = test.as_matrix()
#print(X.shape, X_test.shape)
#ohe
# Assuming this is scikit-learn's OneHotEncoder: sparse output is the
# default, and the `sparse=` keyword was renamed to `sparse_output` and later
# removed, so relying on the default works across versions.
ohe = OneHotEncoder()

# DataFrame.as_matrix() was removed in pandas 1.0; `.values` yields the same
# ndarray and exists in both old and new pandas releases.
train_cat = train[cat_fea].values
train_num = train[[x for x in list(train) if x in num_features]]
Exemplo n.º 3
0
# Set the train identifiers aside, then drop the id and target columns.
train_id = train.pop('id')
del train['target']

# Load the test set and set its identifiers aside as well.
test = pd.read_csv("../input/test.csv")
test_id = test.pop('id')

# Column groups selected by substring match on the column name.
cat_fea = [col for col in train if 'cat' in col]
bin_fea = [col for col in train if 'bin' in col]

# Per-row count of cells equal to -1, stored as float, on both frames.
for frame in (train, test):
    frame['missing'] = (frame == -1).sum(axis=1).astype(float)

# include interactions
# Every unordered pair of these columns goes, with its running index, to
# interaction_features, which hands back the updated train/test.
interaction_pairs = combinations(
    ['ps_car_13', 'ps_ind_03', 'ps_reg_03', 'ps_ind_15', 'ps_reg_01',
     'ps_ind_01'],
    2,
)
for e, (x, y) in enumerate(interaction_pairs):
    train, test = interaction_features(train, test, x, y, e)

# Columns whose name contains neither 'cat' nor 'calc', plus 'missing'.
num_features = [col for col in train if not ('cat' in col or 'calc' in col)]
num_features += ['missing']
inter_fea = [col for col in train if 'inter' in col]
# Disabled experiment, kept for reference:
#train['cat_sum'] = train[cat_fea].sum(axis=1)
#test['cat_sum'] = test[cat_fea].sum(axis=1)

# Attach precomputed features: each matching pickle in the input directory
# unpacks into two objects, stored under the file name as a new column of
# train and test respectively.
path = "../input/"
num_features_comb = []
for p in os.listdir(path):
    if ('ps_reg_02___ps_car_07_cat' in p
            or 'ps_reg_01___ps_car_13___ps_car_15' in p):
        print(p)
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        x, xt = pd.read_pickle(path + p)
        train[p], test[p] = x, xt