from util import Dataset, load_prediction, hstack


def load_x(ds, preset):
    # Stack the raw feature parts with any (optionally transformed) stacked predictions.
    feature_parts = [Dataset.load_part(ds, part) for part in preset.get('features', [])]
    prediction_parts = [load_prediction(ds, p, mode=preset.get('predictions_mode', 'fulltrain')) for p in preset.get('predictions', [])]
    prediction_parts = [p.clip(lower=0.1).values.reshape((p.shape[0], 1)) for p in prediction_parts]

    if 'prediction_transform' in preset:
        # Apply eagerly (a bare map() would stay a lazy iterator under Python 3)
        prediction_parts = [preset['prediction_transform'](p) for p in prediction_parts]

    return hstack(feature_parts + prediction_parts)
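# Usage sketch (illustrative only; the part and prediction names below are
# hypothetical, but the keys are exactly the ones load_x reads above):
#
#   example_preset = {
#       'features': ['numeric', 'categorical_dummy'],
#       'predictions': ['xgb-base'],
#       'predictions_mode': 'fulltrain',
#   }
#   train_x = load_x('train', example_preset)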
import pandas as pd

from util import Dataset

for name in ['train', 'test']:
    print "Processing %s..." % name

    num = pd.DataFrame(Dataset.load_part(name, 'numeric'),
                       columns=Dataset.get_part_features('numeric'))
    df = pd.DataFrame(index=num.index)

    df['diff_1_6'] = num['cont1'] - num['cont6']
    df['diff_1_9'] = num['cont1'] - num['cont9']
    df['diff_1_10'] = num['cont1'] - num['cont10']
    df['diff_6_9'] = num['cont6'] - num['cont9']
    df['diff_6_10'] = num['cont6'] - num['cont10']
    df['diff_6_11'] = num['cont6'] - num['cont11']
    df['diff_6_12'] = num['cont6'] - num['cont12']
    df['diff_6_13'] = num['cont6'] - num['cont13']
    df['diff_7_11'] = num['cont7'] - num['cont11']
    df['diff_7_12'] = num['cont7'] - num['cont12']
    df['diff_11_12'] = num['cont11'] - num['cont12']

    if name == 'train':
        Dataset.save_part_features('numeric_combinations', list(df.columns))

    Dataset(numeric_combinations=df.values).save(name)

print "Done."
from sklearn.preprocessing import LabelEncoder

from util import Dataset


def y_decode(y):
    # Rebuild the encoder from the saved label order, then map integer class
    # predictions back to the original labels.
    original_labels = Dataset.load_part('train', 'target_labels')
    le = LabelEncoder()
    le.classes_ = original_labels
    return le.inverse_transform([int(i) for i in y])
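# Usage sketch (illustrative): y is expected to hold numeric class indices
# aligned with the label order saved in the 'target_labels' part:
#
#   preds = model.predict(test_x)   # e.g. array([0., 2., 1., ...])
#   labels = y_decode(preds)        # back to the original label values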
print("Preset: %s" % args.preset)

preset = presets[args.preset]

feature_builders = preset.get('feature_builders', [])

n_bags = preset.get('n_bags', 1)
n_splits = preset.get('n_splits', 1)

y_aggregator = preset.get('agg', mode_agg)
y_transform, y_inv_transform = preset.get('y_transform',
                                          (lambda y: y, lambda y: y))

print("Loading train data...")
train_x = load_x('train', preset)
train_y = Dataset.load_part('train', 'target')
train_p = np.zeros((train_x.shape[0], n_splits * n_bags))
train_r = Dataset.load('train',
                       parts=np.unique(
                           sum([b.requirements for b in feature_builders],
                               ['target'])))

feature_names = extract_feature_names(preset)
print(args.optimize)

if args.optimize:
    opt_train_idx, opt_eval_idx = train_test_split(range(len(train_y)),
                                                   test_size=0.2)

    opt_train_x = train_x[opt_train_idx]
    opt_train_y = train_y[opt_train_idx]
示例#5
0
import numpy as np

from scipy.stats import skew, boxcox

from tqdm import tqdm
from util import Dataset

print("Loading data...")

train_num = Dataset.load_part('train', 'numeric_mean')
test_num = Dataset.load_part('test', 'numeric_mean')

train_num_enc = np.zeros(train_num.shape, dtype=np.float32)
test_num_enc = np.zeros(test_num.shape, dtype=np.float32)

with tqdm(total=train_num.shape[1], desc='  Transforming', unit='cols') as pbar:
    for col in range(train_num.shape[1]):
        values = np.hstack((train_num[:, col], test_num[:, col]))
        sk = skew(values)

        # Box-Cox needs strictly positive input, hence the +1 shift
        if sk > 0.25:
            values_enc, lam = boxcox(values + 1)

            train_num_enc[:, col] = values_enc[:train_num.shape[0]]
            test_num_enc[:, col] = values_enc[train_num.shape[0]:]
        else:
            train_num_enc[:, col] = train_num[:, col]
            test_num_enc[:, col] = test_num[:, col]

        pbar.update(1)
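print("Saving...")

# Plausible continuation (assumption): persist the transformed columns as a
# new part, reusing the 'numeric_mean' feature names. The part name
# 'numeric_mean_boxcox' is hypothetical but follows the save pattern used by
# the other feature scripts in this collection.
Dataset.save_part_features('numeric_mean_boxcox', Dataset.get_part_features('numeric_mean'))
Dataset(numeric_mean_boxcox=train_num_enc).save('train')
Dataset(numeric_mean_boxcox=test_num_enc).save('test')

print("Done.")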
import numpy as np
import scipy.sparse as sp

from tqdm import tqdm
from util import Dataset

print("Loading data...")

min_freq = 10

train_cat = Dataset.load_part('train', 'cat_manual')
test_cat = Dataset.load_part('test', 'cat_manual')

train_cat_enc = []
test_cat_enc = []

cats = Dataset.get_part_features('cat_manual')
features = []

with tqdm(total=len(cats), desc='  Encoding', unit='cols') as pbar:
    for col, cat in enumerate(cats):
        value_counts = dict(
            zip(*np.unique(train_cat[:, col], return_counts=True)))

        train_rares = np.zeros(train_cat.shape[0], dtype=np.uint8)
        test_rares = np.zeros(test_cat.shape[0], dtype=np.uint8)

        for val in value_counts:
            if value_counts[val] >= min_freq:
                features.append('%s_%s' % (cat, val))
                # One sparse 0/1 indicator column per frequent value
                train_cat_enc.append(sp.csr_matrix((train_cat[:, col] == val).astype(np.uint8).reshape(-1, 1)))
                test_cat_enc.append(sp.csr_matrix((test_cat[:, col] == val).astype(np.uint8).reshape(-1, 1)))
            else:
                # Pool infrequent values into a shared "rare" indicator
                train_rares += (train_cat[:, col] == val).astype(np.uint8)
                test_rares += (test_cat[:, col] == val).astype(np.uint8)

        pbar.update(1)
import numpy as np
import scipy.sparse as sp
import pandas as pd
from sklearn.preprocessing import scale

from tqdm import tqdm
from util import Dataset

print("Loading data...")

idx = Dataset.load_part("train", 'id')

train_cat = pd.DataFrame(Dataset.load_part("train", 'categorical_mode'),
                         columns=Dataset.get_part_features('categorical_mode'),
                         index=idx)
train_num = pd.DataFrame(Dataset.load_part("train", 'numeric_mean'),
                         columns=Dataset.get_part_features('numeric_mean'),
                         index=idx)

train = pd.concat([train_cat, train_num], axis=1)

idx = Dataset.load_part("test", 'id')

test_cat = pd.DataFrame(Dataset.load_part("test", 'categorical_mode'),
                        columns=Dataset.get_part_features('categorical_mode'),
                        index=idx)
test_num = pd.DataFrame(Dataset.load_part("test", 'numeric_mean'),
                        columns=Dataset.get_part_features('numeric_mean'),
                        index=idx)

test = pd.concat([test_cat, test_num], axis=1)
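# 'scale' is imported above but unused in this excerpt; a plausible
# continuation (assumption) standardizes the numeric columns jointly over
# train and test, matching the scaling done before the SVD snippet below:
all_num_values = scale(pd.concat([train[train_num.columns],
                                  test[test_num.columns]]).values)
train[train_num.columns] = all_num_values[:len(train)]
test[test_num.columns] = all_num_values[len(train):]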
import numpy as np
import scipy.sparse as sp

from tqdm import tqdm
from util import Dataset

print "Loading data..."

min_freq = 10

train_cat = Dataset.load_part('train', 'categorical')
test_cat = Dataset.load_part('test', 'categorical')

train_cat_enc = []
test_cat_enc = []

cats = Dataset.get_part_features('categorical')
features = []

with tqdm(total=len(cats), desc='  Encoding', unit='cols') as pbar:
    for col, cat in enumerate(cats):
        value_counts = dict(
            zip(*np.unique(train_cat[:, col], return_counts=True)))

        train_rares = np.zeros(train_cat.shape[0], dtype=np.uint8)
        test_rares = np.zeros(test_cat.shape[0], dtype=np.uint8)

        for val in value_counts:
            if value_counts[val] >= min_freq:
                features.append('%s_%s' % (cat, val))
                # One sparse 0/1 indicator column per frequent value
                train_cat_enc.append(sp.csr_matrix((train_cat[:, col] == val).astype(np.uint8).reshape(-1, 1)))
                test_cat_enc.append(sp.csr_matrix((test_cat[:, col] == val).astype(np.uint8).reshape(-1, 1)))
            else:
                # Pool infrequent values into a shared "rare" indicator
                train_rares += (train_cat[:, col] == val).astype(np.uint8)
                test_rares += (test_cat[:, col] == val).astype(np.uint8)

        pbar.update(1)
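# Plausible tail for this snippet (assumption; the part name
# 'categorical_counts' is hypothetical): stack the indicator columns into one
# sparse matrix per split and save it like the other parts:
#
#   Dataset.save_part_features('categorical_counts', features)
#   Dataset(categorical_counts=sp.hstack(train_cat_enc, format='csr')).save('train')
#   Dataset(categorical_counts=sp.hstack(test_cat_enc, format='csr')).save('test')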
import numpy as np

from util import Dataset, vstack, hstack

from sklearn.preprocessing import scale
from sklearn.decomposition import TruncatedSVD

n_components = 500  # 500 components explain 99.8% of variance

print "Loading data..."

train_num = Dataset.load_part('train', 'numeric')
train_cat = Dataset.load_part('train', 'categorical_dummy')

test_num = Dataset.load_part('test', 'numeric')
test_cat = Dataset.load_part('test', 'categorical_dummy')

train_cnt = train_num.shape[0]

print "Combining data..."

all_data = hstack((scale(vstack(
    (train_num, test_num)).astype(np.float64)).astype(np.float32),
                   vstack((train_cat, test_cat))))

del train_num, train_cat, test_num, test_cat

print "Fitting svd..."

svd = TruncatedSVD(n_components)
res = svd.fit_transform(all_data)
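print("Saving...")

# Plausible continuation (assumption; 'svd' as a part name is hypothetical):
# split the projection back into train/test rows and save it as a new part,
# following the save pattern used elsewhere in this collection.
Dataset.save_part_features('svd', ['svd%d' % i for i in range(n_components)])
Dataset(svd=res[:train_cnt]).save('train')
Dataset(svd=res[train_cnt:]).save('test')

print("Done.")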
import numpy as np
import scipy.sparse as sp
from scipy.stats import boxcox
import pandas as pd
from sklearn.preprocessing import scale

from tqdm import tqdm
from util import Dataset

print("Loading data...")

idx = Dataset.load_part("train", 'id')

train_num = pd.DataFrame(Dataset.load_part("train", 'numeric_mean'), columns=Dataset.get_part_features('numeric_mean'), index=idx)

idx = Dataset.load_part("test", 'id')

test_num = pd.DataFrame(Dataset.load_part("test", 'numeric_mean'), columns=Dataset.get_part_features('numeric_mean'), index=idx)


all_num = pd.concat([train_num, test_num])
print(all_num.head())

all_num_norm = pd.DataFrame()
# log1p compresses the long right tails of these skewed amounts
all_num_norm["ApplicantIncome"] = np.log1p(all_num.ApplicantIncome)
all_num_norm["CoapplicantIncome"] = np.log1p(all_num.CoapplicantIncome)
all_num_norm["LoanAmount"] = np.log1p(all_num.LoanAmount)
all_num_norm["Loan_Amount_Term"] = np.log1p(all_num.Loan_Amount_Term)

train_custom = all_num_norm[:train_num.shape[0]]
test_custom = all_num_norm[train_num.shape[0]:]
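# Plausible continuation (assumption; the part name 'custom' is hypothetical):
# save the log-transformed columns, mirroring the other feature scripts.
Dataset.save_part_features('custom', list(all_num_norm.columns))
Dataset(custom=train_custom.values).save('train')
Dataset(custom=test_custom.values).save('test')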
import pandas as pd
import sys

from sklearn.metrics import mean_absolute_error

from util import Dataset, load_prediction

df = pd.DataFrame({'loss': Dataset.load_part('train', 'loss')},
                  index=Dataset.load_part('train', 'id'))

edges = df['loss'].quantile([0.2, 0.4, 0.6, 0.8]).values

df['bucket'] = len(edges)
for i in reversed(range(len(edges))):
    df.loc[df['loss'] <= edges[i], 'bucket'] = i

pred = load_prediction('train', sys.argv[1])

errs = (pd.Series(pred, index=df.index) - df['loss']).abs()

print(errs.groupby(df['bucket']).mean())
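# Invocation sketch (the script file name is hypothetical): the single
# command-line argument is the name of a saved prediction that
# load_prediction can resolve, e.g.
#
#   python bucket-errors.py xgb-base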
print("Preset: %s" % args.preset)

preset = presets[args.preset]

feature_builders = preset.get('feature_builders', [])

n_bags = preset.get('n_bags', 1)
n_splits = preset.get('n_splits', 1)

y_aggregator = preset.get('agg', mode_agg)
y_transform, y_inv_transform = preset.get('y_transform', (lambda y: y, lambda y: y))

print("Loading train data...")
train_x = load_x('train', preset)
train_y = Dataset.load_part('train', 'target')
train_p = np.zeros((train_x.shape[0], n_splits * n_bags))
train_r = Dataset.load('train', parts=np.unique(sum([b.requirements for b in feature_builders], ['target'])))

feature_names = extract_feature_names(preset)
print(args.optimize)

if args.optimize:
    opt_train_idx, opt_eval_idx = train_test_split(range(len(train_y)), test_size=0.2)

    opt_train_x = train_x[opt_train_idx]
    opt_train_y = train_y[opt_train_idx]
    opt_train_r = train_r.slice(opt_train_idx)

    opt_eval_x = train_x[opt_eval_idx]
    opt_eval_y = train_y[opt_eval_idx]
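    # Natural continuation (assumption), mirroring the train side above:
    opt_eval_r = train_r.slice(opt_eval_idx)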
import pandas as pd

from util import Dataset

for name in ['train', 'test']:
    print "Processing %s..." % name

    idx = Dataset.load_part(name, 'id')

    # Load parts
    numeric = pd.DataFrame(Dataset.load_part(name, 'numeric'),
                           columns=Dataset.get_part_features('numeric'),
                           index=idx)
    numeric_lin = pd.DataFrame(
        Dataset.load_part(name, 'numeric_lin'),
        columns=Dataset.get_part_features('numeric_lin'),
        index=idx)

    # Build features
    df = pd.DataFrame(index=idx)
    #df['cont14'] = numeric['cont14']
    df['cont_1_9_diff'] = numeric_lin['cont9'] - numeric_lin['cont1']

    # Save column names
    if name == 'train':
        Dataset.save_part_features('manual', list(df.columns))

    Dataset(manual=df.values).save(name)

print "Done."
import pandas as pd
import numpy as np

import sys

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from statsmodels.regression.quantile_regression import QuantReg

from util import Dataset

pred_name = sys.argv[1]

n_folds = 8

train_y = Dataset.load_part('train', 'loss')
train_x = pd.read_csv('preds/%s-train.csv' % pred_name)['loss'].values

orig_maes = []
corr_maes = []

for fold, (fold_train_idx, fold_eval_idx) in enumerate(
        KFold(n_splits=n_folds, shuffle=True, random_state=2016).split(train_x)):
    fold_train_x = train_x[fold_train_idx]
    fold_train_y = train_y[fold_train_idx]

    fold_eval_x = train_x[fold_eval_idx]
    fold_eval_y = train_y[fold_eval_idx]

    model = QuantReg(fold_train_y, fold_train_x).fit(q=0.5)
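    # Plausible rest of the fold loop (assumption): the no-intercept median
    # regression learns an MAE-optimal multiplicative rescaling of the raw
    # predictions; compare MAE before and after on the held-out fold.
    fold_eval_pred = model.predict(fold_eval_x)

    orig_maes.append(mean_absolute_error(fold_eval_y, fold_eval_x))
    corr_maes.append(mean_absolute_error(fold_eval_y, fold_eval_pred))

    print("Fold %d: orig MAE %.5f, corrected MAE %.5f" % (fold, orig_maes[-1], corr_maes[-1]))

print("Overall: orig MAE %.5f, corrected MAE %.5f" % (np.mean(orig_maes), np.mean(corr_maes)))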