def load_x(ds, preset):
    """Assemble the design matrix for dataset *ds* ('train' or 'test').

    Horizontally stacks the raw feature parts named in ``preset['features']``
    with one column per out-of-fold prediction named in
    ``preset['predictions']``.
    """
    feature_parts = [Dataset.load_part(ds, part) for part in preset.get('features', [])]

    prediction_parts = [load_prediction(ds, p, mode=preset.get('predictions_mode', 'fulltrain'))
                        for p in preset.get('predictions', [])]

    # Clip at 0.1 to guard against zero/negative predictions before any
    # downstream transform (e.g. log), then shape each as a single column.
    prediction_parts = [p.clip(lower=0.1).values.reshape((p.shape[0], 1)) for p in prediction_parts]

    if 'prediction_transform' in preset:
        # BUG FIX: a bare map() returns a lazy iterator on Python 3, and the
        # `feature_parts + prediction_parts` concatenation below then raises
        # TypeError. Materialize with a list comprehension instead.
        prediction_parts = [preset['prediction_transform'](p) for p in prediction_parts]

    return hstack(feature_parts + prediction_parts)
import pandas as pd

from util import Dataset

# Pairs (a, b) of numeric columns whose difference cont<a> - cont<b> is
# emitted as a feature; order matches the original hand-written columns.
DIFF_PAIRS = [
    (1, 6), (1, 9), (1, 10),
    (6, 9), (6, 10), (6, 11), (6, 12), (6, 13),
    (7, 11), (7, 12),
    (11, 12),
]

for name in ['train', 'test']:
    print("Processing %s..." % name)

    num = pd.DataFrame(Dataset.load_part(name, 'numeric'),
                       columns=Dataset.get_part_features('numeric'))

    df = pd.DataFrame(index=num.index)
    # One loop replaces eleven copy-pasted assignments; column names and
    # order are identical to the original.
    for a, b in DIFF_PAIRS:
        df['diff_%d_%d' % (a, b)] = num['cont%d' % a] - num['cont%d' % b]

    # Feature names only need to be registered once.
    if name == 'train':
        Dataset.save_part_features('numeric_combinations', list(df.columns))

    Dataset(numeric_combinations=df.values).save(name)

print("Done.")
def y_decode(y):
    """Map integer-encoded predictions back to the original target labels.

    Builds a LabelEncoder whose classes are the stored train target labels
    and inverts the encoding for every value in *y*.
    """
    encoder = LabelEncoder()
    encoder.classes_ = Dataset.load_part("train", "target_labels")
    return encoder.inverse_transform([int(v) for v in y])
print("Preset: %s" % args.preset)

preset = presets[args.preset]

# Preset-level knobs, all optional with defaults.
feature_builders = preset.get('feature_builders', [])
n_bags = preset.get('n_bags', 1)
n_splits = preset.get('n_splits', 1)
y_aggregator = preset.get('agg', mode_agg)
# Forward / inverse target transforms; identity when the preset omits them.
y_transform, y_inv_transform = preset.get('y_transform', (lambda y: y, lambda y: y))

print("Loading train data...")
train_x = load_x('train', preset)
train_y = Dataset.load_part('train', 'target')
train_p = np.zeros((train_x.shape[0], n_splits * n_bags))
# Raw parts every feature builder needs, plus the target itself.
train_r = Dataset.load('train', parts=np.unique(sum([b.requirements for b in feature_builders], ['target'])))

feature_names = extract_feature_names(preset)

print(args.optimize)
if args.optimize:
    # Hold out 20% of the train rows for hyper-parameter evaluation.
    opt_train_idx, opt_eval_idx = train_test_split(range(len(train_y)), test_size=0.2)

    opt_train_x = train_x[opt_train_idx]
    opt_train_y = train_y[opt_train_idx]
import numpy as np
from scipy.stats import skew, boxcox
from tqdm import tqdm

from util import Dataset

print("Loading data...")
train_num = Dataset.load_part('train', 'numeric_mean')
test_num = Dataset.load_part('test', 'numeric_mean')

train_num_enc = np.zeros(train_num.shape, dtype=np.float32)
test_num_enc = np.zeros(test_num.shape, dtype=np.float32)

with tqdm(total=train_num.shape[1], desc=' Transforming', unit='cols') as pbar:
    for col in range(train_num.shape[1]):
        # Fit the transform on train+test jointly so both splits share the
        # same Box-Cox parameter.
        values = np.hstack((train_num[:, col], test_num[:, col]))
        # BUG FIX: removed leftover debug `print(values)` that dumped the
        # full concatenated column to stdout on every iteration.

        sk = skew(values)

        if sk > 0.25:
            # boxcox requires strictly positive input, hence the +1 shift.
            values_enc, lam = boxcox(values + 1)

            train_num_enc[:, col] = values_enc[:train_num.shape[0]]
            test_num_enc[:, col] = values_enc[train_num.shape[0]:]
        else:
            # Nearly symmetric column: copy through unchanged.
            train_num_enc[:, col] = train_num[:, col]
            test_num_enc[:, col] = test_num[:, col]

        pbar.update(1)
import numpy as np
import scipy.sparse as sp
from tqdm import tqdm

from util import Dataset

print("Loading data...")

# Category values seen fewer than `min_freq` times in train are treated as rare.
min_freq = 10

train_cat = Dataset.load_part('train', 'cat_manual')
test_cat = Dataset.load_part('test', 'cat_manual')

train_cat_enc = []
test_cat_enc = []

cats = Dataset.get_part_features('categorical')
features = []

with tqdm(total=len(cats), desc=' Encoding', unit='cols') as pbar:
    for col, cat in enumerate(cats):
        # value -> occurrence count, computed from the train split only.
        value_counts = dict(zip(*np.unique(train_cat[:, col], return_counts=True)))

        # Per-row flags marking rows whose value is rare (filled later,
        # presumably in the truncated remainder — TODO confirm).
        train_rares = np.zeros(train_cat.shape[0], dtype=np.uint8)
        test_rares = np.zeros(test_cat.shape[0], dtype=np.uint8)

        for val in value_counts:
            if value_counts[val] >= min_freq:
                # Frequent value: gets its own dummy column.
                features.append('%s_%s' % (cat, val))

                # NOTE(review): the source chunk is truncated here — this
                # append() call continues beyond the visible excerpt.
                train_cat_enc.append(
import numpy as np
import scipy.sparse as sp
import pandas as pd
from sklearn.preprocessing import scale
from tqdm import tqdm

from util import Dataset

print("Loading data...")

# Train split: categorical-mode + numeric-mean parts, indexed by example id.
idx = Dataset.load_part("train", 'id')
train_cat = pd.DataFrame(Dataset.load_part("train", 'categorical_mode'),
                         columns=Dataset.get_part_features('categorical_mode'),
                         index=idx)
train_num = pd.DataFrame(Dataset.load_part("train", 'numeric_mean'),
                         columns=Dataset.get_part_features('numeric_mean'),
                         index=idx)
train = pd.concat([train_cat, train_num], axis=1)

# Test split, assembled the same way.
idx = Dataset.load_part("test", 'id')
test_cat = pd.DataFrame(Dataset.load_part("test", 'categorical_mode'),
                        columns=Dataset.get_part_features('categorical_mode'),
                        index=idx)
test_num = pd.DataFrame(Dataset.load_part("test", 'numeric_mean'),
                        columns=Dataset.get_part_features('numeric_mean'),
                        index=idx)
test = pd.concat([test_cat, test_num], axis=1)
import numpy as np
import scipy.sparse as sp
from tqdm import tqdm

from util import Dataset

print "Loading data..."

# Category values seen fewer than `min_freq` times in train are treated as rare.
min_freq = 10

train_cat = Dataset.load_part('train', 'categorical')
test_cat = Dataset.load_part('test', 'categorical')

train_cat_enc = []
test_cat_enc = []

cats = Dataset.get_part_features('categorical')
features = []

with tqdm(total=len(cats), desc=' Encoding', unit='cols') as pbar:
    for col, cat in enumerate(cats):
        # value -> occurrence count, computed from the train split only.
        value_counts = dict(zip(*np.unique(train_cat[:, col], return_counts=True)))

        # Per-row flags marking rows whose value is rare (filled later,
        # presumably in the truncated remainder — TODO confirm).
        train_rares = np.zeros(train_cat.shape[0], dtype=np.uint8)
        test_rares = np.zeros(test_cat.shape[0], dtype=np.uint8)

        for val in value_counts:
            if value_counts[val] >= min_freq:
                # Frequent value: gets its own dummy column.
                features.append('%s_%s' % (cat, val))

                # NOTE(review): the source chunk is truncated here — this
                # append() call continues beyond the visible excerpt.
                train_cat_enc.append(
import numpy as np

from util import Dataset, vstack, hstack

from sklearn.preprocessing import scale
from sklearn.decomposition import TruncatedSVD

n_components = 500  # 500 components explain 99.8% of variance

# BUG FIX: converted Python-2-only print statements to print() calls —
# identical output, and consistent with the other scripts in this project.
print("Loading data...")
train_num = Dataset.load_part('train', 'numeric')
train_cat = Dataset.load_part('train', 'categorical_dummy')

test_num = Dataset.load_part('test', 'numeric')
test_cat = Dataset.load_part('test', 'categorical_dummy')

train_cnt = train_num.shape[0]

print("Combining data...")
# Scale the numeric block (in float64 for precision, stored back as float32)
# and append the dummy-encoded categoricals alongside it.
all_data = hstack((scale(vstack((train_num, test_num)).astype(np.float64)).astype(np.float32),
                   vstack((train_cat, test_cat))))

# Free the per-part arrays before the SVD allocates its workspace.
del train_num, train_cat, test_num, test_cat

print("Fitting svd...")
svd = TruncatedSVD(n_components)
res = svd.fit_transform(all_data)
import numpy as np
import scipy.sparse as sp
from scipy.stats import boxcox
import pandas as pd
from sklearn.preprocessing import scale
from tqdm import tqdm

from util import Dataset

print("Loading data...")

idx = Dataset.load_part("train", 'id')
train_num = pd.DataFrame(Dataset.load_part("train", 'numeric_mean'),
                         columns=Dataset.get_part_features('numeric_mean'),
                         index=idx)

idx = Dataset.load_part("test", 'id')
test_num = pd.DataFrame(Dataset.load_part("test", 'numeric_mean'),
                        columns=Dataset.get_part_features('numeric_mean'),
                        index=idx)

# BUG FIX: DataFrame.append was deprecated and removed in pandas 2.0;
# pd.concat produces the same stacked frame (train rows first).
all_nData = pd.concat([train_num, test_num])
print(all_nData.head())

# Log-transform the skewed monetary/term columns.
all_num_norm = pd.DataFrame()
all_num_norm["ApplicantIncome"] = np.log1p(all_nData.ApplicantIncome)
all_num_norm["CoapplicantIncome"] = np.log1p(all_nData.CoapplicantIncome)
all_num_norm["LoanAmount"] = np.log1p(all_nData.LoanAmount)
all_num_norm["Loan_Amount_Term"] = np.log1p(all_nData.Loan_Amount_Term)

# Split back into the original train/test row blocks.
train_custom = all_num_norm[:train_num.shape[0]]
test_custom = all_num_norm[train_num.shape[0]:]
import pandas as pd
import sys

from sklearn.metrics import mean_absolute_error

from util import Dataset, load_prediction

df = pd.DataFrame({'loss': Dataset.load_part('train', 'loss')},
                  index=Dataset.load_part('train', 'id'))

# Quintile edges of the target: bucket i holds losses <= edges[i],
# bucket len(edges) holds the top quintile.
edges = df['loss'].quantile([0.2, 0.4, 0.6, 0.8]).values

df['bucket'] = len(edges)
# BUG FIX: `xrange` and the `print` statement are Python-2-only and raise
# SyntaxError/NameError on Python 3; range/print() behave identically here.
for i in reversed(range(len(edges))):
    df.loc[df['loss'] <= edges[i], 'bucket'] = i

pred = load_prediction('train', sys.argv[1])
errs = (pd.Series(pred, index=df.index) - df['loss']).abs()

# Mean absolute error per loss bucket.
print(errs.groupby(df['bucket']).mean())
print("Preset: %s" % args.preset)

preset = presets[args.preset]

# Preset-level knobs, all optional with defaults.
feature_builders = preset.get('feature_builders', [])
n_bags = preset.get('n_bags', 1)
n_splits = preset.get('n_splits', 1)
y_aggregator = preset.get('agg', mode_agg)
# Forward / inverse target transforms; identity when the preset omits them.
y_transform, y_inv_transform = preset.get('y_transform', (lambda y: y, lambda y: y))

print("Loading train data...")
train_x = load_x('train', preset)
train_y = Dataset.load_part('train', 'target')
train_p = np.zeros((train_x.shape[0], n_splits * n_bags))
# Raw parts every feature builder needs, plus the target itself.
train_r = Dataset.load('train', parts=np.unique(sum([b.requirements for b in feature_builders], ['target'])))

feature_names = extract_feature_names(preset)

print(args.optimize)
if args.optimize:
    # Hold out 20% of the train rows for hyper-parameter evaluation.
    opt_train_idx, opt_eval_idx = train_test_split(range(len(train_y)), test_size=0.2)

    opt_train_x = train_x[opt_train_idx]
    opt_train_y = train_y[opt_train_idx]
    opt_train_r = train_r.slice(opt_train_idx)

    opt_eval_x = train_x[opt_eval_idx]
    opt_eval_y = train_y[opt_eval_idx]
import pandas as pd

from util import Dataset

for name in ['train', 'test']:
    print("Processing %s..." % name)

    idx = Dataset.load_part(name, 'id')

    # Load parts
    # BUG FIX: the raw 'numeric' part was being labelled with the
    # 'numeric_lin' feature list (copy-paste mismatch); use the feature
    # names that actually belong to the 'numeric' part.
    numeric = pd.DataFrame(Dataset.load_part(name, 'numeric'),
                           columns=Dataset.get_part_features('numeric'),
                           index=idx)
    numeric_lin = pd.DataFrame(Dataset.load_part(name, 'numeric_lin'),
                               columns=Dataset.get_part_features('numeric_lin'),
                               index=idx)

    # Build features
    df = pd.DataFrame(index=idx)
    #df['cont14'] = numeric['cont14']
    df['cont_1_9_diff'] = numeric_lin['cont9'] - numeric_lin['cont1']

    # Save column names (once, from the train pass)
    if name == 'train':
        Dataset.save_part_features('manual', list(df.columns))

    Dataset(manual=df.values).save(name)

print("Done.")
import pandas as pd
import numpy as np
import sys

from sklearn.metrics import mean_absolute_error
from sklearn.cross_validation import KFold
from statsmodels.regression.quantile_regression import QuantReg

from util import Dataset

# Prediction name passed on the command line; read from preds/<name>-train.csv.
pred_name = sys.argv[1]
n_folds = 8

train_y = Dataset.load_part('train', 'loss')
train_x = pd.read_csv('preds/%s-train.csv' % pred_name)['loss'].values

# Per-fold MAEs before and after correction (presumably filled in the
# truncated remainder of this script — TODO confirm).
orig_maes = []
corr_maes = []

for fold, (fold_train_idx, fold_eval_idx) in enumerate(KFold(len(train_y), n_folds, shuffle=True, random_state=2016)):
    fold_train_x = train_x[fold_train_idx]
    fold_train_y = train_y[fold_train_idx]

    fold_eval_x = train_x[fold_eval_idx]
    fold_eval_y = train_y[fold_eval_idx]

    # Median (q=0.5) regression of the true loss on the predicted loss,
    # fit on this fold's training rows.
    # NOTE(review): sklearn.cross_validation was removed in sklearn 0.20;
    # this chunk targets an old sklearn and is truncated mid-loop here.
    model = QuantReg(fold_train_y, fold_train_x).fit(q=0.5)