-
Notifications
You must be signed in to change notification settings - Fork 2
/
transductor_decorrelate_models.py
102 lines (87 loc) · 4.15 KB
/
transductor_decorrelate_models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
'''
1. first, run this script in mode = 'dump' to dump predictions
from transductor models
set model range to include available transductor models, e.g. for
transductor_model_281.pkl through transductor_model_300.pkl range(281,301)
2. second, run this script in mode = 'decorrelate' to create combined
submission
n_models_to_combine are the first least correlated models to combine.
Makes little effect in practice, 20 is better, but 1 is enough.
'''
import numpy as np
import pandas as pd
from os.path import basename
import subprocess
import cPickle
import h5py
import misc as pt
from misc import add_features
np.random.seed(1337) # for reproducibility
def preprocess_and_dump_hd5(in_csv, pf_csv, X_file, scaler, chunksize=100000):
# Get number of lines in the CSV file
nlines = subprocess.check_output('wc -l %s' % in_csv, shell=True)
nlines = int(nlines.split()[0])
# Get header
df = pd.read_csv(in_csv, nrows=1)
header_row = df.columns
filter_out = ['id', 'min_ANNmuon', 'production', 'mass', 'weight', 'signal']
df = add_features(df)
features = list(f for f in df.columns if f not in filter_out)
with h5py.File(X_file, "w") as f:
X = f.create_dataset("X", (nlines-1,len(features)+1), dtype='float64')
# Iteratively read CSV
for k,i in enumerate(range(1, nlines, chunksize)):
print("iteration {}, line {}".format(k, i))
df = pd.read_csv(in_csv,
header=None, # no header
nrows=chunksize, # number of rows to read at each iteration
names=header_row, # set header
skiprows=i) # skip rows that were already read
pf = pd.read_csv(pf_csv,
header=None, # no header
nrows=chunksize, # number of rows to read at each iteration
names=["id", "prediction"], # set header
skiprows=i) # skip rows that were already read
df = add_features(df)
data = scaler.transform(df[features])
data = np.hstack((data, pf['prediction'].values.reshape(-1,1)))
X[i-1:i-1+chunksize,:] = data
model_range = range(281,301)
n_models_to_combine = 10
mode = 'decorrelate' # dump (predictions) | decorrelate (and generate submission)
if mode == 'dump':
print("Load transductor scaler")
with open(pt.transductor_scaler_file, 'rb') as fid:
scaler = cPickle.load(fid)
print("Scale and dump test data to {}".format(basename(pt.X_file)))
preprocess_and_dump_hd5(pt.test_file, pt.raw_submission_file, pt.X_file, scaler)
with h5py.File(pt.X_file, "r") as f:
X = f.get("X")
with h5py.File(pt.transductor_predictions_file, "w") as fpreds:
for k,i in enumerate(model_range):
m = pt.transductor_model_file.format(i)
print("Predict model {}/{} from {}".format(k+1,
len(model_range), basename(m)))
with open(m, 'rb') as fid:
model = cPickle.load(fid)
p = model.predict(X, batch_size=256, verbose=0)[:, 1]
if k==0: # 1st iteration
probs = fpreds.create_dataset("probs", (len(model_range),len(p)))
probs[k,:] = np.array(p)
elif mode == 'decorrelate':
print("Generate transductor submission")
with h5py.File(pt.transductor_predictions_file, "r") as fpreds:
probs = fpreds.get("probs")
corr = np.corrcoef(probs)
cr=corr.sum(axis=0)
idx = np.argsort(cr) # sorted correlations
ids = idx[:n_models_to_combine] # take first least corrlated
ids.sort() # h5py requires increasing index
print("Models to combine: %s" %
", ".join(map(lambda x: basename(pt.transductor_model_file.format(x)).split(".")[0], np.array(model_range)[ids])))
p = probs[list(ids)].mean(axis=0)
df = pd.read_csv(pt.test_file, usecols=["id"])
transductor_submission = pd.DataFrame({"id": df["id"], "prediction": p})
transductor_submission.to_csv(pt.transductor_submission_file, index=False)
else:
print("Unknown mode")