# Partition the shuffled row-index array `a` into num_bins contiguous,
# roughly equal slices — one held-out fold per bin for cross-validation.
num_bins = 8
bin_size = int(np.ceil(len(a) / num_bins))
bins = []
for start in range(0, num_bins * bin_size, bin_size):
    bins.append(a[start:start + bin_size])

# Feature matrix and target column for the cross-validation run.
X = D[:, cols]
y = D[:, c.isalert]

# Per-fold results, filled in by the loop below.
auc = [0] * num_bins
fpr = [0] * num_bins
tpr = [0] * num_bins
w = [0] * num_bins
b = [0] * num_bins

for fold in range(num_bins):
    # Hold out one bin as the test set; train on all the others.
    test = bins[fold]
    train = []
    for j, rows in enumerate(bins):
        if j != fold:
            train.extend(rows)
    (auc[fold], fpr[fold], tpr[fold],
     w[fold], b[fold]) = fit_logistic_regression(
        X[train, :], y[train, :], X[test, :], y[test, :], C=C)

def save_result(save_path):
    """Serialize the cross-validation result dict to `save_path` as JSON."""
    result = get_result()
    with open(save_path, 'w') as out:
        json.dump(result, out, indent=4)

def get_result():
    """Collect the CV results plus run metadata into a JSON-serializable dict.

    Reads the module-level results (w, auc, b), the regularization C, the
    fold count, the dataset D, and this script's path.
    """
    # Each entry of w is an array whose first row holds the coefficients.
    weights = [coef[0].tolist() for coef in w]
    return {
        'weights': weights,
        'auc': auc,
        'intercepts': b,
        'C': C,
        'num_bins': num_bins,
        'dataset_size': D.shape[0],
        'generator': __file__,
    }

# Fixed train/test split over the shuffled index array `a`:
# the first 320000 rows train, the remainder test.
train_rows = a[:320000]
test_rows = a[320000:]

X = D[:, cols]
X = X[train_rows, :]
y = D[train_rows, c('isalert')]
# XOR with 1 flips the 0/1 labels — presumably 'isalert' is stored
# inverted relative to what fit_logistic_regression expects; TODO confirm.
y = y.astype(int) ^ 1

Xt = D[:, cols]
Xt = Xt[test_rows, :]
yt = D[test_rows, c('isalert')]
yt = yt.astype(int) ^ 1

# Single fit on the fixed split. Results are kept as length-1 lists so
# the saving code below can treat them like the earlier per-fold lists.
num_tests = 1
auc = [0] * num_tests
fpr = [0] * num_tests
tpr = [0] * num_tests
w = [0] * num_tests
b = [0] * num_tests

for i in range(num_tests):
    # C = 10**(i+3), i.e. 1000 for the single test run.
    result = fit_logistic_regression(X, y, Xt, yt, C=10 ** (i + 3))
    auc[i], fpr[i], tpr[i], w[i], b[i] = result


def save_result(append=''):
    """Dump the single-split regression result to a fixed JSON file.

    NOTE(review): this shadows the earlier save_result(save_path)
    definition, and the `append` argument is accepted but never used —
    presumably a leftover; confirm before removing.
    """
    # Each entry of w is an array whose first row holds the coefficients.
    weights = [coef[0].tolist() for coef in w]
    payload = dict(zip(['weights', 'auc', 'intercepts'], [weights, auc, b]))
    save_path = ('{0}/session/17-recreating-winning-entry'
                 '/data/regression-result.json').format(path)
    with open(save_path, 'w') as out:
        json.dump(payload, out)
        
# "Exemplo n.º 3" — snippet-boundary marker left by the code aggregator
# this file was scraped from; not Python, so it is commented out here.
# The code below it belongs to a different snippet and references names
# (ts_rows, max_features) defined outside this fragment.
# 0
# Restrict the test matrix and labels to the rows selected for this
# split. NOTE(review): ts_rows is defined in an earlier, unshown part
# of the file.
Xt, yt = Xt[ts_rows, :], D[ts_rows, c('isalert')]

# auc[i, j] holds the test AUC obtained at selection round i when
# candidate feature j is tried (zero until evaluated).
auc = np.zeros((max_features, 90))

# Exclude P3, P6, P8, V7, V9 and their derived running features
# (the m* and sd* columns), which session 9's data exploration
# flagged as unhelpful.
cc = LabelIndex(L_ex[4:])
exclude = cc('p3', 'p6', 'p8', 'v7', 'v9',
             'mp3', 'mp6', 'mp8', 'mv7', 'mv9',
             'sdp3', 'sdp6', 'sdp8', 'sdv7', 'sdv9')
candidates = [col for col in range(90) if col not in exclude]

# Features picked so far by the greedy forward selection below.
chosen = []

# Greedy forward selection: each round tries every remaining candidate
# feature alongside the already-chosen set, keeps the one with the best
# test AUC, and stops as soon as a round fails to improve on the last.
for i in range(max_features):
    # BUG FIX: the loop variable was named `c`, shadowing the
    # module-level LabelIndex callable `c` used for column lookups
    # elsewhere in this file; renamed to `cand`.
    for cand in candidates:
        features = chosen + [cand]
        result = fit_logistic_regression(X[:, features], y, Xt[:, features], yt)
        auc[i, cand] = result[0]

    # Cast to plain int so `chosen` holds Python ints (JSON-friendly),
    # not numpy integers; equality with list entries is unaffected.
    chosen_feature = int(auc[i, :].argmax())

    # Stop when this round's best AUC does not beat the previous
    # round's best. (At i == 0, auc[i-1, :] wraps to the last,
    # zero-initialized row, so the loop only stops immediately if the
    # best AUC is <= 0 — preserved from the original behavior.)
    if auc[i, chosen_feature] <= auc[i - 1, :].max():
        break

    candidates.remove(chosen_feature)
    chosen.append(chosen_feature)