/
run.py
91 lines (67 loc) · 2.56 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# Shefali Umrania
# 02-750 Fall 2017
# Final Project
import data_loader as dl
import learner
from sklearn import metrics
from copy import deepcopy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Experiment settings: size of the initial labelled pool, number of labels
# added per round, and the total labelling budget.
pool_size, bsize, budget = 100, 50, 2500

# Read the train and test splits; the test features are turned into an
# ndarray because they are later indexed as xtest[:, idx].
xtrain, ytrain = dl.load_data('Data/DIFFICULT_TRAIN.csv')
xtest, ytest = dl.load_data('Data/DIFFICULT_TEST.csv')
xtest = np.array(xtest)

# Split the training data into labelled / unlabelled parts twice: the first
# split feeds the active learner, the second the random baseline.
(xtrain_label, xtrain_nolabel,
 ytrain_label, ytrain_nolabel) = dl.pool_data(pool_size, xtrain, ytrain)
(xtrain_label_rand, xtrain_nolabel_rand,
 ytrain_label_rand, ytrain_nolabel_rand) = dl.pool_data(pool_size, xtrain, ytrain)

# Per-round history used for the plot.
active_errors, random_errors, batches = [], [], []

# Best active-learner model seen so far, with its error and the cost at which
# it was recorded.
min_error, min_cost = 1, 0
best_model = None

# The initial pool counts against the budget.
cost = pool_size
# Active-vs-random learning loop. Each round obtains one classifier per
# strategy from learner.classify(), scores both on the test set, and adds
# `bsize` to the running cost until it reaches `budget`. Afterwards the error
# curves are plotted and the best active-learner model labels the blinded set.

# Column index returned together with best_model; stays None until a model is
# saved. Kept separately because `idx` is overwritten on every round.
best_idx = None

while cost < budget:
    # run active learner
    dist, classifier, idx = learner.classify(xtrain_label, ytrain_label, xtrain_nolabel)
    # NOTE(review): ytrain is never reassigned inside this loop, so this call
    # looks loop-invariant. It is left in place because load_labels() and
    # find_query() are opaque here -- confirm neither depends on per-round
    # state before hoisting it.
    label_dict = dl.load_labels(ytrain)
    idx_query = learner.find_query(dist, label_dict, bsize)
    (xtrain_label, xtrain_nolabel,
     ytrain_label, ytrain_nolabel) = learner.active_update(
        xtrain_label, xtrain_nolabel, ytrain_label, ytrain_nolabel, idx_query)

    # predict, using the classifier returned by classify() above (i.e. obtained
    # before this round's update of the labelled pool)
    active_pred = classifier.predict(xtest[:, idx])
    active_error = 1.0 - metrics.accuracy_score(ytest, active_pred)
    active_errors.append(active_error)
    # NOTE(review): the error is recorded against the cost *after* this
    # round's increment -- confirm that is the intended x-axis value.
    cost += bsize
    batches.append(cost)

    # save best model together with the column index it was scored with
    if active_error < min_error:
        min_cost = cost
        min_error = active_error
        best_model = deepcopy(classifier)
        best_idx = deepcopy(idx)

    # run random learner
    _, classifier_rand, idx_rand = learner.classify(
        xtrain_label_rand, ytrain_label_rand, xtrain_nolabel_rand)
    (xtrain_label_rand, xtrain_nolabel_rand,
     ytrain_label_rand, ytrain_nolabel_rand) = learner.random_update(
        xtrain_label_rand, xtrain_nolabel_rand,
        ytrain_label_rand, ytrain_nolabel_rand, bsize)
    random_pred = classifier_rand.predict(xtest[:, idx_rand])
    random_error = 1.0 - metrics.accuracy_score(ytest, random_pred)
    random_errors.append(random_error)

    # Single formatted string: prints the same text under Python 2 and also
    # runs under Python 3.
    print("Batch: %s  Active Error: %s  Random Error: %s"
          % (cost, active_error, random_error))

# Plot
plt.plot(batches, active_errors, 'r-', label='Active Learner')
plt.plot(batches, random_errors, 'y-', label='Random Learner')
plt.title('Test Error v/s Budget')
plt.xlabel('Budget')
plt.ylabel('Error on test set')
plt.legend(loc=1)
plt.show()

# Blinded Prediction
if best_model is None:
    # Happens when the loop never ran (budget <= pool_size) or no round
    # achieved an error below the initial min_error.
    raise RuntimeError(
        'No model was saved during training; cannot predict the blinded set.')

df = pd.read_csv('Data/DIFFICULT_BLINDED.csv')
size = df.shape[1] - 1
# Positional slice: columns 1 .. size-1 (first column is used as the instance
# id below). `.iloc` replaces the removed `.ix` indexer.
# NOTE(review): this stops before the last column -- confirm the blinded file
# really carries a trailing non-feature column.
xblind = np.array(df.iloc[:, 1:size])
# Use the column index saved with best_model, not the last round's `idx`.
ypred = best_model.predict(xblind[:, best_idx])
instance_id = df.iloc[:, 0]
save_df = pd.DataFrame({'instance_id': instance_id, 'prediction': ypred})
save_df.to_csv('blinded_pred.csv', header=False, index=False)