forked from iitml/ALwR
/
selection_strategies.py
170 lines (126 loc) · 5.81 KB
/
selection_strategies.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
'''
selection_strategies.py
This file contains the following:
RandomBootstrap: pick instances randomly for bootstrapping the initial model
RandomStrategy: pick the next document randomly from pool
UNCSampling: pick the next document based on uncertainty
UNCPreferNoConflict: pick the next document out of top k uncertain instances, which does not contain conflicting rationales
UNCPreferConflict: pick the next document out of top k uncertain instances, which contains conflicting rationales
'''
import sys
import os
sys.path.append(os.path.abspath("."))
#import sys
from time import time
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from numpy.random import RandomState
from feature_expert import print_all_features
from sklearn.naive_bayes import MultinomialNB
#from models import FeatureMNBUniform, FeatureMNBWeighted, PoolingMNB
from sklearn import metrics
def RandomBootstrap(X_pool, y_pool, size, balance, seed=0):
'''
Assume the task is binary classification
'''
print ('-' * 50)
print ('Starting bootstrap...')
print ('Initial training set size = %d' % size)
start = time()
random_state = RandomState(seed=seed)
poolsize = y_pool.shape[0]
pool_set = np.arange(poolsize)
if balance: # select 1/2 * size from each class
class0_size = (int)(size / 2)
class1_size = (int)(size - class0_size)
class0_indices = np.nonzero(y_pool == 0)[0]
class1_indices = np.nonzero(y_pool == 1)[0]
class0_docs = random_state.permutation(class0_indices)[:class0_size]
class1_docs = random_state.permutation(class1_indices)[:class1_size]
training_set = np.hstack((class0_docs, class1_docs))
else: # otherwise, pick 'size' documents randomly
training_set = random_state.permutation(pool_set)[:size]
pool_set = np.setdiff1d(pool_set, training_set)
print ('bootstraping took %0.2fs.' % (time() - start))
return (training_set.tolist(), pool_set.tolist())
class RandomStrategy(object):
def __init__(self, seed=0):
self.rgen = RandomState(seed)
def choices(self, model, X, pool, k):
return self.rgen.permutation(pool)[:k]
class UNCSampling(object):
'''
This class performs uncertainty sampling based on the model.
'''
def choices(self, model, X, pool, k):
if isinstance(model, MultinomialNB):
y_probas = model.predict_proba(X[pool])
uncerts = np.argsort(np.max(y_probas, axis=1))[:k]
else:
y_decision = model.decision_function(X[pool])
uncerts = np.argsort(np.absolute(y_decision))[:k]
return [pool[u] for u in uncerts]
class UNCPreferNoConflict(object):
'''
This class picks uncertain instances.
Whenever it can, it prefers instances that do not have conflicts.
No conflict means this instance do not have annotated features
from opposing classes.
'''
def choices(self, model, X, pool, k, discovered_class0_feats, discovered_class1_feats, top_k):
if isinstance(model, MultinomialNB):
y_probas = model.predict_proba(X[pool])
uncerts = np.array(pool)[np.argsort(np.max(y_probas, axis=1))][:top_k]
else:
y_decision = model.decision_function(X[pool])
uncerts = np.array(pool)[np.argsort(np.absolute(y_decision))][:top_k]
conflicted = []
chosen = set()
for unc in uncerts[:top_k]:
x_feats = X[unc].indices
x_class0_feats = discovered_class0_feats.intersection(x_feats)
x_class1_feats = discovered_class1_feats.intersection(x_feats)
conflicted.append(len(x_class0_feats) > 0) and (len(x_class1_feats) > 0)
index = 0
while len(chosen) < k and index < top_k:
if not conflicted[index]:
chosen.add(uncerts[index])
index += 1
if len(chosen) < k: # Not enough un-conflicted could be found
index = 0
while len(chosen) < k and index < top_k:
chosen.add(uncerts[index])
index += 1
return list(chosen)
class UNCPreferConflict(object):
'''
This class picks uncertain instances.
Whenever it can, it prefers instances that have a conflict.
A conflict means this instance has annotated features
from opposing classes.
'''
def choices(self, model, X, pool, k, discovered_class0_feats, discovered_class1_feats, top_k):
if isinstance(model, MultinomialNB):
y_probas = model.predict_proba(X[pool])
uncerts = np.array(pool)[np.argsort(np.max(y_probas, axis=1))][:top_k]
else:
y_decision = model.decision_function(X[pool])
uncerts = np.array(pool)[np.argsort(np.absolute(y_decision))][:top_k]
conflicted = []
chosen = set()
for unc in uncerts[:top_k]:
x_feats = X[unc].indices
x_class0_feats = discovered_class0_feats.intersection(x_feats)
x_class1_feats = discovered_class1_feats.intersection(x_feats)
conflicted.append(len(x_class0_feats) > 0) and (len(x_class1_feats) > 0)
index = 0
while len(chosen) < k and index < top_k:
if conflicted[index]:
chosen.add(uncerts[index])
index += 1
if len(chosen) < k: # Not enough conflicted could be found
index = 0
while len(chosen) < k and index < top_k:
chosen.add(uncerts[index])
index += 1
return list(chosen)