def split_train_test(test_size):
    """Build partially- and fully-labeled training sets plus a test set.

    Loads the 'vehicle' dataset, standardizes the features, maps the class
    labels to ``0 .. n_classes-1`` and performs a stratified train/test split.

    Args:
        test_size: forwarded unchanged to ``train_test_split``.

    Returns:
        A tuple ``(trn_ds, tst_ds, fully_labeled_trn_ds, cost_matrix)``:

        * ``trn_ds`` -- training ``Dataset`` whose first ``n_classes`` entries
          are one labeled sample per class; all remaining labels are ``None``.
        * ``tst_ds`` -- test ``Dataset`` with all labels.
        * ``fully_labeled_trn_ds`` -- same samples and order as ``trn_ds``,
          with every label present.
        * ``cost_matrix`` -- ``(n_classes, n_classes)`` array of random costs
          drawn from ``2000 * np.random.rand`` with a zero diagonal.

    Raises:
        IndexError: if some class has no sample in the training split.
    """
    # Choose a dataset with unbalanced class instances ('segment' is another
    # candidate).
    # NOTE(review): fetch_mldata was deprecated in scikit-learn 0.20 and
    # removed in later releases; migrating to fetch_openml needs the dataset
    # name and field layout confirmed first, so the call is left as-is.
    data = sklearn.datasets.fetch_mldata('vehicle')
    X = StandardScaler().fit_transform(data['data'])

    # Map the targets to 0 .. n_classes-1: `y` holds, for every sample, the
    # index of its label inside the sorted unique labels `target`.
    target, y = np.unique(data['target'], return_inverse=True)

    X_trn, X_tst, y_trn, y_tst = train_test_split(
        X, y, test_size=test_size, stratify=y)

    # Make sure each class appears once initially: take the first training
    # sample of every class as the initially labeled pool.
    init_y_ind = np.array(
        [np.where(y_trn == i)[0][0] for i in range(len(target))])
    # Remaining training indices in ascending order. setdiff1d keeps an
    # integer dtype even when nothing is left, so the result is always a
    # valid index array.
    y_ind = np.setdiff1d(np.arange(len(X_trn)), init_y_ind)
    # Initially labeled samples first, then everything else.
    order = np.concatenate((init_y_ind, y_ind))

    trn_ds = Dataset(
        X_trn[order],
        np.concatenate((y_trn[init_y_ind], [None] * len(y_ind))))
    tst_ds = Dataset(X_tst, y_tst)
    # Fancy indexing copies, so the two training datasets do not share
    # their feature arrays.
    fully_labeled_trn_ds = Dataset(X_trn[order], y_trn[order])

    n_classes = len(target)
    cost_matrix = 2000. * np.random.rand(n_classes, n_classes)
    np.fill_diagonal(cost_matrix, 0)  # a correct prediction costs nothing

    print(trn_ds.get_entries()[0])
    print(np.shape(fully_labeled_trn_ds.get_entries()))
    return trn_ds, tst_ds, fully_labeled_trn_ds, cost_matrix
class UncertaintySampler(object):
    """Active-learning sampler backed by an ``UncertaintySampling`` strategy.

    Features and labels are held in a ``Dataset``. Inside the dataset an
    unlabeled point carries the label ``None``; at this class's interface
    (constructor input and ``get_data`` output) it is encoded as a negative
    number / ``-1``.
    """

    def __init__(self, X, y, labs, n=2):
        """Create the dataset and the query strategy.

        Args:
            X: feature rows, one per sample.
            y: labels, one per sample; a negative value (or ``None``) marks
                the sample as unlabeled.
            labs: stored as ``self.labs`` and written out by ``save``.
            n: passed as ``n`` to ``make_query`` by ``get_next``.
        """
        # Guard against None explicitly: comparing None with 0 only works
        # on Python 2.
        y = [yy if yy is not None and yy >= 0 else None for yy in y]

        self.dataset = Dataset(X, y)
        self.labs = labs
        self.uc = UncertaintySampling(self.dataset, method='lc',
                                      model=LinearSVC())
        self.n = n

    @staticmethod
    def _log(message):
        """Write one progress line to stderr."""
        sys.stderr.write(message + '\n')

    def _labels(self):
        """Return the dataset's labels as a list (``None`` = unlabeled)."""
        return [label for _, label in self.dataset.get_entries()]

    def get_next(self):
        """Ask the query strategy for the next point(s) to label."""
        self._log('get_next: start')
        out = self.uc.make_query(n=self.n)
        self._log('get_next: done')
        return out

    def set_label(self, idx, label):
        """Record ``label`` for entry ``idx``; returns ``update``'s result."""
        self._log('set_label: start')
        out = self.dataset.update(idx, label)
        self._log('set_label: done')
        return out

    def get_data(self):
        """Return ``(X, y)`` arrays; unlabeled entries have ``y == -1``."""
        X, y = zip(*self.dataset.get_entries())
        X = np.vstack(X)
        y = np.array([-1 if yy is None else yy for yy in y])
        return X, y

    def n_hits(self):
        """Return how many entries carry the label ``1``."""
        labels = np.array(self._labels())
        return (labels == 1).sum()

    def n_labeled(self):
        """Return the number of labeled entries in the dataset."""
        return self.dataset.len_labeled()

    def is_labeled(self, idx):
        """Return True if entry ``idx`` has a label (any value but None).

        An entry labeled ``0`` counts as labeled, so the check is against
        ``None`` rather than truthiness. Indices outside the dataset yield
        False.
        """
        labeled = [i for i, label in enumerate(self._labels())
                   if label is not None]
        return idx in labeled

    def save(self, outpath):
        """Write ``X``, ``y`` and ``labs`` to a timestamped HDF5 file.

        The file is named ``<outpath>-uncertainty-<YYYYmmdd_HHMMSS>.h5``.

        !! This should be updated to save in same format as simple_las
        """
        X, y = self.get_data()

        path = '%s-%s-%s.h5' % (outpath, 'uncertainty',
                                datetime.now().strftime('%Y%m%d_%H%M%S'))
        # Explicit 'a' (read/write, create if missing): this was the implicit
        # default of older h5py, while newer h5py opens read-only when no
        # mode is given. The with-block closes the file even if a write fails.
        with h5py.File(path, 'a') as f:
            f['X'] = X
            f['y'] = y
            f['labs'] = self.labs