def make_query(self):
    """Return the entry id of the unlabeled example to query next.

    MMC-style strategy: train a binary-relevance multilabel model,
    predict how many labels each pool example has with a secondary
    classifier, construct a candidate label vector per example, and
    query the example whose candidate labeling disagrees most with
    the model's real-valued label scores.
    """
    dataset = self.dataset
    labeled_pool, Y = dataset.get_labeled_entries()
    unlabeled_entry_ids, X_pool = dataset.get_unlabeled_entries()
    labeled_pool = np.array(labeled_pool)
    Y = np.array(Y)
    X_pool = np.array(X_pool)

    # One binary classifier per label, trained on the labeled pool.
    br = BinaryRelevance(self.br_base)
    br.train(Dataset(labeled_pool, Y))
    trnf = br.predict_proba(labeled_pool)
    poolf = br.predict_proba(X_pool)
    # Map probabilities in [0, 1] to signed scores in [-1, 1].
    f = poolf * 2 - 1

    # Features for the label-count predictor: per-example probabilities
    # sorted descending and normalized so each row sums to 1.
    trnf = np.sort(trnf, axis=1)[:, ::-1]
    trnf /= np.tile(trnf.sum(axis=1).reshape(-1, 1), (1, trnf.shape[1]))
    if len(np.unique(Y.sum(axis=1))) == 1:
        # All training examples have the same label count: a real
        # classifier cannot be fit, so fall back to a constant model.
        lr = DummyClf()
    else:
        lr = self.logistic_regression_
    # Train the secondary model to predict the number of relevant labels.
    lr.train(Dataset(trnf, Y.sum(axis=1)))

    # Same sorted/normalized transform for the pool; remember the
    # original label order so top-ranked labels can be recovered.
    idx_poolf = np.argsort(poolf, axis=1)[:, ::-1]
    poolf = np.sort(poolf, axis=1)[:, ::-1]
    poolf /= np.tile(poolf.sum(axis=1).reshape(-1, 1), (1, poolf.shape[1]))
    pred_num_lbl = lr.predict(poolf).astype(int)

    # Candidate labeling: +1 for the top-p most probable labels, -1 rest.
    yhat = -1 * np.ones((len(X_pool), self.n_labels), dtype=int)
    for i, p in enumerate(pred_num_lbl):
        yhat[i, idx_poolf[i, :p]] = 1

    # Disagreement between candidate labeling and signed scores; pick
    # the example with the largest score, ties broken at random.
    score = ((1 - yhat * f) / 2).sum(axis=1)
    ask_id = self.random_state_.choice(np.where(score == np.max(score))[0])

    return unlabeled_entry_ids[ask_id]
def split_train_test(dataset_filepath, test_size, n_labeled):
    """Load a dataset file and split it for an active-learning run.

    Returns a partially labeled training Dataset (only the first
    `n_labeled` labels kept), a fully labeled test Dataset, the full
    training label array, and a fully labeled training Dataset.
    """
    X, y = init_data(dataset_filepath)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size)

    # Mask every label after the first `n_labeled` as unknown (None).
    partial_labels = np.concatenate(
        [y_train[:n_labeled], [None] * (len(y_train) - n_labeled)])

    trn_ds = Dataset(X_train, partial_labels)
    tst_ds = Dataset(X_test, y_test)
    fully_labeled_trn_ds = Dataset(X_train, y_train)
    return trn_ds, tst_ds, y_train, fully_labeled_trn_ds
def check_functions(self, adapter, clf):
    """Assert the libact adapter and the raw sklearn classifier agree
    on predictions and accuracy for both the train and test splits."""
    adapter.train(Dataset(self.X_train, self.y_train))
    clf.fit(self.X_train, self.y_train)

    for X in (self.X_train, self.X_test):
        assert_array_equal(adapter.predict(X), clf.predict(X))

    for X, y in ((self.X_train, self.y_train),
                 (self.X_test, self.y_test)):
        self.assertEqual(adapter.score(Dataset(X, y)), clf.score(X, y))
def test_svm(self):
    """The SVM adapter must match sklearn's SVC exactly."""
    svc_clf = SVC(gamma="auto")
    svc_clf.fit(self.X_train, self.y_train)
    svm = SVM()
    svm.train(Dataset(self.X_train, self.y_train))

    for X in (self.X_train, self.X_test):
        assert_array_equal(svc_clf.predict(X), svm.predict(X))

    for X, y in ((self.X_train, self.y_train),
                 (self.X_test, self.y_test)):
        self.assertEqual(svc_clf.score(X, y), svm.score(Dataset(X, y)))
def test_binary_relevance_parallel(self):
    """Parallel (n_jobs=2) BinaryRelevance must predict exactly the
    same labels as the sequential (n_jobs=1) version.

    Fix: the parallel variant's base classifier previously omitted
    multi_class="ovr"; liblinear only supports one-vs-rest, so making
    both configurations identical is behavior-preserving and removes
    the asymmetry between the two models under comparison.
    """
    br = BinaryRelevance(
        base_clf=LogisticRegression(solver='liblinear',
                                    multi_class="ovr",
                                    random_state=1126),
        n_jobs=1)
    br.train(Dataset(self.X_train, self.Y_train))

    # Identically configured base classifier, trained in parallel.
    br_par = BinaryRelevance(
        base_clf=LogisticRegression(solver='liblinear',
                                    multi_class="ovr",
                                    random_state=1126),
        n_jobs=2)
    br_par.train(Dataset(self.X_train, self.Y_train))

    assert_array_equal(br.predict(self.X_test).astype(int),
                       br_par.predict(self.X_test).astype(int))
def test_perceptron(self):
    """The Perceptron adapter must match sklearn's Perceptron."""
    clf = sklearn.linear_model.Perceptron()
    clf.fit(self.X_train, self.y_train)
    perceptron = Perceptron()
    perceptron.train(Dataset(self.X_train, self.y_train))

    for X in (self.X_train, self.X_test):
        assert_array_equal(clf.predict(X), perceptron.predict(X))

    for X, y in ((self.X_train, self.y_train),
                 (self.X_test, self.y_test)):
        self.assertEqual(clf.score(X, y),
                         perceptron.score(Dataset(X, y)))
def test_hs_report_entry_label(self):
    """HS.report_entry_label must reproduce every known label."""
    ds = Dataset(self.X, self.y)
    qs = HS(ds, self.classes, random_state=1126)
    reported = [qs.report_entry_label(idx) for idx in range(len(self.y))]
    assert_array_equal(reported, self.y)
def make_query(self):
    """Return the id of the unlabeled entry to query next using
    expected error reduction (EER).

    For every pool example and every candidate label, retrain a copy
    of the model with that (example, label) pair added, estimate the
    expected loss over the remaining pool, and query the example that
    minimizes the expectation.
    """
    dataset = self.dataset
    X, y = dataset.get_labeled_entries()
    unlabeled_entry_ids, X_pool = dataset.get_unlabeled_entries()

    classes = np.unique(y)
    n_classes = len(classes)

    self.model.train(dataset)
    # P(label | example) for every pool example under the current model.
    proba = self.model.predict_proba(X_pool)

    scores = []
    for i, x in enumerate(X_pool):
        score = []
        for yi in range(n_classes):
            # Retrain a fresh copy with the candidate (x, yi) added so
            # the shared model is never mutated.
            m = copy.deepcopy(self.model)
            m.train(Dataset(np.vstack((X, [x])), y + [yi]))
            p = m.predict_proba(X_pool)

            if self.loss == '01':  # 0/1 loss
                score.append(proba[i, yi] * np.sum(1 - np.max(p, axis=1)))
            elif self.loss == 'log':  # log loss
                score.append(proba[i, yi] * -np.sum(p * np.log(p)))
        # Expected loss for example i, weighted by current label beliefs.
        scores.append(np.sum(score))

    # Minimize expected loss; break ties with the seeded random state.
    choices = np.where(np.array(scores) == np.min(scores))[0]
    ask_idx = self.random_state_.choice(choices)

    return unlabeled_entry_ids[ask_idx]
def setUp(self):
    """Build a small 2-D toy problem shared by the tests in this case."""
    self.X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1],
              [0, 1], [0, -2], [1.5, 1.5], [-2, -2]]
    # Binary labels (-1 / +1) for the ten samples above.
    self.y = [-1, -1, -1, 1, 1, 1, -1, -1, 1, 1]
    # Number of queries each test run is allowed to make.
    self.quota = 4
    self.fully_labeled_trn_ds = Dataset(self.X, self.y)
    # Oracle that answers queries from the fully labeled dataset.
    self.lbr = IdealLabeler(self.fully_labeled_trn_ds)
def test_HintSVM(self):
    """HintSVM with a fixed seed must reproduce a known query order."""
    labels = np.concatenate([self.y[:5], [None] * (len(self.y) - 5)])
    trn_ds = Dataset(self.X, labels)
    qs = HintSVM(trn_ds, random_state=1126)
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    expected = np.array([24, 235, 228, 209, 18, 143, 119, 90, 149, 207])
    assert_array_equal(qseq, expected)
def test_DensityWeightedUncertaintySampling(self):
    """DWUS with a fixed seed must reproduce a known query order."""
    labels = np.concatenate([self.y[:10], [None] * (len(self.y) - 10)])
    trn_ds = Dataset(self.X, labels)
    qs = DWUS(trn_ds, random_state=1126)
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    expected = np.array([30, 179, 104, 186, 28, 65, 142, 62, 257, 221])
    assert_array_equal(qseq, expected)
def test_quire(self):
    """QUIRE (deterministic) must reproduce a known query order."""
    labels = np.concatenate([self.y[:5], [None] * (len(self.y) - 5)])
    trn_ds = Dataset(self.X, labels)
    qs = QUIRE(trn_ds)
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    expected = np.array([117, 175, 256, 64, 103, 118, 180, 159, 129, 235])
    assert_array_equal(qseq, expected)
def test_mmc(self):
    """MMC with a fixed seed must reproduce a known query order."""
    labels = self.y[:5] + [None] * (len(self.y) - 5)
    trn_ds = Dataset(self.X, labels)
    qs = MMC(trn_ds, random_state=1126)
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    expected = np.array(
        [117, 655, 1350, 909, 1003, 1116, 546, 1055, 165, 1441])
    assert_array_equal(qseq, expected)
def test_RandomSampling(self):
    """RandomSampling with a fixed seed must reproduce a known order."""
    labels = np.concatenate([self.y[:5], [None] * (len(self.y) - 5)])
    trn_ds = Dataset(self.X, labels)
    qs = RandomSampling(trn_ds, random_state=1126)
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    expected = np.array([150, 16, 122, 157, 233, 160, 114, 163, 155, 56])
    assert_array_equal(qseq, expected)
def test_logistic_regression(self):
    """The LogisticRegression adapter must match sklearn's."""
    clf = sklearn.linear_model.LogisticRegression(
        solver='liblinear', multi_class="ovr")
    clf.fit(self.X_train, self.y_train)
    lr = LogisticRegression(solver='liblinear', multi_class="ovr")
    lr.train(Dataset(self.X_train, self.y_train))

    for X in (self.X_train, self.X_test):
        assert_array_equal(clf.predict(X), lr.predict(X))

    for X, y in ((self.X_train, self.y_train),
                 (self.X_test, self.y_test)):
        self.assertEqual(clf.score(X, y), lr.score(Dataset(X, y)))
def check_proba(self, adapter, clf):
    """Both predict_proba and predict_real of the adapter must equal
    the sklearn classifier's predict_proba on the training data."""
    adapter.train(Dataset(self.X_train, self.y_train))
    clf.fit(self.X_train, self.y_train)
    expected = clf.predict_proba(self.X_train)
    assert_array_equal(adapter.predict_proba(self.X_train), expected)
    assert_array_equal(adapter.predict_real(self.X_train), expected)
def test_alce_lr(self):
    """ALCE + LinearRegression with a fixed seed reproduces a known
    query sequence."""
    # Seeded random cost matrix with zero cost on the diagonal.
    cost_matrix = np.random.RandomState(1126).rand(3, 3)
    np.fill_diagonal(cost_matrix, 0)
    labels = self.y[:3] + [None for _ in range(len(self.X_pool))]
    ds = Dataset(self.X + self.X_pool, labels)
    qs = ALCE(ds, cost_matrix, LinearRegression(), random_state=1126)
    qseq = run_qs(ds, qs, self.y_truth, self.quota)
    expected = np.array([58, 118, 134, 43, 60, 139, 87, 78, 67, 146])
    assert_array_equal(qseq, expected)
def test_eer(self):
    """EER (default loss) with a fixed seed reproduces a known order."""
    labels = self.y[:3] + [None for _ in range(len(self.X_pool))]
    ds = Dataset(self.X + self.X_pool, labels)
    qs = EER(ds,
             LogisticRegression(solver='liblinear', multi_class="ovr"),
             random_state=1126)
    qseq = run_qs(ds, qs, self.y_truth, self.quota)
    expected = np.array([131, 20, 129, 78, 22, 139, 88, 43, 141, 133])
    assert_array_equal(qseq, expected)
def test_hs_random_selecting(self):
    """HS with active_selecting=False reproduces a known query order
    (first and last ten queries checked)."""
    ds = Dataset(self.X, self.y[:10] + [None] * (len(self.y) - 10))
    qs = HS(ds, self.classes, active_selecting=False, random_state=1126)
    qseq = run_qs(ds, qs, self.y, len(self.y) - 10)
    head_and_tail = np.concatenate([qseq[:10], qseq[-10:]])
    expected = np.array([48, 143, 13, 142, 88, 130, 29, 87, 36, 28,
                         58, 137, 49, 105, 76, 71, 63, 47, 64, 55])
    assert_array_equal(head_and_tail, expected)
def test_hs_active_selecting(self):
    """HS with active_selecting=True reproduces a known query order
    (first and last ten queries checked)."""
    ds = Dataset(self.X, self.y[:10] + [None] * (len(self.y) - 10))
    qs = HS(ds, self.classes, active_selecting=True, random_state=1126)
    qseq = run_qs(ds, qs, self.y, len(self.y) - 10)
    head_and_tail = np.concatenate([qseq[:10], qseq[-10:]])
    expected = np.array([48, 143, 13, 64, 101, 108, 51, 87, 36, 28,
                         43, 118, 47, 25, 81, 82, 95, 40, 67, 120])
    assert_array_equal(head_and_tail, expected)
def test_adaptive_active_learning(self):
    """AdaptiveActiveLearning (all cores) with a fixed seed must
    reproduce a known query order."""
    trn_ds = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
    base = LogisticRegression(solver='liblinear', multi_class="ovr")
    qs = AdaptiveActiveLearning(trn_ds, base_clf=base, n_jobs=-1,
                                random_state=1126)
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    expected = np.array(
        [594, 827, 1128, 419, 1223, 484, 96, 833, 37, 367])
    assert_array_equal(qseq, expected)
def test_binary_minimization(self):
    """BinaryMinimization with a fixed seed must reproduce a known
    query order."""
    trn_ds = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
    model = LogisticRegression(solver='liblinear', multi_class="ovr")
    qs = BinaryMinimization(trn_ds, model, random_state=1126)
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    expected = np.array(
        [936, 924, 1211, 1286, 590, 429, 404, 962, 825, 30])
    assert_array_equal(qseq, expected)
def test_eer_01(self):
    """EER with 0/1 loss and a fixed seed reproduces a known order."""
    labels = self.y[:3] + [None for _ in range(len(self.X_pool))]
    ds = Dataset(self.X + self.X_pool, labels)
    qs = EER(ds,
             LogisticRegression(solver='liblinear', multi_class="ovr"),
             loss='01',
             random_state=1126)
    qseq = run_qs(ds, qs, self.y_truth, self.quota)
    expected = np.array(
        [105, 16, 131, 117, 109, 148, 136, 115, 144, 121])
    assert_array_equal(qseq, expected)
def test_quire_mykernel(self):
    """QUIRE must accept a user-supplied callable kernel and still
    reproduce a known query order."""
    np.random.seed(1126)
    labels = np.concatenate([self.y[:5], [None] * (len(self.y) - 5)])
    trn_ds = Dataset(self.X, labels)
    # Plain linear kernel: K(A, B) = A . B^T.
    qs = QUIRE(trn_ds, kernel=lambda A, B: np.dot(A, B.T))
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    expected = np.array([9, 227, 176, 110, 52, 117, 228, 205, 103, 175])
    assert_array_equal(qseq, expected)
def test_UcertaintySamplingEntropy(self):
    """UncertaintySampling with the entropy criterion must reproduce
    a known query order under a fixed seed."""
    random.seed(1126)
    labels = np.concatenate([self.y[:10], [None] * (len(self.y) - 10)])
    trn_ds = Dataset(self.X, labels)
    model = LogisticRegression(solver="liblinear", multi_class="ovr")
    qs = UncertaintySampling(trn_ds, method='entropy', model=model)
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    expected = np.array([145, 66, 82, 37, 194, 60, 191, 211, 245, 131])
    assert_array_equal(qseq, expected)
def test_variance_reduction(self):
    """VarianceReduction must reproduce a known query order."""
    labels = np.concatenate([self.y[:2], [None] * (len(self.y) - 2)])
    trn_ds = Dataset(self.X, labels)
    model = LogisticRegression(solver='liblinear', multi_class="ovr")
    qs = VarianceReduction(trn_ds, model=model, sigma=0.1)
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    assert_array_equal(qseq, np.array([4, 5, 2, 3]))
def test_multilabel_with_auxiliary_learner_mmr(self):
    """MultilabelWithAuxiliaryLearner (mmr criterion) with a fixed
    seed must reproduce a known query order."""
    trn_ds = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
    major = BinaryRelevance(
        LogisticRegression(solver='liblinear', multi_class="ovr"))
    auxiliary = BinaryRelevance(SVM(gamma="auto"))
    qs = MultilabelWithAuxiliaryLearner(trn_ds,
                                        major_learner=major,
                                        auxiliary_learner=auxiliary,
                                        criterion='mmr',
                                        random_state=1126)
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    expected = np.array(
        [1258, 1461, 231, 1198, 1498, 1374, 955, 1367, 265, 144])
    assert_array_equal(qseq, expected)
def test_alce_lr_embed5(self):
    """ALCE with a 5-dimensional embedding and a fixed seed must
    reproduce a known query order."""
    # Seeded random cost matrix with zero cost on the diagonal.
    cost_matrix = np.random.RandomState(1126).rand(3, 3)
    np.fill_diagonal(cost_matrix, 0)
    labels = self.y[:3] + [None for _ in range(len(self.X_pool))]
    ds = Dataset(self.X + self.X_pool, labels)
    qs = ALCE(ds, cost_matrix, LinearRegression(), embed_dim=5,
              random_state=1126)
    qseq = run_qs(ds, qs, self.y_truth, self.quota)
    expected = np.array([106, 118, 141, 43, 63, 99, 65, 89, 26, 52])
    assert_array_equal(qseq, expected)
def test_hs_subsampling(self):
    """HS delegating subsampling to an UncertaintySampling strategy
    must reproduce a known query order (first/last ten checked)."""
    ds = Dataset(self.X, self.y[:10] + [None] * (len(self.y) - 10))
    sub_qs = UncertaintySampling(
        ds, model=SVM(gamma='auto', decision_function_shape='ovr'))
    qs = HS(ds, self.classes, subsample_qs=sub_qs, random_state=1126)
    qseq = run_qs(ds, qs, self.y, len(self.y) - 10)
    head_and_tail = np.concatenate([qseq[:10], qseq[-10:]])
    expected = np.array([120, 50, 33, 28, 78, 133, 52, 124, 102, 109,
                         81, 108, 12, 10, 89, 114, 92, 126, 48, 25])
    assert_array_equal(head_and_tail, expected)
def _E(args):
    """Expected value helper for the variance-reduction strategy.

    `args` is a single tuple (for pool.map-style dispatch) of:
    X, y       -- current labeled data and labels
    qx         -- candidate query point
    clf        -- model already trained on the labeled data
    label_count-- number of distinct labels
    sigma      -- regularization parameter forwarded to _Phi
    model      -- untrained model template, copied per candidate label

    For each candidate label i, retrains a copy of `model` with
    (qx, i) appended and accumulates _Phi weighted by the current
    model's sigmoid-squashed confidence that qx has label i.
    """
    X, y, qx, clf, label_count, sigma, model = args
    sigmoid = lambda x: 1 / (1 + np.exp(-x))
    # Current model's belief about qx's label, squashed to (0, 1).
    query_point = sigmoid(clf.predict_real([qx]))
    feature_count = len(X[0])
    ret = 0.0
    for i in range(label_count):
        # Copy the template so the shared model is never mutated.
        clf_ = copy.copy(model)
        clf_.train(Dataset(np.vstack((X, [qx])), np.append(y, i)))
        # Squashed decision values for the labeled data plus qx;
        # PI[:-1] covers X, PI[-1] is qx itself.
        PI = sigmoid(clf_.predict_real(np.vstack((X, [qx]))))
        ret += query_point[-1][i] * _Phi(sigma, PI[:-1], X, PI[-1], qx,
                                         label_count, feature_count)
    return ret