def test_speedup_greedy(self):
    X = np.random.uniform(-1, 1, size=(1000, 2))
    Y = np.ones(X.shape[0])
    negative_examples = np.where(X[:, 0] < 0)
    Y[negative_examples] = -1
    Y_obstructed = ObstructedY(Y)
    Y_obstructed.query(range(100))

    m = Perceptron(alpha=0, n_iter=100).fit(X, Y)
    dist = construct_normalized_euc(X)
    D = pairwise_distances(X, metric=dist)

    r1 = quasi_greedy_batch(X, Y_obstructed, m, rng=None, batch_size=20, D=D)
    r2 = quasi_greedy_batch_slow(X, Y_obstructed, m, rng=None, batch_size=20,
                                 dist=dist, D=D)

    # the fast and slow implementations should pick the same batch with the same score
    self.assertTrue(np.array_equal(r1[0], r2[0]))
    self.assertAlmostEqual(r1[1], r2[1])
def test_greedy_unc(self):
    mean_1 = np.array([-2, 0])
    mean_2 = np.array([2, 0])
    cov = np.array([[1, 0], [0, 1]])
    X_1 = np.random.multivariate_normal(mean_1, cov, 100)
    X_2 = np.random.multivariate_normal(mean_2, cov, 200)
    X = np.vstack([X_1, X_2])
    y = np.ones(X.shape[0])
    y[100:] = -1  # the first 100 points come from cluster 1, the rest from cluster 2

    # shuffle data
    p = np.random.permutation(X.shape[0])
    X = X[p]
    y = y[p]

    y = ObstructedY(y)
    y.query(np.random.randint(0, X.shape[0], 50))

    model = SVC(C=1, kernel='linear')
    model.fit(X[y.known], y[y.known])

    # with distance weight c=0, the quasi-greedy batch should reduce to plain
    # uncertainty sampling
    picked, _ = quasi_greedy_batch_slow(X, y, current_model=model, c=0.0, rng=self.rng,
                                        batch_size=10, dist='cosine_distance_normalized',
                                        base_strategy='uncertainty_sampling')
    unc_pick, _ = uncertainty_sampling(X, y, model, batch_size=10, rng=self.rng)

    self.assertTrue(set(picked) == set(unc_pick))
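# For comparison, a minimal sketch of the plain uncertainty-sampling strategy the
# test above reduces to: rank the still-unlabeled pool by distance to the decision
# boundary and take the closest `batch_size` points. This is an illustration
# inferred from the test's usage, not the library's implementation, and
# `uncertainty_sampling_sketch` is a hypothetical name.
def uncertainty_sampling_sketch(X, y, model, batch_size, rng=None):
    ids = np.asarray(y.unknown_ids)
    # |decision_function| approximates the distance to the separating hyperplane
    margin = np.abs(model.decision_function(X[ids]).ravel())
    order = np.argsort(margin)[:batch_size]
    return ids[order], margin[order]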
def test_qbc(self):
    mean_1 = np.array([-2, 0])
    mean_2 = np.array([2, 0])
    cov = np.array([[1, 0], [0, 1]])
    X_1 = np.random.multivariate_normal(mean_1, cov, 100)
    X_2 = np.random.multivariate_normal(mean_2, cov, 200)
    X = np.vstack([X_1, X_2])
    y = np.ones(X.shape[0])
    y[100:] = -1  # the first 100 points come from cluster 1, the rest from cluster 2

    # shuffle data
    p = np.random.permutation(X.shape[0])
    X = X[p]
    y = y[p]

    y = ObstructedY(y)
    y.query(np.random.randint(0, X.shape[0], 50))

    model = SVC(C=1, kernel='linear')
    model.fit(X[y.known], y[y.known])

    pick, _ = query_by_bagging(X, y, current_model=None, base_model=model,
                               batch_size=50, rng=self.rng, n_bags=5, method='entropy')

    # QBC should pick points that lie closer to the decision boundary than the
    # points it leaves out
    mean_picked_dist = np.abs(model.decision_function(X[pick])).mean()
    not_picked = [i for i in xrange(X.shape[0]) if i not in set(pick)]
    mean_unpicked_dist = np.abs(model.decision_function(X[not_picked])).mean()

    self.assertTrue(mean_picked_dist < mean_unpicked_dist)
def test_unknown_id(self):
    oy = ObstructedY(self.y)
    self.assertTrue(all(oy.unknown_ids == np.arange(self.y.shape[0])))
    oy.query(range(50))
    self.assertTrue(all(oy.unknown_ids == np.arange(50) + 50))
    oy.query(range(50, 100))
    self.assertTrue(len(oy.unknown_ids) == 0)
def setUp(self):
    self.decision_model = DecisionDummy()
    self.prob_model = ProbDummy()
    self.X = np.linspace(0.6, 1, 20).reshape(-1, 1)
    self.batch_size = 3
    self.rng = np.random.RandomState(666)
    self.y = np.ones(self.X.shape[0])
    self.y[np.random.randint(0, 20, 15)] = -1
    self.y = ObstructedY(self.y)
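# `DecisionDummy` and `ProbDummy` are not defined in this file. A plausible minimal
# sketch, assuming they are fixed models that expose just the interface the
# strategies need (a margin via decision_function vs. class probabilities via
# predict_proba); their exact behaviour here is an assumption for illustration.
class DecisionDummy(object):
    def fit(self, X, y):
        return self

    def decision_function(self, X):
        # deterministic margin, so tests that rank by |margin| are reproducible
        return X[:, 0] - 0.8


class ProbDummy(object):
    classes_ = np.array([-1, 1])

    def fit(self, X, y):
        return self

    def predict_proba(self, X):
        p = np.clip(X[:, 0], 0.0, 1.0)
        return np.vstack([1 - p, p]).T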
def test_element_access(self):
    oy = ObstructedY(self.y)
    self.assertEqual(oy.query(42), self.y[42])
    self.assertEqual(oy[42], self.y[42])
    self.assertTrue(all(oy.query([6, 66]) == self.y[[6, 66]]))
    self.assertTrue(all(oy[[6, 66]] == self.y[[6, 66]]))
    self.assertTrue(all(oy.query(np.array([1, 2])) == self.y[[1, 2]]))
    self.assertTrue(all(oy[np.array([1, 2])] == self.y[[1, 2]]))
    oy.query([3, 4, 5, 6])
    self.assertTrue(all(oy[3:7] == self.y[3:7]))
def fit_AL_on_folds(model_cls, base_model_cls, base_model_kwargs, projector_cls,
                    folds, base_seed=1, warm_start_percentage=0, id_folds=-1,
                    logger=main_logger):
    metrics = defaultdict(list)
    monitors = []

    if id_folds == -1:
        id_folds = range(len(folds))

    for i in id_folds:
        start_time = time.time()
        rng = np.random.RandomState(base_seed + i)

        X = folds[i]['X_train']
        y = folds[i]['Y_train']["data"]
        y_obst = ObstructedY(y)
        X_valid = folds[i]['X_valid']
        y_valid = folds[i]['Y_valid']["data"]

        # Add a fixed projection to models that accept a projector
        base_model_cls_fold = partial(base_model_cls, random_state=base_seed + i,
                                      **base_model_kwargs)
        if "EEM" in base_model_cls.__name__ or "TWELM" in base_model_cls.__name__ \
                or "RandomNB" in base_model_cls.__name__:
            base_model_cls_fold = partial(base_model_cls_fold,
                                          projector=projector_cls(rng=base_seed + i,
                                                                  X=X["data"]))
        elif hasattr(base_model_cls, "transform"):
            logger.warning("base_model_cls has transform, but didn't fix projection")

        logger.info("Fitting fold on " + str(X["data"].shape))

        # Important to seed the model based on the fold, because some strategies
        # are partly independent of the data
        model = model_cls(random_state=base_seed + i,
                          base_model_cls=base_model_cls_fold)

        test_error_datasets = [("concept", (X_valid["data"], y_valid))]
        if "cluster_A" in X_valid:
            test_error_datasets.append(("cluster_A_concept",
                                        (X_valid["data"][X_valid["cluster_A"]],
                                         y_valid[X_valid["cluster_A"]])))
        if "cluster_B" in X_valid:
            test_error_datasets.append(("cluster_B_concept",
                                        (X_valid["data"][X_valid["cluster_B"]],
                                         y_valid[X_valid["cluster_B"]])))
        if "cluster_A" in X:
            logger.info("cluster A training size: " + str(len(X["cluster_A"])))
            test_error_datasets.append(("cluster_A_unlabeled",
                                        (X["data"][X["cluster_A"]],
                                         y[X["cluster_A"]])))
        if "cluster_B" in X:
            test_error_datasets.append(("cluster_B_unlabeled",
                                        (X["data"][X["cluster_B"]],
                                         y[X["cluster_B"]])))

        if "cluster_A" in X:
            warm_start_size = max(100, int(warm_start_percentage * len(X["cluster_A"])))
            warm_start = rng.choice(X["cluster_A"], warm_start_size, replace=False)
        else:
            warm_start_size = max(100, int(warm_start_percentage * X["data"].shape[0]))
            warm_start = rng.choice(range(X["data"].shape[0]), warm_start_size,
                                    replace=False)
        y_obst.query(warm_start)

        model.fit(X, y_obst, test_error_datasets=test_error_datasets)

        y_valid_pred = model.predict(X_valid["data"])
        y_pred = model.predict(X["data"])

        for metric_name, metric_value in chain(
                binary_metrics(y_valid, y_valid_pred, "valid").items(),
                binary_metrics(y, y_pred, "train").items()):
            metrics[metric_name].append(metric_value)

        fold_monitors = copy.deepcopy(model.monitors)
        for key, values in dict(fold_monitors).iteritems():
            if key != 'iter':
                assert isinstance(values, list), \
                    "monitor %s is not a list: %s" % (key, type(values))
                fold_monitors['mean_' + key] = np.mean(values)
                fold_monitors['auc_' + key] = auc(np.arange(len(values)), values)

        fold_monitors['fold_time'] = time.time() - start_time
        monitors.append(fold_monitors)

    return metrics, monitors
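# A usage sketch for fit_AL_on_folds, with the fold layout implied by the accesses
# above: dicts keyed by "data", with optional "cluster_A"/"cluster_B" index arrays.
# `ActiveLearningExperiment` and `splits` are illustrative assumptions, not names
# confirmed by this file.
folds = [{'X_train': {"data": X_tr},
          'Y_train': {"data": y_tr},
          'X_valid': {"data": X_va},
          'Y_valid': {"data": y_va}}
         for X_tr, y_tr, X_va, y_va in splits]

metrics, monitors = fit_AL_on_folds(model_cls=ActiveLearningExperiment,
                                    base_model_cls=SVC,
                                    base_model_kwargs={"C": 1, "kernel": "linear"},
                                    projector_cls=None,  # only used by EEM/TWELM/RandomNB
                                    folds=folds,
                                    base_seed=666,
                                    warm_start_percentage=0.05)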
np.random.seed(666)

mean_1 = np.array([-2, 0])
mean_2 = np.array([2, 0])
cov = np.array([[1, 0], [0, 1]])
X_1 = np.random.multivariate_normal(mean_1, cov, 100)
X_2 = np.random.multivariate_normal(mean_2, cov, 100)
X = np.vstack([X_1, X_2])
y = np.ones(X.shape[0])
y[100:] = -1  # the first 100 points come from cluster 1, the rest from cluster 2

# shuffle data
p = np.random.permutation(X.shape[0])
X = X[p]
y = y[p]

y = ObstructedY(y)
y.query(np.random.randint(0, X.shape[0], 50))

model = SVC(C=1, kernel='linear', probability=True)
model.fit(X[y.known], y[y.known])

pick, _ = query_by_bagging(X, y, current_model=None, base_model=model, batch_size=20,
                           rng=np.random.RandomState(666), n_bags=10, method='KL')
not_picked = [i for i in xrange(X.shape[0]) if i not in set(pick)]

# mark the picked points and plot the still-unlabeled pool
y_plot = y._y
y_plot[pick] = 2
plt.figure(figsize=(10, 10))
plt.scatter(X[y.unknown_ids, 0], X[y.unknown_ids, 1],
            c=y_plot[y.unknown_ids], s=100, linewidths=0)
plt.ylim(-6, 6)
plt.show()
def test_constructor(self):
    oy = ObstructedY(self.y)
    self.assertTrue(all(oy._y == self.y))
    self.assertTrue(not any(oy.known))
    self.assertEqual(len(self.y[oy.known]), 0)

def test_bad_access_list(self):
    oy = ObstructedY(self.y)
    oy[[6, 66]]

def test_bad_access_slice(self):
    oy = ObstructedY(self.y)
    oy[6:66]

def test_bad_access_single(self):
    oy = ObstructedY(self.y)
    oy[42]

def test_peeking(self):
    oy = ObstructedY(self.y)
    oy.query([])
def test_full_query(self):
    oy = ObstructedY(self.y)
    self.assertTrue(all(oy.query(range(100)) == self.y))
    self.assertTrue(all(oy[:] == self.y))
    self.assertTrue(all(oy.known))
def fit(self, X, y, test_error_datasets=[]):
    """
    :param test_error_datasets: datasets to calculate error on. Each entry is
        either a tuple ("name", (X, y)) or a tuple ("name", ids) where ids
        index the training X, y.

    >>> model.fit(X, y, [("concept", (X_test, y_test)), ("main_cluster", ids)])
    """
    y = copy.deepcopy(y)
    rng = check_random_state(self.rng)
    self.strategy_projection_seed = rng.randint(0, 100)

    if isinstance(X, dict):
        self.D = get_tanimoto_pairwise_distances(loader=X["i"]["loader"],
                                                 preprocess_fncs=X["i"]["preprocess_fncs"],
                                                 name=X["i"]["name"])
        X_info = X["i"]
        X = X["data"]
    else:
        # This branch will work only with the most basic strategies
        X_info = None
        self.D = None

    self.monitors = defaultdict(list)
    # self.base_model = self.base_model_cls()

    if not isinstance(y, ObstructedY):
        y = ObstructedY(y)

    self.monitors['n_already_labeled'] = [0]
    self.monitors['iter'] = 0

    # times
    self.monitors['strat_times'] = []
    self.monitors['grid_times'] = []
    self.monitors['concept_test_times'] = []
    self.monitors['unlabeled_test_times'] = []

    max_iteration = (y.shape[0] - y.known.sum()) / self.batch_size + 1
    self.logger.info("Running Active Learning experiment for approximately "
                     + str(max_iteration) + " iterations")
    self.logger.info("Logging concept error every iteration")

    if self.n_label is None and self.n_iter is None:
        self.n_label = X.shape[0]

    self.logger.info("Warm start size: " + str(len(y.known_ids)))

    # 0 warm start
    labeled = len(y.known_ids)
    if len(y.known_ids) == 0:
        labeled = self._query_labels(X, X_info, y, model=None, rng=rng)
        self.logger.info("WARNING: Model performing random query, "
                         "because all labels are unknown")

    self.grid_seed = rng.randint(100)

    while True:
        # We assume that in the first iteration the first query is performed for us
        if self.monitors['iter'] != 0:
            labeled = self._query_labels(X, X_info, y, model=self.model, rng=rng)

        # Fit model parameters
        start = time.time()
        scorer = make_scorer(self.metrics[0])
        try:
            # Some if-ology to make sure we don't crash too often here
            if len(y.known_ids) < 10:
                n_folds = 2
            else:
                n_folds = self.n_folds
            seed = self.grid_seed + self.monitors['iter']
            grid = GridSearch(base_model_cls=self.base_model_cls,
                              param_grid=self.param_grid, seed=seed,
                              n_folds=n_folds, adaptive=self.adaptive_grid)
            self.model = grid.fit(X[y.known_ids], y[y.known_ids])
        except Exception, e:
            self.logger.error(y.known_ids)
            self.logger.error(X[y.known_ids].shape)
            self.logger.error("Failed to fit grid! Fitting random parameters!")
            self.logger.error(str(e))
            self.logger.error(traceback.format_exc())
            self.model = self.base_model_cls().fit(X[y.known_ids], y[y.known_ids])
        self.monitors['grid_times'].append(time.time() - start)

        self.monitors['n_already_labeled'].append(
            self.monitors['n_already_labeled'][-1] + labeled)
        self.monitors['iter'] += 1
        self.logger.info("Iter: %i, labeled %i/%i"
                         % (self.monitors['iter'],
                            self.monitors['n_already_labeled'][-1], self.n_label))

        # Test on supplied datasets
        start = time.time()
        for reported_name, D in test_error_datasets:
            if len(D) > 2 and isinstance(D, list):
                X_test = X[D]
                y_test = y[D]
            elif len(D) == 2:
                X_test = D[0]
                y_test = D[1]
            else:
                raise ValueError("Incorrect format of test_error_datasets")
            pred = self.model.predict(X_test)
            for metric in self.metrics:
                self.monitors[metric.__name__ + "_" + reported_name].append(
                    metric(y_test, pred))
        self.monitors['concept_test_times'].append(time.time() - start)

        # Test on the remaining training data
        if self.n_label - self.monitors['n_already_labeled'][-1] > 0:
            start = time.time()
            pred = self.model.predict(X[np.invert(y.known)])
            self.monitors['unlabeled_test_times'].append(time.time() - start)
            for metric in self.metrics:
                self.monitors[metric.__name__ + "_unlabeled"].append(
                    metric(y.peek(), pred))

        # Check stopping criteria
        if self.n_iter is not None:
            if self.monitors['iter'] == self.n_iter:
                break
        elif self.n_label - self.monitors['n_already_labeled'][-1] == 0:
            break
        elif self.n_label - self.monitors['n_already_labeled'][-1] < self.batch_size:
            self.batch_size = self.n_label - self.monitors['n_already_labeled'][-1]
            self.logger.debug("Decreasing batch size to: %i" % self.batch_size)
            assert self.batch_size >= 0
def test_bad_index_single(self):
    oy = ObstructedY(self.y)
    oy[666]

def test_bad_index_slice(self):
    oy = ObstructedY(self.y)
    oy[666:777]

def test_bad_index_list(self):
    oy = ObstructedY(self.y)
    oy[[666, 777]]
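# The tests above pin down the ObstructedY interface fairly tightly. A minimal
# sketch consistent with them (the real implementation may differ): labels start
# hidden, query() reveals and returns them, peek() exposes the hidden ground truth
# for evaluation, and __getitem__ refuses indices that were never queried.
class ObstructedYSketch(object):
    def __init__(self, y):
        self._y = np.asarray(y)
        self.known = np.zeros(self._y.shape[0], dtype=bool)

    @property
    def shape(self):
        return self._y.shape

    @property
    def known_ids(self):
        return np.where(self.known)[0]

    @property
    def unknown_ids(self):
        return np.where(~self.known)[0]

    def query(self, ids):
        # reveal the requested labels and return them
        self.known[ids] = True
        return self._y[ids]

    def peek(self):
        # ground-truth labels of the still-hidden examples (evaluation only)
        return self._y[~self.known]

    def __getitem__(self, ids):
        if not np.all(self.known[ids]):
            raise KeyError("Accessing a label that was not queried")
        return self._y[ids]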