def test_accuracy(self):
    """Run a hold-out active-learning experiment scored with ``Accuracy``.

    The experiment is built on the classification fixtures with
    self-partitioning enabled (``test_ratio=0.3``,
    ``initial_label_rate=0.05``) and a ``MaxIteration(value=10)`` stopping
    criterion. The elapsed wall-clock time is printed and the learning
    curves are plotted.
    """
    metrics = [Accuracy()]
    labels = self.__y_class

    experiment = HoldOutExperiment(
        client=self.__client,
        X=self.__X_class,
        Y=labels,
        scenario_type=PoolBasedSamplingScenario,
        ml_technique=self.__ml_technique_class,
        performance_metrics=metrics,
        query_strategy=self.__query_strategy,
        oracle=SimulatedOracle(labels=labels),
        stopping_criteria=MaxIteration(value=10),
        self_partition=True,
        test_ratio=0.3,
        initial_label_rate=0.05,
        all_class=True,
    )

    # Time the evaluation only, not the experiment construction.
    started_at = time.time()
    outcome = experiment.evaluate(client=self.__client, verbose=True)
    print()
    elapsed = time.time() - started_at
    print("---Active Learning experiment %s seconds ---" % elapsed)

    metric_names = [m.metric_name for m in metrics]
    analyser = ExperimentAnalyserFactory.experiment_analyser(
        performance_metrics=metric_names,
        method_name=self.__query_strategy.query_function_name,
        method_results=outcome,
        type="queries",
    )

    # Plot the learning curves recorded during the experiment.
    analyser.plot_learning_curves(title='Active Learning experiment results')
def test_fifteen_iteration(self):
    """Run 15 iterations of ``QueryRegressionStd`` and plot the fitted regressor.

    The experiment uses the pre-computed train/test/label/unlabel index
    fixtures (``self_partition=False``) and no client. After evaluation the
    regressor of the first result is used to plot the prediction with a
    +/- one standard deviation band over ``[0, 20]``, together with the
    data points.
    """
    labels = self.__y

    experiment = HoldOutExperiment(
        client=None,
        X=self.__X,
        Y=labels,
        scenario_type=PoolBasedSamplingScenario,
        train_idx=self.__train_idx,
        test_idx=self.__test_idx,
        label_idx=self.__label_idx,
        unlabel_idx=self.__unlabel_idx,
        ml_technique=self.__ml_technique,
        performance_metrics=[Mse(squared=True)],
        query_strategy=QueryRegressionStd(),
        oracle=SimulatedOracle(labels=labels),
        stopping_criteria=MaxIteration(15),
        self_partition=False,
    )

    result = experiment.evaluate(verbose=False)
    regressor = result[0].ml_technique

    # Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*'
    # and later releases dropped the old names; prefer the new name and
    # fall back to the legacy one on older installations.
    style_name = 'seaborn-v0_8-white'
    if style_name not in plt.style.available:
        style_name = 'seaborn-white'

    # Plot the estimation produced by the fitted regressor.
    with plt.style.context(style_name):
        plt.figure(figsize=(14, 7))
        x = np.linspace(0, 20, 1000)
        pred, std = regressor.predict(x.reshape(-1, 1), return_std=True)
        mean = pred.reshape(-1, )
        plt.plot(x, pred)
        plt.fill_between(x, mean - std, mean + std, alpha=0.2)
        plt.scatter(self.__X, self.__y, c='k')
        plt.title('Initial estimation')
        plt.show()
def test_kullback_leibler_divergence(self):
    """Run a hold-out experiment with ``QueryKullbackLeiblerDivergence``.

    The query strategy is created with ``n_jobs=5``; the experiment
    self-partitions the data (``test_ratio=0.3``,
    ``initial_label_rate=0.05``) and stops after ``MaxIteration(5)``. The
    elapsed wall-clock time is printed and the learning curves are plotted.
    """
    strategy = QueryKullbackLeiblerDivergence(n_jobs=5)
    metrics = self.__performance_metrics
    labels = self.__y

    # Build the active-learning experiment.
    experiment = HoldOutExperiment(
        client=self.__client,
        X=self.__X,
        Y=labels,
        scenario_type=PoolBasedSamplingScenario,
        ml_technique=self.__ml_technique,
        performance_metrics=metrics,
        query_strategy=strategy,
        oracle=SimulatedOracle(labels=labels),
        stopping_criteria=MaxIteration(5),
        self_partition=True,
        test_ratio=0.3,
        initial_label_rate=0.05,
        all_class=True,
    )

    started_at = time.time()
    outcome = experiment.evaluate(client=self.__client, verbose=True)
    print()
    elapsed = time.time() - started_at
    print("---Active Learning experiment %s seconds ---" % elapsed)

    metric_names = [m.metric_name for m in self.__performance_metrics]
    analyser = ExperimentAnalyserFactory.experiment_analyser(
        performance_metrics=metric_names,
        method_name=strategy.query_function_name,
        method_results=outcome,
        type="queries",
    )

    # Plot the learning curves recorded during the experiment.
    analyser.plot_learning_curves(title='Active Learning experiment results')
def test_query_regression_std_batch_size(self):
    """Run ``QueryRegressionStd`` with a batch size on a noisy sine data set.

    The data are 100 distinct points drawn from a 1000-point grid on
    ``[0, 20]`` with a sine target plus Gaussian noise (``scale=0.3``).
    The partition is produced up-front with ``split`` and handed to the
    experiment (``self_partition=False``), which stops on
    ``PercentOfUnlabel(value=70)`` and queries ``self.__batch_size``
    instances at a time. The regressor of the first result is then plotted
    with a +/- one standard deviation band.
    """
    # Get the data
    X = np.random.choice(np.linspace(0, 20, 1000), size=100, replace=False).reshape(-1, 1)
    y = np.sin(X) + np.random.normal(scale=0.3, size=X.shape)

    # Assemble the initial train/test and labelled/unlabelled partition.
    train_idx, test_idx, label_idx, unlabel_idx = split(
        X=X,
        y=y,
        test_ratio=0.3,
        initial_label_rate=0.05,
        split_count=1,
        all_class=True,
    )

    # Kernel for the Gaussian process: RBF plus a white-noise term.
    kernel = (
        RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3))
        + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e+1))
    )
    ml_technique = GaussianProcessRegressor(kernel=kernel)

    experiment = HoldOutExperiment(
        client=self.__client,
        X=X,
        Y=y,
        scenario_type=PoolBasedSamplingScenario,
        train_idx=train_idx,
        test_idx=test_idx,
        label_idx=label_idx,
        unlabel_idx=unlabel_idx,
        ml_technique=ml_technique,
        performance_metrics=[Mse(squared=True)],
        query_strategy=QueryRegressionStd(),
        oracle=SimulatedOracle(labels=y),
        stopping_criteria=PercentOfUnlabel(value=70),
        self_partition=False,
        batch_size=self.__batch_size,
    )

    result = experiment.evaluate(verbose=True)
    regressor = result[0].ml_technique

    # Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*'
    # and later releases dropped the old names; prefer the new name and
    # fall back to the legacy one on older installations.
    style_name = 'seaborn-v0_8-white'
    if style_name not in plt.style.available:
        style_name = 'seaborn-white'

    # Plot the estimation produced by the fitted regressor.
    with plt.style.context(style_name):
        plt.figure(figsize=(14, 7))
        x = np.linspace(0, 20, 1000)
        pred, std = regressor.predict(x.reshape(-1, 1), return_std=True)
        mean = pred.reshape(-1, )
        plt.plot(x, pred)
        plt.fill_between(x, mean - std, mean + std, alpha=0.2)
        plt.scatter(X, y, c='k')
        plt.title('Initial estimation')
        plt.show()
def test_mse(self):
    """Run a hold-out regression experiment scored with ``Mse(squared=False)``.

    The experiment self-partitions the regression fixtures
    (``test_ratio=0.3``, ``initial_label_rate=0.05``) and stops after
    ``MaxIteration(value=20)``. The elapsed time is printed, the learning
    curves are plotted, and finally the regressor is plotted with a
    +/- one standard deviation band over ``[0, 20]``.
    """
    performance_metrics = [Mse(squared=False)]
    labels = self.__y_reg

    experiment = HoldOutExperiment(
        client=self.__client,
        X=self.__X_reg,
        Y=labels,
        scenario_type=PoolBasedSamplingScenario,
        ml_technique=self.__ml_technique_reg,
        performance_metrics=performance_metrics,
        query_strategy=self.__query_strategy,
        oracle=SimulatedOracle(labels=labels),
        stopping_criteria=MaxIteration(value=20),
        self_partition=True,
        test_ratio=0.3,
        initial_label_rate=0.05,
        all_class=True,
    )

    start_time = time.time()
    result = experiment.evaluate(client=self.__client, verbose=True)
    print()
    print("---Active Learning experiment %s seconds ---" % (time.time() - start_time))

    query_analyser = ExperimentAnalyserFactory.experiment_analyser(
        performance_metrics=[metric.metric_name for metric in performance_metrics],
        method_name=self.__query_strategy.query_function_name,
        method_results=result,
        type="queries",
    )

    # Plot the learning curves recorded during the experiment.
    query_analyser.plot_learning_curves(title='Active Learning experiment results')

    # NOTE(review): the experiment is evaluated a second time here, this
    # time without a client, and the regressor is taken from that run.
    # Confirm whether the second run is intended or whether the first
    # ``result`` could be reused.
    result = experiment.evaluate(verbose=True)
    regressor = result[0].ml_technique

    # Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*'
    # and later releases dropped the old names; prefer the new name and
    # fall back to the legacy one on older installations.
    style_name = 'seaborn-v0_8-white'
    if style_name not in plt.style.available:
        style_name = 'seaborn-white'

    # Plot the estimation produced by the fitted regressor.
    with plt.style.context(style_name):
        plt.figure(figsize=(14, 7))
        x = np.linspace(0, 20, 1000)
        pred, std = regressor.predict(x.reshape(-1, 1), return_std=True)
        mean = pred.reshape(-1, )
        plt.plot(x, pred)
        plt.fill_between(x, mean - std, mean + std, alpha=0.2)
        plt.scatter(self.__X_reg, self.__y_reg, c='k')
        plt.title('Initial estimation')
        plt.show()
def test_hold_out_marginSamplingQuery_unlabelSetEmpty(self):
    """Run margin sampling until the unlabelled set is empty, then predict.

    A ``LogisticRegression`` model is trained through a self-partitioned
    hold-out experiment (``test_ratio=0.3``, ``initial_label_rate=0.05``,
    ``all_class=False``) that stops on ``UnlabelSetEmpty``. The learning
    curves are plotted and the model of the first result predicts the last
    ten rows of a seeded random permutation of the data.
    """
    model = LogisticRegression()
    stop_rule = UnlabelSetEmpty()
    strategy = QueryMarginSampling()
    metrics = [Accuracy(), F1(average='weighted'), HammingLoss()]

    # Build the active-learning experiment.
    experiment = HoldOutExperiment(
        client=self.__client,
        X=self.__X,
        Y=self.__y,
        scenario_type=PoolBasedSamplingScenario,
        ml_technique=model,
        performance_metrics=metrics,
        query_strategy=strategy,
        oracle=SimulatedOracle(labels=self.__y),
        stopping_criteria=stop_rule,
        self_partition=True,
        test_ratio=0.3,
        initial_label_rate=0.05,
        all_class=False,
    )

    outcome = experiment.evaluate(client=self.__client, verbose=True)

    metric_names = [m.metric_name for m in metrics]
    analyser = ExperimentAnalyserFactory.experiment_analyser(
        performance_metrics=metric_names,
        method_name=strategy.query_function_name,
        method_results=outcome,
        type="queries",
    )

    # Plot the learning curves recorded during the experiment.
    analyser.plot_learning_curves(title='Active Learning experiment results')

    # Seed the generator so the same ten sample rows are picked every run.
    np.random.seed(0)
    shuffled = np.random.permutation(len(self.__X))
    sample_rows = self.__X[shuffled[-10:]]
    fitted_model = outcome[0].ml_technique
    print(fitted_model.predict(sample_rows))
def test_hold_out_randomQuery_unlabelSetEmpty_ConsoleHumanOracle(self):
    """Run random querying with a ``ConsoleHumanOracle`` for five iterations.

    A ``LogisticRegression(solver='sag')`` model is trained through a
    self-partitioned hold-out experiment (``test_ratio=0.3``,
    ``initial_label_rate=0.05``). Despite the method name, the stopping
    criterion used here is ``MaxIteration(5)``. The elapsed time is printed
    and the learning curves are plotted.
    """
    model = LogisticRegression(solver='sag')
    stop_rule = MaxIteration(5)
    strategy = QueryInstanceRandom()
    metrics = [Accuracy(), F1(average='weighted'), HammingLoss()]

    # Build the active-learning experiment.
    experiment = HoldOutExperiment(
        client=self.__client,
        X=self.__X,
        Y=self.__y,
        scenario_type=PoolBasedSamplingScenario,
        ml_technique=model,
        performance_metrics=metrics,
        query_strategy=strategy,
        oracle=ConsoleHumanOracle(labels=self.__y),
        stopping_criteria=stop_rule,
        self_partition=True,
        test_ratio=0.3,
        initial_label_rate=0.05,
        all_class=True,
    )

    started_at = time.time()
    outcome = experiment.evaluate(client=self.__client, verbose=True)
    print()
    elapsed = time.time() - started_at
    print("---Active Learning experiment %s seconds ---" % elapsed)

    metric_names = [m.metric_name for m in metrics]
    analyser = ExperimentAnalyserFactory.experiment_analyser(
        performance_metrics=metric_names,
        method_name=strategy.query_function_name,
        method_results=outcome,
        type="queries",
    )

    # Plot the learning curves recorded during the experiment.
    analyser.plot_learning_curves(title='Active Learning experiment results')
def execute_experiment(self, num_iters, file_name):
    """Time ``num_iters`` hold-out experiments on freshly generated data.

    Each iteration generates a new classification data set, builds a
    ``HoldOutExperiment`` on it, times ``evaluate`` and hands the 1-based
    iteration number and elapsed seconds to ``self.dump_iteration``.

    :param num_iters: number of experiments to run.
    :param file_name: forwarded unchanged to ``self.dump_iteration``.
    """
    for iteration in range(1, num_iters + 1):
        # NOTE(review): ``chunks`` is a float here (instance count * 0.10);
        # confirm that the data generator accepts a non-integer value.
        X, y = make_classification(
            n_samples=self._instance_num,
            n_features=self._feature_num,
            n_informative=2 * self._label_num,
            n_redundant=self._label_num,
            n_repeated=0,
            n_classes=self._label_num,
            n_clusters_per_class=self._label_num,
            weights=None,
            flip_y=0.01,
            class_sep=1.0,
            hypercube=True,
            shift=0.0,
            scale=1.0,
            shuffle=True,
            random_state=None,
            chunks=self._instance_num * 0.10,
        )

        experiment = HoldOutExperiment(
            self.__client,
            X,
            y,
            scenario_type=PoolBasedSamplingScenario,
            ml_technique=self._ml_technique,
            performance_metrics=self._performance_metrics,
            query_strategy=self._query_strategy,
            oracle=SimulatedOracle(labels=y),
            stopping_criteria=MaxIteration(25),
            self_partition=True,
            test_ratio=0.3,
            initial_label_rate=0.05,
            all_class=True,
            batch_size=100,
            rebalance=True,
        )

        # Only the evaluation is timed; data generation and set-up are not.
        started_at = time.time()
        experiment.evaluate(client=self.__client, multithread=False, verbose=True)
        elapsed = time.time() - started_at

        record = {"iter": iteration, "time": elapsed}
        self.dump_iteration(file_name, record)
def test_hold_out_marginSamplingQuery_unlabelSetEmpty(self):
    """Run margin sampling on the pandas fixtures for up to 50 iterations.

    A ``LogisticRegression(solver='liblinear')`` model is trained through a
    self-partitioned hold-out experiment (``test_ratio=0.3``,
    ``initial_label_rate=0.05``, ``all_class=False``) without a client.
    Despite the method name, the stopping criterion used here is
    ``MaxIteration(50)``. The learning curves are plotted at the end.
    """
    ml_technique = LogisticRegression(solver='liblinear')
    stopping_criteria = MaxIteration(50)
    query_strategy = QueryMarginSampling()
    performance_metrics = [
        Accuracy(),
        F1(average='macro'),
        HammingLoss(),
        Precision(average='macro'),
        Recall(average='macro'),
    ]

    # Convert the fixtures once so the experiment and the oracle see the
    # same positional NumPy data. Passing the raw pandas object to the
    # oracle would make lookups depend on the pandas index instead of the
    # row position used by the experiment.
    features = self.__X.to_numpy()
    labels = self.__y.to_numpy()

    # Build the active-learning experiment.
    experiment = HoldOutExperiment(
        client=None,
        X=features,
        Y=labels,
        scenario_type=PoolBasedSamplingScenario,
        ml_technique=ml_technique,
        performance_metrics=performance_metrics,
        query_strategy=query_strategy,
        oracle=SimulatedOracle(labels=labels),
        stopping_criteria=stopping_criteria,
        self_partition=True,
        test_ratio=0.3,
        initial_label_rate=0.05,
        all_class=False,
    )

    result = experiment.evaluate(verbose=False)

    query_analyser = ExperimentAnalyserFactory.experiment_analyser(
        performance_metrics=[metric.metric_name for metric in performance_metrics],
        method_name=query_strategy.query_function_name,
        method_results=result,
        type="queries",
    )

    # Plot the learning curves recorded during the experiment.
    query_analyser.plot_learning_curves(title='Active Learning experiment results')
def test_hold_out_self_partitioning(self):
    """Check the partition produced by a self-partitioning ``HoldOutExperiment``.

    A 100-instance classification data set is generated and handed to the
    experiment with ``self_partition=True``. The test asserts that exactly
    one split is produced, that the test and initially labelled sets have
    the expected sizes, that train and test together cover every instance,
    and that labelled and unlabelled together make up the training set.
    """
    split_count = 1
    instance_num = 100
    test_ratio = 0.3
    initial_label_rate = 0.05

    self.__X, self.__y = make_classification(
        n_samples=instance_num,
        n_features=4,
        n_informative=2,
        n_redundant=2,
        n_repeated=0,
        n_classes=2,
        n_clusters_per_class=2,
        weights=None,
        flip_y=0.01,
        class_sep=1.0,
        hypercube=True,
        shift=0.0,
        scale=1.0,
        shuffle=True,
        random_state=None,
    )

    # Build the active-learning experiment. The data are passed by keyword:
    # the first positional parameter of HoldOutExperiment is the client, so
    # passing X and y positionally would bind them to the wrong parameters.
    experiment = HoldOutExperiment(
        client=None,
        X=self.__X,
        Y=self.__y,
        self_partition=True,
        stopping_criteria=UnlabelSetEmpty(),
        test_ratio=test_ratio,
        initial_label_rate=initial_label_rate,
        all_class=True,
    )

    assert len(experiment._train_idx) == split_count
    assert len(experiment._test_idx) == split_count
    assert len(experiment._label_idx) == split_count
    assert len(experiment._unlabel_idx) == split_count

    for i in range(split_count):
        train = set(experiment._train_idx[i])
        test = set(experiment._test_idx[i])
        lab = set(experiment._label_idx[i])
        unl = set(experiment._unlabel_idx[i])

        # Sizes follow the requested ratios.
        assert len(test) == round(test_ratio * instance_num)
        assert len(lab) == round(initial_label_rate * len(train))

        # Validity: the sets cover every instance and the whole training set.
        assert train.union(test) == set(range(instance_num))
        assert lab.union(unl) == train
def test_ActiveLearning_HoldHout(self):
    """Compare an active-learning run with a passive baseline on one fold.

    The active-learning part trains ``LogisticRegression(solver='sag')``
    through a ``HoldOutExperiment`` on the pre-computed index fixtures
    (``MaxIteration(10)``, ``batch_size=50``, ``rebalance=True``), plots the
    learning curves and prints accuracy, F1, Hamming loss, precision and
    recall on the test rows of fold 0. The passive part fits
    ``LogisticRegression(solver='liblinear')`` on all training rows of the
    same fold and prints the same metrics.
    """
    # INI the ALExperiment -------------------------------------------------
    al_ml_technique = LogisticRegression(solver='sag')
    stopping_criteria = MaxIteration(10)
    query_strategy = QueryMarginSampling()
    performance_metrics = [
        Accuracy(),
        F1(average='macro'),
        HammingLoss(),
        Precision(average='macro'),
        Recall(average='macro'),
    ]

    # The 'BAD' column is the target used throughout this test.
    target = self.__y['BAD']
    labels = target.to_numpy()

    experiment = HoldOutExperiment(
        client=self.__client,
        X=self.__X.to_numpy(),
        Y=labels,
        scenario_type=PoolBasedSamplingScenario,
        train_idx=self.__train_idx,
        test_idx=self.__test_idx,
        label_idx=self.__label_idx,
        unlabel_idx=self.__unlabel_idx,
        ml_technique=al_ml_technique,
        performance_metrics=performance_metrics,
        query_strategy=query_strategy,
        oracle=SimulatedOracle(labels=labels),
        stopping_criteria=stopping_criteria,
        self_partition=False,
        rebalance=True,
        batch_size=50,
    )

    print("")
    start_time = time.time()
    result = experiment.evaluate(verbose=True)
    print("---Active Learning experiment %s seconds ---" % (time.time() - start_time))

    query_analyser = ExperimentAnalyserFactory.experiment_analyser(
        performance_metrics=[metric.metric_name for metric in performance_metrics],
        method_name=query_strategy.query_function_name,
        method_results=result,
        type="queries",
    )

    # Plot the learning curves recorded during the experiment.
    query_analyser.plot_learning_curves(title='Active Learning experiment results')

    fold_index = 0
    train_rows = self.__train_idx[fold_index]
    test_rows = self.__test_idx[fold_index]

    # Use the same 1-D 'BAD' target the experiment was trained on, instead
    # of the whole label frame: the estimator's fit() expects y of shape
    # (n_samples,), and scoring against the frame would compare the
    # predictions with a different object than the one that was learned.
    train_x = self.__X.iloc[train_rows, :]
    train_y = target.iloc[train_rows]
    test_x = self.__X.iloc[test_rows, :]
    test_y = target.iloc[test_rows]

    active_y_pred = result[0].ml_technique.predict(test_x)

    print("Active Learning Accuracy score : ", accuracy_score(test_y, active_y_pred))
    print("Active Learning F1 score: ",
          f1_score(test_y, active_y_pred, average='macro', zero_division=0))
    print("Active Learning Hamming Loss", hamming_loss(test_y, active_y_pred))
    print("Active Learning Precision score : ",
          precision_score(test_y, active_y_pred, average='macro', zero_division=0))
    print("Active Learning Recall score : ",
          recall_score(test_y, active_y_pred, average='macro', zero_division=0))
    # END the ALExperiment -------------------------------------------------

    # INI the PLExperiment -------------------------------------------------
    pl_ml_technique = LogisticRegression(solver='liblinear')

    print("")
    start_time = time.time()
    pl_ml_technique.fit(train_x, train_y)
    print("---Passive Learning experiment %s seconds ---" % (time.time() - start_time))

    passive_y_pred = pl_ml_technique.predict(test_x)

    print("Pasive Learning Accuracy score : ", accuracy_score(test_y, passive_y_pred))
    print("Pasive Learning F1 score: ",
          f1_score(test_y, passive_y_pred, average='macro', zero_division=0))
    print("Pasive Learning Hamming Loss", hamming_loss(test_y, passive_y_pred))
    print("Pasive Learning Precision score : ",
          precision_score(test_y, passive_y_pred, average='macro', zero_division=0))
    print("Pasive Learning Recall score : ",
          recall_score(test_y, passive_y_pred, average='macro', zero_division=0))