def test_score_calculation_specific(self):
    model_output = [[0.7, 0.2, 0.1], [0.4, 0.5, 0.1]]
    model_output = np.array(model_output)
    scorer = ScorerClassification(model_output)
    scores = scorer.calculate_scores()
    self.assertListEqual(
        list(scores["uncertainty_least_confidence"]),
        [(1 - 0.7) / (1 - 1. / 3.), (1 - 0.5) / (1 - 1. / 3.)])
    self.assertListEqual(
        list(scores["uncertainty_margin"]),
        [1 - (0.7 - 0.2), 1 - (0.5 - 0.4)])
    for val1, val2 in zip(scores["uncertainty_entropy"],
                          _entropy(model_output) / np.log2(3)):
        self.assertAlmostEqual(val1, val2, places=8)
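# The `_entropy` helper used by the tests above and below is not shown in this
# section. A minimal sketch consistent with how the tests use it (row-wise
# Shannon entropy in bits, so that dividing by np.log2(n_classes) normalizes
# scores to [0, 1]) could look like this, assuming numpy is imported as np at
# module level; the actual implementation in the test module may differ.
def _entropy(probabilities: np.ndarray) -> np.ndarray:
    """Computes the Shannon entropy (base 2) of each row of probabilities."""
    # Clip to avoid log2(0); rows are assumed to be normalized distributions.
    probs = np.clip(probabilities, 1e-12, 1.0)
    return -np.sum(probs * np.log2(probs), axis=1)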
def test_scorer_classification_variable_model_output_tensor_order(self):
    for tensor_order in range(1, 5):
        model_output = np.ndarray((3, ) * tensor_order)
        with self.subTest(
                msg=f"model_output.shape = {model_output.shape}"):
            # Only 2-dimensional model outputs (samples x classes) are valid.
            if tensor_order == 2:
                scorer = ScorerClassification(model_output=model_output)
                scores = scorer.calculate_scores()
                for score_values in scores.values():
                    self.assertEqual(type(score_values), np.ndarray)
            else:
                with self.assertRaises(ValueError):
                    scorer = ScorerClassification(
                        model_output=model_output)
def test_score_calculation_binary(self):
    model_output = [[0.7], [0.4]]
    model_output = np.array(model_output)
    scorer = ScorerClassification(model_output)
    scores = scorer.calculate_scores()
    self.assertListEqual(
        list(scores["uncertainty_least_confidence"]),
        [(1 - 0.7) / (1 - 1. / 2.), (1 - 0.6) / (1 - 1. / 2.)])
    self.assertListEqual(
        list(scores["uncertainty_margin"]),
        [1 - (0.7 - 0.3), 1 - (0.6 - 0.4)])
    # For the entropy check, expand the single-column binary output to the
    # equivalent two-column distribution [p, 1 - p].
    model_output = np.concatenate([model_output, 1 - model_output], axis=1)
    for val1, val2 in zip(scores["uncertainty_entropy"],
                          _entropy(model_output) / np.log2(2)):
        self.assertAlmostEqual(val1, val2, places=8)
def test_agent(self):
    self.api_workflow_client.embedding_id = "embedding_id_xyz"

    agent_0 = ActiveLearningAgent(self.api_workflow_client)
    agent_1 = ActiveLearningAgent(self.api_workflow_client,
                                  query_tag_name="query_tag_name_xyz")
    agent_2 = ActiveLearningAgent(
        self.api_workflow_client,
        query_tag_name="query_tag_name_xyz",
        preselected_tag_name="preselected_tag_name_xyz")
    agent_3 = ActiveLearningAgent(
        self.api_workflow_client,
        preselected_tag_name="preselected_tag_name_xyz")

    for method in [
            SamplingMethod.CORAL, SamplingMethod.CORESET,
            SamplingMethod.RANDOM
    ]:
        for agent in [agent_0, agent_1, agent_2, agent_3]:
            for batch_size in [2, 6]:
                n_old_labeled = len(agent.labeled_set)
                n_old_unlabeled = len(agent.unlabeled_set)
                n_samples = len(agent.labeled_set) + batch_size

                # CORAL needs a non-empty labeled set; fall back to
                # CORESET for the very first query.
                if method == SamplingMethod.CORAL and len(
                        agent.labeled_set) == 0:
                    sampler_config = SamplerConfig(
                        n_samples=n_samples, method=SamplingMethod.CORESET)
                else:
                    sampler_config = SamplerConfig(n_samples=n_samples,
                                                   method=method)

                if sampler_config.method == SamplingMethod.CORAL:
                    predictions = np.random.rand(
                        len(agent.unlabeled_set), 10).astype(np.float32)
                    predictions_normalized = predictions / np.sum(
                        predictions, axis=1)[:, np.newaxis]
                    al_scorer = ScorerClassification(predictions_normalized)
                    labeled_set, added_set = agent.query(
                        sampler_config=sampler_config, al_scorer=al_scorer)
                else:
                    labeled_set, added_set = agent.query(
                        sampler_config=sampler_config)

                self.assertEqual(n_old_labeled + len(added_set),
                                 len(labeled_set))
                self.assertTrue(set(added_set).issubset(labeled_set))
                self.assertEqual(
                    len(set(agent.labeled_set) & set(agent.unlabeled_set)),
                    0)
                self.assertEqual(n_old_unlabeled - len(added_set),
                                 len(agent.unlabeled_set))
def t_est_active_learning(api_workflow_client: ApiWorkflowClient,
                          method: SamplingMethod = SamplingMethod.CORAL,
                          query_tag_name: str = 'initial-tag',
                          preselected_tag_name: str = None,
                          n_samples_additional: List[int] = [2, 5]):
    # Create the tags with 100 and 10 samples, respectively, if they do not
    # exist yet.
    if query_tag_name is not None:
        sampler_config = SamplerConfig(method=SamplingMethod.RANDOM,
                                       n_samples=100,
                                       name=query_tag_name)
        try:
            api_workflow_client.sampling(sampler_config=sampler_config)
        except RuntimeError:
            pass
    if preselected_tag_name is not None:
        sampler_config = SamplerConfig(method=SamplingMethod.RANDOM,
                                       n_samples=10,
                                       name=preselected_tag_name)
        try:
            api_workflow_client.sampling(sampler_config=sampler_config)
        except RuntimeError:
            pass

    # Define the active learning agent.
    agent = ActiveLearningAgent(api_workflow_client,
                                query_tag_name=query_tag_name,
                                preselected_tag_name=preselected_tag_name)
    total_no_samples = len(agent.unlabeled_set) + len(agent.labeled_set)

    al_scorer = None
    for iteration, n_additional in enumerate(n_samples_additional):
        n_samples = len(agent.labeled_set) + n_additional
        print(f"Beginning with iteration {iteration} "
              f"to have {n_samples} labeled samples.")

        # Perform a sampling. CORAL needs scores, so the first iteration
        # falls back to CORESET.
        method_here = SamplingMethod.CORESET \
            if iteration == 0 and method == SamplingMethod.CORAL else method
        sampler_config = SamplerConfig(method=method_here,
                                       n_samples=n_samples)
        if al_scorer is None:
            agent.query(sampler_config=sampler_config)
        else:
            agent.query(sampler_config=sampler_config, al_scorer=al_scorer)

        assert len(agent.labeled_set) == n_samples
        assert len(agent.unlabeled_set) == total_no_samples - n_samples

        # Update the scorer with fresh (random) predictions over the
        # query set.
        n_predictions = len(agent.query_set)
        n_classes = 10
        predictions = np.random.rand(n_predictions, n_classes)
        predictions_normalized = predictions / np.sum(
            predictions, axis=1)[:, np.newaxis]
        al_scorer = ScorerClassification(model_output=predictions_normalized)

    print("Success!")
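# A hypothetical way to run the helper above against a real dataset. The token
# and dataset_id are placeholders, and the exact ApiWorkflowClient constructor
# arguments should be checked against the library version in use.
if __name__ == "__main__":
    api_workflow_client = ApiWorkflowClient(token="YOUR_TOKEN",
                                            dataset_id="YOUR_DATASET_ID")
    t_est_active_learning(api_workflow_client,
                          method=SamplingMethod.CORAL,
                          preselected_tag_name='preselected-tag')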
def test_score_calculation(self):
    n_samples = 100
    n_classes = 10
    predictions = np.random.rand(n_samples, n_classes)
    predictions_normalized = predictions / np.sum(
        predictions, axis=1)[:, np.newaxis]
    model_output = predictions_normalized
    scorer = ScorerClassification(model_output)
    scores = scorer.calculate_scores()

    scores_entropy = scores["uncertainty_entropy"]
    scores_margin = scores["uncertainty_margin"]
    assert scores_entropy.shape == (n_samples, )
    assert scores_margin.shape == (n_samples, )
    assert all(scores_entropy >= 0)
    assert all(scores_margin >= 0)
    assert all(scores_margin <= 1)
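# For reference, a minimal sketch of how the three uncertainty scores checked
# in these tests can be derived from a (n_samples, n_classes) array of
# normalized predictions. This mirrors the standard active learning
# definitions and the `_entropy` sketch above; it is NOT the library's actual
# implementation, and the hypothetical helper assumes n_classes >= 2.
def _sketch_uncertainty_scores(model_output: np.ndarray) -> dict:
    n_classes = model_output.shape[1]
    # Least confidence, rescaled so a uniform prediction scores 1.
    max_probs = np.max(model_output, axis=1)
    least_confidence = (1 - max_probs) / (1 - 1. / n_classes)
    # Margin between the two most likely classes, mapped to [0, 1].
    sorted_probs = np.sort(model_output, axis=1)
    margin = 1 - (sorted_probs[:, -1] - sorted_probs[:, -2])
    # Entropy in bits, normalized by the maximum entropy log2(n_classes).
    entropy = _entropy(model_output) / np.log2(n_classes)
    return {
        "uncertainty_least_confidence": least_confidence,
        "uncertainty_margin": margin,
        "uncertainty_entropy": entropy,
    }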
def test_score_calculation_random(self):
    n_samples = 10000
    n_classes = 10
    np.random.seed(42)
    predictions = np.random.rand(n_samples, n_classes)
    predictions_normalized = predictions / np.sum(
        predictions, axis=1)[:, np.newaxis]
    model_output = predictions_normalized
    scorer = ScorerClassification(model_output)
    scores = scorer.calculate_scores()

    self.assertEqual(set(scores.keys()),
                     set(ScorerClassification.score_names()))
    for score_name, score in scores.items():
        self.assertEqual(score.shape, (n_samples, ))
        self.assertTrue(all(score >= 0))
        self.assertTrue(all(score <= 1))
        self.assertEqual(type(score), np.ndarray)
def test_scorer_classification_variable_model_output_dimension(self):
    for num_samples in range(5):
        for num_classes in range(5):
            with self.subTest(
                    msg=f"model_output.shape = ({num_samples},{num_classes})"):
                # Uniform predictions so that each row sums to 1.
                if num_classes > 0:
                    preds = [1. / num_classes] * num_classes
                else:
                    preds = []
                model_output = [preds] * num_samples

                if num_classes == 0 and num_samples > 0:
                    with self.assertRaises(ValueError):
                        scorer = ScorerClassification(
                            model_output=model_output)
                else:
                    scorer = ScorerClassification(
                        model_output=model_output)
                    scores = scorer.calculate_scores()
                    self.assertEqual(
                        set(scores.keys()),
                        set(ScorerClassification.score_names()))
                    for score_values in scores.values():
                        self.assertEqual(len(score_values),
                                         len(model_output))
                        self.assertEqual(type(score_values), np.ndarray)
def test_agent_only_upload_scores(self):
    self.api_workflow_client.embedding_id = "embedding_id_xyz"
    agent = ActiveLearningAgent(
        self.api_workflow_client,
        preselected_tag_name="preselected_tag_name_xyz",
    )
    n_predictions = len(agent.query_set)
    predictions = np.random.rand(n_predictions, 10).astype(np.float32)
    predictions_normalized = predictions / np.sum(
        predictions, axis=1)[:, np.newaxis]
    al_scorer = ScorerClassification(predictions_normalized)
    agent.upload_scores(al_scorer)
def test_agent_wrong_scores(self):
    self.api_workflow_client.embedding_id = "embedding_id_xyz"
    agent = ActiveLearningAgent(
        self.api_workflow_client,
        preselected_tag_name="preselected_tag_name_xyz")
    method = SamplingMethod.CORAL
    n_samples = len(agent.labeled_set) + 2

    # The -3 should cause an error: the number of predictions must match
    # the number of unlabeled samples.
    n_predictions = len(agent.unlabeled_set) - 3
    predictions = np.random.rand(n_predictions, 10).astype(np.float32)
    predictions_normalized = predictions / np.sum(
        predictions, axis=1)[:, np.newaxis]
    al_scorer = ScorerClassification(predictions_normalized)

    sampler_config = SamplerConfig(n_samples=n_samples, method=method)
    with self.assertRaises(ValueError):
        labeled_set, added_set = agent.query(
            sampler_config=sampler_config, al_scorer=al_scorer)
def test_agent_without_embedding_id(self):
    agent = ActiveLearningAgent(
        self.api_workflow_client,
        preselected_tag_name="preselected_tag_name_xyz")
    method = SamplingMethod.CORAL
    n_samples = len(agent.labeled_set) + 2
    n_predictions = len(agent.query_set)
    predictions = np.random.rand(n_predictions, 10).astype(np.float32)
    predictions_normalized = predictions / np.sum(
        predictions, axis=1)[:, np.newaxis]
    al_scorer = ScorerClassification(predictions_normalized)
    sampler_config = SamplerConfig(n_samples=n_samples, method=method)
    agent.query(sampler_config=sampler_config, al_scorer=al_scorer)
def test_agent(self):
    self.api_workflow_client.embedding_id = "embedding_id_xyz"

    agent_0 = ActiveLearningAgent(self.api_workflow_client)
    agent_1 = ActiveLearningAgent(self.api_workflow_client,
                                  query_tag_name="query_tag_name_xyz")
    agent_2 = ActiveLearningAgent(
        self.api_workflow_client,
        query_tag_name="query_tag_name_xyz",
        preselected_tag_name="preselected_tag_name_xyz")
    agent_3 = ActiveLearningAgent(
        self.api_workflow_client,
        preselected_tag_name="preselected_tag_name_xyz")

    for method in [
            SamplingMethod.CORAL, SamplingMethod.CORESET,
            SamplingMethod.RANDOM
    ]:
        for agent in [agent_0, agent_1, agent_2, agent_3]:
            for batch_size in [2, 6]:
                n_samples = len(agent.labeled_set) + batch_size
                if method == SamplingMethod.CORAL and len(
                        agent.labeled_set) > 0:
                    sampler_config = SamplerConfig(
                        n_samples=n_samples, method=SamplingMethod.CORESET)
                else:
                    sampler_config = SamplerConfig(n_samples=n_samples,
                                                   method=method)

                if sampler_config.method == SamplingMethod.CORESET:
                    predictions = np.random.rand(
                        len(agent.unlabeled_set), 10)
                    predictions_normalized = predictions / np.sum(
                        predictions, axis=1)[:, np.newaxis]
                    al_scorer = ScorerClassification(predictions_normalized)
                    chosen_filenames = agent.query(
                        sampler_config=sampler_config, al_scorer=al_scorer)
                else:
                    sampler_config = SamplerConfig(n_samples=n_samples)
                    chosen_filenames = agent.query(
                        sampler_config=sampler_config)
def test_scorer_classification_empty_model_output(self):
    scorer = ScorerClassification(model_output=[])
    scores = scorer.calculate_scores()
    self.assertEqual(set(scores.keys()),
                     set(ScorerClassification.score_names()))
print(f"There are {len(agent.labeled_set)} samples in the labeled set.") # %% # 2. Train a classifier on the labeled set. labeled_set_features = dataset.get_features(agent.labeled_set) labeled_set_labels = dataset.get_labels(agent.labeled_set) classifier.fit(X=labeled_set_features, y=labeled_set_labels) # %% # 3. Use the classifier to predict on the query set. query_set_features = dataset.get_features(agent.query_set) predictions = classifier.predict_proba(X=query_set_features) # %% # 4. Calculate active learning scores from the prediction. active_learning_scorer = ScorerClassification(model_output=predictions) # %% # 5. Use an active learning agent to choose the next samples to be labeled based on the active learning scores. # We want to sample another 100 samples to have 300 samples in total and use the active learning sampler CORAL for it. sampler_config = SamplerConfig(n_samples=300, method=SamplingMethod.CORAL, name='al-iteration-1') agent.query(sampler_config=sampler_config, al_scorer=active_learning_scorer) print(f"There are {len(agent.labeled_set)} samples in the labeled set.") # %% # 6. Update the labeled set to include the newly chosen samples and remove them from the unlabeled set. # This is already done internally inside the ActiveLearningAgent - no work for you :) # %%