def test_score(self):
    model = MultitaskClassifier([self.task1])
    metrics = model.score([self.dataloader])
    # deterministic random tie breaking alternates predicted labels
    self.assertEqual(metrics["task1/dataset/train/accuracy"], 0.4)

    # test dataframe format
    metrics_df = model.score([self.dataloader], as_dataframe=True)
    self.assertTrue(isinstance(metrics_df, pd.DataFrame))
    self.assertEqual(metrics_df.at[0, "score"], 0.4)
def test_score_shuffled(self):
    # Test scoring with a shuffled dataset
    set_seed(123)

    class SimpleVoter(nn.Module):
        def forward(self, x):
            """Vote for class 0 if x is even and class 1 otherwise"""
            mask = x % 2 == 0
            out = torch.zeros(x.shape[0], 2)
            out[mask, 0] = 1  # class 0
            out[~mask, 1] = 1  # class 1
            return out

    # Create model
    task_name = "VotingTask"
    module_name = "simple_voter"
    module_pool = nn.ModuleDict({module_name: SimpleVoter()})
    op0 = Operation(module_name=module_name, inputs=[("_input_", "data")], name="op0")
    op_sequence = [op0]
    task = Task(name=task_name, module_pool=module_pool, op_sequence=op_sequence)
    model = MultitaskClassifier([task])

    # Create dataset
    y_list = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
    x_list = [i for i in range(len(y_list))]
    Y = torch.LongTensor(y_list * 100)
    X = torch.FloatTensor(x_list * 100)
    dataset = DictDataset(
        name="dataset", split="train", X_dict={"data": X}, Y_dict={task_name: Y}
    )

    # Create dataloaders
    dataloader = DictDataLoader(dataset, batch_size=2, shuffle=False)
    scores = model.score([dataloader])

    self.assertEqual(scores["VotingTask/dataset/train/accuracy"], 0.6)

    # Scores should be identical with shuffling, since predictions are matched
    # to their corresponding gold labels
    dataloader_shuffled = DictDataLoader(dataset, batch_size=2, shuffle=True)
    scores_shuffled = model.score([dataloader_shuffled])

    self.assertEqual(scores_shuffled["VotingTask/dataset/train/accuracy"], 0.6)
def test_convergence(self): """Test slicing convergence with 1 slice task that represents ~25% of the data.""" dataloaders = [] for df, split in [(self.df_train, "train"), (self.df_valid, "valid")]: dataloader = create_dataloader(df, split) dataloaders.append(dataloader) base_task = create_task("task", module_suffixes=["A", "B"]) # Apply SFs slicing_functions = [h] # high coverage slice slice_names = [sf.name for sf in slicing_functions] applier = PandasSFApplier(slicing_functions) S_train = applier.apply(self.df_train, progress_bar=False) S_valid = applier.apply(self.df_valid, progress_bar=False) self.assertEqual(S_train.shape, (self.N_TRAIN, )) self.assertEqual(S_valid.shape, (self.N_VALID, )) self.assertIn("h", S_train.dtype.names) # Add slice labels add_slice_labels(dataloaders[0], base_task, S_train) add_slice_labels(dataloaders[1], base_task, S_valid) # Convert to slice tasks tasks = convert_to_slice_tasks(base_task, slice_names) model = MultitaskClassifier(tasks=tasks) # Train trainer = Trainer(lr=0.001, n_epochs=50, progress_bar=False) trainer.fit(model, dataloaders) scores = model.score(dataloaders) # Confirm near perfect scores self.assertGreater(scores["task/TestData/valid/accuracy"], 0.94) self.assertGreater(scores["task_slice:h_pred/TestData/valid/accuracy"], 0.94) self.assertGreater(scores["task_slice:h_ind/TestData/valid/f1"], 0.94) # Calculate/check train/val loss train_dataset = dataloaders[0].dataset train_loss_output = model.calculate_loss(train_dataset.X_dict, train_dataset.Y_dict) train_loss = train_loss_output[0]["task"].item() self.assertLess(train_loss, 0.1) val_dataset = dataloaders[1].dataset val_loss_output = model.calculate_loss(val_dataset.X_dict, val_dataset.Y_dict) val_loss = val_loss_output[0]["task"].item() self.assertLess(val_loss, 0.1)
def test_performance(self): """Test slicing performance with 2 corresponding slice tasks that represent roughly <10% of the data.""" dataloaders = [] for df, split in [(self.df_train, "train"), (self.df_valid, "valid")]: dataloader = create_dataloader(df, split) dataloaders.append(dataloader) base_task = create_task("task", module_suffixes=["A", "B"]) # Apply SFs slicing_functions = [f, g] # low-coverage slices slice_names = [sf.name for sf in slicing_functions] applier = PandasSFApplier(slicing_functions) S_train = applier.apply(self.df_train, progress_bar=False) S_valid = applier.apply(self.df_valid, progress_bar=False) # Add slice labels add_slice_labels(dataloaders[0], base_task, S_train) add_slice_labels(dataloaders[1], base_task, S_valid) # Convert to slice tasks tasks = convert_to_slice_tasks(base_task, slice_names) model = MultitaskClassifier(tasks=tasks) # Train # NOTE: Needs more epochs to convergence with more heads trainer = Trainer(lr=0.001, n_epochs=65, progress_bar=False) trainer.fit(model, dataloaders) scores = model.score(dataloaders) # Confirm reasonably high slice scores # Check train scores self.assertGreater(scores["task/TestData/train/f1"], 0.9) self.assertGreater(scores["task_slice:f_pred/TestData/train/f1"], 0.9) self.assertGreater(scores["task_slice:f_ind/TestData/train/f1"], 0.9) self.assertGreater(scores["task_slice:g_pred/TestData/train/f1"], 0.9) self.assertGreater(scores["task_slice:g_ind/TestData/train/f1"], 0.9) self.assertGreater(scores["task_slice:base_pred/TestData/train/f1"], 0.9) self.assertEqual(scores["task_slice:base_ind/TestData/train/f1"], 1.0) # Check valid scores self.assertGreater(scores["task/TestData/valid/f1"], 0.9) self.assertGreater(scores["task_slice:f_pred/TestData/valid/f1"], 0.9) self.assertGreater(scores["task_slice:f_ind/TestData/valid/f1"], 0.9) self.assertGreater(scores["task_slice:g_pred/TestData/valid/f1"], 0.9) self.assertGreater(scores["task_slice:g_ind/TestData/valid/f1"], 0.9) self.assertGreater(scores["task_slice:base_pred/TestData/valid/f1"], 0.9) # base_ind is trivial: all labels are positive self.assertEqual(scores["task_slice:base_ind/TestData/valid/f1"], 1.0)
def test_remapped_labels(self):
    # Test additional label keys in the Y_dict
    # Without remapping, the model should ignore them
    task_name = self.task1.name
    X = torch.FloatTensor([[i, i] for i in range(NUM_EXAMPLES)])
    Y = torch.ones(NUM_EXAMPLES).long()

    Y_dict = {task_name: Y, "other_task": Y}
    dataset = DictDataset(
        name="dataset", split="train", X_dict={"data": X}, Y_dict=Y_dict
    )
    dataloader = DictDataLoader(dataset, batch_size=BATCH_SIZE)

    model = MultitaskClassifier([self.task1])
    loss_dict, count_dict = model.calculate_loss(dataset.X_dict, dataset.Y_dict)
    self.assertIn("task1", loss_dict)

    # Test prediction and scoring without remapping
    results = model.predict(dataloader)
    self.assertIn("task1", results["golds"])
    self.assertNotIn("other_task", results["golds"])
    scores = model.score([dataloader])
    self.assertIn("task1/dataset/train/accuracy", scores)
    self.assertNotIn("other_task/dataset/train/accuracy", scores)

    # Test remapped label sets
    results = model.predict(dataloader, remap_labels={"other_task": task_name})
    self.assertIn("task1", results["golds"])
    self.assertIn("other_task", results["golds"])
    results = model.score([dataloader], remap_labels={"other_task": task_name})
    self.assertIn("task1/dataset/train/accuracy", results)
    self.assertIn("other_task/dataset/train/accuracy", results)
def test_convergence(self): """Test multitask classifier convergence with two tasks.""" dataloaders = [] for offset, task_name in zip([0.0, 0.25], ["task1", "task2"]): df = create_data(N_TRAIN, offset) dataloader = create_dataloader(df, "train", task_name) dataloaders.append(dataloader) for offset, task_name in zip([0.0, 0.25], ["task1", "task2"]): df = create_data(N_VALID, offset) dataloader = create_dataloader(df, "valid", task_name) dataloaders.append(dataloader) task1 = create_task("task1", module_suffixes=["A", "A"]) task2 = create_task("task2", module_suffixes=["A", "B"]) model = MultitaskClassifier(tasks=[task1, task2]) # Train trainer = Trainer(lr=0.001, n_epochs=10, progress_bar=False) trainer.fit(model, dataloaders) scores = model.score(dataloaders) # Confirm near perfect scores on both tasks for idx, task_name in enumerate(["task1", "task2"]): self.assertGreater(scores[f"{task_name}/TestData/valid/accuracy"], 0.95) # Calculate/check train/val loss train_dataset = dataloaders[idx].dataset train_loss_output = model.calculate_loss( train_dataset.X_dict, train_dataset.Y_dict ) train_loss = train_loss_output[0][task_name].item() self.assertLess(train_loss, 0.05) val_dataset = dataloaders[2 + idx].dataset val_loss_output = model.calculate_loss( val_dataset.X_dict, val_dataset.Y_dict ) val_loss = val_loss_output[0][task_name].item() self.assertLess(val_loss, 0.05)
    )

    # Add the task to the list of tasks
    tasks.append(task_object)

# Pass the list of tasks to MultitaskClassifier to create a model with an
# architecture set up for each task
model = MultitaskClassifier(tasks)

# Set up the trainer settings, i.e., how the model will train
trainer_config = {
    "progress_bar": True,
    "n_epochs": 2,
    "lr": 0.02,
    "logging": True,
    "log_writer": "json",
    "checkpointing": True,
}

# Create a Trainer object using the above settings
trainer = Trainer(**trainer_config)

# Train the model on the linked datasets using the above settings
trainer.fit(model, dataloaders)

# Write out the model's training statistics
trainer.log_writer.write_log("output_statistics.json")

# Score the model and print the results
model_scores = model.score(dataloaders)
print(model_scores)
# %%
from snorkel.classification import Trainer

trainer_config = {"progress_bar": False, "n_epochs": 10, "lr": 0.02}

trainer = Trainer(**trainer_config)
trainer.fit(model, dataloaders)

# %% [markdown]
# ### Evaluate model

# %% [markdown]
# After training, we can call the `model.score()` method to see the final performance of our trained model.

# %%
model.score(dataloaders)

# %% [markdown]
# Task-specific metrics are recorded in the form `task/dataset/split/metric` corresponding to the task that made the predictions, the dataset the predictions were made on, the split being evaluated, and the metric being calculated (an illustrative cell at the end of this section shows reading these keys directly).
#
# For model-wide metrics (such as the total loss over all tasks or the learning rate), the default task name is `model` and the dataset name is `all` (e.g. `model/all/train/loss`).

# %% [markdown]
# # Your Turn

# %% [markdown]
# To check your understanding of how to use the multi-task `MultitaskClassifier`, see if you can add a task to this multi-task model.
#
# We'll generate the data for you (again, with a train, valid, and test split).
# Let's call it the `inv_circle_task`, since it will have the same distribution as our circle data, but with the inverted (flipped) labels.
# Intuitively, a model that is very good at telling whether a point is within a certain region should also be very good at telling if it's outside the region.
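# %% [markdown]
# *(Illustration referenced in the metrics discussion above.)* `model.score()` returns a plain
# dictionary keyed as `task/dataset/split/metric`, so individual metrics can be looked up
# directly. The task and dataset names below are assumptions made for illustration; substitute
# the names that appear in the output of the scoring cell above.

# %%
scores = model.score(dataloaders)

# Hypothetical key -- replace "circle_task" and "TestData" with the actual task/dataset names
key = "circle_task/TestData/valid/accuracy"
if key in scores:
    print(f"{key} = {scores[key]:.3f}")

# Model-wide metrics logged during training use the reserved task name "model" and
# dataset name "all", e.g. "model/all/train/loss".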