def test_classifier_dataset(self):
    """Check DictDataset indexing, its repr, and the from_tensors defaults."""
    # Variable-length inputs: the i-th example holds the values 1..i+1.
    x1 = [torch.Tensor(list(range(1, length + 1))) for length in range(1, 6)]
    y1 = torch.Tensor([0] * 5)

    dataset = DictDataset(
        name="new_data",
        split="train",
        X_dict={"data1": x1},
        Y_dict={"task1": y1},
    )

    # Indexing yields an (inputs, labels) pair of dicts keyed like the
    # dicts handed to the constructor.
    first_example = dataset[0]
    self.assertTrue(torch.equal(first_example[0]["data1"], x1[0]))
    self.assertTrue(torch.equal(first_example[1]["task1"], y1[0]))

    explicit_repr = "DictDataset(name=new_data, X_keys=['data1'], Y_keys=['task1'])"
    self.assertEqual(repr(dataset), explicit_repr)

    # from_tensors must fall back to the module-level default names.
    default_repr = (
        f"DictDataset(name={DEFAULT_DATASET_NAME}, "
        f"X_keys=['{DEFAULT_INPUT_DATA_KEY}'], Y_keys=['{DEFAULT_TASK_NAME}'])"
    )
    dataset = DictDataset.from_tensors(x1, y1, "train")
    self.assertEqual(repr(dataset), default_repr)
def create_dataloader(task_name="task", split="train", **kwargs):
    """Build a DictDataLoader over a small synthetic single-task dataset.

    The dataset is named "dataset" and holds NUM_EXAMPLES examples: the
    i-th input is the pair [i, i] (stored under the "data" key) and every
    label is 1 (stored under ``task_name``).

    Args:
        task_name: Key under which the labels are stored in the Y_dict.
        split: Split name given to the dataset.
        **kwargs: Extra keyword arguments forwarded to DictDataLoader.
            ``batch_size`` is already set to BATCH_SIZE here, so it must
            not be passed again.

    Returns:
        The DictDataLoader wrapping the synthetic dataset.
    """
    features = torch.FloatTensor([[i] * 2 for i in range(NUM_EXAMPLES)])
    labels = torch.ones(NUM_EXAMPLES).long()
    dataset = DictDataset(
        name="dataset",
        split=split,
        X_dict={"data": features},
        Y_dict={task_name: labels},
    )
    return DictDataLoader(dataset, batch_size=BATCH_SIZE, **kwargs)
def test_score_shuffled(self):
    """Check that scoring gives the same accuracy with and without shuffling."""

    class SimpleVoter(nn.Module):
        def forward(self, x):
            """Vote for class 0 when x is even and for class 1 otherwise."""
            is_even = x % 2 == 0
            votes = torch.zeros(x.shape[0], 2)
            votes[is_even, 0] = 1  # class 0
            votes[~is_even, 1] = 1  # class 1
            return votes

    # Single-task model whose only operation feeds the raw "data" field
    # into the voter module.
    task_name = "VotingTask"
    module_name = "simple_voter"
    voter_op = Operation(
        module_name=module_name, inputs=[("_input_", "data")], name="op0"
    )
    task = Task(
        name=task_name,
        module_pool=nn.ModuleDict({module_name: SimpleVoter()}),
        op_sequence=[voter_op],
    )
    model = MultitaskModel([task])

    # Ten-example pattern repeated 100 times. The parity vote agrees with
    # the label on 6 of every 10 examples (x = 0, 2, 4, 5, 7, 9).
    y_list = [0] * 5 + [1] * 5
    x_list = list(range(len(y_list)))
    Y = torch.LongTensor(y_list * 100)
    X = torch.FloatTensor(x_list * 100)
    dataset = DictDataset(
        name="dataset", split="train", X_dict={"data": X}, Y_dict={task_name: Y}
    )

    # Score the unshuffled loader first, then the shuffled one.
    metric = "VotingTask/dataset/train/accuracy"
    for shuffle in (False, True):
        dataloader = DictDataLoader(dataset, batch_size=2, shuffle=shuffle)
        scores = model.score([dataloader])
        self.assertEqual(scores[metric], 0.6)
def test_remapped_labels(self):
    """Check that extra label keys are ignored unless remapped onto a task."""
    task_name = self.task1.name

    # The same label tensor is stored under the real task name and under
    # an extra key that no task in the model claims.
    features = torch.FloatTensor([[i, i] for i in range(NUM_EXAMPLES)])
    labels = torch.ones(NUM_EXAMPLES).long()
    dataset = DictDataset(
        name="dataset",
        split="train",
        X_dict={"data": features},
        Y_dict={task_name: labels, "other_task": labels},
    )
    dataloader = DictDataLoader(dataset, batch_size=BATCH_SIZE)

    model = MultitaskModel([self.task1])
    loss_dict, _ = model.calculate_loss(dataset.X_dict, dataset.Y_dict)
    self.assertIn("task1", loss_dict)

    # Without remapping, only the model's own task is reported.
    golds = model.predict(dataloader)["golds"]
    self.assertIn("task1", golds)
    self.assertNotIn("other_task", golds)

    scores = model.score([dataloader])
    self.assertIn("task1/dataset/train/accuracy", scores)
    self.assertNotIn("other_task/dataset/train/accuracy", scores)

    # With the extra label set remapped onto the task, both are reported.
    golds = model.predict(dataloader, remap_labels={"other_task": task_name})["golds"]
    self.assertIn("task1", golds)
    self.assertIn("other_task", golds)

    scores = model.score([dataloader], remap_labels={"other_task": task_name})
    self.assertIn("task1/dataset/train/accuracy", scores)
    self.assertIn("other_task/dataset/train/accuracy", scores)
def test_classifier_dataloader(self):
    """Check DictDataLoader batching, zero padding, and live dataset updates."""

    def assert_batch(actual, expected):
        # Compare a batched tensor with the nested list of expected values.
        self.assertTrue(torch.equal(actual, torch.Tensor(expected)))

    # x1 grows from length 1 to 5; x2 holds the same sequences in reverse.
    x1 = [torch.Tensor(list(range(1, length + 1))) for length in range(1, 6)]
    x2 = [torch.Tensor(list(range(1, length + 1))) for length in range(5, 0, -1)]
    y1 = torch.Tensor([0] * 5)
    y2 = torch.Tensor([1] * 5)

    dataset = DictDataset(
        name="new_data",
        split="train",
        X_dict={"data1": x1, "data2": x2},
        Y_dict={"task1": y1, "task2": y2},
    )

    # Expected "data2" batches: each row is right-padded with zeros up to
    # the longest sequence in the batch.
    data2_first_two = [[1, 2, 3, 4, 5], [1, 2, 3, 4, 0]]
    data2_first_three = [[1, 2, 3, 4, 5], [1, 2, 3, 4, 0], [1, 2, 3, 0, 0]]

    # Check if the dataloader is correctly constructed
    dataloader1 = DictDataLoader(dataset=dataset, batch_size=2)
    x_batch, y_batch = next(iter(dataloader1))
    self.assertEqual(dataloader1.dataset.split, "train")
    assert_batch(x_batch["data1"], [[1, 0], [1, 2]])
    assert_batch(x_batch["data2"], data2_first_two)
    assert_batch(y_batch["task1"], [0, 0])
    assert_batch(y_batch["task2"], [1, 1])

    # Check if the dataloader with a different batch size is correctly
    # constructed
    dataloader2 = DictDataLoader(dataset=dataset, batch_size=3)
    x_batch, y_batch = next(iter(dataloader2))
    self.assertEqual(dataloader2.dataset.split, "train")
    assert_batch(x_batch["data1"], [[1, 0, 0], [1, 2, 0], [1, 2, 3]])
    assert_batch(x_batch["data2"], data2_first_three)
    assert_batch(y_batch["task1"], [0, 0, 0])
    assert_batch(y_batch["task2"], [1, 1, 1])

    # Swap the labels of one task on the dataset itself; the existing
    # dataloaders must serve the new labels on their next iteration.
    y3 = [torch.Tensor([2]) for _ in range(5)]
    dataset.Y_dict["task2"] = y3

    x_batch, y_batch = next(iter(dataloader1))
    assert_batch(x_batch["data2"], data2_first_two)
    assert_batch(y_batch["task2"], [[2], [2]])

    x_batch, y_batch = next(iter(dataloader2))
    assert_batch(x_batch["data2"], data2_first_three)
    assert_batch(y_batch["task2"], [[2], [2], [2]])
# # In the `DictDataset`, each label corresponds to a particular `Task` by name. We'll define these `Task` objects in the following section as we define our model. # # `DictDataLoader` is a wrapper for `torch.utils.data.DataLoader`, which handles the collate function for `DictDataset` appropriately. # + from cerbero.core import DictDataset, DictDataLoader dataloaders = [] for task_name in ["class", "rgb"]: for split, X, Y in (("train", X_train, Y_train), ("valid", X_val, Y_val), ("test", X_test, Y_test)): X_dict = {f"{task_name}_data": torch.FloatTensor(X[task_name])} YTensor = torch.FloatTensor if task_name == "rgb" else torch.LongTensor Y_dict = {f"{task_name}_task": YTensor(Y[task_name])} dataset = DictDataset(f"{task_name}Dataset", split, X_dict, Y_dict) dataloader = DictDataLoader(dataset, batch_size=32) dataloaders.append(dataloader) # - # We now have 4 data loaders, one for each split (`train`, `val`) of each task (`class_task` and `rgb_task`) # ## Define Model # Now we'll define the `MultitaskClassifier` model, a PyTorch multi-task classifier. We'll instantiate it from a list of `Tasks` # + import torch.nn as nn from cerbero.core import Operation
import random
import unittest

import numpy as np
import torch

from cerbero.core import DictDataLoader, DictDataset
from cerbero.schedulers import SequentialScheduler, ShuffledScheduler

# Module-level fixtures: two five-example "train" datasets. The first holds
# inputs 0-4 with every label equal to 1, the second holds inputs 5-9 with
# every label equal to 2, so the two datasets share no input or label value.
dataset1 = DictDataset(
    "d1",
    "train",
    X_dict={"data": [0, 1, 2, 3, 4]},
    Y_dict={"labels": torch.LongTensor([1, 1, 1, 1, 1])},
)

dataset2 = DictDataset(
    "d2",
    "train",
    X_dict={"data": [5, 6, 7, 8, 9]},
    Y_dict={"labels": torch.LongTensor([2, 2, 2, 2, 2])},
)

# One loader per dataset, both with a batch size of 2.
dataloader1 = DictDataLoader(dataset1, batch_size=2)
dataloader2 = DictDataLoader(dataset2, batch_size=2)
dataloaders = [dataloader1, dataloader2]


class SequentialTest(unittest.TestCase):
    def test_sequential(self):
        scheduler = SequentialScheduler()
        data = []
# # `DictDataloader` is a wrapper for `torch.utils.data.Dataloader`, which handles the collate function for `DictDataset` appropriately. # + from cerbero.core import DictDataset, DictDataLoader dataloaders = [] for task_name in ["circle", "square"]: for split, X, Y in ( ("train", X_train, Y_train), ("valid", X_valid, Y_valid), ("test", X_test, Y_test), ): X_dict = {f"{task_name}_data": torch.FloatTensor(X[task_name])} Y_dict = {f"{task_name}_task": torch.LongTensor(Y[task_name])} dataset = DictDataset(f"{task_name}Dataset", split, X_dict, Y_dict) dataloader = DictDataLoader(dataset, batch_size=32) dataloaders.append(dataloader) # - # We now have 6 data loaders, one for each split (`train`, `valid`, `test`) of each task (`circle_task` and `square_task`). # ## Define Model # Now we'll define the `MultitaskClassifier` model, a PyTorch multi-task classifier. # We'll instantiate it from a list of `Tasks`. # ### Tasks # A `Task` represents a path through a neural network. In `MultitaskClassifier`, this path corresponds to a particular sequence of PyTorch modules through which each data point will make a forward pass. #