예제 #1
0
    def test_convergence(self):
        """Check that a model with one slice task trains to convergence.

        The single slice comes from slicing function ``h`` and is meant to
        cover roughly 25% of the data.
        """
        # One loader per split: index 0 is train, index 1 is valid.
        split_frames = ((self.df_train, "train"), (self.df_valid, "valid"))
        dataloaders = [create_dataloader(df, split) for df, split in split_frames]

        base_task = create_task("task", module_suffixes=["A", "B"])

        # Run the slicing functions over both splits.
        sfs = [h]  # high coverage slice
        slice_names = [sf.name for sf in sfs]
        sf_applier = PandasSFApplier(sfs)
        S_train = sf_applier.apply(self.df_train, progress_bar=False)
        S_valid = sf_applier.apply(self.df_valid, progress_bar=False)

        # The applier output is one-dimensional with a named field per slice.
        self.assertEqual(S_train.shape, (self.N_TRAIN,))
        self.assertEqual(S_valid.shape, (self.N_VALID,))
        self.assertIn("h", S_train.dtype.names)

        # Attach slice labels to the train loader first, then the valid one.
        for loader, slice_matrix in zip(dataloaders, (S_train, S_valid)):
            add_slice_labels(loader, base_task, slice_matrix)

        # Expand the base task into slice tasks and build the model.
        slice_tasks = convert_to_slice_tasks(base_task, slice_names)
        model = MultitaskClassifier(tasks=slice_tasks)

        # Train, then score on every loader.
        Trainer(lr=0.001, n_epochs=50, progress_bar=False).fit(model, dataloaders)
        scores = model.score(dataloaders)

        # Every validation metric should be near perfect.
        for metric_key in (
            "task/TestData/valid/accuracy",
            "task_slice:h_pred/TestData/valid/accuracy",
            "task_slice:h_ind/TestData/valid/f1",
        ):
            self.assertGreater(scores[metric_key], 0.94)

        # Loss of the base task must be small on train, then on valid.
        for loader in dataloaders:
            split_dataset = loader.dataset
            loss_output = model.calculate_loss(
                split_dataset.X_dict, split_dataset.Y_dict
            )
            self.assertLess(loss_output[0]["task"].item(), 0.1)
예제 #2
0
 def test_partially_empty_batch(self):
     """Check the per-task count when one label in the batch is set to -1.

     The first "task1" label is overwritten with -1; the count reported for
     "task1" by ``calculate_loss`` is then expected to be 9.
     """
     data = create_dataloader("task1", shuffle=False).dataset
     # NOTE(review): -1 presumably marks the example as unlabeled - confirm.
     data.Y_dict["task1"][0] = -1

     classifier = MultitaskClassifier([self.task1])
     _, counts = classifier.calculate_loss(data.X_dict, data.Y_dict)

     self.assertEqual(counts["task1"], 9)
예제 #3
0
 def test_empty_batch(self):
     """Check that a batch whose "task1" labels are all -1 gives falsy results.

     Both the loss dict and the count dict returned by ``calculate_loss``
     must be falsy once every label has been replaced with -1.
     """
     data = create_dataloader("task1", shuffle=False).dataset
     original_labels = data.Y_dict["task1"]
     # Same shape/dtype as the original labels, filled entirely with -1.
     data.Y_dict["task1"] = torch.full_like(original_labels, -1)

     classifier = MultitaskClassifier([self.task1])
     losses, counts = classifier.calculate_loss(data.X_dict, data.Y_dict)

     self.assertFalse(losses)
     self.assertFalse(counts)
    def test_convergence(self):
        """Check that a multitask classifier with two tasks converges."""
        task_names = ["task1", "task2"]
        offsets = [0.0, 0.25]

        # Loader order: train/task1, train/task2, valid/task1, valid/task2.
        dataloaders = []
        for n_examples, split in ((N_TRAIN, "train"), (N_VALID, "valid")):
            for offset, name in zip(offsets, task_names):
                frame = create_data(n_examples, offset)
                dataloaders.append(create_dataloader(frame, split, name))

        # The two tasks use different module suffix pairs.
        first_task = create_task("task1", module_suffixes=["A", "A"])
        second_task = create_task("task2", module_suffixes=["A", "B"])
        model = MultitaskClassifier(tasks=[first_task, second_task])

        # Train, then score on every loader.
        Trainer(lr=0.001, n_epochs=10, progress_bar=False).fit(model, dataloaders)
        scores = model.score(dataloaders)

        # Each task needs near-perfect validation accuracy and small losses.
        n_tasks = len(task_names)
        for idx, name in enumerate(task_names):
            self.assertGreater(scores[f"{name}/TestData/valid/accuracy"], 0.95)

            # Train loader sits at idx, the matching valid loader n_tasks later.
            for position in (idx, n_tasks + idx):
                split_dataset = dataloaders[position].dataset
                loss_output = model.calculate_loss(
                    split_dataset.X_dict, split_dataset.Y_dict
                )
                self.assertLess(loss_output[0][name].item(), 0.05)
예제 #5
0
    def test_remapped_labels(self):
        """Check how extra label keys in Y_dict are handled.

        Without ``remap_labels`` the extra "other_task" labels should not
        appear in predictions or scores; with "other_task" remapped to the
        model's task they should appear alongside "task1".
        """
        task_name = self.task1.name
        features = torch.FloatTensor([[k, k] for k in range(NUM_EXAMPLES)])
        labels = torch.ones(NUM_EXAMPLES).long()

        # The same label tensor is stored under the real task and an extra key.
        dataset = DictDataset(
            name="dataset",
            split="train",
            X_dict={"data": features},
            Y_dict={task_name: labels, "other_task": labels},
        )
        dataloader = DictDataLoader(dataset, batch_size=BATCH_SIZE)

        model = MultitaskClassifier([self.task1])
        losses, _ = model.calculate_loss(dataset.X_dict, dataset.Y_dict)
        self.assertIn("task1", losses)

        # No remapping: only the model's own task shows up.
        golds = model.predict(dataloader)["golds"]
        self.assertIn("task1", golds)
        self.assertNotIn("other_task", golds)
        plain_scores = model.score([dataloader])
        self.assertIn("task1/dataset/train/accuracy", plain_scores)
        self.assertNotIn("other_task/dataset/train/accuracy", plain_scores)

        # With remapping: the extra label set is evaluated as well.
        remapped_golds = model.predict(
            dataloader, remap_labels={"other_task": task_name}
        )["golds"]
        self.assertIn("task1", remapped_golds)
        self.assertIn("other_task", remapped_golds)
        remapped_scores = model.score(
            [dataloader], remap_labels={"other_task": task_name}
        )
        self.assertIn("task1/dataset/train/accuracy", remapped_scores)
        self.assertIn("other_task/dataset/train/accuracy", remapped_scores)