예제 #1
0
    def test_classifier_dataset(self):
        """Check DictDataset construction, indexing, repr, and from_tensors defaults."""

        # Variable-length inputs: [1], [1,2], ..., [1,2,3,4,5].
        inputs = [torch.Tensor(list(range(1, n + 1))) for n in range(1, 6)]
        labels = torch.Tensor([0] * 5)

        dataset = DictDataset(
            X_dict={"data1": inputs},
            Y_dict={"task1": labels},
            name="new_data",
            split="train",
        )

        # Indexing yields an (X_dict, Y_dict) pair for a single example.
        first_x, first_y = dataset[0]
        self.assertTrue(torch.equal(first_x["data1"], inputs[0]))
        self.assertTrue(torch.equal(first_y["task1"], labels[0]))
        self.assertEqual(
            repr(dataset),
            "DictDataset(name=new_data, X_keys=['data1'], Y_keys=['task1'])",
        )

        # from_tensors should fall back to the library's default name/keys.
        dataset = DictDataset.from_tensors(inputs, labels, "train")
        self.assertEqual(
            repr(dataset),
            f"DictDataset(name={DEFAULT_DATASET_NAME}, "
            f"X_keys=['{DEFAULT_INPUT_DATA_KEY}'], Y_keys=['{DEFAULT_TASK_NAME}'])",
        )
예제 #2
0
def create_dataloader(task_name="task", split="train"):
    """Build a DictDataLoader over NUM_EXAMPLES dummy (i, i) points, all labeled 1."""
    features = torch.FloatTensor([[idx, idx] for idx in range(NUM_EXAMPLES)])
    labels = torch.ones(NUM_EXAMPLES).long()

    dataset = DictDataset(
        name="dataset",
        split=split,
        X_dict={"data": features},
        Y_dict={task_name: labels},
    )

    return DictDataLoader(dataset, batch_size=BATCH_SIZE)
예제 #3
0
    def test_score_shuffled(self):
        """Scoring must give the same accuracy with and without dataloader shuffling."""

        set_seed(123)

        class SimpleVoter(nn.Module):
            def forward(self, x):
                """Emit a one-hot vote: class 0 for even x, class 1 for odd x."""
                is_even = x % 2 == 0
                logits = torch.zeros(x.shape[0], 2)
                logits[is_even, 0] = 1  # class 0
                logits[~is_even, 1] = 1  # class 1
                return logits

        # Assemble a single-task model around the deterministic voter.
        task_name = "VotingTask"
        module_name = "simple_voter"
        operation = Operation(
            module_name=module_name, inputs=[("_input_", "data")], name="op0"
        )
        task = Task(
            name=task_name,
            module_pool=nn.ModuleDict({module_name: SimpleVoter()}),
            op_sequence=[operation],
        )
        model = MultitaskClassifier([task])

        # x = 0..9 repeated 100x; labels are 0 for x<5 and 1 for x>=5. The voter
        # predicts parity, which matches the label for 6 of every 10 examples,
        # so accuracy is exactly 0.6 regardless of example order.
        y_list = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
        x_list = list(range(len(y_list)))
        dataset = DictDataset(
            name="dataset",
            split="train",
            X_dict={"data": torch.FloatTensor(x_list * 100)},
            Y_dict={task_name: torch.LongTensor(y_list * 100)},
        )

        unshuffled = DictDataLoader(dataset, batch_size=2, shuffle=False)
        scores = model.score([unshuffled])
        self.assertEqual(scores["VotingTask/dataset/train/accuracy"], 0.6)

        shuffled = DictDataLoader(dataset, batch_size=2, shuffle=True)
        scores_shuffled = model.score([shuffled])
        self.assertEqual(scores_shuffled["VotingTask/dataset/train/accuracy"], 0.6)
예제 #4
0
    def test_add_slice_labels(self):
        """add_slice_labels should append ind/pred labelsets for each slice."""
        # Slicing function f() marks exactly the first two entries as active.
        x = torch.Tensor([0.1, 0.2, 0.3, 0.4, 0.5])
        y = torch.Tensor([0, 1, 1, 0, 1]).long()
        dataset = DictDataset(
            name="TestData", split="train", X_dict={"data": x}, Y_dict={"TestTask": y}
        )

        # Only the original task labelset exists up front.
        self.assertEqual(len(dataset.Y_dict), 1)

        # Compute the slice matrix with the pandas applier.
        df = pd.DataFrame({"val": x, "y": y})
        applier = PandasSFApplier([f])
        S = applier.apply(df, progress_bar=False)

        dataloader = DictDataLoader(dataset)
        dummy_task = create_dummy_task(task_name="TestTask")
        add_slice_labels(dataloader, dummy_task, S)

        # Base slice + f slice each contribute an "ind" and a "pred" labelset.
        labelsets = dataloader.dataset.Y_dict
        for key in (
            "TestTask",
            "TestTask_slice:base_ind",
            "TestTask_slice:base_pred",
            "TestTask_slice:f_ind",
            "TestTask_slice:f_pred",
        ):
            self.assertIn(key, labelsets)
        self.assertEqual(len(labelsets), 5)

        # "ind" labelsets carry the slice-membership mask.
        self.assertEqual(
            labelsets["TestTask_slice:f_ind"].numpy().tolist(), [1, 1, 0, 0, 0]
        )
        self.assertEqual(
            labelsets["TestTask_slice:base_ind"].numpy().tolist(), [1, 1, 1, 1, 1]
        )

        # "pred" labelsets mask out inactive examples with -1.
        self.assertEqual(
            labelsets["TestTask_slice:f_pred"].numpy().tolist(), [0, 1, -1, -1, -1]
        )
        self.assertEqual(
            labelsets["TestTask_slice:base_pred"].numpy().tolist(), [0, 1, 1, 0, 1]
        )
        self.assertEqual(labelsets["TestTask"].numpy().tolist(), [0, 1, 1, 0, 1])
예제 #5
0
def create_dataloader(df: pd.DataFrame, split: str) -> DictDataLoader:
    """Wrap a DataFrame with x1/x2/y columns in a DictDataLoader.

    Shuffling is enabled only for the "train" split.
    """
    coordinates = torch.stack(
        (torch.tensor(df["x1"]), torch.tensor(df["x2"])), dim=1
    )
    dataset = DictDataset(
        name="TestData",
        split=split,
        X_dict={"coordinates": coordinates},
        Y_dict={"task": torch.tensor(df["y"], dtype=torch.long)},
    )
    return DictDataLoader(
        dataset=dataset, batch_size=4, shuffle=(dataset.split == "train")
    )
예제 #6
0
    def test_remapped_labels(self):
        """Extra Y_dict labelsets are ignored unless remap_labels maps them."""
        task_name = self.task1.name
        X = torch.FloatTensor([[i, i] for i in range(NUM_EXAMPLES)])
        Y = torch.ones(NUM_EXAMPLES).long()

        dataset = DictDataset(
            name="dataset",
            split="train",
            X_dict={"data": X},
            Y_dict={task_name: Y, "other_task": Y},
        )
        dataloader = DictDataLoader(dataset, batch_size=BATCH_SIZE)

        model = MultitaskClassifier([self.task1])
        loss_dict, count_dict = model.calculate_loss(dataset.X_dict, dataset.Y_dict)
        self.assertIn("task1", loss_dict)

        # Without remapping, the unknown "other_task" labelset is dropped.
        results = model.predict(dataloader)
        self.assertIn("task1", results["golds"])
        self.assertNotIn("other_task", results["golds"])
        scores = model.score([dataloader])
        self.assertIn("task1/dataset/train/accuracy", scores)
        self.assertNotIn("other_task/dataset/train/accuracy", scores)

        # With remapping, the extra labelset is evaluated under the real task head.
        remap = {"other_task": task_name}
        results = model.predict(dataloader, remap_labels=remap)
        self.assertIn("task1", results["golds"])
        self.assertIn("other_task", results["golds"])
        results = model.score([dataloader], remap_labels=remap)
        self.assertIn("task1/dataset/train/accuracy", results)
        self.assertIn("other_task/dataset/train/accuracy", results)
예제 #7
0
    def test_make_slice_dataloader(self):
        """make_slice_dataloader adds ind/pred labelsets and rejects bad label keys."""
        dataloader = self.slice_model.make_slice_dataloader(
            dataset=self.datasets[0], S=self.S
        )
        Y_dict = dataloader.dataset.Y_dict
        self.assertEqual(len(Y_dict), 7)
        # Base task plus an ind/pred pair for each of base, f, and g slices.
        for key in (
            "test_task",
            "test_task_slice:base_pred",
            "test_task_slice:base_ind",
            "test_task_slice:f_pred",
            "test_task_slice:f_ind",
            "test_task_slice:g_pred",
            "test_task_slice:g_ind",
        ):
            self.assertIn(key, Y_dict)

        # A dataset whose Y_dict lacks the task's labelset must be rejected.
        bad_data_dataset = DictDataset(
            name="test_data",
            split="train",
            X_dict={self.data_name: self.X},
            Y_dict={"bad_labels": self.Y},
        )
        with self.assertRaisesRegex(ValueError, "labels missing"):
            self.slice_model.make_slice_dataloader(dataset=bad_data_dataset, S=self.S)
예제 #8
0
        # Get the number of classes included in dataset to use in task-specific head later
        num_classes = len(output_label_to_int_dict.keys())

        # Define dictionary keys for the data, dataset and task of the given task
        task_data_name = f"{task_name}_data"
        task_formal_name = f"{task_name}_task"
        task_dataset_name = f"{task_name}Dataset"

        # Build one DictDataLoader per split; all three reuse the same dict keys
        # so downstream code can locate this task's inputs/labels by name.
        for split, X, Y in (
            ("train", train_X, train_y),
            ("valid", dev_X, dev_y),
            ("test", test_X, test_y),
        ):
            X_dict = {task_data_name: torch.tensor(X, dtype=torch.long)}
            Y_dict = {task_formal_name: torch.tensor(Y, dtype=torch.long)}
            dataset = DictDataset(task_dataset_name, split, X_dict, Y_dict)
            dataloader = DictDataLoader(dataset, batch_size=BATCH_SIZE)
            dataloaders.append(dataloader)

        # Define a one-layer prediction "head" module specific to each task
        # (constructor looked up by task type; called with hidden size and class count)
        head_module = task_type_function_mapping[task_type]["head_module"](
            hidden_layer_size, num_classes)

        task_head_name = f"{task_name}_head_module"

        # The module pool contains all the modules this task uses
        # (the shared BERT encoder plus the task-specific head built above)
        module_pool = nn.ModuleDict({
            "bert_module": bert_module,
            task_head_name: head_module
        })
예제 #9
0
    def test_classifier_dataloader(self):
        """Unit test of DictDataLoader: collation, padding, batch sizes, updates."""

        def assert_tensor_equal(actual, expected):
            # Compare a batched tensor against a plain nested list.
            self.assertTrue(torch.equal(actual, torch.Tensor(expected)))

        # data1 lengths grow 1..5; data2 lengths shrink 5..1.
        x1 = [torch.Tensor(list(range(1, n + 1))) for n in range(1, 6)]
        y1 = torch.Tensor([0] * 5)

        x2 = [torch.Tensor(list(range(1, n + 1))) for n in range(5, 0, -1)]
        y2 = torch.Tensor([1] * 5)

        dataset = DictDataset(
            name="new_data",
            split="train",
            X_dict={"data1": x1, "data2": x2},
            Y_dict={"task1": y1, "task2": y2},
        )

        dataloader1 = DictDataLoader(dataset=dataset, batch_size=2)
        x_batch, y_batch = next(iter(dataloader1))

        # Batches are zero-padded to the longest sequence within the batch.
        self.assertEqual(dataloader1.dataset.split, "train")
        assert_tensor_equal(x_batch["data1"], [[1, 0], [1, 2]])
        assert_tensor_equal(x_batch["data2"], [[1, 2, 3, 4, 5], [1, 2, 3, 4, 0]])
        assert_tensor_equal(y_batch["task1"], [0, 0])
        assert_tensor_equal(y_batch["task2"], [1, 1])

        dataloader2 = DictDataLoader(dataset=dataset, batch_size=3)
        x_batch, y_batch = next(iter(dataloader2))

        # A different batch size pads to that batch's own longest sequence.
        self.assertEqual(dataloader2.dataset.split, "train")
        assert_tensor_equal(x_batch["data1"], [[1, 0, 0], [1, 2, 0], [1, 2, 3]])
        assert_tensor_equal(
            x_batch["data2"], [[1, 2, 3, 4, 5], [1, 2, 3, 4, 0], [1, 2, 3, 0, 0]]
        )
        assert_tensor_equal(y_batch["task1"], [0, 0, 0])
        assert_tensor_equal(y_batch["task2"], [1, 1, 1])

        # Mutating the dataset in place is reflected by existing dataloaders.
        dataset.Y_dict["task2"] = [torch.Tensor([2]) for _ in range(5)]

        x_batch, y_batch = next(iter(dataloader1))
        assert_tensor_equal(x_batch["data2"], [[1, 2, 3, 4, 5], [1, 2, 3, 4, 0]])
        assert_tensor_equal(y_batch["task2"], [[2], [2]])

        x_batch, y_batch = next(iter(dataloader2))
        assert_tensor_equal(
            x_batch["data2"], [[1, 2, 3, 4, 5], [1, 2, 3, 4, 0], [1, 2, 3, 0, 0]]
        )
        assert_tensor_equal(y_batch["task2"], [[2], [2], [2]])
예제 #10
0
# `DictDataLoader` is a wrapper for `torch.utils.data.DataLoader`, which handles the collate function for `DictDataset` appropriately.

# %%
import torch
from snorkel.classification import DictDataset, DictDataLoader

dataloaders = []
for task_name in ["circle", "square"]:
    # One loader per (task, split) pair: 2 tasks x 3 splits = 6 loaders.
    split_data = [
        ("train", X_train, Y_train),
        ("valid", X_valid, Y_valid),
        ("test", X_test, Y_test),
    ]
    for split, X, Y in split_data:
        dataset = DictDataset(
            f"{task_name}Dataset",
            split,
            {f"{task_name}_data": torch.FloatTensor(X[task_name])},
            {f"{task_name}_task": torch.LongTensor(Y[task_name])},
        )
        dataloaders.append(DictDataLoader(dataset, batch_size=32))

# %% [markdown]
# We now have 6 data loaders, one for each split (`train`, `valid`, `test`) of each task (`circle_task` and `square_task`).

# %% [markdown]
# ## Define Model

# %% [markdown]
# Now we'll define the `MultitaskClassifier` model, a PyTorch multi-task classifier.
# We'll instantiate it from a list of `Tasks`.

# %% [markdown]
# ### Tasks
예제 #11
0
import random
import unittest

import numpy as np
import torch

from snorkel.classification import DictDataLoader, DictDataset
from snorkel.classification.training.schedulers import (
    SequentialScheduler,
    ShuffledScheduler,
)

# Two tiny single-task datasets used by the scheduler tests below; X payloads
# here are plain Python lists, and the label values (all 1s vs. all 2s) mark
# which dataset a batch came from.
dataset1 = DictDataset(
    "d1",
    "train",
    X_dict={"data": [0, 1, 2, 3, 4]},
    Y_dict={"labels": torch.LongTensor([1, 1, 1, 1, 1])},
)
dataset2 = DictDataset(
    "d2",
    "train",
    X_dict={"data": [5, 6, 7, 8, 9]},
    Y_dict={"labels": torch.LongTensor([2, 2, 2, 2, 2])},
)

# One dataloader per dataset, batched in pairs; the schedulers under test
# interleave batches drawn from this list.
dataloader1 = DictDataLoader(dataset1, batch_size=2)
dataloader2 = DictDataLoader(dataset2, batch_size=2)
dataloaders = [dataloader1, dataloader2]


class SequentialTest(unittest.TestCase):
예제 #12
0
def create_dataset(X, Y, split, dataset_name, input_name, task_name):
    """Build a single-input, single-task DictDataset from the given tensors."""
    return DictDataset(
        name=dataset_name,
        split=split,
        X_dict={input_name: X},
        Y_dict={task_name: Y},
    )