Example #1
def test_shuffled(self):
    set_seed(123)
    scheduler = ShuffledScheduler()
    data = []
    # `dataloaders` is assumed to be defined in the surrounding test fixture
    for batch, dl in scheduler.get_batches(dataloaders):
        X_dict, Y_dict = batch
        data.extend(X_dict["data"])
    # A shuffled schedule should not yield the data in sorted order
    self.assertNotEqual(data, sorted(data))
Example #2
    @classmethod
    def setUpClass(cls):
        # Ensure deterministic runs
        set_seed(123)

        # Create raw data
        cls.N_TRAIN = 1500

        cls.cardinality = 2
        cls.df_train = create_data(cls.N_TRAIN)
Example #3
    @classmethod
    def setUpClass(cls):
        # Ensure deterministic runs
        set_seed(123)

        # Create raw data
        cls.N_TRAIN = 1500
        cls.N_VALID = 300

        cls.df_train = create_data(cls.N_TRAIN)
        cls.df_valid = create_data(cls.N_VALID)
Example #4
    def test_score_shuffled(self):
        # Test scoring with a shuffled dataset

        set_seed(123)

        class SimpleVoter(nn.Module):
            def forward(self, x):
                """Vote for class 0 if x is even, class 1 otherwise."""
                mask = x % 2 == 0
                out = torch.zeros(x.shape[0], 2)
                out[mask, 0] = 1  # class 0
                out[~mask, 1] = 1  # class 1
                return out

        # Create model
        task_name = "VotingTask"
        module_name = "simple_voter"
        module_pool = nn.ModuleDict({module_name: SimpleVoter()})
        op0 = Operation(module_name=module_name,
                        inputs=[("_input_", "data")],
                        name="op0")
        op_sequence = [op0]
        task = Task(name=task_name,
                    module_pool=module_pool,
                    op_sequence=op_sequence)
        model = MultitaskClassifier([task])

        # Create dataset
        y_list = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
        x_list = [i for i in range(len(y_list))]
        Y = torch.LongTensor(y_list * 100)
        X = torch.FloatTensor(x_list * 100)
        dataset = DictDataset(name="dataset",
                              split="train",
                              X_dict={"data": X},
                              Y_dict={task_name: Y})

        # Create dataloaders
        dataloader = DictDataLoader(dataset, batch_size=2, shuffle=False)
        scores = model.score([dataloader])

        # Even x votes class 0: of x = 0..9, predictions match labels for
        # x in {0, 2, 4} and {5, 7, 9}, i.e. 6/10 = 0.6 accuracy
        self.assertEqual(scores["VotingTask/dataset/train/accuracy"], 0.6)

        dataloader_shuffled = DictDataLoader(dataset,
                                             batch_size=2,
                                             shuffle=True)
        # Shuffling reorders the examples but keeps each (x, y) pair intact,
        # so accuracy is unchanged
        scores_shuffled = model.score([dataloader_shuffled])
        self.assertEqual(scores_shuffled["VotingTask/dataset/train/accuracy"],
                         0.6)
Example #5
    def fit(
        self,
        L_train: np.ndarray,
        Y_dev: Optional[np.ndarray] = None,
        class_balance: Optional[List[float]] = None,
        **kwargs: Any,
    ) -> None:
        """Train label model.

        Train label model to estimate mu, the parameters used to combine LFs.

        Parameters
        ----------
        L_train
            An [n,m] matrix with values in {-1,0,1,...,k-1}, where -1 indicates an abstain
        Y_dev
            Gold labels for dev set for estimating class_balance, by default None
        class_balance
            Each class's percentage of the population, by default None
        **kwargs
            Arguments for changing train config defaults

        Raises
        ------
        Exception
            If loss is NaN

        Examples
        --------
        >>> L = np.array([[0, 0, -1], [-1, 0, 1], [1, -1, 0]])
        >>> Y_dev = [0, 1, 0]
        >>> label_model = LabelModel(verbose=False)
        >>> label_model.fit(L)
        >>> label_model.fit(L, Y_dev=Y_dev)
        >>> label_model.fit(L, class_balance=[0.7, 0.3])
        """
        # Merge kwargs into the default training config
        self.train_config: TrainConfig = merge_config(  # type:ignore
            TrainConfig(),
            kwargs  # type:ignore
        )
        # Set random seed from the merged config
        set_seed(self.train_config.seed)

        L_shift = L_train + 1  # convert to {0, 1, ..., k}
        if L_shift.max() > self.cardinality:
            raise ValueError(
                f"L_train has cardinality {L_shift.max()}, cardinality={self.cardinality} passed in."
            )

        self._set_class_balance(class_balance, Y_dev)
        self._set_constants(L_shift)
        self._create_tree()
        lf_analysis = LFAnalysis(L_train)
        self.coverage = lf_analysis.lf_coverages()

        # Compute O and initialize params
        if self.config.verbose:  # pragma: no cover
            logging.info("Computing O...")
        self._generate_O(L_shift)
        self._init_params()

        # Estimate \mu
        if self.config.verbose:  # pragma: no cover
            logging.info(r"Estimating \mu...")

        # Set model to train mode
        self.train()

        # Move model to GPU
        if self.config.verbose and self.config.device != "cpu":  # pragma: no cover
            logging.info("Using GPU...")
        self.to(self.config.device)

        # Set training components
        self._set_logger()
        self._set_optimizer()
        self._set_lr_scheduler()

        # Restore model if necessary
        start_iteration = 0

        # Train the model
        metrics_hist = {}  # The most recently seen value for all metrics
        for epoch in range(start_iteration, self.train_config.n_epochs):
            self.running_loss = 0.0
            self.running_examples = 0

            # Zero the parameter gradients
            self.optimizer.zero_grad()

            # Forward pass to calculate the average loss per example
            loss = self._loss_mu(l2=self.train_config.l2)
            if torch.isnan(loss):
                msg = "Loss is NaN. Consider reducing learning rate."
                raise Exception(msg)

            # Backward pass to calculate gradients
            # Loss is an average loss per example
            loss.backward()

            # Perform optimizer step
            self.optimizer.step()

            # Calculate metrics, log, and checkpoint as necessary
            metrics_dict = self._execute_logging(loss)
            metrics_hist.update(metrics_dict)

            # Update learning rate
            self._update_lr_scheduler(epoch)

        self.eval()

        # Log the end of training
        if self.config.verbose:  # pragma: no cover
            logging.info("Finished Training")
Example #6
# The purpose of this tutorial is to introduce the basic interfaces and flow of multi-task learning tools within Snorkel.
# We assume that you have prior experience with MTL, so we don't motivate or explain multi-task learning at large here.
#
# In this notebook, we'll start with a simple MTL model with only two tasks, each having distinct data and a single set of ground-truth ("gold") labels. For simplicity, we'll use a dataset whose raw data can be used directly as features (unlike text data, which would first need to be tokenized and converted into token ids).
# At the end, you'll fill in the missing details to add a third task to the model.

# %% [markdown] {"tags": ["md-exclude"]}
# ## Environment Setup

# %% {"tags": ["md-exclude"]}
# %matplotlib inline

from snorkel.utils import set_seed

SEED = 123
set_seed(SEED)

# %% [markdown]
# ## Create Toy Data

# %% [markdown]
# We'll now create a toy dataset to work with; a sketch of one possible way to generate it follows the task list below.
# Our data points are 2D points in a square centered on the origin.
# Our tasks will be classifying whether these points are:
#
# 1. Inside a **unit circle** centered on the origin (label 0 = `False`, label 1 = `True`)
# 2. Inside a **unit square** centered on the origin (label 0 = `False`, label 1 = `True`)
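
# %% [markdown]
# A rough sketch (not the tutorial's actual code) of generating such data with
# NumPy; `N` and the sampling square below are illustrative assumptions.

# %%
import numpy as np

N = 1000
X = np.random.uniform(-1, 1, (N, 2))  # 2D points in a square centered on the origin

# Task 1: is the point inside the unit circle (radius 1)?
Y_circle = (np.linalg.norm(X, axis=1) < 1).astype(int)

# Task 2: is the point inside the unit square (side 1, i.e. |x| < 0.5 and |y| < 0.5)?
Y_square = (np.abs(X).max(axis=1) < 0.5).astype(int)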

# %% [markdown] {"tags": ["md-exclude"]}
# We'll visualize these decision boundaries in a few cells.
Example #7
# In this tutorial, we:
# 1. **Introduce _Slicing Functions (SFs)_** as a programming interface
# 2. **Monitor** application-critical data subsets
# 3. **Improve model performance** on slices

# %% [markdown] {"tags": ["md-exclude"]}
# First, we'll set up our notebook for reproducibility and proper logging.

# %% {"tags": ["md-exclude"]}
import logging
import os
from snorkel.utils import set_seed

# For reproducibility
os.environ["PYTHONHASHSEED"] = "0"
set_seed(111)

# Make sure we're running from the spam/ directory
if os.path.basename(os.getcwd()) == "snorkel-tutorials":
    os.chdir("spam")

# To visualize logs
logger = logging.getLogger()
logger.setLevel(logging.WARNING)

# %% [markdown] {"tags": ["md-exclude"]}
# If you want to display all comment text untruncated, change `DISPLAY_ALL_TEXT` to `True` below.

# %% {"tags": ["md-exclude"]}
import pandas as pd
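
# One possible continuation of this cell (an assumption; the rest is not
# shown in the excerpt): toggling pandas' column-width truncation based on
# the `DISPLAY_ALL_TEXT` flag mentioned above.
DISPLAY_ALL_TEXT = False
pd.set_option("display.max_colwidth", None if DISPLAY_ALL_TEXT else 50)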
Example #8
def setUp(self):
    set_seed(123)
Example #9
@classmethod
def setUpClass(cls):
    # Ensure deterministic runs
    set_seed(123)
Example #10
@classmethod
def setUpClass(cls):
    set_seed(123)