Exemplo n.º 1
0
def test_column_specific_pad_indexes():
    """Each column must be padded with its own column-specific pad index.

    Observation lengths per column are (1, 4, 10), (1, 5, 10) and
    (1, 6, 10); every column is padded out to 10 elements, so column i
    (pad index -2, -1 and 0 respectively) should contain exactly
    (10 - 1) + (10 - len_2) + (10 - 10) occurrences of its pad index.
    """
    batch_size = 3
    # Row r holds one tensor per column; column c is filled with value c+1.
    lengths_per_row = ((1, 1, 1), (4, 5, 6), (10, 10, 10))
    data = tuple(
        tuple(torch.tensor(length * [col + 1])
              for col, length in enumerate(row))
        for row in lengths_per_row)
    sampler = BaseSampler(batch_size=batch_size,
                          shuffle=False,
                          pad_index=(-2, -1, 0),
                          batch_first=False)
    batches = sampler.sample(data)
    first, second, third = next(batches)

    assert (first == -2).sum() == (10 - 1) + (10 - 4) + (10 - 10)
    assert (second == -1).sum() == (10 - 1) + (10 - 5) + (10 - 10)
    assert (third == 0).sum() == (10 - 1) + (10 - 6) + (10 - 10)
Exemplo n.º 2
0
def trainer():
    """Build a Trainer wired entirely with dummy/test components."""
    # Same keyword evaluation order as a direct call, just collected first.
    config = dict(dataset=DummyDataset(),
                  train_sampler=BaseSampler(),
                  val_sampler=BaseSampler(),
                  model=DummyModel(),
                  loss_fn=NLLLoss(),
                  metric_fn=NLLLoss(),
                  optimizer=Adam,
                  extra_validation_metrics=[NLLLoss()] * 3)
    return train.Trainer(**config)
Exemplo n.º 3
0
    def __init__(self,
                 dataset: Dataset,
                 model: Module,
                 metric_fn: Metric,
                 eval_sampler: Optional[Sampler] = None,
                 eval_data: str = 'test',
                 device: Optional[str] = None,
                 save_preds: bool = False,
                 teacher: Optional[Module] = None,
                 save_targets: bool = False,
                 gen_style: str = 'greedy') -> None:
        """Initialize the evaluator.

        Parameters
        ----------
        dataset : Dataset
            The dataset to run evaluation on
        model : Module
            The model to evaluate
        metric_fn : Metric
            The metric to use for evaluation
        eval_sampler : Optional[Sampler]
            The sampler to use over evaluation examples. By default
            a `BaseSampler` with batch size 16 and no shuffling.
        eval_data : str
            Attribute of ``dataset`` holding the split to evaluate on
        device : Optional[str]
            The device to use in the computation (resolved via
            ``select_device``)
        save_preds : bool
            Flag stored for use by evaluation code outside this view —
            presumably controls keeping decoded predictions
        teacher : Optional[Module]
            Optional teacher module, stored for later use
        save_targets : bool
            Flag stored for use by evaluation code outside this view —
            presumably controls keeping targets
        gen_style : str
            Generation style identifier (default ``'greedy'``)

        """
        # Fall back to a default sampler when none (or a falsy one) is given
        self.eval_sampler = eval_sampler or BaseSampler(batch_size=16,
                                                        shuffle=False)
        self.model = model
        self.metric_fn = metric_fn
        self.eval_metric = None
        self.dataset = dataset

        self.device = select_device(device)

        # Build the iterator over the requested split up front
        data = getattr(dataset, eval_data)
        self._eval_iterator = self.eval_sampler.sample(data)

        # By default, no prefix applied to tb logs
        self.tb_log_prefix = None

        self.save_preds = save_preds
        self.decode_data = None
        self.teacher = teacher
        self.save_targets = save_targets
        self.targets = None
        self.gen_style = gen_style
        # register_attrs is defined on a base class/mixin not visible here;
        # presumably marks these attributes for state saving/restoring
        self.register_attrs('decode_data', 'targets')
Exemplo n.º 4
0
def test_incorrect_num_column_specific_pad_indexes_raises_error():
    """Supplying the wrong number of column-specific pad indexes must fail.

    The data has 3 columns, but the sampler is configured with a tuple
    of ``num_cols + 99`` pad indexes. Drawing a batch should raise,
    because the per-column pad indexes cannot be matched one-to-one
    with the columns.
    """
    bs = 3
    data = ((torch.tensor(1 * [1]), torch.tensor(1 * [2]),
             torch.tensor(1 * [3])), (torch.tensor(4 * [1]),
                                      torch.tensor(5 * [2]),
                                      torch.tensor(6 * [3])),
            (torch.tensor(10 * [1]), torch.tensor(10 * [2]),
             torch.tensor(10 * [3])))
    num_cols = len(data)

    # One pad index per column is required; deliberately provide too many.
    sampler = BaseSampler(batch_size=bs,
                          shuffle=False,
                          pad_index=(num_cols + 99) * (0, ),
                          batch_first=False)
    sampler = sampler.sample(data)
    with pytest.raises(Exception):
        next(sampler)
Exemplo n.º 5
0
    def __init__(self,
                 dataset: Dataset,
                 model: Module,
                 metric_fn: Metric,
                 eval_sampler: Optional[Sampler] = None,
                 eval_data: str = 'test',
                 device: Optional[str] = None) -> None:
        """Initialize the evaluator.

        Parameters
        ----------
        dataset : Dataset
            The dataset to run evaluation on
        model : Module
            The model to evaluate
        metric_fn : Metric
            The metric computed during evaluation
        eval_sampler : Optional[Sampler]
            Sampler over validation examples; when omitted, a
            `BaseSampler` with batch size 16 and shuffling disabled
            is used.
        eval_data : str
            The data split to evaluate on: one of train, val or test
        device : str, optional
            The device to use in the computation. Autodetects CUDA
            when not given.

        """
        # Fall back to the default sampler for a missing/falsy sampler
        if not eval_sampler:
            eval_sampler = BaseSampler(batch_size=16, shuffle=False)
        self.eval_sampler = eval_sampler
        self.model = model
        self.metric_fn = metric_fn
        self.eval_metric = None
        self.dataset = dataset

        # Prefer the GPU when available and no device was requested
        self.device = device if device is not None else (
            "cuda" if torch.cuda.is_available() else "cpu")

        # Build the iterator over the requested split up front
        split = getattr(dataset, eval_data)
        self._eval_iterator = self.eval_sampler.sample(split)

        # By default, no prefix applied to tb logs
        self.tb_log_prefix = None
Exemplo n.º 6
0
def run_experiment(
        name=DEFAULT_HYPER_PARAMS['experiment_name'],
        max_steps=DEFAULT_HYPER_PARAMS['max_steps'],
        iter_per_step=DEFAULT_HYPER_PARAMS['iter_per_step'],
        embedding_dim=DEFAULT_HYPER_PARAMS['embedding_dim'],
        n_layers=DEFAULT_HYPER_PARAMS['n_layers'],
        rnn_type=DEFAULT_HYPER_PARAMS['rnn_type'],
        hidden_size=DEFAULT_HYPER_PARAMS['hidden_size'],
        rnn_dropout=DEFAULT_HYPER_PARAMS['rnn_dropout'],
        embedding_dropout=DEFAULT_HYPER_PARAMS['embedding_dropout']):
    """Train an RNN text classifier on SST, reporting progress as it runs.

    Hyper-parameters default to the values in ``DEFAULT_HYPER_PARAMS``
    and may be overridden (e.g. from a front-end GUI).
    """
    # Reset the stored experiment progress before any work happens
    em.write_progress(0)

    # Dataset with text/label transforms
    sst = SSTDataset(transform={
        'text': TextField(),
        'label': LabelField()
    })

    # Model - takes params from front end GUI or from defaults in json
    classifier = RNNTextClassifier(vocab_size=sst.text.vocab_size,
                                   num_labels=sst.label.vocab_size,
                                   embedding_dim=embedding_dim,
                                   n_layers=n_layers,
                                   rnn_type=rnn_type,
                                   hidden_size=hidden_size,
                                   rnn_dropout=rnn_dropout,
                                   embedding_dropout=embedding_dropout)

    # Trainer: max_steps evaluation steps, iter_per_step iterations each
    experiment_trainer = Trainer(
        dataset=sst,
        model=classifier,
        train_sampler=BaseSampler(),
        val_sampler=BaseSampler(),
        loss_fn=torch.nn.NLLLoss(),
        metric_fn=Accuracy(),
        optimizer=torch.optim.Adam(params=classifier.trainable_params),
        max_steps=max_steps,
        iter_per_step=iter_per_step)

    total_iters = max_steps * iter_per_step
    completed_iters = 0

    with TrialLogging(log_dir=TENSORBOARD_DIR + name,
                      verbose=False,
                      console_prefix=name,
                      capture_warnings=True):
        with tqdm(total=total_iters) as progress_bar:
            while True:
                # run() returns whether training should keep going
                should_continue = experiment_trainer.run()

                # Mirror progress on the CLI bar and in the DB for the GUI
                progress_bar.update(iter_per_step)
                completed_iters += iter_per_step
                em.write_progress(int(completed_iters / total_iters * 100))

                if not should_continue:
                    break
Exemplo n.º 7
0
def test_compose_padded_batches_from_nested_seq():
    """Nested sequences are padded along both nesting levels, per batch.

    Within each batch, every nested column is padded so that each
    observation has the per-batch max number of children AND each child
    has the per-batch max length. With ``batch_first=False`` the asserted
    sizes are ``(max_child_len, max_num_children, bs)`` for nested
    columns and ``(max_len, bs)`` for flat ones.

    First batch (observations 1-2):
        * col b: max 4 children, max child length 5 -> (5, 4, bs)
        * col c: max 3 children, max child length 8 -> (8, 3, bs)
    Second batch (observations 3-4):
        * col b: max 2 children, max child length 6 -> (6, 2, bs)
        * col c: max 1 child,    max child length 7 -> (7, 1, bs)
    """
    bs = 2
    data = (
        (
            torch.tensor([7]),  # Not nested
            [torch.tensor([1, 2]),
             torch.tensor([3, 4, 5, 6, 7])],  # Nested
            [
                torch.tensor([1, 2]),
                torch.tensor([1, 2]),
                torch.tensor([3, 4, 5])
            ]  # Nested
        ),
        (
            torch.tensor([7, 8]),  # Not nested
            [
                torch.tensor([7, 8]),
                torch.tensor([7, 8]),
                torch.tensor([7, 8]),
                torch.tensor([7, 8])
            ],  # Nested
            [torch.tensor([1, 2, 3, 4, 5, 6, 7, 8])]  # Nested
        ),
        (
            torch.tensor([7, 8, 9]),  # Not nested
            [torch.tensor([1, 2, 3]),
             torch.tensor([4, 5])],  # Nested
            [torch.tensor([1, 2])]  # Nested
        ),
        (
            torch.tensor([7, 8, 9, 10]),  # Not nested
            [torch.tensor([1, 2, 3]),
             torch.tensor([4, 5, 6, 7, 8, 9])],  # Nested
            [torch.tensor([1, 2, 4, 5, 6, 7, 8])]  # Nested
        ),
    )
    sampler = BaseSampler(batch_size=bs,
                          shuffle=False,
                          pad_index=0,
                          batch_first=False)
    sampler = sampler.sample(data)

    batch = next(sampler)
    a, b, c = batch
    assert a.size() == (
        2, bs)  # In first batch, first col: largest element has length 2
    assert b.size() == (
        5, 4, bs
    )  # In first batch, second col: largest child has length 5; largest seq. of children has length 4
    assert c.size() == (
        8, 3, bs
    )  # In first batch, third col: largest child has length 8; largest seq. of children has length 3

    batch = next(sampler)
    a, b, c = batch
    assert a.size() == (
        4, bs)  # In second batch, first col: largest element has length 4
    assert b.size() == (
        6, 2, bs
    )  # In second batch, second col: largest child has length 6; largest seq. of children has length 2
    assert c.size() == (
        7, 1, bs
    )  # In second batch, third col: largest child has length 7; largest seq. of children has length 1