Example #1
def test_pass_labels_with_unknown_1():
    """Test labels specified in the init"""
    dummy = ['LABEL1', 'LABEL3', 'LABEL2', 'LABEL2']

    field = LabelField(labels=['LABEL1', 'LABEL2'])
    with pytest.raises(ValueError):
        field.setup(dummy)
Example #2
def test_label_frequencies_3():
    """Test label frequencies."""
    dummy = []

    field = LabelField()
    field.setup(dummy)

    assert len(field.vocab) == 0

    assert len(field.label_freq) == 0
    assert len(field.label_count) == 0
    assert len(field.label_inv_freq) == 0
Example #3
def test_label_process_multilabel():
    """Test label nuemricalization."""
    dummy = ['LABEL1,LABEL2', 'LABEL3', 'LABEL2,LABEL1', 'LABEL2']

    field = LabelField()
    field.setup(dummy)
    assert len(field.vocab) == 4

    field = LabelField(multilabel_sep=',')
    field.setup(dummy)
    assert len(field.vocab) == 3

    assert list(field.process('LABEL1,LABEL2')) == [0, 1]
    assert list(field.process('LABEL2,LABEL1')) == [1, 0]
    assert int(field.process('LABEL2')) == 1
    assert int(field.process('LABEL3')) == 2
Example #4
def test_dataset_transform_7():
    train = (
            ("Lorem ipsum dolor sit amet", "POSITIVE"),
            ("Sed ut perspiciatis unde", "NEGATIVE"))

    class DummyField(Field):
        def setup(self, *data: np.ndarray) -> None:
            pass

        def process(self, ex1, ex2):
            return torch.tensor(0)

    transform = {
        "text": {
            "field": DummyField(),
            "columns": [0, 1]
        },
        "other": {
            "field": DummyField(),
            "columns": [0, 1]
        },
        "other2": {
            "field": LabelField(),
            "columns": 0
        }
    }

    t = TabularDataset(train, transform=transform)
    assert t.train.cols() == 3
Example #5
def test_label_frequencies_2():
    """Test label frequencies."""
    dummy = ['LABEL1'] * 80

    field = LabelField()
    field.setup(dummy)

    assert len(field.vocab) == 1

    assert len(field.label_freq) == 1
    assert isclose(field.label_freq[0].item(), 1, rtol=NUMERIC_PRECISION)

    assert len(field.label_count) == 1
    assert isclose(field.label_count[0].item(), 80, rtol=NUMERIC_PRECISION)

    assert len(field.label_inv_freq) == 1
    assert isclose(field.label_inv_freq[0].item(), 1, rtol=NUMERIC_PRECISION)
Example #6
def test_dataset_transform_8():
    train = (("Lorem ipsum dolor sit amet", "POSITIVE"),
             ("Sed ut perspiciatis unde", "NEGATIVE"))

    transform = {"tx": {"field": LabelField(), "columns": [0, 1]}}

    with pytest.raises(TypeError):
        t = TabularDataset(train, transform=transform)
        t.train.cols()
Example #7
def test_dataset_transform_with_invalid_named_cols():
    train = (("Lorem ipsum dolor sit amet", "POSITIVE"),
             ("Sed ut perspiciatis unde", "NEGATIVE"))

    transform = {"tx": {"field": LabelField(), "columns": 'none_existent'}}

    with pytest.raises(ValueError):
        TabularDataset(train,
                       transform=transform,
                       named_columns=['text', 'label'])
Example #8
def test_dataset_transform_with_named_cols():
    train = (("Lorem ipsum dolor sit amet", "POSITIVE"),
             ("Sed ut perspiciatis unde", "NEGATIVE"))

    transform = {"tx": {"field": LabelField(), "columns": 'label'}}

    t = TabularDataset(train,
                       transform=transform,
                       named_columns=['text', 'label'])
    assert len(t.train[0]) == 1
Example #9
def test_label_process_multilabel_one_hot_frequencies():
    """Test label numericalization."""
    dummy = ['LABEL1,LABEL2', 'LABEL3', 'LABEL2,LABEL1', 'LABEL2']

    field = LabelField(multilabel_sep=',', one_hot=True)
    field.setup(dummy)

    assert len(field.label_freq) == 3
    assert isclose(field.label_freq[0].item(), 0.333, rtol=NUMERIC_PRECISION)
    assert isclose(field.label_freq[1].item(), 0.5, rtol=NUMERIC_PRECISION)
    assert isclose(field.label_freq[2].item(), 0.166, rtol=NUMERIC_PRECISION)

    assert len(field.label_count) == 3
    assert isclose(field.label_count[0].item(), 2, rtol=NUMERIC_PRECISION)
    assert isclose(field.label_count[1].item(), 3, rtol=NUMERIC_PRECISION)
    assert isclose(field.label_count[2].item(), 1, rtol=NUMERIC_PRECISION)

    assert len(field.label_inv_freq) == 3
    assert isclose(field.label_inv_freq[0].item(), 3, rtol=NUMERIC_PRECISION)
    assert isclose(field.label_inv_freq[1].item(), 2, rtol=NUMERIC_PRECISION)
    assert isclose(field.label_inv_freq[2].item(), 6, rtol=NUMERIC_PRECISION)
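
The expected values in Example #9 follow from simple counting: after splitting on ',', LABEL1 occurs twice, LABEL2 three times and LABEL3 once, out of six label occurrences in total. A minimal sketch of that arithmetic (just the bookkeeping the assertions rely on, not the LabelField implementation):

from collections import Counter

dummy = ['LABEL1,LABEL2', 'LABEL3', 'LABEL2,LABEL1', 'LABEL2']

# Count individual labels after splitting on the multilabel separator
counts = Counter(label for row in dummy for label in row.split(','))
total = sum(counts.values())  # 6 label occurrences

for label in ('LABEL1', 'LABEL2', 'LABEL3'):
    freq = counts[label] / total  # 0.333, 0.5, 0.167
    inv_freq = 1 / freq           # 3, 2, 6
    print(label, counts[label], round(freq, 3), round(inv_freq, 3))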
Example #10
def test_label_frequencies():
    """Test label frequencies."""
    dummy = ['LABEL1'] * 80
    dummy.extend(['LABEL2'] * 20)

    field = LabelField()
    field.setup(dummy)

    assert len(field.vocab) == 2

    assert len(field.label_freq) == 2
    assert isclose(field.label_freq[0].item(), 0.8, rtol=NUMERIC_PRECISION)
    assert isclose(field.label_freq[1].item(), 0.2, rtol=NUMERIC_PRECISION)

    assert len(field.label_count) == 2
    assert isclose(field.label_count[0].item(), 80, rtol=NUMERIC_PRECISION)
    assert isclose(field.label_count[1].item(), 20, rtol=NUMERIC_PRECISION)

    assert len(field.label_inv_freq) == 2
    assert isclose(field.label_inv_freq[0].item(), 1.25, rtol=NUMERIC_PRECISION)
    assert isclose(field.label_inv_freq[1].item(), 5, rtol=NUMERIC_PRECISION)
Example #11
def test_dataset_transform():
    train = (("Lorem ipsum dolor sit amet", "POSITIVE"),
             ("Sed ut perspiciatis unde", "NEGATIVE"))

    transform = {"text": TextField(), "label": LabelField()}

    t = TabularDataset(train, transform=transform)

    assert hasattr(t, "text")
    assert hasattr(t, "label")

    assert t.label.vocab_size == 2
    assert t.text.vocab_size == 11
Example #12
def test_dataset_transform_3():
    train = (("Lorem ipsum dolor sit amet", "POSITIVE"),
             ("Sed ut perspiciatis unde", "NEGATIVE"))

    transform = {
        "text": {
            "columns": 0
        },
        "label": {
            "field": LabelField(),
            "columns": 1
        }
    }

    with pytest.raises(ValueError):
        TabularDataset(train, transform=transform)
Example #13
def test_label_process_one_hot():
    """Test label numericalization."""
    dummy = ['LABEL1', 'LABEL3', 'LABEL2', 'LABEL2']

    field = LabelField(one_hot=True)
    field.setup(dummy)

    assert len(field.vocab) == 3
    assert list(field.process('LABEL1')) == [1, 0, 0]
    assert list(field.process('LABEL2')) == [0, 0, 1]
    assert list(field.process('LABEL3')) == [0, 1, 0]
Example #14
def test_label_process():
    """Test label nuemricalization."""
    dummy = ['LABEL1', 'LABEL3', 'LABEL2', 'LABEL2']

    field = LabelField()
    field.setup(dummy)

    assert len(field.vocab) == 3
    assert int(field.process('LABEL1')) == 0
    assert int(field.process('LABEL2')) == 2
    assert int(field.process('LABEL3')) == 1
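
The index assignments in Examples #13 and #14 are consistent with a vocabulary built in order of first appearance in the setup data (LABEL1, LABEL3, LABEL2), which is why LABEL2 ends up at index 2. A tiny sketch of that assumption (illustrative only, not the library code):

dummy = ['LABEL1', 'LABEL3', 'LABEL2', 'LABEL2']

# dict.fromkeys preserves insertion order, so indices follow first appearance
vocab = {label: idx for idx, label in enumerate(dict.fromkeys(dummy))}
assert vocab == {'LABEL1': 0, 'LABEL3': 1, 'LABEL2': 2}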
Example #15
def test_pass_bool_labels():
    """Test labels specified in the init"""
    dummy = [True, False, True, True]

    field = LabelField(labels=[False, True])
    field.setup(dummy)

    assert len(field.vocab) == 2
    assert int(field.process(False)) == 0
    assert int(field.process(True)) == 1

    field = LabelField(labels=[True, False])
    field.setup(dummy)

    assert len(field.vocab) == 2
    assert int(field.process(False)) == 1
    assert int(field.process(True)) == 0
Example #16
def run_experiment(
        name=DEFAULT_HYPER_PARAMS['experiment_name'],
        max_steps=DEFAULT_HYPER_PARAMS['max_steps'],
        iter_per_step=DEFAULT_HYPER_PARAMS['iter_per_step'],
        embedding_dim=DEFAULT_HYPER_PARAMS['embedding_dim'],
        n_layers=DEFAULT_HYPER_PARAMS['n_layers'],
        rnn_type=DEFAULT_HYPER_PARAMS['rnn_type'],
        hidden_size=DEFAULT_HYPER_PARAMS['hidden_size'],
        rnn_dropout=DEFAULT_HYPER_PARAMS['rnn_dropout'],
        embedding_dropout=DEFAULT_HYPER_PARAMS['embedding_dropout']):
    # start off experiment progress at 0
    em.write_progress(0)

    # Dataset
    dataset = SSTDataset(transform={
        'text': TextField(),
        'label': LabelField()
    })

    # Model - takes params from front end GUI or from defaults in json
    model = RNNTextClassifier(vocab_size=dataset.text.vocab_size,
                              num_labels=dataset.label.vocab_size,
                              embedding_dim=embedding_dim,
                              n_layers=n_layers,
                              rnn_type=rnn_type,
                              hidden_size=hidden_size,
                              rnn_dropout=rnn_dropout,
                              embedding_dropout=embedding_dropout)

    # Trainer
    trainer = Trainer(
        dataset=dataset,
        model=model,
        train_sampler=BaseSampler(),
        val_sampler=BaseSampler(),
        loss_fn=torch.nn.NLLLoss(),
        metric_fn=Accuracy(),
        optimizer=torch.optim.Adam(params=model.trainable_params),
        max_steps=max_steps,  # Total number of times to evaluate the model
        iter_per_step=iter_per_step  # Number of training iterations between steps
    )

    # Run training
    current_iter_num = 0
    total_iters = max_steps * iter_per_step
    continue_ = True

    with TrialLogging(log_dir=TENSORBOARD_DIR + name,
                      verbose=False,
                      console_prefix=name,
                      capture_warnings=True):
        with tqdm(total=total_iters) as pbar:
            while continue_:
                # trainer.run() returns a boolean indicating whether to keep going
                continue_ = trainer.run()
                # Update CLI progress bar
                pbar.update(iter_per_step)  # N iterations per step

                # Update progress data in DB to reflect updates on GUI
                current_iter_num += iter_per_step
                em.write_progress(int(current_iter_num / total_iters * 100))
Example #17
def train(args):
    """Run Training """

    global_step = 0
    best_metric = None
    best_model: Dict[str, torch.Tensor] = dict()
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    writer = SummaryWriter(log_dir=args.output_dir)

    # We use flambe to do the data preprocessing
    # More info at https://flambe.ai
    print("Performing preprocessing (possibly download embeddings).")
    embeddings = args.embeddings if args.use_pretrained_embeddings else None
    text_field = TextField(lower=args.lowercase,
                           embeddings=embeddings,
                           embeddings_format='gensim')
    label_field = LabelField()
    transforms = {'text': text_field, 'label': label_field}
    dataset = TabularDataset.from_path(
        args.train_path,
        args.val_path,
        sep=',' if args.file_type == 'csv' else '\t',
        transform=transforms)

    # Create samplers
    train_sampler = EpisodicSampler(dataset.train,
                                    n_support=args.n_support,
                                    n_query=args.n_query,
                                    n_episodes=args.n_episodes,
                                    n_classes=args.n_classes)

    # The train_eval_sampler is used to compute prototypes over the full dataset
    train_eval_sampler = BaseSampler(dataset.train,
                                     batch_size=args.eval_batch_size)
    val_sampler = BaseSampler(dataset.val, batch_size=args.eval_batch_size)

    if args.device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    else:
        device = args.device

    # Build model, criterion and optimizers
    model = PrototypicalTextClassifier(
        vocab_size=dataset.text.vocab_size,
        distance=args.distance,
        embedding_dim=args.embedding_dim,
        pretrained_embeddings=dataset.text.embedding_matrix,
        rnn_type='sru',
        n_layers=args.n_layers,
        hidden_dim=args.hidden_dim,
        freeze_pretrained_embeddings=True)

    loss_fn = nn.CrossEntropyLoss()

    parameters = (p for p in model.parameters() if p.requires_grad)
    optimizer = torch.optim.Adam(parameters, lr=args.learning_rate)

    print("Beginning training.")
    for epoch in range(args.num_epochs):

        ######################
        #       TRAIN        #
        ######################

        print(f'Epoch: {epoch}')

        model.train()

        with torch.enable_grad():
            for batch in train_sampler:
                # Zero the gradients and clear the accumulated loss
                optimizer.zero_grad()

                # Move to device
                batch = tuple(t.to(device) for t in batch)
                query, query_label, support, support_label = batch

                # Compute loss
                pred = model(query, support, support_label)
                loss = loss_fn(pred, query_label)
                loss.backward()

                # Clip gradients if necessary
                if args.max_grad_norm is not None:
                    clip_grad_norm_(model.parameters(), args.max_grad_norm)

                writer.add_scalar('Training/Loss', loss.item(), global_step)

                # Optimize
                optimizer.step()
                global_step += 1

            # Zero the gradients when exiting a train step
            optimizer.zero_grad()

        #########################
        #       EVALUATE        #
        #########################

        model.eval()

        with torch.no_grad():

            # First compute prototypes over the training data
            encodings, labels = [], []
            for text, label in train_eval_sampler:
                padding_mask = (text != model.padding_idx).byte()
                text_embeddings = model.embedding_dropout(
                    model.embedding(text))
                text_encoding = model.encoder(text_embeddings,
                                              padding_mask=padding_mask)
                labels.append(label.cpu())
                encodings.append(text_encoding.cpu())
            # Compute prototypes
            encodings = torch.cat(encodings, dim=0)
            labels = torch.cat(labels, dim=0)
            prototypes = model.compute_prototypes(encodings, labels).to(device)

            _preds, _targets = [], []
            for batch in val_sampler:
                # Move to device
                source, target = tuple(t.to(device) for t in batch)

                pred = model(source, prototypes=prototypes)
                _preds.append(pred.cpu())
                _targets.append(target.cpu())

            preds = torch.cat(_preds, dim=0)
            targets = torch.cat(_targets, dim=0)

            val_loss = loss_fn(preds, targets).item()
            val_metric = (preds.argmax(dim=1) == targets).float().mean().item()

        # Update best model
        if best_metric is None or val_metric > best_metric:
            best_metric = val_metric
            best_model_state = model.state_dict()
            for k, t in best_model_state.items():
                best_model_state[k] = t.cpu().detach()
            best_model = best_model_state

        # Log metrics
        print(f'Validation loss: {val_loss}')
        print(f'Validation accuracy: {val_metric}')
        writer.add_scalar('Validation/Loss', val_loss, epoch)
        writer.add_scalar('Validation/Accuracy', val_metric, epoch)

    # Save the best model
    print("Finisehd training.")
    torch.save(best_model, os.path.join(args.output_dir, 'model.pt'))
Example #18
def test_pass_labels():
    """Test labels specified in the init"""
    dummy = ['LABEL1', 'LABEL3', 'LABEL2', 'LABEL2']

    field = LabelField(labels=['LABEL1', 'LABEL2', 'LABEL3'])
    field.setup(dummy)

    assert len(field.vocab) == 3
    assert int(field.process('LABEL1')) == 0
    assert int(field.process('LABEL2')) == 1
    assert int(field.process('LABEL3')) == 2

    field = LabelField(labels=['LABEL3', 'LABEL1', 'LABEL2'])
    field.setup(dummy)

    assert len(field.vocab) == 3
    assert int(field.process('LABEL1')) == 1
    assert int(field.process('LABEL2')) == 2
    assert int(field.process('LABEL3')) == 0
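
Examples #1 and #18 together pin down the behaviour of the labels argument: the order of the list determines the indices, and a value seen during setup that is not in the list raises a ValueError. A minimal dict-based sketch of that contract (illustrative, not the actual LabelField code):

def build_label_index(labels, data):
    """Map each allowed label to its position and reject unknown values."""
    index = {label: i for i, label in enumerate(labels)}
    for value in data:
        if value not in index:
            raise ValueError(f"Unknown label: {value}")
    return index

# The order of `labels` drives the indices, as in Example #18
index = build_label_index(['LABEL3', 'LABEL1', 'LABEL2'],
                          ['LABEL1', 'LABEL3', 'LABEL2', 'LABEL2'])
assert (index['LABEL3'], index['LABEL1'], index['LABEL2']) == (0, 1, 2)

# A value outside the allowed labels fails, as in Example #1
try:
    build_label_index(['LABEL1', 'LABEL2'], ['LABEL1', 'LABEL3', 'LABEL2'])
except ValueError:
    pass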