Example #1
def test_build_vocab_lower():
    field = TextField(lower=True, pad_token=None, unk_token=None)

    dummy = ["justo Praesent luctus", "luctus praesent"]
    field.setup(dummy)

    vocab = {'justo': 0, 'praesent': 1, 'luctus': 2}
    assert field.vocab == vocab
Example #2
def test_build_vocab_empty():
    field = TextField(pad_token=None, unk_token=None)
    assert field.vocab == dict()

    dummy = ["justo Praesent luctus", "luctus praesent"]
    field.setup(dummy)

    vocab = {'justo': 0, 'Praesent': 1, 'luctus': 2, 'praesent': 3}
    assert field.vocab == vocab
Example #3
# Assumed imports (the listing omits them); `odict` is taken to be OrderedDict:
from collections import OrderedDict as odict

import numpy as np
import torch
from gensim.models import KeyedVectors


def test_build_vocab_setup_all_embeddings():
    """
    This test shows that all fields in the embeddings will be included.

    In embeddings and data:
        blue
        green
        yellow
    In embeddings only:
        purple
        gold
    In data only:
        white

    Expected vocab:
        blue
        green
        yellow
        purple
        gold
        white
    """

    model = KeyedVectors(10)
    model.add('purple', np.random.rand(10))
    model.add('gold', np.random.rand(10))
    model.add('<unk>', np.random.rand(10))
    model.add('blue', np.random.rand(10))
    model.add('green', np.random.rand(10))
    model.add('<pad>', np.random.rand(10))
    model.add('yellow', np.random.rand(10))

    field = TextField(
        model=model,
        setup_all_embeddings=True,
    )

    dummy = ["blue green", "yellow", 'white']

    field.setup(dummy)

    # assert vocab setup in expected order
    assert field.vocab == odict([
        ('<pad>', 0), ('<unk>', 1), ('blue', 2), ('green', 3),
        ('yellow', 4), ('white', 1), ('purple', 5), ('gold', 6),
    ])

    # assert embedding matrix organized in expected order
    assert torch.equal(
        field.embedding_matrix,
        torch.stack([
            torch.tensor(model['<pad>']), torch.tensor(model['<unk>']),
            torch.tensor(model['blue']), torch.tensor(model['green']),
            torch.tensor(model['yellow']), torch.tensor(model['purple']),
            torch.tensor(model['gold'])
        ]),
    )
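
Note the subtlety above: 'white' occurs only in the data, so it is aliased to the <unk> index (1) instead of receiving its own row, which is why the vocab has eight keys but the embedding matrix only seven rows. A minimal sketch of that aliasing rule, using a hypothetical helper (not flambe's internals):

def build_vocab_with_embeddings(data_tokens, embedding_tokens,
                                specials=('<pad>', '<unk>')):
    # Tokens backed by a pretrained vector get a fresh index; data-only
    # tokens are aliased to the <unk> index and share its embedding row.
    vocab = {tok: i for i, tok in enumerate(specials)}
    for tok in data_tokens:
        if tok not in vocab:
            vocab[tok] = (len(set(vocab.values()))
                          if tok in embedding_tokens else vocab['<unk>'])
    for tok in embedding_tokens:  # embedding-only tokens come last
        if tok not in vocab:
            vocab[tok] = len(set(vocab.values()))
    return vocab


data = ['blue', 'green', 'yellow', 'white']
pretrained = ['purple', 'gold', '<unk>', 'blue', 'green', '<pad>', 'yellow']
assert build_vocab_with_embeddings(data, pretrained) == {
    '<pad>': 0, '<unk>': 1, 'blue': 2, 'green': 3,
    'yellow': 4, 'white': 1, 'purple': 5, 'gold': 6}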
Example #4
def test_load_embeddings():
    field = TextField(pad_token=None,
                      unk_init_all=False,
                      embeddings="tests/data/dummy_embeddings/test.txt")
    dummy = "a test !"
    field.setup([dummy])

    # Now we have embeddings to check against
    true_embeddings = torch.tensor([[0.9, 0.1, 0.2, 0.3],
                                    [0.4, 0.5, 0.6, 0.7]])
    assert len(field.embedding_matrix) == 3
    assert torch.all(torch.eq(field.embedding_matrix[1:3], true_embeddings))
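
For reference, test.txt here is presumably a whitespace-separated text embeddings file (one token per line followed by its vector, the common GloVe text layout). A minimal loader sketch for that format, under that assumption:

import torch

def load_text_embeddings(path):
    # Each line: `token v1 v2 ... vN` (GloVe-style text format).
    vocab, vectors = {}, []
    with open(path) as f:
        for line in f:
            token, *values = line.rstrip().split()
            vocab[token] = len(vectors)
            vectors.append([float(v) for v in values])
    return vocab, torch.tensor(vectors)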
Example #5
def test_build_vocab_list():
    field = TextField()
    dummy = [["justo Praesent luctus", "luctus praesent"],
             ["justo Praesent luctus", "luctus praesent est"]]
    field._build_vocab(dummy)

    vocab = {
        '<pad>': 0,
        '<unk>': 1,
        'justo': 2,
        'Praesent': 3,
        'luctus': 4,
        'praesent': 5,
        'est': 6
    }
    assert field.vocab == vocab
Example #6
def test_build_vocab():
    field = TextField(pad_token='<pad>', unk_token='<unk>')
    assert field.vocab == {'<pad>': 0, '<unk>': 1}

    dummy = ["justo Praesent luctus", "luctus praesent"]
    field.setup(dummy)

    vocab = {
        '<pad>': 0,
        '<unk>': 1,
        'justo': 2,
        'Praesent': 3,
        'luctus': 4,
        'praesent': 5
    }
    assert field.vocab == vocab
Example #7
def test_text_process_lower():
    field = TextField(lower=True)
    field.setup()

    dummy = "justo Praesent luctus justo praesent"
    assert list(field.process(dummy)) == [1, 1, 1, 1, 1]

    field.setup([dummy])
    assert list(field.process(dummy)) == [2, 3, 4, 2, 3]
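
Before setup() sees any data, the vocab holds only the specials, so every token falls back to the <unk> index (1); after setup, known tokens get their own ids. A minimal sketch of that lookup (hypothetical helper, assuming a plain dict vocab):

def numericalize(text, vocab, unk_index=1, lower=True):
    # Unknown tokens fall back to the <unk> index, which is why the
    # first process() call above returns all 1s.
    tokens = text.lower().split() if lower else text.split()
    return [vocab.get(tok, unk_index) for tok in tokens]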
Example #8
def test_dataset_transform_5():
    train = (("Lorem ipsum dolor sit amet", "POSITIVE"),
             ("Sed ut perspiciatis unde", "NEGATIVE"))

    transform = {
        "t1": {
            "field": TextField(),
            "columns": 0
        },
        "t2": {
            "field": TextField(),
            "columns": 0
        }
    }

    t = TabularDataset(train, transform=transform)
    assert t.train.cols() == 2
Example #9
def test_build_vocab_nested_list_in_dict():
    field = TextField()
    dummy = [{
        'text1': ["justo Praesent luctus", "luctus praesent"],
        'text2': ["justo Praesent luctus", "luctus praesent est"]
    }]
    field._build_vocab(dummy)

    vocab = {
        '<pad>': 0,
        '<unk>': 1,
        'justo': 2,
        'Praesent': 3,
        'luctus': 4,
        'praesent': 5,
        'est': 6
    }
    assert field.vocab == vocab
Example #10
def test_load_embeddings_with_extra_tokens():
    field = TextField.from_embeddings(
        embeddings="tests/data/dummy_embeddings/test.txt",
        pad_token=None,
        unk_init_all=False,
        additional_special_tokens=['<a>', '<b>', '<c>'])
    dummy = "a test ! <a> <b> "
    field.setup([dummy])
    assert '<a>' in field.vocab and '<b>' in field.vocab and '<c>' in field.vocab
    assert field.embedding_matrix[field.vocab['<a>']].size(-1) == 4
    assert field.embedding_matrix[field.vocab['<b>']].size(-1) == 4
    assert all(field.embedding_matrix[field.vocab['<b>']] !=
               field.embedding_matrix[field.vocab['<c>']])
Example #11
def test_dataset_transform():
    train = (("Lorem ipsum dolor sit amet", "POSITIVE"),
             ("Sed ut perspiciatis unde", "NEGATIVE"))

    transform = {"text": TextField(), "label": LabelField()}

    t = TabularDataset(train, transform=transform)

    assert hasattr(t, "text")
    assert hasattr(t, "label")

    assert t.label.vocab_size == 2
    assert t.text.vocab_size == 11
Example #12
def test_setup_with_extra_tokens():
    field = TextField.from_embeddings(
        embeddings="tests/data/dummy_embeddings/test.txt",
        pad_token=None,
        unk_init_all=False,
        additional_special_tokens=['<a>', '<b>', '<c>'])

    dummy = "this is a test"
    field.setup([dummy])
    assert recursive_tensor_to_list(field.process(dummy)) == [4, 5, 6, 7]

    dummy = "this is a test <a> <c>"
    assert recursive_tensor_to_list(field.process(dummy)) == [4, 5, 6, 7, 1, 3]
Example #13
def test_text_process_list():
    field = TextField(lower=True)
    field.setup()
    dummy = [["justo Praesent luctus", "luctus praesent"],
             ["justo Praesent luctus", "luctus praesent est"]]
    assert recursive_tensor_to_list(field.process(dummy)) == [[[1, 1, 1],
                                                               [1, 1]],
                                                              [[1, 1, 1],
                                                               [1, 1, 1]]]

    field.setup(dummy)
    assert recursive_tensor_to_list(field.process(dummy)) == [[[2, 3, 4],
                                                               [4, 3]],
                                                              [[2, 3, 4],
                                                               [4, 3, 5]]]
Example #14
def test_text_process_nested_list_in_dict():
    field = TextField(lower=True)
    field.setup()
    dummy = [{
        'text1': ["justo Praesent luctus", "luctus praesent"],
        'text2': ["justo Praesent luctus", "luctus praesent est"]
    }]
    assert recursive_tensor_to_list(field.process(dummy)) == [{
        'text1': [[1, 1, 1], [1, 1]],
        'text2': [[1, 1, 1], [1, 1, 1]]
    }]
    field.setup(dummy)
    assert recursive_tensor_to_list(field.process(dummy)) == [{
        'text1': [[2, 3, 4], [4, 3]],
        'text2': [[2, 3, 4], [4, 3, 5]]
    }]
Example #15
def test_text_process_dict():
    field = TextField(lower=True)
    field.setup()
    dummy = {
        'text1': "justo Praesent luctus luctus praesent",
        'text2': "justo Praesent luctus luctus praesent est"
    }
    assert recursive_tensor_to_list(field.process(dummy)) == {
        'text1': [1, 1, 1, 1, 1],
        'text2': [1, 1, 1, 1, 1, 1]
    }
    field.setup([dummy])
    assert recursive_tensor_to_list(field.process(dummy)) == {
        'text1': [2, 3, 4, 4, 3],
        'text2': [2, 3, 4, 4, 3, 5]
    }
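
Examples #13 through #15 all rely on process() recursing through lists and dicts while numericalizing only the leaf strings. A minimal sketch of that recursion (hypothetical helper, assuming a plain dict vocab with <unk> at index 1):

def process_nested(data, vocab, unk_index=1):
    # Recurse through containers; only leaf strings are numericalized.
    if isinstance(data, str):
        return [vocab.get(tok, unk_index) for tok in data.lower().split()]
    if isinstance(data, dict):
        return {key: process_nested(val, vocab, unk_index)
                for key, val in data.items()}
    return [process_nested(item, vocab, unk_index) for item in data]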
Example #16
def test_load_embeddings_empty_voc():
    field = TextField.from_embeddings(
        embeddings="tests/data/dummy_embeddings/test.txt",
        pad_token=None,
        unk_init_all=True,
    )

    dummy = "justo Praesent luctus justo praesent"
    field.setup([dummy])

    # unk_init_all=True: none of these tokens are in the embeddings file,
    # but every vocab entry still gets its own randomly initialized row
    assert len(field.embedding_matrix) == 5

    field = TextField.from_embeddings(
        embeddings="tests/data/dummy_embeddings/test.txt",
        pad_token=None,
        unk_init_all=False,
    )

    dummy = "justo Praesent luctus justo praesent"
    field.setup([dummy])

    # unk_init_all=False: tokens missing from the embeddings file collapse
    # onto the single <unk> row, so the matrix has just one row
    assert len(field.embedding_matrix) == 1
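
The two matrix lengths above follow directly from unk_init_all: True allocates a randomly initialized row for every vocab entry, while False collapses every miss onto the single <unk> row. A sketch of that sizing rule, assuming the dummy file holds tokens like 'a' and 'test' (none of which appear in the Lorem-ipsum data):

def embedding_rows(vocab_tokens, pretrained_tokens, unk_init_all):
    # unk_init_all=True: one row per vocab entry, hits and misses alike.
    if unk_init_all:
        return len(vocab_tokens)
    # unk_init_all=False: one shared <unk> row plus a row per pretrained hit.
    hits = sum(tok in pretrained_tokens
               for tok in vocab_tokens if tok != '<unk>')
    return 1 + hits


vocab = ['<unk>', 'justo', 'Praesent', 'luctus', 'praesent']
assert embedding_rows(vocab, {'a', 'test'}, unk_init_all=True) == 5
assert embedding_rows(vocab, {'a', 'test'}, unk_init_all=False) == 1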
Example #17
def test_build_vocab_decorators_missing_specials():
    field = TextField(pad_token=None, unk_token=None,
                      sos_token='<sos>', eos_token='<eos>')
    field._build_vocab()

    assert field.vocab == {'<sos>': 0, '<eos>': 1}
    dummy = ["justo Praesent luctus", "luctus praesent"]
    field._build_vocab(dummy)

    vocab = {'<sos>': 0, '<eos>': 1, 'justo': 2, 'Praesent': 3, 'luctus': 4, 'praesent': 5}
    assert field.vocab == vocab
Example #18
def test_dataset_transform_with_mixed_cols():
    train = (("Lorem ipsum dolor sit amet", "POSITIVE"),
             ("Sed ut perspiciatis unde", "NEGATIVE"))

    transform = {
        "label": {
            "field": LabelField(),
            "columns": 1,
        },
        "text": {
            "field": TextField(),
            "columns": 'text',
        }
    }

    t = TabularDataset(train, transform=transform, named_columns=['text', 'label'])
    assert len(t.train) == 2
    assert len(t.train[0]) == 2
Example #19
def test_build_vocab_decorators():
    field = TextField(pad_token=None,
                      unk_token=None,
                      sos_token='<sos>',
                      eos_token='<eos>')

    assert field.vocab == {'<sos>': 0, '<eos>': 1}
    dummy = ["justo Praesent luctus", "luctus praesent"]
    field.setup(dummy)

    vocab = {
        '<sos>': 0,
        '<eos>': 1,
        'justo': 2,
        'Praesent': 3,
        'luctus': 4,
        'praesent': 5
    }
    assert field.vocab == vocab

    field = TextField(pad_token='<pad>',
                      unk_token='<unk>',
                      sos_token='<sos>',
                      eos_token='<eos>')

    assert field.vocab == {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}
    dummy = ["justo Praesent luctus", "luctus praesent"]
    field.setup(dummy)

    vocab = {
        '<pad>': 0,
        '<unk>': 1,
        '<sos>': 2,
        '<eos>': 3,
        'justo': 4,
        'Praesent': 5,
        'luctus': 6,
        'praesent': 7
    }
    assert field.vocab == vocab
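
Both runs above follow the same rule: declared special tokens claim the first indices in a fixed order (pad, unk, sos, eos, with None skipped), and data tokens are appended afterwards. A sketch of that initialization (hypothetical helper, under that reading):

def initial_vocab(pad_token=None, unk_token=None,
                  sos_token=None, eos_token=None):
    # Specials claim the first indices in declaration order; None is skipped.
    specials = [t for t in (pad_token, unk_token, sos_token, eos_token) if t]
    return {tok: i for i, tok in enumerate(specials)}


assert initial_vocab(sos_token='<sos>', eos_token='<eos>') == {
    '<sos>': 0, '<eos>': 1}
assert initial_vocab('<pad>', '<unk>', '<sos>', '<eos>') == {
    '<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}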
Example #20
def test_text_process_unk():
    field = TextField(unk_token=None)

    dummy = "justo Praesent luctus justo praesent"
    with pytest.raises(Exception):
        field.process(dummy)
Example #21
def train(args):
    """Run Training """

    global_step = 0
    best_metric = None
    best_model: Dict[str, torch.Tensor] = dict()
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    writer = SummaryWriter(log_dir=args.output_dir)

    # We use flambe to do the data preprocessing
    # More info at https://flambe.ai
    print("Performing preprocessing (possibly download embeddings).")
    embeddings = args.embeddings if args.use_pretrained_embeddings else None
    text_field = TextField(lower=args.lowercase,
                           embeddings=embeddings,
                           embeddings_format='gensim')
    label_field = LabelField()
    transforms = {'text': text_field, 'label': label_field}
    dataset = TabularDataset.from_path(
        args.train_path,
        args.val_path,
        sep=',' if args.file_type == 'csv' else '\t',
        transform=transforms)

    # Create samplers
    train_sampler = EpisodicSampler(dataset.train,
                                    n_support=args.n_support,
                                    n_query=args.n_query,
                                    n_episodes=args.n_episodes,
                                    n_classes=args.n_classes)

    # The train_eval_sampler is used to compute prototypes over the full dataset
    train_eval_sampler = BaseSampler(dataset.train,
                                     batch_size=args.eval_batch_size)
    val_sampler = BaseSampler(dataset.val, batch_size=args.eval_batch_size)

    if args.device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    else:
        device = args.device

    # Build model, criterion and optimizers
    model = PrototypicalTextClassifier(
        vocab_size=dataset.text.vocab_size,
        distance=args.distance,
        embedding_dim=args.embedding_dim,
        pretrained_embeddings=dataset.text.embedding_matrix,
        rnn_type='sru',
        n_layers=args.n_layers,
        hidden_dim=args.hidden_dim,
        freeze_pretrained_embeddings=True)

    loss_fn = nn.CrossEntropyLoss()

    parameters = (p for p in model.parameters() if p.requires_grad)
    optimizer = torch.optim.Adam(parameters, lr=args.learning_rate)

    print("Beginning training.")
    for epoch in range(args.num_epochs):

        ######################
        #       TRAIN        #
        ######################

        print(f'Epoch: {epoch}')

        model.train()

        with torch.enable_grad():
            for batch in train_sampler:
                # Zero the gradients and clear the accumulated loss
                optimizer.zero_grad()

                # Move to device
                batch = tuple(t.to(device) for t in batch)
                query, query_label, support, support_label = batch

                # Compute loss
                pred = model(query, support, support_label)
                loss = loss_fn(pred, query_label)
                loss.backward()

                # Clip gradients if necessary
                if args.max_grad_norm is not None:
                    clip_grad_norm_(model.parameters(), args.max_grad_norm)

                writer.add_scalar('Training/Loss', loss.item(), global_step)

                # Optimize
                optimizer.step()
                global_step += 1

            # Zero the gradients when exiting a train step
            optimizer.zero_grad()

        #########################
        #       EVALUATE        #
        #########################

        model.eval()

        with torch.no_grad():

            # First compute prototypes over the training data
            encodings, labels = [], []
            for text, label in train_eval_sampler:
                padding_mask = (text != model.padding_idx).byte()
                text_embeddings = model.embedding_dropout(
                    model.embedding(text))
                text_encoding = model.encoder(text_embeddings,
                                              padding_mask=padding_mask)
                labels.append(label.cpu())
                encodings.append(text_encoding.cpu())
            # Compute prototypes
            encodings = torch.cat(encodings, dim=0)
            labels = torch.cat(labels, dim=0)
            prototypes = model.compute_prototypes(encodings, labels).to(device)

            _preds, _targets = [], []
            for batch in val_sampler:
                # Move to device
                source, target = tuple(t.to(device) for t in batch)

                pred = model(source, prototypes=prototypes)
                _preds.append(pred.cpu())
                _targets.append(target.cpu())

            preds = torch.cat(_preds, dim=0)
            targets = torch.cat(_targets, dim=0)

            val_loss = loss_fn(preds, targets).item()
            val_metric = (preds.argmax(dim=1) == targets).float().mean().item()

        # Update best model
        if best_metric is None or val_metric > best_metric:
            best_metric = val_metric
            best_model_state = model.state_dict()
            for k, t in best_model_state.items():
                best_model_state[k] = t.cpu().detach()
            best_model = best_model_state

        # Log metrics
        print(f'Validation loss: {val_loss}')
        print(f'Validation accuracy: {val_metric}')
        writer.add_scalar('Validation/Loss', val_loss, epoch)
        writer.add_scalar('Validation/Accuracy', val_metric, epoch)

    # Save the best model
    print("Finisehd training.")
    torch.save(best_model, os.path.join(args.output_dir, 'model.pt'))
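
For completeness, train() reads many attributes off args. A minimal argparse sketch covering every attribute referenced above (defaults are illustrative assumptions, not the project's real ones):

import argparse

def parse_args():
    p = argparse.ArgumentParser(description="Prototypical-network training")
    # Data / IO
    p.add_argument('--train_path', required=True)
    p.add_argument('--val_path', required=True)
    p.add_argument('--output_dir', default='outputs')
    p.add_argument('--file_type', choices=['csv', 'tsv'], default='csv')
    # Embeddings and text preprocessing
    p.add_argument('--embeddings', default=None)
    p.add_argument('--use_pretrained_embeddings', action='store_true')
    p.add_argument('--lowercase', action='store_true')
    # Model
    p.add_argument('--distance', default='euclidean')
    p.add_argument('--embedding_dim', type=int, default=300)
    p.add_argument('--hidden_dim', type=int, default=256)
    p.add_argument('--n_layers', type=int, default=2)
    # Episodic sampling
    p.add_argument('--n_support', type=int, default=5)
    p.add_argument('--n_query', type=int, default=5)
    p.add_argument('--n_episodes', type=int, default=100)
    p.add_argument('--n_classes', type=int, default=5)
    # Optimization
    p.add_argument('--eval_batch_size', type=int, default=128)
    p.add_argument('--num_epochs', type=int, default=10)
    p.add_argument('--learning_rate', type=float, default=1e-3)
    p.add_argument('--max_grad_norm', type=float, default=None)
    p.add_argument('--device', default=None)
    return p.parse_args()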
Example #22
def run_experiment(
        name=DEFAULT_HYPER_PARAMS['experiment_name'],
        max_steps=DEFAULT_HYPER_PARAMS['max_steps'],
        iter_per_step=DEFAULT_HYPER_PARAMS['iter_per_step'],
        embedding_dim=DEFAULT_HYPER_PARAMS['embedding_dim'],
        n_layers=DEFAULT_HYPER_PARAMS['n_layers'],
        rnn_type=DEFAULT_HYPER_PARAMS['rnn_type'],
        hidden_size=DEFAULT_HYPER_PARAMS['hidden_size'],
        rnn_dropout=DEFAULT_HYPER_PARAMS['rnn_dropout'],
        embedding_dropout=DEFAULT_HYPER_PARAMS['embedding_dropout']):
    # start off experiment progress at 0
    em.write_progress(0)

    # Dataset
    dataset = SSTDataset(transform={
        'text': TextField(),
        'label': LabelField()
    })

    # Model - takes params from front end GUI or from defaults in json
    model = RNNTextClassifier(vocab_size=dataset.text.vocab_size,
                              num_labels=dataset.label.vocab_size,
                              embedding_dim=embedding_dim,
                              n_layers=n_layers,
                              rnn_type=rnn_type,
                              hidden_size=hidden_size,
                              rnn_dropout=rnn_dropout,
                              embedding_dropout=embedding_dropout)

    # Trainer
    trainer = Trainer(
        dataset=dataset,
        model=model,
        train_sampler=BaseSampler(),
        val_sampler=BaseSampler(),
        loss_fn=torch.nn.NLLLoss(),
        metric_fn=Accuracy(),
        optimizer=torch.optim.Adam(params=model.trainable_params),
        max_steps=max_steps,  # Total number of times to evaluate the model
        iter_per_step=iter_per_step  # Number of training iterations between steps
    )

    # Run training
    current_iter_num = 0
    total_iters = max_steps * iter_per_step
    continue_ = True

    with TrialLogging(log_dir=TENSORBOARD_DIR + name,
                      verbose=False,
                      console_prefix=name,
                      capture_warnings=True):
        with tqdm(total=total_iters) as pbar:
            while continue_:
                # trainer.run() returns a boolean indicating whether to keep going
                continue_ = trainer.run()
                # Update CLI progress bar
                pbar.update(iter_per_step)  # N iterations per step

                # Update progress data in DB to reflect updates on GUI
                current_iter_num += iter_per_step
                em.write_progress(int(current_iter_num / total_iters * 100))
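
The keyword defaults above are pulled from a DEFAULT_HYPER_PARAMS mapping; its keys are fixed by the signature, while the values below are illustrative placeholders only:

# Illustrative values only; the real defaults come from the project's config.
DEFAULT_HYPER_PARAMS = {
    'experiment_name': 'sst_rnn_demo',
    'max_steps': 10,          # total evaluation steps
    'iter_per_step': 100,     # training iterations between evaluations
    'embedding_dim': 300,
    'n_layers': 2,
    'rnn_type': 'lstm',
    'hidden_size': 256,
    'rnn_dropout': 0.3,
    'embedding_dropout': 0.3,
}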
Example #23
@classmethod
def from_textfield(cls, textfield: field.TextField,
                   **kwargs) -> field.TextField:
    """Create a new field of this class, inheriting a TextField's state."""
    instance = cls(**kwargs)
    instance.load_state(textfield.get_state())
    return instance
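
A hypothetical usage sketch for this classmethod; everything below besides from_textfield itself is illustrative:

# Hypothetical usage; assumes from_textfield above is a classmethod of
# MyTextField (a TextField subclass) and that vocab is part of the state.
base = field.TextField(lower=True)
base.setup(["justo Praesent luctus"])

clone = MyTextField.from_textfield(base)
assert clone.vocab == base.vocab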