def test_build_vocab_lower():
    field = TextField(lower=True, pad_token=None, unk_token=None)
    dummy = ["justo Praesent luctus", "luctus praesent"]
    field.setup(dummy)
    vocab = {'justo': 0, 'praesent': 1, 'luctus': 2}
    assert field.vocab == vocab

def test_build_vocab_empty():
    field = TextField(pad_token=None, unk_token=None)
    assert field.vocab == dict()

    dummy = ["justo Praesent luctus", "luctus praesent"]
    field.setup(dummy)
    vocab = {'justo': 0, 'Praesent': 1, 'luctus': 2, 'praesent': 3}
    assert field.vocab == vocab

def test_build_vocab_setup_all_embeddings():
    """This test shows that all tokens in the embeddings will be included.

    In embeddings and data: blue green yellow
    In embeddings only: purple gold
    In data only: white

    Expected vocab: blue green yellow purple gold white
    """
    model = KeyedVectors(10)
    model.add('purple', np.random.rand(10))
    model.add('gold', np.random.rand(10))
    model.add('<unk>', np.random.rand(10))
    model.add('blue', np.random.rand(10))
    model.add('green', np.random.rand(10))
    model.add('<pad>', np.random.rand(10))
    model.add('yellow', np.random.rand(10))

    field = TextField(
        model=model,
        setup_all_embeddings=True,
    )

    dummy = ["blue green", "yellow", "white"]
    field.setup(dummy)

    # assert vocab setup in expected order
    # 'white' has no embedding, so it maps to the <unk> index
    assert field.vocab == odict([
        ('<pad>', 0),
        ('<unk>', 1),
        ('blue', 2),
        ('green', 3),
        ('yellow', 4),
        ('white', 1),
        ('purple', 5),
        ('gold', 6),
    ])

    # assert embedding matrix organized in expected order
    assert torch.equal(
        field.embedding_matrix,
        torch.stack([
            torch.tensor(model['<pad>']),
            torch.tensor(model['<unk>']),
            torch.tensor(model['blue']),
            torch.tensor(model['green']),
            torch.tensor(model['yellow']),
            torch.tensor(model['purple']),
            torch.tensor(model['gold']),
        ]),
    )

def test_load_embeddings():
    field = TextField(pad_token=None,
                      unk_init_all=False,
                      embeddings="tests/data/dummy_embeddings/test.txt")
    dummy = "a test !"
    field.setup([dummy])

    # Now we have embeddings to check against
    true_embeddings = torch.tensor([[0.9, 0.1, 0.2, 0.3],
                                    [0.4, 0.5, 0.6, 0.7]])
    assert len(field.embedding_matrix) == 3
    assert torch.all(torch.eq(field.embedding_matrix[1:3], true_embeddings))

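# For reference, the dummy embeddings file used above is assumed to be in
# word2vec-style text format (header line, then one token per line followed
# by its vector). The exact contents of tests/data/dummy_embeddings/test.txt
# are not shown here; the lines below are a hypothetical reconstruction
# consistent with the `true_embeddings` checked in the test:
#
#     2 4
#     a 0.9 0.1 0.2 0.3
#     test 0.4 0.5 0.6 0.7
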
def test_build_vocab_list():
    field = TextField()
    dummy = [["justo Praesent luctus", "luctus praesent"],
             ["justo Praesent luctus", "luctus praesent est"]]
    field._build_vocab(dummy)
    vocab = {
        '<pad>': 0,
        '<unk>': 1,
        'justo': 2,
        'Praesent': 3,
        'luctus': 4,
        'praesent': 5,
        'est': 6
    }
    assert field.vocab == vocab

def test_build_vocab():
    field = TextField(pad_token='<pad>', unk_token='<unk>')
    assert field.vocab == {'<pad>': 0, '<unk>': 1}

    dummy = ["justo Praesent luctus", "luctus praesent"]
    field.setup(dummy)
    vocab = {
        '<pad>': 0,
        '<unk>': 1,
        'justo': 2,
        'Praesent': 3,
        'luctus': 4,
        'praesent': 5
    }
    assert field.vocab == vocab

def test_text_process_lower():
    field = TextField(lower=True)
    field.setup()
    dummy = "justo Praesent luctus justo praesent"
    assert list(field.process(dummy)) == [1, 1, 1, 1, 1]

    field.setup([dummy])
    assert list(field.process(dummy)) == [2, 3, 4, 2, 3]

def test_dataset_transform_5():
    train = (("Lorem ipsum dolor sit amet", "POSITIVE"),
             ("Sed ut perspiciatis unde", "NEGATIVE"))
    transform = {
        "t1": {
            "field": TextField(),
            "columns": 0
        },
        "t2": {
            "field": TextField(),
            "columns": 0
        }
    }
    t = TabularDataset(train, transform=transform)
    assert t.train.cols() == 2

def test_build_vocab_nested_list_in_dict():
    field = TextField()
    dummy = [{
        'text1': ["justo Praesent luctus", "luctus praesent"],
        'text2': ["justo Praesent luctus", "luctus praesent est"]
    }]
    field._build_vocab(dummy)
    vocab = {
        '<pad>': 0,
        '<unk>': 1,
        'justo': 2,
        'Praesent': 3,
        'luctus': 4,
        'praesent': 5,
        'est': 6
    }
    assert field.vocab == vocab

def test_load_embeddings_with_extra_tokens():
    field = TextField.from_embeddings(
        embeddings="tests/data/dummy_embeddings/test.txt",
        pad_token=None,
        unk_init_all=False,
        additional_special_tokens=['<a>', '<b>', '<c>'])
    dummy = "a test ! <a> <b> "
    field.setup([dummy])

    assert '<a>' in field.vocab and '<b>' in field.vocab and '<c>' in field.vocab
    assert field.embedding_matrix[field.vocab['<a>']].size(-1) == 4
    assert field.embedding_matrix[field.vocab['<b>']].size(-1) == 4
    assert all(field.embedding_matrix[field.vocab['<b>']] !=
               field.embedding_matrix[field.vocab['<c>']])

def test_dataset_transform():
    train = (("Lorem ipsum dolor sit amet", "POSITIVE"),
             ("Sed ut perspiciatis unde", "NEGATIVE"))
    transform = {"text": TextField(), "label": LabelField()}
    t = TabularDataset(train, transform=transform)
    assert hasattr(t, "text")
    assert hasattr(t, "label")
    assert t.label.vocab_size == 2
    assert t.text.vocab_size == 11

def test_setup_with_extra_tokens():
    field = TextField.from_embeddings(
        embeddings="tests/data/dummy_embeddings/test.txt",
        pad_token=None,
        unk_init_all=False,
        additional_special_tokens=['<a>', '<b>', '<c>'])
    dummy = "this is a test"
    field.setup([dummy])
    assert recursive_tensor_to_list(field.process(dummy)) == [4, 5, 6, 7]

    dummy = "this is a test <a> <c>"
    assert recursive_tensor_to_list(field.process(dummy)) == [4, 5, 6, 7, 1, 3]

def test_text_process_list():
    field = TextField(lower=True)
    field.setup()
    dummy = [["justo Praesent luctus", "luctus praesent"],
             ["justo Praesent luctus", "luctus praesent est"]]
    assert recursive_tensor_to_list(field.process(dummy)) == \
        [[[1, 1, 1], [1, 1]], [[1, 1, 1], [1, 1, 1]]]

    field.setup(dummy)
    assert recursive_tensor_to_list(field.process(dummy)) == \
        [[[2, 3, 4], [4, 3]], [[2, 3, 4], [4, 3, 5]]]

def test_text_process_nested_list_in_dict():
    field = TextField(lower=True)
    field.setup()
    dummy = [{
        'text1': ["justo Praesent luctus", "luctus praesent"],
        'text2': ["justo Praesent luctus", "luctus praesent est"]
    }]
    assert recursive_tensor_to_list(field.process(dummy)) == [{
        'text1': [[1, 1, 1], [1, 1]],
        'text2': [[1, 1, 1], [1, 1, 1]]
    }]

    field.setup(dummy)
    assert recursive_tensor_to_list(field.process(dummy)) == [{
        'text1': [[2, 3, 4], [4, 3]],
        'text2': [[2, 3, 4], [4, 3, 5]]
    }]

def test_text_process_dict():
    field = TextField(lower=True)
    field.setup()
    dummy = {
        'text1': "justo Praesent luctus luctus praesent",
        'text2': "justo Praesent luctus luctus praesent est"
    }
    assert recursive_tensor_to_list(field.process(dummy)) == {
        'text1': [1, 1, 1, 1, 1],
        'text2': [1, 1, 1, 1, 1, 1]
    }

    field.setup([dummy])
    assert recursive_tensor_to_list(field.process(dummy)) == {
        'text1': [2, 3, 4, 4, 3],
        'text2': [2, 3, 4, 4, 3, 5]
    }

def test_load_embeddings_empty_voc():
    field = TextField.from_embeddings(
        embeddings="tests/data/dummy_embeddings/test.txt",
        pad_token=None,
        unk_init_all=True,
    )
    dummy = "justo Praesent luctus justo praesent"
    field.setup([dummy])
    # None of the data tokens have pretrained embeddings; with
    # unk_init_all=True each unique token still gets its own row
    assert len(field.embedding_matrix) == 5

    field = TextField.from_embeddings(
        embeddings="tests/data/dummy_embeddings/test.txt",
        pad_token=None,
        unk_init_all=False,
    )
    dummy = "justo Praesent luctus justo praesent"
    field.setup([dummy])
    # None of the data tokens have pretrained embeddings; with
    # unk_init_all=False they all collapse into the single <unk> row
    assert len(field.embedding_matrix) == 1

def test_build_vocab_decorators_missing_specials():
    field = TextField(pad_token=None,
                      unk_token=None,
                      sos_token='<sos>',
                      eos_token='<eos>')
    field._build_vocab()
    assert field.vocab == {'<sos>': 0, '<eos>': 1}

    dummy = ["justo Praesent luctus", "luctus praesent"]
    field._build_vocab(dummy)
    vocab = {
        '<sos>': 0,
        '<eos>': 1,
        'justo': 2,
        'Praesent': 3,
        'luctus': 4,
        'praesent': 5
    }
    assert field.vocab == vocab

def test_dataset_transform_with_mixed_cols():
    train = (("Lorem ipsum dolor sit amet", "POSITIVE"),
             ("Sed ut perspiciatis unde", "NEGATIVE"))
    transform = {
        "label": {
            "field": LabelField(),
            "columns": 1,
        },
        "text": {
            "field": TextField(),
            "columns": 'text',
        }
    }
    t = TabularDataset(train,
                       transform=transform,
                       named_columns=['text', 'label'])
    assert len(t.train) == 2
    assert len(t.train[0]) == 2

def test_build_vocab_decorators():
    field = TextField(pad_token=None,
                      unk_token=None,
                      sos_token='<sos>',
                      eos_token='<eos>')
    assert field.vocab == {'<sos>': 0, '<eos>': 1}

    dummy = ["justo Praesent luctus", "luctus praesent"]
    field.setup(dummy)
    vocab = {
        '<sos>': 0,
        '<eos>': 1,
        'justo': 2,
        'Praesent': 3,
        'luctus': 4,
        'praesent': 5
    }
    assert field.vocab == vocab

    field = TextField(pad_token='<pad>',
                      unk_token='<unk>',
                      sos_token='<sos>',
                      eos_token='<eos>')
    assert field.vocab == {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}

    dummy = ["justo Praesent luctus", "luctus praesent"]
    field.setup(dummy)
    vocab = {
        '<pad>': 0,
        '<unk>': 1,
        '<sos>': 2,
        '<eos>': 3,
        'justo': 4,
        'Praesent': 5,
        'luctus': 6,
        'praesent': 7
    }
    assert field.vocab == vocab

def test_text_process_unk():
    field = TextField(unk_token=None)
    dummy = "justo Praesent luctus justo praesent"
    with pytest.raises(Exception):
        field.process(dummy)

def train(args):
    """Run Training"""
    global_step = 0
    best_metric = None
    best_model: Dict[str, torch.Tensor] = dict()

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    writer = SummaryWriter(log_dir=args.output_dir)

    # We use flambe to do the data preprocessing
    # More info at https://flambe.ai
    print("Performing preprocessing (possibly downloading embeddings).")
    embeddings = args.embeddings if args.use_pretrained_embeddings else None
    text_field = TextField(lower=args.lowercase,
                           embeddings=embeddings,
                           embeddings_format='gensim')
    label_field = LabelField()
    transforms = {'text': text_field, 'label': label_field}
    dataset = TabularDataset.from_path(
        args.train_path,
        args.val_path,
        sep=',' if args.file_type == 'csv' else '\t',
        transform=transforms)

    # Create samplers
    train_sampler = EpisodicSampler(dataset.train,
                                    n_support=args.n_support,
                                    n_query=args.n_query,
                                    n_episodes=args.n_episodes,
                                    n_classes=args.n_classes)
    # The train_eval_sampler is used to compute prototypes over the full dataset
    train_eval_sampler = BaseSampler(dataset.train,
                                     batch_size=args.eval_batch_size)
    val_sampler = BaseSampler(dataset.val, batch_size=args.eval_batch_size)

    if args.device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    else:
        device = args.device

    # Build model, criterion and optimizer
    model = PrototypicalTextClassifier(
        vocab_size=dataset.text.vocab_size,
        distance=args.distance,
        embedding_dim=args.embedding_dim,
        pretrained_embeddings=dataset.text.embedding_matrix,
        rnn_type='sru',
        n_layers=args.n_layers,
        hidden_dim=args.hidden_dim,
        freeze_pretrained_embeddings=True)
    # Move the model to the target device before building the optimizer
    model = model.to(device)

    loss_fn = nn.CrossEntropyLoss()
    parameters = (p for p in model.parameters() if p.requires_grad)
    optimizer = torch.optim.Adam(parameters, lr=args.learning_rate)

    print("Beginning training.")
    for epoch in range(args.num_epochs):

        ######################
        #       TRAIN        #
        ######################
        print(f'Epoch: {epoch}')

        model.train()
        with torch.enable_grad():
            for batch in train_sampler:
                # Zero the gradients and clear the accumulated loss
                optimizer.zero_grad()

                # Move to device
                batch = tuple(t.to(device) for t in batch)
                query, query_label, support, support_label = batch

                # Compute loss
                pred = model(query, support, support_label)
                loss = loss_fn(pred, query_label)
                loss.backward()

                # Clip gradients if necessary
                if args.max_grad_norm is not None:
                    clip_grad_norm_(model.parameters(), args.max_grad_norm)
                writer.add_scalar('Training/Loss', loss.item(), global_step)

                # Optimize
                optimizer.step()
                global_step += 1

            # Zero the gradients when exiting a train step
            optimizer.zero_grad()

        #########################
        #       EVALUATE        #
        #########################
        model.eval()
        with torch.no_grad():
            # First compute prototypes over the training data
            encodings, labels = [], []
            for text, label in train_eval_sampler:
                text = text.to(device)
                padding_mask = (text != model.padding_idx).byte()
                text_embeddings = model.embedding_dropout(model.embedding(text))
                text_encoding = model.encoder(text_embeddings,
                                              padding_mask=padding_mask)
                labels.append(label.cpu())
                encodings.append(text_encoding.cpu())

            # Compute prototypes
            encodings = torch.cat(encodings, dim=0)
            labels = torch.cat(labels, dim=0)
            prototypes = model.compute_prototypes(encodings, labels).to(device)

            _preds, _targets = [], []
            for batch in val_sampler:
                # Move to device
                source, target = tuple(t.to(device) for t in batch)
                pred = model(source, prototypes=prototypes)
                _preds.append(pred.cpu())
                _targets.append(target.cpu())

            preds = torch.cat(_preds, dim=0)
            targets = torch.cat(_targets, dim=0)

            val_loss = loss_fn(preds, targets).item()
            # Accuracy over the full validation set, not just the last batch
            val_metric = (preds.argmax(dim=1) == targets).float().mean().item()

        # Update best model
        if best_metric is None or val_metric > best_metric:
            best_metric = val_metric
            best_model_state = model.state_dict()
            for k, t in best_model_state.items():
                best_model_state[k] = t.cpu().detach()
            best_model = best_model_state

        # Log metrics
        print(f'Validation loss: {val_loss}')
        print(f'Validation accuracy: {val_metric}')
        writer.add_scalar('Validation/Loss', val_loss, epoch)
        writer.add_scalar('Validation/Accuracy', val_metric, epoch)

    # Save the best model
    print("Finished training.")
    torch.save(best_model, os.path.join(args.output_dir, 'model.pt'))

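# A minimal sketch of the command-line entry point this script assumes.
# Every attribute accessed on `args` above appears below; the flag names,
# defaults, and types are assumptions, not part of the original script.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description="Prototypical network training")
    parser.add_argument('--train-path', dest='train_path', required=True)
    parser.add_argument('--val-path', dest='val_path', required=True)
    parser.add_argument('--output-dir', dest='output_dir', required=True)
    parser.add_argument('--file-type', dest='file_type', default='csv',
                        choices=['csv', 'tsv'])
    parser.add_argument('--lowercase', action='store_true')
    parser.add_argument('--use-pretrained-embeddings',
                        dest='use_pretrained_embeddings', action='store_true')
    parser.add_argument('--embeddings', default=None)
    parser.add_argument('--device', default=None)
    parser.add_argument('--distance', default='euclidean')
    parser.add_argument('--embedding-dim', dest='embedding_dim', type=int, default=300)
    parser.add_argument('--hidden-dim', dest='hidden_dim', type=int, default=128)
    parser.add_argument('--n-layers', dest='n_layers', type=int, default=2)
    parser.add_argument('--n-support', dest='n_support', type=int, default=5)
    parser.add_argument('--n-query', dest='n_query', type=int, default=5)
    parser.add_argument('--n-classes', dest='n_classes', type=int, default=5)
    parser.add_argument('--n-episodes', dest='n_episodes', type=int, default=100)
    parser.add_argument('--eval-batch-size', dest='eval_batch_size', type=int, default=32)
    parser.add_argument('--learning-rate', dest='learning_rate', type=float, default=1e-3)
    parser.add_argument('--num-epochs', dest='num_epochs', type=int, default=10)
    parser.add_argument('--max-grad-norm', dest='max_grad_norm', type=float, default=None)

    train(parser.parse_args())
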
def run_experiment(
        name=DEFAULT_HYPER_PARAMS['experiment_name'],
        max_steps=DEFAULT_HYPER_PARAMS['max_steps'],
        iter_per_step=DEFAULT_HYPER_PARAMS['iter_per_step'],
        embedding_dim=DEFAULT_HYPER_PARAMS['embedding_dim'],
        n_layers=DEFAULT_HYPER_PARAMS['n_layers'],
        rnn_type=DEFAULT_HYPER_PARAMS['rnn_type'],
        hidden_size=DEFAULT_HYPER_PARAMS['hidden_size'],
        rnn_dropout=DEFAULT_HYPER_PARAMS['rnn_dropout'],
        embedding_dropout=DEFAULT_HYPER_PARAMS['embedding_dropout']):
    # Start off experiment progress at 0
    em.write_progress(0)

    # Dataset
    dataset = SSTDataset(transform={
        'text': TextField(),
        'label': LabelField()
    })

    # Model - takes params from the front-end GUI or from defaults in JSON
    model = RNNTextClassifier(vocab_size=dataset.text.vocab_size,
                              num_labels=dataset.label.vocab_size,
                              embedding_dim=embedding_dim,
                              n_layers=n_layers,
                              rnn_type=rnn_type,
                              hidden_size=hidden_size,
                              rnn_dropout=rnn_dropout,
                              embedding_dropout=embedding_dropout)

    # Trainer
    trainer = Trainer(
        dataset=dataset,
        model=model,
        train_sampler=BaseSampler(),
        val_sampler=BaseSampler(),
        loss_fn=torch.nn.NLLLoss(),
        metric_fn=Accuracy(),
        optimizer=torch.optim.Adam(params=model.trainable_params),
        max_steps=max_steps,          # Total number of times to evaluate the model
        iter_per_step=iter_per_step)  # Number of training iterations between steps

    # Run training
    current_iter_num = 0
    total_iters = max_steps * iter_per_step
    continue_ = True

    with TrialLogging(log_dir=TENSORBOARD_DIR + name,
                      verbose=False,
                      console_prefix=name,
                      capture_warnings=True):
        with tqdm(total=total_iters) as pbar:
            while continue_:
                # Returns a boolean indicating whether to keep going
                continue_ = trainer.run()

                # Update CLI progress bar
                pbar.update(iter_per_step)  # N iterations per step

                # Update progress data in DB to reflect updates on GUI
                current_iter_num += iter_per_step
                em.write_progress(int(current_iter_num / total_iters * 100))

@classmethod
def from_textfield(cls, textfield: field.TextField, **kwargs) -> field.TextField:
    # Build a new instance from the given kwargs, then copy over the
    # fitted state of an existing TextField
    instance = cls(**kwargs)
    instance.load_state(textfield.get_state())
    return instance
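
# A minimal usage sketch for `from_textfield`, assuming it is defined as a
# classmethod on a TextField subclass. `ExtendedTextField` and the sample
# data below are hypothetical stand-ins, not part of the original code.

class ExtendedTextField(field.TextField):
    """Hypothetical subclass that inherits `from_textfield`."""


pretrained = field.TextField(lower=True)
pretrained.setup(["some sample text", "more sample text"])

# Build a fresh instance with the given kwargs, then copy the fitted state
# (e.g. the built vocabulary) from the existing field.
derived = ExtendedTextField.from_textfield(pretrained, lower=True)
assert derived.vocab == pretrained.vocab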