def test_pass_labels_with_unkown_1():
    """Setup must fail when the data contains a label absent from the init list."""
    samples = ['LABEL1', 'LABEL3', 'LABEL2', 'LABEL2']
    field = LabelField(labels=['LABEL1', 'LABEL2'])
    # 'LABEL3' was never declared, so setup should reject the data.
    with pytest.raises(ValueError):
        field.setup(samples)
def test_label_frequencies_3():
    """An empty dataset yields an empty vocab and empty frequency statistics."""
    field = LabelField()
    field.setup([])
    assert len(field.vocab) == 0
    assert len(field.label_freq) == 0
    assert len(field.label_count) == 0
    assert len(field.label_inv_freq) == 0
def test_label_process_multilabel():
    """Test label numericalization with a multilabel separator."""
    samples = ['LABEL1,LABEL2', 'LABEL3', 'LABEL2,LABEL1', 'LABEL2']

    # Without a separator, every distinct string is its own label.
    plain = LabelField()
    plain.setup(samples)
    assert len(plain.vocab) == 4

    # With a separator, strings are split into individual labels.
    multi = LabelField(multilabel_sep=',')
    multi.setup(samples)
    assert len(multi.vocab) == 3
    assert list(multi.process('LABEL1,LABEL2')) == [0, 1]
    assert list(multi.process('LABEL2,LABEL1')) == [1, 0]
    assert int(multi.process('LABEL2')) == 1
    assert int(multi.process('LABEL3')) == 2
def test_dataset_transform_7():
    """A transform entry may feed several input columns into a single field."""
    train = (
        ("Lorem ipsum dolor sit amet", "POSITIVE"),
        ("Sed ut perspiciatis unde", "NEGATIVE"),
    )

    class DummyField(Field):
        # No state needs to be built during setup.
        def setup(self, *data: np.ndarray) -> None:
            pass

        # Collapse the two input columns into one scalar output.
        def process(self, ex1, ex2):
            return torch.tensor(0)

    transform = {
        "text": {"field": DummyField(), "columns": [0, 1]},
        "other": {"field": DummyField(), "columns": [0, 1]},
        "other2": {"field": LabelField(), "columns": 0},
    }
    t = TabularDataset(train, transform=transform)
    # Three transform entries -> three output columns.
    assert t.train.cols() == 3
def test_label_frequencies_2():
    """A single label covering the whole dataset has frequency 1 and inverse frequency 1."""
    samples = ['LABEL1'] * 80
    field = LabelField()
    field.setup(samples)

    assert len(field.vocab) == 1
    assert len(field.label_freq) == 1
    assert isclose(field.label_freq[0].item(), 1, rtol=NUMERIC_PRECISION)
    assert len(field.label_count) == 1
    assert isclose(field.label_count[0].item(), 80, rtol=NUMERIC_PRECISION)
    assert len(field.label_inv_freq) == 1
    assert isclose(field.label_inv_freq[0].item(), 1, rtol=NUMERIC_PRECISION)
def test_dataset_transform_8():
    """A LabelField cannot consume multiple columns at once."""
    rows = (
        ("Lorem ipsum dolor sit amet", "POSITIVE"),
        ("Sed ut perspiciatis unde", "NEGATIVE"),
    )
    transform = {"tx": {"field": LabelField(), "columns": [0, 1]}}
    with pytest.raises(TypeError):
        dataset = TabularDataset(rows, transform=transform)
        dataset.train.cols()
def test_dataset_transform_with_invalid_named_cols():
    """Referencing a column name not in named_columns raises ValueError."""
    rows = (
        ("Lorem ipsum dolor sit amet", "POSITIVE"),
        ("Sed ut perspiciatis unde", "NEGATIVE"),
    )
    transform = {"tx": {"field": LabelField(), "columns": 'none_existent'}}
    with pytest.raises(ValueError):
        TabularDataset(rows, transform=transform,
                       named_columns=['text', 'label'])
def test_dataset_transform_with_named_cols():
    """Columns may be selected by name when named_columns is provided."""
    rows = (
        ("Lorem ipsum dolor sit amet", "POSITIVE"),
        ("Sed ut perspiciatis unde", "NEGATIVE"),
    )
    transform = {"tx": {"field": LabelField(), "columns": 'label'}}
    dataset = TabularDataset(rows, transform=transform,
                             named_columns=['text', 'label'])
    # Only the single transformed column remains in each example.
    assert len(dataset.train[0]) == 1
def test_label_process_multilabel_one_hot_frequencies():
    """Frequency statistics in one-hot multilabel mode."""
    samples = ['LABEL1,LABEL2', 'LABEL3', 'LABEL2,LABEL1', 'LABEL2']
    field = LabelField(multilabel_sep=',', one_hot=True)
    field.setup(samples)

    # Expected per-label statistics, indexed by label id.
    expected = {
        'label_freq': [0.333, 0.5, 0.166],
        'label_count': [2, 3, 1],
        'label_inv_freq': [3, 2, 6],
    }
    for attr, values in expected.items():
        stats = getattr(field, attr)
        assert len(stats) == 3
        for idx, value in enumerate(values):
            assert isclose(stats[idx].item(), value, rtol=NUMERIC_PRECISION)
def test_label_frequencies():
    """Frequencies, counts and inverse frequencies for an 80/20 label split."""
    samples = ['LABEL1'] * 80 + ['LABEL2'] * 20
    field = LabelField()
    field.setup(samples)

    assert len(field.vocab) == 2
    assert len(field.label_freq) == 2
    assert isclose(field.label_freq[0].item(), 0.8, rtol=NUMERIC_PRECISION)
    assert isclose(field.label_freq[1].item(), 0.2, rtol=NUMERIC_PRECISION)
    assert len(field.label_count) == 2
    assert isclose(field.label_count[0].item(), 80, rtol=NUMERIC_PRECISION)
    assert isclose(field.label_count[1].item(), 20, rtol=NUMERIC_PRECISION)
    assert len(field.label_inv_freq) == 2
    assert isclose(field.label_inv_freq[0].item(), 1.25, rtol=NUMERIC_PRECISION)
    assert isclose(field.label_inv_freq[1].item(), 5, rtol=NUMERIC_PRECISION)
def test_dataset_transform():
    """Fields passed directly as transform values become dataset attributes."""
    rows = (
        ("Lorem ipsum dolor sit amet", "POSITIVE"),
        ("Sed ut perspiciatis unde", "NEGATIVE"),
    )
    dataset = TabularDataset(
        rows, transform={"text": TextField(), "label": LabelField()})
    assert hasattr(dataset, "text")
    assert hasattr(dataset, "label")
    assert dataset.label.vocab_size == 2
    assert dataset.text.vocab_size == 11
def test_dataset_transform_3():
    """A dict-style transform entry without a 'field' key is rejected."""
    rows = (
        ("Lorem ipsum dolor sit amet", "POSITIVE"),
        ("Sed ut perspiciatis unde", "NEGATIVE"),
    )
    transform = {
        "text": {"columns": 0},  # missing the required 'field' key
        "label": {"field": LabelField(), "columns": 1},
    }
    with pytest.raises(ValueError):
        TabularDataset(rows, transform=transform)
def test_label_process_one_hot():
    """Labels are numericalized as one-hot vectors, indexed in first-seen order."""
    samples = ['LABEL1', 'LABEL3', 'LABEL2', 'LABEL2']
    field = LabelField(one_hot=True)
    field.setup(samples)
    assert len(field.vocab) == 3
    # Ids follow first appearance: LABEL1 -> 0, LABEL3 -> 1, LABEL2 -> 2.
    assert list(field.process('LABEL1')) == [1, 0, 0]
    assert list(field.process('LABEL2')) == [0, 0, 1]
    assert list(field.process('LABEL3')) == [0, 1, 0]
def test_label_process():
    """Test label numericalization."""
    samples = ['LABEL1', 'LABEL3', 'LABEL2', 'LABEL2']
    field = LabelField()
    field.setup(samples)
    assert len(field.vocab) == 3
    # Ids follow first appearance: LABEL1 -> 0, LABEL3 -> 1, LABEL2 -> 2.
    assert int(field.process('LABEL1')) == 0
    assert int(field.process('LABEL2')) == 2
    assert int(field.process('LABEL3')) == 1
def test_pass_bool_labels():
    """Boolean labels declared in the init keep the declared ordering."""
    samples = [True, False, True, True]

    field = LabelField(labels=[False, True])
    field.setup(samples)
    assert len(field.vocab) == 2
    assert int(field.process(False)) == 0
    assert int(field.process(True)) == 1

    # Reversing the declared order reverses the assigned indices.
    field = LabelField(labels=[True, False])
    field.setup(samples)
    assert len(field.vocab) == 2
    assert int(field.process(False)) == 1
    assert int(field.process(True)) == 0
def run_experiment(
        name=DEFAULT_HYPER_PARAMS['experiment_name'],
        max_steps=DEFAULT_HYPER_PARAMS['max_steps'],
        iter_per_step=DEFAULT_HYPER_PARAMS['iter_per_step'],
        embedding_dim=DEFAULT_HYPER_PARAMS['embedding_dim'],
        n_layers=DEFAULT_HYPER_PARAMS['n_layers'],
        rnn_type=DEFAULT_HYPER_PARAMS['rnn_type'],
        hidden_size=DEFAULT_HYPER_PARAMS['hidden_size'],
        rnn_dropout=DEFAULT_HYPER_PARAMS['rnn_dropout'],
        embedding_dropout=DEFAULT_HYPER_PARAMS['embedding_dropout']):
    """Train an RNN text classifier on SST, reporting progress as it runs.

    Hyper-parameters default to the values in DEFAULT_HYPER_PARAMS and may
    be overridden by the caller (e.g. the front-end GUI).
    """
    # Reset the experiment progress indicator before training begins.
    em.write_progress(0)

    # Dataset: SST with text/label preprocessing fields.
    dataset = SSTDataset(transform={
        'text': TextField(),
        'label': LabelField()
    })

    # Model: sized from the dataset vocabularies plus the given hyper-params.
    model = RNNTextClassifier(
        vocab_size=dataset.text.vocab_size,
        num_labels=dataset.label.vocab_size,
        embedding_dim=embedding_dim,
        n_layers=n_layers,
        rnn_type=rnn_type,
        hidden_size=hidden_size,
        rnn_dropout=rnn_dropout,
        embedding_dropout=embedding_dropout)

    # Trainer: max_steps evaluation steps, iter_per_step iterations each.
    trainer = Trainer(
        dataset=dataset,
        model=model,
        train_sampler=BaseSampler(),
        val_sampler=BaseSampler(),
        loss_fn=torch.nn.NLLLoss(),
        metric_fn=Accuracy(),
        optimizer=torch.optim.Adam(params=model.trainable_params),
        max_steps=max_steps,
        iter_per_step=iter_per_step)

    total_iters = max_steps * iter_per_step
    iters_done = 0
    with TrialLogging(log_dir=TENSORBOARD_DIR + name,
                      verbose=False,
                      console_prefix=name,
                      capture_warnings=True):
        with tqdm(total=total_iters) as progress:
            while True:
                # trainer.run() returns False once training should stop.
                keep_going = trainer.run()
                # Mirror progress on the CLI bar and in the DB for the GUI.
                progress.update(iter_per_step)
                iters_done += iter_per_step
                em.write_progress(int(iters_done / total_iters * 100))
                if not keep_going:
                    break
def train(args):
    """Run training.

    Trains a prototypical text classifier episodically, evaluates on the
    validation set after every epoch, and saves the weights of the best
    model (highest validation accuracy) to ``args.output_dir/model.pt``.

    Args:
        args: Parsed command-line arguments carrying data paths,
            hyper-parameters, and the target device.
    """
    global_step = 0
    best_metric = None
    best_model: Dict[str, torch.Tensor] = dict()

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    writer = SummaryWriter(log_dir=args.output_dir)

    # We use flambe to do the data preprocessing
    # More info at https://flambe.ai
    print("Performing preprocessing (possibly download embeddings).")
    embeddings = args.embeddings if args.use_pretrained_embeddings else None
    text_field = TextField(lower=args.lowercase,
                           embeddings=embeddings,
                           embeddings_format='gensim')
    label_field = LabelField()
    transforms = {'text': text_field, 'label': label_field}
    dataset = TabularDataset.from_path(
        args.train_path,
        args.val_path,
        sep=',' if args.file_type == 'csv' else '\t',
        transform=transforms)

    # Create samplers
    train_sampler = EpisodicSampler(dataset.train,
                                    n_support=args.n_support,
                                    n_query=args.n_query,
                                    n_episodes=args.n_episodes,
                                    n_classes=args.n_classes)
    # The train_eval_sampler is used to compute prototypes over the full dataset
    train_eval_sampler = BaseSampler(dataset.train,
                                     batch_size=args.eval_batch_size)
    val_sampler = BaseSampler(dataset.val, batch_size=args.eval_batch_size)

    if args.device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    else:
        device = args.device

    # Build model, criterion and optimizers
    model = PrototypicalTextClassifier(
        vocab_size=dataset.text.vocab_size,
        distance=args.distance,
        embedding_dim=args.embedding_dim,
        pretrained_embeddings=dataset.text.embedding_matrix,
        rnn_type='sru',
        n_layers=args.n_layers,
        hidden_dim=args.hidden_dim,
        freeze_pretrained_embeddings=True)

    loss_fn = nn.CrossEntropyLoss()
    parameters = (p for p in model.parameters() if p.requires_grad)
    optimizer = torch.optim.Adam(parameters, lr=args.learning_rate)

    print("Beginning training.")
    for epoch in range(args.num_epochs):
        ######################
        #       TRAIN        #
        ######################
        print(f'Epoch: {epoch}')
        model.train()
        with torch.enable_grad():
            for batch in train_sampler:
                # Zero the gradients and clear the accumulated loss
                optimizer.zero_grad()

                # Move to device
                batch = tuple(t.to(device) for t in batch)
                query, query_label, support, support_label = batch

                # Compute loss
                pred = model(query, support, support_label)
                loss = loss_fn(pred, query_label)
                loss.backward()

                # Clip gradients if necessary
                if args.max_grad_norm is not None:
                    clip_grad_norm_(model.parameters(), args.max_grad_norm)
                writer.add_scalar('Training/Loss', loss.item(), global_step)

                # Optimize
                optimizer.step()
                global_step += 1

            # Zero the gradients when exiting a train step
            optimizer.zero_grad()

        #########################
        #       EVALUATE        #
        #########################
        model.eval()
        with torch.no_grad():
            # First compute prototypes over the training data
            encodings, labels = [], []
            for text, label in train_eval_sampler:
                padding_mask = (text != model.padding_idx).byte()
                text_embeddings = model.embedding_dropout(
                    model.embedding(text))
                text_encoding = model.encoder(text_embeddings,
                                              padding_mask=padding_mask)
                labels.append(label.cpu())
                encodings.append(text_encoding.cpu())

            # Compute prototypes
            encodings = torch.cat(encodings, dim=0)
            labels = torch.cat(labels, dim=0)
            prototypes = model.compute_prototypes(encodings,
                                                  labels).to(device)

            _preds, _targets = [], []
            for batch in val_sampler:
                # Move to device
                source, target = tuple(t.to(device) for t in batch)
                pred = model(source, prototypes=prototypes)
                _preds.append(pred.cpu())
                _targets.append(target.cpu())

            preds = torch.cat(_preds, dim=0)
            targets = torch.cat(_targets, dim=0)
            val_loss = loss_fn(preds, targets).item()
            # BUGFIX: accuracy was previously computed from `pred`/`target`,
            # i.e. only the last validation batch; use the full concatenated
            # predictions and targets so best-model selection is correct.
            val_metric = (preds.argmax(dim=1) == targets).float().mean().item()

            # Update best model (keep a detached CPU copy of the state dict)
            if best_metric is None or val_metric > best_metric:
                best_metric = val_metric
                best_model_state = model.state_dict()
                for k, t in best_model_state.items():
                    best_model_state[k] = t.cpu().detach()
                best_model = best_model_state

            # Log metrics
            print(f'Validation loss: {val_loss}')
            print(f'Validation accuracy: {val_metric}')
            writer.add_scalar('Validation/Loss', val_loss, epoch)
            writer.add_scalar('Validation/Accuracy', val_metric, epoch)

    # Save the best model
    print("Finished training.")
    torch.save(best_model, os.path.join(args.output_dir, 'model.pt'))
def test_pass_labels(): """Test labels specified in the init""" dummy = ['LABEL1', 'LABEL3', 'LABEL2', 'LABEL2'] field = LabelField(labels=['LABEL1', 'LABEL2', 'LABEL3']) field.setup(dummy) assert len(field.vocab) == 3 assert int(field.process('LABEL1')) == 0 assert int(field.process('LABEL2')) == 1 assert int(field.process('LABEL3')) == 2 field = LabelField(labels=['LABEL3', 'LABEL1', 'LABEL2']) field.setup(dummy) assert len(field.vocab) == 3 assert int(field.process('LABEL1')) == 1 assert int(field.process('LABEL2')) == 2 assert int(field.process('LABEL3')) == 0