Example #1
    def test_caching_with_lazy_reader_in_multi_process_loader(self):
        data_file = (AllenNlpTestCase.FIXTURES_ROOT / "data" /
                     "text_classification_json" / "imdb_corpus.jsonl")
        reader = TextClassificationJsonReader(
            lazy=True, cache_directory=self.cache_directory)
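        # Exhaust the loader; deque with maxlen=0 consumes and discards every batch.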
        deque(DataLoader(reader.read(data_file),
                         collate_fn=lambda b: b[0],
                         num_workers=2),
              maxlen=0)

        # We shouldn't write to the cache when the data is being loaded from multiple
        # processes.
        cache_file = reader._get_cache_location_for_file_path(str(data_file))
        assert not os.path.exists(cache_file)

        # But try again from the main process and we should see the cache file.
        instances = list(reader.read(data_file))
        assert instances
        assert os.path.exists(cache_file)

        # Reading again from a multi-process loader should read from the cache.
        new_instances = list(
            DataLoader(reader.read(data_file),
                       collate_fn=lambda b: b[0],
                       num_workers=2))
        assert len(instances) == len(new_instances)
Example #2
    def test_batch_count(self):
        dataset = AllennlpDataset(self.instances, vocab=self.vocab)
        sampler = BucketBatchSampler(dataset, batch_size=2, padding_noise=0, sorting_keys=["text"])
        # We use a custom collate_fn for testing, which doesn't actually create tensors,
        # just the allennlp Batches.
        dataloader = DataLoader(dataset, batch_sampler=sampler, collate_fn=lambda x: Batch(x))

        assert len(dataloader) == 3
Example #3
def test_multi_processing_with_lazy_dataset_warns():
    def fake_instance_generator(file_name: str) -> Iterable[Instance]:
        yield from []

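    # A lazy dataset with num_workers > 0 can deadlock, so the DataLoader should warn.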
    with pytest.warns(UserWarning, match=r".*deadlocks.*"):
        DataLoader(AllennlpLazyDataset(fake_instance_generator,
                                       "nonexistent_file"),
                   num_workers=1)
Example #4
    def test_batch_of_entirely_empty_lists_works(self):
        dataset = AllennlpDataset([self.empty_instance, self.empty_instance],
                                  self.vocab)

        model = DummyModel(self.vocab)
        model.eval()
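        # Even instances whose list fields are all empty should collate and run through forward().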
        loader = DataLoader(dataset, batch_size=2)
        batch = next(iter(loader))
        model.forward(**batch)
Example #5
    def test_max_instances_with_multi_process_loader(self, num_workers):
        data_file = (AllenNlpTestCase.FIXTURES_ROOT / "data" /
                     "text_classification_json" / "imdb_corpus.jsonl")
        reader = TextClassificationJsonReader(max_instances=2, lazy=True)
        instances = list(
            DataLoader(reader.read(data_file),
                       collate_fn=lambda b: b[0],
                       num_workers=num_workers))
        assert len(instances) == 2
Example #6
    def test_elmo_bilm(self):
        # get the raw data
        sentences, expected_lm_embeddings = self._load_sentences_embeddings()

        # load the test model
        elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)

        # Deal with the data.
        indexer = ELMoTokenCharactersIndexer()

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for batch in zip(*sentences):
            for sentence in batch:
                tokens = [Token(token) for token in sentence.split()]
                field = TextField(tokens, {"character_ids": indexer})
                instance = Instance({"elmo": field})
                instances.append(instance)

        vocab = Vocabulary()
        dataset = AllennlpDataset(instances, vocab)
        # Now finally we can iterate through batches.
        loader = DataLoader(dataset, batch_size=3)
        for i, batch in enumerate(loader):
            lm_embeddings = elmo_bilm(
                batch["elmo"]["character_ids"]["elmo_tokens"])
            top_layer_embeddings, mask = remove_sentence_boundaries(
                lm_embeddings["activations"][2], lm_embeddings["mask"])

            # check the mask lengths
            lengths = mask.data.numpy().sum(axis=1)
            batch_sentences = [sentences[k][i] for k in range(3)]
            expected_lengths = [
                len(sentence.split()) for sentence in batch_sentences
            ]
            self.assertEqual(lengths.tolist(), expected_lengths)

            # get the expected embeddings and compare!
            expected_top_layer = [
                expected_lm_embeddings[k][i] for k in range(3)
            ]
            for k in range(3):
                self.assertTrue(
                    numpy.allclose(
                        top_layer_embeddings[k, :lengths[k], :].data.numpy(),
                        expected_top_layer[k],
                        atol=1.0e-6,
                    ))
Example #7
    def test_drop_last_works(self):
        dataset = AllennlpDataset(self.instances, vocab=self.vocab)
        sampler = BucketBatchSampler(
            dataset, batch_size=2, padding_noise=0, sorting_keys=["text"], drop_last=True,
        )
        # We use a custom collate_fn for testing, which doesn't actually create tensors,
        # just the allennlp Batches.
        dataloader = DataLoader(dataset, batch_sampler=sampler, collate_fn=lambda x: Batch(x))
        batches = [batch for batch in iter(dataloader)]
        stats = self.get_batches_stats(batches)

        # all batches have length batch_size
        assert all(batch_len == 2 for batch_len in stats["batch_lengths"])

        # we should have lost one instance by skipping the last batch
        assert stats["total_instances"] == len(self.instances) - 1
Example #8
def get_accuracy(model, dev_dataset, vocab, trigger_token_ids=None, snli=False):
    """
    When trigger_token_ids is None, gets accuracy on the dev_dataset. Otherwise, gets accuracy with
    triggers prepended for the whole dev_dataset.
    """
    model.get_metrics(reset=True)
    model.eval()  # model should be in eval() already, but just in case
    data_loader = DataLoader(dev_dataset,
                             batch_sampler=BucketBatchSampler(dev_dataset, batch_size=128))
    if trigger_token_ids is None:
        for batch in data_loader:
            evaluate_batch(model, batch, trigger_token_ids, snli)
        print("Without Triggers: " + str(model.get_metrics()['accuracy']))
    else:
        print_string = ""
        for idx in trigger_token_ids:
            print_string = print_string + vocab.get_token_from_index(idx) + ', '

        for batch in data_loader:
            evaluate_batch(model, batch, trigger_token_ids, snli)
        print("Current Triggers: " + print_string + " : " + str(model.get_metrics()['accuracy']))
Example #9
def test_loader_uses_all_instances_when_batches_per_epochs_set(lazy):
    NUM_INSTANCES = 20
    BATCH_SIZE = 2
    BATCHES_PER_EPOCH = 3
    EPOCHS = 4

    class FakeDatasetReader(DatasetReader):
        def _read(self, filename: str) -> Iterable[Instance]:
            for i in range(NUM_INSTANCES):
                yield Instance({"index": LabelField(i, skip_indexing=True)})

    reader = FakeDatasetReader(lazy=lazy)
    dataset = reader.read("blah")

    loader = DataLoader(dataset,
                        batch_size=BATCH_SIZE,
                        batches_per_epoch=BATCHES_PER_EPOCH)
    epoch_batches = []
    for epoch in range(EPOCHS):
        batches = []
        for batch in loader:
            instances = []
            for index in batch["index"]:
                instances.append(index)
            batches.append(instances)
        epoch_batches.append(batches)

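    # 20 instances / batch_size 2 = 10 batches; with 3 batches per epoch the loader
    # carries on where it left off and wraps around once the data is exhausted.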
    assert epoch_batches == [
        # Epoch 0.
        [[0, 1], [2, 3], [4, 5]],
        # Epoch 1.
        [[6, 7], [8, 9], [10, 11]],
        # Epoch 2.
        [[12, 13], [14, 15], [16, 17]],
        # Epoch 3.
        [[18, 19], [0, 1], [2, 3]],
    ]
Example #10
def fine_tune_model(model: Model,
                    params: Params,
                    serialization_dir: str,
                    extend_vocab: bool = False,
                    file_friendly_logging: bool = False,
                    batch_weight_key: str = "",
                    embedding_sources_mapping: Dict[str, str] = None,
                    in_fold: int = None,
                    num_folds: int = None,
                    ewc_weight: float = None) -> Model:
    """
    Fine-tunes the given model, using a set of parameters that is largely identical to those used
    for :func:`~allennlp.commands.train.train_model`, except that the ``model`` section is ignored
    if it is present (as we are already given a ``Model`` here).

    The main difference between the logic done here and the logic done in ``train_model`` is that
    here we do not worry about vocabulary construction or creating the model object.  Everything
    else is the same.

    Parameters
    ----------
    model : ``Model``
        A model to fine tune.
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment
    serialization_dir : ``str``
        The directory in which to save results and logs.
    extend_vocab : ``bool``, optional (default=False)
        If ``True``, we use the new instances to extend your vocabulary.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    batch_weight_key : ``str``, optional (default="")
        If non-empty, name of metric used to weight the loss on a per-batch basis.
    embedding_sources_mapping : ``Dict[str, str]``, optional (default=None)
        Mapping from model paths to the pretrained embedding filepaths
        used during fine-tuning.
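    in_fold : ``int``, optional (default=None)
        If given along with ``num_folds``, only this fold is trained.
    num_folds : ``int``, optional (default=None)
        If given, the training data is split into this many folds, each trained in turn.
    ewc_weight : ``float``, optional (default=None)
        If given, an elastic-weight-consolidation (EWC) penalty scaled by this weight
        is added to the training loss.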
    """
    prepare_environment(params)
    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        raise ConfigurationError(f"Serialization directory ({serialization_dir}) "
                                 f"already exists and is not empty.")

    os.makedirs(serialization_dir, exist_ok=True)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    if params.pop('model', None):
        logger.warning("You passed parameters for the model in your configuration file, but we "
                       "are ignoring them, using instead the model parameters in the archive.")

    vocabulary_params = params.pop('vocabulary', {})
    if vocabulary_params.get('directory_path', None):
        logger.warning("You passed `directory_path` in parameters for the vocabulary in "
                       "your configuration file, but it will be ignored. ")

    all_datasets = datasets_from_params(params)
    vocab = model.vocab

    if extend_vocab:
        datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

        for dataset in datasets_for_vocab_creation:
            if dataset not in all_datasets:
                raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

        logger.info("Extending model vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
        vocab.extend_from_instances(vocabulary_params,
                                    (instance for key, dataset in all_datasets.items()
                                     for instance in dataset
                                     if key in datasets_for_vocab_creation))

        model.extend_embedder_vocab(embedding_sources_mapping)

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    dl_params = params.pop("data_loader")
    if test_data is not None:
        rand = random.Random(1234)
        test_data.index_with(vocab)
        shuffled_test = copy(test_data.instances)
        rand.shuffle(shuffled_test)
        extra_test = shuffled_test[:2000]
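        # Keep a fixed 2000-instance sample of the test set; it feeds the EWC penalty below.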

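        # "test_data_loader" is popped twice; after the first pop, the second call
        # falls back to the shared data_loader params in `keys`.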
        keys = deepcopy(dl_params.as_dict())
        keys.update({"dataset": AllennlpDataset(extra_test, vocab)})
        extra_test_loader = DataLoader.from_params(params.pop("test_data_loader", keys))

        keys = deepcopy(dl_params.as_dict())
        keys.update({"dataset": test_data})
        test_loader = DataLoader.from_params(params.pop("test_data_loader", keys))

    master_model = model
    global_metrics = {}
    training_metrics = []
    final_metrics = {}
    master_trainer = trainer_params.as_dict()

    if num_folds is not None:

        rand = random.Random(1234)

        fold_train = []
        fold_test = []

        fold_train_loader = []
        fold_test_loader = []

        shuffled_instances = copy(train_data.instances)
        rand.shuffle(shuffled_instances)

        kfold = KFold(n_splits=num_folds, random_state=None, shuffle=False)
        computed_folds = list(kfold.split(shuffled_instances))

        for fold in range(num_folds):
            train_indexes, test_indexes = computed_folds[fold]
            new_train = [shuffled_instances[i] for i in train_indexes]
            new_test = [shuffled_instances[i] for i in test_indexes]
            fold_train.append(AllennlpDataset(new_train, vocab=vocab))
            fold_test.append(AllennlpDataset(new_test, vocab=vocab))

            keys = deepcopy(dl_params.as_dict())
            keys.update({"dataset": fold_test[-1]})
            fold_test_loader.append(DataLoader.from_params(params.pop("fold_test_data_loader", keys)))

            keys = deepcopy(dl_params.as_dict())
            keys.update({"dataset": fold_train[-1]})
            fold_train_loader.append(DataLoader.from_params(params.pop("fold_train_data_loader", keys)))

        for fold in ([in_fold] if in_fold is not None else range(num_folds)):
            fold_model = deepcopy(master_model)
            eval_epoch_callback = EvalEpochCallback(fold, fold_test_loader[fold], test_loader, global_metrics)
            callbacks = [eval_epoch_callback]
            if ewc_weight is not None:
                ewc = EWC(extra_test_loader)

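                # Wrap forward() so the EWC penalty is added to the loss during training.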
                def ewc_forward(*args, **kwargs) -> Dict[str, torch.Tensor]:
                    ewc_loss = 0
                    if ewc.model.training:
                        ewc_loss = ewc.penalty(ewc.model)
                    ret = ewc.model.old_forward(*args, **kwargs)
                    ret["loss"] += ewc_weight * ewc_loss
                    return ret

                fold_model.old_forward = fold_model.forward
                fold_model.forward = ewc_forward
                callbacks.append(CallLossCallback(ewc))

            trainer = Trainer.from_params(model=fold_model,
                                          serialization_dir=serialization_dir,
                                          data_loader=fold_train_loader[fold],
                                          train_data=train_data,
                                          validation_data=None,
                                          params=Params(deepcopy(master_trainer)),
                                          validation_data_loader=None,
                                          epoch_callbacks=callbacks)

            training_metrics.append(trainer.train())
            del fold_model
            del trainer
            del eval_epoch_callback

            state = glob(serialization_dir + "/*.th")
            for file in state:
                logger.info("deleting state - %s", file)
                os.unlink(file)
    else:
        callbacks = []
        if ewc_weight is not None:
            ewc = EWC(extra_test_loader)

            def ewc_forward(*args, **kwargs) -> Dict[str, torch.Tensor]:
                ewc_loss = 0
                if ewc.model.training:
                    ewc_loss = ewc.penalty(ewc.model)
                ret = ewc.model.old_forward(*args, **kwargs)
                ret["loss"] += ewc_weight * ewc_loss
                return ret

            model.old_forward = model.forward
            model.forward = ewc_forward
            callbacks.append(CallLossCallback(ewc))

        keys = deepcopy(dl_params.as_dict())
        keys.update({"dataset": train_data})
        train_data.index_with(vocab)
        train_data_loader = DataLoader.from_params(params.pop("train_loader", keys))

        if validation_data is not None:
            validation_data.index_with(vocab)
            keys = deepcopy(dl_params.as_dict())
            keys.update({"dataset": validation_data})

            validation_data_loader = DataLoader.from_params(params.pop("validation_loader", keys))
        else:
            validation_data_loader = None

        if "finetune" in dir(model):
            model.finetune()
            logger.info("Fine tuning model")
        trainer = Trainer.from_params(model=model,
                                      serialization_dir=serialization_dir,
                                      data_loader=train_data_loader,
                                      train_data=train_data,
                                      validation_data=None,
                                      params=Params(deepcopy(master_trainer)),
                                      validation_data_loader=validation_data_loader,
                                      epoch_callbacks=callbacks)

        training_metrics = trainer.train()
        archive_model(serialization_dir)

    final_metrics["fine_tune"] = global_metrics
    final_metrics["training"] = training_metrics

    metrics_json = json.dumps(final_metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)
    return model
Example #11
def run(all_code_types,
        d_embedding,
        embedding_dropout_p,
        min_count,
        batch_size,
        verbose,
        epochs,
        lr,
        wd,
        logsig,
        sig_depth,
        run_name,
        patience,
        add_time,
        leadlag,
        t_scale,
        t_max,
        use_timestamps,
        feedforward_num_layers,
        feedforward_hidden_dims,
        feedforward_activations,
        feedforward_dropout,
        training_proportion=1,
        testing_subsample_size=None,
        split_paths=False,
        tensorboard_log=False,
        evaluate_on_test=True):
    """Run the experiment for either cross validation or testing"""

    dataset, dataset_test, vocab = generate_ml_data(
        all_code_types,
        min_count,
        batch_size,
        verbose=verbose,
        allen_mode=True,
        dataset_path=None,
        training_proportion=training_proportion,
        testing_subsample_size=testing_subsample_size,
        split_paths=split_paths)

    logger.info("Using k-fold cross validation")
    # Allen kfold
    metrics_by_fold = []
    cross_validator = StratifiedKFold(n_splits=K_FOLDS, shuffle=True)

    n_splits = cross_validator.get_n_splits(dataset)

    # StratifiedKFold stratifies on class labels; this assumes each instance
    # carries a "label" field (hypothetical accessor for this dataset).
    labels = [instance["label"].label for instance in dataset]
    for fold_index, (train_indices, validation_indices) in enumerate(
            cross_validator.split(dataset, labels)):
        logger.info(f"Fold {fold_index}/{n_splits - 1}")
        train_dataset = Subset(
            dataset,
            train_indices,
        )
        validation_dataset = Subset(dataset, validation_indices)
        train_loader = DataLoader(dataset=train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True)
        validation_loader = DataLoader(dataset=validation_dataset,
                                       batch_size=batch_size,
                                       shuffle=True)
        if tensorboard_log or evaluate_on_test:
            serialization_dir = os.path.join(TENSORBOARD_DIR, run_name,
                                             str(uuid.uuid4()),
                                             str(fold_index))
        else:
            serialization_dir = None

        model = init_sig(vocab, d_embedding, embedding_dropout_p, sig_depth,
                         logsig, all_code_types, feedforward_num_layers,
                         feedforward_hidden_dims, feedforward_activations,
                         feedforward_dropout, leadlag, add_time, t_max,
                         t_scale, use_timestamps, split_paths)
        if torch.cuda.is_available():
            cuda_device = 0
            model = model.cuda(cuda_device)
            logger.info('USING CUDA GPU')
        else:
            cuda_device = -1

        fold_metrics, model = train_model(model, lr, wd, train_loader,
                                          validation_loader, patience, epochs,
                                          cuda_device, serialization_dir)
        if serialization_dir is not None:
            ex.add_artifact(
                os.path.join(serialization_dir,
                             'best.th'))  # Add file location to sacred log

        metrics_by_fold.append(fold_metrics)

        if evaluate_on_test:
            if serialization_dir is None:
                raise Exception(
                    'serialization_dir needed to load best model from validation'
                )
            test_dataloader = DataLoader(dataset=dataset_test,
                                         batch_size=batch_size,
                                         shuffle=True)  # Held out test data
            metrics = evaluate(model, test_dataloader, cuda_device)
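            # NOTE: returning here means only the first fold is trained when
            # evaluate_on_test is set.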
            return metrics
        torch.cuda.empty_cache()

    metrics = reformat_metrics(metrics_by_fold, ex)
    return metrics
Example #12
def main():
    # load the binary SST dataset.
    single_id_indexer = SingleIdTokenIndexer(lowercase_tokens=True)  # word tokenizer
    # use_subtrees gives us a bit of extra data by breaking down each example into sub sentences.
    reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class",
                                                    token_indexers={"tokens": single_id_indexer},
                                                    use_subtrees=True)
    train_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/train.txt')
    reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class",
                                                    token_indexers={"tokens": single_id_indexer})
    dev_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/dev.txt')
    # test_dataset = reader.read('data/sst/test.txt')

    vocab = Vocabulary.from_instances(train_data)
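    # Indexing the datasets with the vocab lets the DataLoader turn instances into tensors.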
    train_data.index_with(vocab)
    dev_data.index_with(vocab)

    # Randomly initialize vectors
    if EMBEDDING_TYPE == "None":
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=300)
        word_embedding_dim = 300

    # Load pretrained vectors (despite the "w2v" name, these are fastText crawl vectors)
    elif EMBEDDING_TYPE == "w2v":
        embedding_path = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
        weight = _read_pretrained_embeddings_file(embedding_path,
                                                  embedding_dim=300,
                                                  vocab=vocab,
                                                  namespace="tokens")
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                    embedding_dim=300,
                                    weight=weight,
                                    trainable=False)
        word_embedding_dim = 300
    else:
        # Guard against falling through with token_embedding undefined.
        raise ValueError(f"unsupported EMBEDDING_TYPE: {EMBEDDING_TYPE}")

    # Initialize model, cuda(), and optimizer
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(word_embedding_dim,
                                                  hidden_size=512,
                                                  num_layers=2,
                                                  batch_first=True))
    model = LstmClassifier(word_embeddings, encoder, vocab)
    model.cuda()

    # where to save the model
    model_path = "/tmp/" + EMBEDDING_TYPE + "_" + "model.th"
    vocab_path = "/tmp/" + EMBEDDING_TYPE + "_" + "vocab"
    # if the model already exists (its been trained), load the pre-trained weights and vocabulary
    if os.path.isfile(model_path):
        vocab = Vocabulary.from_files(vocab_path)
        model = LstmClassifier(word_embeddings, encoder, vocab)
        with open(model_path, 'rb') as f:
            model.load_state_dict(torch.load(f))
    # otherwise train model from scratch and save its weights
    else:
        train_sampler = BucketBatchSampler(train_data, batch_size=32, sorting_keys=["tokens"])
        dev_sampler = BucketBatchSampler(dev_data, batch_size=32, sorting_keys=["tokens"])
        train_loader = DataLoader(train_data, batch_sampler=train_sampler)
        dev_loader = DataLoader(dev_data, batch_sampler=dev_sampler)
        optimizer = optim.Adam(model.parameters())
        trainer = GradientDescentTrainer(model=model,
                                         optimizer=optimizer,
                                         data_loader=train_loader,
                                         validation_data_loader=dev_loader,
                                         num_epochs=5,
                                         patience=1,
                                         cuda_device=0)
        trainer.train()
        with open(model_path, 'wb') as f:
            torch.save(model.state_dict(), f)
        vocab.save_to_files(vocab_path)
    model.train().cuda()  # rnn cannot do backwards in eval mode

    # Register a gradient hook on the embeddings. This saves the gradient w.r.t. the word embeddings.
    # We use the gradient later in the attack.
    utils.add_hooks(model)
    embedding_weight = utils.get_embedding_weight(model)  # also save the word embedding matrix

    # Build k-d Tree if you are using gradient + nearest neighbor attack
    # tree = KDTree(embedding_weight.numpy())

    # filter the dataset to only positive or negative examples
    # (the trigger will cause the opposite prediction)
    dataset_label_filter = "0"
    targeted_dev_data = []
    for instance in dev_data:
        if instance['label'].label == dataset_label_filter:
            targeted_dev_data.append(instance)
    targeted_dev_data = AllennlpDataset(targeted_dev_data, vocab)

    # get accuracy before adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids=None)
    model.train()  # rnn cannot do backwards in eval mode

    # initialize triggers which are concatenated to the input
    num_trigger_tokens = 3
    trigger_token_ids = [vocab.get_token_index("the")] * num_trigger_tokens

    # Use batches of size universal_perturb_batch_size for the attacks.
    universal_perturb_batch_size = 128
    targeted_sampler = BasicBatchSampler(sampler=SequentialSampler(targeted_dev_data),
                                         batch_size=universal_perturb_batch_size,
                                         drop_last=False)
    targeted_loader = DataLoader(targeted_dev_data, batch_sampler=targeted_sampler)
    # sample batches, update the triggers, and repeat
    for epoch in range(5):
        for batch in targeted_loader:
            # get accuracy with current triggers
            utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
            model.train()  # rnn cannot do backwards in eval mode

            # get gradient w.r.t. trigger embeddings for current batch
            averaged_grad = utils.get_average_grad(model, batch, trigger_token_ids)

            # pass the gradients to a particular attack to generate token candidates for each token.
            cand_trigger_token_ids = attacks.hotflip_attack(averaged_grad,
                                                            embedding_weight,
                                                            trigger_token_ids,
                                                            num_candidates=40,
                                                            increase_loss=True)
            # cand_trigger_token_ids = attacks.random_attack(embedding_weight,
            #                                                trigger_token_ids,
            #                                                num_candidates=40)
            # cand_trigger_token_ids = attacks.nearest_neighbor_grad(averaged_grad,
            #                                                        embedding_weight,
            #                                                        trigger_token_ids,
            #                                                        tree,
            #                                                        100,
            #                                                        num_candidates=40,
            #                                                        increase_loss=True)

            # Tries all of the candidates and returns the trigger sequence with highest loss.
            trigger_token_ids = utils.get_best_candidates(model,
                                                          batch,
                                                          trigger_token_ids,
                                                          cand_trigger_token_ids)

    # print accuracy after adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
Example #13
def test_multi_processing_with_lazy_dataset_warns():
    with pytest.warns(UserWarning, match=r".*deadlocks.*"):
        DataLoader(AllennlpLazyDataset(fake_instance_generator, "nonexistent_file"), num_workers=1)
Example #14
    def evaluate_model(self):
        self.from_pretrained()
        print(evaluate(self.model, DataLoader(self.dev_data, 32), 0, None))
Example #15
    def train(self):
        if self.config.adjust_point:
            ram_set_flag("adjust_point")
        # ram_write('dist_reg', self.config.dist_reg)
        read_hyper_ = partial(read_hyper, self.config.task_id,
                              self.config.arch)
        num_epochs = int(read_hyper_("num_epochs"))
        batch_size = int(read_hyper_("batch_size"))
        logger.info(f"num_epochs: {num_epochs}, batch_size: {batch_size}")

        if self.config.model_name == 'tmp':
            p = pathlib.Path('saved/models/tmp')
            if p.exists():
                shutil.rmtree(p)

        # Maybe we will do some data augmentation here.
        if self.config.aug_data != '':
            log(f'Augment data from {self.config.aug_data}')
            aug_data = auto_create(
                f"{self.config.task_id}.{self.config.arch}.aug",
                lambda: self.reader.read(self.config.aug_data),
                cache=True)
            self.train_data.instances.extend(aug_data.instances)

        # Set up the adversarial training policy
        if self.config.arch == 'bert':
            model_vocab = embed_util.get_bert_vocab()
        else:
            model_vocab = self.vocab
        # yapf: disable
        adv_field = 'sent2' if is_sentence_pair(self.config.task_id) and self.config.arch != 'bert' else 'sent'
        policy_args = {
            "adv_iteration": self.config.adv_iter,
            "replace_num": self.config.adv_replace_num,
            "searcher": WordIndexSearcher(
                CachedWordSearcher(
                    "external_data/ibp-nbrs.json" if not self.config.big_nbrs else "external_data/euc-top8.json",
                    model_vocab.get_token_to_index_vocabulary("tokens"),
                    second_order=False
                ),
                word2idx=model_vocab.get_token_index,
                idx2word=model_vocab.get_token_from_index,
            ),
            'adv_field': adv_field
        }
        # yapf: enable
        if self.config.adv_policy == 'hot':
            if is_sentence_pair(
                    self.config.task_id) and self.config.arch != 'bert':
                policy_args['forward_order'] = 1
            adv_policy = adv_utils.HotFlipPolicy(**policy_args)
        elif self.config.adv_policy == 'rdm':
            adv_policy = adv_utils.RandomNeighbourPolicy(**policy_args)
        elif self.config.adv_policy == 'diy':
            adv_policy = adv_utils.DoItYourselfPolicy(self.config.adv_iter,
                                                      adv_field,
                                                      self.config.adv_step)
        else:
            adv_policy = adv_utils.NoPolicy

        # A collate_fn applies transformations to instances before they are fed
        # into the model. To train with transformations such as cropping/DAE,
        # modify the code here, e.g.,
        # collate_fn = partial(transform_collate, self.vocab, self.reader, Crop(0.3))
        collate_fn = allennlp_collate
        train_data_sampler = BucketBatchSampler(
            data_source=self.train_data,
            batch_size=batch_size,
        )
        # Set callbacks

        if self.config.task_id == 'SNLI' and self.config.arch != 'bert':
            epoch_callbacks = []
            if self.config.model_pretrain != "":
                epoch_callbacks = [WarmupCallback(2)]
                if self.config.model_pretrain == 'auto':
                    self.config.model_pretrain = {
                        "biboe": "SNLI-fix-biboe-sum",
                        "datt": "SNLI-fix-datt"
                    }[self.config.arch]
                logger.warning(
                    f"Try loading weights from pretrained model {self.config.model_pretrain}"
                )
                pretrain_ckpter = CheckpointerX(
                    f"saved/models/{self.config.model_pretrain}")
                self.model.load_state_dict(pretrain_ckpter.best_model_state())
        else:
            epoch_callbacks = []
        batch_callbacks = []

        opt = self.model.get_optimizer()
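        # BERT gets a slanted-triangular LR schedule; other architectures use a constant LR.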
        if self.config.arch == 'bert':
            scl = SlantedTriangular(opt, num_epochs,
                                    len(self.train_data) // batch_size)
        else:
            scl = None

        trainer = AdvTrainer(
            model=self.model,
            optimizer=opt,
            learning_rate_scheduler=scl,
            validation_metric='+accuracy',
            adv_policy=adv_policy,
            data_loader=DataLoader(
                self.train_data,
                batch_sampler=train_data_sampler,
                collate_fn=collate_fn,
            ),
            validation_data_loader=DataLoader(
                self.dev_data,
                batch_size=batch_size,
            ),
            num_epochs=num_epochs,
            patience=None,
            grad_clipping=1.,
            cuda_device=0,
            epoch_callbacks=epoch_callbacks,
            batch_callbacks=batch_callbacks,
            serialization_dir=f'saved/models/{self.config.model_name}',
            num_serialized_models_to_keep=20)
        trainer.train()