Example #1
    def test_fine_tune_nograd_regex(self):
        original_model = load_archive(self.model_archive).model
        name_parameters_original = dict(original_model.named_parameters())
        regex_lists = [[],
                       [".*attend_feedforward.*", ".*token_embedder.*"],
                       [".*compare_feedforward.*"]]
        for regex_list in regex_lists:
            params = Params.from_file(self.config_file)
            params["trainer"]["no_grad"] = regex_list
            shutil.rmtree(self.serialization_dir, ignore_errors=True)
            tuned_model = fine_tune_model(model=original_model,
                                          params=params,
                                          serialization_dir=self.serialization_dir)
            # If a regex matches a parameter name, that parameter should have
            # requires_grad False; otherwise it should keep the same requires_grad
            # as in the originally loaded model.
            for name, parameter in tuned_model.named_parameters():
                if any(re.search(regex, name) for regex in regex_list):
                    assert not parameter.requires_grad
                else:
                    assert parameter.requires_grad \
                        == name_parameters_original[name].requires_grad
        # If all parameters have requires_grad=False, an error should be raised.
        with pytest.raises(Exception) as _:
            params = Params.from_file(self.config_file)
            params["trainer"]["no_grad"] = ["*"]
            shutil.rmtree(self.serialization_dir, ignore_errors=True)
            tuned_model = fine_tune_model(model=original_model,
                                          params=params,
                                          serialization_dir=self.serialization_dir)
Example #2
def fine_tune_model_from_file_paths(model_archive_path: str,
                                    config_file: str,
                                    serialization_dir: str,
                                    overrides: str = "",
                                    file_friendly_logging: bool = False) -> Model:
    """
    A wrapper around :func:`fine_tune_model` which loads the model archive from a file.

    Parameters
    ----------
    model_archive_path : ``str``
        Path to a saved model archive that is the result of running the ``train`` command.
    config_file : ``str``
        A configuration file specifying how to continue training.  The format is identical to the
        configuration file for the ``train`` command, but any contents in the ``model`` section are
        ignored (as we are using the provided model archive instead).
    serialization_dir : ``str``
        The directory in which to save results and logs. We just pass this along to
        :func:`fine_tune_model`.
    overrides : ``str``
        A JSON string that we will use to override values in the input parameter file.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we make our output more friendly to saved model files.  We just pass this
        along to :func:`fine_tune_model`.
    """
    # We don't need to pass in `cuda_device` here, because the trainer will call `model.cuda()` if
    # necessary.
    archive = load_archive(model_archive_path)
    params = Params.from_file(config_file, overrides)
    return fine_tune_model(model=archive.model,
                           params=params,
                           serialization_dir=serialization_dir,
                           file_friendly_logging=file_friendly_logging)
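
A minimal usage sketch for fine_tune_model_from_file_paths; the archive path, config file, and serialization directory below are hypothetical placeholders, not files from the examples above:

# Hedged usage sketch; all paths are placeholders.
fine_tuned_model = fine_tune_model_from_file_paths(
    model_archive_path="trained/model.tar.gz",
    config_file="fine_tune.jsonnet",
    serialization_dir="fine_tune_output",
    overrides="",
    file_friendly_logging=True)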
Example #3
    def test_fine_tune_runtime_errors_with_vocab_expansion(self):
        params = Params.from_file(self.config_file)
        params["train_data_path"] = str(self.FIXTURES_ROOT / 'data' / 'snli2.jsonl')

        model = load_archive(self.model_archive).model

        # If we do vocab expansion, we get a runtime error because of the embedding.
        with pytest.raises(RuntimeError):
            fine_tune_model(model, params, self.serialization_dir, extend_vocab=True)
Example #4
    def test_fine_tune_does_not_expand_vocab_by_default(self):
        params = Params.from_file(self.config_file)
        # snli2 has a new token in it
        params["train_data_path"] = str(self.FIXTURES_ROOT / 'data' / 'snli2.jsonl')

        model = load_archive(self.model_archive).model

        # By default, no vocab expansion.
        fine_tune_model(model, params, self.serialization_dir)
Example #5
    def ensure_model_can_train_save_and_load(
        self,
        param_file: Union[PathLike, str],
        tolerance: float = 1e-4,
        cuda_device: int = -1,
        gradients_to_ignore: Set[str] = None,
        overrides: str = "",
        metric_to_check: str = None,
        metric_terminal_value: float = None,
        metric_tolerance: float = 1e-4,
        disable_dropout: bool = True,
    ):
        """
        # Parameters

        param_file : `str`
            Path to a training configuration file that we will use to train the model for this
            test.
        tolerance : `float`, optional (default=`1e-4`)
            When comparing model predictions between the originally-trained model and the model
            after saving and loading, we will use this tolerance value (passed as `rtol` to
            `numpy.testing.assert_allclose`).
        cuda_device : `int`, optional (default=`-1`)
            The device to run the test on.
        gradients_to_ignore : `Set[str]`, optional (default=`None`)
            This test runs a gradient check to make sure that we're actually computing gradients
            for all of the parameters in the model.  If you really want to ignore certain
            parameters when doing that check, you can pass their names here.  This is not
            recommended unless you're `really` sure you don't need to have non-zero gradients for
            those parameters (e.g., some of the beam search / state machine models have
            infrequently-used parameters that are hard to force the model to use in a small test).
        overrides : `str`, optional (default = `""`)
            A JSON string that we will use to override values in the input parameter file.
        metric_to_check : `str`, optional (default = `None`)
            If given, we check that the model reaches this metric during training (on the
            validation set, if one is specified). This can be useful in CI, for example. You can
            pass any metric that appears in the metrics your model returns.
        metric_terminal_value : `float`, optional (default = `None`)
            When you set `metric_to_check`, you must also set the value this metric should reach.
        metric_tolerance : `float`, optional (default=`1e-4`)
            Tolerance for checking your model's metric against `metric_terminal_value`. Some
            variance in model metrics is expected when the training process is highly stochastic.
        disable_dropout : `bool`, optional (default = `True`)
            If True we will set all dropout to 0 before checking gradients. (Otherwise, with small
            datasets, you may get zero gradients because of unlucky dropout.)
        """
        save_dir = self.TEST_DIR / "save_and_load_test"
        archive_file = save_dir / "model.tar.gz"
        model = train_model_from_file(param_file,
                                      save_dir,
                                      overrides=overrides)
        metrics_file = save_dir / "metrics.json"
        if metric_to_check is not None:
            metrics = json.loads(metrics_file.read_text())
            metric_value = metrics.get(
                f"best_validation_{metric_to_check}") or metrics.get(
                    f"training_{metric_to_check}")
            assert metric_value is not None, f"Cannot find {metric_to_check} in metrics.json file"
            assert metric_terminal_value is not None, "Please specify metric terminal value"
            assert abs(metric_value - metric_terminal_value) < metric_tolerance
        archive = load_archive(archive_file, cuda_device=cuda_device)
        loaded_model = archive.model
        state_keys = model.state_dict().keys()
        loaded_state_keys = loaded_model.state_dict().keys()
        assert state_keys == loaded_state_keys
        # First we make sure that the state dict (the parameters) are the same for both models.
        for key in state_keys:
            assert_allclose(
                model.state_dict()[key].cpu().numpy(),
                loaded_model.state_dict()[key].cpu().numpy(),
                err_msg=key,
            )
        reader = archive.dataset_reader
        params = Params.from_file(param_file, params_overrides=overrides)

        print("Reading with original model")
        model_dataset = reader.read(params["validation_data_path"])
        model_dataset.index_with(model.vocab)

        print("Reading with loaded model")
        loaded_dataset = reader.read(params["validation_data_path"])
        loaded_dataset.index_with(loaded_model.vocab)

        # Need to duplicate params because DataLoader.from_params will consume.
        data_loader_params = params["data_loader"]
        data_loader_params["shuffle"] = False
        data_loader_params2 = Params(
            copy.deepcopy(data_loader_params.as_dict()))

        data_loader = DataLoader.from_params(dataset=model_dataset,
                                             params=data_loader_params)
        data_loader2 = DataLoader.from_params(dataset=loaded_dataset,
                                              params=data_loader_params2)

        # We'll check that even if we index the dataset with each model separately, we still get
        # the same result out.
        model_batch = next(iter(data_loader))

        loaded_batch = next(iter(data_loader2))

        # Check that gradients are None for non-trainable parameters and that
        # trainable parameters receive some gradient.
        self.check_model_computes_gradients_correctly(model, model_batch,
                                                      gradients_to_ignore,
                                                      disable_dropout)

        # The datasets themselves should be identical.
        assert model_batch.keys() == loaded_batch.keys()
        for key in model_batch.keys():
            self.assert_fields_equal(model_batch[key], loaded_batch[key], key,
                                     1e-6)

        # Set eval mode, to turn off things like dropout, then get predictions.
        model.eval()
        loaded_model.eval()
        # Models with stateful RNNs need their states reset to have consistent
        # behavior after loading.
        for model_ in [model, loaded_model]:
            for module in model_.modules():
                if hasattr(module, "stateful") and module.stateful:
                    module.reset_states()
        print("Predicting with original model")
        model_predictions = model(**model_batch)
        print("Predicting with loaded model")
        loaded_model_predictions = loaded_model(**loaded_batch)

        # Both outputs should have the same keys and the values for these keys should be close.
        for key in model_predictions.keys():
            self.assert_fields_equal(model_predictions[key],
                                     loaded_model_predictions[key],
                                     name=key,
                                     tolerance=tolerance)

        # Check loaded model's loss exists and we can compute gradients, for continuing training.
        loaded_model.train()
        loaded_model_predictions = loaded_model(**loaded_batch)
        loaded_model_loss = loaded_model_predictions["loss"]
        assert loaded_model_loss is not None
        loaded_model_loss.backward()

        return model, loaded_model
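
A hedged sketch of how a model's own test case might call ensure_model_can_train_save_and_load; the class name and fixture paths are hypothetical, assuming the standard AllenNLP ModelTestCase setup:

# Hypothetical test case; fixture paths are placeholders.
class MyModelTest(ModelTestCase):
    def setup_method(self):
        super().setup_method()
        self.set_up_model("test_fixtures/my_model/experiment.json",
                          "test_fixtures/my_model/dataset.jsonl")

    def test_model_can_train_save_and_load(self):
        self.ensure_model_can_train_save_and_load(self.param_file)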
Example #6
    def test_train_model_distributed_with_gradient_accumulation(
            self, max_instances, grad_acc, batch_size):
        if torch.cuda.device_count() >= 2:
            devices = [0, 1]
        else:
            devices = [-1, -1]

        params = lambda: Params({
            "model": {
                "type": "simple_tagger",
                "text_field_embedder": {
                    "token_embedders": {
                        "tokens": {
                            "type": "embedding",
                            "embedding_dim": 5
                        }
                    }
                },
                "encoder": {
                    "type": "lstm",
                    "input_size": 5,
                    "hidden_size": 7,
                    "num_layers": 2
                },
            },
            "dataset_reader": {
                "type": "sequence_tagging",
                "max_instances": max_instances
            },
            "train_data_path": SEQUENCE_TAGGING_DATA_PATH,
            "validation_data_path": SEQUENCE_TAGGING_DATA_PATH,
            "data_loader": {
                "batch_size": batch_size
            },
            "trainer": {
                "num_epochs": 2,
                "optimizer": "adam",
                "num_gradient_accumulation_steps": grad_acc,
            },
            "distributed": {
                "cuda_devices": devices
            },
        })

        out_dir = os.path.join(self.TEST_DIR,
                               "test_distributed_train_with_grad_acc")
        train_model(params(), serialization_dir=out_dir)

        # Check that some logs specific to distributed
        # training are where we expect.
        serialized_files = os.listdir(out_dir)
        assert "out_worker0.log" in serialized_files
        assert "out_worker1.log" in serialized_files
        assert "model.tar.gz" in serialized_files
        assert "metrics.json" in serialized_files

        # Make sure the metrics look right.
        with open(os.path.join(out_dir, "metrics.json")) as f:
            metrics = json.load(f)
            assert metrics["peak_worker_0_memory_MB"] > 0
            assert metrics["peak_worker_1_memory_MB"] > 0
            if torch.cuda.device_count() >= 2:
                assert metrics["peak_gpu_0_memory_MB"] > 0
                assert metrics["peak_gpu_1_memory_MB"] > 0

        # Check we can load the serialized model
        assert load_archive(out_dir).model
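
As a rough illustration of what these settings imply: with distributed training and gradient accumulation, the number of instances contributing to each optimizer step is the per-worker batch size times the accumulation steps times the number of workers. A small sketch with placeholder values:

# Hedged back-of-the-envelope calculation; the values are placeholders.
batch_size = 2          # per-worker data_loader batch size
grad_acc = 2            # num_gradient_accumulation_steps
num_workers = 2         # len(devices) in the distributed config
effective_batch_size = batch_size * grad_acc * num_workers
print(effective_batch_size)  # 8 instances per optimizer step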
Example #7
def main():
    # Load SNLI dataset

    bert_indexer = PretrainedTransformerIndexer('bert-base-uncased')
    tokenizer = PretrainedTransformerTokenizer(model_name='bert-base-uncased')
    reader = SnliReader(token_indexers={'tokens': bert_indexer},
                        tokenizer=tokenizer,
                        combine_input_fields=True)

    # single_id_indexer = SingleIdTokenIndexer(lowercase_tokens=True) # word tokenizer
    # tokenizer = WordTokenizer(end_tokens=["@@NULL@@"]) # add @@NULL@@ to the end of sentences
    # reader = SnliReader(token_indexers={'tokens': single_id_indexer}, tokenizer=tokenizer)
    dev_dataset = reader.read(
        'https://s3-us-west-2.amazonaws.com/allennlp/datasets/snli/snli_1.0_dev.jsonl'
    )
    # Load model and vocab
    model_type = "pred"
    # model_type = "merged"
    if model_type == "merged":
        model = load_archive(
            '/home/junliw/gradient-regularization/SNLI/archives/bert_models/merged_model.tar.gz'
        ).model
    elif model_type == "pred":
        model = load_archive(
            '/home/junliw/gradient-regularization/SNLI/archives/bert_models/bert_trained2.tar.gz'
        ).model
    model.eval().cuda()
    vocab = model.vocab

    # add hooks for embeddings so we can compute gradients w.r.t. the input tokens
    utils.add_hooks(model)

    if model_type == "merged":
        embedding_weight = model.combined_model._text_field_embedder._modules[
            "token_embedder_tokens"].transformer_model.embeddings.word_embeddings.weight  # save the word embedding matrix
    else:
        embedding_weight = model._text_field_embedder._modules[
            "token_embedder_tokens"].transformer_model.embeddings.word_embeddings.weight
    # print(model.combined_model._text_field_embedder._modules["token_embedder_tokens"].transformer_model.embeddings.word_embeddings)
    # print(embedding_weight.size())
    # Batches of examples to construct triggers
    universal_perturb_batch_size = 32

    # iterator = DataIterator(batch_size=universal_perturb_batch_size)
    # iterator.index_with(vocab)

    # Subsample the dataset to one class to do a universal attack on that class
    dataset_label_filter = 'entailment'  # only entailment examples
    # dataset_label_filter = 'contradiction' # only contradiction examples
    # dataset_label_filter = 'neutral' # only neutral examples
    subset_dev_dataset = []
    for instance in dev_dataset:
        if instance['label'].label == dataset_label_filter:
            subset_dev_dataset.append(instance)
    print(len(subset_dev_dataset))
    print(len(dev_dataset))
    # the attack is targeted towards a specific class
    # target_label = "0" # flip to entailment
    target_label = "1"  # flip to contradiction
    # target_label = "2" # flip to neutral

    # A k-d tree if you want to do gradient + nearest neighbors
    #tree = KDTree(embedding_weight.numpy())

    # Get original accuracy before adding universal triggers
    utils.get_accuracy(model,
                       subset_dev_dataset,
                       vocab,
                       tokenizer,
                       model_type,
                       trigger_token_ids=None,
                       snli=True)
    model.train()  # rnn cannot do backwards in eval mode

    # Initialize triggers
    num_trigger_tokens = 2  # number of trigger tokens prepended
    start_tok = tokenizer.tokenizer.encode("a")[1]
    print(start_tok)
    trigger_token_ids = [start_tok] * num_trigger_tokens
    # sample batches, update the triggers, and repeat

    subset_dev_dataset_dataset = AllennlpDataset(subset_dev_dataset, vocab)
    train_sampler = BucketBatchSampler(subset_dev_dataset_dataset,
                                       batch_size=universal_perturb_batch_size,
                                       sorting_keys=["tokens"])
    train_dataloader = DataLoader(subset_dev_dataset_dataset,
                                  batch_sampler=train_sampler)
    # for batch in lazy_groups_of(iterators(subset_dev_dataset, num_epochs=10, shuffle=True), group_size=1):
    for batch in train_dataloader:
        # get model accuracy with current triggers
        utils.get_accuracy(model,
                           subset_dev_dataset,
                           vocab,
                           tokenizer,
                           model_type,
                           trigger_token_ids,
                           snli=True)
        model.train()  # rnn cannot do backwards in eval mode

        # get grad of triggers
        averaged_grad = utils.get_average_grad(model,
                                               batch,
                                               trigger_token_ids,
                                               target_label,
                                               snli=True)
        # find attack candidates using an attack method
        cand_trigger_token_ids = attacks.hotflip_attack(averaged_grad,
                                                        embedding_weight,
                                                        trigger_token_ids,
                                                        increase_loss=False,
                                                        num_candidates=40)
        print("------")
        print(cand_trigger_token_ids)
        # cand_trigger_token_ids = attacks.random_attack(embedding_weight,
        #                                                trigger_token_ids,
        #                                                num_candidates=40)
        # cand_trigger_token_ids = attacks.nearest_neighbor_grad(averaged_grad,
        #                                                        embedding_weight,
        #                                                        trigger_token_ids,
        #                                                        tree,
        #                                                        100,
        #                                                        decrease_prob=True)
        # query the model to get the best candidates
        trigger_token_ids = utils.get_best_candidates(model,
                                                      batch,
                                                      trigger_token_ids,
                                                      cand_trigger_token_ids,
                                                      snli=True)
Example #8
File: data_cli.py Project: j5bd/q
def build_specter_vectors(hf_dataset: str,
                          specter_path: str,
                          output_path: str,
                          cuda_device: int = -1,
                          batch_size: int = 32,
                          vector_size: int = 768,
                          override=False):
    """
    Run with: $ ./data_cli.py build_specter_vectors paperswithcode_aspects ./specter_archive ./output/pwc_doc_id2specter.w2v.txt --cuda_device=5

    Download specter:
    $ wget https://ai2-s2-research-public.s3-us-west-2.amazonaws.com/specter/archive.tar.gz
    $ tar -xzvf archive.tar.gz

    :param vector_size:
    :param output_path: ./output
    :param override:
    :param cuda_device:
    :param batch_size:
    :param hf_dataset:
    :param specter_path: Path to specter
    :return:
    """
    from specter.predict_command import predictor_from_archive
    from allennlp.models import load_archive

    # load to register
    from specter.model import Model
    from specter.data import DataReader, DataReaderFromPickled
    from specter.predictor import SpecterPredictor

    if Model and DataReader and SpecterPredictor:
        pass

    if os.path.exists(output_path) and not override:
        logger.error(f'Output file exists already: {output_path}')
        return

    # Dataset
    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir='./data/nlp_cache',
                           split='docs')
    logger.info(f'Documents loaded: {len(docs_ds):,}')
    papers_to_embed = [doc for doc in docs_ds]

    # Specter settings
    archive_path = os.path.join(specter_path, 'model.tar.gz')
    metadata_path = os.path.join(specter_path, 'metadata_sample.json')
    included_text_fields = 'abstract title'
    vocab_dir = os.path.join(specter_path, 'data/vocab/')

    cuda_device = int(cuda_device)

    overrides = f"{{'model':{{'predict_mode':'true','include_venue':'false'}},'dataset_reader':{{'type':'specter_data_reader','predict_mode':'true','paper_features_path':'{metadata_path}','included_text_fields': '{included_text_fields}'}},'vocabulary':{{'directory_path':'{vocab_dir}'}}}}"

    logger.info(f'SPECTER overrides: {overrides}')

    archive = load_archive(archive_path,
                           cuda_device=cuda_device,
                           overrides=overrides)

    predictor = predictor_from_archive(archive,
                                       predictor_name='specter_predictor',
                                       paper_features_path=metadata_path)

    # Batches
    def chunks(lst, chunk_size):
        """Splits a longer list to respect batch size"""
        for i in range(0, len(lst), chunk_size):
            yield lst[i:i + chunk_size]

    batches_count = int(len(papers_to_embed) / batch_size)
    batch_embed_papers = []

    # 30min on GPU
    for batch in tqdm(chunks(papers_to_embed, batch_size),
                      total=batches_count):
        batch_out = predictor.predict_batch_json(batch)
        batch_embed_papers += batch_out

    # To keyed vectors
    doc_model = KeyedVectors(vector_size=vector_size)

    for embed_paper in tqdm(batch_embed_papers):
        doc_model.add([embed_paper['paper_id']], [embed_paper['embedding']])

    # Save to disk
    doc_model.save_word2vec_format(output_path)

    logger.info('Done')
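
A hedged follow-up showing how the word2vec-format file written above could be read back with gensim; the path is a placeholder matching the example command in the docstring:

# Hypothetical read-back of the vectors produced by build_specter_vectors.
from gensim.models import KeyedVectors

doc_vectors = KeyedVectors.load_word2vec_format('./output/pwc_doc_id2specter.w2v.txt')
print(doc_vectors.vector_size)  # 768 with the default vector_size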
Example #9
    parser.add_argument('--db', type=str, help='/path/to/saved/db.db')
    parser.add_argument('--drqa-model', type=str, help='/path/to/saved/db.db')
    parser.add_argument('--rte-model', type=str, help='/path/to/saved/db.db')
    parser.add_argument('--max-page', type=int, default=5)
    parser.add_argument('--max-sent', type=int, default=5)
    parser.add_argument("--cuda-device",
                        type=int,
                        default=-1,
                        help='id of GPU to use (if any)')
    args = parser.parse_args()

    logger.info("Load DB")
    db = FeverDocDB(args.db)

    logger.info("Load RTE-Model")
    archive = load_archive(args.rte_model, cuda_device=args.cuda_device)

    logger.info("Init Retriever")
    evidence_retriever = EvidenceRetrieval(db, args.drqa_model, args.max_page,
                                           args.max_sent)

    config = archive.config
    ds_params = config["dataset_reader"]
    model = archive.model
    model.eval()

    reader = FEVERReader(
        db,
        sentence_level=ds_params.pop("sentence_level", False),
        wiki_tokenizer=Tokenizer.from_params(
            ds_params.pop('wiki_tokenizer',
Example #10
    def load(self, path: str):
        self.predictor = Predictor.from_archive(
            load_archive(path, cuda_device=self.cuda), self.task)
Example #11
            mentions += random.choices(mentions, k=sample_size - len(mentions))

        return mentions

    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        mentions = json_dict['mentions']

        assert len(mentions) == self._dataset_reader.sentence_sample
        instance = self._dataset_reader.text_to_instance(sentences=mentions)
        return instance


if __name__ == '__main__':
    model_path = sys.argv[1]

    archive = load_archive(model_path, overrides=PersonPredictor.overrides)

    predictor = Predictor.from_archive(archive, 'person-predictor')

    result = predictor.predict_json({
        "mentions":
        PersonPredictor.select_mentions([
            "@@mb@@ Perelman @@me@@ is Russian writer",
            "Millennium Prize Problem was solved by @@mb@@ him @@me@@ in 1998 and then he died",
        ], predictor._dataset_reader.sentence_sample)
    })

    labels = archive.model.vocab.get_index_to_token_vocabulary("labels")

    predicted_labels = dict((labels[idx], prob)
                            for idx, prob in enumerate(result['predictions'])
Example #12
    def __init__(self,
                 vocab: Vocabulary,
                 token_representation_dim: int,
                 encoder: Optional[Seq2SeqEncoder] = None,
                 decoder: Optional[Union[FeedForward, str]] = None,
                 use_crf: bool = False,
                 constrain_crf_decoding: bool = False,
                 include_start_end_transitions: bool = True,
                 label_encoding: Optional[str] = None,
                 contextualizer: Optional[Contextualizer] = None,
                 calculate_per_label_f1: bool = False,
                 calculate_span_f1: bool = False,
                 calculate_perplexity: bool = False,
                 loss_average: str = "batch",
                 pretrained_file: Optional[str] = None,
                 transfer_contextualizer_from_pretrained_file: bool = False,
                 transfer_encoder_from_pretrained_file: bool = False,
                 freeze_encoder: bool = False,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(Tagger, self).__init__(vocab, regularizer)

        self._num_classes = self.vocab.get_vocab_size("labels")
        self._token_representation_dim = token_representation_dim
        self._contextualizer = contextualizer
        if encoder is None:
            encoder = PassThroughEncoder(input_dim=token_representation_dim)
        self._encoder = encoder

        # Load the contextualizer and encoder weights from the
        # pretrained_file if applicable
        if pretrained_file:
            archive = None
            if self._contextualizer and transfer_contextualizer_from_pretrained_file:
                logger.info("Attempting to load contextualizer weights from "
                            "pretrained_file at {}".format(pretrained_file))
                archive = load_archive(cached_path(pretrained_file))
                contextualizer_state = archive.model._contextualizer.state_dict()
                contextualizer_layer_num = self._contextualizer._layer_num
                logger.info("contextualizer_layer_num {}".format(contextualizer_layer_num))
                self._contextualizer.load_state_dict(contextualizer_state)
                if contextualizer_layer_num is not None:
                    logger.info("Setting layer num to {}".format(
                        contextualizer_layer_num))
                    self._contextualizer.set_layer_num(contextualizer_layer_num)
                else:
                    self._contextualizer.reset_layer_num()
                logger.info("Successfully loaded contextualizer weights!")
            if transfer_encoder_from_pretrained_file:
                logger.info("Attempting to load encoder weights from "
                            "pretrained_file at {}".format(pretrained_file))
                if archive is None:
                    archive = load_archive(cached_path(pretrained_file))
                encoder_state = archive.model._encoder.state_dict()
                self._encoder.load_state_dict(encoder_state)
                logger.info("Successfully loaded encoder weights!")

        self._freeze_encoder = freeze_encoder
        for parameter in self._encoder.parameters():
            # If freeze is true, requires_grad should be false and vice versa.
            parameter.requires_grad_(not self._freeze_encoder)

        if decoder is None or decoder == "linear":
            # Create the default decoder (logistic regression) if it is not provided.
            decoder = FeedForward.from_params(Params(
                {"input_dim": self._encoder.get_output_dim(),
                 "num_layers": 1,
                 "hidden_dims": self._num_classes,
                 "activations": "linear"}))
            logger.info("No decoder provided to model, using default "
                        "decoder: {}".format(decoder))
        elif decoder == "mlp":
            # Create the MLP decoder
            decoder = FeedForward.from_params(Params(
                {"input_dim": self._encoder.get_output_dim(),
                 "num_layers": 2,
                 "hidden_dims": [1024, self._num_classes],
                 "activations": ["relu", "linear"]}))
            logger.info("Using MLP decoder: {}".format(decoder))

        self._decoder = TimeDistributed(decoder)
        self._use_crf = use_crf
        self._constrain_crf_decoding = constrain_crf_decoding
        self._crf = None
        if use_crf:
            logger.info("Using CRF on top of decoder outputs")
            if constrain_crf_decoding:
                if label_encoding is None:
                    raise ConfigurationError(
                        "constrain_crf_decoding is True, but "
                        "label_encoding was not provided. label_encoding "
                        "must be provided.")
                logger.info("Constraining CRF decoding with label "
                            "encoding {}".format(label_encoding))
                labels = self.vocab.get_index_to_token_vocabulary("labels")
                constraints = allowed_transitions(label_encoding, labels)
            else:
                constraints = None
            self._crf = ConditionalRandomField(
                self._num_classes, constraints,
                include_start_end_transitions=include_start_end_transitions)

        check_dimensions_match(self._token_representation_dim, self._encoder.get_input_dim(),
                               "dimensionality of token representation", "encoder input dim")
        check_dimensions_match(self._encoder.get_output_dim(), self._decoder._module.get_input_dim(),
                               "encoder output dim", "decoder input dim")
        check_dimensions_match(self._decoder._module.get_output_dim(), self._num_classes,
                               "decoder output dim", "number of classes")
        if loss_average not in {"batch", "token"}:
            raise ConfigurationError("loss_average is {}, expected one of batch "
                                     "or token".format(loss_average))
        self.loss_average = loss_average
        self.metrics = {
            "accuracy": CategoricalAccuracy(),
            "accuracy3": CategoricalAccuracy(top_k=3)
        }

        self.calculate_perplexity = calculate_perplexity
        if calculate_perplexity:
            self.metrics["perplexity"] = Perplexity()

        self.calculate_per_label_f1 = calculate_per_label_f1
        self.calculate_span_f1 = calculate_span_f1
        if label_encoding and label_encoding not in ["BIO", "BIOUL", "IOB1"]:
            raise ConfigurationError("If not None, label encoding must be one of BIO, BIOUL, "
                                     "or IOB1. Got {}".format(label_encoding))
        self.label_encoding = label_encoding

        label_metric_name = "label_{}" if self.calculate_per_label_f1 else "_label_{}"
        for label_name, label_index in self.vocab._token_to_index["labels"].items():
            self.metrics[label_metric_name.format(label_name)] = F1Measure(positive_label=label_index)

        if self.calculate_span_f1:
            if not self.label_encoding:
                raise ConfigurationError("label_encoding must be provided when "
                                         "calculating_span_f1 is true.")
            else:
                # Set up span-based F1 measure
                self.metrics["span_based_f1"] = SpanBasedF1Measure(self.vocab,
                                                                   tag_namespace="labels",
                                                                   label_encoding=self.label_encoding)

        # Whether to run in error analysis mode or not, see commands.error_analysis
        self.error_analysis = False
        logger.info("Applying initializer...")
        initializer(self)
Example #13
    def from_params(cls, vocab: Vocabulary,
                    params: Params) -> 'ESIMPtrExtractor':

        entailment_params = params.pop("entailment_esim")
        fix_entailment_params = params.pop('fix_entailment_params', False)
        if 'archive_file' in entailment_params:
            model = load_archive(entailment_params.pop('archive_file')).model
            if model._combine_feedforward is not None:
                model._entailment_esim._combine_feedforward = model._combine_feedforward
            if model._aggregate_feedforward is not None:
                model._entailment_esim._aggregate_feedforward = model._aggregate_feedforward
            entailment_esim = model._entailment_esim

            fix_entailment_params = entailment_params.pop(
                'fix_entailment_params', True)
            if fix_entailment_params:
                for parameter in entailment_esim.parameters():
                    parameter.requires_grad = False
        elif entailment_params.pop('model', None) == 'feature_model':
            weights_file = entailment_params.pop('weights_file', None)
            entailment_esim = FeatureModel(**entailment_params)
            if weights_file is not None:
                entailment_esim.load_state_dict(torch.load(weights_file))
        else:
            entailment_esim = ESIM.from_params(vocab, entailment_params)

        sentence_selection_params = params.pop("sentence_esim")
        pretrained_ptr_extractor = None
        fix_sentence_selection_esim_params = False
        if 'archive_file' in sentence_selection_params:
            archive_file = sentence_selection_params.pop('archive_file')
            pretrained_ptr_extractor = load_archive(archive_file).model
            sentence_selection_esim = pretrained_ptr_extractor._entailment_esim

            fix_sentence_selection_esim_params = sentence_selection_params.pop(
                'fix_sentence_selection_esim_params', False)
            if fix_sentence_selection_esim_params:
                for parameter in sentence_selection_esim.parameters():
                    parameter.requires_grad = False
        elif sentence_selection_params.pop('model', None) == 'feature_model':
            sentence_selection_esim = FeatureModel(**sentence_selection_params)
        else:
            sentence_selection_esim = ESIM.from_params(
                vocab,
                sentence_selection_params,
                vocab_weight=entailment_esim._text_field_embedder.
                token_embedder_tokens.weight.data)

        ptr_extract_summ_params = params.pop('ptr_extract_summ')
        fix_ptr_extract_summ_params = False
        if 'archive_file' in ptr_extract_summ_params:
            archive_file = ptr_extract_summ_params.pop('archive_file')
            if pretrained_ptr_extractor is None:
                pretrained_ptr_extractor = load_archive(archive_file).model
            ptr_extract_summ_params[
                'pretrained'] = pretrained_ptr_extractor._ptr_extract_summ

            fix_ptr_extract_summ_params = ptr_extract_summ_params.pop(
                'fix_ptr_extract_summ_params', False)
            if fix_ptr_extract_summ_params:
                for parameter in ptr_extract_summ_params[
                        'pretrained'].parameters():
                    parameter.requires_grad = False

        ptr_extract_summ = ActorCritic(**ptr_extract_summ_params)

        initializer = InitializerApplicator.from_params(
            params.pop('initializer', []))
        regularizer = RegularizerApplicator.from_params(
            params.pop('regularizer', []))

        ei_reward_weight = params.pop("ei_reward_weight", 1)
        nei_label = params.pop("nei_label", 0)
        train_gold_evidence = params.pop("train_gold_evidence", False)
        use_decoder_states = params.pop("use_decoder_states", False)
        beam_size = params.pop("beam_size", 5)

        fix_sentence_extraction_params = params.pop(
            "fix_sentence_extraction_params", False)

        params.assert_empty(cls.__name__)

        return cls(
            vocab=vocab,
            sentence_selection_esim=sentence_selection_esim,
            entailment_esim=entailment_esim,
            ptr_extract_summ=ptr_extract_summ,
            initializer=initializer,
            regularizer=regularizer,
            ei_reward_weight=ei_reward_weight,
            fix_entailment_params=fix_entailment_params,
            fix_sentence_extraction_params=fix_sentence_extraction_params or
            fix_ptr_extract_summ_params and fix_sentence_selection_esim_params,
            nei_label=nei_label,
            train_gold_evidence=train_gold_evidence,
            use_decoder_states=use_decoder_states,
            beam_size=beam_size)
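
A hedged sketch of the kind of Params this from_params expects; every path and value below is a hypothetical placeholder rather than a real configuration:

# Hypothetical configuration for ESIMPtrExtractor.from_params; paths are placeholders.
example_params = Params({
    "entailment_esim": {"archive_file": "entailment_model.tar.gz",
                        "fix_entailment_params": True},
    "sentence_esim": {"archive_file": "sentence_selection_model.tar.gz"},
    "ptr_extract_summ": {"archive_file": "ptr_extractor_model.tar.gz"},
    "ei_reward_weight": 1,
    "nei_label": 0,
    "beam_size": 5})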
Example #14
    single_id_indexer = SingleIdTokenIndexer(
        lowercase_tokens=True)  # word tokenizer
    # use_subtrees gives us a bit of extra data by breaking down each example into sub sentences.

    tokenizer = WordTokenizer(
        end_tokens=["@@NULL@@"])  # add @@NULL@@ to the end of sentences

    reader = SnliReader(token_indexers={'tokens': single_id_indexer},
                        tokenizer=tokenizer)

    dev_dataset = reader.read(
        'https://s3-us-west-2.amazonaws.com/allennlp/datasets/snli/snli_1.0_dev.jsonl'
    )
    # Load model and vocab
    model = load_archive(
        'https://allennlp.s3-us-west-2.amazonaws.com/models/esim-glove-snli-2019.04.23.tar.gz'
    ).model
    model.train().cuda()
    snli_vocab = model.vocab

    mask_word_ARAE = []
    ARAE_words = list(ARAE_word2idx.keys())
    for word in ARAE_words:
        if snli_vocab.get_token_index(word) == 1:  # index 1 is @@UNKNOWN@@ (OOV)

            mask_word_ARAE.append(ARAE_word2idx[word])

    mask_word_ARAE = np.array(list(set(mask_word_ARAE)))

    mask_ARAE_logits = np.zeros((1, 1, len(ARAE_words)))
    mask_ARAE_logits[:, :, mask_word_ARAE] = -float("Inf")
Example #15
    def ensure_model_can_train_save_and_load(self, param_file: str):
        save_dir = os.path.join(self.TEST_DIR, "save_and_load_test")
        archive_file = os.path.join(save_dir, "model.tar.gz")
        model = train_model_from_file(param_file, save_dir)
        loaded_model = load_archive(archive_file).model
        state_keys = model.state_dict().keys()
        loaded_state_keys = loaded_model.state_dict().keys()
        assert state_keys == loaded_state_keys
        # First we make sure that the state dict (the parameters) are the same for both models.
        for key in state_keys:
            assert_allclose(model.state_dict()[key].numpy(),
                            loaded_model.state_dict()[key].numpy(),
                            err_msg=key)
        params = Params.from_file(param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])
        iterator = DataIterator.from_params(params['iterator'])

        # We'll check that even if we index the dataset with each model separately, we still get
        # the same result out.
        model_dataset = reader.read(params['validation_data_path'])
        model_dataset.index_instances(model.vocab)
        model_batch_arrays = next(iterator(model_dataset, shuffle=False))
        model_batch = arrays_to_variables(model_batch_arrays,
                                          for_training=False)
        loaded_dataset = reader.read(params['validation_data_path'])
        loaded_dataset.index_instances(loaded_model.vocab)
        loaded_batch_arrays = next(iterator(loaded_dataset, shuffle=False))
        loaded_batch = arrays_to_variables(loaded_batch_arrays,
                                           for_training=False)

        # The datasets themselves should be identical.
        for key in model_batch.keys():
            field = model_batch[key]
            if isinstance(field, dict):
                for subfield in field:
                    self.assert_fields_equal(model_batch[key][subfield],
                                             loaded_batch[key][subfield],
                                             tolerance=1e-6,
                                             name=key + '.' + subfield)
            else:
                self.assert_fields_equal(model_batch[key], loaded_batch[key],
                                         1e-6, key)

        # Set eval mode, to turn off things like dropout, then get predictions.
        model.eval()
        loaded_model.eval()
        model_predictions = model.forward(**model_batch)
        loaded_model_predictions = loaded_model.forward(**loaded_batch)

        # Check loaded model's loss exists and we can compute gradients, for continuing training.
        loaded_model_loss = loaded_model_predictions["loss"]
        assert loaded_model_loss is not None
        loaded_model_loss.backward()

        # Both outputs should have the same keys and the values for these keys should be close.
        for key in model_predictions.keys():
            self.assert_fields_equal(model_predictions[key],
                                     loaded_model_predictions[key],
                                     tolerance=1e-4,
                                     name=key)

        return model, loaded_model
Example #16
    def ensure_model_can_train_save_and_load(
        self,
        param_file: str,
        tolerance: float = 1e-4,
        cuda_device: int = -1,
        gradients_to_ignore: Set[str] = None,
        overrides: str = "",
        disable_dropout: bool = True,
    ):
        """
        # Parameters

        param_file : ``str``
            Path to a training configuration file that we will use to train the model for this
            test.
        tolerance : ``float``, optional (default=1e-4)
            When comparing model predictions between the originally-trained model and the model
            after saving and loading, we will use this tolerance value (passed as ``rtol`` to
            ``numpy.testing.assert_allclose``).
        cuda_device : ``int``, optional (default=-1)
            The device to run the test on.
        gradients_to_ignore : ``Set[str]``, optional (default=None)
            This test runs a gradient check to make sure that we're actually computing gradients
            for all of the parameters in the model.  If you really want to ignore certain
            parameters when doing that check, you can pass their names here.  This is not
            recommended unless you're `really` sure you don't need to have non-zero gradients for
            those parameters (e.g., some of the beam search / state machine models have
            infrequently-used parameters that are hard to force the model to use in a small test).
        overrides : ``str``, optional (default = "")
            A JSON string that we will use to override values in the input parameter file.
        disable_dropout : ``bool``, optional (default = True)
            If True we will set all dropout to 0 before checking gradients. (Otherwise, with small
            datasets, you may get zero gradients because of unlucky dropout.)
        """
        save_dir = self.TEST_DIR / "save_and_load_test"
        archive_file = save_dir / "model.tar.gz"
        model = train_model_from_file(param_file, save_dir, overrides=overrides)
        loaded_model = load_archive(archive_file, cuda_device=cuda_device).model
        state_keys = model.state_dict().keys()
        loaded_state_keys = loaded_model.state_dict().keys()
        assert state_keys == loaded_state_keys
        # First we make sure that the state dict (the parameters) are the same for both models.
        for key in state_keys:
            assert_allclose(
                model.state_dict()[key].cpu().numpy(),
                loaded_model.state_dict()[key].cpu().numpy(),
                err_msg=key,
            )
        params = Params.from_file(param_file, params_overrides=overrides)
        reader = DatasetReader.from_params(params["dataset_reader"])

        # Need to duplicate params because Iterator.from_params will consume.
        iterator_params = params["iterator"]
        iterator_params2 = Params(copy.deepcopy(iterator_params.as_dict()))

        iterator = DataIterator.from_params(iterator_params)
        iterator2 = DataIterator.from_params(iterator_params2)

        # We'll check that even if we index the dataset with each model separately, we still get
        # the same result out.
        model_dataset = reader.read(params["validation_data_path"])
        iterator.index_with(model.vocab)
        model_batch = next(iterator(model_dataset, shuffle=False))

        loaded_dataset = reader.read(params["validation_data_path"])
        iterator2.index_with(loaded_model.vocab)
        loaded_batch = next(iterator2(loaded_dataset, shuffle=False))

        # Check that gradients are None for non-trainable parameters and that
        # trainable parameters receive some gradient.
        self.check_model_computes_gradients_correctly(
            model, model_batch, gradients_to_ignore, disable_dropout
        )

        # The datasets themselves should be identical.
        assert model_batch.keys() == loaded_batch.keys()
        for key in model_batch.keys():
            self.assert_fields_equal(model_batch[key], loaded_batch[key], key, 1e-6)

        # Set eval mode, to turn off things like dropout, then get predictions.
        model.eval()
        loaded_model.eval()
        # Models with stateful RNNs need their states reset to have consistent
        # behavior after loading.
        for model_ in [model, loaded_model]:
            for module in model_.modules():
                if hasattr(module, "stateful") and module.stateful:
                    module.reset_states()
        model_predictions = model(**model_batch)
        loaded_model_predictions = loaded_model(**loaded_batch)

        # Check loaded model's loss exists and we can compute gradients, for continuing training.
        loaded_model_loss = loaded_model_predictions["loss"]
        assert loaded_model_loss is not None
        loaded_model_loss.backward()

        # Both outputs should have the same keys and the values for these keys should be close.
        for key in model_predictions.keys():
            self.assert_fields_equal(
                model_predictions[key], loaded_model_predictions[key], name=key, tolerance=tolerance
            )

        return model, loaded_model
Example #17
    def test_train_model_distributed_without_sharded_reader(self, lazy: bool):
        if torch.cuda.device_count() >= 2:
            devices = [0, 1]
        else:
            devices = [-1, -1]

        num_epochs = 2
        params = lambda: Params({
            "model": {
                "type": "simple_tagger",
                "text_field_embedder": {
                    "token_embedders": {
                        "tokens": {
                            "type": "embedding",
                            "embedding_dim": 5
                        }
                    }
                },
                "encoder": {
                    "type": "lstm",
                    "input_size": 5,
                    "hidden_size": 7,
                    "num_layers": 2
                },
            },
            "dataset_reader": {
                "type": "sequence_tagging",
                "lazy": lazy
            },
            "train_data_path": SEQUENCE_TAGGING_DATA_PATH,
            "validation_data_path": SEQUENCE_TAGGING_DATA_PATH,
            "data_loader": {
                "batch_size": 1
            },
            "trainer": {
                "num_epochs":
                num_epochs,
                "optimizer":
                "adam",
                "batch_callbacks":
                ["tests.commands.train_test.TrainingDataLoggerBatchCallback"],
            },
            "distributed": {
                "cuda_devices": devices
            },
        })

        out_dir = os.path.join(self.TEST_DIR, "test_distributed_train")
        train_model(params(), serialization_dir=out_dir)

        # Check that some logs specific to distributed
        # training are where we expect.
        serialized_files = os.listdir(out_dir)
        assert "stderr_worker0.log" in serialized_files
        assert "stdout_worker0.log" in serialized_files
        assert "stderr_worker1.log" in serialized_files
        assert "stdout_worker1.log" in serialized_files
        assert "model.tar.gz" in serialized_files

        # Check we can load the serialized model
        archive = load_archive(out_dir)
        assert archive.model

        # Check that we created a vocab from all the shards.
        tokens = set(archive.model.vocab._token_to_index["tokens"].keys())
        assert tokens == {
            "@@PADDING@@",
            "@@UNKNOWN@@",
            "are",
            ".",
            "animals",
            "cats",
            "dogs",
            "snakes",
            "birds",
        }

        train_complete = "completed its entire epoch (training)."
        validation_complete = "completed its entire epoch (validation)."

        import re

        pattern = re.compile(r"First word from training data: '([^']*)'")
        first_word_counts = Counter()
        with open(os.path.join(out_dir, "stdout_worker0.log")) as f:
            worker0_log = f.read()
            assert train_complete in worker0_log
            assert validation_complete in worker0_log
            for first_word in pattern.findall(worker0_log):
                first_word_counts[first_word] += 1

        with open(os.path.join(out_dir, "stdout_worker1.log")) as f:
            worker1_log = f.read()
            assert train_complete in worker1_log
            assert validation_complete in worker1_log
            for first_word in pattern.findall(worker1_log):
                first_word_counts[first_word] += 1

        assert first_word_counts == {
            "cats": num_epochs,
            "dogs": num_epochs,
            "snakes": num_epochs,
            "birds": num_epochs,
        }
Example #18
    def ensure_model_can_train_save_and_load(self,
                                             param_file: str,
                                             tolerance: float = 1e-4,
                                             cuda_device: int = -1,
                                             gradients_to_ignore: Set[str] = None,
                                             overrides: str = ""):
        """
        Parameters
        ----------
        param_file : ``str``
            Path to a training configuration file that we will use to train the model for this
            test.
        tolerance : ``float``, optional (default=1e-4)
            When comparing model predictions between the originally-trained model and the model
            after saving and loading, we will use this tolerance value (passed as ``rtol`` to
            ``numpy.testing.assert_allclose``).
        cuda_device : ``int``, optional (default=-1)
            The device to run the test on.
        gradients_to_ignore : ``Set[str]``, optional (default=None)
            This test runs a gradient check to make sure that we're actually computing gradients
            for all of the parameters in the model.  If you really want to ignore certain
            parameters when doing that check, you can pass their names here.  This is not
            recommended unless you're `really` sure you don't need to have non-zero gradients for
            those parameters (e.g., some of the beam search / state machine models have
            infrequently-used parameters that are hard to force the model to use in a small test).
        overrides : ``str``, optional (default = "")
            A JSON string that we will use to override values in the input parameter file.
        """
        save_dir = self.TEST_DIR / "save_and_load_test"
        archive_file = save_dir / "model.tar.gz"
        model = train_model_from_file(param_file, save_dir, overrides=overrides)
        loaded_model = load_archive(archive_file, cuda_device=cuda_device).model
        state_keys = model.state_dict().keys()
        loaded_state_keys = loaded_model.state_dict().keys()
        assert state_keys == loaded_state_keys
        # First we make sure that the state dict (the parameters) are the same for both models.
        for key in state_keys:
            assert_allclose(model.state_dict()[key].cpu().numpy(),
                            loaded_model.state_dict()[key].cpu().numpy(),
                            err_msg=key)
        params = Params.from_file(param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])

        # Need to duplicate params because Iterator.from_params will consume.
        iterator_params = params['iterator']
        iterator_params2 = Params(copy.deepcopy(iterator_params.as_dict()))

        iterator = DataIterator.from_params(iterator_params)
        iterator2 = DataIterator.from_params(iterator_params2)

        # We'll check that even if we index the dataset with each model separately, we still get
        # the same result out.
        model_dataset = reader.read(params['validation_data_path'])
        iterator.index_with(model.vocab)
        model_batch = next(iterator(model_dataset, shuffle=False))

        loaded_dataset = reader.read(params['validation_data_path'])
        iterator2.index_with(loaded_model.vocab)
        loaded_batch = next(iterator2(loaded_dataset, shuffle=False))

        # Check that gradients are None for non-trainable parameters and that
        # trainable parameters receive some gradient.
        self.check_model_computes_gradients_correctly(model, model_batch, gradients_to_ignore)

        # The datasets themselves should be identical.
        assert model_batch.keys() == loaded_batch.keys()
        for key in model_batch.keys():
            self.assert_fields_equal(model_batch[key], loaded_batch[key], key, 1e-6)

        # Set eval mode, to turn off things like dropout, then get predictions.
        model.eval()
        loaded_model.eval()
        # Models with stateful RNNs need their states reset to have consistent
        # behavior after loading.
        for model_ in [model, loaded_model]:
            for module in model_.modules():
                if hasattr(module, 'stateful') and module.stateful:
                    module.reset_states()
        model_predictions = model(**model_batch)
        loaded_model_predictions = loaded_model(**loaded_batch)

        # Check loaded model's loss exists and we can compute gradients, for continuing training.
        loaded_model_loss = loaded_model_predictions["loss"]
        assert loaded_model_loss is not None
        loaded_model_loss.backward()

        # Both outputs should have the same keys and the values for these keys should be close.
        for key in model_predictions.keys():
            self.assert_fields_equal(model_predictions[key],
                                     loaded_model_predictions[key],
                                     name=key,
                                     tolerance=tolerance)

        return model, loaded_model
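A minimal usage sketch for this helper, assuming a ``ModelTestCase`` subclass that has called ``set_up_model``; the fixture paths and the ignored parameter name below are placeholders, not values taken from any example in this collection.

from allennlp.common.testing import ModelTestCase

class MyModelTest(ModelTestCase):
    def setUp(self):
        super().setUp()
        # Hypothetical fixture paths.
        self.set_up_model("tests/fixtures/my_model/experiment.json",
                          "tests/fixtures/my_model/data.txt")

    def test_model_can_train_save_and_load(self):
        self.ensure_model_can_train_save_and_load(
            self.param_file,
            tolerance=1e-4,
            # Hypothetical parameter name that is hard to exercise in a tiny fixture.
            gradients_to_ignore={"rarely_used_projection.weight"})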
Пример #19
0
import os
import sys

from allennlp.data import DatasetReader
from allennlp.models import load_archive

base_path = os.path.abspath('')
sys.path.append(base_path + "/")
import import_folders

from squad1_reader import Squad1Reader
load_dataset_from_disk = 0
load_pretrained_BiDAF = 1
build_model_from_scratch = 0

"""
################ LOAD PRETRAINED MODEL ###############
We load the pretrained model so we can see what parts it has and maybe reuse them if needed.
"""

if load_pretrained_BiDAF:
    archive = load_archive("https://s3-us-west-2.amazonaws.com/allennlp/models/bidaf-model-2017.09.15-charpad.tar.gz")
    
    # Get the model and the config file
    model = archive.model
    config = archive.config.duplicate()
    
    keys_config = list(config.keys())
    print ("Key config list: ", keys_config)
    for key in keys_config:
        print ("Params of %s"%(key))
        print (config[key])
    ### Get the elements
    ## Data Readers ##
    dataset_reader_params = config["dataset_reader"]
    dataset_reader = DatasetReader.from_params(dataset_reader_params)
    ## Vocabulary ##
Пример #20
0
def main(file, embeddings, model, emb_wt_key, namespace, output_dir):
    archive = load_archive(model)
    config = archive.config
    os.makedirs(output_dir, exist_ok=True)
    config.to_file(os.path.join(output_dir, CONFIG_NAME))

    model = archive.model
    # first expand the vocabulary
    dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))
    instances = dataset_reader.read(file)
    vocab = model.vocab

    # get all the tokens in the new file
    namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(
        lambda: defaultdict(int))
    for instance in Tqdm.tqdm(instances):
        instance.count_vocab_items(namespace_token_counts)
    old_token_size = vocab.get_vocab_size(namespace)
    print("Before expansion: Number of instances in {} namespace: {}".format(
        namespace, old_token_size))
    if namespace not in namespace_token_counts:
        logger.error(
            "No tokens found for namespace: {} in the new input file".format(
                namespace))
    # identify the new tokens in the new instances
    token_to_add = set()
    token_hits = 0
    for token, count in namespace_token_counts[namespace].items():
        if token not in vocab._token_to_index[namespace]:
            # new token, must add
            token_to_add.add(token)
        else:
            token_hits += 1
    print("Found {} existing tokens and {} new tokens in {}".format(
        token_hits, len(token_to_add), file))

    # add the new tokens to the vocab
    for token in token_to_add:
        vocab.add_token_to_namespace(token=token, namespace=namespace)
    archived_parameters = dict(model.named_parameters())

    # second, expand the embedding matrix
    for name, weights in archived_parameters.items():
        # find the wt matrix for the embeddings
        if name == emb_wt_key:
            if weights.dim() != 2:
                logger.error(
                    "Expected an embedding matrix for the parameter: {}, instead "
                    "found a tensor of shape {}".format(emb_wt_key, weights.shape))
            emb_dim = weights.shape[-1]
            print("Before expansion: Size of emb matrix: {}".format(
                weights.shape))
            # Loading embeddings for old and new tokens since that is cleaner than copying all
            # the embedding loading logic here
            all_embeddings = _read_pretrained_embeddings_file(
                embeddings, emb_dim, vocab, namespace)
            # concatenate the new entries i.e last token_to_add embeddings to the original weights
            if len(token_to_add) > 0:
                weights.data = torch.cat(
                    [weights.data, all_embeddings[-len(token_to_add):, :]])
            print("After expansion: Size of emb matrix: {}".format(
                weights.shape))

    # save the files needed by the model archiver
    model_path = os.path.join(output_dir, "weight.th")
    model_state = model.state_dict()
    torch.save(model_state, model_path)
    vocab.save_to_files(os.path.join(output_dir, "vocabulary"))
    archive_model(output_dir, weights="weight.th")

    # more debug messages
    new_token_size = vocab.get_vocab_size(namespace)
    for name, weights in archived_parameters.items():
        if name == emb_wt_key:
            print("Size of emb matrix: {}".format(weights.shape))
    print("After expansion: Number of instances in {} namespace: {}".format(
        namespace, new_token_size))
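The core of the expansion step above, in isolation: new rows are concatenated onto the existing embedding weight matrix with ``torch.cat``, one row per newly added token. This is a toy sketch with made-up sizes, not part of the script.

import torch

old_weights = torch.randn(10, 4)           # 10 known tokens, embedding dim 4
new_token_embeddings = torch.randn(3, 4)   # embeddings for 3 newly added tokens
expanded = torch.cat([old_weights, new_token_embeddings])   # rows are appended (dim 0)
assert expanded.shape == (13, 4)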
Пример #21
0
def eval_model(db: FeverDocDB, args) -> Model:
    archive = load_archive(args.archive_file, cuda_device=args.cuda_device)

    config = archive.config
    ds_params = config["dataset_reader"]

    model = archive.model
    model.eval()

    reader = FEVERReader(db,
                         sentence_level=ds_params.pop("sentence_level", False),
                         wiki_tokenizer=Tokenizer.from_params(
                             ds_params.pop('wiki_tokenizer', {})),
                         claim_tokenizer=Tokenizer.from_params(
                             ds_params.pop('claim_tokenizer', {})),
                         token_indexers=FEVERReader.custom_dict_from_params(
                             ds_params.pop('token_indexers', {})),
                         ner_facts=args.ner_facts)

    logger.info("Reading training data from %s", args.in_file)
    data = reader.read(args.in_file)

    actual = []
    predicted = []

    if args.log is not None:
        f = open(args.log, "w+")

    for item in tqdm(data):
        if item.fields["premise"] is None or item.fields[
                "premise"].sequence_length() == 0:
            cls = "NOT ENOUGH INFO"
        else:
            prediction = model.forward_on_instance(item)
            cls = model.vocab._index_to_token["labels"][np.argmax(
                prediction["label_probs"])]

        if "label" in item.fields:
            actual.append(item.fields["label"].label)
            if args.ner_missing is not None:
                if args.ner_missing == 'oracle' and item.fields[
                        "label"].label == "NOT ENOUGH INFO" and cls != "NOT ENOUGH INFO":
                    if item.fields["metadata"].metadata["ner_missing"]:
                        cls = "NOT ENOUGH INFO"

                if args.ner_missing == 'oracle' and item.fields[
                        "label"].label == "SUPPORTS" and cls != "SUPPORTS":
                    if item.fields["metadata"].metadata["ner_missing"]:
                        cls = "SUPPORTS"

                if args.ner_missing == 'oracle' and item.fields[
                        "label"].label == "REFUTES" and cls != "REFUTES":
                    if item.fields["metadata"].metadata["ner_missing"]:
                        cls = "REFUTES"

                if args.ner_missing == 'naive' and cls == 'SUPPORTS':
                    if item.fields["metadata"].metadata["ner_missing"]:
                        highest = np.argmax(prediction["label_probs"])
                        lowest = np.argmin(prediction["label_probs"])
                        copy = []
                        for pred in prediction["label_probs"]:
                            copy.append(pred)

                        copy[highest] = prediction["label_probs"][lowest]

                        original_logits = prediction["label_logits"][highest]
                        chosen_logits = prediction["label_logits"][np.argmax(
                            copy)]
                        difference_logits = original_logits - chosen_logits

                        if difference_logits < 3.0:
                            cls = model.vocab._index_to_token["labels"][
                                np.argmax(copy)]

        predicted.append(cls)

        if args.log is not None:
            if "label" in item.fields:
                f.write(
                    json.dumps({
                        "actual": item.fields["label"].label,
                        "predicted": cls
                    }) + "\n")
            else:
                f.write(json.dumps({"predicted": cls}) + "\n")

    if args.log is not None:
        f.close()

    if len(actual) > 0:
        print(accuracy_score(actual, predicted))
        print(classification_report(actual, predicted))
        print(confusion_matrix(actual, predicted))

    return model
Пример #22
0
def _load_predictor(archive_file: str, predictor_name: str) -> Predictor:
    """
    Helper to load the desired predictor from the given archive.
    """
    archive = load_archive(archive_file)
    return Predictor.from_archive(archive, predictor_name)
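A hedged usage sketch of this helper; the archive path, predictor name, and input keys are placeholders, since they depend on the model that was archived.

predictor = _load_predictor("/path/to/model.tar.gz", "textual-entailment")
result = predictor.predict_json({"premise": "A dog is running.",
                                 "hypothesis": "An animal is moving."})
print(result)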
Пример #23
0
    optparser.add_argument("--batch_size",
                           type=int,
                           default=None,
                           help="Overwrite batch size.")
    optparser.add_argument("--parse_on_cpu",
                           action="store_true",
                           default=False,
                           help="Enforce parsing on the CPU.")

    args = optparser.parse_args()

    if args.beam < 1:
        print("Beam size must be at least 1")
        sys.exit()

    archive = load_archive(args.archive_file, args.cuda_device)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()
    model.k_best = args.beam
    model.parse_on_gpu = not args.parse_on_cpu

    pipelinepieces = PipelineTrainerPieces.from_params(config)

    if args.batch_size is not None and args.batch_size > 0:
        assert isinstance(pipelinepieces.annotator.data_iterator,
                          SameFormalismIterator)
        iterator: SameFormalismIterator = pipelinepieces.annotator.data_iterator
        pipelinepieces.annotator.data_iterator = SameFormalismIterator(
            iterator.formalisms, args.batch_size)
Пример #24
0
def test_predictor():
    question_json = {
        "id":
        "1700",
        "question_tokens": [
            "@start@", "For", "what", "does", "a", "stove", "generally",
            "generate", "heat", "?", "@end@"
        ],
        "choice_tokens_list":
        [["@start@", "warming", "the", "air", "in", "the", "area", "@end@"],
         [
             "@start@", "heating", "nutrients", "to", "appropriate",
             "temperatures", "@end@"
         ],
         [
             "@start@", "entertaining", "various", "visitors", "and", "guests",
             "@end@"
         ], ["@start@", "to", "create", "electrical", "charges", "@end@"]],
        "facts_tokens_list":
        [["@start@", "UML", "can", "generate", "code", "@end@"],
         ["@start@", "generate", "is", "a", "synonym", "of", "beget", "@end@"],
         ["@start@", "Heat", "is", "generated", "by", "a", "stove", "@end@"],
         [
             "@start@", "A", "sonnet", "is", "generally", "very", "structured",
             "@end@"
         ],
         [
             "@start@", "A", "fundamentalist", "is", "generally", "right", "-",
             "wing", "@end@"
         ], ["@start@", "menstruation", "is", "generally", "crampy", "@end@"],
         [
             "@start@", "an", "erection", "is", "generally", "pleasurable",
             "@end@"
         ], ["@start@", "gunfire", "is", "generally", "lethal", "@end@"],
         ["@start@", "ejaculating", "is", "generally", "pleasurable", "@end@"],
         ["@start@", "Huddersfield", "is", "generally", "urban", "@end@"],
         [
             "@start@", "warming", "is", "a", "synonym", "of", "calefacient",
             "@end@"
         ],
         ["@start@", "heat", "is", "related", "to", "warming", "air", "@end@"],
         ["@start@", "a", "stove", "is", "for", "warming", "food", "@end@"],
         [
             "@start@", "an", "air", "conditioning", "is", "for", "warming",
             "@end@"
         ], ["@start@", "The", "earth", "is", "warming", "@end@"],
         [
             "@start@", "a", "heat", "source", "is", "for", "warming", "up",
             "@end@"
         ],
         ["@start@", "A", "foyer", "is", "an", "enterance", "area", "@end@"],
         ["@start@", "Being", "nosey", "is", "not", "appropriate", "@end@"],
         [
             "@start@", "seize", "is", "a", "synonym", "of", "appropriate",
             "@end@"
         ],
         [
             "@start@", "a", "fitting", "room", "is", "used", "for",
             "something", "appropriate", "@end@"
         ],
         [
             "@start@", "appropriate", "is", "a", "synonym", "of", "allow",
             "@end@"
         ],
         [
             "@start@", "appropriate", "is", "similar", "to", "befitting",
             "@end@"
         ],
         [
             "@start@", "appropriate", "is", "similar", "to", "grade", "-",
             "appropriate", "@end@"
         ],
         [
             "@start@", "grade", "-", "appropriate", "is", "similar", "to",
             "appropriate", "@end@"
         ],
         [
             "@start@", "A", "parlor", "is", "used", "for", "entertaining",
             "guests", "@end@"
         ],
         [
             "@start@", "a", "back", "courtyard", "is", "for", "entertaining",
             "guests", "@end@"
         ], ["@start@", "guest", "is", "a", "type", "of", "visitor", "@end@"],
         [
             "@start@", "a", "family", "room", "is", "for", "entertaining",
             "guests", "@end@"
         ],
         [
             "@start@", "cooking", "a", "meal", "is", "for", "entertaining",
             "guests", "@end@"
         ],
         [
             "@start@", "buying", "a", "house", "is", "for", "entertaining",
             "guests", "@end@"
         ],
         [
             "@start@", "having", "a", "party", "is", "for", "entertaining",
             "guests", "@end@"
         ],
         [
             "@start@", "a", "dining", "area", "is", "used", "for",
             "entertaining", "guests", "@end@"
         ], ["@start@", "visitor", "is", "related", "to", "guest", "@end@"],
         ["@start@", "guest", "is", "related", "to", "visitor", "@end@"],
         ["@start@", "Electrical", "charges", "are", "additive", "@end@"],
         ["@start@", "Lightning", "is", "an", "electrical", "charge", "@end@"],
         ["@start@", "electrons", "have", "electrical", "charge", "@end@"],
         [
             "@start@", "A", "judge", "is", "in", "charge", "in", "a",
             "courtroom", "@end@"
         ],
         [
             "@start@", "charge", "is", "a", "synonym", "of", "accusation",
             "@end@"
         ],
         [
             "@start@", "A", "consultant", "can", "charge", "a", "fee", "to",
             "a", "client", "@end@"
         ],
         [
             "@start@", "charge", "is", "a", "synonym", "of", "commission",
             "@end@"
         ],
         [
             "@start@", "charge", "is", "a", "synonym", "of", "cathexis",
             "@end@"
         ], ["@start@", "charge", "is", "not", "cash", "@end@"],
         ["@start@", "arraign", "entails", "charge", "@end@"],
         [
             "@start@", "a", "stove", "generates", "heat", "for", "cooking",
             "usually", "@end@"
         ],
         [
             "@start@", "preferences", "are", "generally", "learned",
             "characteristics", "@end@"
         ],
         [
             "@start@", "a", "windmill", "does", "not", "create", "pollution",
             "@end@"
         ],
         [
             "@start@", "temperature", "is", "a", "measure", "of", "heat",
             "energy", "@end@"
         ],
         [
             "@start@", "a", "hot", "something", "is", "a", "source", "of",
             "heat", "@end@"
         ],
         [
             "@start@", "the", "moon", "does", "not", "contain", "water",
             "@end@"
         ], ["@start@", "sunlight", "produces", "heat", "@end@"],
         ["@start@", "an", "oven", "is", "a", "source", "of", "heat", "@end@"],
         [
             "@start@", "a", "hot", "substance", "is", "a", "source", "of",
             "heat", "@end@"
         ],
         [
             "@start@", "a", "car", "engine", "is", "a", "source", "of",
             "heat", "@end@"
         ],
         [
             "@start@", "as", "the", "amount", "of", "rainfall", "increases",
             "in", "an", "area", ",", "the", "amount", "of", "available",
             "water", "in", "that", "area", "will", "increase", "@end@"
         ], ["@start@", "sound", "can", "travel", "through", "air", "@end@"],
         [
             "@start@", "the", "greenhouse", "effect", "is", "when", "carbon",
             "in", "the", "air", "heats", "a", "planet", "'s", "atmosphere",
             "@end@"
         ],
         [
             "@start@", "a", "community", "is", "made", "of", "many", "types",
             "of", "organisms", "in", "an", "area", "@end@"
         ], ["@start@", "air", "is", "a", "vehicle", "for", "sound", "@end@"],
         [
             "@start@", "rainfall", "is", "the", "amount", "of", "rain", "an",
             "area", "receives", "@end@"
         ],
         [
             "@start@", "an", "animal", "requires", "air", "for", "survival",
             "@end@"
         ],
         [
             "@start@", "humidity", "is", "the", "amount", "of", "water",
             "vapor", "in", "the", "air", "@end@"
         ],
         [
             "@start@", "if", "some", "nutrients", "are", "in", "the", "soil",
             "then", "those", "nutrients", "are", "in", "the", "food", "chain",
             "@end@"
         ],
         [
             "@start@", "as", "heat", "is", "transferred", "from", "something",
             "to", "something", "else", ",", "the", "temperature", "of",
             "that", "something", "will", "decrease", "@end@"
         ], ["@start@", "uneven", "heating", "causes", "convection", "@end@"],
         [
             "@start@", "as", "temperature", "during", "the", "day",
             "increases", ",", "the", "temperature", "in", "an", "environment",
             "will", "increase", "@end@"
         ],
         [
             "@start@", "uneven", "heating", "of", "the", "Earth", "'s",
             "surface", "cause", "wind", "@end@"
         ],
         [
             "@start@", "an", "animal", "needs", "to", "eat", "food", "for",
             "nutrients", "@end@"
         ],
         [
             "@start@", "soil", "contains", "nutrients", "for", "plants",
             "@end@"
         ],
         [
             "@start@", "if", "two", "objects", "have", "the", "same",
             "charge", "then", "those", "two", "materials", "will", "repel",
             "each", "other", "@end@"
         ],
         ["@start@", "water", "is", "an", "electrical", "conductor", "@end@"],
         [
             "@start@", "a", "battery", "is", "a", "source", "of",
             "electrical", "energy", "@end@"
         ],
         [
             "@start@", "metal", "is", "an", "electrical", "energy",
             "conductor", "@end@"
         ],
         [
             "@start@", "when", "an", "electrical", "circuit", "is", "working",
             "properly", ",", "electrical", "current", "runs", "through",
             "the", "wires", "in", "that", "circuit", "@end@"
         ],
         ["@start@", "brick", "is", "an", "electrical", "insulator", "@end@"],
         [
             "@start@", "wood", "is", "an", "electrical", "energy",
             "insulator", "@end@"
         ],
         [
             "@start@", "a", "toaster", "converts", "electrical", "energy",
             "into", "heat", "energy", "for", "toasting", "@end@"
         ]],
        "gold_label":
        1,
        "gold_facts": {
            "fact1": "a stove generates heat for cooking usually",
            "fact2":
            "cooking involves heating nutrients to higher temperatures"
        },
        "label_probs": [
            0.002615198493003845, 0.9686304330825806, 0.008927381597459316,
            0.01982697658240795
        ],
        "label_ranks": [3, 0, 2, 1],
        "predicted_label":
        1,
    }

    inputs = question_to_predictor_input(question_json)
    inputs = predictor_input_to_pred_input_with_full_question_text(inputs)
    print(json.dumps(inputs, indent=4))

    archive = load_archive('_trained_models/model_CN5_1202.tar.gz')
    predictor = Predictor.from_archive(archive,
                                       'predictor-qa-mc-with-know-visualize')

    result = predictor.predict_json(inputs)

    print(result)
Пример #25
0
def eval_model(db: FeverDocDB, args) -> Model:
    archive = load_archive(args.archive_file,
                           cuda_device=args.cuda_device,
                           overrides=args.overrides)

    config = archive.config
    ds_params = config["dataset_reader"]

    model = archive.model
    model.eval()

    reader = FEVERReader(db,
                         sentence_level=ds_params.pop("sentence_level", False),
                         wiki_tokenizer=Tokenizer.from_params(
                             ds_params.pop('wiki_tokenizer', {})),
                         claim_tokenizer=Tokenizer.from_params(
                             ds_params.pop('claim_tokenizer', {})),
                         token_indexers=TokenIndexer.dict_from_params(
                             ds_params.pop('token_indexers', {})))

    while True:

        claim = input("enter claim (or q to quit) >>")
        if claim.lower() == "q":
            break

        ranker = retriever.get_class('tfidf')(tfidf_path=args.model)

        p_lines = []
        pages, _ = ranker.closest_docs(claim, 5)

        for page in pages:
            lines = db.get_doc_lines(page)
            lines = [
                line.split("\t")[1] if len(line.split("\t")[1]) > 1 else ""
                for line in lines.split("\n")
            ]

            p_lines.extend(zip(lines, [page] * len(lines), range(len(lines))))

        scores = tf_idf_sim(claim, [pl[0] for pl in p_lines])
        scores = list(
            zip(scores, [pl[1] for pl in p_lines], [pl[2] for pl in p_lines],
                [pl[0] for pl in p_lines]))
        scores = list(filter(lambda score: len(score[3].strip()), scores))
        sentences_l = list(
            sorted(scores, reverse=True, key=lambda elem: elem[0]))

        sentences = [s[3] for s in sentences_l[:5]]
        evidence = " ".join(sentences)

        print("Best pages: {0}".format(repr(pages)))

        print("Evidence:")
        for idx, sentence in enumerate(sentences_l[:5]):
            print("{0}\t{1}\t\t{2}\t{3}".format(idx + 1, sentence[0],
                                                sentence[1], sentence[3]))

        item = reader.text_to_instance(evidence, claim)

        prediction = model.forward_on_instance(item, args.cuda_device)
        cls = model.vocab._index_to_token["labels"][np.argmax(
            prediction["label_probs"])]
        print("PREDICTED: {0}".format(cls))
        print()
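``tf_idf_sim`` is not defined in this snippet. A minimal sketch of such a helper, assuming scikit-learn is available, could compute cosine similarity between the claim and each candidate line in a shared TF-IDF space:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def tf_idf_sim(claim, lines):
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform([claim] + list(lines))
    # Similarity of the claim (row 0) against every candidate line (rows 1..n).
    return cosine_similarity(matrix[0:1], matrix[1:]).flatten()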
Пример #26
0
    def __init__(self,
                 vocab: Vocabulary,
                 token_representation_dim: int,
                 encoder: Optional[Seq2SeqEncoder] = None,
                 decoder: Optional[Union[FeedForward, str]] = None,
                 contextualizer: Optional[Contextualizer] = None,
                 pretrained_file: Optional[str] = None,
                 transfer_contextualizer_from_pretrained_file: bool = False,
                 transfer_encoder_from_pretrained_file: bool = False,
                 freeze_encoder: bool = False,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(SelectiveRegressor, self).__init__(vocab, regularizer)

        self._token_representation_dim = token_representation_dim
        self._contextualizer = contextualizer
        if encoder is None:
            encoder = PassThroughEncoder(
                input_dim=self._token_representation_dim)
        self._encoder = encoder

        # Load the contextualizer and encoder weights from the
        # pretrained_file if applicable
        if pretrained_file:
            archive = None
            if self._contextualizer and transfer_contextualizer_from_pretrained_file:
                logger.info("Attempting to load contextualizer weights from "
                            "pretrained_file at {}".format(pretrained_file))
                archive = load_archive(cached_path(pretrained_file))
                contextualizer_state = archive.model._contextualizer.state_dict(
                )
                contextualizer_layer_num = self._contextualizer._layer_num
                self._contextualizer.load_state_dict(contextualizer_state)
                if contextualizer_layer_num is not None:
                    logger.info("Setting layer num to {}".format(
                        contextualizer_layer_num))
                    self._contextualizer.set_layer_num(
                        contextualizer_layer_num)
                else:
                    self._contextualizer.reset_layer_num()
                logger.info("Successfully loaded contextualizer weights!")
            if transfer_encoder_from_pretrained_file:
                logger.info("Attempting to load encoder weights from "
                            "pretrained_file at {}".format(pretrained_file))
                if archive is None:
                    archive = load_archive(cached_path(pretrained_file))
                encoder_state = archive.model._encoder.state_dict()
                self._encoder.load_state_dict(encoder_state)
                logger.info("Successfully loaded encoder weights!")

        self._freeze_encoder = freeze_encoder
        for parameter in self._encoder.parameters():
            # If freeze is true, requires_grad should be false and vice versa.
            parameter.requires_grad_(not self._freeze_encoder)

        if decoder is None or decoder == "linear":
            # Create the default decoder (a single linear layer) if it is not provided.
            decoder = FeedForward.from_params(
                Params({
                    "input_dim": self._encoder.get_output_dim(),
                    "num_layers": 1,
                    "hidden_dims": 1,
                    "activations": "linear"
                }))
            logger.info("No decoder provided to model, using default "
                        "decoder: {}".format(decoder))
        elif decoder == "mlp":
            # Create the MLP decoder
            decoder = FeedForward.from_params(
                Params({
                    "input_dim": self._encoder.get_output_dim(),
                    "num_layers": 2,
                    "hidden_dims": [1024, 1],
                    "activations": ["relu", "linear"]
                }))
            logger.info("Using MLP decoder: {}".format(decoder))
        self._decoder = decoder

        check_dimensions_match(self._token_representation_dim,
                               self._encoder.get_input_dim(),
                               "token representation dim", "encoder input dim")
        check_dimensions_match(self._encoder.get_output_dim(),
                               self._decoder.get_input_dim(),
                               "encoder output dim", "decoder input dim")
        check_dimensions_match(self._decoder.get_output_dim(), 1,
                               "decoder output dim",
                               "1, since we're predicting a real value")
        # SmoothL1Loss as described in "Neural Models of Factuality" (NAACL 2018)
        self.loss = torch.nn.SmoothL1Loss(reduction="none")
        self.metrics = {
            "mae": MeanAbsoluteError(),
            "pearson_r": PearsonCorrelation()
        }

        # Whether to run in error analysis mode or not, see commands.error_analysis
        self.error_analysis = False
        logger.info("Applying initializer...")
        initializer(self)
Пример #27
0
    def test_train_model_distributed(self):
        if torch.cuda.device_count() >= 2:
            devices = [0, 1]
        else:
            devices = [-1, -1]

        params = lambda: Params({
            "model": {
                "type": "simple_tagger",
                "text_field_embedder": {
                    "token_embedders": {
                        "tokens": {
                            "type": "embedding",
                            "embedding_dim": 5
                        }
                    }
                },
                "encoder": {
                    "type": "lstm",
                    "input_size": 5,
                    "hidden_size": 7,
                    "num_layers": 2
                },
            },
            "dataset_reader": {
                "type": "sequence_tagging"
            },
            "train_data_path": SEQUENCE_TAGGING_DATA_PATH,
            "validation_data_path": SEQUENCE_TAGGING_DATA_PATH,
            "data_loader": {
                "batch_size": 2
            },
            "trainer": {
                "num_epochs":
                2,
                "optimizer":
                "adam",
                # Need to use the fully qualified name here so the distributed workers
                # can import it.
                "callbacks":
                ["tests.commands.train_test.TrainingPrimaryCheckCallback"],
            },
            "distributed": {
                "cuda_devices": devices
            },
        })

        out_dir = os.path.join(self.TEST_DIR, "test_distributed_train")
        train_model(params(), serialization_dir=out_dir)

        # Check that some logs specific to distributed
        # training are where we expect.
        serialized_files = os.listdir(out_dir)
        assert "out_worker0.log" in serialized_files
        assert "out_worker1.log" in serialized_files
        assert "model.tar.gz" in serialized_files
        assert "metrics.json" in serialized_files

        # Make sure the metrics look right.
        with open(os.path.join(out_dir, "metrics.json")) as f:
            metrics = json.load(f)
            assert metrics["peak_worker_0_memory_MB"] > 0
            assert metrics["peak_worker_1_memory_MB"] > 0
            if torch.cuda.device_count() >= 2:
                assert metrics["peak_gpu_0_memory_MB"] > 0
                assert metrics["peak_gpu_1_memory_MB"] > 0

        # Check we can load the serialized model
        assert load_archive(out_dir).model
Пример #28
0
import os

from allennlp.common.util import JsonDict, import_submodules
from allennlp.data import Instance
from allennlp.models import load_archive
from allennlp.predictors.predictor import Predictor


@Predictor.register('bistm_crf_predictor', exist_ok=True)
class bistm_crf_predictor(Predictor):
    def predict_json(self, inputs: JsonDict):
        abstract = inputs['abstract']
        return self.predict_line(abstract)

    def predict_line(self, line: str):
        instance = self._dataset_reader.text_to_instance(line)
        output_dict = self.predict_instance(instance)

        # return {'predict_title': output_dict['predict_title']}
        return {'input': line, 'predict_title': output_dict['predict_title']}


if __name__ == '__main__':
    import_submodules('using_allennlp')
    serialization_dir = "/home/liangjiaxi/TMP_PROJECT/pingan_event_extraction/tmp/debugger_train"
    archive = load_archive(os.path.join(serialization_dir, 'model.tar.gz'))
    predictor = Predictor.from_archive(
        archive,
        'bistm_crf_predictor',
        dataset_reader_to_load="zhaiyao_datareader")
    line = '东土科技(300353)公告,公司此前曾披露,控股股东、实控人、董事长李平拟于2017年10月23日起12个月内增持不低于1亿元,累计增持比例不超本公司已发行股份的2%。李平于2018年1月31日至2月8日增持212.68万股,增持资金2431万元。由于相关融资增持监管政策变化导致无法筹措增持资金,李平现申请终止履行未实施部分的增持计划。'
    a = predictor.predict_line(line)
    print(a)
Пример #29
0
def main():
    # Load SNLI dataset
    single_id_indexer = SingleIdTokenIndexer(lowercase_tokens=True) # word tokenizer
    tokenizer = WordTokenizer(end_tokens=["@@NULL@@"]) # add @@NULL@@ to the end of sentences
    reader = SnliReader(token_indexers={'tokens': single_id_indexer}, tokenizer=tokenizer)
    dev_dataset = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/snli/snli_1.0_dev.jsonl')
    # Load model and vocab
    model = load_archive('https://allennlp.s3-us-west-2.amazonaws.com/models/esim-glove-snli-2019.04.23.tar.gz').model
    model.eval().cuda()
    vocab = model.vocab

    # add hooks for embeddings so we can compute gradients w.r.t. to the input tokens
    utils.add_hooks(model)
    embedding_weight = utils.get_embedding_weight(model) # save the word embedding matrix

    # Batches of examples to construct triggers
    universal_perturb_batch_size = 32
    iterator = BasicIterator(batch_size=universal_perturb_batch_size)
    iterator.index_with(vocab)

    # Subsample the dataset to one class to do a universal attack on that class
    dataset_label_filter = 'entailment' # only entailment examples
    # dataset_label_filter = 'contradiction' # only contradiction examples
    # dataset_label_filter = 'neutral' # only neutral examples
    subset_dev_dataset = []
    for instance in dev_dataset:
        if instance['label'].label == dataset_label_filter:
            subset_dev_dataset.append(instance)
    # the attack is targeted towards a specific class
    # target_label = "0" # flip to entailment
    target_label = "1" # flip to contradiction
    # target_label = "2" # flip to neutral

    # A k-d tree if you want to do gradient + nearest neighbors
    #tree = KDTree(embedding_weight.numpy())

    # Get original accuracy before adding universal triggers
    utils.get_accuracy(model, subset_dev_dataset, vocab, trigger_token_ids=None, snli=True)
    model.train() # rnn cannot do backwards in eval mode, so switch to train mode

    # Initialize triggers
    num_trigger_tokens = 1 # one token prepended
    trigger_token_ids = [vocab.get_token_index("a")] * num_trigger_tokens
    # sample batches, update the triggers, and repeat
    for batch in lazy_groups_of(iterator(subset_dev_dataset, num_epochs=10, shuffle=True), group_size=1):
        # get model accuracy with current triggers
        utils.get_accuracy(model, subset_dev_dataset, vocab, trigger_token_ids, snli=True)
        model.train() # rnn cannot do backwards in eval mode, so switch to train mode

        # get grad of triggers
        averaged_grad = utils.get_average_grad(model, batch, trigger_token_ids, target_label, snli=True)

        # find attack candidates using an attack method
        cand_trigger_token_ids = attacks.hotflip_attack(averaged_grad,
                                                        embedding_weight,
                                                        trigger_token_ids,
                                                        num_candidates=40)
        # cand_trigger_token_ids = attacks.random_attack(embedding_weight,
        #                                                trigger_token_ids,
        #                                                num_candidates=40)
        # cand_trigger_token_ids = attacks.nearest_neighbor_grad(averaged_grad,
        #                                                        embedding_weight,
        #                                                        trigger_token_ids,
        #                                                        tree,
        #                                                        100,
        #                                                        decrease_prob=True)

        # query the model to get the best candidates
        trigger_token_ids = utils.get_best_candidates(model,
                                                      batch,
                                                      trigger_token_ids,
                                                      cand_trigger_token_ids,
                                                      snli=True)
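``attacks.hotflip_attack`` is not shown here. The sketch below illustrates the first-order HotFlip-style candidate selection it refers to, approximating the loss change of swapping trigger token i for vocabulary word w as (e_w - e_i) · grad_i; it is an illustration under that assumption, not the library's implementation.

import torch

def hotflip_candidates_sketch(averaged_grad, embedding_matrix, trigger_token_ids,
                              num_candidates=40):
    # averaged_grad: (num_triggers, emb_dim) gradient of the loss w.r.t. each trigger embedding.
    # embedding_matrix: (vocab_size, emb_dim) word embedding weights.
    trigger_embeds = embedding_matrix[torch.tensor(trigger_token_ids)]     # (T, D)
    # First-order estimate of the loss change from replacing trigger i with word w:
    # (e_w - e_i) . grad_i
    diff = embedding_matrix.unsqueeze(0) - trigger_embeds.unsqueeze(1)     # (T, V, D)
    loss_change = torch.einsum("td,tvd->tv", averaged_grad, diff)          # (T, V)
    # The attack above wants to *decrease* the loss towards the target label,
    # so keep the replacements with the most negative estimated change.
    return torch.topk(-loss_change, num_candidates, dim=1).indices         # (T, num_candidates)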
Пример #30
0
import os

from allennlp.common.util import import_submodules
from allennlp.models import load_archive
from allennlp.predictors import Predictor

import_submodules('telegram_classifier')

archive_folder = 'trained-lstm/20190802-232055-less_regularized'

archive = load_archive(os.path.join(archive_folder, 'model.tar.gz'))
predictor = Predictor.from_archive(archive, 'roommate_pred')

while True:
    text = input("Phrase? ")
    output = predictor.predict_json({"text": text})

    print("Sender:", output['label'])
    for i in range(archive.model.vocab.get_vocab_size('labels')):
        labelName = archive.model.vocab.get_token_from_index(i, 'labels')
        percent = output['probs'][i] * 100.0
        print(f'  {labelName}: {percent:.1f}%')
Пример #31
0
    def ensure_model_can_train_save_and_load(self,
                                             param_file: str,
                                             tolerance: float = 1e-4,
                                             cuda_device: int = -1):
        save_dir = os.path.join(self.TEST_DIR, "save_and_load_test")
        archive_file = os.path.join(save_dir, "model.tar.gz")
        model = train_model_from_file(param_file, save_dir)
        loaded_model = load_archive(archive_file,
                                    cuda_device=cuda_device).model
        state_keys = model.state_dict().keys()
        loaded_state_keys = loaded_model.state_dict().keys()
        assert state_keys == loaded_state_keys
        # First we make sure that the state dict (the parameters) are the same for both models.
        for key in state_keys:
            assert_allclose(model.state_dict()[key].cpu().numpy(),
                            loaded_model.state_dict()[key].cpu().numpy(),
                            err_msg=key)
        params = Params.from_file(param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])

        # Need to duplicate params because Iterator.from_params will consume.
        iterator_params = params['iterator']
        iterator_params2 = Params(copy.deepcopy(iterator_params.as_dict()))

        iterator = DataIterator.from_params(iterator_params)
        iterator2 = DataIterator.from_params(iterator_params2)

        # We'll check that even if we index the dataset with each model separately, we still get
        # the same result out.
        model_dataset = reader.read(params['validation_data_path'])
        iterator.index_with(model.vocab)
        model_batch = next(
            iterator(model_dataset, shuffle=False, cuda_device=cuda_device))

        loaded_dataset = reader.read(params['validation_data_path'])
        iterator2.index_with(loaded_model.vocab)
        loaded_batch = next(
            iterator2(loaded_dataset, shuffle=False, cuda_device=cuda_device))

        # Check that gradients are None for non-trainable parameters and that
        # trainable parameters receive a non-zero gradient.
        self.check_model_computes_gradients_correctly(model, model_batch)

        # The datasets themselves should be identical.
        assert model_batch.keys() == loaded_batch.keys()
        for key in model_batch.keys():
            self.assert_fields_equal(model_batch[key], loaded_batch[key], key,
                                     1e-6)

        # Set eval mode, to turn off things like dropout, then get predictions.
        model.eval()
        loaded_model.eval()
        # Models with stateful RNNs need their states reset to have consistent
        # behavior after loading.
        for model_ in [model, loaded_model]:
            for module in model_.modules():
                if hasattr(module, 'stateful') and module.stateful:
                    module.reset_states()
        model_predictions = model(**model_batch)
        loaded_model_predictions = loaded_model(**loaded_batch)

        # Check loaded model's loss exists and we can compute gradients, for continuing training.
        loaded_model_loss = loaded_model_predictions["loss"]
        assert loaded_model_loss is not None
        loaded_model_loss.backward()

        # Both outputs should have the same keys and the values for these keys should be close.
        for key in model_predictions.keys():
            self.assert_fields_equal(model_predictions[key],
                                     loaded_model_predictions[key],
                                     name=key,
                                     tolerance=tolerance)

        return model, loaded_model
Пример #32
0
        # Note: the FEVER data reader we are using from the FEVER code needs the name of a
        # trained model, even for training. It was therefore moved outside this for loop,
        # since we access it only once via uofa_params.pop anyway.

        # Step 3: read data

        objUofaTrainTest = UofaTrainTest()

        if (run_name == "annotation" and dataset == "fnc"):
            path_to_trained_models = path_to_trained_models_folder + name_of_trained_model_to_use
            convert_fnc_to_fever_and_annotate(
                FeverDocDB, path_to_trained_models, mithun_logger, cuda_device,
                path_to_pyproc_annotated_data_folder)

        db = FeverDocDB(path_to_saved_db)
        archive = load_archive(
            path_to_trained_models_folder + name_of_trained_model_to_use,
            cuda_device)
        config = archive.config
        ds_params = config["dataset_reader"]
        model = archive.model
        model.eval()
        mithun_logger.info(f"going to initiate FEVERReaderUofa.")
        fever_reader = FEVERReaderUofa(
            db,
            sentence_level=ds_params.pop("sentence_level", False),
            wiki_tokenizer=Tokenizer.from_params(
                ds_params.pop('wiki_tokenizer', {})),
            claim_tokenizer=Tokenizer.from_params(
                ds_params.pop('claim_tokenizer', {})),
            token_indexers=TokenIndexer.dict_from_params(
                ds_params.pop('token_indexers', {})))
Пример #33
0
    def test_train_model_distributed_with_sharded_reader(self, lazy):
        if torch.cuda.device_count() >= 2:
            devices = [0, 1]
        else:
            devices = [-1, -1]

        params = lambda: Params({
            "model": {
                "type": "simple_tagger",
                "text_field_embedder": {
                    "token_embedders": {
                        "tokens": {
                            "type": "embedding",
                            "embedding_dim": 5
                        }
                    }
                },
                "encoder": {
                    "type": "lstm",
                    "input_size": 5,
                    "hidden_size": 7,
                    "num_layers": 2
                },
            },
            "dataset_reader": {
                "type": "sharded",
                "base_reader": {
                    "type": "sequence_tagging"
                },
                "lazy": lazy,
            },
            "train_data_path": SEQUENCE_TAGGING_SHARDS_PATH,
            "validation_data_path": SEQUENCE_TAGGING_SHARDS_PATH,
            "data_loader": {
                "batch_size": 2
            },
            "trainer": {
                "num_epochs": 2,
                "optimizer": "adam"
            },
            "distributed": {
                "cuda_devices": devices
            },
        })

        out_dir = os.path.join(self.TEST_DIR, "test_distributed_train")
        train_model(params(), serialization_dir=out_dir)

        # Check that some logs specific to distributed
        # training are where we expect.
        serialized_files = os.listdir(out_dir)
        assert "stderr_worker0.log" in serialized_files
        assert "stdout_worker0.log" in serialized_files
        assert "stderr_worker1.log" in serialized_files
        assert "stdout_worker1.log" in serialized_files
        assert "model.tar.gz" in serialized_files

        # Check we can load the serialized model
        archive = load_archive(out_dir)
        assert archive.model

        # Check that we created a vocab from all the shards.
        tokens = archive.model.vocab._token_to_index["tokens"].keys()
        assert tokens == {
            "@@PADDING@@",
            "@@UNKNOWN@@",
            "are",
            ".",
            "animals",
            "plants",
            "vehicles",
            "cats",
            "dogs",
            "snakes",
            "birds",
            "ferns",
            "trees",
            "flowers",
            "vegetables",
            "cars",
            "buses",
            "planes",
            "rockets",
        }

        # TODO: This is somewhat brittle. Make these constants in trainer.py.
        train_early = "finishing training early!"
        validation_early = "finishing validation early!"
        train_complete = "completed its entire epoch (training)."
        validation_complete = "completed its entire epoch (validation)."

        # There are three shards, but only two workers, so the first worker will have to discard some data.
        with open(os.path.join(out_dir, "stdout_worker0.log")) as f:
            worker0_log = f.read()
            assert train_early in worker0_log
            assert validation_early in worker0_log
            assert train_complete not in worker0_log
            assert validation_complete not in worker0_log

        with open(os.path.join(out_dir, "stdout_worker1.log")) as f:
            worker1_log = f.read()
            assert train_early not in worker1_log
            assert validation_early not in worker1_log
            assert train_complete in worker1_log
            assert validation_complete in worker1_log
Пример #34
0
def main():
    # Read the SQuAD validation dataset using a word tokenizer
    single_id = SingleIdTokenIndexer(lowercase_tokens=True)
    reader = SquadReader(token_indexers={'tokens': single_id})
    dev_dataset = reader.read(
        'https://s3-us-west-2.amazonaws.com/allennlp/datasets/squad/squad-dev-v1.1.json'
    )
    # Load the model and its associated vocabulary.
    model = load_archive(
        'https://s3-us-west-2.amazonaws.com/allennlp/models/bidaf-glove-2019.05.09.tar.gz'
    ).model
    vocab = model.vocab
    model.eval().cuda()

    # filter to just certain `wh` questions
    who_questions_dev, what_questions_dev, where_questions_dev, when_questions_dev, \
        how_questions_dev, why_questions_dev, which_questions_dev, other_questions_dev = ([] for i in range(8))
    for item in dev_dataset:
        for word in item['question']:
            if word.text.lower() == 'who':
                who_questions_dev.append(item)
                break
            if word.text.lower() == 'what':
                what_questions_dev.append(item)
                break
            if word.text.lower() == 'where':
                where_questions_dev.append(item)
                break
            if word.text.lower() == 'when':
                when_questions_dev.append(item)
                break
            if word.text.lower() == 'how':
                how_questions_dev.append(item)
                break
            if word.text.lower() == 'why':
                why_questions_dev.append(item)
                break
            if word.text.lower() == 'which':
                which_questions_dev.append(item)
                break
            else:
                other_questions_dev.append(item)

    # Use batches to craft the universal perturbations
    universal_perturb_batch_size = 32
    iterator = BasicIterator(batch_size=universal_perturb_batch_size)
    iterator.index_with(vocab)

    # We register a gradient hook on the embeddings.
    utils.add_hooks(model)
    embedding_weight = utils.get_embedding_weight(
        model)  # save the word embedding matrix

    # Initialize the trigger. The first one is an initialization with all "the" tokens.
    # You can customize it. Make sure to set the fixed target answer and the question type.
    # The second (commented out below) is a trigger found by running the attack, as reported in our paper.
    trigger_init = "the the the the donald trump the the the the"
    target_answer = "donald trump"
    subsampled_dev_dataset = who_questions_dev  # universal attack on `who` questions
    # trigger_init = "why how ; known because : to kill american people ."
    # target_answer = "to kill american people"
    # subsampled_dev_dataset = why_questions_dev # universal attack on `why` questions

    # tokenizes the trigger, and finds the start/end span
    # make sure the trigger tokens are space separated
    trigger_token_ids = [
        vocab.get_token_index(t) for t in trigger_init.split(' ')
    ]
    span_start = trigger_init.split(' ').index(
        target_answer.split(' ')[0])  # start of target_answer
    span_end = trigger_init.split(' ').index(target_answer.split(' ')[-1])
    # we ignore replacement at the positions of the answer (answer is fixed)
    ignore_indices = [0]*(span_start) + \
        [1]*(span_end - span_start + 1) + [0]*(len(trigger_token_ids) - 1 - span_end)

    # if these parameters are bigger = better result, but slower
    num_candidates = 20
    beam_size = 5
    for _ in range(100):
        # Get targeted accuracy
        squad_utils.get_accuracy_squad(model, subsampled_dev_dataset, vocab,
                                       trigger_token_ids, target_answer,
                                       span_start, span_end)
        model.train()

        # Get the gradient for the appended tokens averaged over the batch.
        averaged_grad = squad_utils.get_average_grad_squad(
            model, vocab, trigger_token_ids, subsampled_dev_dataset,
            span_start, span_end)

        # Use an attack method to get the top candidates
        cand_trigger_token_ids = attacks.hotflip_attack(
            averaged_grad,
            embedding_weight,
            trigger_token_ids,
            num_candidates=num_candidates,
            increase_loss=False)

        # Query the model with the top candidates to find the best tokens.
        trigger_token_ids = squad_utils.get_best_candidates_squad(
            model, trigger_token_ids, cand_trigger_token_ids, vocab,
            subsampled_dev_dataset, beam_size, ignore_indices, span_start,
            span_end)
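As a worked check of the span bookkeeping above (not part of the original script), the default trigger and target answer give span_start = 4, span_end = 5, and a mask that protects exactly the two answer tokens:

tokens = "the the the the donald trump the the the the".split(' ')   # 10 tokens
span_start = tokens.index("donald")                                   # -> 4
span_end = tokens.index("trump")                                      # -> 5
ignore_indices = [0] * span_start + [1] * (span_end - span_start + 1) \
    + [0] * (len(tokens) - 1 - span_end)
assert ignore_indices == [0, 0, 0, 0, 1, 1, 0, 0, 0, 0]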
Пример #35
0
    def ensure_model_can_train_save_and_load(self,
                                             param_file: str,
                                             tolerance: float = 1e-4,
                                             cuda_device: int = -1):
        save_dir = os.path.join(self.TEST_DIR, "save_and_load_test")
        archive_file = os.path.join(save_dir, "model.tar.gz")
        model = train_model_from_file(param_file, save_dir)
        loaded_model = load_archive(archive_file, cuda_device=cuda_device).model
        state_keys = model.state_dict().keys()
        loaded_state_keys = loaded_model.state_dict().keys()
        assert state_keys == loaded_state_keys
        # First we make sure that the state dict (the parameters) are the same for both models.
        for key in state_keys:
            assert_allclose(model.state_dict()[key].cpu().numpy(),
                            loaded_model.state_dict()[key].cpu().numpy(),
                            err_msg=key)
        params = Params.from_file(param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])

        # Need to duplicate params because Iterator.from_params will consume.
        iterator_params = params['iterator']
        iterator_params2 = Params(copy.deepcopy(iterator_params.as_dict()))

        iterator = DataIterator.from_params(iterator_params)
        iterator2 = DataIterator.from_params(iterator_params2)

        # We'll check that even if we index the dataset with each model separately, we still get
        # the same result out.
        model_dataset = reader.read(params['validation_data_path'])
        iterator.index_with(model.vocab)
        model_batch = next(iterator(model_dataset, shuffle=False, cuda_device=cuda_device))

        loaded_dataset = reader.read(params['validation_data_path'])
        iterator2.index_with(loaded_model.vocab)
        loaded_batch = next(iterator2(loaded_dataset, shuffle=False, cuda_device=cuda_device))

        # Check that gradients are None for non-trainable parameters and that
        # trainable parameters receive a non-zero gradient.
        self.check_model_computes_gradients_correctly(model, model_batch)

        # The datasets themselves should be identical.
        assert model_batch.keys() == loaded_batch.keys()
        for key in model_batch.keys():
            self.assert_fields_equal(model_batch[key], loaded_batch[key], key, 1e-6)

        # Set eval mode, to turn off things like dropout, then get predictions.
        model.eval()
        loaded_model.eval()
        # Models with stateful RNNs need their states reset to have consistent
        # behavior after loading.
        for model_ in [model, loaded_model]:
            for module in model_.modules():
                if hasattr(module, 'stateful') and module.stateful:
                    module.reset_states()
        model_predictions = model(**model_batch)
        loaded_model_predictions = loaded_model(**loaded_batch)

        # Check loaded model's loss exists and we can compute gradients, for continuing training.
        loaded_model_loss = loaded_model_predictions["loss"]
        assert loaded_model_loss is not None
        loaded_model_loss.backward()

        # Both outputs should have the same keys and the values for these keys should be close.
        for key in model_predictions.keys():
            self.assert_fields_equal(model_predictions[key],
                                     loaded_model_predictions[key],
                                     name=key,
                                     tolerance=tolerance)

        return model, loaded_model
Пример #36
0
import argparse

from allenpipeline.Decoder import split_up
import allennlp.nn.util as util
from allennlp.common.util import import_submodules, prepare_environment
from allennlp.models import load_archive

if __name__ == "__main__":
    import_submodules("topdown_parser")
    from topdown_parser.dataset_readers.same_formalism_iterator import SameFormalismIterator

    optparser = argparse.ArgumentParser(
        add_help=True, description="Count trainable parameters.")

    optparser.add_argument('archive_file',
                           type=str,
                           help='the archived model to make predictions with')

    args = optparser.parse_args()

    archive = load_archive(args.archive_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model

    total_params = 0
    # for module in model.modules():
    #     params = module.parameters()
    #     print(module, sum(p.numel() for p in params if p.requires_grad))
    for p in model.parameters():
        if p.requires_grad:
            total_params += p.numel()

    print(round(total_params / 1_000_000, 2), "M", "parameters")