Example #1
def run_config(config):
    params = Params(json.loads(config))
    params_copy = params.duplicate()

    if 'dataset_reader' in params:
        reader = DatasetReader.from_params(params.pop('dataset_reader'))
    else:
        raise RuntimeError('`dataset_reader` section is required')

    all_instances = []
    if 'train_data_path' in params:
        print('Reading the training data...')
        train_data = reader.read(params.pop('train_data_path'))
        all_instances.extend(train_data)
    else:
        raise RuntimeError('`train_data_path` section is required')

    validation_data = None
    if 'validation_data_path' in params:
        print('Reading the validation data...')
        validation_data = reader.read(params.pop('validation_data_path'))
        all_instances.extend(validation_data)

    print('Building the vocabulary...')
    vocab = Vocabulary.from_instances(all_instances)

    model = None
    iterator = None
    if 'model' not in params:
        # 'dataset' mode — just preview the (first 10) instances
        print('Showing the first 10 instances:')
        for inst in all_instances[:10]:
            print(inst)
    else:
        model = Model.from_params(vocab=vocab, params=params.pop('model'))

        # `from_params` consumes the params it is given, so each loader needs
        # its own copy.
        loader_params = params.pop("data_loader")
        train_data_loader = DataLoader.from_params(dataset=train_data,
                                                   params=loader_params.duplicate())
        dev_data_loader = (DataLoader.from_params(dataset=validation_data,
                                                  params=loader_params)
                           if validation_data is not None else None)
        train_data.index_with(vocab)
        if validation_data is not None:
            validation_data.index_with(vocab)

        # set up a temporary, empty directory for serialization
        with tempfile.TemporaryDirectory() as serialization_dir:
            trainer = Trainer.from_params(
                model=model,
                serialization_dir=serialization_dir,
                data_loader=train_data_loader,
                validation_data_loader=dev_data_loader,
                params=params.pop('trainer'))
            trainer.train()

    return {
        'params': params_copy,
        'dataset_reader': reader,
        'vocab': vocab,
        'iterator': iterator,
        'model': model
    }
Example #2
def run_config(config):
    params = Params(json.loads(config))
    params_copy = params.duplicate()

    if "dataset_reader" in params:
        reader = DatasetReader.from_params(params.pop("dataset_reader"))
    else:
        raise RuntimeError("`dataset_reader` section is required")

    loader_params = params.pop("data_loader")
    train_data_loader = DataLoader.from_params(
        reader=reader,
        data_path=params.pop("train_data_path"),
        params=loader_params.duplicate(),
    )
    dev_data_loader = DataLoader.from_params(
        reader=reader,
        data_path=params.pop("validation_data_path"),
        params=loader_params,
    )

    print("Building the vocabulary...")
    vocab = Vocabulary.from_instances(train_data_loader.iter_instances())

    if "model" not in params:
        # 'dataset' mode — just preview the (first 10) instances
        print("Showing the first 10 instances:")
        for i, inst in enumerate(train_data_loader.iter_instances()):
            if i >= 10:
                break
            print(inst)
        return None

    model = Model.from_params(vocab=vocab, params=params.pop("model"))

    train_data_loader.index_with(vocab)
    dev_data_loader.index_with(vocab)

    # set up a temporary, empty directory for serialization
    with tempfile.TemporaryDirectory() as serialization_dir:
        trainer = Trainer.from_params(
            model=model,
            serialization_dir=serialization_dir,
            data_loader=train_data_loader,
            validation_data_loader=dev_data_loader,
            params=params.pop("trainer"),
        )
        trainer.train()

    return {
        "params": params_copy,
        "dataset_reader": reader,
        "vocab": vocab,
        "model": model,
    }
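
For reference, the configuration string that `run_config` expects can be assembled from the same sections the function pops (`dataset_reader`, `train_data_path`, `data_loader`, `model`, `trainer`). The sketch below mirrors the simple_tagger fixture config used in the tests further down; the reader type, paths, and sizes are illustrative placeholders.

import json

# A sketch of a config for run_config(); section names follow the keys the
# function pops above, and the paths/sizes are placeholder values.
config = json.dumps({
    "dataset_reader": {"type": "sequence_tagging"},
    "train_data_path": "data/sequence_tagging.tsv",
    "validation_data_path": "data/sequence_tagging.tsv",
    "data_loader": {"batch_size": 2},
    "model": {
        "type": "simple_tagger",
        "text_field_embedder": {
            "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
        },
        "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
    },
    "trainer": {"cuda_device": -1, "num_epochs": 2, "optimizer": "adam"},
})

result = run_config(config)
print(result["vocab"])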
Example #3
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements
    logging.getLogger("allennlp.common.params").disabled = True
    logging.getLogger("allennlp.nn.initializers").disabled = True
    logging.getLogger("allennlp.modules.token_embedders.embedding").setLevel(
        logging.INFO)

    # Load from archive
    archive = load_archive(
        args.archive_file,
        weights_file=args.weights_file,
        cuda_device=args.cuda_device,
        overrides=args.overrides,
    )
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop("validation_dataset_reader",
                                                  None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(
            validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(
            config.pop("dataset_reader"))
    evaluation_data_path = args.input_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    embedding_sources = (json.loads(args.embedding_sources_mapping)
                         if args.embedding_sources_mapping else {})

    if args.extend_vocab:
        logger.info("Vocabulary is being extended with test instances.")
        model.vocab.extend_from_instances(instances=instances)
        model.extend_embedder_vocab(embedding_sources)

    instances.index_with(model.vocab)
    data_loader_params = config.pop("validation_data_loader", None)
    if data_loader_params is None:
        data_loader_params = config.pop("data_loader")
    if args.batch_size:
        data_loader_params["batch_size"] = args.batch_size
    data_loader = DataLoader.from_params(dataset=instances,
                                         params=data_loader_params)

    metrics = evaluate(model, data_loader, args.cuda_device,
                       args.batch_weight_key)

    logger.info("Finished evaluating.")

    dump_metrics(args.output_file, metrics, log=True)

    return metrics
示例#4
0
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    common_logging.FILE_FRIENDLY_LOGGING = args.file_friendly_logging

    # Disable some of the more verbose logging statements
    logging.getLogger("allennlp.common.params").disabled = True
    logging.getLogger("allennlp.nn.initializers").disabled = True
    logging.getLogger("allennlp.modules.token_embedders.embedding").setLevel(
        logging.INFO)

    # Load from archive
    archive = load_archive(
        args.archive_file,
        weights_file=args.weights_file,
        cuda_device=args.cuda_device,
        overrides=args.overrides,
    )
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data

    dataset_reader = archive.validation_dataset_reader

    evaluation_data_path = args.input_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)

    data_loader_params = config.pop("validation_data_loader", None)
    if data_loader_params is None:
        data_loader_params = config.pop("data_loader")
    if args.batch_size:
        data_loader_params["batch_size"] = args.batch_size
    data_loader = DataLoader.from_params(params=data_loader_params,
                                         reader=dataset_reader,
                                         data_path=evaluation_data_path)

    embedding_sources = (json.loads(args.embedding_sources_mapping)
                         if args.embedding_sources_mapping else {})

    if args.extend_vocab:
        logger.info("Vocabulary is being extended with test instances.")
        model.vocab.extend_from_instances(
            instances=data_loader.iter_instances())
        model.extend_embedder_vocab(embedding_sources)

    data_loader.index_with(model.vocab)

    metrics = evaluate(
        model,
        data_loader,
        args.cuda_device,
        args.batch_weight_key,
        output_file=args.output_file,
        predictions_output_file=args.predictions_output_file,
    )

    logger.info("Finished evaluating.")

    return metrics
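
The snippets above read their settings from an `argparse.Namespace`. Below is a minimal sketch of invoking `evaluate_from_args` programmatically; the attribute names mirror the ones the function reads above, and the archive/input paths are placeholders.

import argparse

# A sketch only: the archive and input paths are placeholders.
args = argparse.Namespace(
    archive_file="model.tar.gz",
    weights_file=None,
    cuda_device=-1,
    overrides="",
    input_file="dev_data.jsonl",
    batch_size=None,
    batch_weight_key="",
    extend_vocab=False,
    embedding_sources_mapping="",
    output_file=None,
    predictions_output_file=None,
    file_friendly_logging=False,
)

metrics = evaluate_from_args(args)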
Example #5
    def setUp(self):
        super().setUp()
        param_file = self.FIXTURES_ROOT / "simple_tagger" / "experiment_with_regularization.json"
        self.set_up_model(param_file,
                          self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv")
        params = Params.from_file(param_file)
        self.reader = DatasetReader.from_params(params["dataset_reader"])
        self.data_loader = DataLoader.from_params(dataset=self.instances,
                                                  params=params["data_loader"])
        self.trainer = Trainer.from_params(
            model=self.model,
            data_loader=self.data_loader,
            serialization_dir=self.TEST_DIR,
            params=params.get("trainer"),
        )
Example #6
    def setUp(self):
        super().setUp()
        params = Params(
            {
                "model": {
                    "type": "simple_tagger",
                    "text_field_embedder": {
                        "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
                    },
                    "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
                },
                "dataset_reader": {"type": "sequence_tagging"},
                "train_data_path": str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"),
                "validation_data_path": str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"),
                "data_loader": {"batch_size": 2},
                "trainer": {"cuda_device": -1, "num_epochs": 2, "optimizer": "adam"},
            }
        )
        all_datasets = datasets_from_params(params)
        vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            instances=(instance for dataset in all_datasets.values() for instance in dataset),
        )
        model = Model.from_params(vocab=vocab, params=params.pop("model"))
        train_data = all_datasets["train"]
        train_data.index_with(vocab)

        data_loader = DataLoader.from_params(dataset=train_data, params=params.pop("data_loader"))
        trainer_params = params.pop("trainer")
        serialization_dir = os.path.join(self.TEST_DIR, "test_search_learning_rate")

        self.trainer = TrainerBase.from_params(
            model=model,
            serialization_dir=serialization_dir,
            data_loader=data_loader,
            train_data=train_data,
            params=trainer_params,
            validation_data=None,
            validation_iterator=None,
        )
Example #7
def benchmark_xlmr_mdl():

    from allennlp.data import DataLoader
    from allennlp.training.util import evaluate

    xlmr = load_xlmr_coref_model()
    data_loader_params = xlmr.config.pop("data_loader")

    instances = xlmr.dataset_reader.load_dataset(testset)
    instances.index_with(xlmr.model.vocab)
    data_loader = DataLoader.from_params(dataset=instances,
                                         params=data_loader_params)

    start = time.time()

    metrics = evaluate(xlmr.model, data_loader)

    print('**XLM-R model**')
    print_speed_performance(start, num_sentences, num_tokens)
    print('Precision : ', metrics['coref_precision'])
    print('Recall : ', metrics['coref_recall'])
    print('F1 : ', metrics['coref_f1'])
    print('Mention Recall : ', metrics['mention_recall'])
Example #8
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements
    logging.getLogger("allennlp.common.params").disabled = True
    logging.getLogger("allennlp.nn.initializers").disabled = True
    logging.getLogger("transformers.modeling_utils").disabled = True
    logging.getLogger("transformers.tokenization_utils").disabled = True
    logging.getLogger("transformers.configuration_utils").disabled = True
    logging.basicConfig(level=logging.INFO)

    # Load from archive
    archive = load_archive(
        args.archive_file,
        weights_file=args.weights_file,
        cuda_device=args.cuda_device,
        overrides=args.overrides,
    )
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop("validation_dataset_reader", None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(config.pop("dataset_reader"))
    evaluation_data_path = args.input_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    embedding_sources = (
        json.loads(args.embedding_sources_mapping) if args.embedding_sources_mapping else {}
    )

    if args.extend_vocab:
        logger.info("Vocabulary is being extended with test instances.")
        model.vocab.extend_from_instances(instances=instances)
        model.extend_embedder_vocab(embedding_sources)

    instances.index_with(model.vocab)
    data_loader_params = config.pop("validation_data_loader", None)
    if data_loader_params is None:
        data_loader_params = config.pop("data_loader")
    if args.batch_size:
        data_loader_params["batch_size"] = args.batch_size
    data_loader = DataLoader.from_params(dataset=instances, params=data_loader_params)

    if "iter_norm" in dir(model.text_field_embedder._token_embedders['tokens']):
        iter_num = model.text_field_embedder._token_embedders['tokens'].iter_norm
    else:
        iter_num = None

    if iter_num:
        # Obtain evaluation info for iterative normalization:
        iter_mean_eval = []
        for iter_norm_i in range(iter_num):
            logging.info("This is the {} time during iterative normalization for evaluation".format(iter_norm_i))
            mean, embeddings = get_iter_norm_mean_eval(model, data_loader, iter_mean_eval, args.cuda_device)
            logger.info("The degree of isotropy of vectors is {} ".format(degree_anisotropy(embeddings.t(), args.cuda_device)))
            iter_mean_eval.append(mean)

        model.text_field_embedder._token_embedders['tokens'].iter_norm = None 
        model.text_field_embedder._token_embedders['tokens']._matched_embedder.mean_emb_eval = iter_mean_eval
        model.text_field_embedder._token_embedders['tokens']._matched_embedder.is_train = False

    metrics = evaluate(model, data_loader, args.cuda_device, args.batch_weight_key)

    logger.info("Finished evaluating.")

    dump_metrics(args.output_file, metrics, log=True)

    return metrics
Example #9
    def ensure_model_can_train_save_and_load(
        self,
        param_file: Union[PathLike, str],
        tolerance: float = 1e-4,
        cuda_device: int = -1,
        gradients_to_ignore: Set[str] = None,
        overrides: str = "",
        metric_to_check: str = None,
        metric_terminal_value: float = None,
        metric_tolerance: float = 1e-4,
        disable_dropout: bool = True,
    ):
        save_dir = self.TEST_DIR / "save_and_load_test"
        archive_file = save_dir / "model.tar.gz"
        model = train_model_from_file(param_file, save_dir, overrides=overrides)
        metrics_file = save_dir / "metrics.json"
        if metric_to_check is not None:
            metrics = json.loads(metrics_file.read_text())
            metric_value = metrics.get(f"best_validation_{metric_to_check}") or metrics.get(
                f"training_{metric_to_check}"
            )
            assert metric_value is not None, f"Cannot find {metric_to_check} in metrics.json file"
            assert metric_terminal_value is not None, "Please specify metric terminal value"
            assert abs(metric_value - metric_terminal_value) < metric_tolerance
        loaded_model = load_archive(archive_file, cuda_device=cuda_device).model
        state_keys = model.state_dict().keys()
        loaded_state_keys = loaded_model.state_dict().keys()
        assert state_keys == loaded_state_keys
        for key in state_keys:
            assert_allclose(
                model.state_dict()[key].cpu().numpy(),
                loaded_model.state_dict()[key].cpu().numpy(),
                err_msg=key,
            )
        params = Params.from_file(param_file, params_overrides=overrides)
        reader = DatasetReader.from_params(params["dataset_reader"])

        print("Reading with original model")
        model_dataset = reader.read(params["validation_data_path"])

        print("Reading with loaded model")
        loaded_dataset = reader.read(params["validation_data_path"])

        data_loader_params = params["data_loader"]
        data_loader_params["shuffle"] = False
        data_loader_params2 = Params(copy.deepcopy(data_loader_params.as_dict()))

        data_loader2 = DataLoader.from_params(dataset=loaded_dataset, params=data_loader_params2)

        model_batch = next(iter(data_loader))

        loaded_batch = next(iter(data_loader2))

        self.check_model_computes_gradients_correctly(
            model, model_batch, gradients_to_ignore, disable_dropout
        )

        assert model_batch.keys() == loaded_batch.keys()
        for key in model_batch.keys():
            self.assert_fields_equal(model_batch[key], loaded_batch[key], key, 1e-6)

        # Set eval mode, to turn off things like dropout, then get predictions.
        model.eval()
        loaded_model.eval()
        for model_ in [model, loaded_model]:
            for module in model_.modules():
                if hasattr(module, "stateful") and module.stateful:
                    module.reset_states()
        print("Predicting with original model")
        model_predictions = model(**model_batch)
        print("Predicting with loaded model")
        loaded_model_predictions = loaded_model(**loaded_batch)

        for key in model_predictions.keys():
            self.assert_fields_equal(
                model_predictions[key], loaded_model_predictions[key], name=key, tolerance=tolerance
            )
Example #10
    def ensure_model_can_train_save_and_load(
        self,
        param_file: str,
        tolerance: float = 1e-4,
        cuda_device: int = -1,
        gradients_to_ignore: Set[str] = None,
        overrides: str = "",
        metric_to_check: str = None,
        metric_terminal_value: float = None,
        metric_tolerance: float = 1e-4,
        disable_dropout: bool = True,
    ):
        """
        # Parameters

        param_file : `str`
            Path to a training configuration file that we will use to train the model for this
            test.
        tolerance : `float`, optional (default=1e-4)
            When comparing model predictions between the originally-trained model and the model
            after saving and loading, we will use this tolerance value (passed as `rtol` to
            `numpy.testing.assert_allclose`).
        cuda_device : `int`, optional (default=-1)
            The device to run the test on.
        gradients_to_ignore : `Set[str]`, optional (default=None)
            This test runs a gradient check to make sure that we're actually computing gradients
            for all of the parameters in the model.  If you really want to ignore certain
            parameters when doing that check, you can pass their names here.  This is not
            recommended unless you're `really` sure you don't need to have non-zero gradients for
            those parameters (e.g., some of the beam search / state machine models have
            infrequently-used parameters that are hard to force the model to use in a small test).
        overrides : `str`, optional (default = "")
            A JSON string that we will use to override values in the input parameter file.
        metric_to_check: `str`, optional (default = None)
            We may want to automatically check that the model reaches a given metric when
            training (on the validation set, if one is specified). This can be useful in CI,
            for example. You can pass any metric that your model returns.
        metric_terminal_value: `float`, optional (default = None)
            When you set `metric_to_check`, you need to set the value this metric must
            converge to.
        metric_tolerance: `float`, optional (default=1e-4)
            Tolerance for checking your model's metric against `metric_terminal_value`. Some
            variance in model metrics can be expected when the training process is highly
            stochastic.
        disable_dropout : `bool`, optional (default = True)
            If True we will set all dropout to 0 before checking gradients. (Otherwise, with small
            datasets, you may get zero gradients because of unlucky dropout.)
        """
        save_dir = self.TEST_DIR / "save_and_load_test"
        archive_file = save_dir / "model.tar.gz"
        model = train_model_from_file(param_file,
                                      save_dir,
                                      overrides=overrides)
        metrics_file = save_dir / "metrics.json"
        if metric_to_check is not None:
            metrics = json.loads(metrics_file.read_text())
            metric_value = metrics.get(
                f"best_validation_{metric_to_check}") or metrics.get(
                    f"training_{metric_to_check}")
            assert metric_value is not None, f"Cannot find {metric_to_check} in metrics.json file"
            assert metric_terminal_value is not None, "Please specify metric terminal value"
            assert abs(metric_value - metric_terminal_value) < metric_tolerance
        loaded_model = load_archive(archive_file,
                                    cuda_device=cuda_device).model
        state_keys = model.state_dict().keys()
        loaded_state_keys = loaded_model.state_dict().keys()
        assert state_keys == loaded_state_keys
        # First we make sure that the state dict (the parameters) are the same for both models.
        for key in state_keys:
            assert_allclose(
                model.state_dict()[key].cpu().numpy(),
                loaded_model.state_dict()[key].cpu().numpy(),
                err_msg=key,
            )
        params = Params.from_file(param_file, params_overrides=overrides)
        reader = DatasetReader.from_params(params["dataset_reader"])

        print("Reading with original model")
        model_dataset = reader.read(params["validation_data_path"])
        model_dataset.index_with(model.vocab)

        print("Reading with loaded model")
        loaded_dataset = reader.read(params["validation_data_path"])
        loaded_dataset.index_with(loaded_model.vocab)

        # Need to duplicate params because DataLoader.from_params will consume.
        data_loader_params = params["data_loader"]
        data_loader_params["shuffle"] = False
        data_loader_params2 = Params(
            copy.deepcopy(data_loader_params.as_dict()))

        data_loader = DataLoader.from_params(dataset=model_dataset,
                                             params=data_loader_params)
        data_loader2 = DataLoader.from_params(dataset=loaded_dataset,
                                              params=data_loader_params2)

        # We'll check that even if we index the dataset with each model separately, we still get
        # the same result out.
        model_batch = next(iter(data_loader))

        loaded_batch = next(iter(data_loader2))

        # Check gradients are None for non-trainable parameters and check that
        # trainable parameters receive some gradient if they are trainable.
        self.check_model_computes_gradients_correctly(model, model_batch,
                                                      gradients_to_ignore,
                                                      disable_dropout)

        # The datasets themselves should be identical.
        assert model_batch.keys() == loaded_batch.keys()
        for key in model_batch.keys():
            self.assert_fields_equal(model_batch[key], loaded_batch[key], key,
                                     1e-6)

        # Set eval mode, to turn off things like dropout, then get predictions.
        model.eval()
        loaded_model.eval()
        # Models with stateful RNNs need their states reset to have consistent
        # behavior after loading.
        for model_ in [model, loaded_model]:
            for module in model_.modules():
                if hasattr(module, "stateful") and module.stateful:
                    module.reset_states()
        print("Predicting with original model")
        model_predictions = model(**model_batch)
        print("Predicting with loaded model")
        loaded_model_predictions = loaded_model(**loaded_batch)

        # Check loaded model's loss exists and we can compute gradients, for continuing training.
        loaded_model_loss = loaded_model_predictions["loss"]
        assert loaded_model_loss is not None
        loaded_model_loss.backward()

        # Both outputs should have the same keys and the values for these keys should be close.
        for key in model_predictions.keys():
            self.assert_fields_equal(model_predictions[key],
                                     loaded_model_predictions[key],
                                     name=key,
                                     tolerance=tolerance)

        return model, loaded_model
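
This helper is typically invoked from a `ModelTestCase` subclass after `set_up_model` has been called. A minimal sketch, assuming the standard `allennlp.common.testing.ModelTestCase` base class; the class name and fixture paths are placeholders.

from allennlp.common.testing import ModelTestCase


class SimpleTaggerTest(ModelTestCase):
    def setUp(self):
        super().setUp()
        # set_up_model stores the config path on self.param_file and reads the
        # fixture data; both paths here are placeholders.
        self.set_up_model(
            self.FIXTURES_ROOT / "simple_tagger" / "experiment.json",
            self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv",
        )

    def test_model_can_train_save_and_load(self):
        self.ensure_model_can_train_save_and_load(self.param_file)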
Example #11
def find_learning_rate_model(
    params: Params,
    serialization_dir: str,
    start_lr: float = 1e-5,
    end_lr: float = 10,
    num_batches: int = 100,
    linear_steps: bool = False,
    stopping_factor: float = None,
    force: bool = False,
) -> None:
    """
    Runs a learning rate search for the given `num_batches` and saves the results in `serialization_dir`.

    # Parameters

    params : `Params`
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : `str`
        The directory in which to save results.
    start_lr : `float`
        Learning rate at which to start the search.
    end_lr : `float`
        Learning rate up to which the search is done.
    num_batches : `int`
        Number of mini-batches to run the learning rate finder for.
    linear_steps : `bool`
        Increase the learning rate linearly if `True`, exponentially if `False`.
    stopping_factor : `float`
        Stop the search when the current loss exceeds the best loss recorded by a
        multiple of the stopping factor. If `None`, the search proceeds until `end_lr`.
    force : `bool`
        If True and the serialization directory already exists, everything in it will
        be removed prior to finding the learning rate.
    """
    create_serialization_dir(params,
                             serialization_dir,
                             recover=False,
                             force=force)

    prepare_environment(params)

    cuda_device = params.params.get("trainer").get("cuda_device", -1)
    check_for_gpu(cuda_device)
    distributed_params = params.params.get("distributed")
    # See https://github.com/allenai/allennlp/issues/3658
    assert not distributed_params, "find-lr is not compatible with DistributedDataParallel."

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation),
    )
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        instances=(instance for key, dataset in all_datasets.items()
                   for instance in dataset
                   if key in datasets_for_vocab_creation),
    )

    train_data = all_datasets["train"]
    train_data.index_with(vocab)
    model = Model.from_params(vocab=vocab, params=params.pop("model"))
    data_loader = DataLoader.from_params(dataset=train_data,
                                         params=params.pop("data_loader"))

    trainer_params = params.pop("trainer")

    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    trainer_choice = trainer_params.pop("type", "gradient_descent")
    if trainer_choice != "gradient_descent":
        raise ConfigurationError(
            "currently find-learning-rate only works with the GradientDescentTrainer"
        )
    trainer: GradientDescentTrainer = Trainer.from_params(  # type: ignore
        model=model,
        serialization_dir=serialization_dir,
        data_loader=data_loader,
        params=trainer_params,
    )

    logger.info(
        f"Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations."
    )
    learning_rates, losses = search_learning_rate(
        trainer,
        start_lr=start_lr,
        end_lr=end_lr,
        num_batches=num_batches,
        linear_steps=linear_steps,
        stopping_factor=stopping_factor,
    )
    logger.info("Finished learning rate search.")
    losses = _smooth(losses, 0.98)

    _save_plot(learning_rates, losses,
               os.path.join(serialization_dir, "lr-losses.png"))
Example #12
File: util.py  Project: jbrry/allennlp
def data_loaders_from_params(
    params: Params,
    train: bool = True,
    validation: bool = True,
    test: bool = True,
    serialization_dir: Optional[Union[str, PathLike]] = None,
) -> Dict[str, DataLoader]:
    """
    Instantiate data loaders specified by the config.
    """
    data_loaders: Dict[str, DataLoader] = {}

    train = train and ("train_data_path" in params)
    validation = validation and ("validation_data_path" in params)
    test = test and ("test_data_path" in params)
    if not any((train, validation, test)):
        # Return early so we don't unnecessarily initialize the train data reader.
        return data_loaders

    dataset_reader_params = params.pop("dataset_reader")
    dataset_reader = DatasetReader.from_params(
        dataset_reader_params, serialization_dir=serialization_dir)
    data_loader_params = params.pop("data_loader")

    if train:
        train_data_path = params.pop("train_data_path")
        logger.info("Reading training data from %s", train_data_path)
        data_loaders["train"] = DataLoader.from_params(
            data_loader_params.duplicate(),
            reader=dataset_reader,
            data_path=train_data_path)

    if not validation and not test:
        # Return early so we don't unnecessarily initialize the validation/test data
        # reader.
        return data_loaders

    validation_and_test_dataset_reader: DatasetReader = dataset_reader
    validation_dataset_reader_params = params.pop("validation_dataset_reader",
                                                  None)
    if validation_dataset_reader_params is not None:
        logger.info(
            "Using a separate dataset reader to load validation and test data."
        )
        validation_and_test_dataset_reader = DatasetReader.from_params(
            validation_dataset_reader_params,
            serialization_dir=serialization_dir)

    validation_data_loader_params = params.pop("validation_data_loader",
                                               data_loader_params)

    if validation:
        validation_data_path = params.pop("validation_data_path")
        logger.info("Reading validation data from %s", validation_data_path)
        data_loaders["validation"] = DataLoader.from_params(
            validation_data_loader_params.duplicate(),
            reader=validation_and_test_dataset_reader,
            data_path=validation_data_path,
        )

    if test:
        test_data_path = params.pop("test_data_path")
        logger.info("Reading test data from %s", test_data_path)
        data_loaders["test"] = DataLoader.from_params(
            validation_data_loader_params,
            reader=validation_and_test_dataset_reader,
            data_path=test_data_path,
        )

    return data_loaders
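
A sketch of calling `data_loaders_from_params` with an in-memory `Params` object; the section names match the keys the function pops, while the reader type and data paths are placeholders.

# A sketch only: reader type and paths are placeholders.
params = Params({
    "dataset_reader": {"type": "sequence_tagging"},
    "train_data_path": "data/train.tsv",
    "validation_data_path": "data/dev.tsv",
    "data_loader": {"batch_size": 8},
})
loaders = data_loaders_from_params(params)
train_loader = loaders["train"]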
Example #13
    tempdir = tempfile.mkdtemp()
    with tarfile.open(resolved_archive_file, "r:gz") as archive:
        archive.extractall(tempdir)
    atexit.register(_cleanup_archive_dir, tempdir)
    serialization_dir = tempdir

config = Params.from_file(os.path.join(serialization_dir, "config.json"), "")
model = SemanticRoleLabeler.from_archive(args.archive_file)
archive = Archive(model=model, config=config)

prepare_environment(config)
model.eval()
validation_dataset_reader_params = config.pop("validation_dataset_reader",
                                              None)
if validation_dataset_reader_params is not None:
    dataset_reader = DatasetReader.from_params(
        validation_dataset_reader_params)
else:
    dataset_reader = DatasetReader.from_params(config.pop("dataset_reader"))
instances = dataset_reader.read(args.evaluation_data_path)
instances.index_with(model.vocab)
data_loader_params = config.pop("validation_data_loader", None)
if data_loader_params is None:
    data_loader_params = config.pop("data_loader")

data_loader = DataLoader.from_params(dataset=instances,
                                     params=data_loader_params)

metrics = evaluate(model, data_loader, -1, "")
dump_metrics(args.output_file, metrics)
Example #14
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    common_logging.FILE_FRIENDLY_LOGGING = args.file_friendly_logging

    # Disable some of the more verbose logging statements
    logging.getLogger("allennlp.common.params").disabled = True
    logging.getLogger("allennlp.nn.initializers").disabled = True
    logging.getLogger("allennlp.modules.token_embedders.embedding").setLevel(
        logging.INFO)

    # Load from archive
    archive = load_archive(
        args.archive_file,
        weights_file=args.weights_file,
        cuda_device=args.cuda_device,
        overrides=args.overrides,
    )
    config = deepcopy(archive.config)
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data
    dataset_reader = archive.validation_dataset_reader

    # split files
    evaluation_data_path_list = args.input_file.split(":")
    if args.output_file is not None:
        output_file_list = args.output_file.split(":")
        assert len(output_file_list) == len(
            evaluation_data_path_list
        ), "The number of `output_file` paths must be equal to the number of datasets being evaluated."
    if args.predictions_output_file is not None:
        predictions_output_file_list = args.predictions_output_file.split(":")
        assert len(predictions_output_file_list) == len(
            evaluation_data_path_list), (
                "The number of `predictions_output_file` paths must be equal" +
                "to the number of datasets being evaluated. ")

    # output file
    output_file_path = None
    predictions_output_file_path = None

    # embedding sources
    if args.extend_vocab:
        logger.info("Vocabulary is being extended with embedding sources.")
        embedding_sources = (json.loads(args.embedding_sources_mapping)
                             if args.embedding_sources_mapping else {})

    for index in range(len(evaluation_data_path_list)):
        config = deepcopy(archive.config)
        evaluation_data_path = evaluation_data_path_list[index]
        if args.output_file is not None:
            output_file_path = output_file_list[index]
        if args.predictions_output_file is not None:
            predictions_output_file_path = predictions_output_file_list[index]

        logger.info("Reading evaluation data from %s", evaluation_data_path)
        data_loader_params = config.get("validation_data_loader", None)
        if data_loader_params is None:
            data_loader_params = config.get("data_loader")
        if args.batch_size:
            data_loader_params["batch_size"] = args.batch_size
        data_loader = DataLoader.from_params(params=data_loader_params,
                                             reader=dataset_reader,
                                             data_path=evaluation_data_path)

        if args.extend_vocab:
            logger.info("Vocabulary is being extended with test instances.")
            model.vocab.extend_from_instances(
                instances=data_loader.iter_instances())
            model.extend_embedder_vocab(embedding_sources)

        data_loader.index_with(model.vocab)

        metrics = evaluate(
            model,
            data_loader,
            args.cuda_device,
            args.batch_weight_key,
            output_file=output_file_path,
            predictions_output_file=predictions_output_file_path,
        )
    logger.info("Finished evaluating.")

    return metrics
Example #15
    def test_model(self):
        xlmr_model = load_xlmr_coref_model()

        doc = [["Lotte", "arbejder", "med", "Mads", "."],
               ["Hun", "er", "tandlæge", "."]]

        # prediction
        preds = xlmr_model.predict(doc)

        self.assertEqual(preds['top_spans'], [[0, 0], [1, 3], [5, 5]])
        self.assertEqual(preds['antecedent_indices'],
                         [[0, 1, 2], [0, 1, 2], [0, 1, 2]])
        self.assertEqual(preds['predicted_antecedents'], [-1, -1, 0])
        self.assertEqual(preds['clusters'], [[[0, 0], [5, 5]]])

        # evaluation
        data_loader_params = xlmr_model.config.pop("data_loader")

        from collections import OrderedDict
        sentences = [[
            OrderedDict([('id', 1), ('form', 'Lotte'), ('lemma', 'Lotte'),
                         ('upos', 'PROPN'), ('coref_rel', '(1086)'),
                         ('doc_id', '1'), ('qid', '-')]),
            OrderedDict([('id', 2), ('form', 'arbejder'), ('lemma', 'arbejde'),
                         ('upos', 'VERB'), ('coref_rel', '-'), ('doc_id', '1'),
                         ('qid', '-')]),
            OrderedDict([('id', 3), ('form', 'med'), ('lemma', 'med'),
                         ('upos', 'ADV'), ('coref_rel', '-'), ('doc_id', '1'),
                         ('qid', '-')]),
            OrderedDict([('id', 4), ('form', 'Mads'), ('lemma', 'Mads'),
                         ('upos', 'PROPN'), ('coref_rel', '(902)'),
                         ('doc_id', '1'), ('qid', '-')]),
            OrderedDict([('id', 5), ('form', '.'), ('lemma', '.'),
                         ('upos', 'PUNCT'), ('coref_rel', '-'),
                         ('doc_id', '1'), ('qid', '-')])
        ],
                     [
                         OrderedDict([('id', 1), ('form', 'Hun'),
                                      ('lemma', 'hun'), ('upos', 'PRON'),
                                      ('coref_rel', '(1086)'), ('doc_id', '1'),
                                      ('qid', '-')]),
                         OrderedDict([('id', 2), ('form', 'er'),
                                      ('lemma', 'vær'), ('upos', 'VERB'),
                                      ('coref_rel', '-'), ('doc_id', '1'),
                                      ('qid', '-')]),
                         OrderedDict([('id', 3), ('form', 'tandlæge'),
                                      ('lemma', 'tandlæge'), ('upos', 'NOUN'),
                                      ('coref_rel', '-'), ('doc_id', '1'),
                                      ('qid', '-')]),
                         OrderedDict([('id', 5), ('form', '.'), ('lemma', '.'),
                                      ('upos', 'PUNCT'), ('coref_rel', '-'),
                                      ('doc_id', '1'), ('qid', '-')])
                     ]]

        instances = xlmr_model.dataset_reader.load_dataset(sentences)
        instances.index_with(xlmr_model.model.vocab)
        data_loader = DataLoader.from_params(dataset=instances,
                                             params=data_loader_params)

        metrics = evaluate(xlmr_model.model, data_loader)

        self.assertEqual(metrics['coref_precision'], 1.0)
Example #16
import logging

from allennlp.common.params import Params
from allennlp.data import DataLoader, DatasetReader

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("cache_image_features.py")

CONFIG = "./vilbert_vqa_from_huggingface.jsonnet"

logger.info("Reading params")
params = Params.from_file(CONFIG)

logger.info("Instantiating validation dataset reader and data loader")
validation_reader = DatasetReader.from_params(
    params["validation_dataset_reader"])
validation_data_loader = DataLoader.from_params(
    params["data_loader"].duplicate(),
    reader=validation_reader,
    data_path=params["validation_data_path"],
)

for instance in validation_data_loader.iter_instances():
    pass

del validation_data_loader

logger.info("Instantiating train dataset reader and data loader")
train_reader = DatasetReader.from_params(params["dataset_reader"])
data_loader = DataLoader.from_params(
    params["data_loader"].duplicate(),
    reader=train_reader,
    data_path=params["train_data_path"],
)