def from_params(params: Params, serialization_dir: str, recover: bool = False) -> 'TrainerPieces':
    all_datasets = training_util.datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    if recover and os.path.exists(os.path.join(serialization_dir, "vocabulary")):
        vocab = Vocabulary.from_files(os.path.join(serialization_dir, "vocabulary"))
        params.pop("vocabulary", {})
    else:
        vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             for instance in dataset
             if key in datasets_for_vocab_creation)
        )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(model.vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(model.vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    return TrainerPieces(model, iterator, train_data, validation_data, test_data,
                         validation_iterator, trainer_params)
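# A minimal usage sketch for the factory above, assuming from_params is exposed
# as a staticmethod of the TrainerPieces named tuple (as in AllenNLP). The
# config path and serialization directory are hypothetical; the tuple is
# unpacked positionally to match the return statement.
params = Params.from_file("experiments/my_experiment.json")  # hypothetical path
pieces = TrainerPieces.from_params(params, serialization_dir="output/run1")
(model, iterator, train_data, validation_data,
 test_data, validation_iterator, trainer_params) = pieces
trainer = Trainer.from_params(model, "output/run1", iterator,
                              train_data, validation_data, trainer_params)
metrics = trainer.train()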
def train(model_path, train_path, val_path, seed, vocabulary_path=None, config_path=None):
    assert os.path.isdir(model_path), "Model directory does not exist"
    set_seed(seed)

    config_path = config_path or os.path.join(model_path, "config.json")
    assert os.path.isfile(config_path), "Config file does not exist"
    params = Params.from_file(config_path)

    vocabulary_path = vocabulary_path or os.path.join(model_path, "vocabulary")
    assert os.path.exists(vocabulary_path), "Vocabulary is not ready, do not forget to run preprocess.py first"
    vocabulary = Vocabulary.from_files(vocabulary_path)

    reader_params = params.duplicate().pop("reader", default=Params({}))
    reader = DatasetReader.from_params(reader_params)
    train_dataset = reader.read(train_path)
    val_dataset = reader.read(val_path) if val_path else None

    model_params = params.pop("model")
    model = Model.from_params(model_params, vocab=vocabulary)
    print(model)
    print("Trainable params count: ",
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    iterator = DataIterator.from_params(params.pop('iterator'))
    iterator.index_with(vocabulary)

    trainer = Trainer.from_params(model, model_path, iterator,
                                  train_dataset, val_dataset, params.pop('trainer'))
    trainer.train()
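# A hedged example of calling train() above, assuming the layout the function
# falls back to: a config.json and a vocabulary/ directory inside model_path
# (the vocabulary produced by a prior preprocess.py run). All paths are
# hypothetical.
train(model_path="models/headline_generator",
      train_path="data/train.jsonl",
      val_path="data/val.jsonl",
      seed=42)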
def predict_json(self, _: JsonDict, cuda_device: int = -1) -> JsonDict:
    parameter_filename = 'allennlp/seq2seq.json'
    serialization_dir = 'retrained'
    subprocess.check_call(['mkdir', '-p', serialization_dir])
    params = Params.from_file(parameter_filename)

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(self._model.vocab)

    parameters = [[n, p] for n, p in self._model.named_parameters() if p.requires_grad]
    trainer_params = params.pop('trainer')
    optimizer = Optimizer.from_params(parameters, trainer_params.pop("optimizer"))

    all_datasets = datasets_from_params(params)
    train_data = all_datasets['train']

    trainer = SimpleTrainer(self._model, optimizer, train_data, iterator)
    interpreter = Interpreter(self._model, self._dataset_reader, trainer)
    while True:
        try:
            interpreter.cmdloop()
        except Exception as e:
            print(e)
            traceback.print_exc()
            print('Restarting interpreter cmdloop.')
def train_model(train_fp: Path, dev_fp: Path, model_fp: Path,
                vocab_data_fps: Optional[List[Path]] = None) -> Tuple[Model, Params]:
    '''
    :param train_fp: The training dataset file path
    :param dev_fp: The development dataset file path
    :param model_fp: The JSON file that describes the model
    :param vocab_data_fps: An optional list of additional dataset files that
                           will be used to create the model's vocab
    :returns: A tuple containing the trained model and an object that
              describes the model.
    '''
    set_random_env()
    model_params = Params.from_file(model_fp)
    emotion_dataset_reader = DatasetReader.from_params(
        model_params.pop('dataset_reader'))

    # Data
    train_dataset = emotion_dataset_reader.read(cached_path(str(train_fp)))
    dev_dataset = emotion_dataset_reader.read(cached_path(str(dev_fp)))
    vocab_datasets = [train_dataset, dev_dataset]
    if vocab_data_fps:
        for vocab_data_fp in vocab_data_fps:
            vocab_datasets.append(
                emotion_dataset_reader.read(cached_path(str(vocab_data_fp))))
    vocab_data = []
    for vocab_dataset in vocab_datasets:
        vocab_data.extend(vocab_dataset)
    vocab = Vocabulary.from_instances(vocab_data)

    emotion_model = Model.from_params(vocab=vocab, params=model_params.pop('model'))
    data_iter = DataIterator.from_params(model_params.pop('iterator'))
    data_iter.index_with(vocab)

    # Trainer
    with tempfile.TemporaryDirectory() as serial_dir:
        trainer_params = model_params.pop('trainer')
        trainer = Trainer.from_params(model=emotion_model,
                                      serialization_dir=serial_dir,
                                      iterator=data_iter,
                                      train_data=train_dataset,
                                      validation_data=dev_dataset,
                                      params=trainer_params)
        _ = trainer.train()

        temp_config_fp = str(Path(serial_dir, CONFIG_NAME).resolve())
        Params.from_file(model_fp).to_file(temp_config_fp)
        vocab.save_to_files(Path(serial_dir, "vocabulary").resolve())
        archive_model(serial_dir, files_to_archive=model_params.files_to_archive)
        model_archive = load_archive(serial_dir, cuda_device=0)
        return model_archive.model, model_archive.config
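# A usage sketch for train_model() above; the paths are hypothetical. Note
# that load_archive(serial_dir, cuda_device=0) inside the function assumes a
# GPU is available.
emotion_model, emotion_config = train_model(
    Path("data/train.json"),
    Path("data/dev.json"),
    Path("configs/emotion_model.json"))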
def _test_model(self, file_name):
    params = self.params[file_name].duplicate()
    reader_params = params.duplicate().pop("reader", default=Params({}))
    if reader_params["type"] == "cnn_dailymail":
        reader_params["cnn_tokenized_dir"] = TEST_STORIES_DIR
        dataset_file = TEST_URLS_FILE
    elif reader_params["type"] == "ria":
        dataset_file = RIA_EXAMPLE_FILE
    else:
        assert False

    reader = DatasetReader.from_params(reader_params)
    tokenizer = reader._tokenizer
    dataset = reader.read(dataset_file)

    vocabulary_params = params.pop("vocabulary", default=Params({}))
    vocabulary = Vocabulary.from_params(vocabulary_params, instances=dataset)

    model_params = params.pop("model")
    model = Model.from_params(model_params, vocab=vocabulary)
    print(model)
    print("Trainable params count: ",
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    iterator = DataIterator.from_params(params.pop('iterator'))
    iterator.index_with(vocabulary)
    trainer = Trainer.from_params(model, None, iterator,
                                  dataset, None, params.pop('trainer'))
    trainer.train()

    model.eval()
    predictor = Seq2SeqPredictor(model, reader)
    for article, reference_sents in reader.parse_set(dataset_file):
        ref_words = [token.text for token in tokenizer.tokenize(reference_sents)]
        decoded_words = predictor.predict(article)["predicted_tokens"]
        self.assertGreaterEqual(len(decoded_words), len(ref_words))
        unk_count = 0
        while DEFAULT_OOV_TOKEN in decoded_words:
            unk_index = decoded_words.index(DEFAULT_OOV_TOKEN)
            decoded_words.pop(unk_index)
            unk_count += 1
            if unk_index < len(ref_words):
                ref_words.pop(unk_index)
        self.assertLess(unk_count, 5)
        self.assertListEqual(decoded_words[:len(ref_words)], ref_words)
def train(self, train_file_name: str, train_params: Params,
          serialization_dir: str = None, valid_file_name: str = None):
    assert os.path.exists(train_file_name)
    assert not valid_file_name or os.path.exists(valid_file_name)

    train_dataset = self.reader.read(train_file_name)
    valid_dataset = self.reader.read(valid_file_name) if valid_file_name else None

    iterator = DataIterator.from_params(train_params.pop('iterator'))
    iterator.index_with(self.vocab)

    trainer = Trainer.from_params(self.model, serialization_dir, iterator,
                                  train_dataset, valid_dataset,
                                  train_params.pop('trainer'))
    train_params.assert_empty("Trainer")
    return trainer.train()
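# A sketch of driving the train() method above, assuming a wrapper object
# (here called `tagger`, hypothetical) that already carries self.reader,
# self.vocab and self.model. The params hold exactly the two keys the method
# pops, so the final assert_empty check passes.
train_params = Params({
    "iterator": {"type": "basic", "batch_size": 32},
    "trainer": {"num_epochs": 10, "optimizer": "adam"},
})
metrics = tagger.train("data/train.txt", train_params,
                       serialization_dir="runs/exp1",
                       valid_file_name="data/valid.txt")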
def train_model(params, serialization_dir, file_friendly_logging=False,
                recover=False, model="bidaf"):
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the
        middle of a run.  For continuing training a model on new data, see the ``fine-tune``
        command.
    """
    print("Starting training models...")
    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    all_datasets = datasets_from_params(params)
    print("Loaded all of the datasets.")
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    print("Creating vocabulary...")
    logger.info("Creating a vocabulary using %s data.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset
         if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    if model == "self":
        model = BiDAFSelfAttention.from_params(vocab, params.pop("model"))
    else:
        model = BidirectionalAttentionFlow.from_params(vocab, params.pop("model"))
    print("Initialized a BiDAF model.")
    # This is for debugging.
    print(model)
    print(serialization_dir)

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    print("Created the iterator.")

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    print("Initializing a trainer...")
    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
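# A hedged invocation of the BiDAF train_model() variant above, showing the
# `model` switch; the config path is hypothetical.
params = Params.from_file("configs/bidaf_self_attention.json")
model = train_model(params, "output/bidaf_self",
                    file_friendly_logging=True,
                    model="self")  # any other value builds the plain BiDAF model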
def train_model(params: Params,
                serialization_dir: str,
                results_fn: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Tuple[Model, Dict[str, Any]]:
    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover, force)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset
         if key in datasets_for_vocab_creation))

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None
    held_out_iterator_params = params.pop("held_out_iterator", None)
    if held_out_iterator_params:
        held_out_iterator = DataIterator.from_params(held_out_iterator_params)
        held_out_iterator.index_with(vocab)
    else:
        held_out_iterator = None

    train_data = all_datasets['train']
    held_out_train_data = all_datasets.get('held_out_train')
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer_choice = trainer_params.pop_choice("type",
                                               Trainer.list_available(),
                                               default_to_first_choice=True)
    trainer = Trainer.by_name(trainer_choice).from_params(
        model=model,
        serialization_dir=serialization_dir,
        iterator=iterator,
        train_data=train_data,
        held_out_train_data=held_out_train_data,
        validation_data=validation_data,
        params=trainer_params,
        validation_iterator=validation_iterator,
        held_out_iterator=held_out_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            best_model, test_data, validation_iterator or iterator,
            cuda_device=trainer._cuda_devices[0]  # pylint: disable=protected-access
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    dump_metrics(os.path.join(results_dir, results_fn), metrics, log=True)

    return best_model, metrics
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the
        middle of a run.  For continuing training a model on new data, see the ``fine-tune``
        command.
    """
    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(params.pop("vocabulary", {}),
                                   (instance for key, dataset in all_datasets.items()
                                    for instance in dataset
                                    if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
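# For reference, a minimal config exercising the keys this train_model() pops:
# dataset_reader and the data paths (consumed by datasets_from_params), then
# vocabulary, model, iterator, and trainer. The reader/model sub-configs are
# illustrative, not a tested experiment.
params = Params({
    "dataset_reader": {"type": "sequence_tagging"},
    "train_data_path": "data/train.tsv",
    "validation_data_path": "data/dev.tsv",
    "model": {
        "type": "simple_tagger",
        "text_field_embedder": {"tokens": {"type": "embedding", "embedding_dim": 50}},
        "encoder": {"type": "lstm", "input_size": 50, "hidden_size": 100},
    },
    "iterator": {"type": "basic", "batch_size": 32},
    "trainer": {"num_epochs": 40, "optimizer": "adam"},
})
model = train_model(params, serialization_dir="output/tagger")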
def fine_tune_model(model: Model,
                    params: Params,
                    serialization_dir: str,
                    file_friendly_logging: bool = False) -> Model:
    """
    Fine tunes the given model, using a set of parameters that is largely identical to those used
    for :func:`~allennlp.commands.train.train_model`, except that the ``model`` section is
    ignored, if it is present (as we are already given a ``Model`` here).

    The main difference between the logic done here and the logic done in ``train_model`` is that
    here we do not worry about vocabulary construction or creating the model object.  Everything
    else is the same.

    Parameters
    ----------
    model : ``Model``
        A model to fine-tune, typically loaded from a saved archive that is the result of running
        the ``train`` command.
    params : ``Params``
        A parameter object specifying the fine-tuning experiment, including the training data path
        and, optionally, the validation data path.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    """
    prepare_environment(params)
    os.makedirs(serialization_dir)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    if params.pop('model', None):
        logger.warning("You passed parameters for the model in your configuration file, but we "
                       "are ignoring them, using instead the model parameters in the archive.")

    vocabulary_params = params.pop('vocabulary', {})
    if vocabulary_params.get('directory_path', None):
        logger.warning("You passed `directory_path` in parameters for the vocabulary in "
                       "your configuration file, but it will be ignored. "
                       "Vocabulary from the saved model will be extended with current data.")

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Extending model vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
    vocab = model.vocab
    vocab.extend_from_instances(vocabulary_params,
                                (instance for key, dataset in all_datasets.items()
                                 for instance in dataset
                                 if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Fine-tuning interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
def train_model(params: Params, serialization_dir: str, selector: str, num_ensemble_models: Optional[int], file_friendly_logging: bool = False, recover: bool = False, force: bool = False) -> Model: """ Trains the model specified in the given :class:`Params` object, using the data and training parameters also specified in that object, and saves the results in ``serialization_dir``. Parameters ---------- params : ``Params`` A parameter object specifying an AllenNLP Experiment. serialization_dir : ``str`` The directory in which to save results and logs. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. recover : ``bool``, optional (default=False) If ``True``, we will try to recover a training run from an existing serialization directory. This is only intended for use when something actually crashed during the middle of a run. For continuing training a model on new data, see the ``fine-tune`` command. Returns ------- best_model: ``Model`` The model with the best epoch weights. """ prepare_environment(params) create_serialization_dir(params, serialization_dir, recover, force) prepare_global_logging(serialization_dir, file_friendly_logging) cuda_device = params.params.get('trainer').get('cuda_device', -1) if isinstance(cuda_device, list): for device in cuda_device: check_for_gpu(device) else: check_for_gpu(cuda_device) params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) all_datasets = datasets_from_params(params) datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info("From dataset instances, %s will be considered for vocabulary creation.", ", ".join(datasets_for_vocab_creation)) vocab = Vocabulary.from_params( params.pop("vocabulary", {}), (instance for key, dataset in all_datasets.items() for instance in dataset if key in datasets_for_vocab_creation) ) model_params = params.pop('model') if selector == 'qbc': assert num_ensemble_models is not None models_list = [Model.from_params(vocab=vocab, params=model_params.duplicate()) for i in range(num_ensemble_models)] ensemble_model = CorefEnsemble(models_list) model = ensemble_model.submodels[0] else: model = Model.from_params(vocab=vocab, params=model_params) ensemble_model = None # Initializing the model can have side effect of expanding the vocabulary vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(vocab) validation_iterator_params = params.pop("validation_iterator", None) if validation_iterator_params: validation_iterator = DataIterator.from_params(validation_iterator_params) validation_iterator.index_with(vocab) else: validation_iterator = None held_out_iterator_params = params.pop("held_out_iterator", None) if held_out_iterator_params: held_out_iterator = DataIterator.from_params(held_out_iterator_params) held_out_iterator.index_with(vocab) else: held_out_iterator = None train_data = all_datasets['train'] held_out_train_data = all_datasets.get('held_out_train') validation_data = all_datasets.get('validation') test_data = all_datasets.get('test') trainer_params = params.pop("trainer") no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if 
any(re.search(regex, name) for regex in no_grad_regexes): parameter.requires_grad_(False) frozen_parameter_names, tunable_parameter_names = \ get_frozen_and_tunable_parameter_names(model) logger.info("Following parameters are Frozen (without gradient):") for name in frozen_parameter_names: logger.info(name) logger.info("Following parameters are Tunable (with gradient):") for name in tunable_parameter_names: logger.info(name) trainer_choice = trainer_params.pop("type") trainer = ALCorefTrainer.by_name(trainer_choice).from_params(model=model, serialization_dir=serialization_dir, iterator=iterator, train_data=train_data, held_out_train_data=held_out_train_data, validation_data=validation_data, params=trainer_params, validation_iterator=validation_iterator, held_out_iterator=held_out_iterator, ensemble_model=ensemble_model) evaluate_on_test = params.pop_bool("evaluate_on_test", False) params.assert_empty('base train command') try: metrics, query_info = trainer.train() except KeyboardInterrupt: # if we have completed an epoch, try to create a model archive. if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info("Training interrupted by the user. Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise # Now tar up results archive_model(serialization_dir, files_to_archive=params.files_to_archive) best_model = None logger.info("Loading the best epoch weights.") best_model_state_path = os.path.join(serialization_dir, 'best.th') best_model_state = torch.load(best_model_state_path) best_model = model best_model.load_state_dict(best_model_state) if test_data and evaluate_on_test: logger.info("The model will be evaluated using the best epoch weights.") test_metrics = evaluate( best_model, test_data, validation_iterator or iterator, cuda_device=trainer._cuda_devices[0], batch_weight_key="", ) for key, value in test_metrics.items(): metrics["test_" + key] = value return best_model, metrics, query_info
def train_seq_models_reuse_iterator(base_filename, output_dir_base, gpu):
    processed_filenames = {}
    base_filename_prefix = base_filename[:base_filename.rfind('.')]
    filename_expression = (base_filename_prefix + '*' +
                           base_filename[base_filename.rfind('.'):])
    if os.path.isfile(base_filename):
        params_to_pull_iterator_from = Params.from_file(base_filename, "")
        params_for_copying = Params.from_file(base_filename, "")
    else:
        filename_to_pull = get_config_filenames_matching(filename_expression)[0]
        params_to_pull_iterator_from = Params.from_file(filename_to_pull, "")
        params_for_copying = Params.from_file(filename_to_pull, "")

    all_datasets = datasets_from_params(params_to_pull_iterator_from)
    datasets_for_vocab_creation = set(
        params_to_pull_iterator_from.pop("datasets_for_vocab_creation", all_datasets))
    vocab = Vocabulary.from_params(
        params_to_pull_iterator_from.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset
         if key in datasets_for_vocab_creation))

    iterator = DataIterator.from_params(params_to_pull_iterator_from.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params_to_pull_iterator_from.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    params_to_copy = ["validation_iterator", "vocabulary", "iterator",
                      "dataset_reader", "datasets_for_vocab_creation",
                      "train_data_path", "validation_data_path"]
    copied_param_vals = [(param_name, params_for_copying.pop(param_name, None))
                         for param_name in params_to_copy]

    while True:
        config_filenames = get_config_filenames_matching(filename_expression)
        cur_config_filename = None
        for config_filename in config_filenames:
            if config_filename not in processed_filenames:
                processed_filenames[config_filename] = 0
                cur_config_filename = config_filename
                break
        if cur_config_filename is None:
            break

        edit_config_file_to_have_gpu(cur_config_filename, gpu)
        progressing_ok = True
        try:
            params = Params.from_file(cur_config_filename, "")
        except Exception:
            print("Could not properly read params from " + cur_config_filename + "; skipping.")
            progressing_ok = False

        if progressing_ok:
            try:
                for param_tup in copied_param_vals:
                    modify_param(params, param_tup[0], param_tup[1])
            except Exception:
                print("Something went wrong while modifying params in " + cur_config_filename)
                progressing_ok = False

        if progressing_ok:
            print("Starting to train model from " + cur_config_filename)

        if progressing_ok:
            try:
                cur_config_filename = cur_config_filename[:cur_config_filename.rfind('.')]
                last_letters_to_take = len(cur_config_filename) - len(base_filename_prefix)
                if last_letters_to_take > 0:
                    tag_to_append_to_dir = cur_config_filename[(-1 * last_letters_to_take):]
                else:
                    tag_to_append_to_dir = ''
                serialization_dir = output_dir_base + tag_to_append_to_dir
            except Exception:
                progressing_ok = False
                print("Could not properly assemble a serialization directory")

        if progressing_ok:
            try:
                train_model_given_params_and_iterators(params, serialization_dir,
                                                       iterator, validation_iterator,
                                                       vocab, all_datasets, params_to_copy)
            except Exception:
                progressing_ok = False
                print("Training model failed for some reason; skipping to next model.")

    print("Done processing all config files.")
def from_params(params: Params, serialization_dir: str, recover: bool = False) -> 'TrainerPieces':
    # all_datasets = datasets_from_params(params)
    corpus = Corpus.from_params(params.pop('corpus'))

    # datasets_for_vocab_creation = set(params.pop(
    #     "datasets_for_vocab_creation", all_datasets))
    # for dataset in datasets_for_vocab_creation:
    #     if dataset not in all_datasets:
    #         raise ConfigurationError(
    #             f"invalid 'dataset_for_vocab_creation' {dataset}")
    # logger.info("From dataset instances, %s will be considered for vocabulary creation.",
    #             ", ".join(datasets_for_vocab_creation))

    seed = params.pop_int("seed", 5678)

    vocab_params = params.pop("vocabulary", {})
    vocab_type = vocab_params.get("type", "default")
    if vocab_type == 'default' and os.path.exists(
            os.path.join(serialization_dir, "vocabulary")):
        vocab = Vocabulary.from_files(
            os.path.join(serialization_dir, "vocabulary"))
    elif vocab_type == 'empty':
        vocab = Vocabulary()
    else:
        seed_environment(seed)
        vocab = Vocabulary.from_params(vocab_params, corpus.train)

    # Need to reset the seed. Otherwise loading existing vocab and creating
    # vocab from scratch will lead to different behavior.
    seed_environment(seed)

    # contextualizer_params = params.pop('contextualizer')
    # contextualizer = Seq2SeqDecoder.from_params(
    #     vocab=vocab, params=contextualizer_params)
    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # If vocab extension is ON for training, embedding extension should also be
    # done. If vocab and embeddings are already in sync, it would be a no-op.
    model.extend_embedder_vocab()

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(model.vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(model.vocab)
    else:
        validation_iterator = None

    # train_data = all_datasets['train']
    # validation_data = all_datasets.get('validation')
    # test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    batch_weight_key = params.pop('batch_weight_key', '')

    return TrainerPieces(model, iterator, corpus, validation_iterator,
                         batch_weight_key, trainer_params)
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the
        middle of a run.  For continuing training a model on new data, see the ``fine-tune``
        command.
    """
    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover)

    # TODO(mattg): pull this block out into a separate function (maybe just add this to
    # `prepare_environment`?)
    Tqdm.set_slower_interval(file_friendly_logging)
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"),  # type: ignore
                           sys.stdout,
                           file_friendly_logging)
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),  # type: ignore
                           sys.stderr,
                           file_friendly_logging)
    handler = logging.FileHandler(os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset
         if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    metrics = trainer.train()

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
# files = ['./Final Task/Test/SemEval2017-task3-English-test-input.xml']
files = ['./Final Task/dev/SemEval2016-Task3-CQA-QL-dev.xml']
attn = "_cos_hyper"
calculate_map = True
write_file = False
Wfile_name = "out.txt"

import_submodules(library)
model_config = "config/%s_eval.jsonnet" % model_name
overrides = json.dumps({"trainer": {"cuda_device": cuda_device}})
params = Params.from_file(model_config, overrides)
model_file = 'checkpoint/%s%s/' % (model_name, attn)

iterator = DataIterator.from_params(params.pop("iterator"))

torch.manual_seed(0)
numpy.random.seed(0)

if write_file:
    wf = Write_outfile(Wfile_name)

print("Loading vocabulary")
vocab = Vocabulary.from_files(model_file + 'vocabulary')
print('Initializing model')
model = Model.from_params(vocab=vocab, params=params.pop('model'))
print("Loading Model file from %s" % (model_file + 'best.th'))
with open(model_file + 'best.th', 'rb') as f:
    model.load_state_dict(torch.load(f, encoding='utf-8'))
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the
        middle of a run.  For continuing training a model on new data, see the ``fine-tune``
        command.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover, force)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset
         if key in datasets_for_vocab_creation)
    )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer_choice = trainer_params.pop_choice("type",
                                               Trainer.list_available(),
                                               default_to_first_choice=True)
    trainer = Trainer.by_name(trainer_choice).from_params(
        model=model,
        serialization_dir=serialization_dir,
        iterator=iterator,
        train_data=train_data,
        validation_data=validation_data,
        params=trainer_params,
        validation_iterator=validation_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            best_model, test_data, validation_iterator or iterator,
            cuda_device=trainer._cuda_devices[0]  # pylint: disable=protected-access
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True)

    return best_model
def test_registry_has_builtin_iterators(self):
    assert DataIterator.by_name('adaptive').__name__ == 'AdaptiveIterator'
    assert DataIterator.by_name('basic').__name__ == 'BasicIterator'
    assert DataIterator.by_name('bucket').__name__ == 'BucketIterator'
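# The registry this test checks is extensible. A minimal sketch of adding a
# custom iterator under a new name, assuming the AllenNLP 0.x
# DataIterator.register decorator and the BasicIterator base class.
@DataIterator.register("my_custom")
class MyCustomIterator(BasicIterator):
    """Behaves exactly like BasicIterator; registered for illustration only."""

assert DataIterator.by_name("my_custom").__name__ == "MyCustomIterator"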
     for instance in dataset
     if key in datasets_for_vocab_creation)
)

model = Model.from_params(vocab=vocab, params=params.pop('model'))

# Initializing the model can have side effect of expanding the vocabulary
vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

selector = Selector.from_params(vocab, params.pop('selector'))

iterator = DataIterator.from_params(params.pop("iterator"))
iterator.index_with(vocab)
validation_iterator_params = params.pop("validation_iterator", None)
if validation_iterator_params:
    validation_iterator = DataIterator.from_params(validation_iterator_params)
    validation_iterator.index_with(vocab)
else:
    validation_iterator = None

train_data = all_datasets['train']
validation_data = all_datasets.get('validation')
test_data = all_datasets.get('test')

trainer_params = params.pop("trainer")
trainer = Trainer.from_params(model,
def fine_tune_model(model: Model,
                    params: Params,
                    serialization_dir: str,
                    file_friendly_logging: bool = False) -> Model:
    """
    Fine tunes the given model, using a set of parameters that is largely identical to those used
    for :func:`~allennlp.commands.train.train_model`, except that the ``model`` section is
    ignored, if it is present (as we are already given a ``Model`` here).

    The main difference between the logic done here and the logic done in ``train_model`` is that
    here we do not worry about vocabulary construction or creating the model object.  Everything
    else is the same.

    Parameters
    ----------
    model : ``Model``
        A model to fine-tune, typically loaded from a saved archive that is the result of running
        the ``train`` command.
    params : ``Params``
        A parameter object specifying the fine-tuning experiment, including the training data path
        and, optionally, the validation data path.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    """
    prepare_environment(params)
    os.makedirs(serialization_dir)

    # TODO(mattg): pull this block out into a separate function (maybe just add this to
    # `prepare_environment`?)
    Tqdm.set_slower_interval(file_friendly_logging)
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"),  # type: ignore
                           sys.stdout,
                           file_friendly_logging)
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),  # type: ignore
                           sys.stderr,
                           file_friendly_logging)
    handler = logging.FileHandler(os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    if params.pop('model', None):
        logger.warning("You passed parameters for the model in your configuration file, but we "
                       "are ignoring them, using instead the model parameters in the archive.")

    if params.pop('vocabulary', None):
        logger.warning("You passed parameters for the vocabulary in your configuration file, but "
                       "we are ignoring them, using instead the vocabulary from the saved model.")

    vocab = model.vocab
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    all_datasets = datasets_from_params(params)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    metrics = trainer.train()

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
def __init__(self,
             params: Params,
             serialization_dir: str,
             recover: bool = False,
             cache_directory: Optional[str] = None,
             cache_prefix: Optional[str] = None):
    self.original_params = params.duplicate()
    self.main_serialization_dir = serialization_dir
    if recover or cache_directory or cache_prefix:
        raise NotImplementedError(
            f'Currently do not support `recover` {recover}, '
            f'`cache_directory` {cache_directory}, or '
            f'`cache_prefix` {cache_prefix}')

    task_order = params.get('trainer').pop('task_order')
    main_task = params.get('trainer').pop('main_task')
    self.task_order = task_order
    if main_task != task_order[-1]:
        raise ConfigurationError(
            f'main task {main_task} with `trainer` has'
            ' to be equal to the last task in the '
            f'`task_order` {task_order}')
    logger.warning(f"Main task {main_task}")
    logger.warning("Training tasks each epoch in the following order:")
    for task_index, task in enumerate(task_order):
        logger.warning(f'{task_index}: {task}')

    # Get shared iterator
    shared_values = params.pop('shared_values')
    iterator = DataIterator.from_params(shared_values.pop("iterator"))
    self.iterator = iterator

    # Get dataset information
    # task name: dataset split name: data
    all_task_data: Dict[str, Dict[str, List[Instance]]] = {}
    self.task_params = {}
    for task in task_order:
        logger.warning(f'Loading dataset for {task}')
        task_data = {}
        task_params = params.get(task)
        self.task_params[task] = task_params.duplicate()
        dataset_reader = DatasetReader.from_params(task_params.pop('dataset_reader'))
        task_data['train'] = dataset_reader.read(task_params.pop('train_data_path'))
        task_data['validation'] = dataset_reader.read(task_params.pop('validation_data_path'))
        task_data['test'] = dataset_reader.read(task_params.pop('test_data_path'))
        all_task_data[task] = task_data

    # Create the vocab
    logger.warning('Creating Vocab from all task data')
    all_instances = [data for task_data in all_task_data.values()
                     for data in task_data.values()]
    all_instances = list(itertools.chain.from_iterable(all_instances))
    vocab = Vocabulary.from_instances(all_instances)
    iterator.index_with(vocab)
    logger.warning('Iterator indexed')

    # Shared model parameters
    text_embedder = None
    if 'text_field_embedder' in shared_values:
        logger.warning('Creating shared text embedder')
        text_embedder_params = shared_values.pop('text_field_embedder')
        text_embedder = TextFieldEmbedder.from_params(params=text_embedder_params,
                                                      vocab=vocab)

    shared_encoder = None
    if 'shared_encoder' in shared_values:
        logger.warning('Creating shared Sequence Encoder')
        shared_encoder_params = shared_values.pop('shared_encoder')
        shared_encoder = Seq2SeqEncoder.from_params(params=shared_encoder_params)

    # Creating task specific models
    task_models: Dict[str, SharedCrfTagger] = {}
    for task in task_order:
        logger.warning(f'Creating shared model for task {task}')
        task_params = params.get(task)
        task_model_params = task_params.pop('model')
        task_text_embedder = None
        if text_embedder is not None:
            task_text_embedder = text_embedder
        if task_text_embedder is not None and "text_field_embedder" in task_model_params:
            raise ConfigurationError('Cannot have a shared text field '
                                     'embedder and a task specific one')
        if shared_encoder is not None and "shared_encoder" in task_model_params:
            raise ConfigurationError('Cannot have a shared encoder in shared_values '
                                     'and a task specific shared encoder')
        if "text_field_embedder" in task_model_params:
            task_text_embedder_params = task_model_params.pop('text_field_embedder')
            task_text_embedder = TextFieldEmbedder.from_params(
                params=task_text_embedder_params, vocab=vocab)
        if task_model_params.pop('type') != 'shared_crf_tagger':
            raise ConfigurationError('The SharedCRF tagger model is the '
                                     f'only supported model. Error task {task}')
        task_models[task] = SharedCrfTagger.from_params(
            vocab=vocab,
            text_field_embedder=task_text_embedder,
            shared_encoder=shared_encoder,
            params=task_model_params)

    # Task specific trainers
    task_trainers: Dict[str, Trainer] = {}
    self.task_serialization_dir = {}
    for task in task_order:
        logger.warning(f'Creating {task} trainer')
        task_serialization_dir = str(Path(serialization_dir, task))
        self.task_serialization_dir[task] = task_serialization_dir
        logger.warning(f'Task {task} serialization directory: {task_serialization_dir}')
        task_trainer_params = params.get(task).pop('trainer')
        task_train_data = all_task_data[task]['train']
        task_validation_data = all_task_data[task]['validation']
        task_model = task_models[task]
        task_trainers[task] = Trainer.from_params(
            params=task_trainer_params,
            model=task_model,
            serialization_dir=task_serialization_dir,
            iterator=iterator,
            validation_data=task_validation_data,
            train_data=task_train_data)

    # Getting task specific evaluation data
    self.task_cuda_evaluation = {}
    self.auxiliary_task_validation_data = {}
    self.all_task_test_data = {}
    for task in task_order:
        # If the task config does not say otherwise, evaluation on CUDA device 0 is assumed.
        self.task_cuda_evaluation[task] = 0
        if 'evaluate' in params.get(task):
            if 'cuda_device' in params.get(task).get('evaluate'):
                is_cuda = params.get(task).pop('evaluate')['cuda_device']
                self.task_cuda_evaluation[task] = is_cuda
        if task != main_task:
            self.auxiliary_task_validation_data[task] = all_task_data[task]['validation']
        self.all_task_test_data[task] = all_task_data[task]['test']

    # Remove all of the tasks from the params
    for task in task_order:
        params.pop(task)
    params.pop('trainer')
    params.assert_empty('MultiTaskTrainer')
    self.task_trainers = task_trainers
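# The config shape this multi-task __init__ consumes, inferred from the keys
# it pops; illustrative only, not a tested config:
#
# {
#   "trainer": {"task_order": ["ner", "pos"], "main_task": "pos"},  # main task last
#   "shared_values": {
#     "iterator": {...},                # required, shared across tasks
#     "text_field_embedder": {...},     # optional, shared text embedder
#     "shared_encoder": {...}           # optional, shared Seq2SeqEncoder
#   },
#   "ner": {
#     "dataset_reader": {...},
#     "train_data_path": "...", "validation_data_path": "...", "test_data_path": "...",
#     "model": {"type": "shared_crf_tagger", ...},  # the only supported type
#     "trainer": {...},
#     "evaluate": {"cuda_device": -1}   # optional; defaults to CUDA device 0
#   },
#   "pos": { ... same shape as "ner" ... }
# }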
def train_model(params: Params, serialization_dir: str) -> Model: """ This function can be used as an entry point to running models in AllenNLP directly from a JSON specification using a :class:`Driver`. Note that if you care about reproducibility, you should avoid running any code that uses PyTorch or numpy random number generation before you import and use this function, since these libraries rely on random seeds that are set here via the JSON specification file. Note that this function performs training and will also evaluate the trained model on development and test sets if provided in the parameter JSON. Parameters ---------- params: Params, required. A parameter object specifying an AllenNLP Experiment. serialization_dir: str, required The directory in which to save results and logs. """ prepare_environment(params) os.makedirs(serialization_dir, exist_ok=True) sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"), sys.stdout) # type: ignore
sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"), sys.stderr) # type: ignore
handler = logging.FileHandler(os.path.join(serialization_dir, "python_logging.log")) handler.setLevel(logging.INFO) handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s')) logging.getLogger().addHandler(handler) serialization_params = deepcopy(params).as_dict(quiet=True) with open(os.path.join(serialization_dir, "model_params.json"), "w") as param_file: json.dump(serialization_params, param_file, indent=4)
# Now we begin assembling the required parts for the Trainer.
dataset_reader = DatasetReader.from_params(params.pop('dataset_reader')) train_data_path = params.pop('train_data_path') logger.info("Reading training data from %s", train_data_path) train_data = dataset_reader.read(train_data_path) all_datasets: List[Dataset] = [train_data] datasets_in_vocab = ["train"] validation_data_path = params.pop('validation_data_path', None) if validation_data_path is not None: logger.info("Reading validation data from %s", validation_data_path) validation_data = dataset_reader.read(validation_data_path) all_datasets.append(validation_data) datasets_in_vocab.append("validation") else: validation_data = None test_data_path = params.pop("test_data_path", None) if test_data_path is not None: logger.info("Reading test data from %s", test_data_path) test_data = dataset_reader.read(test_data_path) all_datasets.append(test_data) datasets_in_vocab.append("test") else: test_data = None logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_in_vocab)) vocab = Vocabulary.from_params(params.pop("vocabulary", {}), Dataset([instance for dataset in all_datasets for instance in dataset.instances])) vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) model = Model.from_params(vocab, params.pop('model')) iterator = DataIterator.from_params(params.pop("iterator")) train_data.index_instances(vocab) if validation_data: validation_data.index_instances(vocab) trainer_params = params.pop("trainer") trainer = Trainer.from_params(model, serialization_dir, iterator, train_data, validation_data, trainer_params) evaluate_on_test = params.pop("evaluate_on_test", False) params.assert_empty('base train command') trainer.train()
# Now tar up results
archive_model(serialization_dir) if test_data and evaluate_on_test: test_data.index_instances(vocab) evaluate(model, test_data, iterator, cuda_device=trainer._cuda_device) # pylint: disable=protected-access
elif test_data: logger.info("To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") return model
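# Minimal usage sketch for the train_model above. The config path and the
# serialization directory are illustrative placeholders, not from the source.
from allennlp.common.params import Params

params = Params.from_file("experiment.json")
model = train_model(params, serialization_dir="runs/exp1")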
def train_model(params: Params, serialization_dir: str, file_friendly_logging: bool = False, recover: bool = False) -> Model: """ Trains the model specified in the given :class:`Params` object, using the data and training parameters also specified in that object, and saves the results in ``serialization_dir``. Parameters ---------- params : ``Params`` A parameter object specifying an AllenNLP Experiment. serialization_dir : ``str`` The directory in which to save results and logs. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. recover : ``bool``, optional (default=False) If ``True``, we will try to recover a training run from an existing serialization directory. This is only intended for use when something actually crashed during the middle of a run. For continuing training a model on new data, see the ``fine-tune`` command. Returns ------- best_model: ``Model`` The model with the best epoch weights. """ prepare_environment(params) create_serialization_dir(params, serialization_dir, recover) prepare_global_logging(serialization_dir, file_friendly_logging) check_for_gpu(params.get('trainer').get('cuda_device', -1)) params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) all_datasets, all_datasets_aux, all_datasets_aux2 = datasets_from_params(params) datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets)) datasets_for_vocab_creation_aux = set(params.pop("auxiliary_datasets_for_vocab_creation", all_datasets_aux)) datasets_for_vocab_creation_aux2 = set(params.pop("auxiliary_datasets_for_vocab_creation_2", all_datasets_aux2)) mixing_ratio = params.pop_float("mixing_ratio") mixing_ratio2 = params.pop_float("mixing_ratio2") cutoff_epoch = params.pop("cutoff_epoch", -1) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info("From dataset instances, %s will be considered for vocabulary creation.", ", ".join(datasets_for_vocab_creation)) vocab_instances_aux = [ instance for key, dataset in all_datasets_aux.items() for instance in dataset if key in datasets_for_vocab_creation_aux ] vocab_instances_aux.extend([ instance for key, dataset in all_datasets_aux2.items() for instance in dataset if key in datasets_for_vocab_creation_aux2 ]) vocab = VocabularyMultitask.from_params( params.pop("vocabulary", {}), (instance for key, dataset in all_datasets.items() for instance in dataset if key in datasets_for_vocab_creation), instances_aux=vocab_instances_aux ) model = Model.from_params(vocab=vocab, params=params.pop('model')) # Initializing the model can have side effect of expanding the vocabulary vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(vocab) iterator_aux = DataIterator.from_params(params.pop("iterator_aux")) iterator_aux.index_with(vocab) iterator_aux2 = DataIterator.from_params(params.pop("iterator_aux2")) iterator_aux2.index_with(vocab) validation_iterator_params = params.pop("validation_iterator", None) if validation_iterator_params: validation_iterator = DataIterator.from_params(validation_iterator_params) validation_iterator.index_with(vocab) else: validation_iterator = None # TODO: if validation in multi-task need to add validation iterator as above train_data = all_datasets.get('train') 
validation_data = all_datasets.get('validation') test_data = all_datasets.get('test') train_data_aux = all_datasets_aux.get('train_aux') validation_data_aux = all_datasets_aux.get('validation_aux') test_data_aux = all_datasets_aux.get('test_aux') train_data_aux2 = all_datasets_aux2.get('train_aux') validation_data_aux2 = all_datasets_aux2.get('validation_aux') test_data_aux2 = all_datasets_aux2.get('test_aux') trainer_params = params.pop("trainer") no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad_regexes): parameter.requires_grad_(False) frozen_parameter_names, tunable_parameter_names = \ get_frozen_and_tunable_parameter_names(model) logger.info("Following parameters are Frozen (without gradient):") for name in frozen_parameter_names: logger.info(name) logger.info("Following parameters are Tunable (with gradient):") for name in tunable_parameter_names: logger.info(name) trainer = MultiTaskTrainer2.from_params(model=model, serialization_dir=serialization_dir, iterator=iterator, iterator_aux=iterator_aux, iterator_aux2=iterator_aux2, train_data=train_data, train_data_aux=train_data_aux, train_data_aux2=train_data_aux2, mixing_ratio=mixing_ratio, mixing_ratio2=mixing_ratio2, cutoff_epoch=cutoff_epoch, validation_data_aux=validation_data_aux, validation_data_aux2=validation_data_aux2, validation_data=validation_data, params=trainer_params, validation_iterator=validation_iterator) evaluate_on_test = params.pop_bool("evaluate_on_test", False) evaluate_aux_on_test = params.pop_bool("evaluate_aux_on_test", False) params.assert_empty('base train command') try: metrics = trainer.train() except KeyboardInterrupt:
# if we have completed an epoch, try to create a model archive.
if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info("Training interrupted by the user. Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise
# Now tar up results
archive_model(serialization_dir, files_to_archive=params.files_to_archive) logger.info("Loading the best epoch weights.") best_model_state_path = os.path.join(serialization_dir, 'best.th') best_model_state = torch.load(best_model_state_path) best_model = model best_model.load_state_dict(best_model_state) if test_data and evaluate_on_test: logger.info("The model will be evaluated using the best epoch weights.") test_metrics = evaluate( best_model, test_data, validation_iterator or iterator, cuda_device=trainer._cuda_devices[0] # pylint: disable=protected-access
) for key, value in test_metrics.items(): metrics["test_" + key] = value elif test_data: logger.info("To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") if test_data_aux and evaluate_aux_on_test:
# for instance in test_data_aux: instance.index_fields(vocab)
# for instance in test_data_aux2: instance.index_fields(vocab)
test_metrics_aux = evaluate(best_model, test_data_aux, iterator_aux, cuda_device=trainer._cuda_devices[0]) # pylint: disable=protected-access
test_metrics_aux2 = evaluate(best_model, test_data_aux2, iterator_aux2, cuda_device=trainer._cuda_devices[0]) # pylint: disable=protected-access
for key, value in test_metrics_aux.items(): metrics["test_aux_" + key] = value for key, value in test_metrics_aux2.items(): metrics["test_aux2_" + key] = value elif test_data_aux: logger.info("To evaluate on the auxiliary test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True) return best_model
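# Illustrative sketch of one common reading of a mixing ratio in multi-task
# training: for each main-task batch, interleave an auxiliary batch with
# probability `mixing_ratio` (for ratios <= 1). All names here are
# hypothetical; the actual scheduling lives inside MultiTaskTrainer2 and is
# not shown in the source.
import random
from itertools import cycle

def interleave_batches(main_batches, aux_batches, mixing_ratio):
    aux_iter = cycle(aux_batches)  # reuse auxiliary batches as needed
    for batch in main_batches:
        yield "main", batch
        if random.random() < mixing_ratio:
            yield "aux", next(aux_iter)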
def modified_train_model(serialization_dir, training_config_filename, cuda_device=-1, file_friendly_logging: bool = False) -> Model: """ Function not currently in use. This is from back when I was trying to keep each successive addition to the model's training in the same serialization directory. Trains the model specified in the given :class:`Params` object, using the data and training parameters also specified in that object, and saves the results in ``serialization_dir``. Parameters ---------- serialization_dir : ``str`` The directory in which to save results and logs. training_config_filename : ``str`` The training configuration file from which the model and parameters are loaded. cuda_device : ``int``, optional (default=-1) The device on which to load the model. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. Returns ------- best_model: ``Model`` The model with the best epoch weights. """ model, params, prev_optimizer_params, cur_optimizer_params = \ load_model_from_serialization_dir(serialization_dir, training_config_filename, cuda_device=cuda_device) prepare_environment(params) prepare_global_logging(serialization_dir, file_friendly_logging) cuda_device = params.params.get('trainer').get('cuda_device', -1) if isinstance(cuda_device, list): for device in cuda_device: check_for_gpu(device) else: check_for_gpu(cuda_device) all_datasets = datasets_from_params(params) datasets_for_vocab_creation = set( params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError( f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info( "From dataset instances, %s will be considered for vocabulary creation.", ", ".join(datasets_for_vocab_creation)) vocab = Vocabulary.from_params( params.pop("vocabulary", {}), (instance for key, dataset in all_datasets.items() for instance in dataset if key in datasets_for_vocab_creation)) params.pop('model') iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(vocab) validation_iterator_params = params.pop("validation_iterator", None) if validation_iterator_params: validation_iterator = DataIterator.from_params( validation_iterator_params) validation_iterator.index_with(vocab) else: validation_iterator = None train_data = all_datasets['train'] validation_data = all_datasets.get('validation') test_data = all_datasets.get('test') trainer_params = params.pop("trainer") no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad_regexes): parameter.requires_grad_(False) list_of_cur_optimizer_param_keys = list(cur_optimizer_params.as_flat_dict().keys()) list_of_prev_optimizer_param_keys = list(prev_optimizer_params.as_flat_dict().keys()) optimizer_params_match = set(list_of_cur_optimizer_param_keys) == set(list_of_prev_optimizer_param_keys) if not optimizer_params_match:
# a list of each p is what will be passed to the optimizer constructor while constructing Trainer --
# adjust if necessary (i.e., if we changed optimizers)
model_params = [[n, p] for n, p in model.named_parameters() if p.requires_grad] assert "parameter_groups" not in list_of_cur_optimizer_param_keys, \ "Current way of dealing with optimizer change doesn't take parameter groups into account" assert "parameter_groups" not in list_of_prev_optimizer_param_keys, \ "Current way of dealing with optimizer change doesn't take parameter groups into account" for param_tup in model_params:
# modify the second element of param_tup in-place (it's a dict) to match the keys specified in
# cur_optimizer_params
param_dict = param_tup[1] keys_to_del = [] keys_already_in_dict = [] try: for key in param_dict.keys(): if key not in list_of_cur_optimizer_param_keys: keys_to_del.append(key) else: keys_already_in_dict.append(key) for key in keys_to_del: del param_dict[key] for key_to_have in list_of_cur_optimizer_param_keys: if key_to_have != "type" and key_to_have not in keys_already_in_dict: param_dict[key_to_have] = cur_optimizer_params.get( key_to_have) except Exception: pass trainer = Trainer.from_params(model=model, serialization_dir=serialization_dir, iterator=iterator, train_data=train_data, validation_data=validation_data, params=trainer_params, validation_iterator=validation_iterator) evaluate_on_test = params.pop_bool("evaluate_on_test", False) params.assert_empty('base train command') try: metrics = trainer.train() except KeyboardInterrupt:
# if we have completed an epoch, try to create a model archive.
if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info( "Training interrupted by the user. Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise
# Now tar up results
archive_model(serialization_dir, files_to_archive=params.files_to_archive) logger.info("Loading the best epoch weights.") best_model_state_path = os.path.join(serialization_dir, 'best.th') best_model_state = torch.load(best_model_state_path) best_model = model best_model.load_state_dict(best_model_state) if test_data and evaluate_on_test: logger.info( "The model will be evaluated using the best epoch weights.") test_metrics = evaluate( best_model, test_data, validation_iterator or iterator, cuda_device=trainer._cuda_devices[0] # pylint: disable=protected-access
) for key, value in test_metrics.items(): metrics["test_" + key] = value elif test_data: logger.info( "To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True) return best_model
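# The "load best weights" pattern used above, isolated for reuse. A sketch:
# the best.th layout follows the serialization convention in this file, and
# map_location="cpu" is an added safety assumption, not from the source.
import os
import torch

def load_best_weights(model, serialization_dir):
    state = torch.load(os.path.join(serialization_dir, "best.th"),
                       map_location="cpu")
    model.load_state_dict(state)
    return model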
def train_model(params: Params, serialization_dir: str, cuda_device: int, train_data_path: str, validation_data_path: str, test_data_path: str, file_friendly_logging: bool = False) -> Model: """ This function can be used as an entry point to running models in AllenNLP directly from a JSON specification using a :class:`Driver`. Note that if you care about reproducibility, you should avoid running any code that uses PyTorch or numpy random number generation before you import and use this function, since these libraries rely on random seeds that are set here via the JSON specification file. Note that this function performs training and will also evaluate the trained model on development and test sets if provided in the parameter JSON. Parameters ---------- params: Params, required. A parameter object specifying an AllenNLP Experiment. serialization_dir: str, required The directory in which to save results and logs. """ prepare_environment(params) os.makedirs(serialization_dir, exist_ok=True) sys.stdout = TeeLogger( os.path.join(serialization_dir, "stdout.log"), # type: ignore
sys.stdout, file_friendly_logging) sys.stderr = TeeLogger( os.path.join(serialization_dir, "stderr.log"), # type: ignore
sys.stderr, file_friendly_logging) handler = logging.FileHandler( os.path.join(serialization_dir, "python_logging.log")) handler.setLevel(logging.INFO) handler.setFormatter( logging.Formatter( '%(asctime)s - %(levelname)s - %(name)s - %(message)s')) logging.getLogger().addHandler(handler) serialization_params = deepcopy(params).as_dict(quiet=True) with open(os.path.join(serialization_dir, "model_params.json"), "w") as param_file: json.dump(serialization_params, param_file, indent=4)
# all_datasets = datasets_from_params(params)
all_datasets = datasets_from_args(params, train_data_path, validation_data_path, test_data_path) datasets_for_vocab_creation = set( params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError( f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation)) vocab = Vocabulary.from_params( params.pop("vocabulary", {}), (instance for key, dataset in all_datasets.items() for instance in dataset if key in datasets_for_vocab_creation)) vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) model = Model.from_params(vocab, params.pop('model')) if cuda_device >= 0: model = model.cuda(cuda_device)
# iterator = DataIterator.from_params(params.pop("iterator"))
# iterator.index_with(vocab)
train_iterator = DataIterator.from_params(params.pop("train_iterator")) val_iterator = DataIterator.from_params(params.pop("val_iterator")) train_iterator.index_with(vocab) val_iterator.index_with(vocab) train_data = all_datasets['train'] validation_data = all_datasets.get('validation') test_data = all_datasets.get('test') trainer_params = params.pop("trainer") trainer = Trainer.from_params(model, serialization_dir, train_iterator, val_iterator, cuda_device, train_data, validation_data, trainer_params) evaluate_on_test = params.pop_bool("evaluate_on_test", False)
# params.assert_empty('base train command')
metrics = trainer.train()
# Now tar up results
archive_model(serialization_dir, files_to_archive=params.files_to_archive) if test_data and evaluate_on_test: test_metrics = evaluate(model, test_data, val_iterator, cuda_device=trainer._cuda_devices[0]) # pylint: disable=protected-access
for key, value in test_metrics.items(): metrics["test_" + key] = value elif test_data: logger.info( "To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") metrics_json = json.dumps(metrics, indent=2) with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file: metrics_file.write(metrics_json) logger.info("Metrics: %s", metrics_json) return model
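# Quick post-hoc check of a finished run, assuming the metrics.json layout
# written above. The directory and metric key are illustrative.
import json
import os

with open(os.path.join("runs/exp1", "metrics.json")) as f:
    metrics = json.load(f)
print(metrics.get("test_accuracy"))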
def fine_tune_model(model: Model, params: Params, serialization_dir: str, file_friendly_logging: bool = False) -> Model: """ Fine tunes the given model, using a set of parameters that is largely identical to those used for :func:`~allennlp.commands.train.train_model`, except that the ``model`` section is ignored, if it is present (as we are already given a ``Model`` here). The main difference between the logic done here and the logic done in ``train_model`` is that here we do not worry about vocabulary construction or creating the model object. Everything else is the same. Parameters ---------- model : ``Model`` A model to fine tune, such as one loaded from a saved archive. params : ``Params`` A parameter object specifying an AllenNLP Experiment, including the training and validation data paths. serialization_dir : ``str`` The directory in which to save results and logs. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. """ prepare_environment(params) os.makedirs(serialization_dir) prepare_global_logging(serialization_dir, file_friendly_logging) serialization_params = deepcopy(params).as_dict(quiet=True) with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file: json.dump(serialization_params, param_file, indent=4) if params.pop('model', None): logger.warning( "You passed parameters for the model in your configuration file, but we " "are ignoring them, using instead the model parameters in the archive." ) if params.pop('vocabulary', None): logger.warning( "You passed parameters for the vocabulary in your configuration file, but " "we are ignoring them, using instead the vocabulary from the saved model." ) vocab = model.vocab vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(vocab) all_datasets = datasets_from_params(params) train_data = all_datasets['train'] validation_data = all_datasets.get('validation') test_data = all_datasets.get('test') trainer_params = params.pop("trainer") trainer = Trainer.from_params(model, serialization_dir, iterator, train_data, validation_data, trainer_params) evaluate_on_test = params.pop_bool("evaluate_on_test", False) params.assert_empty('base train command') try: metrics = trainer.train() except KeyboardInterrupt:
# if we have completed an epoch, try to create a model archive.
if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info( "Fine-tuning interrupted by the user. Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise
# Now tar up results
archive_model(serialization_dir, files_to_archive=params.files_to_archive) if test_data and evaluate_on_test: test_metrics = evaluate(model, test_data, iterator, cuda_device=trainer._cuda_devices[0]) # pylint: disable=protected-access
for key, value in test_metrics.items(): metrics["test_" + key] = value elif test_data: logger.info( "To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") metrics_json = json.dumps(metrics, indent=2) with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file: metrics_file.write(metrics_json) logger.info("Metrics: %s", metrics_json) return model
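# Hedged usage sketch for fine_tune_model: fine-tuning typically starts from
# a trained archive. Paths are placeholders; load_archive is AllenNLP's
# standard helper for unpacking a model.tar.gz produced by archive_model.
from allennlp.common.params import Params
from allennlp.models.archival import load_archive

archive = load_archive("runs/exp1/model.tar.gz")
params = Params.from_file("fine_tune.json")
model = fine_tune_model(archive.model, params, serialization_dir="runs/exp1_ft")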
def fine_tune_model(model: Model, params: Params, serialization_dir: str, extend_vocab: bool = False, file_friendly_logging: bool = False, batch_weight_key: str = "") -> Model: """ Fine tunes the given model, using a set of parameters that is largely identical to those used for :func:`~allennlp.commands.train.train_model`, except that the ``model`` section is ignored, if it is present (as we are already given a ``Model`` here). The main difference between the logic done here and the logic done in ``train_model`` is that here we do not worry about vocabulary construction or creating the model object. Everything else is the same. Parameters ---------- model : ``Model`` A model to fine tune, such as one loaded from a saved archive. params : ``Params`` A parameter object specifying an AllenNLP Experiment, including the training and validation data paths. serialization_dir : ``str`` The directory in which to save results and logs. extend_vocab: ``bool``, optional (default=False) If ``True``, we use the new instances to extend your vocabulary. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. batch_weight_key : ``str``, optional (default="") If non-empty, name of metric used to weight the loss on a per-batch basis. """ prepare_environment(params) if os.path.exists(serialization_dir) and os.listdir(serialization_dir): raise ConfigurationError( f"Serialization directory ({serialization_dir}) " f"already exists and is not empty.") os.makedirs(serialization_dir, exist_ok=True) prepare_global_logging(serialization_dir, file_friendly_logging) serialization_params = deepcopy(params).as_dict(quiet=True) with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file: json.dump(serialization_params, param_file, indent=4) if params.pop('model', None): logger.warning( "You passed parameters for the model in your configuration file, but we " "are ignoring them, using instead the model parameters in the archive." ) vocabulary_params = params.pop('vocabulary', {}) if vocabulary_params.get('directory_path', None): logger.warning( "You passed `directory_path` in parameters for the vocabulary in " "your configuration file, but it will be ignored.") all_datasets = datasets_from_params(params) vocab = model.vocab if extend_vocab: datasets_for_vocab_creation = set( params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError( f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info("Extending model vocabulary using %s data.", ", ".join(datasets_for_vocab_creation)) vocab.extend_from_instances( vocabulary_params, (instance for key, dataset in all_datasets.items() for instance in dataset if key in datasets_for_vocab_creation)) vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(model.vocab) validation_iterator_params = params.pop("validation_iterator", None) if validation_iterator_params: validation_iterator = DataIterator.from_params( validation_iterator_params) validation_iterator.index_with(vocab) else: validation_iterator = None train_data = all_datasets['train'] validation_data = all_datasets.get('validation') test_data = all_datasets.get('test') trainer_params = params.pop("trainer") no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad_regexes): parameter.requires_grad_(False) frozen_parameter_names, tunable_parameter_names = \ get_frozen_and_tunable_parameter_names(model) logger.info("Following parameters are Frozen (without gradient):") for name in frozen_parameter_names: logger.info(name) logger.info("Following parameters are Tunable (with gradient):") for name in tunable_parameter_names: logger.info(name) trainer_type = trainer_params.pop("type", "default") if trainer_type == "default": trainer = Trainer.from_params(model=model, serialization_dir=serialization_dir, iterator=iterator, train_data=train_data, validation_data=validation_data, params=trainer_params, validation_iterator=validation_iterator) else: raise ConfigurationError( "currently fine-tune only works with the default Trainer") evaluate_on_test = params.pop_bool("evaluate_on_test", False) params.assert_empty('base train command') try: metrics = trainer.train() except KeyboardInterrupt:
# if we have completed an epoch, try to create a model archive.
if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info( "Fine-tuning interrupted by the user. Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise
# Evaluate
if test_data and evaluate_on_test: logger.info( "The model will be evaluated using the best epoch weights.") test_metrics = evaluate( model, test_data, validation_iterator or iterator, cuda_device=trainer._cuda_devices[0], # pylint: disable=protected-access
batch_weight_key=batch_weight_key) for key, value in test_metrics.items(): metrics["test_" + key] = value elif test_data: logger.info( "To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")
# Now tar up results
archive_model(serialization_dir, files_to_archive=params.files_to_archive) metrics_json = json.dumps(metrics, indent=2) with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file: metrics_file.write(metrics_json) logger.info("Metrics: %s", metrics_json) return model
def from_params( cls, params: Params, serialization_dir: str, recover: bool = False, model: Model = None, embedding_sources_mapping: Dict[str, str] = None, extend_vocab: bool = False, ) -> "MetaTrainerPieces": all_datasets = training_util.datasets_from_params(params) vocabulary_params = params.pop("vocabulary", {}) if model: if params.pop("model", None): logger.warning( "You passed parameters for the model in your configuration file, but we " "are ignoring them, using instead the loaded model parameters." ) # TODO(mattg): This should be updated now that directory_path no longer exists. if vocabulary_params.get("directory_path", None): logger.warning( "You passed `directory_path` in parameters for the vocabulary in " "your configuration file, but it will be ignored because we already " "have a model with a vocabulary.") vocab = model.vocab else: vocab = None vocabulary_path = os.path.join(serialization_dir, "vocabulary") if not vocab or extend_vocab: vocab = MetaTrainerPieces.create_or_extend_vocab( datasets=all_datasets, params=params, recover=recover, vocab=vocab, vocabulary_params=vocabulary_params, vocabulary_path=vocabulary_path, ) if not model: model = Model.from_params(vocab=vocab, params=params.pop("model")) # If vocab extension is ON for training, embedding extension should also be # done. If vocab and embeddings are already in sync, it would be a no-op. model.extend_embedder_vocab(embedding_sources_mapping) # Initializing the model can have side effect of expanding the vocabulary # Save the vocab only in the master. In the degenerate non-distributed # case, we're trivially the master. if is_master(): vocab.save_to_files(vocabulary_path) iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(model.vocab) validation_iterator_params = params.pop("validation_iterator", None) if validation_iterator_params: validation_iterator = DataIterator.from_params( validation_iterator_params) validation_iterator.index_with(model.vocab) else: validation_iterator = None train_datas = all_datasets["train"] validation_datas = all_datasets.get("validation") test_datas = all_datasets.get("test") trainer_params = params.pop("trainer") no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad_regexes): parameter.requires_grad_(False) log_frozen_and_tunable_parameter_names(model) return cls( model=model, iterator=iterator, train_datasets=train_datas, validation_datasets=validation_datas, test_datasets=test_datas, validation_iterator=validation_iterator, params=trainer_params, )
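# What a "pieces" bundle like MetaTrainerPieces typically is: a plain named
# container the train command unpacks. This sketch mirrors the keyword
# arguments of the return call above; the actual class definition is not
# shown in the source, so field types are left loose.
from typing import Any, NamedTuple

class MetaTrainerPieces(NamedTuple):
    model: Any
    iterator: Any
    train_datasets: Any
    validation_datasets: Any
    test_datasets: Any
    validation_iterator: Any
    params: Any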
def from_partial_objects( cls, model: Model, serialization_dir: str, iterator: DataIterator, train_data: Iterable[Instance], validation_iterator: DataIterator = None, validation_data: Iterable[Instance] = None, local_rank: int = 0, patience: int = None, validation_metric: str = "-loss", shuffle: bool = True, num_epochs: int = 20, cuda_device: int = -1, grad_norm: float = None, grad_clipping: float = None, model_save_interval: float = None, summary_interval: int = 100, histogram_interval: int = None, should_log_parameter_statistics: bool = True, should_log_learning_rate: bool = False, log_batch_size_period: int = None, distributed: bool = None, world_size: int = 1, num_gradient_accumulation_steps: int = 1, no_grad: List[str] = None, optimizer: Lazy[Optimizer] = None, learning_rate_scheduler: Lazy[LearningRateScheduler] = None, momentum_scheduler: Lazy[MomentumScheduler] = None, moving_average: Lazy[MovingAverage] = None, checkpointer: Lazy[Checkpointer] = None, ) -> "Trainer": """ This method exists so that we can have a documented method to construct this class using `FromParams`. If you are not using `FromParams` or config files, you can safely ignore this method. The reason we can't just use `__init__` with `FromParams` here is because there are sequential dependencies to this class's arguments. Anything that has a `Lazy[]` type annotation needs something from one of the non-`Lazy` arguments. The `Optimizer` needs to have the parameters from the `Model` before it's constructed, and the `Schedulers` need to have the `Optimizer`. Because of this, the typical way we construct things `FromParams` doesn't work, so we use `Lazy` to allow for constructing the objects sequentially. If you're not using `FromParams`, you can just construct these arguments in the right order yourself in your code and call the constructor directly. """ check_for_gpu(cuda_device) if cuda_device >= 0: # Moving model to GPU here so that the optimizer state gets constructed on # the right device. 
model = model.cuda(cuda_device) if no_grad: for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad): parameter.requires_grad_(False) common_util.log_frozen_and_tunable_parameter_names(model) parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad] optimizer_ = optimizer.construct(model_parameters=parameters) if not optimizer_: optimizer_ = Optimizer.default(parameters) batches_per_epoch = iterator.get_num_batches(train_data) if batches_per_epoch == 1: # get_num_batches returns 1 when it can't determine the answer batches_per_epoch = None moving_average_ = moving_average.construct(parameters=parameters) learning_rate_scheduler_ = learning_rate_scheduler.construct( optimizer=optimizer_, num_epochs=num_epochs, num_steps_per_epoch=batches_per_epoch) momentum_scheduler_ = momentum_scheduler.construct( optimizer=optimizer_) checkpointer_ = checkpointer.construct() or Checkpointer( serialization_dir) return cls( model, optimizer_, iterator, train_data, validation_data, patience=patience, validation_metric=validation_metric, validation_iterator=validation_iterator, shuffle=shuffle, num_epochs=num_epochs, serialization_dir=serialization_dir, cuda_device=cuda_device, grad_norm=grad_norm, grad_clipping=grad_clipping, learning_rate_scheduler=learning_rate_scheduler_, momentum_scheduler=momentum_scheduler_, checkpointer=checkpointer_, model_save_interval=model_save_interval, summary_interval=summary_interval, histogram_interval=histogram_interval, should_log_parameter_statistics=should_log_parameter_statistics, should_log_learning_rate=should_log_learning_rate, log_batch_size_period=log_batch_size_period, moving_average=moving_average_, distributed=distributed, local_rank=local_rank, world_size=world_size, num_gradient_accumulation_steps=num_gradient_accumulation_steps, )
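# A stripped-down illustration of the Lazy pattern the docstring above
# describes: defer construction until late-bound arguments exist. This is
# illustrative, not AllenNLP's actual implementation.
from typing import Callable, Generic, TypeVar

T = TypeVar("T")

class Lazy(Generic[T]):
    def __init__(self, constructor: Callable[..., T]) -> None:
        self._constructor = constructor

    def construct(self, **kwargs) -> T:
        # Supply the arguments that only exist now, e.g. model parameters.
        return self._constructor(**kwargs)

# e.g. optimizer = Lazy(lambda model_parameters: SGD(model_parameters, lr=0.1))
#      optimizer_ = optimizer.construct(model_parameters=parameters)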
def train_model(params: Params, serialization_dir: str) -> Model: """ This function can be used as an entry point to running models in AllenNLP directly from a JSON specification using a :class:`Driver`. Note that if you care about reproducibility, you should avoid running any code that uses PyTorch or numpy random number generation before you import and use this function, since these libraries rely on random seeds that are set here via the JSON specification file. Note that this function performs training and will also evaluate the trained model on development and test sets if provided in the parameter JSON. Parameters ---------- params: Params, required. A parameter object specifying an AllenNLP Experiment. serialization_dir: str, required The directory in which to save results and logs. """ prepare_environment(params) os.makedirs(serialization_dir, exist_ok=True) sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"), sys.stdout) # type: ignore
sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"), sys.stderr) # type: ignore
handler = logging.FileHandler( os.path.join(serialization_dir, "python_logging.log")) handler.setLevel(logging.INFO) handler.setFormatter( logging.Formatter( '%(asctime)s - %(levelname)s - %(name)s - %(message)s')) logging.getLogger().addHandler(handler) serialization_params = deepcopy(params).as_dict(quiet=True) with open(os.path.join(serialization_dir, "model_params.json"), "w") as param_file: json.dump(serialization_params, param_file, indent=4)
# Now we begin assembling the required parts for the Trainer.
# 1. Primary training data.
dataset_reader = DatasetReader.from_params(params.pop('dataset_reader')) train_data_path = params.pop('train_data_path') logger.info("Reading training data from %s", train_data_path) train_data = dataset_reader.read(train_data_path)
# 2. Auxiliary training data.
dataset_reader_aux = DatasetReader.from_params( params.pop('dataset_reader_aux')) train_data_path_aux = params.pop('train_data_path_aux') logger.info("Reading auxiliary training data from %s", train_data_path_aux) train_data_aux = dataset_reader_aux.read(train_data_path_aux)
# If only using a fraction of the auxiliary data.
aux_sample_fraction = params.pop("aux_sample_fraction", 1.0) if aux_sample_fraction < 1.0: sample_size = int(aux_sample_fraction * len(train_data_aux.instances)) train_data_aux = Dataset( random.sample(train_data_aux.instances, sample_size))
# Balance the two datasets by inflating the size of the smaller dataset to the size of the larger dataset.
train_size = len(train_data.instances) aux_train_size = len(train_data_aux.instances) mixing_ratio = params.pop("mixing_ratio")
# mixing_ratio = float(train_size)/aux_train_size
if train_size > aux_train_size: # case for PB scaffold.
difference = train_size - aux_train_size aux_sample = [ random.choice(train_data_aux.instances) for _ in range(difference) ] train_data_aux = Dataset(train_data_aux.instances + aux_sample) logger.info( "Inflating auxiliary train data from {} to {} samples".format( aux_train_size, len(train_data_aux.instances)))
# else: # case for FN scaffold.
# difference = aux_train_size - train_size
# train_sample = [random.choice(train_data.instances) for _ in range(difference)]
# train_data = Dataset(train_data.instances + train_sample)
# logger.info("Inflating train data from {} to {} samples".format(train_size, len(train_data.instances)))
all_datasets: Dict[str, Dataset] = {"train": train_data} all_datasets_aux: Dict[str, Dataset] = {"train_aux": train_data_aux}
# 3. Primary validation data.
validation_data_path = params.pop('validation_data_path', None) if validation_data_path is not None: logger.info("Reading validation data from %s", validation_data_path) validation_data = dataset_reader.read(validation_data_path) all_datasets["validation"] = validation_data else: validation_data = None
# 4. Auxiliary validation data.
validation_data_path_aux = params.pop('validation_data_path_aux', None) if validation_data_path_aux is not None: logger.info("Reading auxiliary validation data from %s", validation_data_path_aux) validation_data_aux = dataset_reader_aux.read(validation_data_path_aux) all_datasets_aux["validation_aux"] = validation_data_aux else: validation_data_aux = None
# 5. Primary test data
test_data_path = params.pop("test_data_path", None) if test_data_path is not None: logger.info("Reading test data from %s", test_data_path) test_data = dataset_reader.read(test_data_path) all_datasets["test"] = test_data else: test_data = None
# 6. Auxiliary test data
test_data_path_aux = params.pop("test_data_path_aux", None) if test_data_path_aux is not None: logger.info("Reading auxiliary test data from %s", test_data_path_aux) test_data_aux = dataset_reader_aux.read(test_data_path_aux) all_datasets_aux["test_aux"] = test_data_aux else: test_data_aux = None datasets_for_vocab_creation = set( params.pop("datasets_for_vocab_creation", all_datasets)) datasets_for_vocab_creation_aux = set( params.pop("auxillary_datasets_for_vocab_creation", all_datasets_aux)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError( f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info( "Creating a vocabulary using %s data. Auxiliary also included.", ", ".join(datasets_for_vocab_creation)) dataset_primary = Dataset([ instance for key, dataset in all_datasets.items() for instance in dataset.instances if key in datasets_for_vocab_creation ]) dataset_aux = Dataset([ instance for key, dataset in all_datasets_aux.items() for instance in dataset.instances if key in datasets_for_vocab_creation_aux ]) vocab = Vocabulary.from_params(params.pop("vocabulary", {}), dataset_primary, dataset_aux=dataset_aux) vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) model = Model.from_params(vocab, params.pop('model')) iterator = DataIterator.from_params(params.pop("iterator")) iterator_aux = DataIterator.from_params(params.pop("iterator_aux")) train_data.index_instances(vocab) train_data_aux.index_instances(vocab) if validation_data: validation_data.index_instances(vocab) if validation_data_aux: validation_data_aux.index_instances(vocab) cutoff_epoch = params.pop("cutoff_epoch", -1) trainer_params = params.pop("trainer") trainer = MultiTaskTrainer.from_params( model=model, serialization_dir=serialization_dir, iterator=iterator, iterator_aux=iterator_aux, train_dataset=train_data, train_dataset_aux=train_data_aux, mixing_ratio=mixing_ratio, cutoff_epoch=cutoff_epoch, validation_dataset=validation_data, validation_dataset_aux=validation_data_aux, params=trainer_params, files_to_archive=params.files_to_archive) evaluate_on_test = params.pop("evaluate_on_test", False) params.assert_empty('base train command') trainer.train()
# Now tar up results
archive_model(serialization_dir, files_to_archive=params.files_to_archive) if test_data and evaluate_on_test: test_data.index_instances(vocab) evaluate(model, test_data, iterator, cuda_device=trainer._cuda_device) # pylint: disable=protected-access
elif test_data: logger.info( "To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") if test_data_aux and evaluate_on_test: test_data_aux.index_instances(vocab) evaluate(model, test_data_aux, iterator_aux, cuda_device=trainer._cuda_device) # pylint: disable=protected-access
elif test_data_aux: logger.info( "To evaluate on the auxiliary test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") return model
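# The dataset-balancing trick above, isolated: oversample the smaller dataset
# with replacement until it matches the larger one. Sketch with plain lists;
# the source operates on Dataset.instances.
import random
from typing import List

def inflate(instances: List, target_size: int) -> List:
    extra = [random.choice(instances) for _ in range(target_size - len(instances))]
    return instances + extra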
def test_registry_has_builtin_iterators(self): assert DataIterator.by_name(u'basic').__name__ == u'BasicIterator' assert DataIterator.by_name(u'bucket').__name__ == u'BucketIterator'
def test_registry_has_builtin_iterators(self): assert DataIterator.by_name("basic").__name__ == "BasicIterator" assert DataIterator.by_name("bucket").__name__ == "BucketIterator"
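# How an iterator ends up in this registry (sketch): Registrable subclasses
# register themselves under a name, which is what by_name() resolves. The
# class and registration name below are illustrative, not built-ins.
from allennlp.data.iterators import DataIterator

@DataIterator.register("my_iterator")
class MyIterator(DataIterator):
    ...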
def train_model(params: Params, serialization_dir: str, file_friendly_logging: bool = False, recover: bool = False) -> Model: """ Trains the model specified in the given :class:`Params` object, using the data and training parameters also specified in that object, and saves the results in ``serialization_dir``. Parameters ---------- params : ``Params`` A parameter object specifying an AllenNLP Experiment. serialization_dir : ``str`` The directory in which to save results and logs. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. recover : ``bool``, optional (default=False) If ``True``, we will try to recover a training run from an existing serialization directory. This is only intended for use when something actually crashed during the middle of a run. For continuing training a model on new data, see the ``fine-tune`` command. Returns ------- best_model: ``Model`` The model with the best epoch weights. """ prepare_environment(params) create_serialization_dir(params, serialization_dir, recover) prepare_global_logging(serialization_dir, file_friendly_logging) check_for_gpu(params.params.get('trainer').get('cuda_device', -1)) serialization_params = deepcopy(params).as_dict(quiet=True) with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file: json.dump(serialization_params, param_file, indent=4) all_datasets = datasets_from_params(params) datasets_for_vocab_creation = set( params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError( f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation)) vocab = Vocabulary.from_params( params.pop("vocabulary", {}), (instance for key, dataset in all_datasets.items() for instance in dataset if key in datasets_for_vocab_creation)) vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) model = Model.from_params(vocab=vocab, params=params.pop('model')) iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(vocab) validation_iterator_params = params.pop("validation_iterator", None) if validation_iterator_params: validation_iterator = DataIterator.from_params( validation_iterator_params) validation_iterator.index_with(vocab) else: validation_iterator = None train_data = all_datasets['train'] validation_data = all_datasets.get('validation') test_data = all_datasets.get('test') trainer_params = params.pop("trainer") no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad_regexes): parameter.requires_grad_(False) frozen_parameter_names, tunable_parameter_names = \ get_frozen_and_tunable_parameter_names(model) logger.info("Following parameters are Frozen (without gradient):") for name in frozen_parameter_names: logger.info(name) logger.info("Following parameters are Tunable (with gradient):") for name in tunable_parameter_names: logger.info(name) trainer = Trainer.from_params(model, serialization_dir, iterator, train_data, validation_data, trainer_params, validation_iterator=validation_iterator) evaluate_on_test = params.pop_bool("evaluate_on_test", False) params.assert_empty('base train command') try: metrics = trainer.train() except KeyboardInterrupt:
# if we have completed an epoch, try to create a model archive.
if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info( "Training interrupted by the user. Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise
# Now tar up results
archive_model(serialization_dir, files_to_archive=params.files_to_archive) logger.info("Loading the best epoch weights.") best_model_state_path = os.path.join(serialization_dir, 'best.th') best_model_state = torch.load(best_model_state_path) best_model = model best_model.load_state_dict(best_model_state) if test_data and evaluate_on_test: logger.info( "The model will be evaluated using the best epoch weights.") test_metrics = evaluate( best_model, test_data, validation_iterator or iterator, cuda_device=trainer._cuda_devices[0] # pylint: disable=protected-access
) for key, value in test_metrics.items(): metrics["test_" + key] = value elif test_data: logger.info( "To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") metrics_json = json.dumps(metrics, indent=2) with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file: metrics_file.write(metrics_json) logger.info("Metrics: %s", metrics_json) return best_model
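# Recovery usage sketch for the train_model above: point at the existing
# serialization directory and its saved config (paths illustrative).
from allennlp.common.params import Params

params = Params.from_file("runs/exp1/config.json")
model = train_model(params, serialization_dir="runs/exp1", recover=True)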
def from_params( cls, params: Params, serialization_dir: str, recover: bool = False, cache_directory: str = None, cache_prefix: str = None, ) -> "TrainerPieces": all_datasets = training_util.meta_dataset_from_params(params, cache_directory, cache_prefix) datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info( "From dataset instances, %s will be considered for vocabulary creation.", ", ".join(datasets_for_vocab_creation), ) if recover and os.path.exists(os.path.join(serialization_dir, "vocabulary")): vocab_params = params.pop("vocabulary", {}) vocab = Vocabulary.from_files( os.path.join(serialization_dir, "vocabulary"), vocab_params.get("padding_token", None), vocab_params.get("oov_token", None), ) else: instance_train = ( instance for key, dataset in all_datasets.items() if key == 'train' for subdata in dataset for instance in subdata ) instance_valid_test = ( instance for key, dataset in all_datasets.items() if key != 'train' for instance in dataset ) instances = chain(instance_train, instance_valid_test) vocab = Vocabulary.from_params( params.pop("vocabulary", {}), # Using a generator comprehension here is important # because, being lazy, it allows us to not iterate over the # dataset when directory_path is specified. # ( # instance # for key, dataset in all_datasets.items() # if (key in datasets_for_vocab_creation) # for instance in dataset # ), instances ) model = Model.from_params(vocab=vocab, params=params.pop("model")) # If vocab extension is ON for training, embedding extension should also be # done. If vocab and embeddings are already in sync, it would be a no-op. model.extend_embedder_vocab() # Initializing the model can have side effect of expanding the vocabulary # Save the vocab only in the master if not is_distributed() or is_master(): vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) # print('[info] iterator in meta_pieces is:{}'.format(params.pop("iterator"))) iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(model.vocab) validation_iterator_params = params.pop("validation_iterator", None) if validation_iterator_params: validation_iterator = DataIterator.from_params(validation_iterator_params) validation_iterator.index_with(model.vocab) else: validation_iterator = None train_data = all_datasets["train"] validation_data = all_datasets.get("validation") test_data = all_datasets.get("test") trainer_params = params.pop("trainer") no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad_regexes): parameter.requires_grad_(False) frozen_parameter_names, tunable_parameter_names = get_frozen_and_tunable_parameter_names( model ) logger.info("Following parameters are Frozen (without gradient):") for name in frozen_parameter_names: logger.info(name) logger.info("Following parameters are Tunable (with gradient):") for name in tunable_parameter_names: logger.info(name) return cls( model, iterator, train_data, validation_data, test_data, validation_iterator, trainer_params, )
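# The instance-chaining above, isolated: under this meta-dataset layout the
# 'train' entry is a list of sub-datasets while the other splits are flat,
# so the two generators are chained lazily. Toy data for illustration.
from itertools import chain

all_datasets = {"train": [["a", "b"], ["c"]], "validation": ["d"]}
train = (x for key, ds in all_datasets.items() if key == "train"
         for sub in ds for x in sub)
rest = (x for key, ds in all_datasets.items() if key != "train" for x in ds)
print(list(chain(train, rest)))  # ['a', 'b', 'c', 'd']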