Example #1
    def _save_vocab_to_disk(self, vocab: Vocabulary) -> str:
        """Saves the vocab to disk to reuse it between the trials

        Parameters
        ----------
        vocab
            Vocabulary to be saved to disk

        Returns
        -------
        vocab_path
            Path to the directory the vocabulary was saved to
        """
        tmp_dir = tempfile.TemporaryDirectory()
        self._created_tmp_dirs.append(tmp_dir)
        vocab_path = tmp_dir.name
        vocab.save_to_files(vocab_path)

        # Make sure that we can load the vocab successfully
        try:
            Vocabulary.from_files(vocab_path)
        except Exception as exception:
            raise ValidationError(
                f"Could not load vocab saved in '{vocab_path}'") from exception

        return vocab_path
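
A minimal round-trip sketch of the save/load pattern used above, assuming only AllenNLP's public Vocabulary API (the namespace and tokens are made up for illustration):

import tempfile

from allennlp.data import Vocabulary

# Build a tiny vocabulary and persist it to a fresh directory.
vocab = Vocabulary()
vocab.add_tokens_to_namespace(["hello", "world"], namespace="tokens")
vocab_dir = tempfile.mkdtemp()
vocab.save_to_files(vocab_dir)

# Reloading from that directory should reproduce the same mapping.
reloaded = Vocabulary.from_files(vocab_dir)
assert (reloaded.get_token_index("hello", namespace="tokens")
        == vocab.get_token_index("hello", namespace="tokens"))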
Example #2
def predict_from_ensemble(trigger_prediction_dirs, trigger_vocab_dirs, arg_prediction_dir,
                          arg_vocab_dir, test_file, output_file):
    trig_pred_dict = {k: get_pred_dicts(v) for k, v in trigger_prediction_dirs.items()}
    trigger_vocabs = [Vocabulary.from_files(v) for v in trigger_vocab_dirs]
    arg_preds = get_pred_dicts(arg_prediction_dir)
    arg_vocab = Vocabulary.from_files(arg_vocab_dir)
    gold = get_gold_data(test_file)
    assert all([set(arg_preds.keys()) == set(entry.keys()) for entry in trig_pred_dict.values()])
    trig_preds = unwrap(trig_pred_dict)
    with open(output_file, "w") as f:
        for doc in trig_preds:
            one_pred = predict_one(trig_preds[doc], arg_preds[doc], gold[doc], trigger_vocabs, arg_vocab)
            f.write(one_pred + "\n")
Example #3
    def from_config(
        cls,
        config: Union[PipelineConfiguration, dict],
        vocab_path: Optional[str] = None,
    ) -> "Pipeline":
        """Creates a pipeline from a `PipelineConfiguration` object or a configuration dictionary

        Parameters
        ----------
        config: `Union[PipelineConfiguration, dict]`
            A `PipelineConfiguration` object or a configuration dict
        vocab_path: `Optional[str]`
            If provided, the pipeline vocabulary will be loaded from this path

        Returns
        -------
        pipeline: `Pipeline`
            A configured pipeline
        """
        if isinstance(config, dict):
            config = PipelineConfiguration.from_dict(config)

        model = PipelineModel.from_params(
            Params({"config": config}),
            vocab=Vocabulary.from_files(vocab_path)
            if vocab_path is not None else None,
        )
        if not isinstance(model, PipelineModel):
            raise TypeError(f"Cannot load model. Wrong format of {model}")

        cls._add_transformers_vocab_if_needed(model)

        return cls(model, config)
Example #4
File: model.py  Project: pyknife/allennlp
    def _load(cls,
              config: Params,
              serialization_dir: str,
              weights_file: str = None,
              cuda_device: int = -1) -> 'Model':
        """
        Instantiates an already-trained model, based on the experiment
        configuration and some optional overrides.
        """
        weights_file = weights_file or os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

        # Load vocabulary from file
        vocab_dir = os.path.join(serialization_dir, 'vocabulary')
        vocab = Vocabulary.from_files(vocab_dir)

        model_params = config.get('model')

        # The experiment config tells us how to _train_ a model, including where to get pre-trained
        # embeddings from.  We're now _loading_ the model, so those embeddings will already be
        # stored in our weights.  We don't need any pretrained weight file anymore, and we don't
        # want the code to look for it, so we remove it from the parameters here.
        remove_pretrained_embedding_params(model_params)
        model = Model.from_params(vocab=vocab, params=model_params)
        model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
        model.load_state_dict(model_state)

        # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
        # in sync with the weights
        if cuda_device >= 0:
            model.cuda(cuda_device)
        else:
            model.cpu()

        return model
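
In AllenNLP this private _load classmethod is invoked through the public Model.load. A minimal usage sketch (the serialization directory path is hypothetical), assuming a directory produced by allennlp train containing config.json, vocabulary/, and best.th:

import os

from allennlp.common import Params
from allennlp.models import Model

serialization_dir = "runs/my_experiment"  # hypothetical training output
config = Params.from_file(os.path.join(serialization_dir, "config.json"))

# Reads vocabulary/ and best.th from the directory and keeps the model on CPU.
model = Model.load(config, serialization_dir=serialization_dir, cuda_device=-1)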
Example #5
def main(cl_arguments):
    ''' Run REPL for a CoLA model '''

    # Arguments handling #
    cl_args = handle_arguments(cl_arguments)
    args = config.params_from_file(cl_args.config_file, cl_args.overrides)
    check_arg_name(args)
    assert args.target_tasks == "cola", \
        "Currently only supporting CoLA. ({})".format(args.target_tasks)

    if args.cuda >= 0:
        try:
            if not torch.cuda.is_available():
                raise EnvironmentError("CUDA is not available, or not detected"
                                       " by PyTorch.")
            log.info("Using GPU %d", args.cuda)
            torch.cuda.set_device(args.cuda)
        except Exception:
            log.warning("GPU access failed. You might be using a CPU-only"
                        " installation of PyTorch. Falling back to CPU.")
            args.cuda = -1

    # Prepare data #
    _, target_tasks, vocab, word_embs = build_tasks(args)
    tasks = sorted(set(target_tasks), key=lambda x: x.name)

    # Build or load model #
    model = build_model(args, vocab, word_embs, tasks)
    log.info("Loading existing model from %s...", cl_args.model_file_path)
    load_model_state(model,
                     cl_args.model_file_path,
                     args.cuda, [],
                     strict=False)

    # Inference Setup #
    model.eval()
    vocab = Vocabulary.from_files(os.path.join(args.exp_dir, 'vocab'))
    indexers = build_indexers(args)
    task = take_one(tasks)

    # Run Inference #
    if cl_args.inference_mode == "repl":
        assert cl_args.input_path is None
        assert cl_args.output_path is None
        print("Running REPL for task: {}".format(task.name))
        run_repl(model, vocab, indexers, task, args)
    elif cl_args.inference_mode == "corpus":
        run_corpus_inference(
            model,
            vocab,
            indexers,
            task,
            args,
            cl_args.input_path,
            cl_args.input_format,
            cl_args.output_path,
            cl_args.eval_output_path,
        )
    else:
        raise KeyError(cl_args.inference_mode)
Example #6
def create_save_vocab(file_path, target_dir, word_min_count, char_min_count):
    namespace_word = "word2idx"
    namespace_char = "char2idx"
    token_indexers = {
        "tokens": SingleIdTokenIndexer(namespace=namespace_word),
        "chars": TokenCharactersIndexer(namespace=namespace_char)
    }
    min_count = {
        namespace_word: word_min_count,
        namespace_char: char_min_count
    }

    reader = SquadReader(token_indexers=token_indexers)
    instances = reader.read(file_path)
    vocab = Vocabulary.from_instances(instances, min_count=min_count)
    word_cnt = vocab.get_vocab_size(namespace_word)
    char_cnt = vocab.get_vocab_size(namespace_char)
    vocab.save_to_files(target_dir)
    print("save word2idx={}, char2idx={} to {}".format(word_cnt, char_cnt,
                                                       target_dir))
    word2idx = vocab.get_index_to_token_vocabulary(namespace_word)
    char2idx = vocab.get_index_to_token_vocabulary(namespace_char)
    print(char2idx)
    vocab = Vocabulary.from_files(target_dir)
    char2idx = vocab.get_index_to_token_vocabulary(namespace_char)
    print(char2idx)
    return
Example #7
def get_data_loader(config):
    train_instances_path = Path(config.pop("train_instances_path"))
    valid_instances_path = Path(config.pop("valid_instances_path"))
    test_instances_path = Path(config.pop("test_instances_path"))
    create_vocab_s_nulya = False  # "s nulya" is Russian for "from scratch"
    if train_instances_path.exists():
        info("Loading tokenized instances")
        instances = []
        for path in [train_instances_path, valid_instances_path]:
            with Path(path).open("rb") as f:
                instances.append(pickle.load(f))
    else:
        info("Tokenizing instances...")
        data_reader = NMTDataReader()
        create_vocab_s_nulya = True
        train_instances = data_reader.read(config.pop("train_data_path"))
        valid_instances = data_reader.read(config.pop("valid_data_path"))
        test_instances = data_reader.read(config.pop("test_data_path"))
        instances = [train_instances, valid_instances]
        info("Saving instances to disk")
        train_instances_path.parent.mkdir(parents=True, exist_ok=True)
        for inst, path in zip(
            [train_instances, valid_instances, test_instances],
            [train_instances_path, valid_instances_path, test_instances_path],
        ):
            with path.open("wb") as f:
                pickle.dump(inst, f, protocol=pickle.HIGHEST_PROTOCOL)
    vocab_path = Path(config.pop("vocab_path"))
    if create_vocab_s_nulya or not vocab_path.exists():
        max_vocab_size = config.pop("max_vocab_size")
        max_characters = config.pop("max_characters")
        vocab = Vocabulary.from_instances(
            instances[0],
            max_vocab_size={
                "char_src": max_characters,
                "token_src": max_vocab_size,
                "char_trg": max_characters,
                "token_trg": max_vocab_size,
            },
        )
        vocab.save_to_files(vocab_path)
    else:
        vocab = Vocabulary.from_files(vocab_path)

    train_instances, valid_instances = instances
    return (
        vocab,
        DataIteratorWrapper(
            vocab,
            train_instances,
            shuffle=True,
            batch_size=config.pop("train_batch_size"),
        ),
        DataIteratorWrapper(
            vocab,
            valid_instances,
            shuffle=False,
            batch_size=config.pop("valid_batch_size"),
        ),
    )
Example #8
    def __init__(
        self,
        config: Config,
        models: Dict[str, Type[nn.Module]],
        gpu_ids: List[int] = [0],
        cpu_workers: int = 0,
    ):
        self._C = config

        if self._C.PHASE != "program_prior":
            raise ValueError(
                f"Trying to initialize a ProgramPriorEvaluator, expected config PHASE to be "
                f"program_prior, found {self._C.PHASE}")

        # Initialize vocabulary, dataloader and model.
        self._vocabulary = Vocabulary.from_files(self._C.DATA.VOCABULARY)

        dataset = ProgramPriorDataset(self._C.DATA.VAL_TOKENS)
        dataloader = DataLoader(dataset, batch_size=self._C.OPTIM.BATCH_SIZE)

        super().__init__(config=config,
                         dataloader=dataloader,
                         models=models,
                         gpu_ids=gpu_ids)

        # This will be a part of `self._models`, keep this handle for convenience.
        self._program_prior = self._models["program_prior"]
Example #9
    def __init__(
        self,
        config: Config,
        models: Dict[str, Type[nn.Module]],
        gpu_ids: List[int] = [0],
        cpu_workers: int = 0,
    ):
        self._C = config

        if self._C.PHASE != "question_coding":
            raise ValueError(
                f"Trying to initialize a QuestionCodingEvaluator, expected config PHASE to be "
                f"question_coding, found {self._C.PHASE}")

        # Initialize vocabulary, dataloader and model.
        self._vocabulary = Vocabulary.from_files(self._C.DATA.VOCABULARY)

        # There is no notion of "supervision" during evaluation.
        dataset = QuestionCodingDataset(self._C.DATA.VAL_TOKENS)
        dataloader = DataLoader(dataset,
                                batch_size=self._C.OPTIM.BATCH_SIZE,
                                num_workers=cpu_workers)

        super().__init__(config=config,
                         dataloader=dataloader,
                         models=models,
                         gpu_ids=gpu_ids)

        # These will be a part of `self._models`, keep these handles for convenience.
        self._program_generator = self._models["program_generator"]
        self._question_reconstructor = self._models["question_reconstructor"]
Example #10
def _load_elmo(serialization_dir,
               vocabulary_dir,
               weights_file=None,
               cuda_device=-1):
    config = Params.from_file(os.path.join(serialization_dir, 'config.json'),
                              "")
    config.loading_from_archive = True
    weights_file = os.path.join(vocabulary_dir, 'best.th')

    vocab_dir = os.path.join(vocabulary_dir, 'vocabulary')
    vocab = Vocabulary.from_files(vocab_dir)
    model_params = config.get('model')
    remove_pretrained_embedding_params(model_params)

    model = BiDAFSelfAttention.from_params(vocab=vocab, params=model_params)
    model_state = torch.load(weights_file,
                             map_location=device_mapping(cuda_device))
    model.load_state_dict(model_state)

    if cuda_device >= 0:
        model.cuda(cuda_device)
    else:
        model.cpu()

    return model, config
Example #11
def get_trainer_from_config(
        config: Params,
        train_instances: List[Instance],
        val_instances: List[Instance],
        device: int,
        serialization_dir: Optional[str] = None) -> Trainer:
    trainer_params = config.pop("trainer")
    trainer_params["cuda_device"] = device
    model_params = config.pop("model")
    vocab_dir = config.pop("vocab_dir", None)
    if vocab_dir is None:
        vocab = Vocabulary.from_instances(train_instances)
    else:
        vocab = Vocabulary.from_files(vocab_dir)
    model = Model.from_params(model_params, vocab=vocab)
    iterator = DataIterator.from_params(config.pop("iterator"))
    trainer_params["num_serialized_models_to_keep"] = 1
    iterator.index_with(vocab)
    trainer = Trainer.from_params(model=model,
                                  iterator=iterator,
                                  train_data=train_instances,
                                  validation_data=val_instances,
                                  serialization_dir=serialization_dir,
                                  params=trainer_params)
    return trainer
Example #12
    def _load(cls,
              config,
              serialization_dir,
              weights_file=None,
              cuda_device=-1):
        u"""
        Instantiates an already-trained model, based on the experiment
        configuration and some optional overrides.
        """
        weights_file = weights_file or os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

        # Load vocabulary from file
        vocab_dir = os.path.join(serialization_dir, u'vocabulary')
        vocab = Vocabulary.from_files(vocab_dir)

        model_params = config.get(u'model')

        # The experiment config tells us how to _train_ a model, including where to get pre-trained
        # embeddings from.  We're now _loading_ the model, so those embeddings will already be
        # stored in our weights.  We don't need any pretrained weight file anymore, and we don't
        # want the code to look for it, so we remove it from the parameters here.
        remove_pretrained_embedding_params(model_params)
        model = Model.from_params(vocab=vocab, params=model_params)
        model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
        model.load_state_dict(model_state)

        # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
        # in sync with the weights
        if cuda_device >= 0:
            model.cuda(cuda_device)
        else:
            model.cpu()

        return model
Example #13
    def __init__(self, default_seeds: Iterable = None, quiet: bool = False):
        super().__init__(default_seeds, quiet)

        lm_files = download_files(fnames=['vocabulary.zip', 'lm-fold-0.bin'],
                                  local_folder='swag_lm')

        activity_data_files = download_files(
            fnames=['captions.zip'],
            paths='https://cs.stanford.edu/people/ranjaykrishna/densevid/',
            local_folder='activitynet_captions')

        const_parser_files = cached_path(
            'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo-constituency-parser-2018.03.14.tar.gz',
            cache_dir=str(DATA_ROOT / 'allennlp_constituency_parser'))

        self.const_parser = PretrainedModel(const_parser_files,
                                            'constituency-parser').predictor()
        vocab = Vocabulary.from_files(lm_files[0])
        self.language_model = SimpleBiLM(vocab=vocab,
                                         recurrent_dropout_probability=0.2,
                                         embedding_dropout_probability=0.2)
        optimistic_restore(
            self.language_model,
            torch.load(lm_files[1], map_location='cpu')['state_dict'])

        if default_seeds is None:
            self.default_seeds = ActivityNetCaptionsDatasetReader().read(
                activity_data_files[0] + '/train.json')
        else:
            self.default_seeds = default_seeds
Example #14
    def load(cls,
             config: Params,
             serialization_prefix: str = None,
             weights_file: str = None,
             cuda_device: int = -1) -> 'Model':
        """
        Instantiates an already-trained model, based on the experiment
        configuration and some optional overrides.

        Parameters
        ----------
        config: Params
            The configuration that was used to train the model. It should definitely
            have a `model` section, and should probably have a `trainer` section
            as well.
        serialization_prefix: str = None
            By default we look at `config['trainer']['serialization_prefix']` to
            get the path to the serialized model, but you can override that
            value here.
        weights_file: str = None
            By default we load the weights from `best.th` in the serialization
            directory, but you can override that value here.
        cuda_device: int = -1
            By default we load the model on the CPU, but if you want to load it
            for GPU usage you can specify the id of your GPU here


        Returns
        -------
        model: Model
            The model specified in the configuration, loaded with the serialized
            vocabulary and the trained weights.
        """
        trainer_config = config.get("trainer", {})
        serialization_prefix = (serialization_prefix
                                or trainer_config.get('serialization_prefix'))
        if serialization_prefix is None:
            raise ConfigurationError('serialization_prefix must be specified')

        weights_file = weights_file or os.path.join(serialization_prefix,
                                                    _DEFAULT_WEIGHTS)

        # Load vocabulary from file
        vocab_dir = os.path.join(serialization_prefix, 'vocabulary')
        vocab = Vocabulary.from_files(vocab_dir)

        model = Model.from_params(vocab, config.get('model'))
        model_state = torch.load(weights_file,
                                 map_location=device_mapping(cuda_device))
        model.load_state_dict(model_state)

        # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
        # in sync with the weights
        if cuda_device >= 0:
            model.cuda(cuda_device)
        else:
            model.cpu()

        return model
Example #15
    def load(cls,
             config: Params,
             serialization_dir: str,
             weights_file: str = None,
             cuda_device: int = -1) -> 'Model':
        """
        Instantiates an already-trained model, based on the experiment
        configuration and some optional overrides.

        Parameters
        ----------
        config: Params
            The configuration that was used to train the model. It should definitely
            have a `model` section, and should probably have a `trainer` section
            as well.
        serialization_dir: str
            The directory containing the serialized weights, parameters, and vocabulary
            of the model.
        weights_file: str = None
            By default we load the weights from `best.th` in the serialization
            directory, but you can override that value here.
        cuda_device: int = -1
            By default we load the model on the CPU, but if you want to load it
            for GPU usage you can specify the id of your GPU here


        Returns
        -------
        model: Model
            The model specified in the configuration, loaded with the serialized
            vocabulary and the trained weights.
        """
        weights_file = weights_file or os.path.join(serialization_dir,
                                                    _DEFAULT_WEIGHTS)

        # Load vocabulary from file
        vocab_dir = os.path.join(serialization_dir, 'vocabulary')
        vocab = Vocabulary.from_files(vocab_dir)

        model_params = config.get('model')

        # The experiment config tells us how to _train_ a model, including where to get pre-trained
        # embeddings from.  We're now _loading_ the model, so those embeddings will already be
        # stored in our weights.  We don't need any pretrained weight file anymore, and we don't
        # want the code to look for it, so we remove it from the parameters here.
        _remove_pretrained_embedding_params(model_params)
        model = Model.from_params(vocab, model_params)
        model_state = torch.load(weights_file,
                                 map_location=device_mapping(cuda_device))
        model.load_state_dict(model_state)

        # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
        # in sync with the weights
        if cuda_device >= 0:
            model.cuda(cuda_device)
        else:
            model.cpu()

        return model
Example #16
def build_vocab(instances: Iterable[Instance]) -> Vocabulary:
    print('Building the vocabulary')
    vocab_path = os.path.join(PRETRAIN_MODEL, 'vocab')
    if os.path.exists(vocab_path):
        vocab = Vocabulary.from_files(vocab_path)
    else:
        vocab = Vocabulary.from_instances(instances)
        vocab.save_to_files(vocab_path)
    return vocab
Example #17
 def _load_vocab(archive_path: Path) -> Vocabulary:
     # an annoying hack to load the vocab file
     tempdir = tempfile.mkdtemp()
     with tarfile.open(archive_path, 'r:gz') as _archive:
         _archive.extractall(tempdir)
     vocab_path = Path(tempdir) / "vocabulary"
     vocab = Vocabulary.from_files(vocab_path)
     shutil.rmtree(tempdir)
     return vocab
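
The hack above unpacks the whole archive just to read its vocabulary/ directory. A slightly cheaper variant (a sketch under the same archive-layout assumption) extracts only the vocabulary members:

import shutil
import tarfile
import tempfile
from pathlib import Path

from allennlp.data import Vocabulary

def _load_vocab_selective(archive_path: Path) -> Vocabulary:
    tempdir = tempfile.mkdtemp()
    with tarfile.open(archive_path, 'r:gz') as _archive:
        # Extract only the members under vocabulary/, not the whole archive.
        members = [m for m in _archive.getmembers()
                   if m.name.startswith("vocabulary")]
        _archive.extractall(tempdir, members=members)
    vocab = Vocabulary.from_files(str(Path(tempdir) / "vocabulary"))
    shutil.rmtree(tempdir)
    return vocab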
Example #18
    def load(cls,
             config: Params,
             serialization_dir: str,
             weights_file: str = None,
             cuda_device: int = -1) -> 'Model':
        """
        Instantiates an already-trained model, based on the experiment
        configuration and some optional overrides.

        Parameters
        ----------
        config: Params
            The configuration that was used to train the model. It should definitely
            have a `model` section, and should probably have a `trainer` section
            as well.
        serialization_dir: str
            The directory containing the serialized weights, parameters, and vocabulary
            of the model.
        weights_file: str = None
            By default we load the weights from `best.th` in the serialization
            directory, but you can override that value here.
        cuda_device: int = -1
            By default we load the model on the CPU, but if you want to load it
            for GPU usage you can specify the id of your GPU here


        Returns
        -------
        model: Model
            The model specified in the configuration, loaded with the serialized
            vocabulary and the trained weights.
        """
        weights_file = weights_file or os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

        # Load vocabulary from file
        vocab_dir = os.path.join(serialization_dir, 'vocabulary')
        vocab = Vocabulary.from_files(vocab_dir)

        model_params = config.get('model')

        # The experiment config tells us how to _train_ a model, including where to get pre-trained
        # embeddings from.  We're now _loading_ the model, so those embeddings will already be
        # stored in our weights.  We don't need any pretrained weight file anymore, and we don't
        # want the code to look for it, so we remove it from the parameters here.
        _remove_pretrained_embedding_params(model_params)
        model = Model.from_params(vocab, model_params)
        model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
        model.load_state_dict(model_state)

        # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
        # in sync with the weights
        if cuda_device >= 0:
            model.cuda(cuda_device)
        else:
            model.cpu()

        return model
Example #19
 def _load_files(self):
     self.config = Params.from_file(os.path.join(self.tempdir,
                                                 _CONFIG_NAME))
     self.vocabulary = Vocabulary.from_files(
         os.path.join(self.tempdir, _VOCAB_DIR_NAME))
     self.vocab_reader = VocabReader(
         os.path.join(self.tempdir, _VOCAB_DIR_NAME, _TOKENS_NAME))
     weights_path = os.path.join(self.tempdir, _WEIGHTS_NAME)
     self.model_state = torch.load(
         weights_path, map_location=lambda storage, loc: storage)
Example #20
    def __init__(
        self,
        vocab: Vocabulary,
        # source-side
        bert_encoder: BaseBertWrapper,
        encoder_token_embedder: TextFieldEmbedder,
        encoder_pos_embedding: Embedding,
        encoder: Seq2SeqEncoder,
        syntax_edge_type_namespace: str = None,
        biaffine_parser: DeepTreeParser = None,
        dropout: float = 0.0,
        eps: float = 1e-20,
        pretrained_weights: str = None,
        vocab_dir: str = None,
    ) -> None:

        super(UDParser,
              self).__init__(vocab=vocab,
                             bert_encoder=bert_encoder,
                             encoder_token_embedder=encoder_token_embedder,
                             encoder=encoder,
                             decoder_token_embedder=None,
                             decoder_node_index_embedding=None,
                             decoder=None,
                             extended_pointer_generator=None,
                             tree_parser=None,
                             label_smoothing=None,
                             target_output_namespace=None,
                             pretrained_weights=pretrained_weights,
                             dropout=dropout,
                             eps=eps)

        # source-side
        self.encoder_pos_embedding = encoder_pos_embedding
        # misc
        self._syntax_edge_type_namespace = syntax_edge_type_namespace
        self.biaffine_parser = biaffine_parser
        self.vocab_dir = vocab_dir
        #metrics
        self._syntax_metrics = AttachmentScores()
        self.syntax_las = 0.0
        self.syntax_uas = 0.0
        # compatibility
        self.loss_mixer = None
        self.syntactic_method = "encoder-side"

        # pretrained
        if self.pretrained_weights is not None:
            self.load_partial(self.pretrained_weights)
        # load vocab
        if self.vocab_dir is not None:
            syn_vocab = Vocabulary.from_files(vocab_dir)
            self.vocab._token_to_index[
                self._syntax_edge_type_namespace] = syn_vocab._token_to_index[
                    self._syntax_edge_type_namespace]
Example #21
    def from_config(cls, config: Config):
        r"""Instantiate this class directly from a :class:`~probnmn.config.Config`."""

        _C = config
        return cls(  # type: ignore
            vocabulary=Vocabulary.from_files(_C.DATA.VOCABULARY),
            image_feature_size=tuple(_C.NMN.IMAGE_FEATURE_SIZE),
            module_channels=_C.NMN.MODULE_CHANNELS,
            class_projection_channels=_C.NMN.CLASS_PROJECTION_CHANNELS,
            classifier_linear_size=_C.NMN.CLASSIFIER_LINEAR_SIZE,
        )
Example #22
    def from_config(cls, config: Config):
        r"""Instantiate this class directly from a :class:`~probnmn.config.Config`."""

        _C = config
        return cls(
            vocabulary=Vocabulary.from_files(_C.DATA.VOCABULARY),
            input_size=_C.QUESTION_RECONSTRUCTOR.INPUT_SIZE,
            hidden_size=_C.QUESTION_RECONSTRUCTOR.HIDDEN_SIZE,
            num_layers=_C.QUESTION_RECONSTRUCTOR.NUM_LAYERS,
            dropout=_C.QUESTION_RECONSTRUCTOR.DROPOUT,
        )
Example #23
def train(model_dir):

    # prepare data
    #reader = CoqaDatasetReader()
    #reader = CoqaDatasetReader(tokenizer=lambda x: WordTokenizer().tokenize(text=x))
    reader = CoqaDatasetReader(tokenizer=lambda sent: SpacyWordSplitter(
        language='en_core_web_sm').split_words(sent))
    train_dataset = reader.read(
        cached_path('/mnt/DATA/ML/data/corpora/QA/CoQA/coqa-train-v1.0.json'))
    validation_dataset = reader.read(
        cached_path('/mnt/DATA/ML/data/corpora/QA/CoQA/coqa-dev-v1.0.json'))

    vocab = None
    model_fn = os.path.join(model_dir, 'model.th')
    vocab_fn = os.path.join(model_dir, 'vocab')
    if os.path.exists(model_dir):
        if os.path.exists(vocab_fn):
            logging.info('load vocab from: %s...' % vocab_fn)
            vocab = Vocabulary.from_files(vocab_fn)
    else:
        os.makedirs(model_dir)
    if vocab is None:
        vocab = Vocabulary.from_instances(train_dataset + validation_dataset)
        logging.info('save vocab to: %s...' % vocab_fn)
        vocab.save_to_files(vocab_fn)
    logging.info('data prepared')

    model = create_model(vocab)

    if os.path.exists(model_fn):
        logging.info('load model weights from: %s...' % model_fn)
        with open(model_fn, 'rb') as f:
            model.load_state_dict(torch.load(f))
    logging.info('model prepared')

    # prepare training
    # optimizer = optim.SGD(model.parameters(), lr=0.1)
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    iterator = BasicIterator(batch_size=2)
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=10,
                      num_epochs=100)
    logging.info('training prepared')

    trainer.train()

    logging.info('save model to: %s...' % model_fn)
    with open(model_fn, 'wb') as f:
        torch.save(model.state_dict(), f)
Example #24
def predict_from_pair(trigger_prediction_dir, arg_prediction_dir, vocab_dir,
                      test_file, output_file):
    trig_preds = get_pred_dicts(trigger_prediction_dir)
    arg_preds = get_pred_dicts(arg_prediction_dir)
    vocab = Vocabulary.from_files(vocab_dir)
    gold = get_gold_data(test_file)
    assert set(arg_preds.keys()) == set(trig_preds.keys())
    with open(output_file, "w") as f:
        for doc in trig_preds:
            one_pred = predict_one(trig_preds[doc], arg_preds[doc], gold[doc],
                                   vocab)
            f.write(one_pred + "\n")
Example #25
    def _restore_vocab(self, folder: str) -> Vocabulary:
        # The transformers feature comes with its own vocab, no need to restore anything if it is the only feature
        if self.config.features.configured_namespaces == [
            TransformersFeatures.namespace
        ]:
            return self.vocab

        self._check_for_word_vector_weights_file()

        vocab = Vocabulary.from_files(folder)
        self._model.extend_vocabulary(vocab)

        return vocab
Example #26
    def from_run(cls, run_dir: str, task_name: str, split_name: str):
        # Load vocabulary
        exp_dir = os.path.dirname(run_dir.rstrip("/"))
        vocab_path = os.path.join(exp_dir, "vocab")
        log.info("Loading vocabulary from %s" % vocab_path)
        vocab = Vocabulary.from_files(vocab_path)
        label_namespace = f"{task_name}_labels"

        # Load predictions
        preds_file = os.path.join(run_dir, f"{task_name}_{split_name}.json")
        log.info("Loading predictions from %s" % preds_file)
        return cls(vocab, utils.load_json_data(preds_file),
                   label_namespace=label_namespace)
Example #27
 def __init__(self,
              vocab: Vocabulary = None,
              vocab_path: str = None,
              max_enc_len: int = 512,
              max_dec_len: int = 30,
              max_turn_len: int = 3,
              index_name: str = "tokens",
              start_token: str = "[CLS]",
              end_token: str = "[SEP]",
              pad_token: str = "[PAD]",
              oov_token: str = "[UNK]",
              do_lower_case: bool = True,
              token_indexers: Dict[str, TokenIndexer] = None,
              tokenizer: Tokenizer = None,
              never_split: List[str] = None,
              lazy: bool = False,
              **kwargs):
     super(PointerRewriteReader, self).__init__(lazy, **kwargs)
     if never_split is not None:
         never_split = [start_token, end_token, pad_token, oov_token
                        ] + never_split
     else:
         never_split = [start_token, end_token, pad_token, oov_token]
     # the max length of the input
     self.max_length = max_enc_len
     # Tokens
     self._start_token = start_token
     self._end_token = end_token
     self._max_dec_len = max_dec_len
     self._max_turn_len = max_turn_len - 1
     self._index_name = index_name
     if tokenizer is None:
         self._tokenizer = ChineseCharacterTokenizer(
             do_lower_case, never_split)
     else:
         self._tokenizer = tokenizer
     self._token_indexers = token_indexers or {
         index_name: SingleIdTokenIndexer(namespace=index_name)
     }
     if vocab is None:
         self.vocab = Vocabulary.from_files(vocab_path,
                                            padding_token=pad_token,
                                            oov_token=oov_token)
     else:
         self.vocab = vocab
     # get the vocabulary_to_index and oov id
     self._token2index = self.vocab.get_token_to_index_vocabulary(
         namespace=index_name)
     self._unk_id = self.vocab.get_token_index(self.vocab._oov_token,
                                               namespace=index_name)
     self._vocab_size = self.vocab.get_vocab_size(namespace=index_name)
Example #28
 def predict(self, file_path: str, vocab_path: str):
     ds = self.data_reader.read(file_path)
     vocab = Vocabulary.from_files(vocab_path)
     self.iterator.index_with(vocab)
     self.model.eval()
     pred_generator = self.iterator(ds, num_epochs=1, shuffle=False)
     pred_generator_tqdm = tqdm(pred_generator,
                                total=self.iterator.get_num_batches(ds))
     preds = []
     with torch.no_grad():
         for batch in pred_generator_tqdm:
             batch = util.move_to_device(batch, self.cuda_device)
             preds.append(self._extract_data(batch))
     return preds
Example #29
    def setUp(self):
        logging.basicConfig(level=logging.INFO)

        tokenizer = WordTokenizer(JustSpacesWordSplitter())
        reader = SummDataReader(tokenizer, source_max_tokens=400, lazy=False)
        self.train_dataset = reader.read(
            '../data/dev_bbc/train.dev.tsv.tagged')
        self.val_dataset = reader.read('../data/dev_bbc/val.dev.tsv.tagged')
        vocab_path = 'data/cnndm/vocab'
        if os.path.exists(vocab_path):
            self.vocab = Vocabulary.from_files(vocab_path)
        else:
            self.vocab = Vocabulary.from_instances(self.train_dataset,
                                                   max_vocab_size=80000)
            self.vocab.save_to_files(vocab_path)
Example #30
def get_shared_from_dir_allennlp(vocab_dir):
    '''Get word2idx and char2idx from a saved AllenNLP vocab.
    Args:
        vocab_dir -- directory an AllenNLP vocab was saved to
    Returns:
        word2idx -- token-to-index dict for the "word2idx" namespace
        char2idx -- token-to-index dict for the "char2idx" namespace
    '''
    word_space = "word2idx"
    char_space = "char2idx"
    vocab = Vocabulary.from_files(vocab_dir)
    word2idx = vocab.get_token_to_index_vocabulary(word_space)
    char2idx = vocab.get_token_to_index_vocabulary(char_space)
    print("word={}, char={}".format(len(word2idx), len(char2idx)))
    return word2idx, char2idx
Example #31
 def get_parser():
     dataset_path = 'data/datasets/spider'
     vocab = Vocabulary.from_files(
         'parsers/irnet/checkpoints/v1.0_spider_baseline_model/vocabulary')
     overrides = {
         "dataset_path": dataset_path,
         "train_data_path": "train.json",
         "validation_data_path": "dev.json"
     }
     parser_model = load_archive(
         'parsers/irnet/checkpoints/v1.0_spider_baseline_model/model.tar.gz',
         cuda_device=0,
         overrides=json.dumps(overrides)).model
     parser_model.sql_metric_util._evaluator.update_dataset_path(
         dataset_path=dataset_path)
     parser = IRNetSpiderParser(model=parser_model)
     return parser
Example #32
    def build_ontology_and_vocab(ontology_path: str, vocab_path: Optional[str] = None) -> Tuple[Vocabulary, Dict]:
        with open(ontology_path) as f:
            ontology = json.load(f)

        if vocab_path is None:
            vocab: Vocabulary = Vocabulary()
            vocab.add_token_to_namespace(token='None', namespace='span_labels')
            vocab.add_token_to_namespace(token='@@PADDING@@', namespace='span_labels')
            vocab.add_tokens_to_namespace([
                role
                for role in ontology['args'].keys()
            ], namespace='span_labels')
            vocab.add_tokens_to_namespace([
                event
                for event in ontology['events'].keys()
            ], namespace='event_labels')
        else:
            vocab: Vocabulary = Vocabulary.from_files(vocab_path)

        return vocab, ontology
Example #33
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover, force)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    if recover and os.path.exists(os.path.join(serialization_dir, "vocabulary")):
        vocab = Vocabulary.from_files(os.path.join(serialization_dir, "vocabulary"))
    else:
        vocab = Vocabulary.from_params(
                params.pop("vocabulary", {}),
                (instance for key, dataset in all_datasets.items()
                 for instance in dataset
                 if key in datasets_for_vocab_creation)
        )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
                   get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer_choice = trainer_params.pop_choice("type",
                                               Trainer.list_available(),
                                               default_to_first_choice=True)
    trainer = Trainer.by_name(trainer_choice).from_params(model=model,
                                                          serialization_dir=serialization_dir,
                                                          iterator=iterator,
                                                          train_data=train_data,
                                                          validation_data=validation_data,
                                                          params=trainer_params,
                                                          validation_iterator=validation_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
                best_model, test_data, validation_iterator or iterator,
                cuda_device=trainer._cuda_devices[0], # pylint: disable=protected-access,
                # TODO(brendanr): Pass in an arg following Joel's trainer refactor.
                batch_weight_key=""
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True)

    return best_model
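
A minimal invocation sketch for train_model as defined above (the config path is hypothetical; any AllenNLP experiment config works):

from allennlp.common import Params

params = Params.from_file("experiments/my_model.jsonnet")  # hypothetical path
best_model = train_model(params, serialization_dir="runs/my_model")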
Example #34
def build_tasks(args):
    '''Prepare tasks'''

    def parse_tasks(task_list):
        '''parse string of tasks'''
        if task_list == 'all':
            tasks = ALL_TASKS
        elif task_list == 'none':
            tasks = []
        else:
            tasks = task_list.split(',')
        return tasks

    train_task_names = parse_tasks(args.train_tasks)
    eval_task_names = parse_tasks(args.eval_tasks)
    all_task_names = list(set(train_task_names + eval_task_names))
    tasks = get_tasks(all_task_names, args.max_seq_len, args.load_tasks)

    max_v_sizes = {'word': args.max_word_v_size}
    token_indexer = {}
    if args.elmo:
        token_indexer["elmo"] = ELMoTokenCharactersIndexer("elmo")
        if not args.elmo_no_glove:
            token_indexer["words"] = SingleIdTokenIndexer()
    else:
        token_indexer["words"] = SingleIdTokenIndexer()

    vocab_path = os.path.join(args.exp_dir, 'vocab')
    preproc_file = os.path.join(args.exp_dir, args.preproc_file)
    if args.load_preproc and os.path.exists(preproc_file):
        preproc = pkl.load(open(preproc_file, 'rb'))
        vocab = Vocabulary.from_files(vocab_path)
        word_embs = preproc['word_embs']
        for task in tasks:
            train, val, test = preproc[task.name]
            task.train_data = train
            task.val_data = val
            task.test_data = test
        log.info("\tFinished building vocab. Using %d words",
                 vocab.get_vocab_size('tokens'))
        log.info("\tLoaded data from %s", preproc_file)
    else:
        log.info("\tProcessing tasks from scratch")
        word2freq = get_words(tasks)
        vocab = get_vocab(word2freq, max_v_sizes)
        word_embs = get_embeddings(vocab, args.word_embs_file, args.d_word)
        preproc = {'word_embs': word_embs}
        for task in tasks:
            train, val, test = process_task(task, token_indexer, vocab)
            task.train_data = train
            task.val_data = val
            task.test_data = test
            del_field_tokens(task)
            preproc[task.name] = (train, val, test)
        log.info("\tFinished indexing tasks")
        pkl.dump(preproc, open(preproc_file, 'wb'))
        vocab.save_to_files(vocab_path)
        log.info("\tSaved data to %s", preproc_file)
        del word2freq
    del preproc

    train_tasks = [task for task in tasks if task.name in train_task_names]
    eval_tasks = [task for task in tasks if task.name in eval_task_names]
    log.info('\t  Training on %s', ', '.join([task.name for task in train_tasks]))
    log.info('\t  Evaluating on %s', ', '.join([task.name for task in eval_tasks]))
    return train_tasks, eval_tasks, vocab, word_embs