Example #1
    def _check_for_word_vector_weights_file(self):
        # If the vocab is empty, we assume this is an untrained pipeline
        # and we want to raise an error if the weights file is not found.
        # Extending the vocab with a non-existent weights file only throws a warning.
        try:
            assert is_url_or_existing_file(Path(self.config.features.word.weights_file))
        except AssertionError:
            if vocabulary.is_empty(self.vocab, [WordFeatures.namespace]):
                raise FileNotFoundError(
                    f"Cannot find the weights file {self.config.features.word.weights_file}"
                )
        # no word feature, or weights_file is None
        except (AttributeError, TypeError):
            pass
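
The check above relies on the pipeline's config and vocabulary objects. Below is a minimal, self-contained sketch of the same control flow, assuming hypothetical stand-ins (weights_file, vocab_is_empty) and a plain file-existence test instead of is_url_or_existing_file:

from pathlib import Path
from typing import Optional


def check_weights_file(weights_file: Optional[str], vocab_is_empty: bool) -> None:
    # Sketch only: `weights_file` and `vocab_is_empty` are hypothetical stand-ins
    # for self.config.features.word.weights_file and the vocabulary check above.
    try:
        # AssertionError when the path does not point to an existing file.
        assert Path(weights_file).is_file()
    except AssertionError:
        # A missing weights file is only fatal for an untrained pipeline (empty vocab);
        # extending an existing vocab with a missing file merely warns later on.
        if vocab_is_empty:
            raise FileNotFoundError(f"Cannot find the weights file {weights_file}")
    except TypeError:
        # weights_file is None: there is nothing to check.
        pass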
Example #2
    def extend_vocab(
        self,
        extended_vocab: Vocabulary,
        vocab_namespace: str = None,
        extension_pretrained_file: str = None,
        model_path: str = None,
    ):
        """
        Extends the embedding matrix according to the extended vocabulary.
        If extension_pretrained_file is available, it will be used for initializing the
        embeddings of the new words in the extended vocabulary; otherwise we will check if the
        _pretrained_file attribute is already available. If neither is available, the new
        embeddings will be initialized with xavier uniform.

        # Parameters

        extended_vocab : `Vocabulary`
            Vocabulary extended from original vocabulary used to construct
            this `Embedding`.
        vocab_namespace : `str`, (optional, default=`None`)
            In case you know what vocab_namespace should be used for extension, you
            can pass it. If not passed, it will check if vocab_namespace used at the
            time of `Embedding` construction is available. If so, this namespace
            will be used or else extend_vocab will be a no-op.
        extension_pretrained_file : `str`, (optional, default=`None`)
            A file containing pretrained embeddings can be specified here. It can be
            the path to a local file or a URL of a (cached) remote file. Check format
            details in `from_params` of `Embedding` class.
        model_path : `str`, (optional, default=`None`)
            Path traversing the model attributes up to this embedding module.
            E.g. "_text_field_embedder.token_embedder_tokens". This is only useful
            to give a helpful error message when extend_vocab is implicitly called
            by train or any other command.
        """
        # Caveat: For allennlp v0.8.1 and below, we weren't storing vocab_namespace as an attribute,
        # which we need to know at the time of embedding vocab extension. So old archive models are
        # currently unextendable.

        vocab_namespace = vocab_namespace or self._vocab_namespace
        if not vocab_namespace:
            # It's not safe to default to "tokens" or any other namespace.
            logger.info(
                "Loading a model trained before embedding extension was implemented; "
                "pass an explicit vocab namespace if you want to extend the vocabulary."
            )
            return

        extended_num_embeddings = extended_vocab.get_vocab_size(
            vocab_namespace)
        if extended_num_embeddings == self.num_embeddings:
            # It's already been extended. No need to initialize / read pretrained file in first place (no-op)
            return

        if extended_num_embeddings < self.num_embeddings:
            raise ConfigurationError(
                f"Size of namespace, {vocab_namespace} for extended_vocab is smaller than "
                f"embedding. You likely passed incorrect vocab or namespace for extension."
            )

        # Case 1: user passed extension_pretrained_file and it's available.
        if extension_pretrained_file and is_url_or_existing_file(
                extension_pretrained_file):
            # Don't have to do anything here, this is the happy case.
            pass
        # Case 2: user passed extension_pretrained_file and it's not available
        elif extension_pretrained_file:
            raise ConfigurationError(
                f"You passed pretrained embedding file {extension_pretrained_file} "
                f"for model_path {model_path} but it's not available.")
        # Case 3: user didn't pass extension_pretrained_file, but pretrained_file attribute was
        # saved during training and is available.
        elif is_url_or_existing_file(self._pretrained_file):
            extension_pretrained_file = self._pretrained_file
        # Case 4: no file is available, hope that pretrained embeddings weren't used in the first place and warn
        elif self._pretrained_file is not None:
            # Warn here instead of raising an exception, to allow fine-tuning even without the original pretrained_file
            logger.warning(
                f"Embedding at model_path, {model_path} cannot locate the pretrained_file. "
                f"Originally pretrained_file was at '{self._pretrained_file}'."
            )
        else:
            # When loading a model from an archive there is no way to tell whether a pretrained-file
            # was or wasn't used during the original training, so we just log an info message here.
            logger.info(
                "If you are fine-tuning and want to use a pretrained_file for "
                "embedding extension, please pass the mapping by --embedding-sources argument."
            )

        embedding_dim = self.weight.data.shape[-1]
        if not extension_pretrained_file:
            extra_num_embeddings = extended_num_embeddings - self.num_embeddings
            extra_weight = torch.FloatTensor(extra_num_embeddings,
                                             embedding_dim)
            torch.nn.init.xavier_uniform_(extra_weight)
        else:
            # It's easiest to just reload the embeddings for the entire vocab,
            # then only keep the ones we need.
            whole_weight = _read_pretrained_embeddings_file(
                extension_pretrained_file, embedding_dim, extended_vocab,
                vocab_namespace)
            extra_weight = whole_weight[self.num_embeddings:, :]

        device = self.weight.data.device
        extended_weight = torch.cat(
            [self.weight.data, extra_weight.to(device)], dim=0)
        self.weight = torch.nn.Parameter(
            extended_weight, requires_grad=self.weight.requires_grad)
        self.num_embeddings = extended_num_embeddings
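
A hedged usage sketch of extend_vocab on the AllenNLP side; the vocabulary folders, the GloVe path, and the embedding sizes below are placeholders, not values taken from the source:

from allennlp.data import Vocabulary
from allennlp.modules.token_embedders import Embedding

# Vocabulary and embedding as they would come out of a trained archive (paths are placeholders).
vocab = Vocabulary.from_files("/path/to/original/vocabulary")
embedding = Embedding(
    embedding_dim=300,
    num_embeddings=vocab.get_vocab_size("tokens"),
    vocab_namespace="tokens",
)

# Vocabulary that was extended with tokens from the fine-tuning data.
extended_vocab = Vocabulary.from_files("/path/to/extended/vocabulary")

# New rows are read from the pretrained file when it is available (Cases 1/3 above),
# otherwise they are xavier-uniform initialized.
embedding.extend_vocab(
    extended_vocab,
    vocab_namespace="tokens",
    extension_pretrained_file="/path/to/glove.840B.300d.txt",
    model_path="_text_field_embedder.token_embedder_tokens",
)
assert embedding.num_embeddings == extended_vocab.get_vocab_size("tokens")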
Example #3
    def extend_vocab(self,  # pylint: disable=arguments-differ
                     extended_vocab: Vocabulary,
                     vocab_namespace: str = None,
                     extension_pretrained_file: str = None,
                     model_path: str = None):
        """
        Extends the embedding matrix according to the extended vocabulary.
        If extension_pretrained_file is available, it will be used for initializing the
        embeddings of the new words in the extended vocabulary; otherwise we will check if the
        _pretrained_file attribute is already available. If neither is available, the new
        embeddings will be initialized with xavier uniform.

        Parameters
        ----------
        extended_vocab : Vocabulary
            Vocabulary extended from original vocabulary used to construct
            this ``Embedding``.
        vocab_namespace : str, (optional, default=None)
            In case you know what vocab_namespace should be used for extension, you
            can pass it. If not passed, it will check if vocab_namespace used at the
            time of ``Embedding`` construction is available. If so, this namespace
            will be used or else default 'tokens' namespace will be used.
        extension_pretrained_file : str, (optional, default=None)
            A file containing pretrained embeddings can be specified here. It can be
            the path to a local file or a URL of a (cached) remote file. Check format
            details in ``from_params`` of ``Embedding`` class.
        model_path : str, (optional, default=None)
            Path traversing the model attributes up to this embedding module.
            E.g. "_text_field_embedder.token_embedder_tokens". This is only useful
            to give a helpful error message when extend_vocab is implicitly called
            by fine-tune or any other command.
        """
        # Caveat: For allennlp v0.8.1 and below, we weren't storing vocab_namespace as an attribute,
        # which we need to know at the time of embedding vocab extension. So old archive models are
        # currently unextendable unless the user used the default vocab_namespace 'tokens' for them.

        vocab_namespace = vocab_namespace or self._vocab_namespace
        if not vocab_namespace:
            vocab_namespace = "tokens"
            logging.warning("No vocab_namespace provided to Embedder.extend_vocab. Defaulting to 'tokens'.")

        extended_num_embeddings = extended_vocab.get_vocab_size(vocab_namespace)
        if extended_num_embeddings <= self.num_embeddings:
            # It's already been extended. No need to initialize / read pretrained file in first place (no-op)
            return

        # Case 1: user passed extension_pretrained_file and it's available.
        if extension_pretrained_file and is_url_or_existing_file(extension_pretrained_file):
            # Don't have to do anything here, this is the happy case.
            pass
        # Case 2: user passed extension_pretrained_file and it's not available
        elif extension_pretrained_file:
            raise ConfigurationError(f"You passed pretrained embedding file {extension_pretrained_file} "
                                     f"for model_path {model_path} but it's not available.")
        # Case 3: user didn't pass extension_pretrained_file, but pretrained_file attribute was
        # saved during training and is available.
        elif is_url_or_existing_file(self._pretrained_file):
            extension_pretrained_file = self._pretrained_file
        # Case 4: no file is available, hope that pretrained embeddings weren't used in the first place and warn
        else:
            extra_info = (f"Originally pretrained_file was at "
                          f"{self._pretrained_file}. " if self._pretrained_file else "")
            # It's better to warn here and not give error because there is no way to distinguish between
            # whether pretrained-file wasn't used during training or user forgot to pass / passed incorrect
            # mapping. Raising an error would prevent fine-tuning in the former case.
            logging.warning(f"Embedding at model_path, {model_path} cannot locate the pretrained_file. "
                            f"{extra_info} If you are fine-tuning and want to use using pretrained_file for "
                            f"embedding extension, please pass the mapping by --embedding-sources argument.")

        embedding_dim = self.weight.data.shape[-1]
        if not extension_pretrained_file:
            extra_num_embeddings = extended_num_embeddings - self.num_embeddings
            extra_weight = torch.FloatTensor(extra_num_embeddings, embedding_dim)
            torch.nn.init.xavier_uniform_(extra_weight)
        else:
            # It's easiest to just reload the embeddings for the entire vocab,
            # then only keep the ones we need.
            whole_weight = _read_pretrained_embeddings_file(extension_pretrained_file, embedding_dim,
                                                            extended_vocab, vocab_namespace)
            extra_weight = whole_weight[self.num_embeddings:, :]

        extended_weight = torch.cat([self.weight.data, extra_weight], dim=0)
        self.weight = torch.nn.Parameter(extended_weight, requires_grad=self.weight.requires_grad)
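
Beyond the namespace fallback to 'tokens', the core of both versions is the same tensor bookkeeping. A small, self-contained sketch of the "no pretrained file" path, with made-up sizes:

import torch

old_num_embeddings, embedding_dim = 100, 50
extended_num_embeddings = 120  # made-up sizes, for illustration only

weight = torch.nn.Parameter(torch.randn(old_num_embeddings, embedding_dim))

# Rows for the newly added vocabulary entries are xavier-uniform initialized.
extra_weight = torch.FloatTensor(extended_num_embeddings - old_num_embeddings, embedding_dim)
torch.nn.init.xavier_uniform_(extra_weight)

# Concatenate old and new rows and wrap them back into a Parameter,
# preserving requires_grad, just as extend_vocab does.
extended_weight = torch.cat([weight.data, extra_weight], dim=0)
weight = torch.nn.Parameter(extended_weight, requires_grad=weight.requires_grad)
assert weight.shape == (extended_num_embeddings, embedding_dim)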
Example #4
    def _prepare_vocab(
        self,
        vocabulary_folder: Optional[str] = None,
        vocab_config: Optional[Union[str,
                                     VocabularyConfiguration]] = "default",
        training_data: Optional[Dataset] = None,
        lazy: bool = False,
    ):
        """Prepare and set the vocab for a training or learning rate scan.

        Parameters
        ----------
        vocabulary_folder
            If specified, load the vocab from this folder
        vocab_config
            A `VocabularyConfiguration` to create/extend the pipeline's vocabulary if necessary.
            If 'default' (str), we will use the default configuration
            `VocabularyConfiguration(datasets=[training_data])`.
            If None, we will leave the pipeline's vocabulary untouched.
        training_data
            The training data in case we need to construct the default config
        lazy
            If true, dataset instances are lazily loaded from disk, otherwise they are loaded and kept in memory.
        """
        # The transformers feature comes with its own vocab, no need to prepare anything if it is the only feature
        if self.config.features.configured_namespaces == [
                TransformersFeatures.namespace
        ]:
            return

        # If the vocab is empty, we assume this is an untrained pipeline
        # and we want to raise an error if the weights file is not found.
        # Extending the vocab with a non-existent weights file only throws a warning.
        try:
            assert is_url_or_existing_file(
                Path(self.config.features.word.weights_file))
        except AssertionError:
            if vocabulary.is_empty(self.vocab, [WordFeatures.namespace]):
                raise FileNotFoundError(
                    f"Cannot find the weights file {self.config.features.word.weights_file}"
                )
        # no word feature, or weights_file is None
        except (AttributeError, TypeError):
            pass

        if vocabulary_folder is not None:
            self._model.extend_vocabulary(
                Vocabulary.from_files(vocabulary_folder))
            vocab_config = None

        vocab_config = (VocabularyConfiguration(datasets=[training_data])
                        if vocab_config == "default" else vocab_config)
        if vocab_config is not None:
            vocab = vocab_config.build_vocab(pipeline=self, lazy=lazy)
            self._model.extend_vocabulary(vocab)

        if vocabulary.is_empty(self.vocab,
                               self.config.features.configured_namespaces):
            raise EmptyVocabError(
                "All your features need a non-empty vocabulary for a training!"
            )
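
A hedged sketch of how _prepare_vocab might be driven from the biome.text side; Pipeline.from_yaml, Dataset.from_csv, and the file names are assumptions for illustration rather than calls taken from the source:

from biome.text import Dataset, Pipeline

# Assumed constructors and paths, for illustration only.
pipeline = Pipeline.from_yaml("pipeline.yml")
train_ds = Dataset.from_csv("train.data.csv")

# With vocab_config="default" this resolves to
# VocabularyConfiguration(datasets=[train_ds]) and extends the model's vocabulary;
# passing vocabulary_folder instead would load a previously saved vocab.
pipeline._prepare_vocab(vocab_config="default", training_data=train_ds)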