Example #1
    def load(self, path, file_format="default"):
        """Loads a serialized vocabulary.

        Args:
          path: The path to the vocabulary to load.
          file_format: The format of the vocabulary file. Can be: default,
            sentencepiece. "default" is simply one token per line.

        Raises:
          ValueError: if :obj:`file_format` is invalid.
        """
        with compat.gfile_open(path, mode="rb") as vocab:
            for line in vocab:
                if file_format == "default":
                    self.add(line[:-1])
                elif file_format == "sentencepiece":
                    token, _ = line.rstrip().split(b"\t")
                    if token in (
                            b"<unk>", b"<s>",
                            b"</s>"):  # Ignore SentencePiece special tokens.
                        continue
                    self.add(token)
                else:
                    raise ValueError("Invalid vocabulary format: %s" %
                                     file_format)
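To make the two file formats concrete, here is a self-contained sketch of the parsing logic above applied to in-memory lines; the sample data and the parse() helper are illustrations, not part of the library.

# Minimal sketch of the two vocabulary formats handled by load() above.
default_lines = [b"hello", b"world"]                      # "default": one token per line
sp_lines = [b"<unk>\t0", b"hello\t-2.5", b"world\t-3.1"]  # "sentencepiece": token<TAB>score

def parse(lines, file_format):
    tokens = []
    for line in lines:
        if file_format == "default":
            tokens.append(line)
        elif file_format == "sentencepiece":
            token, _ = line.rstrip().split(b"\t")
            if token in (b"<unk>", b"<s>", b"</s>"):
                continue  # skip SentencePiece special tokens, as load() does above
            tokens.append(token)
        else:
            raise ValueError("Invalid vocabulary format: %s" % file_format)
    return tokens

print(parse(default_lines, "default"))    # [b'hello', b'world']
print(parse(sp_lines, "sentencepiece"))   # [b'hello', b'world']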
Example #2
File: misc.py Project: Byramklc/OneCeviri
def count_lines(filename):
    """Returns the number of lines of the file :obj:`filename`."""
    with compat.gfile_open(filename, mode="rb") as f:
        i = 0
        for i, _ in enumerate(f):
            pass
        return i + 1
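A standalone illustration of the enumerate-based counting idiom used above, with an in-memory buffer instead of compat.gfile_open(); note that an empty file yields 1 because i keeps its initial value.

import io

def count_lines_demo(fileobj):
    # Same idiom as count_lines() above: enumerate the file object and keep
    # only the last index rather than reading all lines into memory.
    i = 0
    for i, _ in enumerate(fileobj):
        pass
    return i + 1

print(count_lines_demo(io.BytesIO(b"a\nb\nc\n")))  # 3
print(count_lines_demo(io.BytesIO(b"")))           # 1 (empty-file quirk of this idiom)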
Example #3
    def initialize(self, metadata, asset_dir=None, asset_prefix=""):
        """Initializes the tokenizer (e.g. load BPE models).

        Args:
          metadata: A dictionary containing additional metadata set
            by the user.
          asset_dir: The directory where assets can be written. If ``None``, no
            assets are returned.
          asset_prefix: The prefix to attach to asset filenames.

        Returns:
          A dictionary containing additional assets used by the tokenizer.
        """
        assets = {}
        if self._configuration_key is not None:
            configuration = metadata[self._configuration_key]
            if isinstance(configuration, dict):
                self._config = configuration
            else:
                with compat.gfile_open(configuration, mode="rb") as conf_file:
                    self._config = yaml.load(conf_file)
        if self._config and asset_dir is not None:
            asset_name = "%stokenizer_config.yml" % asset_prefix
            asset_path = os.path.join(asset_dir, asset_name)
            _make_config_asset_file(self._config, asset_path)
            assets[asset_name] = asset_path
        return assets
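initialize() above accepts the configuration either inline or as a path to a YAML file; a hedged sketch of both metadata shapes (the key name and tokenization options are illustrative, not fixed by the snippet):

# Inline configuration: used directly as self._config.
metadata_inline = {
    "source_tokenization": {"mode": "aggressive", "joiner_annotate": True},
}

# Path configuration: opened and parsed with yaml.load() by the branch above.
metadata_as_path = {
    "source_tokenization": "tokenization.yml",
}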
Example #4
    def load(self, path):
        """Loads a serialized vocabulary.

        Args:
          path: The path to the vocabulary to load.
        """
        with compat.gfile_open(path, mode="rb") as vocab:
            for token in vocab:
                self.add(tf.compat.as_text(token[:-1]))
Example #5
    def serialize(self, path):
        """Writes the vocabulary on disk.

        Args:
          path: The path where the vocabulary will be saved.
        """
        with compat.gfile_open(path, mode="wb") as vocab:
            for token in self._id_to_token:
                vocab.write(tf.compat.as_bytes(token))
                vocab.write(b"\n")
Example #6
    def initialize(self, metadata, asset_dir=None, asset_prefix=""):
        self.vocabulary_file = metadata[self.vocabulary_file_key]
        self.vocabulary_size = count_lines(self.vocabulary_file) + self.num_oov_buckets
        if self.tokenizer is None:
            tokenizer_config = _get_field(metadata, "tokenization", prefix=asset_prefix)
            if tokenizer_config:
                if (isinstance(tokenizer_config, six.string_types)
                        and compat.gfile_exists(tokenizer_config)):
                    with compat.gfile_open(tokenizer_config, mode="rb") as config_file:
                        tokenizer_config = yaml.load(config_file)
                self.tokenizer = tokenizers.OpenNMTTokenizer(params=tokenizer_config)
            else:
                self.tokenizer = tokenizers.SpaceTokenizer()
        self.tokenizer.initialize(metadata)
        return super(TextInputter, self).initialize(
            metadata, asset_dir=asset_dir, asset_prefix=asset_prefix)
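For reference, a hedged sketch of the kind of metadata dictionary this inputter reads; the key names are illustrative only, and vocabulary_file_key is whatever the inputter was constructed with:

metadata = {
    # Looked up through self.vocabulary_file_key; its line count (plus
    # num_oov_buckets) becomes self.vocabulary_size.
    "source_words_vocabulary": "src-vocab.txt",
    # Optional: picked up by _get_field(metadata, "tokenization", ...) above,
    # either inline or as a path to a YAML file.
    "source_tokenization": "tokenization.yml",
}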
Example #7
    def add_from_text(self, filename, tokenizer=None):
        """Fills the vocabulary from a text file.

        Args:
          filename: The file to load from.
          tokenizer: A callable to tokenize a line of text.
        """
        with compat.gfile_open(filename, mode="rb") as text:
            for line in text:
                line = tf.compat.as_text(line.strip())
                if tokenizer:
                    tokens = tokenizer.tokenize(line)
                else:
                    tokens = line.split()
                for token in tokens:
                    self.add(token)
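A standalone sketch of the fallback path above (no tokenizer callable, plain whitespace splitting); the set-based vocabulary below stands in for the real Vocab class:

lines = [b"hello world\n", b"hello again\n"]

vocab = []
seen = set()
for line in lines:
    text = line.strip().decode("utf-8")  # stands in for tf.compat.as_text()
    for token in text.split():           # fallback path when no tokenizer is given
        if token not in seen:
            seen.add(token)
            vocab.append(token)

print(vocab)  # ['hello', 'world', 'again']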
Example #8
def load_config(config_paths, config=None):
    """Loads configuration files.

    Args:
      config_paths: A list of configuration files.
      config: A (possibly non-empty) config dictionary to fill.

    Returns:
      The configuration dictionary.
    """
    if config is None:
        config = {}

    for config_path in config_paths:
        with compat.gfile_open(config_path, mode="rb") as config_file:
            subconfig = yaml.load(config_file.read(), Loader=yaml.UnsafeLoader)
            # Add or update section in main configuration.
            merge_dict(config, subconfig)

    return config
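load_config() folds each file into the running config via merge_dict(); a hedged sketch of a recursive merge with that effect (the helper below is an illustration, not the library's merge_dict, and the config keys are only example data):

def merge_dict_demo(dst, src):
    # Recursively merge src into dst: nested dicts are merged, other values
    # from src overwrite those in dst.
    for key, value in src.items():
        if isinstance(value, dict) and isinstance(dst.get(key), dict):
            merge_dict_demo(dst[key], value)
        else:
            dst[key] = value
    return dst

base = {"train": {"batch_size": 64, "save_checkpoints_steps": 5000}}
override = {"train": {"batch_size": 32}}
print(merge_dict_demo(base, override))
# {'train': {'batch_size': 32, 'save_checkpoints_steps': 5000}}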
Example #9
    def __init__(self, configuration_file_or_key=None, params=None):
        """Initializes the tokenizer.

        Args:
          configuration_file_or_key: The YAML configuration file or the key to
            the YAML configuration file.
        """
        self._configuration_key = None
        if params is not None:
            self._config = params
        else:
            self._config = {}
            if configuration_file_or_key is not None and compat.gfile_exists(
                    configuration_file_or_key):
                configuration_file = configuration_file_or_key
                with compat.gfile_open(configuration_file,
                                       mode="rb") as conf_file:
                    self._config = yaml.load(conf_file)
            else:
                self._configuration_key = configuration_file_or_key
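A hedged usage sketch of the three construction paths handled above, using the OpenNMTTokenizer seen in Example #6; the "mode" option and the file/key names are illustrative assumptions:

from opennmt import tokenizers

# 1) Inline parameters: stored directly as the configuration.
tok_inline = tokenizers.OpenNMTTokenizer(params={"mode": "conservative"})

# 2) An existing YAML file: parsed immediately with yaml.load().
tok_file = tokenizers.OpenNMTTokenizer(configuration_file_or_key="tokenization.yml")

# 3) Anything that is not an existing file is kept as a key and resolved later
#    from the user metadata in initialize().
tok_key = tokenizers.OpenNMTTokenizer(configuration_file_or_key="source_tokenization")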
Example #10
    def initialize(self, metadata, asset_dir=None, asset_prefix=""):
        """Initializes the tokenizer (e.g. load BPE models).

        Args:
          metadata: A dictionary containing additional metadata set
            by the user.
          asset_dir: The directory where assets can be written. If ``None``, no
            assets are returned.
          asset_prefix: The prefix to attach to asset filenames.

        Returns:
          A dictionary containing additional assets used by the tokenizer.
        """
        if self._configuration_key is not None:
            configuration = metadata[self._configuration_key]
            if isinstance(configuration, dict):
                self._config = configuration
            else:
                with compat.gfile_open(configuration, mode="rb") as conf_file:
                    self._config = yaml.load(conf_file)
        if asset_dir is not None:
            return self.export_assets(asset_dir, asset_prefix=asset_prefix)
        return {}
Example #11
def load_model(model_dir,
               model_file=None,
               model_name=None,
               serialize_model=True):
    """Loads the model from the catalog or a file.

    The model object is pickled in :obj:`model_dir` to make the model
    configuration optional for future runs.

    Args:
      model_dir: The model directory.
      model_file: An optional model configuration.
        Mutually exclusive with :obj:`model_name`.
      model_name: An optional model name from the catalog.
        Mutually exclusive with :obj:`model_file`.
      serialize_model: Serialize the model definition in the model directory.

    Returns:
      A :class:`opennmt.models.model.Model` instance.

    Raises:
      ValueError: if both :obj:`model_file` and :obj:`model_name` are set.
    """
    if model_file and model_name:
        raise ValueError("only one of model_file and model_name should be set")
    model_name_or_path = model_file or model_name
    model_description_path = os.path.join(model_dir, "model_description.py")

    # Also try to load the pickled model for backward compatibility.
    serial_model_file = os.path.join(model_dir, "model_description.pkl")

    if model_name_or_path:
        if tf.train.latest_checkpoint(model_dir) is not None:
            compat.logging.warn(
                "You provided a model configuration but a checkpoint already exists. "
                "The model configuration must define the same model as the one used for "
                "the initial training. However, you can change non structural values like "
                "dropout.")

        if model_file:
            model = load_model_from_file(model_file)
            if serialize_model:
                compat.gfile_copy(model_file,
                                  model_description_path,
                                  overwrite=True)
        elif model_name:
            model = load_model_from_catalog(model_name)
            if serialize_model:
                with compat.gfile_open(model_description_path,
                                       mode="w") as model_description_file:
                    model_description_file.write(
                        "from opennmt.models import catalog\n")
                    model_description_file.write("model = catalog.%s\n" %
                                                 model_name)
    elif compat.gfile_exists(model_description_path):
        compat.logging.info("Loading model description from %s",
                            model_description_path)
        model = load_model_from_file(model_description_path)
    elif compat.gfile_exists(serial_model_file):
        compat.logging.info("Loading serialized model description from %s",
                            serial_model_file)
        with compat.gfile_open(serial_model_file, mode="rb") as serial_model:
            model = pickle.load(serial_model)
    else:
        raise RuntimeError(
            "A model configuration is required: you probably need to "
            "set --model or --model_type on the command line.")

    return model
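A hedged usage sketch of the catalog branch above; "Transformer" stands for any model name available in opennmt.models.catalog, and the directory path is made up:

# Assumes load_model() above is importable; only one of model_file / model_name
# may be set, otherwise a ValueError is raised.
model = load_model("run/baseline", model_name="Transformer")

# The serialize_model branch then writes run/baseline/model_description.py with:
#   from opennmt.models import catalog
#   model = catalog.Transformer
# so later runs can reload the same model without --model or --model_type.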
Example #12
def load_pretrained_embeddings(embedding_file,
                               vocabulary_file,
                               num_oov_buckets=0,
                               with_header=True,
                               case_insensitive_embeddings=True):
    """Returns pretrained embeddings relative to the vocabulary.

    The :obj:`embedding_file` must have the following format:

    .. code-block:: text

        N M
        word1 val1 val2 ... valM
        word2 val1 val2 ... valM
        ...
        wordN val1 val2 ... valM

    or if :obj:`with_header` is ``False``:

    .. code-block:: text

        word1 val1 val2 ... valM
        word2 val1 val2 ... valM
        ...
        wordN val1 val2 ... valM

    This function will iterate on each embedding in :obj:`embedding_file` and
    assign the pretrained vector to the associated word in :obj:`vocabulary_file`
    if found. Otherwise, the embedding is ignored.

    If :obj:`case_insensitive_embeddings` is ``True``, word embeddings are assumed
    to be trained on lowercase data. In that case, word alignments are case
    insensitive, meaning the pretrained word embedding for "the" will be assigned
    to "the", "The", "THE", or any other case variants included in
    :obj:`vocabulary_file`.

    Args:
      embedding_file: Path to the embedding file. Entries will be matched against
        :obj:`vocabulary_file`.
      vocabulary_file: The vocabulary file containing one word per line.
      num_oov_buckets: The number of additional unknown tokens.
      with_header: ``True`` if the embedding file starts with a header line like
        in GloVe embedding files.
      case_insensitive_embeddings: ``True`` if embeddings are trained on lowercase
        data.

    Returns:
      A Numpy array of shape ``[vocabulary_size + num_oov_buckets, embedding_size]``.
    """
    # Map words to ids from the vocabulary.
    word_to_id = collections.defaultdict(list)
    with compat.gfile_open(vocabulary_file, mode="rb") as vocabulary:
        count = 0
        for word in vocabulary:
            word = word.strip()
            if case_insensitive_embeddings:
                word = word.lower()
            word_to_id[word].append(count)
            count += 1

    # Fill pretrained embedding matrix.
    with compat.gfile_open(embedding_file, mode="rb") as embedding:
        pretrained = None

        if with_header:
            next(embedding)

        for line in embedding:
            fields = line.strip().split()
            word = fields[0]

            if pretrained is None:
                pretrained = np.random.normal(size=(count + num_oov_buckets,
                                                    len(fields) - 1))

            # Lookup word in the vocabulary.
            if word in word_to_id:
                ids = word_to_id[word]
                for index in ids:
                    pretrained[index] = np.asarray(fields[1:])

    return pretrained
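A self-contained toy run of the matching logic above: a three-word vocabulary (with a case variant), a two-line embedding "file", and the resulting matrix shape; the inputs are inlined as plain strings instead of real file paths, and rows for unmatched words and OOV buckets keep their random initialization, as in the function.

import numpy as np

# Toy inputs standing in for vocabulary_file / embedding_file above
# (plain strings here; the real function reads bytes).
vocab_words = ["The", "the", "cat"]               # note the case variant
embedding_lines = ["the 0.1 0.2", "dog 0.3 0.4"]  # "dog" is not in the vocabulary

# Case-insensitive word -> ids mapping, as in the first loop above.
word_to_id = {}
for i, w in enumerate(vocab_words):
    word_to_id.setdefault(w.lower(), []).append(i)

num_oov_buckets = 1
pretrained = np.random.normal(size=(len(vocab_words) + num_oov_buckets, 2))

# Assign the pretrained vector to every case variant of a matching word;
# unmatched rows (here "cat" and the OOV bucket) keep their random values.
for line in embedding_lines:
    fields = line.split()
    for index in word_to_id.get(fields[0], []):
        pretrained[index] = np.asarray(fields[1:], dtype=np.float32)

print(pretrained.shape)  # (4, 2); rows 0 and 1 both hold [0.1, 0.2]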