Example #1
def make_tokenizer(config=None):
  """Creates a tokenizer instance from the configuration.

  Args:
    config: Path to a configuration file or the configuration dictionary.

  Returns:
    A :class:`opennmt.tokenizers.Tokenizer` instance.

  Raises:
    ValueError: if :obj:`config` is invalid.
  """
  if config:
    if isinstance(config, str) and tf.io.gfile.exists(config):
      with tf.io.gfile.GFile(config, mode="rb") as config_file:
        config = yaml.load(config_file, Loader=yaml.UnsafeLoader)
    if isinstance(config, dict):
      tokenizer_type = config.get("type")
      tokenizer_params = config.get("params", {})
      if tokenizer_type is None:
        tokenizer = OpenNMTTokenizer(**config)
      else:
        tokenizer_class = getattr(sys.modules[__name__], tokenizer_type, None)
        if tokenizer_class is None:
          raise ValueError("Invalid tokenizer type: %s" % tokenizer_type)
        tokenizer = tokenizer_class(**tokenizer_params)
    else:
      raise ValueError("Invalid tokenization configuration: %s" % str(config))
  else:
    # If the tokenization was not configured, we assume that an external tokenization
    # was used and we don't include the tokenizer in the exported graph.
    tokenizer = SpaceTokenizer(in_graph=False)
  return tokenizer
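A minimal usage sketch of the dispatch above (the `CharacterTokenizer` name, the YAML path, and the `mode` option are assumptions; any tokenizer class resolvable in the module would do):

# Dict config with an explicit "type": the class is resolved via getattr() on this module.
tokenizer = make_tokenizer({"type": "CharacterTokenizer"})

# Dict config without "type": the whole dict becomes OpenNMTTokenizer keyword arguments.
tokenizer = make_tokenizer({"mode": "conservative"})  # assumed pyonmttok option

# String config: treated as a path to a YAML file, loaded, then handled as above.
tokenizer = make_tokenizer("config/tokenization.yml")  # hypothetical path

# No config: fall back to an out-of-graph SpaceTokenizer (external tokenization assumed).
tokenizer = make_tokenizer()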
Example #2
    def __init__(self,
                 vocabulary_file_key,
                 embedding_size,
                 num_outputs,
                 kernel_size=5,
                 stride=3,
                 dropout=0.0,
                 tokenizer=SpaceTokenizer(),
                 dtype=tf.float32):
        """Initializes the parameters of the character convolution embedder.

    Args:
      vocabulary_file_key: The meta configuration key of the vocabulary file
        containing one character per line.
      embedding_size: The size of the character embedding.
      num_outputs: The dimension of the convolution output space.
      kernel_size: Length of the convolution window.
      stride: Length of the convolution stride.
      dropout: The probability to drop units in the embedding.
      tokenizer: An optional :class:`opennmt.tokenizers.tokenizer.Tokenizer` to
        tokenize the input text.
      dtype: The embedding type.
    """
        super(CharConvEmbedder, self).__init__(vocabulary_file_key,
                                               embedding_size,
                                               dropout=dropout,
                                               tokenizer=tokenizer,
                                               dtype=dtype)
        self.num_outputs = num_outputs
        self.kernel_size = kernel_size
        self.stride = stride
        self.num_oov_buckets = 1
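A construction sketch, assuming `source_chars_vocabulary` is a key in the meta configuration that points at a character vocabulary file:

embedder = CharConvEmbedder(
    "source_chars_vocabulary",  # hypothetical configuration key
    embedding_size=30,
    num_outputs=512,
    kernel_size=5,
    stride=3,
    dropout=0.1)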
Example #3
    def __init__(self,
                 vocabulary_file_key,
                 embedding_size=None,
                 embedding_file_key=None,
                 embedding_file_with_header=True,
                 case_insensitive_embeddings=True,
                 trainable=True,
                 dropout=0.0,
                 tokenizer=SpaceTokenizer(),
                 dtype=tf.float32):
        """Initializes the parameters of the word embedder.

    Args:
      vocabulary_file_key: The data configuration key of the vocabulary file
        containing one word per line.
      embedding_size: The size of the resulting embedding.
        If ``None``, an embedding file must be provided.
      embedding_file_key: The data configuration key of the embedding file.
      embedding_file_with_header: ``True`` if the embedding file starts with a
        header line like in GloVe embedding files.
      case_insensitive_embeddings: ``True`` if embeddings are trained on
        lowercase data.
      trainable: If ``False``, do not optimize embeddings.
      dropout: The probability to drop units in the embedding.
      tokenizer: An optional :class:`opennmt.tokenizers.tokenizer.Tokenizer` to
        tokenize the input text.
      dtype: The embedding type.

    Raises:
      ValueError: if neither :obj:`embedding_size` nor :obj:`embedding_file_key`
        are set.

    See Also:
      The :meth:`opennmt.inputters.text_inputter.load_pretrained_embeddings`
      function for details about the pretrained embedding format and behavior.
    """
        super(WordEmbedder, self).__init__(tokenizer=tokenizer, dtype=dtype)

        self.vocabulary_file_key = vocabulary_file_key
        self.embedding_size = embedding_size
        self.embedding_file_key = embedding_file_key
        self.embedding_file_with_header = embedding_file_with_header
        self.case_insensitive_embeddings = case_insensitive_embeddings
        self.trainable = trainable
        self.dropout = dropout
        self.num_oov_buckets = 1

        if embedding_size is None and embedding_file_key is None:
            raise ValueError(
                "Must either provide embedding_size or embedding_file_key")
Example #4
def make_tokenizer(config):
    """Creates a tokenizer instance from the configuration.

  Args:
    config: Path to a configuration file or the configuration dictionary.

  Returns:
    A :class:`opennmt.tokenizers.Tokenizer` instance.
  """
    if config:
        if isinstance(config, six.string_types) and tf.io.gfile.exists(config):
            with tf.io.gfile.GFile(config, mode="rb") as config_file:
                config = yaml.load(config_file, Loader=yaml.UnsafeLoader)
        tokenizer = OpenNMTTokenizer(**config)
    else:
        tokenizer = SpaceTokenizer()
    return tokenizer
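This reads as an earlier revision of example #1: there is no "type" dispatch (every dict is forwarded to `OpenNMTTokenizer`), no ValueError path for bad configs, and the no-config fallback is an in-graph `SpaceTokenizer`. A sketch of the corresponding calls (the option name is an assumed pyonmttok argument):

tokenizer = make_tokenizer({"mode": "aggressive"})  # dict is forwarded to OpenNMTTokenizer
tokenizer = make_tokenizer(None)                    # falls back to in-graph SpaceTokenizer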
Example #5
    def __init__(self,
                 vocabulary_file_key,
                 embedding_size,
                 num_units,
                 dropout=0.2,
                 encoding="average",
                 cell_class=tf.contrib.rnn.LSTMCell,
                 tokenizer=SpaceTokenizer(),
                 dtype=tf.float32):
        """Initializes the parameters of the character RNN embedder.

    Args:
      vocabulary_file_key: The meta configuration key of the vocabulary file
        containing one character per line.
      embedding_size: The size of the character embedding.
      num_units: The number of units in the RNN layer.
      dropout: The probability to drop units in the embedding and the RNN
        outputs.
      encoding: "average" or "last" (case insensitive), the encoding vector to
        extract from the RNN outputs.
      cell_class: The inner cell class or a callable taking :obj:`num_units` as
        argument and returning a cell.
      tokenizer: An optional :class:`opennmt.tokenizers.tokenizer.Tokenizer` to
        tokenize the input text.
      dtype: The embedding type.

    Raises:
      ValueError: if :obj:`encoding` is invalid.
    """
        super(CharRNNEmbedder, self).__init__(vocabulary_file_key,
                                              embedding_size,
                                              dropout=dropout,
                                              tokenizer=tokenizer,
                                              dtype=dtype)
        self.num_units = num_units
        self.cell_class = cell_class
        self.encoding = encoding.lower()
        if self.encoding not in ("average", "last"):
            raise ValueError("Invalid encoding vector: {}".format(
                self.encoding))
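A construction sketch; note that the default `cell_class=tf.contrib.rnn.LSTMCell` marks this as TF1-era code, since `tf.contrib` was removed in TensorFlow 2 (the configuration key is an assumption):

embedder = CharRNNEmbedder(
    "source_chars_vocabulary",  # hypothetical configuration key
    embedding_size=30,
    num_units=256,
    encoding="last")

# Anything other than "average"/"last" fails the check above:
# CharRNNEmbedder("source_chars_vocabulary", 30, 256, encoding="max")  # ValueError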
Example #6
    def __init__(self,
                 vocabulary_file_key,
                 embedding_size,
                 dropout=0.0,
                 tokenizer=SpaceTokenizer(),
                 dtype=tf.float32):
        """Initializes the parameters of the character embedder.

    Args:
      vocabulary_file_key: The meta configuration key of the vocabulary file
        containing one character per line.
      embedding_size: The size of the character embedding.
      dropout: The probability to drop units in the embedding.
      tokenizer: An optional :class:`opennmt.tokenizers.tokenizer.Tokenizer` to
        tokenize the input text.
      dtype: The embedding type.
    """
        super(CharEmbedder, self).__init__(tokenizer=tokenizer, dtype=dtype)

        self.vocabulary_file_key = vocabulary_file_key
        self.embedding_size = embedding_size
        self.dropout = dropout
        self.num_oov_buckets = 1
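This appears to be the shared base that examples #2 and #5 extend via `super()`. A hypothetical subclass sketch, assuming the base only stores the common character-level attributes:

class MyCharEmbedder(CharEmbedder):
    """Hypothetical subclass reusing the shared character-level setup."""

    def __init__(self, vocabulary_file_key, embedding_size):
        # Same positional/keyword arguments as CharEmbedder.__init__ above.
        super(MyCharEmbedder, self).__init__(
            vocabulary_file_key, embedding_size, dropout=0.1)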
Example #7
    def __init__(self, tokenizer=SpaceTokenizer(), dtype=tf.float32):
        super(TextInputter, self).__init__(dtype=dtype)
        self.tokenizer = tokenizer
Example #8
    def __init__(self, tokenizer=SpaceTokenizer()):
        super(TextInputter, self).__init__()
        self.tokenizer = tokenizer
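Examples #7 and #8 differ only in whether `dtype` is configurable on the inputter. One caveat worth a sketch (assuming `TextInputter` can be constructed directly here): the default `tokenizer=SpaceTokenizer()` is evaluated once at function definition time, so inputters built without an explicit tokenizer share a single instance:

a = TextInputter()
b = TextInputter()
assert a.tokenizer is b.tokenizer  # same shared SpaceTokenizer instance

# Passing a tokenizer explicitly gives each inputter its own instance:
c = TextInputter(tokenizer=SpaceTokenizer())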