def make_tokenizer(config=None):
    """Builds a tokenizer instance from a configuration.

    Args:
      config: Path to a configuration file or the configuration dictionary.

    Returns:
      A :class:`opennmt.tokenizers.Tokenizer` instance.

    Raises:
      ValueError: if :obj:`config` is invalid.
    """
    if not config:
        # No tokenization was configured: assume an external tokenization was
        # used and keep the tokenizer out of the exported graph.
        return SpaceTokenizer(in_graph=False)
    if isinstance(config, str) and tf.io.gfile.exists(config):
        with tf.io.gfile.GFile(config, mode="rb") as config_file:
            # NOTE(review): UnsafeLoader can execute arbitrary YAML tags --
            # only load trusted configuration files.
            config = yaml.load(config_file, Loader=yaml.UnsafeLoader)
    if not isinstance(config, dict):
        raise ValueError("Invalid tokenization configuration: %s" % str(config))
    tokenizer_type = config.get("type")
    if tokenizer_type is None:
        # Without an explicit type, the whole dict is OpenNMTTokenizer options.
        return OpenNMTTokenizer(**config)
    tokenizer_class = getattr(sys.modules[__name__], tokenizer_type, None)
    if tokenizer_class is None:
        raise ValueError("Invalid tokenizer type: %s" % tokenizer_type)
    return tokenizer_class(**config.get("params", {}))
def __init__(self,
             vocabulary_file_key,
             embedding_size,
             num_outputs,
             kernel_size=5,
             stride=3,
             dropout=0.0,
             tokenizer=SpaceTokenizer(),
             dtype=tf.float32):
    """Initializes the parameters of the character convolution embedder.

    Args:
      vocabulary_file_key: The meta configuration key of the vocabulary file
        containing one character per line.
      embedding_size: The size of the character embedding.
      num_outputs: The dimension of the convolution output space.
      kernel_size: Length of the convolution window.
      stride: Length of the convolution stride.
      dropout: The probability to drop units in the embedding.
      tokenizer: An optional :class:`opennmt.tokenizers.tokenizer.Tokenizer` to
        tokenize the input text.
      dtype: The embedding type.
    """
    super(CharConvEmbedder, self).__init__(
        vocabulary_file_key,
        embedding_size,
        dropout=dropout,
        tokenizer=tokenizer,
        dtype=dtype)
    # Convolution hyperparameters.
    self.kernel_size = kernel_size
    self.stride = stride
    self.num_outputs = num_outputs
    # Single bucket for out-of-vocabulary characters.
    self.num_oov_buckets = 1
def __init__(self,
             vocabulary_file_key,
             embedding_size=None,
             embedding_file_key=None,
             embedding_file_with_header=True,
             case_insensitive_embeddings=True,
             trainable=True,
             dropout=0.0,
             tokenizer=SpaceTokenizer(),
             dtype=tf.float32):
    """Initializes the parameters of the word embedder.

    Args:
      vocabulary_file_key: The data configuration key of the vocabulary file
        containing one word per line.
      embedding_size: The size of the resulting embedding. If ``None``, an
        embedding file must be provided.
      embedding_file_key: The data configuration key of the embedding file.
      embedding_file_with_header: ``True`` if the embedding file starts with a
        header line like in GloVe embedding files.
      case_insensitive_embeddings: ``True`` if embeddings are trained on
        lowercase data.
      trainable: If ``False``, do not optimize embeddings.
      dropout: The probability to drop units in the embedding.
      tokenizer: An optional :class:`opennmt.tokenizers.tokenizer.Tokenizer` to
        tokenize the input text.
      dtype: The embedding type.

    Raises:
      ValueError: if neither :obj:`embedding_size` nor
        :obj:`embedding_file_key` are set.

    See Also:
      The :meth:`opennmt.inputters.text_inputter.load_pretrained_embeddings`
      function for details about the pretrained embedding format and behavior.
    """
    # Fail fast: validate the configuration before running the base
    # constructor or assigning any attribute, so an invalid embedder is
    # rejected without side effects (the original raised only after full
    # initialization).
    if embedding_size is None and embedding_file_key is None:
        raise ValueError(
            "Must either provide embedding_size or embedding_file_key")
    super(WordEmbedder, self).__init__(tokenizer=tokenizer, dtype=dtype)
    self.vocabulary_file_key = vocabulary_file_key
    self.embedding_size = embedding_size
    self.embedding_file_key = embedding_file_key
    self.embedding_file_with_header = embedding_file_with_header
    self.case_insensitive_embeddings = case_insensitive_embeddings
    self.trainable = trainable
    self.dropout = dropout
    # Single bucket for out-of-vocabulary words.
    self.num_oov_buckets = 1
def make_tokenizer(config):
    """Creates a tokenizer instance from the configuration.

    Args:
      config: Path to a configuration file or the configuration dictionary.

    Returns:
      A :class:`opennmt.tokenizers.Tokenizer` instance.
    """
    if not config:
        # Default to a whitespace tokenizer when nothing is configured.
        return SpaceTokenizer()
    if isinstance(config, six.string_types) and tf.io.gfile.exists(config):
        # The configuration is a path: read the actual options from the file.
        # NOTE(review): UnsafeLoader can execute arbitrary YAML tags -- only
        # load trusted configuration files.
        with tf.io.gfile.GFile(config, mode="rb") as config_file:
            config = yaml.load(config_file, Loader=yaml.UnsafeLoader)
    return OpenNMTTokenizer(**config)
def __init__(self,
             vocabulary_file_key,
             embedding_size,
             num_units,
             dropout=0.2,
             encoding="average",
             cell_class=tf.contrib.rnn.LSTMCell,
             tokenizer=SpaceTokenizer(),
             dtype=tf.float32):
    """Initializes the parameters of the character RNN embedder.

    Args:
      vocabulary_file_key: The meta configuration key of the vocabulary file
        containing one character per line.
      embedding_size: The size of the character embedding.
      num_units: The number of units in the RNN layer.
      dropout: The probability to drop units in the embedding and the RNN
        outputs.
      encoding: "average" or "last" (case insensitive), the encoding vector to
        extract from the RNN outputs.
      cell_class: The inner cell class or a callable taking :obj:`num_units`
        as argument and returning a cell.
      tokenizer: An optional :class:`opennmt.tokenizers.tokenizer.Tokenizer` to
        tokenize the input text.
      dtype: The embedding type.

    Raises:
      ValueError: if :obj:`encoding` is invalid.
    """
    # The comparison is case insensitive; only the lowered form is kept.
    encoding = encoding.lower()
    if encoding not in ("average", "last"):
        raise ValueError("Invalid encoding vector: {}".format(encoding))
    super(CharRNNEmbedder, self).__init__(
        vocabulary_file_key,
        embedding_size,
        dropout=dropout,
        tokenizer=tokenizer,
        dtype=dtype)
    self.num_units = num_units
    self.cell_class = cell_class
    self.encoding = encoding
def __init__(self,
             vocabulary_file_key,
             embedding_size,
             dropout=0.0,
             tokenizer=SpaceTokenizer(),
             dtype=tf.float32):
    """Initializes the parameters of the character embedder.

    Args:
      vocabulary_file_key: The meta configuration key of the vocabulary file
        containing one character per line.
      embedding_size: The size of the character embedding.
      dropout: The probability to drop units in the embedding.
      tokenizer: An optional :class:`opennmt.tokenizers.tokenizer.Tokenizer` to
        tokenize the input text.
      dtype: The embedding type.
    """
    super(CharEmbedder, self).__init__(tokenizer=tokenizer, dtype=dtype)
    self.embedding_size = embedding_size
    self.vocabulary_file_key = vocabulary_file_key
    self.dropout = dropout
    # Single bucket for out-of-vocabulary characters.
    self.num_oov_buckets = 1
def __init__(self, tokenizer=SpaceTokenizer(), dtype=tf.float32):
    """Initializes the text inputter.

    Args:
      tokenizer: The :class:`opennmt.tokenizers.tokenizer.Tokenizer` used to
        tokenize the input text.
      dtype: The data type, forwarded to the base inputter.
    """
    super(TextInputter, self).__init__(dtype=dtype)
    self.tokenizer = tokenizer
def __init__(self, tokenizer=SpaceTokenizer()):
    """Initializes the text inputter.

    Args:
      tokenizer: The :class:`opennmt.tokenizers.tokenizer.Tokenizer` used to
        tokenize the input text.
    """
    super(TextInputter, self).__init__()
    self.tokenizer = tokenizer