Example #1
 def print_prediction(self, prediction, params=None, stream=None):
     if params is None:
         params = {}
     with_scores = params.get("with_scores")
     alignment_type = params.get("with_alignments")
     if alignment_type and "alignment" not in prediction:
         raise ValueError(
             "with_alignments is set but the model did not return alignment information"
         )
     num_hypotheses = len(prediction["log_probs"])
     for i in range(num_hypotheses):
         # Detokenize the hypothesis when tokens are returned, otherwise fall
         # back to the raw text field.
         if "tokens" in prediction:
             target_length = prediction["length"][i]
             tokens = prediction["tokens"][i][:target_length]
             sentence = self.labels_inputter.tokenizer.detokenize(tokens)
         else:
             sentence = prediction["text"][i]
         score = None
         attention = None
         if with_scores:
             score = prediction["log_probs"][i]
         if alignment_type:
             # The alignment matrix is sliced to the target length computed
             # from the tokenized output above.
             attention = prediction["alignment"][i][:target_length]
         sentence = misc.format_translation_output(
             sentence,
             score=score,
             attention=attention,
             alignment_type=alignment_type,
         )
         misc.print_as_bytes(sentence, stream=stream)
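This and the following examples all end by calling misc.print_as_bytes, which is not shown on this page. As a point of reference, here is a minimal sketch of what such a helper could look like, assuming it only encodes the text as UTF-8 and writes it, plus a newline, to the stream's binary buffer; the actual helper in OpenNMT-tf's misc module may differ.

import sys


def print_as_bytes(text, stream=None):
    # Hypothetical sketch, not the library implementation: write the text to
    # the stream as UTF-8 bytes followed by a newline.
    if stream is None:
        stream = sys.stdout
    if isinstance(text, str):
        text = text.encode("utf-8")
    # Text streams such as sys.stdout expose a binary buffer; binary streams
    # are written to directly.
    write_buffer = getattr(stream, "buffer", stream)
    write_buffer.write(text)
    write_buffer.write(b"\n")
    stream.flush()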
Example #2
  def detokenize_stream(self, input_stream=sys.stdin, output_stream=sys.stdout, delimiter=" "):
    """Detokenizes a stream of sentences.

    Args:
      input_stream: The input stream.
      output_stream: The output stream.
      delimiter: The token delimiter used for text serialization.
    """
    for line in input_stream:
      tokens = line.strip().split(delimiter)
      string = self.detokenize(tokens)
      misc.print_as_bytes(string, stream=output_stream)
Example #3
  def tokenize_stream(self, input_stream=sys.stdin, output_stream=sys.stdout, delimiter=" "):
    """Tokenizes a stream of sentences.

    Args:
      input_stream: The input stream.
      output_stream: The output stream.
      delimiter: The token delimiter to use for text serialization.
    """
    for line in input_stream:
      line = line.strip()
      tokens = self.tokenize(line)
      merged_tokens = delimiter.join(tokens)
      misc.print_as_bytes(merged_tokens, stream=output_stream)
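For context, a possible way to drive the streaming methods above; the tokenizer class used here is only a stand-in assumption for whichever tokenizer defines tokenize_stream and detokenize_stream:

import sys

import opennmt

# Hypothetical usage sketch: read sentences from stdin and write the
# space-joined tokens to stdout, one line per sentence.
tokenizer = opennmt.tokenizers.SpaceTokenizer()
tokenizer.tokenize_stream(input_stream=sys.stdin, output_stream=sys.stdout)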
Example #4
 def print_score(self, score, params=None, stream=None):
     if params is None:
         params = {}
     length = score["length"]
     tokens = score["tokens"][:length]
     sentence = self.decoder_inputter.tokenizer.detokenize(tokens)
     token_level_scores = None
     attention = None
     if params.get("with_token_level"):
         token_level_scores = score["cross_entropy"][:length]
     if "attention" in score:
         attention = score["attention"][:length]
     alignment_type = params.get("with_alignments")
     sentence = misc.format_translation_output(
         sentence,
         score=score["score"],
         token_level_scores=token_level_scores,
         attention=attention,
         alignment_type=alignment_type)
     misc.print_as_bytes(sentence, stream=stream)
Example #5
    def tokenize_stream(
        self,
        input_stream=sys.stdin,
        output_stream=sys.stdout,
        delimiter=" ",
        training=True,
    ):
        """Tokenizes a stream of sentences.

        Args:
          input_stream: The input stream.
          output_stream: The output stream.
          delimiter: The token delimiter to use for text serialization.
          training: Set to ``False`` to tokenize for inference.
        """
        for line in input_stream:
            line = line.strip()
            tokens = self.tokenize(line, training=training)
            merged_tokens = delimiter.join(tokens)
            misc.print_as_bytes(merged_tokens, stream=output_stream)
Example #6
def _process_stream_as_dataset(
    input_stream,
    output_stream,
    map_func,
    batch_size=512,
    num_parallel_calls=4,
):
    # Wrap the line iterator in a dataset of scalar strings.
    dataset = tf.data.Dataset.from_generator(
        lambda: input_stream,
        output_types=tf.string,
        output_shapes=tf.TensorShape([]),
    )
    # Batch the lines and apply map_func to each batch in parallel.
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(map_func, num_parallel_calls=num_parallel_calls)

    # map_func must map a batch of strings to a batch of strings.
    expected_spec = tf.TensorSpec(shape=[None], dtype=tf.string)
    if dataset.element_spec != expected_spec:
        raise TypeError(
            "Expected map_func to produce elements with spec %s, but got spec %s instead"
            % (expected_spec, dataset.element_spec))

    # Write each processed line to the output stream as bytes.
    for lines in dataset.as_numpy_iterator():
        for line in lines:
            misc.print_as_bytes(line, stream=output_stream)
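A possible call of the helper above, assuming it and the misc module it relies on are importable in scope; the map function must take a batch of strings (a 1-D tf.string tensor) and return a batch of strings, which tf.strings.lower satisfies:

import sys

import tensorflow as tf

# Hypothetical usage sketch: lowercase every line of a (made-up) input file
# and print the results to stdout through the tf.data pipeline above.
with open("input.txt") as input_file:
    _process_stream_as_dataset(
        input_file,
        sys.stdout,
        tf.strings.lower,
        batch_size=256,
    )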
Example #7
 def print_prediction(self, prediction, params=None, stream=None):
     misc.print_as_bytes(prediction["classes"], stream=stream)
Example #8
 def print_prediction(self, prediction, params=None, stream=None):
     target_length = prediction["length"]
     tokens = prediction["tokens"][:target_length]
     sentence = self.examples_inputter.tokenizer.detokenize(tokens)
     sentence = misc.format_translation_output(sentence)
     misc.print_as_bytes(sentence, stream=stream)
Example #9
 def print_prediction(self, prediction, params=None, stream=None):
     tags = prediction["tags"][:prediction["length"]]
     sent = b" ".join(tags)
     misc.print_as_bytes(sent, stream=stream)