Example #1
  def _init_subtokenizer(self, vocab_list):
    temp_file = tempfile.NamedTemporaryFile(delete=False)
    with tf.gfile.Open(temp_file.name, 'w') as w:
      for subtoken in vocab_list:
        w.write("'%s'" % subtoken)
        w.write("\n")
    return tokenizer.Subtokenizer(temp_file.name, reserved_tokens=[])
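A brief usage sketch for the helper above, assuming it lives on a tf.test.TestCase subclass; the vocabulary entries and test name are illustrative only:

  def test_encode_decode_roundtrip(self):
    vocab_list = ["123_", "test", "ing_"]  # a trailing "_" marks an end-of-word subtoken
    subtokenizer = self._init_subtokenizer(vocab_list)
    ids = subtokenizer.encode("testing 123")
    self.assertEqual("testing 123", subtokenizer.decode(ids))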
def translate_list(vocab, model_dir, params, contentList):
    translation_results = []

    subtokenizer = tokenizer.Subtokenizer(vocab)
    estimator = tf.estimator.Estimator(model_fn=transformer_main.model_fn,
                                       model_dir=model_dir,
                                       params=params)
    estimator_predictor = tf.contrib.predictor.from_estimator(
        estimator,
        export.build_tensor_serving_input_receiver_fn(shape=[None],
                                                      dtype=tf.int32,
                                                      batch_size=None))

    for content in contentList:
        try:
            tokens = _encode_and_add_eos(content, subtokenizer)
            predictions = estimator_predictor(
                {"input": np.array([tokens], dtype=np.int32)})
            translation = _trim_and_decode(predictions["outputs"][0],
                                           subtokenizer)
            translation_results.append(translation)

        except Exception as err:
            print("error in translation: %s" % err)

    return translation_results
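A hypothetical call to translate_list; the vocab path and model directory are placeholders, and PARAMS_MAP plus the _BEAM_SIZE/_ALPHA/_EXTRA_DECODE_LENGTH/_DECODE_BATCH_SIZE constants are assumed to come from the same modules used in the other examples:

params = transformer_main.PARAMS_MAP["base"]
params["beam_size"] = _BEAM_SIZE
params["alpha"] = _ALPHA
params["extra_decode_length"] = _EXTRA_DECODE_LENGTH
params["batch_size"] = _DECODE_BATCH_SIZE
results = translate_list("data/vocab.ende.32768", "/tmp/transformer_model",
                         params, ["Hello world.", "How are you?"])
for line in results:
    print(line)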
Example #3
def evaluate_and_log_bleu(model,
                          params,
                          bleu_source,
                          bleu_ref,
                          vocab_file,
                          distribution_strategy=None):
    """Calculate and record the BLEU score.

    Args:
      model: A Keras model, used to generate the translations.
      params: A dictionary, containing the translation related parameters.
      bleu_source: A file containing source sentences for translation.
      bleu_ref: A file containing the reference for the translated sentences.
      vocab_file: A file containing the vocabulary for translation.
      distribution_strategy: A platform distribution strategy, used for TPU-based
        translation.

    Returns:
      uncased_score: A float, the case insensitive BLEU score.
      cased_score: A float, the case sensitive BLEU score.
    """
    subtokenizer = tokenizer.Subtokenizer(vocab_file)

    uncased_score, cased_score = translate_and_compute_bleu(
        model, params, subtokenizer, bleu_source, bleu_ref, distribution_strategy)

    logging.info("Bleu score (uncased): %s", uncased_score)
    logging.info("Bleu score (cased): %s", cased_score)
    return uncased_score, cased_score
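A hypothetical invocation of the function above; the Keras model, params dictionary, and file paths are placeholders assumed to be built elsewhere:

uncased, cased = evaluate_and_log_bleu(
    model=keras_model,                  # a Keras Transformer created beforehand
    params=params,
    bleu_source="newstest2014.en",      # placeholder test-set paths
    bleu_ref="newstest2014.de",
    vocab_file="data/vocab.ende.32768")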
def main(unused_argv):
    # changed to import transformer_main_hvd instead of transformer_main
    from official.transformer import transformer_main_hvd

    tf.logging.set_verbosity(tf.logging.INFO)

    if FLAGS.text is None and FLAGS.file is None:
        tf.logging.warn(
            "Nothing to translate. Make sure to call this script using "
            "flags --text or --file.")
        return

    subtokenizer = tokenizer.Subtokenizer(FLAGS.vocab_file)

    # Set up estimator and params
    params = transformer_main_hvd.PARAMS_MAP[FLAGS.param_set]
    params["beam_size"] = _BEAM_SIZE
    params["alpha"] = _ALPHA
    params["extra_decode_length"] = _EXTRA_DECODE_LENGTH
    params["batch_size"] = _DECODE_BATCH_SIZE
    estimator = tf.estimator.Estimator(
        model_fn=transformer_main_hvd.model_fn,
        model_dir=FLAGS.model_dir,
        params=params,
        config=tf.estimator.RunConfig(session_config=tf.ConfigProto(
            intra_op_parallelism_threads=FLAGS.intra_op,
            inter_op_parallelism_threads=FLAGS.inter_op)))

    # create translation directory

    tf.gfile.MakeDirs(FLAGS.translations_dir)

    if FLAGS.text is not None:
        tf.logging.info("Translating text: %s" % FLAGS.text)
        translate_text(estimator, subtokenizer, FLAGS.text)

    if FLAGS.file is not None:
        input_file = os.path.abspath(FLAGS.file)
        tf.logging.info("Translating file: %s" % input_file)
        if not tf.gfile.Exists(FLAGS.file):
            raise ValueError("File does not exist: %s" % input_file)
        """ output_file = None
    if FLAGS.file_out is not None:
      output_file = os.path.abspath(FLAGS.file_out)
      tf.logging.info("File output specified: %s" % output_file) """

        for model in checkpoint_yield.stepfiles_iterator(
                FLAGS.model_dir,
                wait_minutes=FLAGS.wait_minutes,
                min_steps=FLAGS.min_steps):

            checkpoint_path, checkpoint_file = os.path.split(model[0])
            output_file = os.path.abspath(FLAGS.translations_dir + "/" +
                                          checkpoint_file + "_" +
                                          FLAGS.file_out)
            tf.logging.info("Output file: %s" % output_file)

            translate_file(model[0], estimator, subtokenizer, input_file,
                           output_file)
Example #5
def evaluate_and_log_bleu(model, bleu_source, bleu_ref, vocab_file):
  """Calculate and record the BLEU score."""
  subtokenizer = tokenizer.Subtokenizer(vocab_file)

  uncased_score, cased_score = translate_and_compute_bleu(
      model, subtokenizer, bleu_source, bleu_ref)

  tf.compat.v1.logging.info("Bleu score (uncased): %s", uncased_score)
  tf.compat.v1.logging.info("Bleu score (cased): %s", cased_score)
  return uncased_score, cased_score
Example #6
def evaluate(model,
             params,
             source_file,
             target_file,
             vocab_file,
             distribution_strategy=None):
    subtokenizer = tokenizer.Subtokenizer(vocab_file)

    compute(model, params, subtokenizer, source_file, target_file,
            distribution_strategy)
def main(unused_argv):
    from official.transformer import transformer_main

    tf.logging.set_verbosity(tf.logging.INFO)

    if FLAGS.text is None and FLAGS.file is None:
        tf.logging.warn(
            "Nothing to translate. Make sure to call this script using "
            "flags --text or --file.")
        return

    subtokenizer = tokenizer.Subtokenizer(FLAGS.vocab_file)

    # Set up estimator and params
    params = transformer_main.PARAMS_MAP[FLAGS.param_set]
    params["beam_size"] = _BEAM_SIZE
    params["alpha"] = _ALPHA
    params["extra_decode_length"] = _EXTRA_DECODE_LENGTH
    params["batch_size"] = _DECODE_BATCH_SIZE

    params["concrete_coef"] = FLAGS.concrete_coef
    params["concrete_heads"] = eval(FLAGS.concrete_heads)
    params["alive_heads_enc_self"] = eval(FLAGS.alive_heads_enc_self)
    params["alive_heads_dec_self"] = eval(FLAGS.alive_heads_dec_self)
    params["alive_heads_enc_dec"] = eval(FLAGS.alive_heads_enc_dec)
    print(" ******* Printing gate values ********")
    print(params["alive_heads_enc_self"])
    print(params["alive_heads_enc_dec"])
    print(params["alive_heads_dec_self"])
    print("concrete_coef : {}".format(params["concrete_coef"]))
    print("concrete_heads : {}".format(params["concrete_heads"]))

    estimator = tf.estimator.Estimator(model_fn=transformer_main.model_fn,
                                       model_dir=FLAGS.model_dir,
                                       params=params)

    if FLAGS.text is not None:
        tf.logging.info("Translating text: %s" % FLAGS.text)
        translate_text(estimator, subtokenizer, FLAGS.text)

    if FLAGS.file is not None:
        input_file = os.path.abspath(FLAGS.file)
        tf.logging.info("Translating file: %s" % input_file)
        if not tf.gfile.Exists(FLAGS.file):
            raise ValueError("File does not exist: %s" % input_file)

        output_file = None
        if FLAGS.file_out is not None:
            output_file = os.path.abspath(FLAGS.file_out)
            tf.logging.info("File output specified: %s" % output_file)

        translate_file(estimator, subtokenizer, input_file, output_file)
Example #8
def main(unused_argv):
    from official.transformer import transformer_main

    tf.logging.set_verbosity(tf.logging.INFO)

    if FLAGS.text is None and FLAGS.file is None:
        tf.logging.warn(
            "Nothing to translate. Make sure to call this script using "
            "flags --text or --file.")
        return

    subtokenizer = tokenizer.Subtokenizer(FLAGS.vocab_file)

    # Set up estimator and params
    params = transformer_main.PARAMS_MAP[FLAGS.param_set]

    # debug
    #print('debug: hidden_size %d' % params["hidden_size"])
    #print('debug: filter_size %d' % params["filter_size"])

    params["beam_size"] = _BEAM_SIZE
    params["alpha"] = _ALPHA
    params["extra_decode_length"] = _EXTRA_DECODE_LENGTH
    params["batch_size"] = _DECODE_BATCH_SIZE

    # TC: set vocab_size as the number of tokens in vocab_file
    with open(FLAGS.vocab_file) as vocab_f:
        params["vocab_size"] = len(vocab_f.readlines())
    print('TC: vocab_size %d' % params["vocab_size"])

    estimator = tf.estimator.Estimator(model_fn=transformer_main.model_fn,
                                       model_dir=FLAGS.model_dir,
                                       params=params)

    if FLAGS.text is not None:
        tf.logging.info("Translating text: %s" % FLAGS.text)
        translate_text(estimator, subtokenizer, FLAGS.text)

    if FLAGS.file is not None:
        input_file = os.path.abspath(FLAGS.file)
        tf.logging.info("Translating file: %s" % input_file)
        if not tf.gfile.Exists(FLAGS.file):
            raise ValueError("File does not exist: %s" % input_file)

        output_file = None
        if FLAGS.file_out is not None:
            output_file = os.path.abspath(FLAGS.file_out)
            tf.logging.info("File output specified: %s" % output_file)

        translate_file(estimator, subtokenizer, input_file, output_file)
Example #9
def main(unused_argv):
    from official.transformer import transformer_main

    tf.logging.set_verbosity(tf.logging.INFO)

    if FLAGS.text is None and FLAGS.file is None:
        tf.logging.warn(
            "Nothing to translate. Make sure to call this script using "
            "flags --text or --file.")
        return

    subtokenizer = tokenizer.Subtokenizer(
        os.path.join(FLAGS.data_dir, FLAGS.vocab_file))

    if FLAGS.params == "base":
        params = model_params.TransformerBaseParams
    elif FLAGS.params == "big":
        params = model_params.TransformerBigParams
    else:
        raise ValueError("Invalid parameter set defined: %s."
                         "Expected 'base' or 'big.'" % FLAGS.params)

    # Set up estimator and params
    params.beam_size = _BEAM_SIZE
    params.alpha = _ALPHA
    params.extra_decode_length = _EXTRA_DECODE_LENGTH
    params.batch_size = _DECODE_BATCH_SIZE
    estimator = tf.estimator.Estimator(model_fn=transformer_main.model_fn,
                                       model_dir=FLAGS.model_dir,
                                       params=params)

    if FLAGS.text is not None:
        tf.logging.info("Translating text: %s" % FLAGS.text)
        translate_text(estimator, subtokenizer, FLAGS.text)

    if FLAGS.file is not None:
        input_file = os.path.abspath(FLAGS.file)
        tf.logging.info("Translating file: %s" % input_file)
        if not tf.gfile.Exists(FLAGS.file):
            raise ValueError("File does not exist: %s" % input_file)

        output_file = None
        if FLAGS.file_out is not None:
            output_file = os.path.abspath(FLAGS.file_out)
            tf.logging.info("File output specified: %s" % output_file)

        translate_file(estimator, subtokenizer, input_file, output_file)
Example #10
  def predict(self):
    """Predicts result from the model."""
    params, flags_obj, is_train = self.params, self.flags_obj, False

    with tf.name_scope("model"):
      model = transformer.create_model(params, is_train)
      self._load_weights_if_possible(model, flags_obj.init_weight_path)
      model.summary()
    subtokenizer = tokenizer.Subtokenizer(flags_obj.vocab_file)

    ds = data_pipeline.eval_input_fn(params)
    ds = ds.map(lambda x, y: x).take(_SINGLE_SAMPLE)
    ret = model.predict(ds)
    val_outputs, _ = ret
    length = len(val_outputs)
    for i in range(length):
      translate.translate_from_input(val_outputs[i], subtokenizer)
Example #11
def main(unused_argv):
    from official.transformer import transformer_main_triblock as transformer_main

    tf.logging.set_verbosity(tf.logging.INFO)

    if FLAGS.text is None and FLAGS.file is None:
        tf.logging.warn(
            "Nothing to translate. Make sure to call this script using "
            "flags --text or --file.")
        return

    underscored_ids = FLAGS.underscored_ids.split(",")
    underscored_ids = [int(idx) for idx in underscored_ids]

    subtokenizer = tokenizer.Subtokenizer(FLAGS.vocab_file)
    #tf.logging.info(subtokenizer.subtoken_list[:] )
    # Set up estimator and params
    params = transformer_main.PARAMS_MAP[FLAGS.param_set]
    params["underscored_ids"] = underscored_ids
    params["vocab_file"] = FLAGS.vocab_file
    params["beam_size"] = _BEAM_SIZE
    params["alpha"] = _ALPHA
    params["extra_decode_length"] = _EXTRA_DECODE_LENGTH
    params["batch_size"] = _DECODE_BATCH_SIZE
    estimator = tf.estimator.Estimator(model_fn=transformer_main.model_fn,
                                       model_dir=FLAGS.model_dir,
                                       params=params)

    if FLAGS.text is not None:
        tf.logging.info("Translating text: %s" % FLAGS.text)
        translate_text(estimator, subtokenizer, FLAGS.text)

    if FLAGS.file is not None:
        input_file = os.path.abspath(FLAGS.file)
        tf.logging.info("Translating file: %s" % input_file)
        if not tf.gfile.Exists(FLAGS.file):
            raise ValueError("File does not exist: %s" % input_file)

        output_file = None
        if FLAGS.file_out is not None:
            output_file = os.path.abspath(FLAGS.file_out)
            tf.logging.info("File output specified: %s" % output_file)

        translate_file(estimator, subtokenizer, input_file, output_file)
Example #12
def evaluate_and_log_bleu(estimator, bleu_writer, bleu_source, bleu_ref):
    """Calculate and record the BLEU score."""
    subtokenizer = tokenizer.Subtokenizer(
        os.path.join(FLAGS.data_dir, FLAGS.vocab_file))

    uncased_score, cased_score = translate_and_compute_bleu(
        estimator, subtokenizer, bleu_source, bleu_ref)

    print("Bleu score (uncased):", uncased_score)
    print("Bleu score (cased):", cased_score)

    summary = tf.Summary(value=[
        tf.Summary.Value(tag="bleu/uncased", simple_value=uncased_score),
        tf.Summary.Value(tag="bleu/cased", simple_value=cased_score),
    ])

    bleu_writer.add_summary(summary, get_global_step(estimator))
    bleu_writer.flush()
    return uncased_score, cased_score
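A hypothetical call sequence for the function above; the summary directory, estimator, and test-set paths are placeholders (tf.summary.FileWriter is the TF1 writer used with tf.Summary here):

bleu_writer = tf.summary.FileWriter(os.path.join(FLAGS.model_dir, "bleu"))
evaluate_and_log_bleu(estimator, bleu_writer,
                      bleu_source="newstest2014.en",
                      bleu_ref="newstest2014.de")
bleu_writer.close()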
Example #13
    def predict(self):
        """Predicts result from the model."""
        self.params['train'] = False

        params = self.params
        flags_obj = self.flags_obj

        with tf.name_scope("model"):
            model = transformer.create_model(params, is_train=False)
            self._load_weights_if_possible(
                model, tf.train.latest_checkpoint(self.flags_obj.model_dir))
            model.summary()
        subtokenizer = tokenizer.Subtokenizer(flags_obj.vocab_file)
        print(params)
        ds = data_pipeline.eval_input_fn(params)
        ds = ds.map(lambda x, y: x).take(_SINGLE_SAMPLE)
        import time
        start = time.time()
        ret = model.predict(ds)
        val_outputs, _ = ret
        length = len(val_outputs)
        for i in range(length):
            translate.translate_from_input(val_outputs[i], subtokenizer)
        print('\n\n\n', time.time() - start)
    def predict(self, encoder_outputs, encoder_decoder_attention_bias):
        """Return predicted sequence."""
        batch_size = tf.shape(encoder_outputs)[0]
        input_length = tf.shape(encoder_outputs)[1]

        max_decode_length = self.params["max_output_length"]

        symbols_to_logits_fn = self._get_symbols_to_logits_fn(
            max_decode_length)

        # Create initial set of IDs that will be passed into symbols_to_logits_fn.
        initial_ids = tf.zeros([batch_size], dtype=tf.int32)

        # Create cache storing decoder attention values for each layer.
        cache = {
            "layer_%d" % layer: {
                "k": tf.zeros([batch_size, 0, self.params["hidden_size"]]),
                "v": tf.zeros([batch_size, 0, self.params["hidden_size"]]),
            }
            for layer in range(self.params["num_hidden_layers"])
        }

        # Add encoder output and attention bias to the cache.
        cache["encoder_outputs"] = encoder_outputs
        cache["encoder_decoder_attention_bias"] = (
            encoder_decoder_attention_bias)

        ####domyoung 2019.12.9#####
        nontrigrams = nontrigram_generator(max_decode_length,
                                           self.params["underscored_ids"])
        nontrigrams = tf.constant(nontrigrams, dtype=tf.int32)
        tile_dims = [1] * nontrigrams.shape.ndims
        tile_dims[-1] = batch_size * self.params["beam_size"]
        nontrigrams = tf.tile(nontrigrams, tile_dims)
        nontrigrams = tf.reshape(nontrigrams, [-1, max_decode_length])
        subtokenizer = tokenizer.Subtokenizer(self.params["vocab_file"])
        key = tf.range(self.params["vocab_size"], dtype=tf.int32)
        #replace the first token '<pad>_' into '<pad>'
        subtoken_list = subtokenizer.subtoken_list[:]

        value = tf.constant(subtoken_list, dtype=tf.string)
        default_value = tf.constant("", dtype=tf.string)
        hashTable = tf.contrib.lookup.HashTable(
            tf.contrib.lookup.KeyValueTensorInitializer(key, value),
            default_value)
        # Use beam search to find the top beam_size sequences and scores.
        tf.logging.info(key)
        tf.logging.info(value)
        tf.logging.info(subtoken_list)
        decoded_ids, scores = beam_search.sequence_beam_search(
            symbols_to_logits_fn=symbols_to_logits_fn,
            initial_ids=initial_ids,
            initial_cache=cache,
            vocab_size=self.params["vocab_size"],
            hashTable=hashTable,
            nontrigrams=nontrigrams,
            use_trigram=True,
            beam_size=self.params["beam_size"],
            batch_size=self.params["batch_size"],
            alpha=self.params["alpha"],
            max_decode_length=max_decode_length,
            eos_id=EOS_ID)

        #########################
        # Get the top sequence for each batch element
        top_decoded_ids = decoded_ids[:, 0, 1:]
        top_scores = scores[:, 0]

        return {"outputs": top_decoded_ids, "scores": top_scores}