def _init_subtokenizer(self, vocab_list):
  temp_file = tempfile.NamedTemporaryFile(delete=False)
  with tf.gfile.Open(temp_file.name, 'w') as w:
    for subtoken in vocab_list:
      w.write("'%s'" % subtoken)
      w.write("\n")
  return tokenizer.Subtokenizer(temp_file.name, reserved_tokens=[])
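# A minimal usage sketch (not part of the original file): assuming the helper
# above is a method on a tf.test.TestCase subclass (as in the official
# Transformer tokenizer tests), a round-trip check might look like this.
# The vocab entries below are illustrative assumptions.
class SubtokenizerSketchTest(tf.test.TestCase):

  def test_encode_decode_round_trip(self):
    vocab_list = ["123_", "test", "ing_"]          # assumed toy vocabulary
    subtokenizer = self._init_subtokenizer(vocab_list)
    encoded = subtokenizer.encode("testing 123")
    self.assertEqual("testing 123", subtokenizer.decode(encoded))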
def translate_list(vocab, model_dir, params, contentList):
  """Translate each string in contentList and return the decoded translations."""
  translation_results = []
  subtokenizer = tokenizer.Subtokenizer(vocab)
  estimator = tf.estimator.Estimator(
      model_fn=transformer_main.model_fn, model_dir=model_dir, params=params)
  estimator_predictor = tf.contrib.predictor.from_estimator(
      estimator,
      export.build_tensor_serving_input_receiver_fn(
          shape=[None], dtype=tf.int32, batch_size=None))
  for content in contentList:
    try:
      tokens = _encode_and_add_eos(content, subtokenizer)
      predictions = estimator_predictor(
          {"input": np.array([tokens], dtype=np.int32)})
      translation = _trim_and_decode(predictions["outputs"][0], subtokenizer)
      translation_results.append(translation)
    except Exception as e:  # Skip sentences that fail instead of aborting the batch.
      print("error in translation: %s" % e)
  return translation_results
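# Hypothetical call site for translate_list (the paths below are assumptions,
# not from the original file): translate a small batch of sentences with a
# trained checkpoint directory and its vocabulary file.
if __name__ == "__main__":
  results = translate_list(
      vocab="/path/to/vocab.ende.32768",           # assumed vocab path
      model_dir="/path/to/model_dir",              # assumed checkpoint dir
      params=transformer_main.PARAMS_MAP["base"],  # assumed param set
      contentList=["Hello world.", "How are you?"])
  for line in results:
    print(line)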
def evaluate_and_log_bleu(model, params, bleu_source, bleu_ref, vocab_file,
                          distribution_strategy=None):
  """Calculate and record the BLEU score.

  Args:
    model: A Keras model, used to generate the translations.
    params: A dictionary, containing the translation related parameters.
    bleu_source: A file containing source sentences for translation.
    bleu_ref: A file containing the reference for the translated sentences.
    vocab_file: A file containing the vocabulary for translation.
    distribution_strategy: A platform distribution strategy, used for TPU based
      translation.

  Returns:
    uncased_score: A float, the case insensitive BLEU score.
    cased_score: A float, the case sensitive BLEU score.
  """
  subtokenizer = tokenizer.Subtokenizer(vocab_file)

  uncased_score, cased_score = translate_and_compute_bleu(
      model, params, subtokenizer, bleu_source, bleu_ref, distribution_strategy)

  logging.info("Bleu score (uncased): %s", uncased_score)
  logging.info("Bleu score (cased): %s", cased_score)
  return uncased_score, cased_score
def main(unused_argv):
  # changed to import transformer_main_hvd instead of transformer_main
  from official.transformer import transformer_main_hvd

  tf.logging.set_verbosity(tf.logging.INFO)

  if FLAGS.text is None and FLAGS.file is None:
    tf.logging.warn("Nothing to translate. Make sure to call this script using "
                    "flags --text or --file.")
    return

  subtokenizer = tokenizer.Subtokenizer(FLAGS.vocab_file)

  # Set up estimator and params
  params = transformer_main_hvd.PARAMS_MAP[FLAGS.param_set]
  params["beam_size"] = _BEAM_SIZE
  params["alpha"] = _ALPHA
  params["extra_decode_length"] = _EXTRA_DECODE_LENGTH
  params["batch_size"] = _DECODE_BATCH_SIZE
  estimator = tf.estimator.Estimator(
      model_fn=transformer_main_hvd.model_fn, model_dir=FLAGS.model_dir,
      params=params,
      config=tf.estimator.RunConfig(session_config=tf.ConfigProto(
          intra_op_parallelism_threads=FLAGS.intra_op,
          inter_op_parallelism_threads=FLAGS.inter_op)))

  # create translation directory
  tf.gfile.MakeDirs(FLAGS.translations_dir)

  if FLAGS.text is not None:
    tf.logging.info("Translating text: %s" % FLAGS.text)
    translate_text(estimator, subtokenizer, FLAGS.text)

  if FLAGS.file is not None:
    input_file = os.path.abspath(FLAGS.file)
    tf.logging.info("Translating file: %s" % input_file)
    if not tf.gfile.Exists(FLAGS.file):
      raise ValueError("File does not exist: %s" % input_file)

    # Superseded by the per-checkpoint output files built in the loop below.
    # output_file = None
    # if FLAGS.file_out is not None:
    #   output_file = os.path.abspath(FLAGS.file_out)
    #   tf.logging.info("File output specified: %s" % output_file)

    for model in checkpoint_yield.stepfiles_iterator(
        FLAGS.model_dir, wait_minutes=FLAGS.wait_minutes,
        min_steps=FLAGS.min_steps):
      checkpoint_path, checkpoint_file = os.path.split(model[0])
      output_file = os.path.abspath(
          FLAGS.translations_dir + "/" + checkpoint_file + "_" + FLAGS.file_out)
      tf.logging.info("Output file: %s" % output_file)
      translate_file(model[0], estimator, subtokenizer, input_file, output_file)
def evaluate_and_log_bleu(model, bleu_source, bleu_ref, vocab_file):
  """Calculate and record the BLEU score."""
  subtokenizer = tokenizer.Subtokenizer(vocab_file)

  uncased_score, cased_score = translate_and_compute_bleu(
      model, subtokenizer, bleu_source, bleu_ref)

  tf.compat.v1.logging.info("Bleu score (uncased): %s", uncased_score)
  tf.compat.v1.logging.info("Bleu score (cased): %s", cased_score)
  return uncased_score, cased_score
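# Hypothetical call site for the Keras-style evaluate_and_log_bleu above
# (illustrative assumptions: the model variable and file paths are not from the
# original file). After building or restoring a Keras Transformer model, BLEU
# can be computed against a newstest-style eval set.
uncased, cased = evaluate_and_log_bleu(
    model=keras_transformer_model,            # assumed: an existing Keras model
    bleu_source="/path/to/newstest2014.en",   # assumed source file
    bleu_ref="/path/to/newstest2014.de",      # assumed reference file
    vocab_file="/path/to/vocab.ende.32768")   # assumed vocab path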
def evaluate(model, params, source_file, target_file, vocab_file,
             distribution_strategy=None):
  """Build a Subtokenizer from vocab_file and run compute() on the file pair."""
  subtokenizer = tokenizer.Subtokenizer(vocab_file)
  compute(model, params, subtokenizer, source_file, target_file,
          distribution_strategy)
def main(unused_argv): from official.transformer import transformer_main tf.logging.set_verbosity(tf.logging.INFO) if FLAGS.text is None and FLAGS.file is None: tf.logging.warn( "Nothing to translate. Make sure to call this script using " "flags --text or --file.") return subtokenizer = tokenizer.Subtokenizer(FLAGS.vocab_file) # Set up estimator and params params = transformer_main.PARAMS_MAP[FLAGS.param_set] params["beam_size"] = _BEAM_SIZE params["alpha"] = _ALPHA params["extra_decode_length"] = _EXTRA_DECODE_LENGTH params["batch_size"] = _DECODE_BATCH_SIZE params["concrete_coef"] = FLAGS.concrete_coef params["concrete_heads"] = eval(FLAGS.concrete_heads) params["alive_heads_enc_self"] = eval(FLAGS.alive_heads_enc_self) params["alive_heads_dec_self"] = eval(FLAGS.alive_heads_dec_self) params["alive_heads_enc_dec"] = eval(FLAGS.alive_heads_enc_dec) print(" ******* Printing gate values ********") print(params["alive_heads_enc_self"]) print(params["alive_heads_enc_dec"]) print(params["alive_heads_dec_self"]) print("concrete_coef : {}".format(params["concrete_coef"])) print("concrete_heads : {}".format(params["concrete_heads"])) estimator = tf.estimator.Estimator(model_fn=transformer_main.model_fn, model_dir=FLAGS.model_dir, params=params) if FLAGS.text is not None: tf.logging.info("Translating text: %s" % FLAGS.text) translate_text(estimator, subtokenizer, FLAGS.text) if FLAGS.file is not None: input_file = os.path.abspath(FLAGS.file) tf.logging.info("Translating file: %s" % input_file) if not tf.gfile.Exists(FLAGS.file): raise ValueError("File does not exist: %s" % input_file) output_file = None if FLAGS.file_out is not None: output_file = os.path.abspath(FLAGS.file_out) tf.logging.info("File output specified: %s" % output_file) translate_file(estimator, subtokenizer, input_file, output_file)
def main(unused_argv): from official.transformer import transformer_main tf.logging.set_verbosity(tf.logging.INFO) if FLAGS.text is None and FLAGS.file is None: tf.logging.warn( "Nothing to translate. Make sure to call this script using " "flags --text or --file.") return subtokenizer = tokenizer.Subtokenizer(FLAGS.vocab_file) # Set up estimator and params params = transformer_main.PARAMS_MAP[FLAGS.param_set] # debug #print('debug: hidden_size %d' % params["hidden_size"]) #print('debug: filter_size %d' % params["filter_size"]) params["beam_size"] = _BEAM_SIZE params["alpha"] = _ALPHA params["extra_decode_length"] = _EXTRA_DECODE_LENGTH params["batch_size"] = _DECODE_BATCH_SIZE # TC: set vocab_size as the number of tokens in vocab_file params["vocab_size"] = len(open(FLAGS.vocab_file).readlines()) print('TC: vocab_size %d' % params["vocab_size"]) estimator = tf.estimator.Estimator(model_fn=transformer_main.model_fn, model_dir=FLAGS.model_dir, params=params) if FLAGS.text is not None: tf.logging.info("Translating text: %s" % FLAGS.text) translate_text(estimator, subtokenizer, FLAGS.text) if FLAGS.file is not None: input_file = os.path.abspath(FLAGS.file) tf.logging.info("Translating file: %s" % input_file) if not tf.gfile.Exists(FLAGS.file): raise ValueError("File does not exist: %s" % input_file) output_file = None if FLAGS.file_out is not None: output_file = os.path.abspath(FLAGS.file_out) tf.logging.info("File output specified: %s" % output_file) translate_file(estimator, subtokenizer, input_file, output_file)
def main(unused_argv): from official.transformer import transformer_main tf.logging.set_verbosity(tf.logging.INFO) if FLAGS.text is None and FLAGS.file is None: tf.logging.warn( "Nothing to translate. Make sure to call this script using " "flags --text or --file.") return subtokenizer = tokenizer.Subtokenizer( os.path.join(FLAGS.data_dir, FLAGS.vocab_file)) if FLAGS.params == "base": params = model_params.TransformerBaseParams elif FLAGS.params == "big": params = model_params.TransformerBigParams else: raise ValueError("Invalid parameter set defined: %s." "Expected 'base' or 'big.'" % FLAGS.params) # Set up estimator and params params.beam_size = _BEAM_SIZE params.alpha = _ALPHA params.extra_decode_length = _EXTRA_DECODE_LENGTH params.batch_size = _DECODE_BATCH_SIZE estimator = tf.estimator.Estimator(model_fn=transformer_main.model_fn, model_dir=FLAGS.model_dir, params=params) if FLAGS.text is not None: tf.logging.info("Translating text: %s" % FLAGS.text) translate_text(estimator, subtokenizer, FLAGS.text) if FLAGS.file is not None: input_file = os.path.abspath(FLAGS.file) tf.logging.info("Translating file: %s" % input_file) if not tf.gfile.Exists(FLAGS.file): raise ValueError("File does not exist: %s" % input_file) output_file = None if FLAGS.file_out is not None: output_file = os.path.abspath(FLAGS.file_out) tf.logging.info("File output specified: %s" % output_file) translate_file(estimator, subtokenizer, input_file, output_file)
def predict(self): """Predicts result from the model.""" params, flags_obj, is_train = self.params, self.flags_obj, False with tf.name_scope("model"): model = transformer.create_model(params, is_train) self._load_weights_if_possible(model, flags_obj.init_weight_path) model.summary() subtokenizer = tokenizer.Subtokenizer(flags_obj.vocab_file) ds = data_pipeline.eval_input_fn(params) ds = ds.map(lambda x, y: x).take(_SINGLE_SAMPLE) ret = model.predict(ds) val_outputs, _ = ret length = len(val_outputs) for i in range(length): translate.translate_from_input(val_outputs[i], subtokenizer)
def main(unused_argv):
  from official.transformer import transformer_main_triblock as transformer_main

  tf.logging.set_verbosity(tf.logging.INFO)

  if FLAGS.text is None and FLAGS.file is None:
    tf.logging.warn("Nothing to translate. Make sure to call this script using "
                    "flags --text or --file.")
    return

  underscored_ids = FLAGS.underscored_ids.split(",")
  underscored_ids = [int(idx) for idx in underscored_ids]
  subtokenizer = tokenizer.Subtokenizer(FLAGS.vocab_file)
  #tf.logging.info(subtokenizer.subtoken_list[:])

  # Set up estimator and params
  params = transformer_main.PARAMS_MAP[FLAGS.param_set]
  params["underscored_ids"] = underscored_ids
  params["vocab_file"] = FLAGS.vocab_file
  params["beam_size"] = _BEAM_SIZE
  params["alpha"] = _ALPHA
  params["extra_decode_length"] = _EXTRA_DECODE_LENGTH
  params["batch_size"] = _DECODE_BATCH_SIZE
  estimator = tf.estimator.Estimator(
      model_fn=transformer_main.model_fn, model_dir=FLAGS.model_dir,
      params=params)

  if FLAGS.text is not None:
    tf.logging.info("Translating text: %s" % FLAGS.text)
    translate_text(estimator, subtokenizer, FLAGS.text)

  if FLAGS.file is not None:
    input_file = os.path.abspath(FLAGS.file)
    tf.logging.info("Translating file: %s" % input_file)
    if not tf.gfile.Exists(FLAGS.file):
      raise ValueError("File does not exist: %s" % input_file)

    output_file = None
    if FLAGS.file_out is not None:
      output_file = os.path.abspath(FLAGS.file_out)
      tf.logging.info("File output specified: %s" % output_file)

    translate_file(estimator, subtokenizer, input_file, output_file)
def evaluate_and_log_bleu(estimator, bleu_writer, bleu_source, bleu_ref):
  """Calculate and record the BLEU score."""
  subtokenizer = tokenizer.Subtokenizer(
      os.path.join(FLAGS.data_dir, FLAGS.vocab_file))

  uncased_score, cased_score = translate_and_compute_bleu(
      estimator, subtokenizer, bleu_source, bleu_ref)

  print("Bleu score (uncased):", uncased_score)
  print("Bleu score (cased):", cased_score)

  summary = tf.Summary(value=[
      tf.Summary.Value(tag="bleu/uncased", simple_value=uncased_score),
      tf.Summary.Value(tag="bleu/cased", simple_value=cased_score),
  ])

  bleu_writer.add_summary(summary, get_global_step(estimator))
  bleu_writer.flush()
  return uncased_score, cased_score
def predict(self): """Predicts result from the model.""" self.params['train'] = False params = self.params flags_obj = self.flags_obj with tf.name_scope("model"): model = transformer.create_model(params, is_train=False) self._load_weights_if_possible( model, tf.train.latest_checkpoint(self.flags_obj.model_dir)) model.summary() subtokenizer = tokenizer.Subtokenizer(flags_obj.vocab_file) print(params) ds = data_pipeline.eval_input_fn(params) ds = ds.map(lambda x, y: x).take(_SINGLE_SAMPLE) import time start = time.time() ret = model.predict(ds) val_outputs, _ = ret length = len(val_outputs) for i in range(length): translate.translate_from_input(val_outputs[i], subtokenizer) print('\n\n\n', time.time() - start)
def predict(self, encoder_outputs, encoder_decoder_attention_bias):
  """Return predicted sequence."""
  batch_size = tf.shape(encoder_outputs)[0]
  input_length = tf.shape(encoder_outputs)[1]
  max_decode_length = self.params["max_output_length"]

  symbols_to_logits_fn = self._get_symbols_to_logits_fn(max_decode_length)

  # Create initial set of IDs that will be passed into symbols_to_logits_fn.
  initial_ids = tf.zeros([batch_size], dtype=tf.int32)

  # Create cache storing decoder attention values for each layer.
  cache = {
      "layer_%d" % layer: {
          "k": tf.zeros([batch_size, 0, self.params["hidden_size"]]),
          "v": tf.zeros([batch_size, 0, self.params["hidden_size"]]),
      } for layer in range(self.params["num_hidden_layers"])
  }

  # Add encoder output and attention bias to the cache.
  cache["encoder_outputs"] = encoder_outputs
  cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias

  #### domyoung 2019.12.9 #####
  nontrigrams = nontrigram_generator(max_decode_length,
                                     self.params["underscored_ids"])
  nontrigrams = tf.constant(nontrigrams, dtype=tf.int32)
  tile_dims = [1] * nontrigrams.shape.ndims
  tile_dims[-1] = batch_size * self.params["beam_size"]
  nontrigrams = tf.tile(nontrigrams, tile_dims)
  nontrigrams = tf.reshape(nontrigrams, [-1, max_decode_length])

  subtokenizer = tokenizer.Subtokenizer(self.params["vocab_file"])
  key = tf.range(self.params["vocab_size"], dtype=tf.int32)
  # replace the first token '<pad>_' into '<pad>'
  subtoken_list = subtokenizer.subtoken_list[:]
  value = tf.constant(subtoken_list, dtype=tf.string)
  default_value = tf.constant("", dtype=tf.string)
  hashTable = tf.contrib.lookup.HashTable(
      tf.contrib.lookup.KeyValueTensorInitializer(key, value), default_value)

  # Use beam search to find the top beam_size sequences and scores.
  tf.logging.info(key)
  tf.logging.info(value)
  tf.logging.info(subtoken_list)
  decoded_ids, scores = beam_search.sequence_beam_search(
      symbols_to_logits_fn=symbols_to_logits_fn,
      initial_ids=initial_ids,
      initial_cache=cache,
      vocab_size=self.params["vocab_size"],
      hashTable=hashTable,
      nontrigrams=nontrigrams,
      use_trigram=True,
      beam_size=self.params["beam_size"],
      batch_size=self.params["batch_size"],
      alpha=self.params["alpha"],
      max_decode_length=max_decode_length,
      eos_id=EOS_ID)
  #########################

  # Get the top sequence for each batch element
  top_decoded_ids = decoded_ids[:, 0, 1:]
  top_scores = scores[:, 0]

  return {"outputs": top_decoded_ids, "scores": top_scores}