def testTextInputterTest(self):
    """Check that TextLineInputter yields the same batches as TestTextIterator.

    Runs both input pipelines over the same source file in lockstep and
    compares the token-id matrices, the length vectors, and the raw
    source strings of every batch.
    """
    vocab_src = Vocab(vocab_src_file)
    vocab_trg = Vocab(vocab_trg_file)
    dataset = Dataset(vocab_src, vocab_trg,
                      train_src_file, train_trg_file,
                      [eval_src_file], eval_trg_file)
    test_iter = TestTextIterator(train_src_file, vocab_src, batch_size=13)
    inputter = TextLineInputter(dataset, "eval_features_file", batch_size=13)
    input_fields = dataset.input_fields
    test_data = inputter.make_feeding_data()
    for a, b in zip(test_iter, test_data[0]):
        x_str = a[0]
        x = a[1][0]
        x_len = a[1][1]
        x_str_new = b[0]
        x_new = b[2][input_fields[Constants.FEATURE_IDS_NAME]]
        x_len_new = b[2][input_fields[Constants.FEATURE_LENGTH_NAME]]
        # BUGFIX: the old assertions were `x.all() == x_new.all()`, which
        # reduces each array to a single boolean ("are all entries truthy?")
        # and therefore passes for entirely different arrays. Compare the
        # arrays element-wise instead.
        assert numpy.array_equal(x, x_new)
        assert numpy.array_equal(x_len, x_len_new)
        assert numpy.all([str1 == str2
                          for str1, str2 in zip(x_str, x_str_new)])
    print("Test Passed...")
def _prepare(self):
    """Set up everything needed for periodic BLEU evaluation.

    Builds the evaluation feeding data, rebuilds the model graph with
    mode=INFER and reuse=True, creates the temporary translation
    directory, and initializes early-stopping bookkeeping.
    """
    inputter = TextLineInputter(dataset=self._dataset,
                                data_field_name="eval_features_file",
                                batch_size=self._batch_size)
    self._eval_feeding_data = inputter.make_feeding_data()
    # refresh inference hyperparameters before rebuilding the graph
    self._model_configs = update_infer_params(
        self._model_configs,
        beam_size=self._beam_size,
        maximum_labels_length=self._maximum_labels_length,
        length_penalty=self._length_penalty)
    spec = model_fn(model_configs=self._model_configs,
                    mode=ModeKeys.INFER,
                    dataset=self._dataset,
                    name=self._model_name,
                    reuse=True,
                    verbose=False)
    self._predict_ops = spec.predictions
    # directory that holds temporary translation outputs
    trans_dir = os.path.join(self._model_configs["model_dir"],
                             GlobalNames.TMP_TRANS_DIRNAME)
    if not gfile.Exists(trans_dir):
        gfile.MakeDirs(trans_dir)
    self._tmp_trans_file_prefix = os.path.join(
        trans_dir, GlobalNames.TMP_TRANS_FILENAME_PREFIX)
    self._read_ckpt_bleulog()
    self._eval_labels_file = self._dataset.eval_labels_file
    self._check_bleu_script()
    # early-stop patience counter and best score seen so far
    self._estop_patience = 0
    self._best_bleu_score = 0.
def run(self):
    """Run the ensemble model over every configured inference file."""
    opts = self._model_configs["infer"]
    self._vocab_source = Vocab(
        filename=opts["source_words_vocabulary"],
        bpe_codes_file=opts["source_bpecodes"])
    self._vocab_target = Vocab(
        filename=opts["target_words_vocabulary"],
        bpe_codes_file=opts["target_bpecodes"])
    # dataset wrapping all feature files that will be translated
    dataset = Dataset(
        self._vocab_source,
        self._vocab_target,
        eval_features_file=[p["features_file"]
                            for p in self._model_configs["infer_data"]])
    spec = model_fn_ensemble(
        self._model_dirs, dataset,
        weight_scheme=self._weight_scheme,
        inference_options=opts)
    prediction_op = spec.predictions
    sess = self._build_default_session()
    inputter = TextLineInputter(
        dataset=dataset,
        data_field_name="eval_features_file",
        batch_size=opts["batch_size"],
        maximum_line_length=None)
    sess.run(tf.global_variables_initializer())
    tf.logging.info("Start inference.")
    overall_start_time = time.time()
    # translate each file, timing and (optionally) scoring it
    for feeding_data, data_cfg in zip(inputter.make_feeding_data(),
                                      self._model_configs["infer_data"]):
        tf.logging.info("Infer Source Features File: {}.".format(
            data_cfg["features_file"]))
        start_time = time.time()
        infer(sess=sess,
              prediction_op=prediction_op,
              feeding_data=feeding_data,
              output=data_cfg["output_file"],
              vocab_target=self._vocab_target,
              alpha=opts["length_penalty"],
              delimiter=opts["delimiter"],
              output_attention=False,
              tokenize_output=opts["char_level"],
              tokenize_script=opts["tokenize_script"],
              verbose=True)
        tf.logging.info("FINISHED {}. \nElapsed Time: {}.".format(
            data_cfg["features_file"], str(time.time() - start_time)))
        # score against references when a labels file is provided
        if data_cfg["labels_file"] is not None:
            bleu_score = multi_bleu_score(
                opts["multibleu_script"],
                data_cfg["labels_file"], data_cfg["output_file"])
            tf.logging.info("BLEU score ({}): {}".format(
                data_cfg["features_file"], bleu_score))
    tf.logging.info("Total Elapsed Time: %s"
                    % str(time.time() - overall_start_time))
def run(self):
    """Run the ensemble model over every configured inference file."""
    opts = self._model_configs["infer"]
    vocab_source = Vocab(
        filename=opts["source_words_vocabulary"],
        bpe_codes=opts["source_bpecodes"])
    vocab_target = Vocab(
        filename=opts["target_words_vocabulary"],
        bpe_codes=opts["target_bpecodes"])
    spec = model_fn_ensemble(
        self._model_dirs, vocab_source, vocab_target,
        weight_scheme=self._weight_scheme,
        inference_options=opts)
    prediction_op = spec.predictions
    sess = self._build_default_session()
    # one LineReader per features file; tokens are mapped to ids up front
    readers = [
        LineReader(
            data=p["features_file"],
            preprocessing_fn=lambda x: vocab_source.convert_to_idlist(x))
        for p in self._model_configs["infer_data"]]
    inputter = TextLineInputter(
        line_readers=readers,
        padding_id=vocab_source.pad_id,
        batch_size=opts["batch_size"])
    sess.run(tf.global_variables_initializer())
    tf.logging.info("Start inference.")
    overall_start_time = time.time()
    # translate each file, timing and (optionally) scoring it
    for feeding_data, data_cfg in zip(
            inputter.make_feeding_data(spec.input_fields),
            self._model_configs["infer_data"]):
        tf.logging.info("Infer Source Features File: {}.".format(
            data_cfg["features_file"]))
        start_time = time.time()
        infer(sess=sess,
              prediction_op=prediction_op,
              infer_data=feeding_data,
              output=data_cfg["output_file"],
              vocab_source=vocab_source,
              vocab_target=vocab_target,
              delimiter=opts["delimiter"],
              output_attention=False,
              to_char_level=opts["char_level"],
              verbose=True)
        tf.logging.info("FINISHED {}. Elapsed Time: {}.".format(
            data_cfg["features_file"], str(time.time() - start_time)))
        # score against references when a labels file is provided
        if data_cfg["labels_file"] is not None:
            bleu_score = multi_bleu_score_from_file(
                hypothesis_file=data_cfg["output_file"],
                references_files=data_cfg["labels_file"],
                char_level=opts["char_level"])
            tf.logging.info("BLEU score (%s): %.2f"
                            % (data_cfg["features_file"], bleu_score))
    tf.logging.info("Total Elapsed Time: %s"
                    % str(time.time() - overall_start_time))
def _prepare(self):
    """Set up everything needed for periodic BLEU evaluation.

    Rebuilds the model graph with mode=INFER and reuse=True, builds the
    inference feeding data, creates the temporary translation directory,
    and loads sources and references into memory.
    """
    features_file = self._dataset["features_file"]
    labels_file = self._dataset["labels_file"]
    vocab_source = self._dataset["vocab_source"]
    vocab_target = self._dataset["vocab_target"]
    # refresh inference hyperparameters before rebuilding the graph
    self._model_configs = update_infer_params(
        self._model_configs,
        beam_size=self._beam_size,
        maximum_labels_length=self._maximum_labels_length,
        length_penalty=self._length_penalty)
    spec = model_fn(model_configs=self._model_configs,
                    mode=ModeKeys.INFER,
                    vocab_source=vocab_source,
                    vocab_target=vocab_target,
                    name=self._model_name,
                    reuse=True,
                    verbose=False)
    self._predict_ops = spec.predictions
    inputter = TextLineInputter(
        line_readers=LineReader(
            data=features_file,
            preprocessing_fn=lambda x: vocab_source.convert_to_idlist(x)),
        padding_id=vocab_source.pad_id,
        batch_size=self._batch_size)
    self._infer_data = inputter.make_feeding_data(
        input_fields=spec.input_fields)
    # directory that holds temporary translation outputs
    trans_dir = os.path.join(self._model_configs["model_dir"],
                             Constants.TMP_TRANS_DIRNAME)
    if not gfile.Exists(trans_dir):
        gfile.MakeDirs(trans_dir)
    self._tmp_trans_file_prefix = os.path.join(
        trans_dir, Constants.TMP_TRANS_FILENAME_PREFIX)
    self._read_ckpt_bleulog()
    # load references, one list per reference file
    self._references = []
    for rfile in access_multiple_files(labels_file):
        with open_file(rfile) as fp:
            if self._char_level:
                self._references.append(to_chinese_char(fp.readlines()))
            else:
                self._references.append(fp.readlines())
    # transpose: one list of alternative references per sentence
    self._references = list(map(list, zip(*self._references)))
    with open_file(features_file) as fp:
        self._sources = fp.readlines()
    self._bad_count = 0
    self._best_bleu_score = 0.
def _prepare(self):
    """Set up everything needed for periodic BLEU evaluation.

    Builds the inference feeding data, rebuilds the model graph with
    mode=INFER and reuse=True, creates the temporary translation
    directory, and loads sources and references into memory.
    """
    inputter = TextLineInputter(dataset=self._dataset,
                                data_field_name="eval_features_file",
                                batch_size=self._batch_size)
    self._infer_data = inputter.make_feeding_data()
    # refresh inference hyperparameters before rebuilding the graph
    self._model_configs = update_infer_params(
        self._model_configs,
        beam_size=self._beam_size,
        maximum_labels_length=self._maximum_labels_length,
        length_penalty=self._length_penalty)
    spec = model_fn(model_configs=self._model_configs,
                    mode=ModeKeys.INFER,
                    dataset=self._dataset,
                    name=self._model_name,
                    reuse=True,
                    verbose=False)
    self._predict_ops = spec.predictions
    # directory that holds temporary translation outputs
    trans_dir = os.path.join(self._model_configs["model_dir"],
                             Constants.TMP_TRANS_DIRNAME)
    if not gfile.Exists(trans_dir):
        gfile.MakeDirs(trans_dir)
    self._tmp_trans_file_prefix = os.path.join(
        trans_dir, Constants.TMP_TRANS_FILENAME_PREFIX)
    self._read_ckpt_bleulog()
    # load references, one list per reference file
    self._references = []
    for rfile in self._dataset.eval_labels_file:
        with open_file(rfile) as fp:
            self._references.append(fp.readlines())
    # transpose: one list of alternative references per sentence
    self._references = list(map(list, zip(*self._references)))
    with open_file(self._dataset.eval_features_file) as fp:
        self._sources = fp.readlines()
    self._bad_count = 0
    self._best_bleu_score = 0.
def run(self):
    """Translate every configured data file with the trained model."""
    # vocabularies (target side may be reversed to match training)
    self._vocab_source = Vocab(
        filename=self._model_configs["infer"]["source_words_vocabulary"],
        bpe_codes=self._model_configs["infer"]["source_bpecodes"],
        reverse_seq=False)
    self._vocab_target = Vocab(
        filename=self._model_configs["infer"]["target_words_vocabulary"],
        bpe_codes=self._model_configs["infer"]["target_bpecodes"],
        reverse_seq=self._model_configs["train"]["reverse_target"])
    # dataset wrapping all feature files that will be translated
    dataset = Dataset(
        self._vocab_source,
        self._vocab_target,
        eval_features_file=[p["features_file"]
                            for p in self._model_configs["infer_data"]])
    self._model_configs = update_infer_params(
        self._model_configs,
        beam_size=self._model_configs["infer"]["beam_size"],
        maximum_labels_length=self._model_configs["infer"]["maximum_labels_length"],
        length_penalty=self._model_configs["infer"]["length_penalty"])
    # NOTE: read the "infer" sub-config only after update_infer_params
    # has (possibly) replaced self._model_configs
    opts = self._model_configs["infer"]
    spec = model_fn(model_configs=self._model_configs,
                    mode=ModeKeys.INFER,
                    dataset=dataset,
                    name=self._model_configs["problem_name"])
    prediction_op = spec.predictions
    sess = self._build_default_session()
    inputter = TextLineInputter(
        dataset=dataset,
        data_field_name="eval_features_file",
        batch_size=opts["batch_size"])
    # restore the latest checkpoint, failing loudly if none exists
    checkpoint_path = tf.train.latest_checkpoint(
        self._model_configs["model_dir"])
    if checkpoint_path:
        tf.logging.info("reloading models...")
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint_path)
    else:
        raise OSError("File NOT Found. \nFail to find checkpoint file from: {}"
                      .format(self._model_configs["model_dir"]))
    tf.logging.info("Start inference.")
    overall_start_time = time.time()
    # translate each file, timing and (optionally) scoring it
    for infer_data, data_cfg in zip(
            inputter.make_feeding_data(input_fields=spec.input_fields),
            self._model_configs["infer_data"]):
        tf.logging.info("Infer Source File: {}.".format(
            data_cfg["features_file"]))
        start_time = time.time()
        infer(sess=sess,
              prediction_op=prediction_op,
              infer_data=infer_data,
              output=data_cfg["output_file"],
              vocab_source=self._vocab_source,
              vocab_target=self._vocab_target,
              delimiter=opts["delimiter"],
              output_attention=data_cfg["output_attention"],
              tokenize_output=opts["char_level"],
              verbose=True)
        tf.logging.info("FINISHED {}. Elapsed Time: {}."
                        .format(data_cfg["features_file"],
                                str(time.time() - start_time)))
        # score against references when a labels file is provided
        if data_cfg["labels_file"] is not None:
            bleu_score = multi_bleu_score_from_file(
                hypothesis_file=data_cfg["output_file"],
                references_files=data_cfg["labels_file"],
                char_level=opts["char_level"])
            tf.logging.info("BLEU score (%s): %.2f"
                            % (data_cfg["features_file"], bleu_score))
    tf.logging.info("Total Elapsed Time: %s"
                    % str(time.time() - overall_start_time))