def testTextInputterTest(self):
    """Check that TextLineInputter yields the same batches as TestTextIterator.

    Runs both input pipelines over the same source file in lockstep and
    compares the token-id matrices, the length vectors, and the raw
    source strings of every batch.
    """
    vocab_src = Vocab(vocab_src_file)
    vocab_trg = Vocab(vocab_trg_file)
    dataset = Dataset(vocab_src, vocab_trg,
                      train_src_file, train_trg_file,
                      [eval_src_file], eval_trg_file)
    test_iter = TestTextIterator(train_src_file, vocab_src, batch_size=13)
    inputter = TextLineInputter(dataset, "eval_features_file", batch_size=13)
    input_fields = dataset.input_fields
    test_data = inputter.make_feeding_data()
    for a, b in zip(test_iter, test_data[0]):
        x_str = a[0]
        x = a[1][0]
        x_len = a[1][1]
        x_str_new = b[0]
        x_new = b[2][input_fields[Constants.FEATURE_IDS_NAME]]
        x_len_new = b[2][input_fields[Constants.FEATURE_LENGTH_NAME]]
        # BUGFIX: the old assertions were `x.all() == x_new.all()`, which
        # reduces each array to a single boolean ("are all entries truthy?")
        # and therefore passes for entirely different arrays. Compare the
        # arrays element-wise instead.
        assert numpy.array_equal(x, x_new)
        assert numpy.array_equal(x_len, x_len_new)
        assert numpy.all([str1 == str2
                          for str1, str2 in zip(x_str, x_str_new)])
    print("Test Passed...")
def _prepare(self):
    """Set up everything needed for periodic BLEU evaluation.

    Builds the evaluation feeding data, rebuilds the model graph with
    mode=INFER and reuse=True, creates the temporary translation
    directory, and initializes early-stopping bookkeeping.
    """
    inputter = TextLineInputter(dataset=self._dataset,
                                data_field_name="eval_features_file",
                                batch_size=self._batch_size)
    self._eval_feeding_data = inputter.make_feeding_data()
    # refresh inference hyperparameters before rebuilding the graph
    self._model_configs = update_infer_params(
        self._model_configs,
        beam_size=self._beam_size,
        maximum_labels_length=self._maximum_labels_length,
        length_penalty=self._length_penalty)
    spec = model_fn(model_configs=self._model_configs,
                    mode=ModeKeys.INFER,
                    dataset=self._dataset,
                    name=self._model_name,
                    reuse=True,
                    verbose=False)
    self._predict_ops = spec.predictions
    # directory that holds temporary translation outputs
    trans_dir = os.path.join(self._model_configs["model_dir"],
                             GlobalNames.TMP_TRANS_DIRNAME)
    if not gfile.Exists(trans_dir):
        gfile.MakeDirs(trans_dir)
    self._tmp_trans_file_prefix = os.path.join(
        trans_dir, GlobalNames.TMP_TRANS_FILENAME_PREFIX)
    self._read_ckpt_bleulog()
    self._eval_labels_file = self._dataset.eval_labels_file
    self._check_bleu_script()
    # early-stop patience counter and best score seen so far
    self._estop_patience = 0
    self._best_bleu_score = 0.
def run(self):
    """Run the ensemble model over every configured inference file."""
    opts = self._model_configs["infer"]
    self._vocab_source = Vocab(
        filename=opts["source_words_vocabulary"],
        bpe_codes_file=opts["source_bpecodes"])
    self._vocab_target = Vocab(
        filename=opts["target_words_vocabulary"],
        bpe_codes_file=opts["target_bpecodes"])
    # dataset wrapping all feature files that will be translated
    dataset = Dataset(
        self._vocab_source,
        self._vocab_target,
        eval_features_file=[p["features_file"]
                            for p in self._model_configs["infer_data"]])
    spec = model_fn_ensemble(
        self._model_dirs, dataset,
        weight_scheme=self._weight_scheme,
        inference_options=opts)
    prediction_op = spec.predictions
    sess = self._build_default_session()
    inputter = TextLineInputter(
        dataset=dataset,
        data_field_name="eval_features_file",
        batch_size=opts["batch_size"],
        maximum_line_length=None)
    sess.run(tf.global_variables_initializer())
    tf.logging.info("Start inference.")
    overall_start_time = time.time()
    # translate each file, timing and (optionally) scoring it
    for feeding_data, data_cfg in zip(inputter.make_feeding_data(),
                                      self._model_configs["infer_data"]):
        tf.logging.info("Infer Source Features File: {}.".format(
            data_cfg["features_file"]))
        start_time = time.time()
        infer(sess=sess,
              prediction_op=prediction_op,
              feeding_data=feeding_data,
              output=data_cfg["output_file"],
              vocab_target=self._vocab_target,
              alpha=opts["length_penalty"],
              delimiter=opts["delimiter"],
              output_attention=False,
              tokenize_output=opts["char_level"],
              tokenize_script=opts["tokenize_script"],
              verbose=True)
        tf.logging.info("FINISHED {}. \nElapsed Time: {}.".format(
            data_cfg["features_file"], str(time.time() - start_time)))
        # score against references when a labels file is provided
        if data_cfg["labels_file"] is not None:
            bleu_score = multi_bleu_score(
                opts["multibleu_script"],
                data_cfg["labels_file"], data_cfg["output_file"])
            tf.logging.info("BLEU score ({}): {}".format(
                data_cfg["features_file"], bleu_score))
    tf.logging.info("Total Elapsed Time: %s"
                    % str(time.time() - overall_start_time))
def run(self):
    """Run the ensemble model over every configured inference file."""
    opts = self._model_configs["infer"]
    vocab_source = Vocab(
        filename=opts["source_words_vocabulary"],
        bpe_codes=opts["source_bpecodes"])
    vocab_target = Vocab(
        filename=opts["target_words_vocabulary"],
        bpe_codes=opts["target_bpecodes"])
    spec = model_fn_ensemble(
        self._model_dirs, vocab_source, vocab_target,
        weight_scheme=self._weight_scheme,
        inference_options=opts)
    prediction_op = spec.predictions
    sess = self._build_default_session()
    # one LineReader per features file; tokens are mapped to ids up front
    readers = [
        LineReader(
            data=p["features_file"],
            preprocessing_fn=lambda x: vocab_source.convert_to_idlist(x))
        for p in self._model_configs["infer_data"]]
    inputter = TextLineInputter(
        line_readers=readers,
        padding_id=vocab_source.pad_id,
        batch_size=opts["batch_size"])
    sess.run(tf.global_variables_initializer())
    tf.logging.info("Start inference.")
    overall_start_time = time.time()
    # translate each file, timing and (optionally) scoring it
    for feeding_data, data_cfg in zip(
            inputter.make_feeding_data(spec.input_fields),
            self._model_configs["infer_data"]):
        tf.logging.info("Infer Source Features File: {}.".format(
            data_cfg["features_file"]))
        start_time = time.time()
        infer(sess=sess,
              prediction_op=prediction_op,
              infer_data=feeding_data,
              output=data_cfg["output_file"],
              vocab_source=vocab_source,
              vocab_target=vocab_target,
              delimiter=opts["delimiter"],
              output_attention=False,
              to_char_level=opts["char_level"],
              verbose=True)
        tf.logging.info("FINISHED {}. Elapsed Time: {}.".format(
            data_cfg["features_file"], str(time.time() - start_time)))
        # score against references when a labels file is provided
        if data_cfg["labels_file"] is not None:
            bleu_score = multi_bleu_score_from_file(
                hypothesis_file=data_cfg["output_file"],
                references_files=data_cfg["labels_file"],
                char_level=opts["char_level"])
            tf.logging.info("BLEU score (%s): %.2f"
                            % (data_cfg["features_file"], bleu_score))
    tf.logging.info("Total Elapsed Time: %s"
                    % str(time.time() - overall_start_time))
def _prepare(self):
    """Set up everything needed for periodic BLEU evaluation.

    Rebuilds the model graph with mode=INFER and reuse=True, builds the
    inference feeding data, creates the temporary translation directory,
    and loads sources and references into memory.
    """
    features_file = self._dataset["features_file"]
    labels_file = self._dataset["labels_file"]
    vocab_source = self._dataset["vocab_source"]
    vocab_target = self._dataset["vocab_target"]
    # refresh inference hyperparameters before rebuilding the graph
    self._model_configs = update_infer_params(
        self._model_configs,
        beam_size=self._beam_size,
        maximum_labels_length=self._maximum_labels_length,
        length_penalty=self._length_penalty)
    spec = model_fn(model_configs=self._model_configs,
                    mode=ModeKeys.INFER,
                    vocab_source=vocab_source,
                    vocab_target=vocab_target,
                    name=self._model_name,
                    reuse=True,
                    verbose=False)
    self._predict_ops = spec.predictions
    inputter = TextLineInputter(
        line_readers=LineReader(
            data=features_file,
            preprocessing_fn=lambda x: vocab_source.convert_to_idlist(x)),
        padding_id=vocab_source.pad_id,
        batch_size=self._batch_size)
    self._infer_data = inputter.make_feeding_data(
        input_fields=spec.input_fields)
    # directory that holds temporary translation outputs
    trans_dir = os.path.join(self._model_configs["model_dir"],
                             Constants.TMP_TRANS_DIRNAME)
    if not gfile.Exists(trans_dir):
        gfile.MakeDirs(trans_dir)
    self._tmp_trans_file_prefix = os.path.join(
        trans_dir, Constants.TMP_TRANS_FILENAME_PREFIX)
    self._read_ckpt_bleulog()
    # load references, one list per reference file
    self._references = []
    for rfile in access_multiple_files(labels_file):
        with open_file(rfile) as fp:
            if self._char_level:
                self._references.append(to_chinese_char(fp.readlines()))
            else:
                self._references.append(fp.readlines())
    # transpose: one list of alternative references per sentence
    self._references = list(map(list, zip(*self._references)))
    with open_file(features_file) as fp:
        self._sources = fp.readlines()
    self._bad_count = 0
    self._best_bleu_score = 0.
def _prepare(self):
    """Set up everything needed for periodic BLEU evaluation.

    Builds the inference feeding data, rebuilds the model graph with
    mode=INFER and reuse=True, creates the temporary translation
    directory, and loads sources and references into memory.
    """
    inputter = TextLineInputter(dataset=self._dataset,
                                data_field_name="eval_features_file",
                                batch_size=self._batch_size)
    self._infer_data = inputter.make_feeding_data()
    # refresh inference hyperparameters before rebuilding the graph
    self._model_configs = update_infer_params(
        self._model_configs,
        beam_size=self._beam_size,
        maximum_labels_length=self._maximum_labels_length,
        length_penalty=self._length_penalty)
    spec = model_fn(model_configs=self._model_configs,
                    mode=ModeKeys.INFER,
                    dataset=self._dataset,
                    name=self._model_name,
                    reuse=True,
                    verbose=False)
    self._predict_ops = spec.predictions
    # directory that holds temporary translation outputs
    trans_dir = os.path.join(self._model_configs["model_dir"],
                             Constants.TMP_TRANS_DIRNAME)
    if not gfile.Exists(trans_dir):
        gfile.MakeDirs(trans_dir)
    self._tmp_trans_file_prefix = os.path.join(
        trans_dir, Constants.TMP_TRANS_FILENAME_PREFIX)
    self._read_ckpt_bleulog()
    # load references, one list per reference file
    self._references = []
    for rfile in self._dataset.eval_labels_file:
        with open_file(rfile) as fp:
            self._references.append(fp.readlines())
    # transpose: one list of alternative references per sentence
    self._references = list(map(list, zip(*self._references)))
    with open_file(self._dataset.eval_features_file) as fp:
        self._sources = fp.readlines()
    self._bad_count = 0
    self._best_bleu_score = 0.
def run(self):
    """Translate every configured data file with the trained model."""
    # vocabularies (target side may be reversed to match training)
    self._vocab_source = Vocab(
        filename=self._model_configs["infer"]["source_words_vocabulary"],
        bpe_codes=self._model_configs["infer"]["source_bpecodes"],
        reverse_seq=False)
    self._vocab_target = Vocab(
        filename=self._model_configs["infer"]["target_words_vocabulary"],
        bpe_codes=self._model_configs["infer"]["target_bpecodes"],
        reverse_seq=self._model_configs["train"]["reverse_target"])
    # dataset wrapping all feature files that will be translated
    dataset = Dataset(
        self._vocab_source,
        self._vocab_target,
        eval_features_file=[p["features_file"]
                            for p in self._model_configs["infer_data"]])
    self._model_configs = update_infer_params(
        self._model_configs,
        beam_size=self._model_configs["infer"]["beam_size"],
        maximum_labels_length=self._model_configs["infer"]["maximum_labels_length"],
        length_penalty=self._model_configs["infer"]["length_penalty"])
    # NOTE: read the "infer" sub-config only after update_infer_params
    # has (possibly) replaced self._model_configs
    opts = self._model_configs["infer"]
    spec = model_fn(model_configs=self._model_configs,
                    mode=ModeKeys.INFER,
                    dataset=dataset,
                    name=self._model_configs["problem_name"])
    prediction_op = spec.predictions
    sess = self._build_default_session()
    inputter = TextLineInputter(
        dataset=dataset,
        data_field_name="eval_features_file",
        batch_size=opts["batch_size"])
    # restore the latest checkpoint, failing loudly if none exists
    checkpoint_path = tf.train.latest_checkpoint(
        self._model_configs["model_dir"])
    if checkpoint_path:
        tf.logging.info("reloading models...")
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint_path)
    else:
        raise OSError("File NOT Found. \nFail to find checkpoint file from: {}"
                      .format(self._model_configs["model_dir"]))
    tf.logging.info("Start inference.")
    overall_start_time = time.time()
    # translate each file, timing and (optionally) scoring it
    for infer_data, data_cfg in zip(
            inputter.make_feeding_data(input_fields=spec.input_fields),
            self._model_configs["infer_data"]):
        tf.logging.info("Infer Source File: {}.".format(
            data_cfg["features_file"]))
        start_time = time.time()
        infer(sess=sess,
              prediction_op=prediction_op,
              infer_data=infer_data,
              output=data_cfg["output_file"],
              vocab_source=self._vocab_source,
              vocab_target=self._vocab_target,
              delimiter=opts["delimiter"],
              output_attention=data_cfg["output_attention"],
              tokenize_output=opts["char_level"],
              verbose=True)
        tf.logging.info("FINISHED {}. Elapsed Time: {}."
                        .format(data_cfg["features_file"],
                                str(time.time() - start_time)))
        # score against references when a labels file is provided
        if data_cfg["labels_file"] is not None:
            bleu_score = multi_bleu_score_from_file(
                hypothesis_file=data_cfg["output_file"],
                references_files=data_cfg["labels_file"],
                char_level=opts["char_level"])
            tf.logging.info("BLEU score (%s): %.2f"
                            % (data_cfg["features_file"], bleu_score))
    tf.logging.info("Total Elapsed Time: %s"
                    % str(time.time() - overall_start_time))