Example #1
    def __init__(self,
                 source,
                 target,
                 vocab_source,
                 vocab_target,
                 batch_size=80,
                 maxlen_src=50,
                 maxlen_trg=100,
                 n_words_src=-1,
                 n_words_trg=-1,
                 shuffle_every_epoch=None,
                 shuffle_before_train=None):
        """

        :param source: `str`
        :param target: `str`
        :param vocab_source: `Vocab`
        :param vocab_target: `Vocab`
        :param batch_size: `int`
        :param maxlen_src: `int`
        :param maxlen_trg: `int`
        :param n_words_src: `int`
        :param n_words_trg: `int`
        :param shuffle_every_epoch: if not None, use it as the postfix of the shuffled data files
        :param shuffle_before_train: if not None, use it as the postfix of the shuffled data files
        :return:
        """
        if shuffle_before_train:
            tf.logging.info("shuffling data before training\n"
                            "\t%s ==> %s\n\t%s ==> %s" %
                            (source, "./source.shuf." + shuffle_before_train,
                             target, "./target.shuf." + shuffle_before_train))
            shuffle_data([source, target], [
                "./source.shuf." + shuffle_before_train,
                "./target.shuf." + shuffle_before_train
            ])
            source = "./source.shuf." + shuffle_before_train
            target = "./target.shuf." + shuffle_before_train
        self.source_file = source
        self.target_file = target

        self.source = open_file(source, encoding='utf-8')
        self.target = open_file(target, encoding='utf-8')

        self.vocab_source = vocab_source
        self.vocab_target = vocab_target

        self.batch_size = batch_size
        self.maxlen_src = maxlen_src
        self.maxlen_trg = maxlen_trg

        self.n_words_src = n_words_src
        self.n_words_trg = n_words_trg

        self.source_buffer = []
        self.target_buffer = []

        self.k = batch_size * 128
        self.end_of_data = False
        self.shuffle_every_epoch = shuffle_every_epoch
Example #2
        def _shuffle_and_reopen(self):
            """ Shuffles the features & labels files. """
            if self._parent._shuffle_every_epoch:
                if not hasattr(self, "_shuffled_features_file"):
                    self._shuffled_features_file = self._features_file.strip().split("/")[-1] \
                                                   + "." + self._parent._shuffle_every_epoch
                    self._shuffled_labels_file = self._labels_file.strip().split("/")[-1] \
                                                 + "." + self._parent._shuffle_every_epoch

                tf.logging.info(
                    "shuffling data\n\t{} ==> {}\n\t{} ==> {}".format(
                        self._features_file, self._shuffled_features_file,
                        self._labels_file, self._shuffled_labels_file))
                shuffle_data(
                    [self._features_file, self._labels_file],
                    [self._shuffled_features_file, self._shuffled_labels_file])
                self._features_file = self._shuffled_features_file
                self._labels_file = self._shuffled_labels_file
                if hasattr(self, "_features"):
                    close_file(self._features)
                    close_file(self._labels)
            elif hasattr(self, "_features"):
                self._features.seek(0)
                self._labels.seek(0)
                return self._features, self._labels
            return open_file(self._features_file), open_file(self._labels_file)
Example #3
    def _SmallParallelData(self,
                           features_file,
                           labels_file,
                           maximum_features_length=None,
                           maximum_labels_length=None):
        """ Function for reading small scale parallel data for evaluation.

        Args:
            features_file: The path of features file.
            labels_file: The path of labels file.
            maximum_features_length: The maximum length of feature symbols (especially
              after BPE is applied). If provided, sentences whose number of symbols
              exceeds this value will be ignored.
            maximum_labels_length: The maximum length of label symbols (especially
              after BPE is applied). If provided, sentences whose number of symbols
              exceeds this value will be ignored.

        Returns: A list of feeding data.
        """
        features = open_file(features_file, encoding="utf-8")
        labels = open_file(labels_file[0], encoding="utf-8")

        ss_buf = []
        tt_buf = []
        while True:
            ss = read_line_with_filter(features, maximum_features_length,
                                       self._features_preprocessing_fn)
            tt = read_line_with_filter(labels, maximum_labels_length,
                                       self._labels_preprocessing_fn)
            if ss == "" or tt == "":
                break
            ss_buf.append(ss)
            tt_buf.append(tt)
        close_file(features)
        close_file(labels)
        if self._bucketing:
            tt_buf, ss_buf = do_bucketing(tt_buf, [ss_buf])
            ss_buf = ss_buf[0]
        data = []
        batch_data_idx = 0
        while batch_data_idx < len(ss_buf):
            x, len_x = padding_batch_data(
                ss_buf[batch_data_idx:batch_data_idx + self._batch_size],
                self._features_padding)
            y, len_y = padding_batch_data(
                tt_buf[batch_data_idx:batch_data_idx + self._batch_size],
                self._labels_padding)
            data.append({
                "feature_ids": x,
                "label_ids": y,
                "feed_dict": {
                    self.input_fields[Constants.FEATURE_IDS_NAME]: x,
                    self.input_fields[Constants.FEATURE_LENGTH_NAME]: len_x,
                    self.input_fields[Constants.LABEL_IDS_NAME]: y,
                    self.input_fields[Constants.LABEL_LENGTH_NAME]: len_y
                }
            })
            batch_data_idx += self._batch_size
        return data
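
Several of these examples call `padding_batch_data(batch_of_id_lists, padding_id)` and unpack the result as `(ids, lengths)`. The helper itself is not shown on this page; the following is a minimal sketch of what such a function could look like, inferred only from how it is called above, and not necessarily the project's actual implementation.

import numpy


def padding_batch_data(seqs_x, padding_id):
    """Minimal sketch: pad a batch of id lists to a common length.

    `seqs_x` is a list of integer id lists and `padding_id` is the id used
    for padding (the examples above pass `vocab.eos_id`). Returns a 2-D
    numpy array of shape [batch, max_len] and a list of the original
    lengths, matching the `(x, len_x)` unpacking seen above. The real
    NJUNMT-tf helper may differ in details.
    """
    lengths = [len(s) for s in seqs_x]
    max_len = max(lengths)
    batch = numpy.full((len(seqs_x), max_len), padding_id, dtype=numpy.int32)
    for i, s in enumerate(seqs_x):
        batch[i, :len(s)] = s
    return batch, lengths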
Example #4
    def _SmallParallelData(self,
                           features_file,
                           labels_file,
                           input_fields,
                           maximum_features_length=None,
                           maximum_labels_length=None):
        """ Function for reading small scale parallel data for evaluation.

        Args:
            features_file: The path of features file.
            labels_file: The path of labels file.
            input_fields: A dict of placeholders.
            maximum_features_length: The maximum length of feature symbols (especially
              after BPE is applied). If provided, sentences whose number of symbols
              exceeds this value will be ignored.
            maximum_labels_length: The maximum length of label symbols (especially
              after BPE is applied). If provided, sentences whose number of symbols
              exceeds this value will be ignored.

        Returns: A list of feeding data.
        """
        features = open_file(features_file, encoding="utf-8")
        labels = open_file(labels_file[0], encoding="utf-8")

        ss_buf = []
        tt_buf = []
        while True:
            ss = read_line_with_filter(features, maximum_features_length,
                                       self._features_preprocessing_fn)
            tt = read_line_with_filter(labels, maximum_labels_length,
                                       self._labels_preprocessing_fn)
            if ss == "" or tt == "":
                break
            ss_buf.append(ss)
            tt_buf.append(tt)
        close_file(features)
        close_file(labels)
        if self._bucketing:
            tt_buf, ss_buf = do_bucketing(tt_buf, [ss_buf])
            ss_buf = ss_buf[0]
        data = []
        batch_data_idx = 0
        while batch_data_idx < len(ss_buf):
            data.append(
                pack_feed_dict(
                    name_prefixs=[
                        Constants.FEATURE_NAME_PREFIX,
                        Constants.LABEL_NAME_PREFIX
                    ],
                    origin_datas=[
                        ss_buf[batch_data_idx:batch_data_idx +
                               self._batch_size],
                        tt_buf[batch_data_idx:batch_data_idx +
                               self._batch_size]
                    ],
                    paddings=[self._features_padding, self._labels_padding],
                    input_fields=input_fields))
            batch_data_idx += self._batch_size
        return data
Example #5
    def _EvalParallelData(self, features_file, labels_file):
        """ Function for reading small scale parallel data for evaluation.

        Args:
            features_file: The path of features file.
            labels_file: The path of labels file.

        Returns: A list of feeding data.
        """
        eval_features = open_file(features_file, encoding="utf-8")
        if gfile.Exists(labels_file):
            eval_labels = open_file(labels_file, encoding="utf-8")
        else:
            eval_labels = open_file(labels_file + "0", encoding="utf-8")
        ss_buf = []
        tt_buf = []
        ss_str_buf = []
        tt_str_buf = []
        for ss, tt in zip(eval_features, eval_labels):
            ss_str = self._vocab_source.bpe_encode(ss.strip()).split()
            tt_str = self._vocab_target.bpe_encode(tt.strip()).split()
            ss_str_buf.append(ss_str)
            tt_str_buf.append(tt_str)
            ss_buf.append(self._vocab_source.convert_to_idlist(ss.strip()))
            tt_buf.append(self._vocab_target.convert_to_idlist(tt.strip()))
        close_file(eval_features)
        close_file(eval_labels)
        if self._bucketing:
            tlen = numpy.array([len(t) for t in tt_buf])
            tidx = tlen.argsort()
            _ss_buf = [ss_buf[i] for i in tidx]
            _tt_buf = [tt_buf[i] for i in tidx]
            _ss_str_buf = [ss_str_buf[i] for i in tidx]
            _tt_str_buf = [tt_str_buf[i] for i in tidx]
            ss_buf = _ss_buf
            tt_buf = _tt_buf
            ss_str_buf = _ss_str_buf
            tt_str_buf = _tt_str_buf
        data = []
        batch_data_idx = 0
        while batch_data_idx < len(ss_buf):
            x, len_x = padding_batch_data(
                ss_buf[batch_data_idx:batch_data_idx + self._batch_size],
                self._vocab_source.eos_id)
            y, len_y = padding_batch_data(
                tt_buf[batch_data_idx:batch_data_idx + self._batch_size],
                self._vocab_target.eos_id)
            data.append(
                (ss_str_buf[batch_data_idx:batch_data_idx + self._batch_size],
                 tt_str_buf[batch_data_idx:batch_data_idx + self._batch_size],
                 {
                     self.input_fields[GlobalNames.PH_FEATURE_IDS_NAME]: x,
                     self.input_fields[GlobalNames.PH_FEATURE_LENGTH_NAME]:
                     len_x,
                     self.input_fields[GlobalNames.PH_LABEL_IDS_NAME]: y,
                     self.input_fields[GlobalNames.PH_LABEL_LENGTH_NAME]: len_y
                 }))
            batch_data_idx += self._batch_size
        return data
Example #6
    def _reset(self):
        if self._parent._shuffle_every_epoch:
            close_file(self._features)
            close_file(self._labels)
            self._shuffle()
            self._features = open_file(self._features_file, encoding="utf-8")
            self._labels = open_file(self._labels_file, encoding="utf-8")
        self._features.seek(0)
        self._labels.seek(0)
Example #7
    def __init__(self, source, target,
                 vocab_source, vocab_target,
                 batch_size=80,
                 maxlen_src=50, maxlen_trg=100,
                 n_words_src=-1, n_words_trg=-1,
                 shuffle_every_epoch=None,
                 shuffle_before_train=None):
        """

        :param source: `str`
        :param target: `str`
        :param vocab_source: `Vocab`
        :param vocab_target: `Vocab`
        :param batch_size: `int`
        :param maxlen_src: `int`
        :param maxlen_trg: `int`
        :param n_words_src: `int`
        :param n_words_trg: `int`
        :param shuffle_every_epoch: if not None, use it as the postfix of the shuffled data files
        :param shuffle_before_train: if not None, use it as the postfix of the shuffled data files
        :return:
        """
        if shuffle_before_train:
            tf.logging.info("shuffling data before training\n"
                         "\t%s ==> %s\n\t%s ==> %s"
                         % (source, "./source.shuf." + shuffle_before_train,
                            target, "./target.shuf." + shuffle_before_train))
            shuffle_data([source, target],
                         ["./source.shuf." + shuffle_before_train,
                          "./target.shuf." + shuffle_before_train])
            source = "./source.shuf." + shuffle_before_train
            target = "./target.shuf." + shuffle_before_train
        self.source_file = source
        self.target_file = target

        self.source = open_file(source, encoding='utf-8')
        self.target = open_file(target, encoding='utf-8')

        self.vocab_source = vocab_source
        self.vocab_target = vocab_target

        self.batch_size = batch_size
        self.maxlen_src = maxlen_src
        self.maxlen_trg = maxlen_trg

        self.n_words_src = n_words_src
        self.n_words_trg = n_words_trg

        self.source_buffer = []
        self.target_buffer = []

        self.k = batch_size * 128
        self.end_of_data = False
        self.shuffle_every_epoch = shuffle_every_epoch
Example #8
    def reset(self):
        if self.shuffle_every_epoch:
            close_file(self.source)
            close_file(self.target)
            tf.logging.info("shuffling data among epochs")
            shuffle_data([self.source_file, self.target_file],
                         ["./source.shuf." + self.shuffle_every_epoch,
                          "./target.shuf." + self.shuffle_every_epoch])
            self.source = open_file("./source.shuf." + self.shuffle_every_epoch)
            self.target = open_file("./target.shuf." + self.shuffle_every_epoch)
        else:
            self.source.seek(0)
            self.target.seek(0)
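
`shuffle_data` is called here with a list of parallel input files and a list of output files, and the shuffled source/target files must stay line-aligned. A minimal sketch under that assumption (the project's real helper may differ) could be:

import numpy


def shuffle_data(from_binding, to_binding):
    """Minimal sketch: shuffle parallel text files with one permutation.

    `from_binding` and `to_binding` are equally long lists of input and
    output file names. The same permutation is applied to every file so
    that source and target lines stay aligned. This is illustrative only,
    not necessarily the project's implementation.
    """
    corpora = []
    for fname in from_binding:
        with open(fname, encoding="utf-8") as fp:
            corpora.append(fp.readlines())
    perm = numpy.random.permutation(len(corpora[0]))
    for lines, out_name in zip(corpora, to_binding):
        with open(out_name, "w", encoding="utf-8") as fw:
            for idx in perm:
                fw.write(lines[idx])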
Example #9
    def _prepare(self):
        """ Prepares for evaluation.

        Builds the model with reuse=True, mode=INFER and preprocesses
        data file(s).
        """
        features_file = self._dataset["features_file"]
        labels_file = self._dataset["labels_file"]
        vocab_source = self._dataset["vocab_source"]
        vocab_target = self._dataset["vocab_target"]
        self._model_configs = update_infer_params(  # update inference parameters
            self._model_configs,
            beam_size=self._beam_size,
            maximum_labels_length=self._maximum_labels_length,
            length_penalty=self._length_penalty)
        estimator_spec = model_fn(model_configs=self._model_configs,
                                  mode=ModeKeys.INFER,
                                  vocab_source=vocab_source,
                                  vocab_target=vocab_target,
                                  name=self._model_name, reuse=True,
                                  verbose=False)
        self._predict_ops = estimator_spec.predictions
        text_inputter = TextLineInputter(
            line_readers=LineReader(
                data=features_file,
                preprocessing_fn=lambda x: vocab_source.convert_to_idlist(x)),
            padding_id=vocab_source.pad_id,
            batch_size=self._batch_size)
        self._infer_data = text_inputter.make_feeding_data(
            input_fields=estimator_spec.input_fields)
        tmp_trans_dir = os.path.join(self._model_configs["model_dir"], Constants.TMP_TRANS_DIRNAME)
        if not gfile.Exists(tmp_trans_dir):
            gfile.MakeDirs(tmp_trans_dir)
        self._tmp_trans_file_prefix = os.path.join(tmp_trans_dir, Constants.TMP_TRANS_FILENAME_PREFIX)
        self._read_ckpt_bleulog()
        # load references
        self._references = []
        for rfile in access_multiple_files(labels_file):
            with open_file(rfile) as fp:
                if self._char_level:
                    self._references.append(to_chinese_char(fp.readlines()))
                else:
                    self._references.append(fp.readlines())
        self._references = list(map(list, zip(*self._references)))
        with open_file(features_file) as fp:
            self._sources = fp.readlines()
        self._bad_count = 0
        self._best_bleu_score = 0.
Example #10
    def _read_ckpt_bleulog(self):
        """ Reads the best BLEU scores and the names of the corresponding
        checkpoint archives from the log file. """
        if gfile.Exists(self._top_bleu_ckpt_log_filename):
            with open_file(self._top_bleu_ckpt_log_filename, mode="r") as fp:
                self._best_checkpoint_bleus = [float(x) for x in fp.readline().strip().split(",")]
                self._best_checkpoint_names = [x for x in fp.readline().strip().split(",")]
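
The log file read here apparently holds two comma-separated lines: the best BLEU scores on the first line and the matching checkpoint names on the second. A hypothetical illustration of how such a file could be produced (the actual file name comes from self._top_bleu_ckpt_log_filename and is not shown in this snippet):

# Hypothetical content of the BLEU/checkpoint log file parsed above.
# Line 1: comma-separated BLEU scores; line 2: matching checkpoint names.
with open("top_bleu_ckpt.log", "w", encoding="utf-8") as fp:
    fp.write("27.3,28.1,28.9\n")
    fp.write("model_iter10000_bleu27.3,model_iter20000_bleu28.1,model_iter30000_bleu28.9\n")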
Example #11
    def __init__(self, source, vocab_source, batch_size=1, n_words_src=-1):
        # read in the batch data
        f_source = open_file(source)

        ss_buf = []
        ss_str_buf = []
        for ss in f_source:
            # ss_str_buf.append(ss.strip())
            ss_str_buf.append(vocab_source.bpe_encode(ss.strip()))
            ss = vocab_source.convert_to_idlist(ss.strip().split(),
                                                n_words_src)
            ss_buf.append(ss)
        f_source.close()

        self.batch_source_buffer = []
        self.batch_source_str_buffer = []

        self.batch_data_idx = 0
        self.batch_size = batch_size
        while self.batch_data_idx < len(ss_buf):
            self.batch_source_buffer.append(
                padding_batch_data(
                    ss_buf[self.batch_data_idx:self.batch_data_idx +
                           batch_size], vocab_source.eos_id))
            self.batch_source_str_buffer.append(
                ss_str_buf[self.batch_data_idx:self.batch_data_idx +
                           batch_size])
            self.batch_data_idx += batch_size
        self.reset()
Example #12
    def reset(self):
        if self.shuffle_every_epoch:
            close_file(self.source)
            close_file(self.target)
            tf.logging.info("shuffling data among epochs")
            shuffle_data([self.source_file, self.target_file], [
                "./source.shuf." + self.shuffle_every_epoch,
                "./target.shuf." + self.shuffle_every_epoch
            ])
            self.source = open_file("./source.shuf." +
                                    self.shuffle_every_epoch)
            self.target = open_file("./target.shuf." +
                                    self.shuffle_every_epoch)
        else:
            self.source.seek(0)
            self.target.seek(0)
Example #13
    def _make_feeding_data_from(self, filename):
        """ Processes the data file and returns an iterable instance for looping.

        Args:
            filename: A specific data file.

        Returns: An iterable instance that packs feeding dictionary
                   for `tf.Session().run` according to the `filename`.
        """
        features = open_file(filename, encoding="utf-8")
        str_buf = []
        ss_buf = []
        for ss in features:
            str_buf.append(self._vocab.bpe_encode(ss.strip()))
            ss_buf.append(self._vocab.convert_to_idlist(ss.strip().split(" ")))
        close_file(features)
        data = []
        batch_data_idx = 0
        while batch_data_idx < len(ss_buf):
            x, len_x = padding_batch_data(
                ss_buf[batch_data_idx: batch_data_idx + self._batch_size],
                self._vocab.eos_id)
            str_x = str_buf[batch_data_idx: batch_data_idx + self._batch_size]
            batch_data_idx += self._batch_size
            data.append((
                str_x, len_x,
                {self.input_fields[GlobalNames.PH_FEATURE_IDS_NAME]: x,
                 self.input_fields[GlobalNames.PH_FEATURE_LENGTH_NAME]: len_x}))
        return data
Example #14
    def __init__(self, source,
                 vocab_source,
                 batch_size=1,
                 n_words_src=-1):
        # read in the batch data
        f_source = open_file(source)

        ss_buf = []
        ss_str_buf = []
        for ss in f_source:
            # ss_str_buf.append(ss.strip())
            ss_str_buf.append(vocab_source.bpe_encode(ss.strip()))
            ss = vocab_source.convert_to_idlist(ss.strip().split(), n_words_src)
            ss_buf.append(ss)
        f_source.close()

        self.batch_source_buffer = []
        self.batch_source_str_buffer = []

        self.batch_data_idx = 0
        self.batch_size = batch_size
        while self.batch_data_idx < len(ss_buf):
            self.batch_source_buffer.append(
                padding_batch_data(ss_buf[self.batch_data_idx: self.batch_data_idx + batch_size], vocab_source.eos_id))
            self.batch_source_str_buffer.append(
                ss_str_buf[self.batch_data_idx: self.batch_data_idx + batch_size])
            self.batch_data_idx += batch_size
        self.reset()
Example #15
File: vocab.py Project: cjliux/NJUNMT-tf
def create_vocabulary_lookup_table_numpy(filename):
    """Creates a lookup table from a vocabulary file.

    Args:
        filename: Path to a vocabulary file containing one word per line.
          Each word is mapped to its line number (starting from 0).

    Returns: A tuple `(word_to_id_mapping, id_to_word_mapping, special_fields)`

    """
    if not gfile.Exists(filename):
        raise ValueError("File does not exist: {}".format(filename))

    # Load vocabulary into memory
    with open_file(filename, encoding="utf-8") as file:
        vocab = list(line.strip("\n") for line in file)
    vocab_size = len(vocab)

    has_counts = len(vocab[0].split("\t")) == 2
    if has_counts:
        vocab, counts = zip(*[_.split("\t") for _ in vocab])
        counts = [float(_) for _ in counts]
        vocab = list(vocab)
    else:
        counts = [-1. for _ in vocab]

    # Add special vocabulary items
    special_vocab = get_special_vocab(vocab_size)
    vocab += list(special_vocab._fields)
    vocab_size += len(special_vocab)
    counts += [-1. for _ in list(special_vocab._fields)]

    return {v: k for k, v in enumerate(vocab)}, \
           {k: v for k, v in enumerate(vocab)}, \
           special_vocab._fields
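
A minimal usage sketch for the function above, assuming the project's helpers (`open_file`, `gfile`, `get_special_vocab`) are importable and that `vocab.src.txt` is a hypothetical vocabulary file with one token (optionally followed by a tab and a count) per line:

# "vocab.src.txt" is a hypothetical path; one token per line.
word_to_id, id_to_word, special_fields = \
    create_vocabulary_lookup_table_numpy("vocab.src.txt")

print(len(word_to_id))                 # vocabulary size including the special tokens
print(special_fields)                  # names of the special token fields
assert word_to_id[id_to_word[0]] == 0  # the two mappings are inverses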
Example #16
def load_from_config_path(config_paths):
    """ Loads configurations from files in YAML format.

    Args:
        config_paths: A string (each file name is separated by ",") or
          a list of strings (file names).

    Returns: A dictionary of model configurations, parsed from config files.
    """
    if isinstance(config_paths, six.string_types):
        config_paths = config_paths.strip().split(",")
    assert isinstance(config_paths, list) or isinstance(config_paths, tuple)
    model_configs = dict()
    for config_path in config_paths:
        config_path = config_path.strip()
        if not config_path:
            continue
        if not gfile.Exists(config_path):
            raise OSError("config file does not exist: {}".format(config_path))
        config_path = os.path.abspath(config_path)
        tf.logging.info("loading configurations from {}".format(config_path))
        with open_file(config_path, mode="r") as config_file:
            config_flags = yaml.load(config_file)
            model_configs = deep_merge_dict(model_configs, config_flags)
    return model_configs
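
A usage sketch with hypothetical YAML file names, assuming the project's dependencies (`yaml`, `gfile`, `open_file`, `deep_merge_dict`) are available; later files are deep-merged on top of earlier ones, so `experiment.yml` here would override values from `base.yml`:

# "base.yml" and "experiment.yml" are hypothetical configuration files.
model_configs = load_from_config_path("base.yml,experiment.yml")
# Equivalent call with a list instead of a comma-separated string:
model_configs = load_from_config_path(["base.yml", "experiment.yml"])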
Example #17
File: vocab.py Project: KIngpon/NJUNMT-tf
def create_vocabulary_lookup_table_numpy(filename):
    """Creates a lookup table from a vocabulary file.

    Args:
        filename: Path to a vocabulary file containing one word per line.
          Each word is mapped to its line number (starting from 0).

    Returns: A tuple `(word_to_id_mapping, id_to_word_mapping, special_fields)`

    """
    if not gfile.Exists(filename):
        raise ValueError("File does not exist: {}".format(filename))

    # Load vocabulary into memory
    with open_file(filename, encoding="utf-8") as file:
        vocab = list(line.strip("\n") for line in file)
    vocab_size = len(vocab)

    has_counts = len(vocab[0].split("\t")) == 2
    if has_counts:
        vocab, counts = zip(*[_.split("\t") for _ in vocab])
        counts = [float(_) for _ in counts]
        vocab = list(vocab)
    else:
        counts = [-1. for _ in vocab]

    # Add special vocabulary items
    special_vocab = get_special_vocab(vocab_size)
    vocab += list(special_vocab._fields)
    vocab_size += len(special_vocab)
    counts += [-1. for _ in list(special_vocab._fields)]

    return {v: k for k, v in enumerate(vocab)}, \
           {k: v for k, v in enumerate(vocab)}, \
           special_vocab._fields
Example #18
    def __init__(self,
                 source,
                 target,
                 vocab_source,
                 vocab_target,
                 batch_size=128,
                 n_words_src=-1,
                 n_words_trg=-1):
        # read in the batch data
        f_source = open_file(source)
        if gfile.Exists(target):
            f_target = open_file(target)
        else:
            f_target = open_file(target + "0")

        ss_buf = []
        tt_buf = []
        for ss, tt in zip(f_source, f_target):
            ss = vocab_source.convert_to_idlist(ss.strip().split(),
                                                n_words_src)
            tt = vocab_target.convert_to_idlist(tt.strip().split(),
                                                n_words_trg)
            ss_buf.append(ss)
            tt_buf.append(tt)
        f_source.close()
        f_target.close()
        tlen = numpy.array([len(t) for t in tt_buf])
        tidx = tlen.argsort()
        _ss_buf = [ss_buf[i] for i in tidx]
        _tt_buf = [tt_buf[i] for i in tidx]
        ss_buf = _ss_buf
        tt_buf = _tt_buf
        self.batch_source_buffer = []
        self.batch_target_buffer = []
        self.batch_data_idx = 0
        self.batch_size = batch_size
        while self.batch_data_idx < len(ss_buf):
            self.batch_source_buffer.append(
                padding_batch_data(
                    ss_buf[self.batch_data_idx:self.batch_data_idx +
                           batch_size], vocab_source.eos_id))
            self.batch_target_buffer.append(
                padding_batch_data(
                    tt_buf[self.batch_data_idx:self.batch_data_idx +
                           batch_size], vocab_target.eos_id))
            self.batch_data_idx += batch_size
        self.reset()
Example #19
        def __init__(self,
                     parent,
                     features_file,
                     labels_file,
                     maximum_features_length=None,
                     maximum_labels_length=None,
                     maximum_encoded_features_length=None,
                     maximum_encoded_labels_length=None):
            """ Initializes.

            Args:
                parent: A `ParallelTextInputter` object.
                features_file: The path of features file.
                labels_file: The path of labels file.
                maximum_features_length: The maximum sequence length of "features" field.
                  If provided, sentences exceeding this value will be ignored.
                maximum_labels_length: The maximum sequence length of "labels" field.
                  If provided, sentences exceeding this value will be ignored.
                maximum_encoded_features_length: The maximum length of feature symbols (especially
                  after BPE is applied). If provided, sentences whose number of symbols
                  exceeds this value will be ignored.
                maximum_encoded_labels_length: The maximum length of label symbols (especially
                  after BPE is applied). If provided, sentences whose number of symbols
                  exceeds this value will be ignored.
            """
            self._parent = parent
            self._features_file = features_file
            self._labels_file = labels_file
            if not gfile.Exists(self._labels_file):
                self._labels_file = self._labels_file + "0"
            self._maximum_features_length = maximum_features_length
            self._maximum_labels_length = maximum_labels_length
            self._maximum_encoded_features_length = maximum_encoded_features_length
            self._maximum_encoded_labels_length = maximum_encoded_labels_length
            if self._parent._shuffle_every_epoch:
                self._shuffle_features_file = self._features_file.strip().split("/")[-1] \
                                              + "." + self._parent._shuffle_every_epoch
                self._shuffle_labels_file = self._labels_file.strip().split("/")[-1] \
                                            + "." + self._parent._shuffle_every_epoch
                self._shuffle()
            self._features = open_file(self._features_file, encoding="utf-8")
            self._labels = open_file(self._labels_file, encoding="utf-8")
            self._features_buffer = []
            self._labels_buffer = []
            self._features_len_buffer = []
            self._labels_len_buffer = []
            self._end_of_data = False
Example #20
    def _update_bleu_ckpt(self, run_context, bleu, hypothesis, global_step):
        """ Updates the best checkpoints according to BLEU score and
        removes the worst model if the number of checkpoint archives
        exceeds maximum_keep_models.

        If the model does not improve the BLEU score anymore (i.e., hits the
        maximum patience), the session is requested to stop.

        Args:
            run_context: A `SessionRunContext` object.
            bleu: A python float, the BLEU score derived by the model
              at this step.
            hypothesis: A list of hypothesis for validation set.
            global_step: A python integer, the current training step.
        """
        if bleu >= self._best_bleu_score:
            self._best_bleu_score = bleu
            self._bad_count = 0
        else:
            self._bad_count += 1
        if self._bad_count >= self._estop_patience_max and self._early_stop:
            tf.logging.info("early stop.")
            run_context.request_stop()
        # saving checkpoints if eval_steps and save_checkpoint_steps mismatch
        if len(self._best_checkpoint_names) == 0 or bleu > self._best_checkpoint_bleus[0]:
            with open_file(self._tmp_trans_file_prefix + str(global_step), mode="w") as fw:
                fw.write('\n'.join(hypothesis) + "\n")
            if not gfile.Exists("{}-{}.meta".format(
                    os.path.join(self._checkpoint_dir, Constants.MODEL_CKPT_FILENAME), global_step)):
                saver = saver_lib._get_saver_or_default()
                saver.save(run_context.session,
                           os.path.join(self._checkpoint_dir, Constants.MODEL_CKPT_FILENAME),
                           global_step=global_step)
            backup_dirname = os.path.join(self._model_configs["model_dir"], "../") \
                             + "{dirname_prefix}_iter{global_step}_bleu{bleu}".format(
                dirname_prefix=Constants.BACKUP_MODEL_DIRNAME_PREFIX,
                global_step=global_step,
                bleu=("%.1f" % bleu))
            tf.logging.info("Saving to directory: {}/".format(backup_dirname))
            os.system("mkdir {backup_dirname};"
                      "cp {ckpt_dirname}/checkpoint {backup_dirname}/;"
                      "cp {ckpt_dirname}/{model_config} {backup_dirname}/;"
                      "cp {ckpt_dirname}/{model_analysis} {backup_dirname}/;"
                      "cp {ckpt_dirname}/*{global_step}* {backup_dirname}/".format(
                backup_dirname=backup_dirname,
                ckpt_dirname=self._checkpoint_dir,
                model_config=Constants.MODEL_CONFIG_YAML_FILENAME,
                model_analysis=Constants.MODEL_ANALYSIS_FILENAME,
                global_step=global_step))
            self._best_checkpoint_bleus.append(bleu)
            self._best_checkpoint_names.append(backup_dirname)
            if len(self._best_checkpoint_bleus) > self._maximum_keep_models:
                tidx = numpy.argsort(self._best_checkpoint_bleus)
                _bleus = [self._best_checkpoint_bleus[i] for i in tidx]
                _names = [self._best_checkpoint_names[i] for i in tidx]
                self._best_checkpoint_bleus = _bleus[1:]
                self._best_checkpoint_names = _names[1:]
                os.system("rm -rf {}".format(_names[0]))
            self._write_ckpt_bleulog()
Example #21
def multi_bleu_score_from_file(hypothesis_file, references_files):
    """ Computes corpus-level BLEU from hypothesis file
      and reference file(s).

    Args:
        hypothesis_file: A string.
        references_files: A string. The name of the reference file or the prefix of the reference files.
    Returns: A float.
    """
    with open_file(hypothesis_file) as fp:
        hypothesis = fp.readlines()
    references = []
    for ref_file in get_labels_files(references_files):
        with open_file(ref_file) as fp:
            references.append(fp.readlines())
    references = list(map(list, zip(*references)))
    return multi_bleu_score(hypothesis, references)
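
A usage sketch with hypothetical file names, assuming `trans.txt` holds one hypothesis per line and that `get_labels_files` resolves the reference prefix (e.g. `ref` for `ref0`, `ref1`, ...):

# "trans.txt" and "ref" are hypothetical names; "ref" is the reference prefix.
bleu = multi_bleu_score_from_file("trans.txt", "ref")
print("BLEU = %.2f" % bleu)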
Example #22
def dump_attentions(output_filename_prefix, attentions):
    """ Dumps attention information in JSON format.

    Args:
        output_filename_prefix: A string.
        attentions: A dict of attention arrays.
    """
    tf.logging.info("Saving attention information into {}.attention.".format(output_filename_prefix))
    with open_file(output_filename_prefix + ".attention", mode="wb") as f:
        f.write(json.dumps(attentions).encode("utf-8"))
Example #23
    def dump(model_config, output_dir):
        """ Dumps model configurations.

        Args:
            model_config: A dict.
            output_dir: A string, the output directory.
        """
        model_config_filename = os.path.join(output_dir, Constants.MODEL_CONFIG_YAML_FILENAME)
        if not gfile.Exists(output_dir):
            gfile.MakeDirs(output_dir)
        with open_file(model_config_filename, mode="w") as file:
            yaml.dump(model_config, file)
Example #24
    def _prepare(self):
        """ Prepares for evaluation.

        Builds the model with reuse=True, mode=INFER and preprocesses
        data file(s).
        """
        text_inputter = TextLineInputter(dataset=self._dataset,
                                         data_field_name="eval_features_file",
                                         batch_size=self._batch_size)
        self._infer_data = text_inputter.make_feeding_data()
        self._model_configs = update_infer_params(  # update inference parameters
            self._model_configs,
            beam_size=self._beam_size,
            maximum_labels_length=self._maximum_labels_length,
            length_penalty=self._length_penalty)
        estimator_spec = model_fn(model_configs=self._model_configs,
                                  mode=ModeKeys.INFER,
                                  dataset=self._dataset,
                                  name=self._model_name,
                                  reuse=True,
                                  verbose=False)
        self._predict_ops = estimator_spec.predictions
        tmp_trans_dir = os.path.join(self._model_configs["model_dir"],
                                     Constants.TMP_TRANS_DIRNAME)
        if not gfile.Exists(tmp_trans_dir):
            gfile.MakeDirs(tmp_trans_dir)
        self._tmp_trans_file_prefix = os.path.join(
            tmp_trans_dir, Constants.TMP_TRANS_FILENAME_PREFIX)
        self._read_ckpt_bleulog()
        # load references
        self._references = []
        for rfile in self._dataset.eval_labels_file:
            with open_file(rfile) as fp:
                self._references.append(fp.readlines())
        self._references = list(map(list, zip(*self._references)))
        with open_file(self._dataset.eval_features_file) as fp:
            self._sources = fp.readlines()
        self._bad_count = 0
        self._best_bleu_score = 0.
Example #25
    def reset(self,
              do_shuffle=False,
              shuffle_to_file=None,
              argsort_index=None):
        """ Resets this reader and shuffles the data (if needed).

        Args:
            do_shuffle: Whether to shuffle data.
            shuffle_to_file: A string.
            argsort_index: A list of integers.

        Returns: The `argsort_index` if shuffling is performed.
        """
        # TODO
        self._data_index = 0
        if self._filename is not None:
            self._data.seek(0)
        if do_shuffle:
            if self._filename is None:  # list of data
                _ = shuffle_to_file
                if not argsort_index:
                    argsort_index = numpy.arange(len(self._data))
                    numpy.random.shuffle(argsort_index)
                self._data = self._data[argsort_index]  # do shuffle
            else:  # from file
                assert shuffle_to_file, ("`shuffle_to_file` must be provided.")
                tf.logging.info("shuffling data:\t{} ==> {}".format(
                    self._filename, shuffle_to_file))
                data_list = self._data.readlines()
                close_file(self._data)
                if argsort_index is None:
                    argsort_index = numpy.arange(len(data_list))
                    numpy.random.shuffle(argsort_index)
                with open_file(shuffle_to_file, "utf-8", "w") as fw:
                    for idx in argsort_index:
                        fw.write(data_list[idx].strip() + "\n")
                del data_list[:]
                self._data = open_file(shuffle_to_file, "utf-8", "r")
        return argsort_index
Example #26
    def _prepare(self):
        """ Prepares for evaluation.

        Builds the model with reuse=True, mode=INFER and preprocesses
        data file(s).
        """
        self._model_configs = update_infer_params(  # update inference parameters
            self._model_configs,
            beam_size=self._beam_size,
            maximum_labels_length=self._maximum_labels_length,
            length_penalty=self._length_penalty)
        estimator_spec = model_fn(model_configs=self._model_configs,
                                  mode=ModeKeys.INFER, dataset=self._dataset,
                                  name=self._model_name, reuse=True, verbose=False)
        self._predict_ops = estimator_spec.predictions
        text_inputter = TextLineInputter(
            dataset=self._dataset,
            data_field_name="eval_features_file",
            batch_size=self._batch_size)
        self._infer_data = text_inputter.make_feeding_data(
            input_fields=estimator_spec.input_fields)
        tmp_trans_dir = os.path.join(self._model_configs["model_dir"], Constants.TMP_TRANS_DIRNAME)
        if not gfile.Exists(tmp_trans_dir):
            gfile.MakeDirs(tmp_trans_dir)
        self._tmp_trans_file_prefix = os.path.join(tmp_trans_dir, Constants.TMP_TRANS_FILENAME_PREFIX)
        self._read_ckpt_bleulog()
        # load references
        self._references = []
        for rfile in self._dataset.eval_labels_file:
            with open_file(rfile) as fp:
                if self._char_level:
                    self._references.append(to_chinese_char(fp.readlines()))
                else:
                    self._references.append(fp.readlines())
        self._references = list(map(list, zip(*self._references)))
        with open_file(self._dataset.eval_features_file) as fp:
            self._sources = fp.readlines()
        self._bad_count = 0
        self._best_bleu_score = 0.
Example #27
    def __init__(self, source, target,
                 vocab_source, vocab_target,
                 batch_size=128,
                 n_words_src=-1,
                 n_words_trg=-1):
        # read in the batch data
        f_source = open_file(source)
        if gfile.Exists(target):
            f_target = open_file(target)
        else:
            f_target = open_file(target + "0")

        ss_buf = []
        tt_buf = []
        for ss, tt in zip(f_source, f_target):
            ss = vocab_source.convert_to_idlist(ss.strip().split(), n_words_src)
            tt = vocab_target.convert_to_idlist(tt.strip().split(), n_words_trg)
            ss_buf.append(ss)
            tt_buf.append(tt)
        f_source.close()
        f_target.close()
        tlen = numpy.array([len(t) for t in tt_buf])
        tidx = tlen.argsort()
        _ss_buf = [ss_buf[i] for i in tidx]
        _tt_buf = [tt_buf[i] for i in tidx]
        ss_buf = _ss_buf
        tt_buf = _tt_buf
        self.batch_source_buffer = []
        self.batch_target_buffer = []
        self.batch_data_idx = 0
        self.batch_size = batch_size
        while self.batch_data_idx < len(ss_buf):
            self.batch_source_buffer.append(
                padding_batch_data(ss_buf[self.batch_data_idx: self.batch_data_idx + batch_size], vocab_source.eos_id))
            self.batch_target_buffer.append(
                padding_batch_data(tt_buf[self.batch_data_idx: self.batch_data_idx + batch_size], vocab_target.eos_id))
            self.batch_data_idx += batch_size
        self.reset()
Example #28
    def load(model_dir):
        """ Loads model configurations.

        Args:
            model_dir: A string, the directory.

        Returns: A dict.
        """
        model_config_filename = os.path.join(model_dir, Constants.MODEL_CONFIG_YAML_FILENAME)
        if not gfile.Exists(model_config_filename):
            raise OSError("Fail to find model config file: %s" % model_config_filename)
        with open_file(model_config_filename, mode="r") as file:
            model_configs = yaml.load(file)
        return model_configs
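
Examples #23 and #28 are the two halves of the same round trip. A usage sketch, assuming both static methods live on one configuration helper class (called `ModelConfigs` here purely for illustration) and that `./test_model` is a scratch directory:

configs = {"model": "SequenceToSequence", "batch_size": 80}
# Writes the YAML file named by Constants.MODEL_CONFIG_YAML_FILENAME into the directory.
ModelConfigs.dump(configs, "./test_model")
restored = ModelConfigs.load("./test_model")
assert restored["batch_size"] == 80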
Example #29
def multi_bleu_score_from_file(hypothesis_file,
                               references_files,
                               char_level=False):
    """ Computes corpus-level BLEU from hypothesis file
      and reference file(s).

    Args:
        hypothesis_file: A string.
        references_files: A string. The name of the reference file or the prefix of the reference files.
        char_level: Whether to evaluate at char level (for Chinese only).
    Returns: A float.
    """
    with open_file(hypothesis_file) as fp:
        hypothesis = fp.readlines()
    references = []
    for ref_file in access_multiple_files(references_files):
        with open_file(ref_file) as fp:
            if char_level:
                references.append((to_chinese_char(fp.readlines())))
            else:
                references.append(fp.readlines())
    references = list(map(list, zip(*references)))
    return multi_bleu_score(hypothesis, references)
Example #30
        def __init__(self, parent):
            """ Initializes.

            Args:
                parent: A `ParallelTextInputter` object.
            """
            self._parent = parent
            self._features_file = self._parent._features_file
            self._labels_file = self._parent._labels_file
            if not gfile.Exists(self._labels_file):
                self._labels_file = self._labels_file + "0"
            if self._parent._shuffle_every_epoch:
                self._shuffle_features_file = self._features_file.strip().split("/")[-1] \
                                              + "." + self._parent._shuffle_every_epoch
                self._shuffle_labels_file = self._labels_file.strip().split("/")[-1] \
                                            + "." + self._parent._shuffle_every_epoch
                self._shuffle()
            self._features = open_file(self._features_file, encoding="utf-8")
            self._labels = open_file(self._labels_file, encoding="utf-8")
            self._features_buffer = []
            self._labels_buffer = []
            self._features_len_buffer = []
            self._labels_len_buffer = []
            self._end_of_data = False
Example #31
def multi_bleu_score_from_file(
        hypothesis_file,
        references_files,
        char_level=False):
    """ Computes corpus-level BLEU from hypothesis file
      and reference file(s).

    Args:
        hypothesis_file: A string.
        references_files: A string. The name of the reference file or the prefix of the reference files.
        char_level: Whether to evaluate at char level (for Chinese only).
    Returns: A float.
    """
    with open_file(hypothesis_file) as fp:
        hypothesis = fp.readlines()
    references = []
    for ref_file in get_labels_files(references_files):
        with open_file(ref_file) as fp:
            if char_level:
                references.append((to_chinese_char(fp.readlines())))
            else:
                references.append(fp.readlines())
    references = list(map(list, zip(*references)))
    return multi_bleu_score(hypothesis, references)
Example #32
    def _make_feeding_data_from(self,
                                filename,
                                maximum_line_length=None,
                                maximum_encoded_length=None):
        """ Processes the data file and returns an iterable instance for looping.

        Args:
            filename: A specific data file.
            maximum_line_length: The maximum sequence length. If provided,
              sentences exceeding this value will be ignored.
            maximum_encoded_length: The maximum length of symbols (especially
              after BPE is applied). If provided, symbols of one sentence exceeding
              this value will be ignored.

        Returns: An iterable instance that packs feeding dictionary
                   for `tf.Session().run` according to the `filename`.
        """
        features = open_file(filename, encoding="utf-8")
        str_buf = []
        ss_buf = []
        for ss in features:
            if maximum_line_length and len(
                    ss.strip().split()) > maximum_line_length:
                continue
            encoded_ss = self._vocab.convert_to_idlist(ss.strip().split())
            if maximum_encoded_length and len(
                    encoded_ss) - 1 > maximum_encoded_length:
                continue
            bpe_ss = self._vocab.bpe_encode(ss.strip())
            str_buf.append(bpe_ss)
            ss_buf.append(encoded_ss)
        close_file(features)
        data = []
        batch_data_idx = 0
        while batch_data_idx < len(ss_buf):
            x, len_x = padding_batch_data(
                ss_buf[batch_data_idx:batch_data_idx + self._batch_size],
                self._vocab.eos_id)
            str_x = str_buf[batch_data_idx:batch_data_idx + self._batch_size]
            batch_data_idx += self._batch_size
            data.append((str_x, len_x, {
                self.input_fields[GlobalNames.PH_FEATURE_IDS_NAME]:
                x,
                self.input_fields[GlobalNames.PH_FEATURE_LENGTH_NAME]:
                len_x
            }))
        return data
Example #33
    def _make_feeding_data_from(self, filename, maximum_length=None):
        """ Processes the data file and returns an iterable instance for looping.

        Args:
            filename: A specific data file.
            maximum_length: The maximum length of symbols (especially
              after BPE is applied). If provided, symbols of one sentence exceeding
              this value will be ignored.

        Returns: An iterable instance that packs feeding dictionary
                   for `tf.Session().run` according to the `filename`.
        """
        features = open_file(filename, encoding="utf-8")
        ss_buf = []
        encoded_ss = read_line_with_filter(features, maximum_length,
                                           self._preprocessing_fn)
        while encoded_ss != "":
            ss_buf.append(encoded_ss)
            encoded_ss = read_line_with_filter(features, maximum_length,
                                               self._preprocessing_fn)
        close_file(features)
        data = []
        batch_data_idx = 0
        while batch_data_idx < len(ss_buf):
            x, len_x = padding_batch_data(
                ss_buf[batch_data_idx:batch_data_idx + self._batch_size],
                self._padding)
            batch_data_idx += self._batch_size
            if "features" in self._data_field_name:
                data.append({
                    "feature_ids": x,
                    "feed_dict": {
                        self.input_fields[Constants.FEATURE_IDS_NAME]: x,
                        self.input_fields[Constants.FEATURE_LENGTH_NAME]: len_x
                    }
                })
            else:
                data.append({
                    "label_ids": x,
                    "feed_dict": {
                        self.input_fields[Constants.LABEL_IDS_NAME]: x,
                        self.input_fields[Constants.LABEL_LENGTH_NAME]: len_x
                    }
                })
        return data
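
`read_line_with_filter` is used above as a reader that returns an empty string at end of file, applies a preprocessing function, and silently skips lines whose processed length exceeds a maximum. A minimal sketch consistent with that usage (again, not necessarily the project's exact implementation):

def read_line_with_filter(fp, maximum_length=None, preprocessing_fn=None):
    """Minimal sketch: read the next acceptable line from an open file.

    Returns "" at end of file. Each line is stripped, optionally run
    through `preprocessing_fn` (e.g. token-to-id conversion), and lines
    whose resulting length exceeds `maximum_length` are skipped.
    """
    for line in fp:
        processed = line.strip()
        if preprocessing_fn is not None:
            processed = preprocessing_fn(processed)
        if maximum_length and len(processed) > maximum_length:
            continue
        return processed
    return ""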
Example #34
    def _make_feeding_data_from(self,
                                filename,
                                input_fields,
                                maximum_length=None):
        """ Processes the data file and returns an iterable instance for looping.

        Args:
            filename: A specific data file.
            input_fields: A dict of placeholders.
            maximum_length: The maximum length of symbols (especially
              after BPE is applied). If provided, symbols of one sentence exceeding
              this value will be ignored.

        Returns: An iterable instance that packs feeding dictionary
                   for `tf.Session().run` according to the `filename`.
        """
        features = open_file(filename, encoding="utf-8")
        ss_buf = []
        encoded_ss = read_line_with_filter(features, maximum_length,
                                           self._preprocessing_fn)
        while encoded_ss != "":
            ss_buf.append(encoded_ss)
            encoded_ss = read_line_with_filter(features, maximum_length,
                                               self._preprocessing_fn)
        close_file(features)
        data = []
        batch_data_idx = 0
        name_prefix = Constants.FEATURE_NAME_PREFIX \
            if "features" in self._data_field_name else Constants.LABEL_NAME_PREFIX

        while batch_data_idx < len(ss_buf):
            data.append(
                pack_feed_dict(
                    name_prefixs=name_prefix,
                    origin_datas=ss_buf[batch_data_idx:batch_data_idx +
                                        self._batch_size],
                    paddings=self._padding,
                    input_fields=input_fields))
            batch_data_idx += self._batch_size
        return data
Example #35
    def __init__(self, data, maximum_length=None, preprocessing_fn=None):
        """ Initializes the parameters for LineReader.

        Args:
            data: A string indicating the name of a data file, or a list of data.
            maximum_length: An integer, the maximum length of one line (after
              preprocessing if `preprocessing_fn` is provided).
            preprocessing_fn: A callable function.
        """
        self._maximum_length = maximum_length
        self._preprocessing_fn = preprocessing_fn
        self._data_index = 0
        if isinstance(data, six.string_types):
            self._filename = access_multiple_files(data)[0]
            self._data = open_file(self._filename, encoding="utf-8", mode="r")
        elif isinstance(data, list):
            self._filename = None
            self._data = numpy.array(data)
        else:
            raise ValueError("Unrecognized type of `data`: {}, "
                             "which should be string or list".format(
                                 type(data)))
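
A usage sketch for `LineReader`, assuming a hypothetical `train.src` text file; in the examples above the `preprocessing_fn` is usually a vocabulary's `convert_to_idlist`, but any callable works:

# "train.src" is a hypothetical file name.
reader = LineReader(
    data="train.src",
    maximum_length=100,
    preprocessing_fn=lambda line: line.split())
# The reset() shown in Example #25 appears to belong to the same class and
# can reshuffle the underlying file, e.g.:
# reader.reset(do_shuffle=True, shuffle_to_file="train.src.shuf")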
Example #36
    def _SmallParallelData(self,
                           features_file,
                           labels_file,
                           maximum_features_length=None,
                           maximum_labels_length=None,
                           maximum_encoded_features_length=None,
                           maximum_encoded_labels_length=None):
        """ Function for reading small scale parallel data.

        Args:
            features_file: The path of features file.
            labels_file: The path of labels file.
            maximum_features_length: The maximum sequence length of "features" field.
              If provided, sentences exceeding this value will be ignored.
            maximum_labels_length: The maximum sequence length of "labels" field.
              If provided, sentences exceeding this value will be ignored.
            maximum_encoded_features_length: The maximum length of feature symbols (especially
              after BPE is applied). If provided, sentences whose number of symbols
              exceeds this value will be ignored.
            maximum_encoded_labels_length: The maximum length of label symbols (especially
              after BPE is applied). If provided, sentences whose number of symbols
              exceeds this value will be ignored.

        Returns: A list of feeding data.
        """
        eval_features = open_file(features_file, encoding="utf-8")
        if gfile.Exists(labels_file):
            eval_labels = open_file(labels_file, encoding="utf-8")
        else:
            eval_labels = open_file(labels_file + "0", encoding="utf-8")
        ss_buf = []
        tt_buf = []
        for ss, tt in zip(eval_features, eval_labels):
            if maximum_features_length and len(ss.strip().split()) > maximum_features_length:
                continue
            if maximum_labels_length and len(tt.strip().split()) > maximum_labels_length:
                continue
            encoded_ss = self._vocab_source.convert_to_idlist(ss.strip().split(" "))
            if maximum_encoded_features_length and len(encoded_ss) - 1 > maximum_encoded_features_length:
                continue
            encoded_tt = self._vocab_target.convert_to_idlist(tt.strip().split(" "))
            if maximum_encoded_labels_length and len(encoded_tt) - 1 > maximum_encoded_labels_length:
                continue
            ss_buf.append(encoded_ss)
            tt_buf.append(encoded_tt)
        close_file(eval_features)
        close_file(eval_labels)
        if self._bucketing:
            tlen = numpy.array([len(t) for t in tt_buf])
            tidx = tlen.argsort()
            _ss_buf = [ss_buf[i] for i in tidx]
            _tt_buf = [tt_buf[i] for i in tidx]
            ss_buf = _ss_buf
            tt_buf = _tt_buf
        data = []
        batch_data_idx = 0
        while batch_data_idx < len(ss_buf):
            x, len_x = padding_batch_data(
                ss_buf[batch_data_idx: batch_data_idx + self._batch_size],
                self._vocab_source.eos_id)
            y, len_y = padding_batch_data(
                tt_buf[batch_data_idx: batch_data_idx + self._batch_size],
                self._vocab_target.eos_id)
            batch_data_idx += self._batch_size
            data.append((len(len_x), {
                self.input_fields[GlobalNames.PH_FEATURE_IDS_NAME]: x,
                self.input_fields[GlobalNames.PH_FEATURE_LENGTH_NAME]: len_x,
                self.input_fields[GlobalNames.PH_LABEL_IDS_NAME]: y,
                self.input_fields[GlobalNames.PH_LABEL_LENGTH_NAME]: len_y}))
        return data
Example #37
def infer(
        sess,
        prediction_op,
        infer_data,
        output,
        vocab_source,
        vocab_target,
        delimiter=" ",
        output_attention=False,
        to_char_level=False,
        verbose=True):
    """ Infers data and saves the prediction results.

    Args:
        sess: `tf.Session`.
        prediction_op: Tensorflow operation for inference.
        infer_data: An iterable instance that each element
          is a packed feeding dictionary for `sess`.
        output: Output file name, `str`.
        vocab_source: A `Vocab` instance for source side feature map.
        vocab_target: A `Vocab` instance for target side feature map.
        delimiter: The delimiter of output token sequence.
        output_attention: Whether to output attention information.
        to_char_level: Whether to split words into characters
          (only for Chinese).
        verbose: Print inference information if set True.

    Returns: A tuple `(sources, hypothesis, scores)`: two lists of
      strings and a numpy array of scores.
    """
    attentions = dict()
    hypothesis = []
    scores = []
    sources = []
    cnt = 0
    for data in infer_data:
        source_tokens = [vocab_source.convert_to_wordlist(
            x, bpe_decoding=False, reverse_seq=False)
                         for x in data["feature_ids"]]
        x_str = [delimiter.join(x) for x in source_tokens]
        prediction, score, att = _infer(
            sess=sess,
            feed_dict=data["feed_dict"],
            prediction_op=prediction_op,
            batch_size=len(x_str),
            top_k=1,
            output_attention=output_attention)

        sources.extend(x_str)
        scores.append(score)
        hypothesis.extend([delimiter.join(vocab_target.convert_to_wordlist(prediction[sample_idx]))
                           for sample_idx in range(len(prediction))])
        if output_attention and att is not None:
            candidate_tokens = [vocab_target.convert_to_wordlist(
                prediction[idx], bpe_decoding=False, reverse_seq=False)
                                for idx in range(len(x_str))]

            attentions.update(pack_batch_attention_dict(
                cnt, source_tokens, candidate_tokens, att))
        cnt += len(x_str)
        if verbose:
            tf.logging.info(cnt)
    if to_char_level:
        hypothesis = to_chinese_char(hypothesis)
    if output:
        with open_file(output, mode="w") as fw:
            fw.write("\n".join(hypothesis) + "\n")
    if output_attention:
        dump_attentions(output, attentions)
    return sources, hypothesis, numpy.concatenate(scores, axis=0)