def prepare_sequence(self,
                     sentences,
                     sos=False,
                     eos=False,
                     add_noise=False):
    """Encode and pad a batch of sentences into a fixed-size int array.

    Each sentence is optionally noised, wrapped with the tokenizer's
    SOS/EOS markers (two characters are reserved for them via the
    ``maxlen - 2`` truncation), encoded, zero-padded to
    ``tokenizer.maxlen`` and, when ``one_hot_process`` is set,
    converted to one-hot vectors.

    NOTE(review): assumes ``tokenizer.encode`` returns a 1-D sequence
    of small integers (fits in int16) — confirm against the tokenizer.
    """
    start_tk = self.tokenizer.SOS_TK if sos else ""
    end_tk = self.tokenizer.EOS_TK if eos else ""
    max_len = self.tokenizer.maxlen

    batch = []
    for sentence in sentences:
        if add_noise:
            sentence = pp.add_noise([sentence], max_len)[0]

        # Truncate before adding markers so the encoded result fits max_len.
        encoded = self.tokenizer.encode(start_tk +
                                        sentence[:max_len - 2] +
                                        end_tk)
        encoded = np.pad(encoded, (0, max_len - len(encoded)))

        if self.one_hot_process:
            encoded = self.tokenizer.encode_one_hot(encoded)

        batch.append(encoded)

    return np.asarray(batch, dtype=np.int16)
# --- 示例 #2 (Example #2) — fragment below is from a separate script ---
    raw_path = os.path.join("..", "raw")
    data_path = os.path.join("..", "data")
    source_path = os.path.join(data_path, f"{args.source}.txt")
    output_path = os.path.join("..", "output", args.source, args.mode)
    target_path = os.path.join(output_path, "checkpoint_weights.hdf5")

    max_text_length = 128
    charset_base = string.printable[:95]
    charset_special = """ÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝàáâãäåçèéêëìíîïñòóôõöùúûüý"""

    if args.transform:
        data = Dataset(source=os.path.join(raw_path, args.source))
        data.read_lines(maxlen=max_text_length)

        valid_noised = pp.add_noise(data.dataset['valid'], max_text_length)
        test_noised = pp.add_noise(data.dataset['test'], max_text_length)

        valid_metrics = ev.ocr_metrics(ground_truth=data.dataset['valid'],
                                       data=valid_noised)

        info = "\n".join([
            f"####",
            f"#### {args.source} partitions (number of sentences)",
            f"####",
            f"#### Total:      {data.size['total']}",
            f"####",
            f"#### Train:      {data.size['train']}",
            f"#### Validation: {data.size['valid']}",
            f"####\n",
            f"#### Validation Error Rate:",