Example #1
def preprocessor_helper(x, y, charset, partition, maxTextLength):
    charset = charset.numpy().decode()
    partition = partition.numpy().decode()
    maxTextLength = maxTextLength.numpy()
    y = y.numpy()
    x = x.numpy()
    tokenizer = Tokenizer(filters=string.printable.translate(
        str.maketrans("", "", charset)),
                          charset=charset)
    if y.any():
        y_ = []
        for word in y:
            seq = tokenizer.texts_to_sequences(word.decode())[0]
            padded_seq = np.pad(seq, (0, maxTextLength - len(seq)))
            y_.append(padded_seq)

        y = np.array(y_)

    if partition in ["train"]:
        x = pp.augmentation(x,
                            rotation_range=5.0,
                            scale_range=0.05,
                            height_shift_range=0.025,
                            width_shift_range=0.05,
                            erode_range=5,
                            dilate_range=3)

    x = pp.normalization(x)

    if y.any():
        return x, y
    else:
        return x
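
This helper decodes every argument with .numpy(), so it only runs eagerly; Examples 12 and 13 below call it through tf.py_function inside a tf.data pipeline. A minimal sketch of that wiring follows; get_img_label, the index count, and the parameter values are assumptions for illustration, not taken from the original:

import string

import tensorflow as tf

charset = string.printable[:84]
partition = "train"
maxTextLength = 128          # assumed maximum label length
batch_size = 32

# get_img_label is assumed to return (image, label) for an integer index,
# as in Examples 12 and 13.
index_ds = tf.data.Dataset.from_tensor_slices(list(range(1000)))
ds = index_ds.map(lambda x: tf.py_function(
    get_img_label, [x], [tf.uint8, tf.string])).batch(batch_size)
final_ds = ds.map(lambda x, y: tf.py_function(
    preprocessor_helper, [x, y, charset, partition, maxTextLength],
    [tf.float32, tf.float32]))
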
Example #2
def predict(image_name):
    output_image_path = os.path.join("api", "temp")
    input_image_path = os.path.join(output_image_path, image_name)

    tokenizer = Tokenizer()

    model = MyModel(vocab_size=tokenizer.vocab_size,
                    beam_width=20,
                    stop_tolerance=15,
                    reduce_tolerance=10)
    model.compile(learning_rate=0.001)
    model.load_checkpoint(target=target_path)

    imgproc.__execute__(input_image_path, output_image_path)

    text = []
    confidence = []

    image_lines = sorted(
        glob(
            os.path.join(output_image_path,
                         image_name.split('.')[0], "lines", "*.png")))

    for img in image_lines:
        img = pp.preprocess_image(img, target_image_size, predict=True)
        img = pp.normalization([img])

        predicts, probabilities = model.predict(img, ctc_decode=True)

        predicts = tokenizer.sequences_to_texts(predicts)
        confidence.append(f"{predicts[0]} ==> {probabilities[0]}")
        text.append(Speller("en").autocorrect_sentence(predicts[0][0]))

    return "\n".join(text)
Example #3
    def __init__(self, partition):

        self.source_path = config.source_path
        self.partition = partition
        self.charset = config.charset
        self.maxTextLength = config.maxTextLength
        self.batch_size = config.batch_size
        self.buf_size = config.buf_size
        self.prefetch_size = config.prefetch_size
        self.tokenizer = Tokenizer()

        with h5py.File(self.source_path, "r") as f:
            self.imgs = f[self.partition]["image"][:]
            self.labels = f[self.partition]["label"][:]

        self.size = len(self.labels)
Example #4
    def __init__(self,
                 source_path,
                 charset,
                 partition,
                 maxTextLength,
                 batch_size=32,
                 buf_size=0):
        self.maxTextLength = maxTextLength
        self.tokenizer = Tokenizer(filters=string.printable.translate(
            str.maketrans("", "", charset)),
                                   charset=charset)
        # self.tokenizer.fit_on_texts(charset)
        self.batch_size = batch_size
        self.partition = partition
        self.dataset = h5py.File(source_path, 'r')[self.partition]
        self.size = self.dataset['label'].shape[0]
        self.steps = int(np.ceil(self.size / self.batch_size))
        self.buf_size = buf_size
Example #5
    def __init__(self,
                 source_path,
                 partition,
                 charset,
                 maxTextLength,
                 batch_size=32,
                 buf_size=1000):
        self.maxTextLength = maxTextLength
        self.tokenizer = Tokenizer(charset=charset)
        self.batch_size = batch_size
        self.partition = partition
        self.dataset = h5py.File(source_path, 'r')[self.partition]
        self.size = self.dataset['label'].shape[0]
        self.steps = int(np.ceil(self.size / self.batch_size))
        self.buf_size = buf_size

        if self.partition in ['train'] and self.buf_size:
            self.img_buf = self.dataset['image'][0:self.buf_size]
            self.lab_buf = self.dataset['label'][0:self.buf_size]
Example #6
    def make_binary_dataset(input_prefix, output_prefix, lang):
        dict = dictionary.Dictionary.load(dict_path(lang))
        print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))

        ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_path(output_prefix, lang, 'bin'))

        def consumer(tensor):
            ds.add_item(tensor)

        input_file = '{}{}'.format(input_prefix,
                                   ('.' + lang) if lang is not None else '')
        res = Tokenizer.binarize(input_file, dict, consumer)
        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, res['nseq'], res['ntok'],
            100 * res['nunk'] / res['ntok'], dict.unk_word))
        ds.finalize(dataset_dest_path(output_prefix, lang, 'idx'))
Example #7
        output_image_path = os.path.join(input_image_path, "out")
        os.makedirs(output_image_path, exist_ok=True)

        if args.image:
            images = sorted(glob(os.path.join(input_image_path, args.image)))
        else:
            images = sorted(glob(os.path.join(input_image_path, "*.png")))
        
        from network.model import MyModel
        from data.tokenizer import Tokenizer
        from data import imgproc

        import matplotlib.pyplot as plt
        from data import data_preprocessor as pp 

        tokenizer = Tokenizer()

        model = MyModel(vocab_size=tokenizer.vocab_size,
                        beam_width=10,
                        stop_tolerance=15,
                        reduce_tolerance=10)
        model.compile(learning_rate=0.001)
        model.load_checkpoint(target=target_path)
        imgproc.execute(images, output_image_path)

        for image in images:
Example #8
    def build_dictionary(filenames):
        d = dictionary.Dictionary()
        for filename in filenames:
            Tokenizer.add_file_to_dictionary(filename, d, tokenize_line)
        return d
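
Example #9 below shows this helper inside the full preprocessing script; as a quick standalone reference, the dictionary it returns is finalized, saved, and then reused by Tokenizer.binarize. The file name, threshold values, and the do-nothing consumer in this sketch are placeholders, not from the original:

d = build_dictionary(['train.en'])                 # placeholder training file
d.finalize(threshold=0, nwords=-1, padding_factor=8)
d.save('dict.en.txt')                              # placeholder output path
# Binarize the same file against the finalized dictionary; the real code
# streams each tensor into an IndexedDatasetBuilder instead of dropping it.
res = Tokenizer.binarize('train.en', d, lambda tensor: None)
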
Example #9
def main(args):
    print(args)
    os.makedirs(args.destdir, exist_ok=True)
    target = not args.only_source

    def build_dictionary(filenames):
        d = dictionary.Dictionary()
        for filename in filenames:
            Tokenizer.add_file_to_dictionary(filename, d, tokenize_line)
        return d

    def train_path(lang):
        return '{}{}'.format(args.trainpref, ('.' + lang) if lang else '')

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += f'.{lang}'
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args.destdir, file_name(prefix, lang))

    def dict_path(lang):
        return dest_path('dict', lang) + '.txt'

    def dataset_dest_path(output_prefix, lang, extension):
        base = f'{args.destdir}/{output_prefix}'
        lang_part = f'.{args.source_lang}-{args.target_lang}.{lang}' if lang is not None else ''
        return f'{base}{lang_part}.{extension}'

    if args.joined_dictionary:
        assert not args.srcdict, 'cannot combine --srcdict and --joined-dictionary'
        assert not args.tgtdict, 'cannot combine --tgtdict and --joined-dictionary'
        src_dict = build_dictionary(
            set([
                train_path(lang)
                for lang in [args.source_lang, args.target_lang]
            ]))
        tgt_dict = src_dict
    else:
        if args.srcdict:
            src_dict = dictionary.Dictionary.load(args.srcdict)
        else:
            assert args.trainpref, "--trainpref must be set if --srcdict is not specified"
            src_dict = build_dictionary([train_path(args.source_lang)])
        if target:
            if args.tgtdict:
                tgt_dict = dictionary.Dictionary.load(args.tgtdict)
            else:
                assert args.trainpref, "--trainpref must be set if --tgtdict is not specified"
                tgt_dict = build_dictionary([train_path(args.target_lang)])

    src_dict.finalize(
        threshold=args.thresholdsrc,
        nwords=args.nwordssrc,
        padding_factor=args.padding_factor,
    )
    src_dict.save(dict_path(args.source_lang))
    if target:
        if not args.joined_dictionary:
            tgt_dict.finalize(
                threshold=args.thresholdtgt,
                nwords=args.nwordstgt,
                padding_factor=args.padding_factor,
            )
        tgt_dict.save(dict_path(args.target_lang))

    def make_binary_dataset(input_prefix, output_prefix, lang):
        dict = dictionary.Dictionary.load(dict_path(lang))
        print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))

        ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_path(output_prefix, lang, 'bin'))

        def consumer(tensor):
            ds.add_item(tensor)

        input_file = '{}{}'.format(input_prefix,
                                   ('.' + lang) if lang is not None else '')
        res = Tokenizer.binarize(input_file, dict, consumer)
        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, res['nseq'], res['ntok'],
            100 * res['nunk'] / res['ntok'], dict.unk_word))
        ds.finalize(dataset_dest_path(output_prefix, lang, 'idx'))

    def make_dataset(input_prefix, output_prefix, lang):
        if args.output_format == 'binary':
            make_binary_dataset(input_prefix, output_prefix, lang)
        elif args.output_format == 'raw':
            # Copy original text file to destination folder
            output_text_file = dest_path(
                output_prefix +
                '.{}-{}'.format(args.source_lang, args.target_lang),
                lang,
            )
            shutil.copyfile(file_name(input_prefix, lang), output_text_file)

    def make_all(lang):
        if args.trainpref:
            make_dataset(args.trainpref, 'train', lang)
        if args.validpref:
            for k, validpref in enumerate(args.validpref.split(',')):
                outprefix = 'valid{}'.format(k) if k > 0 else 'valid'
                make_dataset(validpref, outprefix, lang)
        if args.testpref:
            for k, testpref in enumerate(args.testpref.split(',')):
                outprefix = 'test{}'.format(k) if k > 0 else 'test'
                make_dataset(testpref, outprefix, lang)

    make_all(args.source_lang)
    if target:
        make_all(args.target_lang)

    print('| Wrote preprocessed data to {}'.format(args.destdir))

    if args.alignfile:
        assert args.trainpref, "--trainpref must be set if --alignfile is specified"
        src_file_name = train_path(args.source_lang)
        tgt_file_name = train_path(args.target_lang)
        src_dict = dictionary.Dictionary.load(dict_path(args.source_lang))
        tgt_dict = dictionary.Dictionary.load(dict_path(args.target_lang))
        freq_map = {}
        with open(args.alignfile, 'r') as align_file:
            with open(src_file_name, 'r') as src_file:
                with open(tgt_file_name, 'r') as tgt_file:
                    for a, s, t in zip_longest(align_file, src_file, tgt_file):
                        si = Tokenizer.tokenize(s,
                                                src_dict,
                                                add_if_not_exist=False)
                        ti = Tokenizer.tokenize(t,
                                                tgt_dict,
                                                add_if_not_exist=False)
                        ai = list(map(lambda x: tuple(x.split('-')),
                                      a.split()))
                        for sai, tai in ai:
                            srcidx = si[int(sai)]
                            tgtidx = ti[int(tai)]
                            if srcidx != src_dict.unk(
                            ) and tgtidx != tgt_dict.unk():
                                assert srcidx != src_dict.pad()
                                assert srcidx != src_dict.eos()
                                assert tgtidx != tgt_dict.pad()
                                assert tgtidx != tgt_dict.eos()

                                if srcidx not in freq_map:
                                    freq_map[srcidx] = {}
                                if tgtidx not in freq_map[srcidx]:
                                    freq_map[srcidx][tgtidx] = 1
                                else:
                                    freq_map[srcidx][tgtidx] += 1

        align_dict = {}
        for srcidx in freq_map.keys():
            align_dict[srcidx] = max(freq_map[srcidx],
                                     key=freq_map[srcidx].get)

        with open(
                os.path.join(
                    args.destdir,
                    'alignment.{}-{}.txt'.format(args.source_lang,
                                                 args.target_lang)), 'w') as f:
            for k, v in align_dict.items():
                print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)
Example #10
import matplotlib.pyplot as plt
import cv2
import numpy as np
from data.tokenizer import Tokenizer
from data.generator import DataGenerator_tf
from config import config
from data import data_preprocessor as pp

tokenizer = Tokenizer(charset=config.charset)
dg = DataGenerator_tf("train")
ds = dg.create_dataset()
for i, l in ds.take(1):
    print(i.shape, tokenizer.sequences_to_texts(np.swapaxes([l.numpy()], 0,
                                                            1)))
    plt.subplot(121)
    plt.imshow(pp.adjust_to_see(i[0].numpy()), cmap="gray")
    plt.subplot(122)
    plt.imshow(pp.adjust_to_see(i[1].numpy()), cmap="gray")
    plt.show()
Example #11
class Datagenerator(tf.keras.utils.Sequence):
    def __init__(self,
                 source_path,
                 partition,
                 charset,
                 maxTextLength,
                 batch_size=32,
                 buf_size=1000):
        self.maxTextLength = maxTextLength
        self.tokenizer = Tokenizer(charset=charset)
        self.batch_size = batch_size
        self.partition = partition
        self.dataset = h5py.File(source_path, 'r')[self.partition]
        self.size = self.dataset['label'].shape[0]
        self.steps = int(np.ceil(self.size / self.batch_size))
        self.buf_size = buf_size

        if self.partition in ['train'] and self.buf_size:
            self.img_buf = self.dataset['image'][0:self.buf_size]
            self.lab_buf = self.dataset['label'][0:self.buf_size]

        # for p in self.partitions:
        #     self.size[p] = self.dataset[p]['image'].shape[0]
        #     self.steps[p] = int(np.ceil(self.size[p]/self.batch_size))
        #     self.index[p] = 0

    def __getitem__(self, idx):
        if self.partition in ['valid', 'test'] or not self.buf_size:
            index = idx * self.batch_size
            until = index + self.batch_size

            x = np.array(self.dataset['image'][index:until])
            if self.partition in ['train']:
                x = pp.augmentation(
                    x,
                    rotation_range=config.rotation_range,
                    scale_range=config.scale_range,
                    height_shift_range=config.height_shift_range,
                    width_shift_range=config.width_shift_range,
                    erode_range=config.erode_range,
                    dilate_range=config.dilate_range)
            x = pp.normalization(x)
            if self.partition in ['valid', 'train']:
                y = self.dataset['label'][index:until]
                # y = [self.tokenizer.texts_to_sequences(word.decode())[0] for word in y]
                # y = np.array([np.pad(np.asarray(seq), (0, self.maxTextLength-len(seq)), constant_values=(-1, self.PAD)) for seq in y])
                y_ = []
                for line in y:
                    seq = self.tokenizer.texts_to_sequences(line.decode())[0]
                    padded_seq = np.pad(seq,
                                        (0, self.maxTextLength - len(seq)))
                    y_.append(padded_seq)

                y = np.array(y_)

                return (x, y)
            return x

        else:
            index = idx * self.batch_size + self.buf_size
            until = index + self.batch_size

            zipped = list(zip(self.img_buf, self.lab_buf))
            np.random.shuffle(zipped)

            X, Y = zip(*zipped)
            X = list(X)
            Y = list(Y)

            x = np.array(X[:self.batch_size])
            y = Y[:self.batch_size]

            if until < self.size:
                X[:self.batch_size] = self.dataset['image'][index:until]
                Y[:self.batch_size] = self.dataset['label'][index:until]

            elif index < self.size:
                X = X[until - self.size:]
                Y = Y[until - self.size:]
                until = self.size
                X[:until - index] = self.dataset['image'][index:until]
                Y[:until - index] = self.dataset['label'][index:until]

            else:
                X = X[self.batch_size:]
                Y = Y[self.batch_size:]

            self.img_buf = X
            self.lab_buf = Y

            x = pp.augmentation(x,
                                rotation_range=config.rotation_range,
                                scale_range=config.scale_range,
                                height_shift_range=config.height_shift_range,
                                width_shift_range=config.width_shift_range,
                                erode_range=config.erode_range,
                                dilate_range=config.dilate_range)
            x = pp.normalization(x)
            # y = [self.tokenizer.texts_to_sequences(word.decode())[0] for word in y]
            # y = np.array([np.pad(np.asarray(seq), (0, self.maxTextLength-len(seq)), constant_values=(-1, self.PAD)) for seq in y])
            y_ = []
            for line in y:
                seq = self.tokenizer.texts_to_sequences(line.decode())[0]
                padded_seq = np.pad(seq, (0, self.maxTextLength - len(seq)))
                y_.append(padded_seq)

            y = np.array(y_)

            return (x, y)

    def __len__(self):
        return self.steps

    def on_epoch_end(self):
        if self.partition in ['train'] and self.buf_size:
            self.img_buf = self.dataset['image'][0:self.buf_size]
            self.lab_buf = self.dataset['label'][0:self.buf_size]
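
Because Datagenerator subclasses tf.keras.utils.Sequence, it can be handed directly to Keras fit(). A minimal sketch, assuming an already-compiled tf.keras.Model named model and placeholder values for the HDF5 path, charset, and maxTextLength:

import string

# model: an already-compiled tf.keras.Model (assumed, not shown here)
train_gen = Datagenerator(source_path='dataset_lines.hdf5',   # placeholder path
                          partition='train',
                          charset=string.printable[:84],
                          maxTextLength=128,                   # placeholder length
                          batch_size=32)
valid_gen = Datagenerator(source_path='dataset_lines.hdf5',
                          partition='valid',
                          charset=string.printable[:84],
                          maxTextLength=128,
                          batch_size=32)
model.fit(train_gen, validation_data=valid_gen, epochs=10)
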
Example #12
class DataGenerator_tf():
    def __init__(self, partition):

        self.source_path = config.source_path
        self.partition = partition
        self.charset = config.charset
        self.maxTextLength = config.maxTextLength
        self.batch_size = config.batch_size
        self.buf_size = config.buf_size
        self.prefetch_size = config.prefetch_size
        self.tokenizer = Tokenizer()

        with h5py.File(self.source_path, "r") as f:
            self.imgs = f[self.partition]["image"][:]
            self.labels = f[self.partition]["label"][:]

        self.size = len(self.labels)

    def preprocessor_helper(self, x, y):
        y = y.numpy()
        x = x.numpy()

        if y.any():
            y_ = []
            for line in y:
                seq = self.tokenizer.texts_to_sequences(line.decode())[0]
                padded_seq = np.pad(seq, (0, self.maxTextLength - len(seq)))
                y_.append(padded_seq)

            y = np.array(y_)

        if self.partition in ["train"]:
            x = pp.augmentation(x,
                                rotation_range=config.rotation_range,
                                scale_range=config.scale_range,
                                height_shift_range=config.height_shift_range,
                                width_shift_range=config.width_shift_range,
                                erode_range=config.erode_range,
                                dilate_range=config.dilate_range)

        x = pp.normalization(x)

        if y.any():
            return x, y
        else:
            return x

    def get_img_label(self, x):
        index = x.numpy()
        if self.partition in ["test"]:
            return self.imgs[index]
        else:
            return self.imgs[index], self.labels[index]

    def create_dataset(self):
        indexes = [i for i in range(self.size)]
        if self.partition in ["train"]:
            np.random.shuffle(indexes)

        index_ds = tf.data.Dataset.from_tensor_slices(indexes)
        if self.partition in ["train"]:
            ds = index_ds.map(lambda x: tf.py_function(
                self.get_img_label, [x], [tf.uint8, tf.string])).shuffle(
                    self.buf_size).batch(self.batch_size)
            final_ds = ds.map(lambda x, y: tf.py_function(
                self.preprocessor_helper, [x, y], [tf.float32, tf.float32]))
        elif self.partition in ["valid"]:
            ds = index_ds.map(lambda x: tf.py_function(
                self.get_img_label, [x], [tf.uint8, tf.string])).batch(
                    self.batch_size)
            final_ds = ds.map(lambda x, y: tf.py_function(
                self.preprocessor_helper, [x, y], [tf.float32, tf.float32]))
        else:
            ds = index_ds.map(lambda x: tf.py_function(
                self.get_img_label, [x], [tf.uint8])).batch(self.batch_size)
            final_ds = ds.map(lambda x: tf.py_function(
                self.preprocessor_helper, [x, False], [tf.float32]))

        return final_ds.prefetch(self.prefetch_size)
Example #13
    else:
        ds = index_ds.map(lambda x: tf.py_function(
            get_img_label, [x], [tf.uint8])).batch(batch_size)
        final_ds = ds.map(lambda x: tf.py_function(preprocessor_helper, [
            x, False, charset, partition, maxTextLength
        ], [tf.float32]))

    return final_ds.prefetch(prefetch_size)


if __name__ == "__main__":
    import matplotlib.pyplot as plt
    import cv2
    charset = string.printable[:84]
    tokenizer = Tokenizer(filters=string.printable.translate(
        str.maketrans("", "", charset)),
                          charset=charset)
    ds = create_dataset("../data/dataset_hdf5/iam_words.hdf5",
                        string.printable[:84], "train", 32, 2, 10, 2)
    for i, l in ds.take(1):
        print(i.shape,
              tokenizer.sequences_to_texts(np.swapaxes([l.numpy()], 0, 1)))
        plt.subplot(121)
        plt.imshow(pp.adjust_to_see(i[0].numpy()), cmap="gray")
        plt.subplot(122)
        plt.imshow(pp.adjust_to_see(i[1].numpy()), cmap="gray")
        plt.show()

# class Datagenerator(tf.keras.utils.Sequence):
#     def __init__(self, source_path, charset, partition, maxTextLength, batch_size=32, buf_size=0):
#         self.maxTextLength = maxTextLength