def line_generator(ctx, font, maxlines, encoding, normalization, renormalize,
                   reorder, font_size, font_weight, language, max_length,
                   strip, disable_degradation, alpha, beta, distort,
                   distortion_sigma, legacy, output, text):
    """
    Generates artificial text line training data.
    """
    # NOTE(review): the docstring is intentionally left as-is; it is
    # presumably surfaced as command-line help text -- confirm against the
    # decorators before expanding it.
    #
    # Parameters
    #   ctx                  not referenced in this function body.
    #   font, font_size, font_weight, language
    #                        forwarded unchanged to linegen.LineGenerator.
    #   maxlines             if truthy and smaller than the number of unique
    #                        lines, that many lines are randomly sampled.
    #   encoding             passed to click.open_file for every input file.
    #   normalization        Unicode normalization form applied to all lines
    #                        before de-duplication (falsy: skip).
    #   renormalize          normalization form applied only to the text handed
    #                        to the renderer; the ground truth file keeps the
    #                        un-renormalized line.
    #   reorder              if truthy, get_display(line) is written to the
    #                        ground truth file instead of the raw line.
    #   max_length           if truthy, only lines with len(line) < max_length
    #                        are kept.
    #   strip                if truthy, surrounding whitespace is stripped.
    #   disable_degradation, legacy
    #                        select the degradation path (see the loop below).
    #   alpha, beta          passed as keywords to linegen.degrade_line.
    #   distort, distortion_sigma
    #                        means of normal draws whose absolute values are
    #                        passed to linegen.distort_line.
    #   output               directory receiving NNNNNN.png / NNNNNN.gt.txt.
    #   text                 iterable of input text files; falsy -> no-op.
    #
    # Returns None. OSError from creating `output` or writing files propagates.

    # Nothing to do without input; return before the expensive imports.
    if not text:
        return

    import numpy as np

    from kraken import linegen
    from kraken.lib.util import make_printable

    lines: Set[str] = set()
    with log.progressbar(text, label='Reading texts') as bar:
        # Iterate the bar itself (not `text`) so the progress display advances.
        for t in bar:
            with click.open_file(t, encoding=encoding) as fp:
                logger.info('Reading {}'.format(t))
                lines.update(raw.rstrip('\r\n') for raw in fp)
    if normalization:
        lines = {unicodedata.normalize(normalization, line) for line in lines}
    if strip:
        lines = {line.strip() for line in lines}
    if max_length:
        lines = {line for line in lines if len(line) < max_length}
    logger.info('Read {} lines'.format(len(lines)))
    message('Read {} unique lines'.format(len(lines)))

    if maxlines and maxlines < len(lines):
        message('Sampling {} lines\t'.format(maxlines), nl=False)
        llist = list(lines)
        # Sample WITHOUT replacement: drawing with replacement and collapsing
        # the result into a set silently yields fewer than `maxlines` lines.
        picks = np.random.choice(len(llist), maxlines, replace=False)
        lines = {llist[idx] for idx in picks}
        message('\u2713', fg='green')

    os.makedirs(output, exist_ok=True)

    # calculate the alphabet and print it for verification purposes
    alphabet: Set[str] = set()
    for line in lines:
        alphabet.update(line)
    chars = []
    combining = []
    for char in sorted(alphabet):
        # Characters that make_printable rewrites are listed separately under
        # 'Combining Characters'; everything else is shown verbatim.
        k = make_printable(char)
        if k != char:
            combining.append(k)
        else:
            chars.append(k)
    message('Σ (len: {})'.format(len(alphabet)))
    message('Symbols: {}'.format(''.join(chars)))
    if combining:
        message('Combining Characters: {}'.format(', '.join(combining)))

    lg = linegen.LineGenerator(font, font_size, font_weight, language)
    with log.progressbar(lines, label='Writing images') as bar:
        for idx, line in enumerate(bar):
            logger.info(line)
            try:
                if renormalize:
                    im = lg.render_line(unicodedata.normalize(renormalize, line))
                else:
                    im = lg.render_line(line)
            except KrakenCairoSurfaceException as e:
                # Skip lines the renderer cannot draw; their index is simply
                # left unused in the output numbering.
                logger.info('{}: {} {}'.format(e.message, e.width, e.height))
                continue
            if not disable_degradation and not legacy:
                im = linegen.degrade_line(im, alpha=alpha, beta=beta)
                im = linegen.distort_line(im,
                                          abs(np.random.normal(distort)),
                                          abs(np.random.normal(distortion_sigma)))
            elif legacy:
                im = linegen.ocropy_degrade(im)
            im.save('{}/{:06d}.png'.format(output, idx))
            with open('{}/{:06d}.gt.txt'.format(output, idx), 'wb') as fp:
                if reorder:
                    fp.write(get_display(line).encode('utf-8'))
                else:
                    fp.write(line.encode('utf-8'))
def line_generator(ctx, font, maxlines, encoding, normalization, renormalize,
                   font_size, language, max_length, strip,
                   disable_degradation, binarize, mean, sigma, density,
                   distort, distortion_sigma, legacy, output, text):
    """
    Generates artificial text line training data.
    """
    # NOTE(review): the docstring is intentionally left as-is; it is
    # presumably surfaced as command-line help text -- confirm against the
    # decorators before expanding it.
    #
    # Parameters
    #   ctx                  ctx.meta['verbose'] selects between timestamped
    #                        log lines (> 0) and spinner output.
    #   font, font_size, language
    #                        forwarded unchanged to linegen.LineGenerator.
    #   maxlines             if truthy and smaller than the number of unique
    #                        lines, that many lines are drawn at random (with
    #                        replacement, so repeats are possible).
    #   encoding             passed to click.open_file for every input file.
    #   normalization        Unicode normalization form applied to all lines
    #                        before de-duplication (falsy: skip).
    #   renormalize          normalization form applied only to the text handed
    #                        to the renderer.
    #   max_length           if truthy, only lines with len(line) < max_length
    #                        are kept.
    #   strip                if truthy, lines are stripped before filtering;
    #                        note that lines are stripped again unconditionally
    #                        before rendering.
    #   disable_degradation, legacy
    #                        select the degradation path (see the loop below).
    #   binarize             if truthy, binarization.nlbin is applied last.
    #   mean, sigma, density, distort, distortion_sigma
    #                        means of normal draws whose absolute values are
    #                        passed to degrade_line / distort_line.
    #   output               directory receiving NNNNNN.png / NNNNNN.gt.txt.
    #   text                 iterable of input text files; falsy -> no-op.
    #
    # Returns None. OSError other than EEXIST from creating `output`
    # propagates.
    lines = set()
    if not text:
        return
    st_time = time.time()
    verbosity = ctx.meta['verbose']

    for t in text:
        with click.open_file(t, encoding=encoding) as fp:
            if verbosity > 0:
                click.echo(u'[{:2.4f}] Reading {}'.format(time.time() - st_time, t))
            else:
                spin('Reading texts')
            # Drop line terminators while reading so that de-duplication and
            # the max_length filter see the actual text rather than
            # newline-terminated strings.
            lines.update(raw.rstrip('\r\n') for raw in fp)
    if normalization:
        lines = set([unicodedata.normalize(normalization, line) for line in lines])
    if strip:
        lines = set([line.strip() for line in lines])
    if max_length:
        lines = set([line for line in lines if len(line) < max_length])
    if verbosity > 0:
        click.echo(u'[{:2.4f}] Read {} lines'.format(time.time() - st_time, len(lines)))
    else:
        click.secho(u'\b\u2713', fg='green', nl=False)
        # '\033[?25h' makes the terminal cursor visible again.
        click.echo('\033[?25h\n', nl=False)
        click.echo('Read {} unique lines'.format(len(lines)))

    if maxlines and maxlines < len(lines):
        click.echo('Sampling {} lines\t'.format(maxlines), nl=False)
        lines = list(lines)
        lines = [lines[idx] for idx in np.random.randint(0, len(lines), maxlines)]
        click.secho(u'\u2713', fg='green')

    try:
        os.makedirs(output)
    except OSError as e:
        # An already existing output directory is fine.
        if e.errno != errno.EEXIST:
            raise

    lines = [line.strip() for line in lines]

    # calculate the alphabet and print it for verification purposes
    alphabet = set()
    for line in lines:
        alphabet.update(line)
    chars = []
    combining = []
    for char in sorted(alphabet):
        # Combining marks are listed by name since they do not display well
        # on their own.
        if unicodedata.combining(char):
            combining.append(unicodedata.name(char))
        else:
            chars.append(char)
    click.echo(u'Σ (len: {})'.format(len(alphabet)))
    click.echo(u'Symbols: {}'.format(''.join(chars)))
    if combining:
        click.echo(u'Combining Characters: {}'.format(', '.join(combining)))

    lg = linegen.LineGenerator(font, font_size, language)
    for idx, line in enumerate(lines):
        if verbosity > 0:
            click.echo(u'[{:2.4f}] {}'.format(time.time() - st_time, line))
        else:
            spin('Writing images')
        try:
            if renormalize:
                im = lg.render_line(unicodedata.normalize(renormalize, line))
            else:
                im = lg.render_line(line)
        except KrakenCairoSurfaceException as e:
            # Report lines the renderer cannot draw and move on; their index
            # is left unused in the output numbering.
            if verbosity > 0:
                click.echo('[{:2.4f}] {}: {} {}'.format(time.time() - st_time,
                                                        e.message, e.width, e.height))
            else:
                click.secho(u'\b\u2717', fg='red')
                click.echo('{}: {} {}'.format(e.message, e.width, e.height))
            continue
        if not disable_degradation and not legacy:
            # A normal draw can be negative; the degradation parameters are
            # magnitudes, so take the absolute value.
            im = linegen.distort_line(im,
                                      abs(np.random.normal(distort)),
                                      abs(np.random.normal(distortion_sigma)))
            im = linegen.degrade_line(im,
                                      abs(np.random.normal(mean)),
                                      abs(np.random.normal(sigma)),
                                      abs(np.random.normal(density)))
        elif legacy:
            im = linegen.ocropy_degrade(im)
        if binarize:
            im = binarization.nlbin(im)
        im.save('{}/{:06d}.png'.format(output, idx))
        with open('{}/{:06d}.gt.txt'.format(output, idx), 'wb') as fp:
            fp.write(line.encode('utf-8'))
    if verbosity == 0:
        click.secho(u'\b\u2713', fg='green', nl=False)
        click.echo('\033[?25h\n', nl=False)
def line_generator(ctx, font, maxlines, encoding, normalization, renormalize,
                   reorder, font_size, font_weight, language, max_length,
                   strip, disable_degradation, alpha, beta, distort,
                   distortion_sigma, legacy, output, text):
    """
    Generates artificial text line training data.
    """
    # NOTE(review): the docstring is intentionally left as-is; it is
    # presumably surfaced as command-line help text -- confirm against the
    # decorators before expanding it.
    #
    # Summary of what the arguments control in this body:
    #   ctx                  not referenced here.
    #   text                 iterable of input text files; falsy -> no-op.
    #   encoding             passed to click.open_file when reading `text`.
    #   normalization        normalization form applied to every line before
    #                        de-duplication (falsy: skip).
    #   strip                if truthy, strip surrounding whitespace.
    #   max_length           if truthy, keep only len(line) < max_length.
    #   maxlines             if truthy and below the unique line count, sample
    #                        that many lines at random.
    #   output               directory receiving NNNNNN.png / NNNNNN.gt.txt.
    #   font, font_size, font_weight, language
    #                        forwarded unchanged to linegen.LineGenerator.
    #   renormalize          normalization form applied to the rendered text
    #                        only (the ground truth is not renormalized).
    #   disable_degradation, legacy, alpha, beta, distort, distortion_sigma
    #                        control image degradation/distortion (see loop).
    #   reorder              if truthy, write get_display(line) as ground truth.
    #
    # Returns None. OSError from creating `output` or writing files propagates.

    # Return early on empty input so the heavy imports below are skipped.
    if not text:
        return

    import numpy as np

    from kraken import linegen
    from kraken.lib.util import make_printable

    lines: Set[str] = set()
    with log.progressbar(text, label='Reading texts') as bar:
        # The bar only advances when it is the object being iterated.
        for t in bar:
            with click.open_file(t, encoding=encoding) as fp:
                logger.info('Reading {}'.format(t))
                for l in fp:
                    lines.add(l.rstrip('\r\n'))
    if normalization:
        lines = {unicodedata.normalize(normalization, line) for line in lines}
    if strip:
        lines = {line.strip() for line in lines}
    if max_length:
        lines = {line for line in lines if len(line) < max_length}
    logger.info('Read {} lines'.format(len(lines)))
    message('Read {} unique lines'.format(len(lines)))

    if maxlines and maxlines < len(lines):
        message('Sampling {} lines\t'.format(maxlines), nl=False)
        llist = list(lines)
        # Draw distinct indices; sampling with replacement followed by set()
        # would drop repeats and return fewer than `maxlines` lines.
        chosen = np.random.choice(len(llist), maxlines, replace=False)
        lines = {llist[idx] for idx in chosen}
        message('\u2713', fg='green')

    # An already existing output directory is fine.
    os.makedirs(output, exist_ok=True)

    # calculate the alphabet and print it for verification purposes
    alphabet: Set[str] = set()
    for line in lines:
        alphabet.update(line)
    chars = []
    combining = []
    for char in sorted(alphabet):
        # Characters that make_printable rewrites are reported separately.
        k = make_printable(char)
        if k != char:
            combining.append(k)
        else:
            chars.append(k)
    message('Σ (len: {})'.format(len(alphabet)))
    message('Symbols: {}'.format(''.join(chars)))
    if combining:
        message('Combining Characters: {}'.format(', '.join(combining)))

    lg = linegen.LineGenerator(font, font_size, font_weight, language)
    with log.progressbar(lines, label='Writing images') as bar:
        for idx, line in enumerate(bar):
            logger.info(line)
            try:
                if renormalize:
                    im = lg.render_line(
                        unicodedata.normalize(renormalize, line))
                else:
                    im = lg.render_line(line)
            except KrakenCairoSurfaceException as e:
                # Unrenderable lines are logged and skipped.
                logger.info('{}: {} {}'.format(e.message, e.width, e.height))
                continue
            if not disable_degradation and not legacy:
                im = linegen.degrade_line(im, alpha=alpha, beta=beta)
                im = linegen.distort_line(
                    im, abs(np.random.normal(distort)),
                    abs(np.random.normal(distortion_sigma)))
            elif legacy:
                im = linegen.ocropy_degrade(im)
            im.save('{}/{:06d}.png'.format(output, idx))
            with open('{}/{:06d}.gt.txt'.format(output, idx), 'wb') as fp:
                if reorder:
                    fp.write(get_display(line).encode('utf-8'))
                else:
                    fp.write(line.encode('utf-8'))
def line_generator(ctx, font, maxlines, encoding, normalization, renormalize,
                   reorder, font_size, font_weight, language, max_length,
                   strip, disable_degradation, binarize, mean, sigma, density,
                   distort, distortion_sigma, legacy, output, text):
    """
    Generates artificial text line training data.
    """
    # NOTE(review): the docstring is intentionally left as-is; it is
    # presumably surfaced as command-line help text -- confirm against the
    # decorators before expanding it.
    #
    # Parameters
    #   ctx                  ctx.meta['verbose'] selects between timestamped
    #                        log lines (> 0) and spinner output.
    #   font, font_size, font_weight, language
    #                        forwarded unchanged to linegen.LineGenerator.
    #   maxlines             if truthy and smaller than the number of unique
    #                        lines, that many lines are drawn at random (with
    #                        replacement, so repeats are possible).
    #   encoding             passed to click.open_file for every input file.
    #   normalization        Unicode normalization form applied to all lines
    #                        before de-duplication (falsy: skip).
    #   renormalize          normalization form applied only to the text handed
    #                        to the renderer.
    #   reorder              if truthy, get_display(line) is written to the
    #                        ground truth file instead of the raw line.
    #   max_length           if truthy, only lines with len(line) < max_length
    #                        are kept.
    #   strip                if truthy, lines are stripped before filtering;
    #                        note that lines are stripped again unconditionally
    #                        before rendering.
    #   disable_degradation, legacy
    #                        select the degradation path (see the loop below).
    #   binarize             if truthy, binarization.nlbin is applied last;
    #                        lines failing binarization are skipped.
    #   mean, sigma, density, distort, distortion_sigma
    #                        means of normal draws whose absolute values are
    #                        passed to degrade_line / distort_line.
    #   output               directory receiving NNNNNN.png / NNNNNN.gt.txt.
    #   text                 iterable of input text files; falsy -> no-op.
    #
    # Returns None. OSError other than EEXIST from creating `output`
    # propagates.
    lines = set()
    if not text:
        return
    st_time = time.time()
    verbosity = ctx.meta['verbose']

    for t in text:
        with click.open_file(t, encoding=encoding) as fp:
            if verbosity > 0:
                click.echo(u'[{:2.4f}] Reading {}'.format(
                    time.time() - st_time, t))
            else:
                spin('Reading texts')
            # Drop line terminators while reading: otherwise an unterminated
            # last line is treated as distinct from its terminated twin and
            # the max_length filter counts the newline.
            lines.update(raw.rstrip('\r\n') for raw in fp)
    if normalization:
        lines = set(
            [unicodedata.normalize(normalization, line) for line in lines])
    if strip:
        lines = set([line.strip() for line in lines])
    if max_length:
        lines = set([line for line in lines if len(line) < max_length])
    if verbosity > 0:
        click.echo(u'[{:2.4f}] Read {} lines'.format(time.time() - st_time,
                                                     len(lines)))
    else:
        click.secho(u'\b\u2713', fg='green', nl=False)
        # '\033[?25h' makes the terminal cursor visible again.
        click.echo('\033[?25h\n', nl=False)
        click.echo('Read {} unique lines'.format(len(lines)))

    if maxlines and maxlines < len(lines):
        click.echo('Sampling {} lines\t'.format(maxlines), nl=False)
        lines = list(lines)
        lines = [
            lines[idx] for idx in np.random.randint(0, len(lines), maxlines)
        ]
        click.secho(u'\u2713', fg='green')

    try:
        os.makedirs(output)
    except OSError as e:
        # An already existing output directory is fine.
        if e.errno != errno.EEXIST:
            raise

    lines = [line.strip() for line in lines]

    # calculate the alphabet and print it for verification purposes
    alphabet = set()
    for line in lines:
        alphabet.update(line)
    chars = []
    combining = []
    for char in sorted(alphabet):
        # Combining marks are listed by name since they do not display well
        # on their own.
        if unicodedata.combining(char):
            combining.append(unicodedata.name(char))
        else:
            chars.append(char)
    click.echo(u'Σ (len: {})'.format(len(alphabet)))
    click.echo(u'Symbols: {}'.format(''.join(chars)))
    if combining:
        click.echo(u'Combining Characters: {}'.format(', '.join(combining)))

    lg = linegen.LineGenerator(font, font_size, font_weight, language)
    for idx, line in enumerate(lines):
        if verbosity > 0:
            click.echo(u'[{:2.4f}] {}'.format(time.time() - st_time, line))
        else:
            spin('Writing images')
        try:
            if renormalize:
                im = lg.render_line(unicodedata.normalize(renormalize, line))
            else:
                im = lg.render_line(line)
        except KrakenCairoSurfaceException as e:
            # Report lines the renderer cannot draw and move on; their index
            # is left unused in the output numbering.
            if verbosity > 0:
                click.echo('[{:2.4f}] {}: {} {}'.format(
                    time.time() - st_time, e.message, e.width, e.height))
            else:
                click.secho(u'\b\u2717', fg='red')
                click.echo('{}: {} {}'.format(e.message, e.width, e.height))
            continue
        if not disable_degradation and not legacy:
            # Normal draws can be negative; the parameters are magnitudes.
            im = linegen.distort_line(im, abs(np.random.normal(distort)),
                                      abs(np.random.normal(distortion_sigma)))
            im = linegen.degrade_line(im, abs(np.random.normal(mean)),
                                      abs(np.random.normal(sigma)),
                                      abs(np.random.normal(density)))
        elif legacy:
            im = linegen.ocropy_degrade(im)
        if binarize:
            try:
                im = binarization.nlbin(im)
            except KrakenInputException as e:
                # Skip lines that cannot be binarized.
                click.echo('{}'.format(e.message))
                continue
        im.save('{}/{:06d}.png'.format(output, idx))
        with open('{}/{:06d}.gt.txt'.format(output, idx), 'wb') as fp:
            if reorder:
                fp.write(get_display(line).encode('utf-8'))
            else:
                fp.write(line.encode('utf-8'))
    if verbosity == 0:
        click.secho(u'\b\u2713', fg='green', nl=False)
        click.echo('\033[?25h\n', nl=False)