def test_reopen_encoded(self):
    """Check the encoding reported by a file reopened via ``reopen_encoded``.

    For each encoding under test, a short text is written to
    ``self.temporary_path`` with that encoding. The file is then opened
    normally and handed to ``reopen_encoded``; the ``encoding`` attribute of
    the returned file object (lower-cased) must equal the encoding that was
    used for writing.
    """
    for encoding in ('utf-8', 'utf-16'):
        with codecs.open(self.temporary_path, 'w', encoding) as fobj:
            fobj.write('something')

        with open(self.temporary_path, 'r') as fobj:
            # Manage the returned handle with ``with`` so it is closed once
            # the assertion has run; otherwise every loop iteration would
            # leave an open file behind.
            with reopen_encoded(fobj, fobj.mode, encoding) as reopened_fobj:
                self.assertEqual(reopened_fobj.encoding.lower(), encoding)
def _main():
    """Run the header/footer stripping command line tool.

    Parses two positional arguments, ``infile`` and ``outfile``. The input is
    reopened through ``reopen_encoded`` (with ``'utf8'``), read in full and
    passed to ``strip_headers``; the result is written to the output file,
    which is likewise reopened through ``reopen_encoded``.
    """
    import argparse
    from gutenberg._util.os import reopen_encoded

    cli = argparse.ArgumentParser(
        description='Remove headers and footers from a Project Gutenberg text')
    cli.add_argument('infile', type=argparse.FileType('r'))
    cli.add_argument('outfile', type=argparse.FileType('w'))
    options = cli.parse_args()

    # Read and clean the whole document before touching the output handle.
    with reopen_encoded(options.infile, 'r', 'utf8') as source:
        stripped = strip_headers(source.read())

    with reopen_encoded(options.outfile, 'w', 'utf8') as sink:
        sink.write(stripped)
def _main():
    """Run the text download command line tool.

    Parses an integer ``etextno`` and an ``outfile``. The text is fetched with
    ``load_etext``; a ``ValueError`` raised by the fetch is reported through
    ``parser.error``. On success the text is written to the output file after
    reopening it through ``reopen_encoded`` (with ``'utf8'``).
    """
    import argparse
    from gutenberg._util.os import reopen_encoded

    cli = argparse.ArgumentParser(
        description='Download a Project Gutenberg text')
    cli.add_argument('etextno', type=int)
    cli.add_argument('outfile', type=argparse.FileType('w'))
    options = cli.parse_args()

    try:
        document = load_etext(options.etextno)
    except ValueError as failure:
        # Surface the problem as a usage error instead of a traceback.
        cli.error(str(failure))
    else:
        with reopen_encoded(options.outfile, 'w', 'utf8') as sink:
            sink.write(document)
def _main():
    """Run the text download command line tool.

    Parses an integer ``etextno`` and an ``outfile``, fetches the text with
    ``load_etext`` and writes it to the output file after reopening it through
    ``reopen_encoded`` (with ``'utf8'``). A ``gutenberg.Error`` raised during
    either step is reported through ``parser.error``.
    """
    import argparse
    from gutenberg import Error
    from gutenberg._util.os import reopen_encoded

    cli = argparse.ArgumentParser(
        description='Download a Project Gutenberg text')
    cli.add_argument('etextno', type=int)
    cli.add_argument('outfile', type=argparse.FileType('w'))
    options = cli.parse_args()

    try:
        document = load_etext(options.etextno)
        with reopen_encoded(options.outfile, 'w', 'utf8') as sink:
            sink.write(document)
    except Error as failure:
        # Library errors become usage errors rather than tracebacks.
        cli.error(str(failure))
def _main():
    """Entry point for the command line text downloader.

    Expects an integer ``etextno`` and an ``outfile`` on the command line.
    The text returned by ``load_etext`` is written to the output file, which
    is first reopened through ``reopen_encoded`` (with ``"utf8"``). Any
    ``gutenberg.Error`` raised while loading or writing is passed to
    ``parser.error``.
    """
    import argparse
    from gutenberg import Error
    from gutenberg._util.os import reopen_encoded

    arg_parser = argparse.ArgumentParser(
        description="Download a Project Gutenberg text"
    )
    arg_parser.add_argument("etextno", type=int)
    arg_parser.add_argument("outfile", type=argparse.FileType("w"))
    parsed = arg_parser.parse_args()

    try:
        contents = load_etext(parsed.etextno)
        with reopen_encoded(parsed.outfile, "w", "utf8") as destination:
            destination.write(contents)
    except Error as exc:
        # Report library failures as command line errors.
        arg_parser.error(str(exc))
def _main():
    """Run the text download command line tool.

    Command line arguments:
        etextno: integer identifier passed to ``load_etext``.
        outfile: file the downloaded text is written to.
        --mirror, -m: optional string forwarded to ``load_etext`` as
            ``mirror`` (default ``None``).
        --prefer-ascii, -a: optional boolean forwarded to ``load_etext`` as
            ``prefer_ascii`` (default ``False``). May be given bare
            (``-a``) or with an explicit value such as ``true`` or ``false``.

    The output file is reopened through ``reopen_encoded`` (with ``'utf8'``)
    before writing. Any ``gutenberg.Error`` raised while loading or writing
    is reported through ``parser.error``.
    """
    from argparse import ArgumentParser, ArgumentTypeError, FileType
    from gutenberg import Error
    from gutenberg._util.os import reopen_encoded

    def parse_bool(value):
        """Convert a command line token such as 'true' or '0' to a bool."""
        normalized = value.strip().lower()
        if normalized in ('1', 'true', 't', 'yes', 'y', 'on'):
            return True
        if normalized in ('', '0', 'false', 'f', 'no', 'n', 'off'):
            return False
        raise ArgumentTypeError('invalid boolean value: %r' % (value,))

    parser = ArgumentParser(description='Download a Project Gutenberg text')
    parser.add_argument('etextno', type=int)
    parser.add_argument('outfile', type=FileType('w'))
    parser.add_argument('--mirror', '-m', type=str, default=None)
    # ``type=bool`` would treat every non-empty string (including 'False')
    # as True, so the value is parsed explicitly. ``nargs='?'`` with
    # ``const=True`` additionally lets the flag be given without a value.
    parser.add_argument('--prefer-ascii', '-a', type=parse_bool, nargs='?',
                        const=True, default=False)
    args = parser.parse_args()

    try:
        text = load_etext(args.etextno,
                          mirror=args.mirror,
                          prefer_ascii=args.prefer_ascii)
        with reopen_encoded(args.outfile, 'w', 'utf8') as outfile:
            outfile.write(text)
    except Error as error:
        parser.error(str(error))
function, writing the result with the same filename to <outdir>. ''' import glob import sys from tqdm import tqdm import gutenberg.cleanup.strip_headers as strip_headers from gutenberg._util.os import reopen_encoded from gutenberg import Error if len(sys.argv) != 3: print('usage: python3 clean.py <indir> <outdir>') sys.exit(1) indir = sys.argv[1] outdir = sys.argv[2] files = glob.glob(indir + '/*.txt') for f in tqdm(files): try: with reopen_encoded(open(f, 'r'), 'r', 'utf8') as infile: cleaned = strip_headers(infile.read()) short = f.split('/')[-1] with open(outdir + '/' + short, 'w', encoding='utf8') as outfile: outfile.write(cleaned) except: print('Error processing', f, '; skipping...')