Exemplo n.º 1
0
    def test_reopen_encoded(self):
        """Check that reopen_encoded reports the encoding it was asked for.

        For each encoding, some text is written to the temporary file using
        that encoding, the file is opened in plain read mode, and the object
        returned by ``reopen_encoded`` must expose the same encoding name
        (compared case-insensitively via ``lower()``).
        """
        for encoding in ('utf-8', 'utf-16'):
            with codecs.open(self.temporary_path, 'w', encoding) as fobj:
                fobj.write('something')

            with open(self.temporary_path, 'r') as fobj:
                # The object returned by reopen_encoded is owned by this
                # test, so release it explicitly instead of leaving one
                # open handle behind per loop iteration.
                with reopen_encoded(fobj, fobj.mode, encoding) as reopened:
                    self.assertEqual(reopened.encoding.lower(), encoding)
Exemplo n.º 2
0
def _main():
    """Run the header-stripping command line interface.

    Takes two positional command line arguments, ``infile`` and ``outfile``.
    The input is reopened through ``reopen_encoded`` with ``'utf8'``, read in
    full and passed to ``strip_headers``; the result is written to the output
    file, which is likewise reopened with ``'utf8'``.

    """
    from argparse import ArgumentParser, FileType
    from gutenberg._util.os import reopen_encoded

    cli = ArgumentParser(
        description='Remove headers and footers from a Project Gutenberg text')
    # FileType opens both paths while parsing; they are reopened below.
    for argument, file_mode in (('infile', 'r'), ('outfile', 'w')):
        cli.add_argument(argument, type=FileType(file_mode))
    options = cli.parse_args()

    with reopen_encoded(options.infile, 'r', 'utf8') as source:
        stripped = strip_headers(source.read())

    with reopen_encoded(options.outfile, 'w', 'utf8') as sink:
        sink.write(stripped)
Exemplo n.º 3
0
def _main():
    """Run the download command line interface.

    Reads an integer ``etextno`` and an ``outfile`` path from the command
    line, fetches the text with ``load_etext`` and writes it to the output
    file reopened through ``reopen_encoded`` with ``'utf8'``. A
    ``ValueError`` raised by ``load_etext`` is reported through the argument
    parser's ``error`` method.

    """
    from argparse import ArgumentParser, FileType
    from gutenberg._util.os import reopen_encoded

    cli = ArgumentParser(description='Download a Project Gutenberg text')
    cli.add_argument('etextno', type=int)
    cli.add_argument('outfile', type=FileType('w'))
    options = cli.parse_args()

    try:
        etext = load_etext(options.etextno)
    except ValueError as failure:
        # Surface the problem as a usage error rather than a traceback.
        cli.error(str(failure))
    else:
        with reopen_encoded(options.outfile, 'w', 'utf8') as sink:
            sink.write(etext)
Exemplo n.º 4
0
def _main():
    """Run the download command line interface.

    Parses an integer ``etextno`` and an ``outfile`` path, loads the text
    with ``load_etext`` and writes it to the output file reopened through
    ``reopen_encoded`` with ``'utf8'``. Any ``gutenberg.Error`` raised while
    loading or writing is reported through the argument parser's ``error``
    method.

    """
    from argparse import ArgumentParser, FileType
    from gutenberg import Error
    from gutenberg._util.os import reopen_encoded

    cli = ArgumentParser(description='Download a Project Gutenberg text')
    for argument, converter in (('etextno', int), ('outfile', FileType('w'))):
        cli.add_argument(argument, type=converter)
    options = cli.parse_args()

    try:
        etext = load_etext(options.etextno)
        with reopen_encoded(options.outfile, 'w', 'utf8') as sink:
            sink.write(etext)
    except Error as failure:
        # Library errors become a usage message instead of a traceback.
        cli.error(str(failure))
Exemplo n.º 5
0
def _main():
    """Run the download command line interface.

    Expects an integer ``etextno`` and an ``outfile`` path on the command
    line. The text returned by ``load_etext`` is written to the output file
    after it has been reopened through ``reopen_encoded`` with ``"utf8"``.
    A ``gutenberg.Error`` raised along the way is passed, as a string, to
    the argument parser's ``error`` method.

    """
    from argparse import ArgumentParser, FileType
    from gutenberg import Error
    from gutenberg._util.os import reopen_encoded

    arg_parser = ArgumentParser(description="Download a Project Gutenberg text")
    arg_parser.add_argument("etextno", type=int)
    arg_parser.add_argument("outfile", type=FileType("w"))
    parsed = arg_parser.parse_args()

    try:
        downloaded = load_etext(parsed.etextno)
        with reopen_encoded(parsed.outfile, "w", "utf8") as destination:
            destination.write(downloaded)
    except Error as problem:
        message = str(problem)
        arg_parser.error(message)
Exemplo n.º 6
0
def _main():
    """Run the download command line interface.

    Positional arguments are an integer ``etextno`` and an ``outfile`` path.
    ``--mirror``/``-m`` takes a string (default ``None``) and
    ``--prefer-ascii``/``-a`` takes a boolean value (default ``False``);
    both are forwarded to ``load_etext``. The loaded text is written to the
    output file reopened through ``reopen_encoded`` with ``'utf8'``. Any
    ``gutenberg.Error`` is reported through the parser's ``error`` method.

    """
    from argparse import ArgumentParser, ArgumentTypeError, FileType
    from gutenberg import Error
    from gutenberg._util.os import reopen_encoded

    def _parse_bool(value):
        """Convert a command line string such as ``true`` or ``0`` to bool."""
        # ``type=bool`` would call bool() on the raw string, which is True
        # for every non-empty value, including "False" and "0".
        normalized = value.strip().lower()
        if normalized in ('1', 'true', 't', 'yes', 'y', 'on'):
            return True
        if normalized in ('', '0', 'false', 'f', 'no', 'n', 'off'):
            return False
        raise ArgumentTypeError(
            'expected a boolean value, got %r' % (value,))

    parser = ArgumentParser(description='Download a Project Gutenberg text')
    parser.add_argument('etextno', type=int)
    parser.add_argument('outfile', type=FileType('w'))
    parser.add_argument('--mirror', '-m', type=str, default=None)
    parser.add_argument('--prefer-ascii', '-a', type=_parse_bool,
                        default=False)
    args = parser.parse_args()

    try:
        text = load_etext(args.etextno,
                          mirror=args.mirror,
                          prefer_ascii=args.prefer_ascii)
        with reopen_encoded(args.outfile, 'w', 'utf8') as outfile:
            outfile.write(text)
    except Error as error:
        # Library errors become a usage message instead of a traceback.
        parser.error(str(error))
Exemplo n.º 7
0
function, writing the result with the same filename to <outdir>.
'''

import glob
import sys

from tqdm import tqdm

import gutenberg.cleanup.strip_headers as strip_headers
from gutenberg._util.os import reopen_encoded
from gutenberg import Error

# Exactly two command line arguments are required: the input directory and
# the output directory.
if len(sys.argv) != 3:
    print('usage: python3 clean.py <indir> <outdir>')
    sys.exit(1)

indir = sys.argv[1]
outdir = sys.argv[2]

# Process every .txt file directly inside indir, with a progress bar.
files = glob.glob(indir + '/*.txt')
for f in tqdm(files):
    try:
        with reopen_encoded(open(f, 'r'), 'r', 'utf8') as infile:
            cleaned = strip_headers(infile.read())

        # Keep only the file name so the result lands in outdir under the
        # same name as the input.
        short = f.split('/')[-1]
        with open(outdir + '/' + short, 'w', encoding='utf8') as outfile:
            outfile.write(cleaned)
    except Exception:
        # Best effort: a file that fails is reported and skipped. Catching
        # Exception rather than using a bare except lets KeyboardInterrupt
        # and SystemExit stop the whole run instead of skipping one file.
        print('Error processing', f, '; skipping...')