예제 #1
0
def load_etext(etextno, refresh_cache=False, mirror=None, prefer_ascii=False):
    """Returns a unicode representation of the full body of a Project Gutenberg
    text. After making an initial remote call to Project Gutenberg's servers,
    the text is persisted locally.

    Arguments:
        etextno: Identifier of the text; passed through validate_etextno
            before use.
        refresh_cache: When true, the cached copy is removed first so the
            text is downloaded again.
        mirror: Forwarded to _format_download_uri when building the URI.
        prefer_ascii: Forwarded to _format_download_uri when building the URI.

    Returns:
        The contents of the gzip-compressed cache file, decoded as UTF-8.

    Raises:
        requests.exceptions.HTTPError: If the download request is answered
            with a 4xx/5xx status. No cache file is written in that case.

    """
    etextno = validate_etextno(etextno)
    cached = os.path.join(_TEXT_CACHE, '{}.txt.gz'.format(etextno))

    if refresh_cache:
        # NOTE(review): presumably `remove` tolerates a missing file; the
        # helper is not visible from here -- confirm.
        remove(cached)
    if not os.path.exists(cached):
        makedirs(os.path.dirname(cached))
        download_uri = _format_download_uri(etextno, mirror, prefer_ascii)
        response = requests.get(download_uri)
        # Abort on HTTP errors. Without this check the body of an error page
        # would be written to the cache and served as the text on every
        # subsequent call until the cache is refreshed.
        response.raise_for_status()
        # Ensure proper UTF-8 saving. There might be instances of ebooks or
        # mirrors which advertise a broken encoding, and this will break
        # downstream usages. For example, #55517 from aleph.gutenberg.org:
        #
        # from gutenberg.acquire import load_etext
        # print(load_etext(55517, refresh_cache=True)[0:1000])
        #
        # response.encoding will be 'ISO-8859-1' while the file is UTF-8
        if response.encoding != response.apparent_encoding:
            response.encoding = response.apparent_encoding
        text = response.text
        with closing(gzip.open(cached, 'w')) as cache:
            cache.write(text.encode('utf-8'))

    # Always return what is stored on disk, so the fresh-download path and the
    # cache-hit path yield the same value.
    with closing(gzip.open(cached, 'r')) as cache:
        text = cache.read().decode('utf-8')
    return text
예제 #2
0
 def test_is_invalid_etext(self):
     """validate_etextno must raise ValueError for a non-numeric string, a
     negative number, zero and a float.

     """
     rejected_inputs = ('not-a-positive-integer', -123, 0, 12.3)
     for candidate in rejected_inputs:
         with self.assertRaises(ValueError):
             validate_etextno(candidate)
예제 #3
0
def load_etext(etextno, refresh_cache=False, mirror=None, prefer_ascii=False):
    """Returns a unicode representation of the full body of a Project Gutenberg
    text. After making an initial remote call to Project Gutenberg's servers,
    the text is persisted locally.

    Arguments:
        etextno: Identifier of the text; passed through validate_etextno
            before use.
        refresh_cache: When true, the cached copy is removed first so the
            text is downloaded again.
        mirror: Forwarded to _format_download_uri when building the URI.
        prefer_ascii: Forwarded to _format_download_uri when building the URI.

    Returns:
        The contents of the gzip-compressed cache file, decoded as UTF-8.

    Raises:
        requests.exceptions.HTTPError: If the server replies to the download
            request with a 4xx/5xx status. Nothing is cached in that case.
    """
    etextno = validate_etextno(etextno)
    cached = os.path.join(_TEXT_CACHE, '{}.txt.gz'.format(etextno))

    if refresh_cache:
        remove(cached)
    if not os.path.exists(cached):
        makedirs(os.path.dirname(cached))
        download_uri = _format_download_uri(etextno, mirror, prefer_ascii)
        response = requests.get(download_uri)
        # A failed request must never reach the cache: otherwise the error
        # page is stored and returned as the text on every later call.
        response.raise_for_status()
        # Ensure proper UTF-8 saving. There might be instances of ebooks or
        # mirrors which advertise a broken encoding, and this will break
        # downstream usages. For example, #55517 from aleph.gutenberg.org:
        #
        # from gutenberg.acquire import load_etext
        # print(load_etext(55517, refresh_cache=True)[0:1000])
        #
        # response.encoding will be 'ISO-8859-1' while the file is UTF-8
        if response.encoding != response.apparent_encoding:
            response.encoding = response.apparent_encoding
        text = response.text
        with closing(gzip.open(cached, 'w')) as cache:
            cache.write(text.encode('utf-8'))

    # Read back from disk so both code paths return the cached bytes.
    with closing(gzip.open(cached, 'r')) as cache:
        text = cache.read().decode('utf-8')
    return text


# def _main():
#     """Command line interface to the module.
#     """
#     from argparse import ArgumentParser, FileType
#     from gutenberg import Error
#     from gutenberg._util.os import reopen_encoded

#     parser = ArgumentParser(description='Download a Project Gutenberg text')
#     parser.add_argument('etextno', type=int)
#     parser.add_argument('outfile', type=FileType('w'))
#     parser.add_argument('--mirror', '-m', type=str, default=None)
#     parser.add_argument('--prefer-ascii', '-a', type=bool, default=False)
#     args = parser.parse_args()

#     try:
#         text = load_etext(args.etextno,
#                           mirror=args.mirror,
#                           prefer_ascii=args.prefer_ascii)
#         with reopen_encoded(args.outfile, 'w', 'utf8') as outfile:
#             outfile.write(text)
#     except Error as error:
#         parser.error(str(error))

# if __name__ == '__main__':
#     _main()
예제 #4
0
def load_etext(etextno, mirror=None):
    """Returns a unicode representation of the full body of a Project Gutenberg
    text.

    Arguments:
        etextno: Identifier of the text; passed through validate_etextno
            before use.
        mirror: Forwarded to _format_download_uri when building the URI.

    Returns:
        The decoded body of the HTTP response (``response.text``).

    Raises:
        requests.exceptions.HTTPError: If the server replies with a 4xx/5xx
            status.
    """
    etextno = validate_etextno(etextno)
    download_uri = _format_download_uri(etextno, mirror)
    response = requests.get(download_uri)
    # Fail loudly on HTTP errors instead of handing the server's error page
    # back to the caller as if it were the text.
    response.raise_for_status()
    return response.text
예제 #5
0
파일: api.py 프로젝트: Python3pkg/gutenberg
    def _uri_to_etext(cls, uri_ref):
        """Translate a text's identifier in the meta-data RDF graph into a
        human-friendly integer text identifier.

        The final path component of ``uri_ref.toPython()`` is converted with
        ``int`` and handed to validate_etextno. None is returned when
        validation raises InvalidEtextIdException.

        """
        try:
            basename = os.path.basename(uri_ref.toPython())
            etextno = validate_etextno(int(basename))
        except InvalidEtextIdException:
            return None
        else:
            return etextno
예제 #6
0
파일: api.py 프로젝트: c-w/Gutenberg
    def _uri_to_etext(cls, uri_ref):
        """Map the URI that identifies a text in the meta-data RDF graph to a
        human-friendly integer text identifier.

        Yields the value produced by validate_etextno for the integer parsed
        from the URI's last path segment, or None if validate_etextno raises
        InvalidEtextIdException.

        """
        try:
            raw_identifier = os.path.basename(uri_ref.toPython())
            result = validate_etextno(int(raw_identifier))
        except InvalidEtextIdException:
            result = None
        return result
예제 #7
0
def load_etext_from_cache(etextno):
    """Return the locally cached body of an etext without any download.

    The identifier is first passed through validate_etextno. If no cache
    file exists for it, an empty string is returned; otherwise the gzip
    cache file is read and decoded as UTF-8.
    """
    validated = validate_etextno(etextno)
    cache_path = os.path.join(_TEXT_CACHE, "{}.txt.gz".format(validated))

    # Guard clause: nothing on disk means there is nothing to load.
    if not os.path.exists(cache_path):
        return ""

    with closing(gzip.open(cache_path, "r")) as handle:
        payload = handle.read()
    return payload.decode("utf-8")
예제 #8
0
def load_etext(etextno, refresh_cache=False):
    """Returns a unicode representation of the full body of a Project Gutenberg
    text. After making an initial remote call to Project Gutenberg's servers,
    the text is persisted locally.

    Arguments:
        etextno: Identifier of the text; passed through validate_etextno
            before use.
        refresh_cache: When true, the cached copy is removed first so the
            text is downloaded again.

    Returns:
        The contents of the gzip-compressed cache file, decoded as UTF-8.

    Raises:
        requests.exceptions.HTTPError: If the download request is answered
            with a 4xx/5xx status. No cache file is written in that case.

    """
    etextno = validate_etextno(etextno)
    cached = os.path.join(_TEXT_CACHE, '{0}.txt.gz'.format(etextno))

    if refresh_cache:
        remove(cached)
    if not os.path.exists(cached):
        makedirs(os.path.dirname(cached))
        download_uri = _format_download_uri(etextno)
        response = requests.get(download_uri)
        # Do not let an HTTP error page be cached and later returned as the
        # text: stop here on any 4xx/5xx answer.
        response.raise_for_status()
        text = response.text
        with closing(gzip.open(cached, 'w')) as cache:
            cache.write(text.encode('utf-8'))

    # Read back from disk so the fresh-download path and the cache-hit path
    # return the same value.
    with closing(gzip.open(cached, 'r')) as cache:
        text = cache.read().decode('utf-8')
    return text
예제 #9
0
def load_etext(etextno, refresh_cache=False):
    """Returns a unicode representation of the full body of a Project Gutenberg
    text. After making an initial remote call to Project Gutenberg's servers,
    the text is persisted locally.

    Arguments:
        etextno: Identifier of the text; passed through validate_etextno
            before use.
        refresh_cache: When true, the cached copy is removed first so the
            text is downloaded again.

    Returns:
        The freshly downloaded text on a cache miss, otherwise the contents
        of the gzip-compressed cache file decoded as UTF-8.

    Raises:
        requests.exceptions.HTTPError: If the download request is answered
            with a 4xx/5xx status. No cache file is written in that case.

    """
    etextno = validate_etextno(etextno)
    cached = os.path.join(_TEXT_CACHE, '{0}.txt.gz'.format(etextno))

    if refresh_cache:
        remove(cached)
    if not os.path.exists(cached):
        makedirs(os.path.dirname(cached))
        download_uri = _format_download_uri(etextno)
        response = requests.get(download_uri)
        # Stop on HTTP errors so an error page is never stored in the cache
        # and handed out as the text on later calls.
        response.raise_for_status()
        # Decode the payload as UTF-8 regardless of the advertised charset.
        response.encoding = 'utf-8'
        text = response.text
        with contextlib.closing(gzip.open(cached, 'w')) as cache:
            cache.write(text.encode('utf-8'))
    else:
        with contextlib.closing(gzip.open(cached, 'r')) as cache:
            text = cache.read().decode('utf-8')
    return text
예제 #10
0
 def test_is_valid_etext(self):
     """validate_etextno must return a non-None value for positive integers
     of one to four digits.

     """
     for etextno in (1, 12, 123, 1234):
         self.assertIsNotNone(validate_etextno(etextno))
예제 #11
0
 def test_is_valid_etext(self):
     """Positive integer identifiers must pass validate_etextno and produce
     a result that is not None.

     """
     accepted_inputs = [1, 12, 123, 1234]
     for candidate in accepted_inputs:
         validated = validate_etextno(candidate)
         self.assertIsNotNone(validated)
예제 #12
0
 def test_is_valid_etext(self):
     """validate_etextno must return a non-None value for positive integers
     of one to four digits.

     """
     # assertIsNotNone reports the offending value on failure, whereas
     # assertTrue(x is not None) only reports "False is not true".
     self.assertIsNotNone(validate_etextno(1))
     self.assertIsNotNone(validate_etextno(12))
     self.assertIsNotNone(validate_etextno(123))
     self.assertIsNotNone(validate_etextno(1234))
예제 #13
0
 def test_is_valid_etext(self):
     """Positive integer identifiers must pass validate_etextno and produce
     a result that is not None.

     """
     # Use the dedicated assertion: unlike assertTrue(x is not None), its
     # failure message shows the value that was returned.
     for etextno in (1, 12, 123, 1234):
         self.assertIsNotNone(validate_etextno(etextno))