def load_etext(etextno, refresh_cache=False, mirror=None, prefer_ascii=False):
    """Returns a unicode representation of the full body of a Project
    Gutenberg text.

    After making an initial remote call to Project Gutenberg's servers, the
    text is persisted locally as a gzip-compressed UTF-8 file under
    _TEXT_CACHE; later calls are answered from that file.

    Args:
        etextno: Identifier of the text; it is passed through
            validate_etextno before use.
        refresh_cache: When true, remove() is called on the cache file
            first, so the text is downloaded again.
        mirror: Forwarded unchanged to _format_download_uri.
        prefer_ascii: Forwarded unchanged to _format_download_uri.

    Returns:
        The text as a unicode string.

    Raises:
        requests.HTTPError: If the download is answered with a 4xx/5xx
            status. Nothing is written to the cache in that case.

    Exceptions raised by validate_etextno or _format_download_uri propagate
    unchanged.

    """
    etextno = validate_etextno(etextno)
    cached = os.path.join(_TEXT_CACHE, '{}.txt.gz'.format(etextno))

    if refresh_cache:
        remove(cached)

    if os.path.exists(cached):
        with closing(gzip.open(cached, 'r')) as cache:
            return cache.read().decode('utf-8')

    makedirs(os.path.dirname(cached))
    download_uri = _format_download_uri(etextno, mirror, prefer_ascii)
    response = requests.get(download_uri)
    # requests does not raise on HTTP error statuses by itself. Fail here,
    # before the cache file is created, so that an error page is never stored
    # and then served as the book text on every later call.
    response.raise_for_status()
    # Ensure proper UTF-8 saving. There might be instances of ebooks or
    # mirrors which advertise a broken encoding, and this will break
    # downstream usages. For example, #55517 from aleph.gutenberg.org:
    #
    #     from gutenberg.acquire import load_etext
    #     print(load_etext(55517, refresh_cache=True)[0:1000])
    #
    # response.encoding will be 'ISO-8859-1' while the file is UTF-8
    if response.encoding != response.apparent_encoding:
        response.encoding = response.apparent_encoding
    text = response.text
    with closing(gzip.open(cached, 'w')) as cache:
        cache.write(text.encode('utf-8'))
    # The freshly downloaded text is exactly what was just written, so there
    # is no need to decompress and decode the cache file again.
    return text
def test_is_invalid_etext(self):
    # A non-numeric string, a negative number, zero and a float must each be
    # rejected with ValueError.
    rejected = ('not-a-positive-integer', -123, 0, 12.3)
    for candidate in rejected:
        with self.assertRaises(ValueError):
            validate_etextno(candidate)
def load_etext(etextno, refresh_cache=False, mirror=None, prefer_ascii=False):
    """Returns a unicode representation of the full body of a Project
    Gutenberg text.

    After making an initial remote call to Project Gutenberg's servers, the
    text is persisted locally as a gzip-compressed UTF-8 file under
    _TEXT_CACHE; later calls are answered from that file.

    Args:
        etextno: Identifier of the text; it is passed through
            validate_etextno before use.
        refresh_cache: When true, remove() is called on the cache file
            first, so the text is downloaded again.
        mirror: Forwarded unchanged to _format_download_uri.
        prefer_ascii: Forwarded unchanged to _format_download_uri.

    Returns:
        The text as a unicode string.

    Raises:
        requests.HTTPError: If the download is answered with a 4xx/5xx
            status. Nothing is written to the cache in that case.

    Exceptions raised by validate_etextno or _format_download_uri propagate
    unchanged.

    """
    etextno = validate_etextno(etextno)
    cached = os.path.join(_TEXT_CACHE, '{}.txt.gz'.format(etextno))

    if refresh_cache:
        remove(cached)

    if os.path.exists(cached):
        with closing(gzip.open(cached, 'r')) as cache:
            return cache.read().decode('utf-8')

    makedirs(os.path.dirname(cached))
    download_uri = _format_download_uri(etextno, mirror, prefer_ascii)
    response = requests.get(download_uri)
    # requests does not raise on HTTP error statuses by itself. Fail here,
    # before the cache file is created, so that an error page is never stored
    # and then served as the book text on every later call.
    response.raise_for_status()
    # Ensure proper UTF-8 saving. There might be instances of ebooks or
    # mirrors which advertise a broken encoding, and this will break
    # downstream usages. For example, #55517 from aleph.gutenberg.org:
    #
    #     from gutenberg.acquire import load_etext
    #     print(load_etext(55517, refresh_cache=True)[0:1000])
    #
    # response.encoding will be 'ISO-8859-1' while the file is UTF-8
    if response.encoding != response.apparent_encoding:
        response.encoding = response.apparent_encoding
    text = response.text
    with closing(gzip.open(cached, 'w')) as cache:
        cache.write(text.encode('utf-8'))
    # The freshly downloaded text is exactly what was just written, so there
    # is no need to decompress and decode the cache file again.
    return text


# def _main():
#     """Command line interface to the module.
#
#     """
#     from argparse import ArgumentParser, FileType
#     from gutenberg import Error
#     from gutenberg._util.os import reopen_encoded
#
#     parser = ArgumentParser(description='Download a Project Gutenberg text')
#     parser.add_argument('etextno', type=int)
#     parser.add_argument('outfile', type=FileType('w'))
#     parser.add_argument('--mirror', '-m', type=str, default=None)
#     parser.add_argument('--prefer-ascii', '-a', type=bool, default=False)
#     args = parser.parse_args()
#
#     try:
#         text = load_etext(args.etextno,
#                           mirror=args.mirror,
#                           prefer_ascii=args.prefer_ascii)
#         with reopen_encoded(args.outfile, 'w', 'utf8') as outfile:
#             outfile.write(text)
#     except Error as error:
#         parser.error(str(error))
#
#
# if __name__ == '__main__':
#     _main()
def load_etext(etextno, mirror=None):
    """Returns a unicode representation of the full body of a Project
    Gutenberg text.

    The identifier is validated with validate_etextno, turned into a
    download URI by _format_download_uri (which also receives ``mirror``),
    and fetched with a single HTTP GET. Nothing is stored locally, so every
    call performs the download again.

    """
    checked_etextno = validate_etextno(etextno)
    uri = _format_download_uri(checked_etextno, mirror)
    return requests.get(uri).text
def _uri_to_etext(cls, uri_ref):
    """Converts the representation used to identify a text in the
    meta-data RDF graph to a human-friendly integer text identifier.

    The last path component of ``uri_ref.toPython()`` is parsed with int()
    and handed to validate_etextno. If that raises
    InvalidEtextIdException, None is returned instead. Any other exception
    (for example the ValueError int() raises for a non-numeric component)
    is not caught here.

    """
    try:
        last_component = os.path.basename(uri_ref.toPython())
        candidate = int(last_component)
        etextno = validate_etextno(candidate)
    except InvalidEtextIdException:
        etextno = None
    return etextno
def load_etext_from_cache(etextno):
    """Load an etext only if it's already cached.

    The identifier is validated with validate_etextno and looked up as
    ``<etextno>.txt.gz`` under _TEXT_CACHE. When that file exists, its
    gzip-compressed content is decoded as UTF-8 and returned; otherwise an
    empty string is returned. No download is attempted.
    """
    cache_path = os.path.join(
        _TEXT_CACHE, "{}.txt.gz".format(validate_etextno(etextno))
    )
    if not os.path.exists(cache_path):
        return ""
    with closing(gzip.open(cache_path, "r")) as gz_file:
        raw = gz_file.read()
    return raw.decode("utf-8")
def load_etext(etextno, refresh_cache=False):
    """Returns a unicode representation of the full body of a Project
    Gutenberg text.

    After making an initial remote call to Project Gutenberg's servers, the
    text is persisted locally as a gzip-compressed UTF-8 file under
    _TEXT_CACHE; later calls are answered from that file.

    Args:
        etextno: Identifier of the text; it is passed through
            validate_etextno before use.
        refresh_cache: When true, remove() is called on the cache file
            first, so the text is downloaded again.

    Returns:
        The text as a unicode string.

    Raises:
        requests.HTTPError: If the download is answered with a 4xx/5xx
            status. Nothing is written to the cache in that case.

    Exceptions raised by validate_etextno or _format_download_uri propagate
    unchanged.

    """
    etextno = validate_etextno(etextno)
    cached = os.path.join(_TEXT_CACHE, '{0}.txt.gz'.format(etextno))

    if refresh_cache:
        remove(cached)

    if os.path.exists(cached):
        with closing(gzip.open(cached, 'r')) as cache:
            return cache.read().decode('utf-8')

    makedirs(os.path.dirname(cached))
    download_uri = _format_download_uri(etextno)
    response = requests.get(download_uri)
    # requests does not raise on HTTP error statuses by itself. Fail here,
    # before the cache file is created, so that an error page is never stored
    # and then served as the book text on every later call.
    response.raise_for_status()
    text = response.text
    with closing(gzip.open(cached, 'w')) as cache:
        cache.write(text.encode('utf-8'))
    # The freshly downloaded text is exactly what was just written, so there
    # is no need to decompress and decode the cache file again.
    return text
def load_etext(etextno, refresh_cache=False):
    """Returns a unicode representation of the full body of a Project
    Gutenberg text.

    After making an initial remote call to Project Gutenberg's servers, the
    text is persisted locally as a gzip-compressed UTF-8 file under
    _TEXT_CACHE; later calls are answered from that file.

    Args:
        etextno: Identifier of the text; it is passed through
            validate_etextno before use.
        refresh_cache: When true, remove() is called on the cache file
            first, so the text is downloaded again.

    Returns:
        The text as a unicode string.

    Raises:
        requests.HTTPError: If the download is answered with a 4xx/5xx
            status. Nothing is written to the cache in that case.

    Exceptions raised by validate_etextno or _format_download_uri propagate
    unchanged.

    """
    etextno = validate_etextno(etextno)
    cached = os.path.join(_TEXT_CACHE, '{0}.txt.gz'.format(etextno))

    if refresh_cache:
        remove(cached)

    if not os.path.exists(cached):
        makedirs(os.path.dirname(cached))
        download_uri = _format_download_uri(etextno)
        response = requests.get(download_uri)
        # requests does not raise on HTTP error statuses by itself. Fail
        # here, before the cache file is created, so that an error page is
        # never stored and then served as the book text on later calls.
        response.raise_for_status()
        # The payload is always decoded as UTF-8, whatever encoding the
        # response headers declare.
        response.encoding = 'utf-8'
        text = response.text
        with contextlib.closing(gzip.open(cached, 'w')) as cache:
            cache.write(text.encode('utf-8'))
    else:
        with contextlib.closing(gzip.open(cached, 'r')) as cache:
            text = cache.read().decode('utf-8')
    return text
def test_is_valid_etext(self):
    # Positive integers of one to four digits must all be accepted, i.e.
    # validate_etextno returns something other than None for each.
    for etextno in (1, 12, 123, 1234):
        self.assertIsNotNone(validate_etextno(etextno))
def test_is_valid_etext(self):
    # Positive integers of one to four digits must all be accepted, i.e.
    # validate_etextno returns something other than None for each.
    for etextno in (1, 12, 123, 1234):
        accepted = validate_etextno(etextno) is not None
        self.assertTrue(accepted)