def search(title_list=None, authors_list=None):
    """
    Receives two lists of books to download. Searches for their availability
    on Gutenberg and returns a list of Gutenberg IDs.
    """
    if title_list is None:
        title_list = []
    if authors_list is None:
        authors_list = []
    book_list = set()
    for title in title_list:
        found_texts = list(get_etexts("title", title))
        if found_texts:
            print(f"Found {title}")
            for id_number in found_texts:
                book_list.add(id_number)
    for author in authors_list:
        found_texts = list(get_etexts("author", author))
        if found_texts:
            print(f"Found {author}")
            for id_number in found_texts:
                book_list.add(id_number)
    return list(book_list)
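A minimal usage sketch for the search function above, assuming get_etexts has been imported from gutenberg.query and the metadata cache has already been populated; the title and author values are only illustrative:

from gutenberg.query import get_etexts

# Any values that exactly match Project Gutenberg metadata will work here.
ids = search(title_list=['Moby Dick; Or, The Whale'],
             authors_list=['Melville, Herman'])
print(ids)  # e.g. [2701, ...]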
def trial():
    text = strip_headers(load_etext(2701)).strip()
    print(text)  # prints 'MOBY DICK; OR THE WHALE\n\nBy Herman Melville ...'
    print(get_metadata('title', 2701))
    # prints frozenset([u'Moby Dick; Or, The Whale'])
    print(get_metadata('author', 2701))
    # prints frozenset([u'Melville, Hermann'])
    print(get_etexts('title', 'Moby Dick; Or, The Whale'))
    # prints frozenset([2701, ...])
    print(get_etexts('author', 'Melville, Herman'))
    # prints frozenset([2701, ...])
def load_gutenberg(self, language='en'):
    texts = get_etexts('author', self.author)
    texts = {
        t: list(get_metadata("title", t))[0]
        for t in texts
        if list(get_metadata("language", t))[0] == language
    }
    new_texts = dict()
    dupes = list()
    for k, d in texts.items():
        d = d.replace("\r\n", " ")
        if d not in dupes:
            dupes.append(d)
            new_texts[k] = d
            try:
                self.books[d] = strip_headers(
                    load_etext(k)).strip().split("\r\n\r\n")
            except UnknownDownloadUriException:
                print(
                    f'Book "{d}" does not have a text format and was not loaded.'
                )
                del new_texts[k]
                dupes.remove(d)
                continue
            self.tokens[d] = [
                nltk.pos_tag(nltk.word_tokenize(self.books[d][b]))
                for b in range(len(self.books[d]))
            ]
        else:
            pass
    texts = new_texts
    print(texts)
def init_books(author_file, json_file):
    """initialize book list with texts and save it to disk"""
    with open(author_file) as f:
        authors = list(f)
    authors = [i.strip() for i in authors]
    books = []
    for author in authors:
        s = get_etexts('author', author)
        for i in s:
            try:
                if list(get_metadata('language', i))[0] == 'en':
                    title, etext = list(get_metadata(
                        'title', i))[0], strip_headers(load_etext(i)).strip()
                    b = Book(i, title, etext)
                    books.append(b)
            except UnknownDownloadUriException:
                # this book does not have a text available to load
                pass
    with open(json_file, 'wb') as f:
        pickle.dump(books, f)
    print(len(books))
def get_joyce_texts():
    joyce_keys = get_etexts('author', 'Joyce, James')
    joyce_titles = []
    joyce_texts = {}
    for key in joyce_keys:
        joyce_titles.append(get_metadata('title', key))
        joyce_texts[key] = strip_headers(load_etext(key)).strip()
    return joyce_texts
def get_text(self, title, author):
    """
    This function will access a book from the Gutenberg project by title and
    author and return its text.

    PROBLEM HERE -- Gutenberg goes down a lot, so getting a full text did not
    always work. To bypass that, I downloaded some books of mixed languages.
    """
    # get_etexts returns a frozenset, so convert it before indexing
    guten_number = list(get_etexts('title', title))[0]
    text = strip_headers(load_etext(guten_number)).strip()
    return text
def _run_get_etexts_for_feature(self, feature):
    for testcase in self.sample_data():
        for feature_value in getattr(testcase, feature):
            actual = get_etexts(feature, feature_value)
            self.assertTrue(
                testcase.etextno in actual,
                u("didn't retrieve {etextno} when querying for books that "
                  'have {feature}="{feature_value}" (got {actual}).')
                .format(etextno=testcase.etextno,
                        feature=feature,
                        feature_value=feature_value,
                        actual=actual))
def search(query: str, include: Optional[str] = None) -> List[dict]:
    fields = parse_include(include) if include else []
    conjunction = parse_search(query)
    parts = iter(get_etexts(field, value) for field, value in conjunction)
    results = set(next(parts))
    for part in parts:
        results.intersection_update(part)
    return [
        dict([('text_id', text_id)] +
             [(field, get_metadata(field, text_id)) for field in fields])
        for text_id in results
    ]
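parse_include and parse_search are helpers from the same module that are not shown here. Hypothetical sketches of the shapes search appears to expect (a comma-separated field list, and field:value clauses joined by ';' -- the clause syntax is an assumption):

from typing import List, Tuple

def parse_include(include: str) -> List[str]:
    # e.g. "title,author" -> ["title", "author"]
    return [field.strip() for field in include.split(',') if field.strip()]

def parse_search(query: str) -> List[Tuple[str, str]]:
    # e.g. "author:Melville, Herman;language:en"
    # -> [("author", "Melville, Herman"), ("language", "en")]
    pairs = []
    for clause in query.split(';'):
        field, _, value = clause.partition(':')
        pairs.append((field.strip(), value.strip()))
    return pairs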
def _run_get_etexts_for_feature(self, feature):
    for testcase in self.sample_data():
        for feature_value in getattr(testcase, feature):
            actual = get_etexts(feature, feature_value)
            if testcase.is_phantom:
                self.assertNotIn(testcase.etextno, actual)
            else:
                self.assertIn(
                    testcase.etextno, actual,
                    "didn't retrieve {etextno} when querying for books "
                    'that have {feature}="{feature_value}" (got {actual}).'
                    .format(etextno=testcase.etextno,
                            feature=feature,
                            feature_value=feature_value,
                            actual=actual))
def get_books_by_lang():
    try:
        bookids = list(get_etexts('language', 'fr'))
        if args.random:
            shuffle(bookids)
        return bookids
    except InvalidCacheException:
        print("""
        You need to create a Gutenberg cache first. Run this in your venv:

        python -c 'from gutenberg.acquire import get_metadata_cache; get_metadata_cache().populate();'

        It might take a few hours.
        """)
        return list()
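For reference, the one-time cache population that the error message above points at can also be done from a short script, using the same gutenberg API the message quotes:

from gutenberg.acquire import get_metadata_cache

cache = get_metadata_cache()
cache.populate()  # one-time setup; can take several hours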
def count_words(args: argparse.Namespace) -> None:
    """Count the words in all Gutenberg books for a given language."""
    # Pull the list of book IDs
    if not args.quiet:
        print("Processing Project Gutenberg books...")
    etexts = get_etexts("language", args.language)
    etexts_iter = tqdm.tqdm(list(etexts)) if not args.quiet else etexts

    # Load each book and count the words
    word_counts = collections.Counter()
    etexts = []
    failed_etexts = []
    for i, etext in enumerate(etexts_iter):
        try:
            etexts.append(load_etext_from_cache(etext))
        except GutenbergError as e:
            failed_etexts.append(etext)
            print("Failure: ", e)
            continue

        # For efficiency, only periodically turn the texts into word counts
        if i % PROCESS_CHUNK_SIZE == 0:
            word_counts += _count_words_in_etexts(etexts)
            etexts = []

            # Also trim the least common words, since they're usually
            # gibberish and it's helpful to keep memory pressure down
            word_counts = collections.Counter(
                dict(word_counts.most_common(MAX_WORD_COUNT_LENGTH))
            )

    word_counts += _count_words_in_etexts(etexts)
    del word_counts[""]

    # Output the word counts to a file
    if not args.quiet:
        print(
            f"Failed to download {len(failed_etexts)} books. (A few of these are "
            "normal, as some books have no text.)"
        )
        print(f'--- Failed: {", ".join(str(etext) for etext in failed_etexts)}')
        print("Writing word counts to disk...")
    _output_word_counts(word_counts, args.output)
    if not args.quiet:
        print(f"Done! See word counts in {args.output}.")
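The _count_words_in_etexts helper used above is not part of the snippet. A minimal, hypothetical sketch of the behaviour count_words relies on (tallying whitespace-split, lower-cased tokens; the original may tokenize differently) might look like this:

import collections
from typing import Iterable

def _count_words_in_etexts(etexts: Iterable[str]) -> collections.Counter:
    # Tally whitespace-separated, lower-cased tokens across all texts.
    counts = collections.Counter()
    for text in etexts:
        counts.update(word.lower() for word in text.split())
    return counts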
def prime_text_cache(args: argparse.Namespace) -> None:
    """
    Primes the Project Gutenberg text cache so text retrieval is entirely local.

    This will download all Gutenberg book texts onto your local machine, which
    will take many hours and ~10-20GB.
    """
    if not args.quiet:
        print("Downloading Project Gutenberg book texts...")
    etexts = get_etexts("language", args.language)

    # Cycle through mirrors so as not to overload anyone's servers and get rate-limited
    etexts_with_mirrors = list(zip(etexts, itertools.cycle(MIRRORS)))
    etexts_iter = (
        tqdm.tqdm(etexts_with_mirrors) if not args.quiet else etexts_with_mirrors
    )

    success_count = 0
    total_count = 0
    try:
        for etext, mirror in etexts_iter:
            total_count += 1
            try:
                load_etext(etext, mirror=mirror)
                success_count += 1
            except GutenbergError as e:
                if not args.quiet:
                    print(f"Failure (mirror: {mirror}) ", e)
                continue
    except KeyboardInterrupt:
        pass
    except Exception:
        print("Error with mirror: ", mirror, etext)
        raise

    if not args.quiet:
        print(f"{success_count} / {total_count} books downloaded to cache")
        print("Done!")
""" Created on Wed Aug 12 18:06:45 2015 @author: Tony Description: Pull etext numbers from Project Gutenberg for an author 1) First pip install gutenberg 0.4.0 library for Python from the command line """ from gutenberg.query import get_etexts from gutenberg.query import get_metadata from gutenberg.acquire import load_etext from gutenberg.cleanup import strip_headers # get the catalogue numbers of all the texts # by Wilhelm Grimm in Project Gutenberg bookList=get_etexts('author', 'Grimm, Wilhelm Carl') # gives bookList = [12704, 12705, 12706, 12707, 12708, 12709, 12710, 37381, 20051, 28044, 30510, 22555, 20050, 11027, 16846, 12250, 20027, 19068, 2591] #Once We can associate a number with a title we can pull the text for number in bookList: print(number,get_metadata('title',number)) print('\n HHHHHHHHHHHHHHH Now for the full text HHHHHHHHHHHHHHHHHHH \n') # Once we have the text number we can print the text # example 11027 is the number for Grimm's Fairy Stories # can be tempermental truncating text at top (console limit?) may need to trick around etext = strip_headers(load_etext(11027)).strip() print(etext)
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 20 13:05:59 2015

@author: weizhi
"""

from gutenberg.query import get_etexts
from gutenberg.query import get_metadata

print(get_metadata('title', 2701))   # prints 'Moby Dick; Or, The Whale'
print(get_metadata('author', 2701))  # prints 'Melville, Hermann'

print(get_etexts('title', 'Moby Dick; Or, The Whale'))  # prints (2701, ...)
print(get_etexts('author', 'Melville, Hermann'))        # prints (2701, ...)
        return URIRef(value)


class LanguageExtractor(_SimplePredicateRelationshipExtractor):
    """Extracts book languages."""

    @classmethod
    def feature_name(cls):
        return 'language'

    @classmethod
    def predicate(cls):
        return DCTERMS.language / RDF.value

    @classmethod
    def contains(cls, value):
        return Literal(value)


if __name__ == '__main__':
    from gutenberg.acquire.metadata import set_metadata_cache, SleepycatMetadataCache
    cache = SleepycatMetadataCache('/Users/deanjones/gutenberg_data')
    set_metadata_cache(cache)

    from gutenberg.query import get_etexts
    # texts = gutenberg.query.api.get_etexts('language', 'en')
    # print(len(texts))
    print(get_etexts('language', 'en'))
def search_by_title(title):
    result = q.get_etexts('title', title)
    return list(result)
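Usage sketch, assuming gutenberg.query has been imported as q and the metadata cache is populated:

import gutenberg.query as q

print(search_by_title('Moby Dick; Or, The Whale'))  # e.g. [2701, ...]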
print("Hello. Welcome to the Gutenberg Analyser.") print("We begin by downloading the relevant texts for each author.") print("\n\n\n") print("Enter the number of authors whose works you want to download: ") n = int(input()) for j in range(n): print( "Enter the name of the author. Please make sure that string that you enter matches the author name on Project Gutenberg exactly" ) author = input() print("Name entered by you is: ", author) print("Loading books.....") originalList = (get_etexts('author', author)) dictionaryOfNames = OrderedDict( ) #contains names of the books and language of the book listOfTexts = [] #contains book number for i in originalList: try: text = strip_headers(load_etext(i)).strip() title = set(get_metadata('title', i)) lanugage = set(get_metadata('language', i)) dictionaryOfNames[title.pop()] = lanugage.pop() listOfTexts.append(i) except: pass #print("error found in download number",i)