def search(self):
    """Search the Gutenberg cache for titles matching ``self.titleQuery``.

    Reads ``self.titleQuery`` (the text to look for) and ``self.nbr_results``
    (the maximum number of rows to fetch). On success the matching rows are
    stored in ``self.searchResults``, ``self.titleLabels`` is rebuilt from
    them, the info box reports how many rows were found and the related
    controls are re-enabled. With an empty query only a warning is shown.
    """
    query_string = self.titleQuery
    if not query_string:
        self.infoBox.setText("You didn't search anything", "warning")
        return

    # native_query() is handed a finished SQL string, so the user's text ends
    # up inside a quoted literal: double every single quote so it cannot
    # terminate the literal, and force the limit to be an integer.
    safe_query = query_string.replace("'", "''")
    limit = int(self.nbr_results)

    # Look the titles up in the Gutenberg cache.
    cache = GutenbergCache.get_cache()
    query_results = cache.native_query(
        sql_query=(
            "select * from titles "
            "where upper(name) like upper('%{query}%') limit {limit}"
        ).format(query=safe_query, limit=limit)
    )
    self.searchResults = list(query_results)

    # Report the number of hits ("1 result has", "0 results have", ...).
    n_results = len(self.searchResults)
    self.infoBox.setText(
        "{n} result{s} {verb} been found".format(
            n=n_results,
            s="" if n_results == 1 else "s",
            verb="has" if n_results == 1 else "have",
        )
    )

    # Rebuild the labels from scratch: appending to the labels of a previous
    # search would leave them out of step with self.searchResults.
    # Column 1 of each row is used as the label (presumably the title name
    # of the `titles` table -- confirm against the cache schema).
    del self.titleLabels[:]
    self.titleLabels.extend(str(row[1]) for row in self.searchResults)
    # Re-assign the attribute so that the change is noticed by whatever
    # watches it (kept from the original code).
    self.titleLabels = self.titleLabels

    self.clearButton.setDisabled(False)
    self.addButton.setDisabled(not self.selectedTitles)
    self.controlArea.setDisabled(False)
def sendData(self):
    """Compute result of widget processing and send to output.

    Downloads every book listed in ``self.myBasket`` through gutenbergpy,
    wraps each text in an ``Input``, merges them into ``self.segmentation``,
    annotates each segment with a ``"title"`` key and sends the result on
    the "Gutenberg importation" channel. If the basket is empty, or if
    anything fails while fetching the texts, a message is shown in the info
    box and nothing is sent.
    """
    # Skip if title list is empty:
    if not self.myBasket:
        self.infoBox.setText(
            "Your corpus is empty, please add some books first",
            "warning")
        return

    # Clear created Inputs.
    self.clearCreatedInputs()
    self.controlArea.setDisabled(True)

    # Initialize progress bar.
    progressBar = ProgressBar(
        self,
        iterations=len(self.myBasket),
    )

    text_content = list()
    annotations = list()

    # Get the Gutenberg cache.
    cache = GutenbergCache.get_cache()

    try:
        # Retrieve the selected texts from Gutenberg.
        # NOTE(review): entries are read as text[1] = title label and
        # text[2] = id in the `books` table -- confirm against the code
        # that fills self.myBasket.
        for text in self.myBasket:
            # Translate the cache-internal id into the Gutenberg book id.
            query_id = cache.native_query(
                sql_query=
                "select gutenbergbookid from books where id == {selected_id}"
                .format(selected_id=text[2]))
            gutenberg_id = list(query_id)

            # Get the text with gutenbergpy and drop the Gutenberg headers.
            # NOTE(review): check whether strip_headers() returns bytes and
            # whether Input expects str -- a decode may be needed here.
            gutenberg_text = gutenbergpy.textget.strip_headers(
                gutenbergpy.textget.get_text_by_id(gutenberg_id[0][0]))

            text_content.append(gutenberg_text)
            annotations.append(text[1])
            progressBar.advance()

    # If an error occurs (e.g. http error, memory error, unknown id)...
    except Exception:
        # Set Info box and widget to "error" state.
        self.infoBox.setText("Couldn't download data from Gutenberg", "error")
        # Close the progress bar here too, otherwise it stays open after
        # a failed download.
        progressBar.finish()
        self.controlArea.setDisabled(False)
        return

    # Store the downloaded texts in Input objects...
    for text in text_content:
        self.createdInputs.append(Input(text, self.captionTitle))

    # If there is only one text, the widget's output is the created Input.
    if len(self.createdInputs) == 1:
        self.segmentation = self.createdInputs[0]

    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            self.createdInputs,
            self.captionTitle,
            import_labels_as=None,
        )

    # Annotate segments with the title collected for the same position.
    for idx, segment in enumerate(self.segmentation):
        segment.annotations.update({"title": annotations[idx]})
        self.segmentation[idx] = segment

    # Clear progress bar.
    progressBar.finish()
    self.controlArea.setDisabled(False)

    # Set status to OK and report data size...
    message = "%i segment@p sent to output " % len(self.segmentation)
    message = pluralize(message, len(self.segmentation))
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += "(%i character@p)." % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)

    self.send("Gutenberg importation", self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
import gutenbergpy.textget
from gutenbergpy.gutenbergcache import GutenbergCache

# Build the cache from scratch: every creation step is switched on.
GutenbergCache.create(
    refresh=True,
    download=True,
    unpack=True,
    parse=True,
    cache=True,
    deleteTemp=True,
)

# Fetch the default cache (SQLite).
cache = GutenbergCache.get_cache()

# query() can filter on the following fields:
# languages, authors, types, titles, subjects, publishers, bookshelves.
# Here the books are selected by download type.
download_types = [
    'application/plain',
    'text/plain',
    'text/html; charset=utf-8',
]
print(cache.query(downloadtype=download_types))

# Print the text of book 1000 with its headers stripped.
raw_text = gutenbergpy.textget.get_text_by_id(1000)
print(gutenbergpy.textget.strip_headers(raw_text))
def search(self):
    """Search the Gutenberg cache by title, author and language.

    Reads ``self.titleQuery``, ``self.authorQuery``, ``self.langQuery``
    (a key of ``self.lang_dict``) and ``self.nbr_results`` (maximum number
    of rows). Each hit is stored in ``self.searchResults`` as
    ``[title, author, gutenberg book id, language]``, the result labels are
    rebuilt and the info box reports the number of hits. When no criterion
    is set at all, only a warning is shown; when the cache query fails, an
    error is shown and the previous results are left untouched.
    """
    query_string = self.titleQuery
    query_author = self.authorQuery
    language = self.lang_dict[self.langQuery]

    # Tell the user that no search criterion was given at all.
    if self.langQuery == 'Any' and query_string == '' and self.authorQuery == '':
        self.infoBox.setText(
            "You can't search only by language, if it's set to Any",
            "warning")
        return

    # Recode a two-word author "first_name name" as "name%, first_name",
    # the order that the later ", " split on the returned names relies on;
    # the % wildcard keeps the match permissive.
    name_parts = query_author.split()
    if len(name_parts) == 2 and "," not in query_author:
        query_author = "%, ".join(reversed(name_parts))

    def quote_safe(text):
        # native_query() is handed a finished SQL string, so user text ends
        # up inside a single-quoted literal: double every single quote so
        # it cannot terminate that literal.
        return str(text).replace("'", "''")

    limit = int(self.nbr_results)

    # String literals use single quotes: SQLite reads double-quoted text as
    # an identifier first. The CTE keeps exactly one author per book (the
    # one with the greatest id); filtering on "authorid IN (all maxima)"
    # could keep several authors for a book and duplicate its row.
    sql_query = """
        /* One author per book: the greatest author id of each book */
        WITH unique_book_author AS
            (SELECT bookid, MAX(authorid) AS authorid
             FROM book_authors
             GROUP BY bookid)

        /* Selects title, author, gutenberg id and language */
        SELECT titles.name, authors.name, books.gutenbergbookid, languages.name
        FROM titles

        /* Merges every needed table into one on shared attributes */
        INNER JOIN books ON books.id = titles.bookid
        INNER JOIN unique_book_author ON books.id = unique_book_author.bookid
        INNER JOIN authors ON authors.id = unique_book_author.authorid
        INNER JOIN languages ON books.languageid = languages.id

        /* Matches the user's query, % wildcards make it more permissive */
        WHERE upper(titles.name) LIKE '%{title}%'
        AND upper(authors.name) LIKE '%{author}%'
        AND languages.name LIKE '%{lang}%'
        LIMIT {limit}
    """.format(
        title=quote_safe(query_string),
        author=quote_safe(query_author),
        lang=quote_safe(language),
        limit=limit,
    )

    # Look the query up in the Gutenberg cache.
    cache = GutenbergCache.get_cache()
    try:
        query_results = cache.native_query(sql_query=sql_query)
    except Exception as exc:
        print(exc)
        self.infoBox.setText(
            "An error occurred while interrogating the cache.",
            "error")
        return

    rows = list(query_results)

    # Reverse view of lang_dict (stored value -> displayed key), built once
    # instead of once per row. The first key wins when values repeat, as in
    # the original per-row lookup.
    lang_names = {}
    for key, value in self.lang_dict.items():
        lang_names.setdefault(value, key)

    # Clean the rows up for display.
    self.searchResults = list()
    for row in rows:
        result = list(row)
        # Replace every run of newline characters in the title.
        result[0] = re.sub(r'[\n\r]+', r', ', result[0])
        # Recode the author from "name, first_name" to "first_name name".
        result[1] = " ".join(result[1].split(", ")[::-1])
        # Show the lang_dict key for the language; an unknown language is
        # shown as stored instead of raising IndexError.
        result[3] = lang_names.get(result[3], result[3])
        self.searchResults.append(result)

    # Report the number of hits ("1 result has", "0 results have", ...).
    n_results = len(self.searchResults)
    self.infoBox.setText(
        "{n} result{s} {verb} been found".format(
            n=n_results,
            s="" if n_results == 1 else "s",
            verb="has" if n_results == 1 else "have",
        )
    )

    self.clearResults()

    # Update the results list with the search results
    # in order to display them.
    for result in self.searchResults:
        self.titleLabels.append(
            "{title} — {author} — {lang}".format(
                title=result[0], author=result[1], lang=result[3]))
    # Re-assign the attribute so that the change is noticed by whatever
    # watches it (kept from the original code).
    self.titleLabels = self.titleLabels

    self.clearButton.setDisabled(False)
    self.addButton.setDisabled(not self.selectedTitles)
    self.controlArea.setDisabled(False)