def find_book_matches_at_site(self, book_data):
    """
    args:
        book_data:
            search terms; handed to the search-link builder and, at the
            end, to Par_Scrape.site_book_data_relevancy
    returns:
        None when no search link could be produced for book_data.
        Otherwise the value returned by
        Par_Scrape.site_book_data_relevancy(book_data, records), where
        records are the scraped site_book_data entries with duplicates
        removed.
    synopsis:
        Builds the search links for book_data, collects the book links
        found on every search page, scrapes each book link concurrently
        with get_book_data_from_site, keeps only the first record for
        each distinct value of element 13, and passes the survivors to
        Par_Scrape for relevancy rating.
    """
    search_urls = self.__get_search_link_from_book_data(book_data)
    if not search_urls:
        # This variant signals "nothing to search" with None rather than
        # an empty list; kept as-is because callers may test for it.
        return None

    scraped_books = []
    for search_url in search_urls:
        book_links = self.__get_book_links_froms_search_site(search_url)
        if book_links is None:
            continue
        # One task per book link; results are gathered in completion
        # order, so their order is not tied to the order of book_links.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            pending = [
                executor.submit(self.get_book_data_from_site, book_link)
                for book_link in book_links
            ]
            for finished in concurrent.futures.as_completed(pending):
                # result() re-raises any exception from the worker.
                scraped_books.append(finished.result())

    # Element 13 is the de-duplication key (presumably the book's link --
    # TODO confirm). A set gives O(1) membership tests; the key must be
    # hashable. The first record seen for each key wins.
    seen_keys = set()
    unique_books = []
    for site_book in scraped_books:
        key = site_book[13]
        if key not in seen_keys:
            seen_keys.add(key)
            unique_books.append(site_book)

    return Par_Scrape.site_book_data_relevancy(book_data, unique_books)
def find_book_matches_at_site(self, book_data):
    """
    args:
        book_data (List[]):
            book_data that will be used to search the target website
    returns:
        site_book_data_list ([[SiteBookData[], rating],...]):
            a list of site_book_data's with their relevant ratings, as
            produced by Par_Scrape.site_book_data_relevancy. An empty
            list is returned when no search link could be built.
    synopsis:
        The purpose of this function is to use a book_data object to
        search Scribd.com for related book_data objects (known as
        site_book_data objects), drop duplicates, and then have
        Par_Scrape rate them by how related they are to the book_data
        object.
    """
    # Perform whatever form making the website needs in order to get the
    # relevant search links.
    search_urls = self.__get_search_link_from_book_data_form(book_data)
    if not search_urls:
        return []

    # \/ \/ the following should not change \/ \/
    scraped_books = []
    for search_url in search_urls:
        book_links = self.__get_book_links_from_search_site(search_url)
        if book_links is None:
            continue
        # One task per book link; results are gathered in completion
        # order, so their order is not tied to the order of book_links.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            pending = [
                executor.submit(self.get_book_data_from_site, book_link)
                for book_link in book_links
            ]
            for finished in concurrent.futures.as_completed(pending):
                # result() re-raises any exception from the worker.
                scraped_books.append(finished.result())

    # Element 13 is the de-duplication key (presumably the book's link --
    # TODO confirm). A set gives O(1) membership tests; the key must be
    # hashable. The first record seen for each key wins.
    seen_keys = set()
    unique_books = []
    for site_book in scraped_books:
        key = site_book[13]
        if key not in seen_keys:
            seen_keys.add(key)
            unique_books.append(site_book)

    return Par_Scrape.site_book_data_relevancy(book_data, unique_books)
def find_book_matches_at_site(self, book_data):
    """
    args:
        book_data (List[]):
            book_data that will be used to search the target website.
            Element 0 is the requested format: the search only runs when
            it is None or equals "DIGITAL" or "PRINT" (case-insensitive).
    returns:
        site_book_data_list ([[SiteBookData[], rating],...]):
            a list of site_book_data's with their relevant ratings, as
            produced by Par_Scrape.site_book_data_relevancy. An empty
            list is returned for any other format, or when no search
            link could be built.
    synopsis:
        The purpose of this function is to use a book_data object to
        search googlebooks for related book_data objects (known as
        site_book_data objects), and then have Par_Scrape rate them by
        how related they are to the book_data object.
    """
    requested_format = book_data[0]
    if (requested_format is not None
            and requested_format.upper() not in ("DIGITAL", "PRINT")):
        return []

    # Perform whatever form making the website needs in order to get the
    # relevant search links; a falsy result means there is no search page.
    search_urls = self._get_search_link_from_book_data_form(book_data)
    if not search_urls:
        return []

    scraped_books = []
    for search_url in search_urls:
        book_links = self._get_book_links_from_search_site(search_url, 0)
        if book_links is None:
            continue
        # One task per book link; results are gathered in completion
        # order, so their order is not tied to the order of book_links.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            pending = [
                executor.submit(self.get_book_data_from_site, book_link)
                for book_link in book_links
            ]
            for finished in concurrent.futures.as_completed(pending):
                # result() re-raises any exception from the worker.
                scraped_books.append(finished.result())

    return Par_Scrape.site_book_data_relevancy(book_data, scraped_books)
def find_book_matches_at_site(self, book_data):
    """
    args:
        book_data:
            search terms; handed to get_search_link_from_book_data_form
            and, at the end, to Par_Scrape.site_book_data_relevancy
    returns:
        None when get_search_link_from_book_data_form returns None.
        Otherwise the value returned by
        Par_Scrape.site_book_data_relevancy(book_data, records), where
        records are the site_book_data entries scraped from every book
        link found on the fetched search pages.
    synopsis:
        Builds the base search link for book_data, downloads search
        result pages 1 and 2 (by appending "&pageNumber=<n>"), extracts
        the book links from each page, scrapes all of them concurrently
        with get_book_data_from_site, and passes the results to
        Par_Scrape for relevancy rating.
    """
    url = self.get_search_link_from_book_data_form(book_data)
    if url is None:
        # This variant signals "nothing to search" with None rather than
        # an empty list; kept as-is because callers may test for it.
        return None

    # Get the links from the first 2 pages of search results
    # (range(1, 3) yields page numbers 1 and 2).
    total_relevant_book_links = []
    for page_number in range(1, 3):
        page_url = url + "&pageNumber=" + str(page_number)
        # NOTE(review): no timeout is passed, so a stalled server blocks
        # this call indefinitely -- consider adding one.
        response = requests.get(page_url)
        page_links = self._get_book_links_from_search_site(response.content)
        if page_links is not None:
            total_relevant_book_links += page_links

    # One task per book link; results are gathered in completion order,
    # so their order is not tied to the order of the links.
    site_book_data_list = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        pending = [
            executor.submit(self.get_book_data_from_site, book_link)
            for book_link in total_relevant_book_links
        ]
        for finished in concurrent.futures.as_completed(pending):
            # result() re-raises any exception from the worker.
            site_book_data_list.append(finished.result())

    # sort by relevancy
    return Par_Scrape.site_book_data_relevancy(book_data, site_book_data_list)
def find_book_matches_at_site(self, book_data):
    """
    args:
        book_data (List[]):
            represents the search terms and is needed to fill out the
            test bookstore web form. Elements 1, 4 and 9 are the terms
            placed in the search box; each may be None.
    returns:
        relevancy_list ([[[SiteBookData],float]]):
            relevancy_list is a list of lists including a float and
            SiteBookData, with the float representing how closely the
            SiteBookData matches the original search terms (book_data).
            An empty list is returned when all three search terms are
            None, when the test bookstore cannot be opened, or when the
            "searcher" form control is not a text field.
    synopsis:
        The purposes of this function is to:
            1) Use Mechanize to get relevant book detail links from the
               test bookstore.
            2) Parse SiteBookData based on those links.
            3) Put SiteBookData into a list sorted based on how similar
               it is to the original search terms, along with a float
               that quantifies that similarity.
    """
    # Validate the search terms before doing any network I/O: with
    # nothing to search for there is no reason to open the bookstore.
    first_term, second_term, last_term = book_data[1], book_data[4], book_data[9]
    if first_term is None and second_term is None and last_term is None:
        return []

    # The first two terms are each followed by a single space; the last
    # term is appended without one.
    search_string = ''
    for term in (first_term, second_term):
        if term is not None:
            search_string += term + ' '
    if last_term is not None:
        search_string += last_term

    br = mechanize.Browser()
    try:
        br.open('http://localhost:8000/bookstore/')
    except Exception:
        # Best effort: an unreachable bookstore yields no matches. Only
        # Exception is caught so Ctrl-C / SystemExit still propagate.
        print("\nError accessing Test Bookstore. Please make sure it is running.\n")
        return []

    br.select_form(nr=0)
    control = br.form.find_control("searcher")
    if control.type != "text":
        return []

    control.value = search_string
    br.submit()

    relevant_book_links = self._navigate_pages(br, 3)

    # Get site_book_data from the book links and place it into a list.
    # One task per link; results are gathered in completion order.
    site_book_data_list = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        pending = [
            executor.submit(self.get_book_data_from_site, book_link)
            for book_link in relevant_book_links
        ]
        for finished in concurrent.futures.as_completed(pending):
            # result() re-raises any exception from the worker.
            site_book_data_list.append(finished.result())

    # sort by relevancy
    return Par_Scrape.site_book_data_relevancy(book_data, site_book_data_list)