Пример #1
0
    def _get_book_subtitle(self, content):
        """
        args:
            content (requests.get):
                content is needed in order to scrape the book's
                subtitle.
        returns:
            subtitle (String):
                the subtitle of the book that is being scraped,
                or None when no subtitle is found.
        synopsis:
            The purpose of this function is to determine what
            the book's subtitle is.
        """

        try:
            subtitle = Par_Scrape.parse(content, "//*[@class='subtitle']/text()")

            # Compensate for google books' placement of the text(): if the
            # text sits in a deeper <span> tag, fall back to that xpath.
            if not subtitle:
                subtitle = Par_Scrape.parse(content, "//*[@class='subtitle']/span[@dir='ltr']/text()")

            if not subtitle:
                return None

            # Join all text fragments into one subtitle string.
            return "".join(subtitle)
        except Exception:
            # Best-effort scrape: any parse failure means "no subtitle".
            return None
    def find_book_matches_at_site(self, book_data):
        """
        Search the site for books matching book_data and return the
        scraped results sorted by relevancy.

        args:
            book_data (List[]):
                book data used to build the search urls.
        returns:
            relevancy-sorted list of site_book_data entries, or None
            when no search urls could be formed.
        """
        urls_gotten_from_form = self.__get_search_link_from_book_data(book_data)

        if not urls_gotten_from_form:
            return None

        site_book_data_total = []

        for url in urls_gotten_from_form:
            relevant_book_links = self.__get_book_links_froms_search_site(url)
            if relevant_book_links is not None:
                # Scrape each book page concurrently; completion order does
                # not matter since results are relevancy-sorted afterwards.
                with concurrent.futures.ThreadPoolExecutor() as executor:
                    futures = [
                        executor.submit(self.get_book_data_from_site, book_link)
                        for book_link in relevant_book_links
                    ]
                    for future in concurrent.futures.as_completed(futures):
                        site_book_data_total.append(future.result())

        # De-duplicate results by link; a set gives O(1) membership checks
        # (index 13 presumably holds the book's url -- TODO confirm against
        # the SiteBookData layout).
        seen_links = set()
        cleaned_site_book_data_total = []
        for site_book in site_book_data_total:
            if site_book[13] not in seen_links:
                seen_links.add(site_book[13])
                cleaned_site_book_data_total.append(site_book)

        return Par_Scrape.site_book_data_relevancy(book_data, cleaned_site_book_data_total)
Пример #3
0
    def _form_search_submission(self, search):
        """
        args:
            search (String):
                This is the parameter that will be searched for in the
                bookstore.
        returns:
            link (String):
                This is the link that was generated based upon the
                search parameter.
            None:
                when the form control is not usable or the search
                produced google's "no results" message.
        synopsis:
            The purpose of this function is to check whether or not
            a search link is valid.  If it is, then return the link,
            otherwise return None.
        """
        # Small delay to avoid hammering the site with form submissions.
        time.sleep(.3)
        br = mechanize.Browser()
        br.set_handle_robots(False)
        br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
        br.open("https://books.google.com/")
        br.form = list(br.forms())[0]
        control = br.form.controls[1]
        if control.type != "text":
            return None
        control.value = search
        br.submit()
        link = br.geturl()

        # The span below is google's "no results found" indicator; its
        # presence means the link is not a valid results page.
        test_validity = requests.get(link)
        returned = Par_Scrape.parse(test_validity.content, "//span[@class='JZCD0c r0bn4c rQMQod']")
        if returned:
            return None

        return link
    def _get_book_description(self, content):
        """
        args:
            content (requests.get):
                content is needed in order to scrape the book's
                description.
        returns:
            description (String):
                description is the book's description that is
                being scraped, or None when none is found.
        synopsis:
            The purpose of this function is to determine what
            the book's description is.
        """

        try:
            description = Par_Scrape.parse(content, self.content_table + "//div[@class='book-description']/p/text()")

            # Handle "no description" explicitly rather than relying on the
            # IndexError that description[0] used to raise.
            if not description:
                return None

            if len(description) > 1:
                # Removal of HTML. Does miss some '\' characters.
                parts = []
                for part in description:
                    text_part = BeautifulSoup(part, features='lxml').getText()
                    if text_part != '':
                        parts.append(text_part)
                # Each fragment keeps a trailing space, as before.
                return "".join(p + " " for p in parts)

            return description[0]
        except Exception:
            return None
    def _get_book_authors(self, content):
        """
        args:
            content (requests.get):
                content is needed in order to get the authors'
                names.
        returns:
            authors (String):
                the book's author(s), comma-separated when there is
                more than one, or None when none are found.
        synopsis:
            The purpose of this function is to determine what the
            authors are for the book being scraped.
        """

        try:
            authors = Par_Scrape.parse(content, self.content_table + "//h4[@class='book-written-by']//a/text()")

            if not authors:
                return None

            # join() fixes the old value-comparison bug where a duplicate
            # author name (== last element) dropped the ", " mid-list.
            return ", ".join(authors)
        except Exception:
            return None
Пример #6
0
    def __get_book_description(self, content):
        """
        args:
            content (requests.get):
                content is needed in order to scrape the book's
                description.
        returns:
            description (String):
                description is the book's description that is
                being scraped, or None when it cannot be scraped.
        synopsis:
            The purpose of this function is to determine what
            the book's description is.
        """

        try:
            description = Par_Scrape.parse(content, (
                self.meta_Father_Type +
                "/following-sibling::meta[@property='og:description']/@content"
            ))[0]
            # Normalize escapes in one C-level pass instead of four chained
            # .replace() calls: drop \n, \t and ', map nbsp to a space.
            return description.translate(
                str.maketrans({"\n": "", "\t": "", "'": "", "\xa0": " "}))
        except Exception:
            # IndexError when the meta tag is absent, or a parse failure.
            return None
Пример #7
0
    def _get_book_authors(self, content):
        """
        args:
            content (requests.get):
                content is needed in order to get the authors'
                names.
        returns:
            authors (String):
                the book's author(s), space-separated with a trailing
                space, or None when none are found.
        synopsis:
            The purpose of this function is to determine what the
            authors are for the book being scraped.
        """

        try:
            authors = Par_Scrape.parse(content, (
                self.content_table +
                "/tr[@class='metadata_row']/td[@class='metadata_label']"
                "/span[contains(text(), 'Author')]"
                "/../following-sibling::td/a/span/text()"
            ))

            if not authors:
                return None

            # Multiple authors are concatenated; the trailing space matches
            # the original accumulator behavior.
            return "".join(author + " " for author in authors)
        except Exception:
            return None
Пример #8
0
    def _get_book_description(self, content):
        """
        args:
            content (requests.get):
                content is needed in order to scrape the book's
                description.
        returns:
            description (String):
                description is the book's description that is
                being scraped, or None when none is found.
        synopsis:
            The purpose of this function is to determine what
            the book's description is.
        """
        try:
            desc_parts = Par_Scrape.parse(content, "//*[@id='synopsistext']//text()")

            # parse() may return nothing due to google books' dynamic html
            # formatting of descriptions; treat that as "no description".
            if not desc_parts:
                return None

            # Concatenate all the text fragments between the HTML tags.
            return "".join(desc_parts)
        except Exception:
            return None
Пример #9
0
 def _get_book_isbn_13(self, content):
     """
     args:
         content (requests.get):
             content is needed in order to scrape the book's
             isbn_13.
     returns:
         isbn_13 (String):
             isbn_13 is the book's isbn_13 that is being
             scraped, or None when it cannot be found.
     synopsis:
         The purpose of this function is to determine the
         book's isbn_13.
     """

     try:
         data = Par_Scrape.parse(content, (
             self.content_table +
             "/tr[@class='metadata_row']/td[@class='metadata_label']"
             "/span[contains(text(), 'ISBN')]"
             "/../following-sibling::td/span/text()"
         ))

         # Explicit empty check instead of an IndexError on data[0].
         if not data:
             return None

         # Google books displays both isbn_10 and isbn_13 comma-separated;
         # pick the entry whose length is 13.
         for candidate in (x.strip() for x in data[0].split(',')):
             if len(candidate) == 13:
                 return candidate
         return None
     except Exception:
         return None
Пример #10
0
    def _get_book_description(self, content):
        """
        args:
            content (requests.get):
                content is needed in order to scrape the book's
                description.
        returns:
            description (String):
                description is the book's description that is
                being scraped, or None when it cannot be scraped.
        synopsis:
            The purpose of this function is to determine what
            the book's description is.
        """

        try:
            # Skip the first <p>, then join the remaining paragraph texts.
            description_elements = Par_Scrape.parse(content, ".//div[@class='right']/p")[1:]
            description = "".join(
                element.text + ' '
                for element in description_elements
                if element.text is not None
            )
            # Strip any residual HTML from the combined text.
            soup = BeautifulSoup(description, features='lxml')
            return soup.get_text().strip()
        except Exception:
            return None
 def __get_authors__(self, content):
     """
     Return the author scraped from the 'Colaborador' field, or None
     when the field is absent or parsing fails.
     """
     try:
         authors_element = Par_Scrape.parse(content, "//*[@class='value-field Colaborador']")[0]
         # The field text ends with "Autor:<name>"; keep what follows the
         # last "Autor:" marker.
         return authors_element.text.split("Autor:")[-1]
     except Exception:
         # IndexError when the field is missing, or a parse failure.
         return None
 def __get_subtitle__(self, content):
     """
     Return the book's subtitle from the 'Subtitulo' field, or None
     when the field is absent or parsing fails.
     """
     try:
         subtitle_elements = Par_Scrape.parse(content, "//*[@class='value-field Subtitulo']")
         if subtitle_elements:
             return subtitle_elements[0].text
         # Explicit None instead of the old implicit fall-through.
         return None
     except Exception:
         return None
Пример #13
0
    def find_book_matches_at_site(self, book_data):
        """
        args:
            book_data (List[]):
                book_data that will be used to search the target
                website.
        returns:
            site_book_data_list ([[SiteBookData[], rating],...]):
                a list of site_book_data's with their relevant ratings.
        synopsis:
            The purpose of this function is to use a book_data object,
            and then use that to search Scribd.com for related
            book_data objects (known as site_book_data objects),
            and then sort them in order of how related they are to
            the book_data object.
        """

        # Perform whatever form making for the website in order to get a
        # relevant search link (fixes the old "_from_from" local typo).
        urls_gotten_from_form = self.__get_search_link_from_book_data_form(
            book_data)

        if not urls_gotten_from_form:
            return []

        site_book_data_total = []

        for url in urls_gotten_from_form:
            relevant_book_links = self.__get_book_links_from_search_site(url)
            if relevant_book_links is not None:
                # Scrape each book page concurrently.
                with concurrent.futures.ThreadPoolExecutor() as executor:
                    futures = [
                        executor.submit(self.get_book_data_from_site, book_link)
                        for book_link in relevant_book_links
                    ]
                    for future in concurrent.futures.as_completed(futures):
                        site_book_data_total.append(future.result())

        # De-duplicate by link with a set for O(1) membership checks
        # (index 13 presumably holds the book's url -- TODO confirm
        # against the SiteBookData layout).
        seen_links = set()
        cleaned_site_book_data_total = []
        for site_book in site_book_data_total:
            if site_book[13] not in seen_links:
                seen_links.add(site_book[13])
                cleaned_site_book_data_total.append(site_book)

        return Par_Scrape.site_book_data_relevancy(
            book_data, cleaned_site_book_data_total)
Пример #14
0
    def find_book_matches_at_site(self, book_data):
        """
        args:
            book_data (List[]):
                book_data that will be used to search the target
                website.
        returns:
            site_book_data_list ([[SiteBookData[], rating],...]):
                a list of site_book_data's with their relevant ratings.
        synopsis:
            The purpose of this function is to use a book_data object,
            and then use that to search googlebooks for related
            book_data objects (known as site_book_data objects),
            and then sort them in order of how related they are to
            the book_data object.
        """

        site_book_data_total = []

        # Only DIGITAL/PRINT (or unspecified) formats can be searched here.
        if book_data[0] is not None and book_data[0].upper() not in ("DIGITAL", "PRINT"):
            return site_book_data_total

        # Perform whatever form making for the website in order to get a
        # relevant search link.
        url_gotten_from_form = self._get_search_link_from_book_data_form(book_data)

        # Check to ensure the search page exists.
        if not url_gotten_from_form:
            return site_book_data_total

        for url in url_gotten_from_form:
            relevant_book_links = self._get_book_links_from_search_site(url, 0)
            if relevant_book_links is not None:
                # Scrape each book page concurrently.
                with concurrent.futures.ThreadPoolExecutor() as executor:
                    futures = [
                        executor.submit(self.get_book_data_from_site, book_link)
                        for book_link in relevant_book_links
                    ]
                    for future in concurrent.futures.as_completed(futures):
                        site_book_data_total.append(future.result())

        return Par_Scrape.site_book_data_relevancy(book_data, site_book_data_total)
Пример #15
0
    def _get_book_series(self, content):
        """
        args:
            content (requests.get):
                content is needed in order to get the series.
        returns:
            series (String):
                series is the book's series, or None when the book
                has no series row.
        synopsis:
            The purpose of this function is to determine what the
            series is for the book being scraped (if it exists).
        """

        try:
            return Par_Scrape.parse(content, ".//tr[td='Series:']/td[@class='bookDetail']")[0].text
        except Exception:
            # IndexError when the series row is absent, or a parse failure.
            return None
Пример #16
0
    def _get_book_id(self, content):
        """
        args:
            content (requests.get):
                content is needed in order to scrape the book's id.
        returns:
            id (String):
                id is the book's id, as determined by the website.
                The xpath reads the 'ISBN 13#:' row, so the id is the
                site-listed ISBN-13, or None when that row is absent.
        synopsis:
            The purpose of this function is to determine the id of
            the book that is being scraped.
        """

        try:
            return Par_Scrape.parse(content, ".//tr[td='ISBN 13#:']/td[@class='bookDetail']")[0].text
        except Exception:
            # IndexError when the ISBN row is absent, or a parse failure.
            return None
Пример #17
0
    def _get_book_title(self, content):
        """
        args:
            content (requests.get):
                content is required in order to scrape the book's
                title.
        returns:
            title (String):
                title is the book's title that is being scraped,
                or None when it cannot be found.
        synopsis:
            The purpose of this function is to determine what the book's
            title is.
        """

        try:
            return Par_Scrape.parse(content, self.content_table + "//h1[@class='audiobookTitle']/text()")[0]
        except Exception:
            # IndexError when the title element is absent, or a parse failure.
            return None
Пример #18
0
    def _get_book_url(self, content):
        """
        args:
            content (requests.get):
                content is needed in order to scrape the book's url.
        returns:
            url (String):
                url is book's url that is normally used, as determined
                by the website.
        synopsis:
            The purpose of this function is to determine what the book's
            url is that is being scraped.  This is required in order for
            functions to work properly.
        """

        try:
            # NOTE(review): unlike sibling getters this returns the full
            # parse() result list (no [0]) -- confirm callers expect a list.
            return Par_Scrape.parse(content, "//*[@class='bookcover']/a/@href")
        except Exception:
            return None
Пример #19
0
 def _get_book_sale_price(self, content):
     """
     args:
         content (Request.get):
             content is needed in order to scrape the audiobook's
             price if applicable.
     returns:
         price:
             This is the price of the audiobook.
         (None):
             If the price cannot be scraped.
     synopsis:
         The purpose of this function is to scrape for the
         audiobook's price and return it if applicable.
     """
     try:
         # NOTE(review): returns the raw parse() result list (no [0]) --
         # confirm callers expect a list rather than a single price string.
         return Par_Scrape.parse(content, self.content_table + "//div[@class='fleft button-text']/div/p/text()")
     except Exception:
         return None
Пример #20
0
    def _get_book_url(self, content):
        """
        args:
            content (requests.get):
                content is needed in order to scrape the book's url.
        returns:
            url (String):
                url is book's url that is normally used, as determined
                by the website.
        synopsis:
            The purpose of this function is to determine what the book's
            url is that is being scraped.  This is required in order for
            functions to work properly.
        """

        try:
            # NOTE(review): this line has been scrubbed to "*****" by a
            # credentials-redaction pass; as written it is not valid Python
            # and the original Par_Scrape.parse() call (url prefix + xpath)
            # cannot be reconstructed from here -- restore from VCS history.
            return "http://*****:*****@class='bookDetail']")[0].text + "/details"
        except:
            return None
Пример #21
0
    def _get_book_image_url(self, content):
        """
        args:
            content (requests.get):
                content is required in order to scrape the book image's
                url.
        returns:
            image_url (String):
                image_url is the book's url for the book's cover
                image, or None when it cannot be found.
        synopsis:
            The purpose of this function is to determine what the
            url is for the book's cover image.
        """

        try:
            return Par_Scrape.parse(content, "//*[@title='Front Cover']/@src")[0]
        except Exception:
            # IndexError when no front-cover element exists, or a parse failure.
            return None
Пример #22
0
    def __get_book_authors(self, content):
        """
        args:
            content (requests.get):
                content is needed in order to get the authors'
                names.
        returns:
            authors (String):
                the first listed contributor for the book, or None
                when none can be found.
        synopsis:
            The purpose of this function is to determine what the
            authors are for the book being scraped.
        """

        try:
            # Only the first contributor entry is returned ([0]).
            return Par_Scrape.parse(
                content, (self.right_col +
                          "/div[@class='contributors']/p/span/a/text()"))[0]
        except Exception:
            return None
Пример #23
0
 def _get_book_format(self, content):
     """
     args:
         content (requests.get):
             content is needed in order to scrape the page's
             availability marker.
     returns:
         format (String):
             "DIGITAL" or "PRINT".  Google books only sells e-books
             on its own site, with links out to "PRINT" sellers.
     synopsis:
         The purpose of this function is to return the book format
         "DIGITAL" or "PRINT" since google books only has e-books on
         their own site with links to "PRINT" books.
     """
     try:
         # The "not available" element appears when google only links out
         # to print sellers; otherwise the e-book is sold on-site.
         if Par_Scrape.parse(content, "//*[@id='gb-get-book-not-available']"):
             return "PRINT"
         return "DIGITAL"
     except Exception:
         return None
Пример #24
0
 def _get_book_sale_status(self, content):
     """
     args:
         content (requests.get):
             content is needed in order to scrape the page's
             availability marker.
     returns:
         sale_status (Boolean):
             the sale status of the book that is being scraped,
             or None when it cannot be determined.
     synopsis:
         The purpose of this function is to determine if the
         book is available for sale.
     """
     try:
         # For sale unless the "not available" marker is present.
         return not Par_Scrape.parse(content, "//*[@id='gb-get-book-not-available']")
     except Exception:
         return None
Пример #25
0
 def _get_book_sale_price(self, content):
     """
     args:
         content (Request.get):
             content is needed in order to scrape the e-book's
             price if applicable.
     returns:
         price:
             This is the price of the ebook.
         (None):
             If there is no ebook for the book searched.
     synopsis:
         The purpose of this function is to scrape for the
         e-book's price and return it.
     """
     try:
         # NOTE(review): every path here yields None -- when the "not
         # available" marker is found it returns None explicitly, and
         # otherwise the function falls through and returns None
         # implicitly.  The price-extraction branch appears to be
         # missing; confirm against the site's markup before relying on
         # this ever returning a price.
         if Par_Scrape.parse(content, "//*[@id='gb-get-book-not-available']"):
             return None
     except:
         return None
Пример #26
0
    def __get_book_title(self, content):
        """
        args:
            content (requests.get):
                content is required in order to scrape the book's
                title.
        returns:
            title (String):
                title is the book's title that is being scraped,
                or None when it cannot be found.
        synopsis:
            The purpose of this function is to determine what the book's
            title is.
        """

        try:
            # Title comes from the og:title meta tag following the parent
            # meta element.
            return Par_Scrape.parse(
                content,
                (self.meta_Father_Type +
                 "/following-sibling::meta[@property='og:title']/@content"))[0]
        except Exception:
            return None
Пример #27
0
    def _get_book_sale_status(self, content):
        """
        args:
            content (requests.get):
                content is needed in order to scrape the page's
                sale-status marker.
        returns:
            sale_status (Boolean):
                the sale status of the book that is being scraped,
                or None when it cannot be determined.
        synopsis:
            The purpose of this function is to determine if the
            book is available for sale.
        """

        try:
            # The nonmember "save for later" notice appears when the title
            # is not purchasable; its absence means the book is for sale.
            return not Par_Scrape.parse(content, self.content_table + "//span[@class='nonmember-notify save-later-text']")
        except Exception:
            return None
Пример #28
0
    def find_book_matches_at_site(self, book_data):
        """
        Search the site for books matching book_data and return the
        scraped results sorted by relevancy.

        args:
            book_data (List[]):
                book data used to build the search url.
        returns:
            relevancy-sorted list of site_book_data entries, or None
            when no search url can be formed.
        """
        site_book_data_list = []

        url = self.get_search_link_from_book_data_form(book_data)
        if url is None:
            return None

        # Collect book links from the first two pages of search results
        # (range(1, 3) -> pages 1 and 2; the old comment saying "4 pages"
        # did not match the code).
        total_relevant_book_links = []

        for page_number in range(1, 3):
            # Append the page number to the base search link.
            url_gotten_from_form = url + "&pageNumber=" + str(page_number)
            response = requests.get(url_gotten_from_form)

            # Get relevant book links from this results page.
            relevant_book_links = self._get_book_links_from_search_site(
                response.content)

            if relevant_book_links is not None:
                total_relevant_book_links += relevant_book_links

        # Scrape each book page concurrently.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(self.get_book_data_from_site, book_link)
                for book_link in total_relevant_book_links
            ]
            for future in concurrent.futures.as_completed(futures):
                site_book_data_list.append(future.result())

        # Sort by relevancy.
        return Par_Scrape.site_book_data_relevancy(book_data,
                                                   site_book_data_list)
Пример #29
0
    def __get_book_url(self, content):
        """
        args:
            content (requests.get):
                content is needed in order to scrape the book's url.
        returns:
            url (String):
                url is book's url that is normally used, as determined
                by the website, or None when it cannot be found.
        synopsis:
            The purpose of this function is to determine what the book's
            url is that is being scraped.  This is required in order for
            functions to work properly.
        """

        try:
            # Canonical url comes from the og:url meta tag.
            return Par_Scrape.parse(
                content,
                (self.meta_Father_Type +
                 "/following-sibling::meta[@property='og:url']/@content"))[0]
        except Exception:
            return None
Пример #30
0
    def __get_book_image_url(self, content):
        """
        args:
            content (requests.get):
                content is required in order to scrape the book's
                cover-image url.
        returns:
            image_url (String):
                image_url is the book's url for the book's cover
                image, or None when it cannot be found.
        synopsis:
            The purpose of this function is to determine what the
            book's url is for the cover image.
        """

        try:
            # Cover image comes from the og:image meta tag.
            return Par_Scrape.parse(
                content,
                (self.meta_Father_Type +
                 "/following-sibling::meta[@property='og:image']/@content"))[0]
        except Exception:
            return None