Python Parent_Scrape.parse示例，Parsers.Parent_Scrape.parse Python示例

示例#1

0

显示文件

    def _get_book_subtitle(self, content):
        """
        args:
            content:
                page content that is forwarded unchanged to
                Par_Scrape.parse (presumably the fetched HTML of
                the book page -- confirm against the caller).
        returns:
            subtitle (String):
                all text fragments found for the 'subtitle'
                element, glued together without a separator.
            None:
                when both queries come back empty or anything
                goes wrong while scraping.
        synopsis:
            Looks for the subtitle text directly under the element
            whose class is 'subtitle' and, failing that, inside its
            nested <span dir='ltr'>.
        """

        try:
            fragments = Par_Scrape.parse(content, ("//*[@class='subtitle']/text()"))

            # Google Books sometimes places the text one level deeper,
            # so retry against the inner <span> when nothing was found.
            if len(fragments) == 0:
                fragments = Par_Scrape.parse(content, ("//*[@class='subtitle']/span[@dir='ltr']/text()"))

            if len(fragments) != 0:
                return "".join(fragments)
            return None
        except:
            return None

示例#2

0

显示文件

 def _get_book_isbn_13(self, content):
     """
     args:
         content:
             page content that is forwarded unchanged to
             Par_Scrape.parse.
     returns:
         isbn_13 (String):
             the first comma-separated entry of the ISBN cell
             that is exactly 13 characters long after stripping.
         None:
             when no entry has 13 characters or anything goes
             wrong while scraping.
     synopsis:
         Reads the metadata row labelled 'ISBN' (relative to
         self.content_table) and picks out the 13-character value.
     """

     try:
         label_xpath = self.content_table + "/tr[@class='metadata_row']/td[@class='metadata_label']/span[contains(text(), 'ISBN')]"
         value_xpath = label_xpath + "/../following-sibling::td/span/text()"
         isbn_cell = Par_Scrape.parse(content, value_xpath)

         # Google Books shows ISBN-10 and ISBN-13 in one comma-separated
         # string; the two are told apart purely by length.
         for candidate in isbn_cell[0].split(','):
             candidate = candidate.strip()
             if len(candidate) == 13:
                 return candidate
         return None
     except:
         return None

示例#3

0

显示文件

    def _get_book_description(self, content):
        """
        args:
            content:
                page content that is forwarded unchanged to
                Par_Scrape.parse.
        returns:
            description (String):
                the text of the matched paragraphs, each followed
                by a space, run through BeautifulSoup's get_text()
                and stripped of surrounding whitespace.
            None:
                when anything goes wrong while scraping.
        synopsis:
            Builds the book's description from the <p> elements
            inside the div whose class is 'right'.
        """

        try:
            # The first matched <p> is skipped (presumably it is not
            # part of the description -- confirm against the page).
            paragraphs = Par_Scrape.parse(content,".//div[@class='right']/p")[1:]
            pieces = [node.text + ' ' for node in paragraphs if node.text is not None]
            markup = "".join(pieces)
            plain_text = BeautifulSoup(markup, features='lxml').get_text()
            return plain_text.strip()
        except:
            return None

示例#4

0

显示文件

    def _form_search_submission(self, search):
        """
        args:
            search (String):
                the text typed into the bookstore's search box.
        returns:
            link (String):
                the URL the browser ends up on after submitting
                the search form.
            None:
                when the expected form control is not a text field,
                or when the results page contains the span that is
                probed for below.
        synopsis:
            Submits a search through the form on
            https://books.google.com/ and returns the resulting URL
            only if the results page passes the check.
        """
        # Short pause before every request to the site.
        time.sleep(.3)

        browser = mechanize.Browser()
        browser.set_handle_robots(False)
        browser.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
        browser.open("https://books.google.com/")

        # First form on the page, second control in that form.
        browser.form = list(browser.forms())[0]
        search_box = browser.form.controls[1]
        if search_box.type != "text":
            return None

        search_box.value = search
        browser.submit()
        link = browser.geturl()

        # Fetch the results page again and look for the marker span
        # (presumably the "no results" notice -- TODO confirm).
        results_page = requests.get(link)
        markers = Par_Scrape.parse(results_page.content, "//span[@class='JZCD0c r0bn4c rQMQod']")
        if len(markers) == 0:
            return link
        return None

示例#5

0

显示文件

文件： Main_Audiobooks.py 项目： bthomp24/D2D-Test-Bookstore

    def _get_book_description(self, content):
        """
        args:
            content:
                page content that is forwarded unchanged to
                Par_Scrape.parse.
        returns:
            description (String):
                with a single matched text node, that node as-is;
                with several, each non-empty node after
                BeautifulSoup text extraction, followed by a space.
            None:
                when nothing matches or anything goes wrong while
                scraping.
        synopsis:
            Collects the paragraph text found under the div whose
            class is 'book-description' (relative to
            self.content_table).
        """

        try:
            paragraphs = Par_Scrape.parse(content, self.content_table + "//div[@class='book-description']/p/text()")

            # Zero or one match: hand back the first entry untouched
            # (an empty result raises here and falls through to None).
            if len(paragraphs) <= 1:
                return paragraphs[0]

            # Several matches: strip HTML from each piece. Some '\'
            # characters are known to survive this step.
            cleaned = []
            for paragraph in paragraphs:
                plain = BeautifulSoup(paragraph, features='lxml').getText()
                if plain != '':
                    cleaned.append(plain + " ")
            return "".join(cleaned)
        except:
            return None

示例#6

0

显示文件

文件： Main_Audiobooks.py 项目： bthomp24/D2D-Test-Bookstore

    def _get_book_authors(self, content):
        """
        args:
            content:
                page content that is forwarded unchanged to
                Par_Scrape.parse.
        returns:
            authors (String):
                the matched author names joined with ", "; a
                single author is returned on its own.
            None:
                when no author is found or an error occurs while
                scraping.
        synopsis:
            Reads the link texts under the h4 whose class is
            'book-written-by' (relative to self.content_table) and
            combines them into one string.
        """

        try:
            authors = Par_Scrape.parse(content, self.content_table + "//h4[@class='book-written-by']//a/text()")

            if not authors:
                return None

            # Join by position rather than comparing each name with the
            # last one: a value comparison drops the separator whenever an
            # earlier name happens to equal the final name.
            return ", ".join(authors)
        except Exception:
            # Best-effort scrape: any failure means "no authors", but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            return None

示例#7

0

显示文件

文件： Main_Scribd.py 项目： bthomp24/D2D-Test-Bookstore

    def __get_book_description(self, content):
        """
        args:
            content:
                page content that is forwarded unchanged to
                Par_Scrape.parse.
        returns:
            description (String):
                the first 'og:description' meta value with
                newlines, tabs and apostrophes removed and
                non-breaking spaces turned into plain spaces.
            None:
                when nothing matches or anything goes wrong while
                scraping.
        synopsis:
            Pulls the book's description out of the page's
            'og:description' meta tag (located relative to
            self.meta_Father_Type).
        """

        try:
            query = (self.meta_Father_Type +
                     "/following-sibling::meta[@property='og:description']/@content")
            text = Par_Scrape.parse(content, query)[0]

            # Applied in this order: newline, tab, apostrophe, NBSP.
            for old, new in (("\n", ""), ("\t", ""), ("\'", ""), ("\xa0", " ")):
                text = text.replace(old, new)
            return text
        except:
            return None

示例#8

0

显示文件

    def _get_book_authors(self, content):
        """
        args:
            content:
                page content that is forwarded unchanged to
                Par_Scrape.parse.
        returns:
            authors (String):
                every matched author name followed by a single
                space (so the result ends with a space).
            None:
                when no author is found or anything goes wrong
                while scraping.
        synopsis:
            Reads the metadata row labelled 'Author' (relative to
            self.content_table) and concatenates the names found.
        """

        try:
            label_xpath = self.content_table + "/tr[@class='metadata_row']/td[@class='metadata_label']/span[contains(text(), 'Author')]"
            names = Par_Scrape.parse(content, (label_xpath + "/../following-sibling::td/a/span/text()"))

            if len(names) == 0:
                return None

            # Several authors may be listed; each gets a trailing space.
            return "".join(name + " " for name in names)
        except:
            return None

示例#9

0

显示文件

    def _get_book_description(self, content):
        """
        args:
            content:
                page content that is forwarded unchanged to
                Par_Scrape.parse.
        returns:
            description (String):
                all text nodes found below the element with id
                'synopsistext', concatenated without a separator.
            None:
                when nothing matches or anything goes wrong while
                scraping.
        synopsis:
            Gathers the book's description from the
            'synopsistext' element.
        """
        try:
            # '//text()' picks up text at any depth, which copes with
            # the varying HTML layout of Google Books descriptions.
            text_nodes = Par_Scrape.parse(content, ("//*[@id='synopsistext']//text()"))

            if len(text_nodes) != 0:
                return "".join(text_nodes)
            return None
        except:
            return None

示例#10

0

显示文件

文件： Main_Livraria_Cultura.py 项目： bthomp24/D2D-Test-Bookstore

 def __get_authors__(self, content):
     """
     args:
         content:
             page content that is forwarded unchanged to
             Par_Scrape.parse.
     returns:
         author (String):
             the part of the first matched element's text that
             follows the last "Autor:" label (the whole text when
             the label is absent).
         None:
             when nothing matches or anything goes wrong while
             scraping.
     synopsis:
         Reads the author from the element whose class is
         'value-field Colaborador'.
     """
     try:
         matches = Par_Scrape.parse(content, "//*[@class='value-field Colaborador']")
         contributor_text = matches[0].text
         return contributor_text.split("Autor:")[-1]
     except:
         return None

示例#11

0

显示文件

文件： Main_Livraria_Cultura.py 项目： bthomp24/D2D-Test-Bookstore

 def __get_subtitle__(self, content):
     """
     args:
         content:
             page content that is forwarded unchanged to
             Par_Scrape.parse.
     returns:
         subtitle (String):
             the text of the first element whose class is
             'value-field Subtitulo'.
         None:
             when no such element exists or anything goes wrong
             while scraping.
     synopsis:
         Reads the book's subtitle, if the page has one.
     """
     try:
         matches = Par_Scrape.parse(content, "//*[@class='value-field Subtitulo']")
         if len(matches) <= 0:
             return None
         return matches[0].text
     except:
         return None

示例#12

0

显示文件

    def _get_book_series(self, content):
        """
        args:
            content:
                page content that is forwarded unchanged to
                Par_Scrape.parse.
        returns:
            series (String):
                the text of the first 'bookDetail' cell in the
                table row that has a cell reading 'Series:'.
            None:
                when nothing matches or anything goes wrong while
                scraping.
        synopsis:
            Reads the series the book belongs to, if it is listed.
        """

        try:
            series_cells = Par_Scrape.parse(content,".//tr[td='Series:']/td[@class='bookDetail']")
            first_cell = series_cells[0]
            return first_cell.text
        except:
            return None

示例#13

0

显示文件

    def _get_book_id(self, content):
        """
        args:
            content:
                page content that is forwarded unchanged to
                Par_Scrape.parse.
        returns:
            id (String):
                the text of the first 'bookDetail' cell in the
                table row that has a cell reading 'ISBN 13#:'.
            None:
                when nothing matches or anything goes wrong while
                scraping.
        synopsis:
            Uses the value shown in the 'ISBN 13#:' row as the
            book's id.
        """

        try:
            id_cells = Par_Scrape.parse(content,".//tr[td='ISBN 13#:']/td[@class='bookDetail']")
            first_cell = id_cells[0]
            return first_cell.text
        except:
            return None

示例#14

0

显示文件

文件： Main_Audiobooks.py 项目： bthomp24/D2D-Test-Bookstore

    def _get_book_title(self, content):
        """
        args:
            content:
                page content that is forwarded unchanged to
                Par_Scrape.parse.
        returns:
            title (String):
                the first text node of the h1 whose class is
                'audiobookTitle' (relative to self.content_table).
            None:
                when nothing matches or anything goes wrong while
                scraping.
        synopsis:
            Reads the book's title from the page heading.
        """

        try:
            title_xpath = self.content_table + "//h1[@class='audiobookTitle']/text()"
            titles = Par_Scrape.parse(content, title_xpath)
            return titles[0]
        except:
            return None

示例#15

0

显示文件

    def _get_book_url(self, content):
        """
        args:
            content:
                page content that is forwarded unchanged to
                Par_Scrape.parse.
        returns:
            url:
                whatever Par_Scrape.parse yields for the href of
                the link inside the 'bookcover' element. The
                result is returned whole, not indexed, so this
                is presumably a list rather than a single String
                -- verify against the caller.
            None:
                when anything goes wrong while scraping.
        synopsis:
            Looks up the link attached to the book's cover.
        """

        try:
            hrefs = Par_Scrape.parse(content, "//*[@class='bookcover']/a/@href")
        except:
            return None
        else:
            return hrefs

示例#16

0

显示文件

    def _get_book_url(self, content):
        """
        args:
            content (requests.get):
                content is needed in order to scrape the book's url.
        returns:
            url (String):
                the book's url as used by the website; built by
                string concatenation ending in "/details".
            None:
                returned when anything goes wrong while scraping.
        synopsis:
            The purpose of this function is to determine the url of
            the book that is being scraped.  This is required in order
            for other functions to work properly.
        """

        try:
            # NOTE(review): the line below is corrupted in this copy -- the
            # '*****' run has swallowed the end of the URL literal and the
            # start of what looks like a parse call, leaving an unbalanced
            # ')'. Restore it from the upstream file before use.
            return "http://*****:*****@class='bookDetail']")[0].text + "/details"
        except:
            return None

示例#17

0

显示文件

    def _get_book_image_url(self, content):
        """
        args:
            content:
                page content that is forwarded unchanged to
                Par_Scrape.parse.
        returns:
            image_url (String):
                the 'src' value of the first element titled
                'Front Cover'.
            None:
                when nothing matches or anything goes wrong while
                scraping.
        synopsis:
            Finds the address of the book's cover image.
        """

        try:
            cover_sources = Par_Scrape.parse(content, "//*[@title='Front Cover']/@src")
            return cover_sources[0]
        except:
            return None

示例#18

0

显示文件

文件： Main_Audiobooks.py 项目： bthomp24/D2D-Test-Bookstore

 def _get_book_sale_price(self, content):
     """
     args:
         content:
             page content that is forwarded unchanged to
             Par_Scrape.parse.
     returns:
         price:
             whatever Par_Scrape.parse yields for the text under
             the 'fleft button-text' div (relative to
             self.content_table); the result is returned whole,
             not indexed.
         None:
             when anything goes wrong while scraping.
     synopsis:
         Looks up the audiobook's price text on the page.
     """
     try:
         price_xpath = self.content_table + "//div[@class='fleft button-text']/div/p/text()"
         price_text = Par_Scrape.parse(content, price_xpath)
     except:
         return None
     else:
         return price_text

示例#19

0

显示文件

文件： Main_Scribd.py 项目： bthomp24/D2D-Test-Bookstore

    def __get_book_authors(self, content):
        """
        args:
            content:
                page content that is forwarded unchanged to
                Par_Scrape.parse.
        returns:
            authors (String):
                the first link text found in the 'contributors'
                div (relative to self.right_col).
            None:
                when nothing matches or anything goes wrong while
                scraping.
        synopsis:
            Reads the book's author from the contributors section;
            only the first match is returned.
        """

        try:
            contributors_xpath = (self.right_col +
                                  "/div[@class='contributors']/p/span/a/text()")
            names = Par_Scrape.parse(content, contributors_xpath)
            return names[0]
        except:
            return None

示例#20

0

显示文件

 def _get_book_format(self, content):
     """
     args:
         content:
             page content that is forwarded unchanged to
             Par_Scrape.parse.
     returns:
         format (String):
             "PRINT" when the page contains the element with id
             'gb-get-book-not-available', otherwise "DIGITAL".
         None:
             when anything goes wrong while scraping.
     synopsis:
         Google Books only offers e-books on its own site and
         links out for print editions, so the presence of the
         "not available" element is used to tell the two apart.
     """
     try:
         not_available = Par_Scrape.parse(content, "//*[@id='gb-get-book-not-available']")
         return "PRINT" if not_available else "DIGITAL"
     except:
         return None

示例#21

0

显示文件

 def _get_book_sale_status(self, content):
     """
     args:
         content:
             page content that is forwarded unchanged to
             Par_Scrape.parse.
     returns:
         sale_status (Boolean):
             False when the page contains the element with id
             'gb-get-book-not-available', True otherwise.
         None:
             when anything goes wrong while scraping.
     synopsis:
         Tells whether the book can be bought, based on the
         absence of the "not available" element.
     """
     try:
         not_available = Par_Scrape.parse(content, "//*[@id='gb-get-book-not-available']")
         return not not_available
     except:
         return None

示例#22

0

显示文件

 def _get_book_sale_price(self, content):
     """
     args:
         content (Request.get):
             content is needed in order to scrape the e-book's
             price if applicable
     returns:
         price:
             This is this price of the ebook.
         (None):
             If there is no ebook for the book searched.
     synopsis:
         The purpose of this function is to parse the to scrape
         for the e-book's price and return it.
     """
     try:
         if Par_Scrape.parse(content, "//*[@id='gb-get-book-not-available']"):
             return None
     except:
         return None

示例#23

0

显示文件

文件： Main_Scribd.py 项目： bthomp24/D2D-Test-Bookstore

    def __get_book_title(self, content):
        """
        args:
            content:
                page content that is forwarded unchanged to
                Par_Scrape.parse.
        returns:
            title (String):
                the first 'og:title' meta value (located relative
                to self.meta_Father_Type).
            None:
                when nothing matches or anything goes wrong while
                scraping.
        synopsis:
            Reads the book's title from the page's 'og:title'
            meta tag.
        """

        try:
            title_xpath = (self.meta_Father_Type +
                           "/following-sibling::meta[@property='og:title']/@content")
            titles = Par_Scrape.parse(content, title_xpath)
            return titles[0]
        except:
            return None

示例#24

0

显示文件

文件： Main_Audiobooks.py 项目： bthomp24/D2D-Test-Bookstore

    def _get_book_sale_status(self, content):
        """
        args:
            content:
                page content that is forwarded unchanged to
                Par_Scrape.parse.
        returns:
            sale_status (Boolean):
                False when the page contains a span whose class
                is 'nonmember-notify save-later-text' (relative
                to self.content_table), True otherwise.
            None:
                when anything goes wrong while scraping.
        synopsis:
            Tells whether the book is available for sale, based on
            the absence of the notify/save-for-later span.
        """

        try:
            notify_xpath = self.content_table + "//span[@class='nonmember-notify save-later-text']"
            notify_spans = Par_Scrape.parse(content, notify_xpath)
            return not notify_spans
        except:
            return None

示例#25

0

显示文件

文件： Main_Scribd.py 项目： bthomp24/D2D-Test-Bookstore

    def __get_book_url(self, content):
        """
        args:
            content:
                page content that is forwarded unchanged to
                Par_Scrape.parse.
        returns:
            url (String):
                the first 'og:url' meta value (located relative
                to self.meta_Father_Type).
            None:
                when nothing matches or anything goes wrong while
                scraping.
        synopsis:
            Reads the book's url from the page's 'og:url' meta
            tag.
        """

        try:
            url_xpath = (self.meta_Father_Type +
                         "/following-sibling::meta[@property='og:url']/@content")
            urls = Par_Scrape.parse(content, url_xpath)
            return urls[0]
        except:
            return None

示例#26

0

显示文件

文件： Main_Scribd.py 项目： bthomp24/D2D-Test-Bookstore

    def __get_book_image_url(self, content):
        """
        args:
            content:
                page content that is forwarded unchanged to
                Par_Scrape.parse.
        returns:
            image_url (String):
                the first 'og:image' meta value (located relative
                to self.meta_Father_Type).
            None:
                when nothing matches or anything goes wrong while
                scraping.
        synopsis:
            Reads the address of the book's cover image from the
            page's 'og:image' meta tag.
        """

        try:
            image_xpath = (self.meta_Father_Type +
                           "/following-sibling::meta[@property='og:image']/@content")
            images = Par_Scrape.parse(content, image_xpath)
            return images[0]
        except:
            return None

示例#27

0

显示文件

文件： Main_Audiobooks.py 项目： bthomp24/D2D-Test-Bookstore

    def _get_book_image_url(self, content):
        """
        args:
            content:
                page content that is forwarded unchanged to
                Par_Scrape.parse.
        returns:
            image_url (String):
                "https:" followed by the 'src' of the first img
                whose class is 'book-cover' (relative to
                self.content_table).
            None:
                when nothing matches or anything goes wrong while
                scraping.
        synopsis:
            Builds the full address of the book's cover image.
        """
        try:
            cover_xpath = self.content_table + "//img[@class='book-cover']/@src"
            # The site's src attribute lacks the leading scheme, so it
            # is prefixed here to form a usable URL.
            return "https:" + Par_Scrape.parse(content, cover_xpath)[0]
        except:
            return None

示例#28

0

显示文件

    def _get_book_volume_number(self, content):
        """
        args:
            content:
                page content that is forwarded unchanged to
                Par_Scrape.parse.
        returns:
            volume_number (Int):
                the 'Volume#:' cell converted with int().
            "None" (String):
                when the cell's text is literally "None".
            None:
                when nothing matches, the text is not an integer,
                or anything else goes wrong while scraping.
        synopsis:
            Reads the book's volume number from the table row that
            has a cell reading 'Volume#:'.
        """

        try:
            volume_cells = Par_Scrape.parse(content,".//tr[td='Volume#:']/td[@class='bookDetail']")
            raw_volume = volume_cells[0].text
            if raw_volume != "None":
                return int(raw_volume)
            # The page spells a missing volume as the text "None";
            # that text is passed through as a string.
            return "None"
        except:
            return None

示例#29

0

显示文件

    def _get_ready_for_sale(self, content):
        """
        args:
            content:
                page content that is forwarded unchanged to
                Par_Scrape.parse.
        returns:
            ready_for_sale (Boolean):
                True when the scraped release date is today or
                earlier, False when it lies in the future.
            None:
                when nothing matches, the date cannot be built,
                or anything else goes wrong while scraping.
        synopsis:
            Compares the date shown in the 'Release Date:  ' row
            with today's date.
        """

        try:
            date_cells = Par_Scrape.parse(content,".//tr[td='Release Date:  ']/td[@class='bookDetail']")
            pieces = date_cells[0].text.split('/')
            # The first three '/'-separated fields are fed to date() in
            # order, i.e. read as year/month/day (assuming `date` is
            # datetime.date -- confirm the site's date layout).
            released_on = date(*[int(piece) for piece in pieces[:3]])
            return released_on <= date.today()
        except:
            return None

示例#30

0

显示文件

文件： Main_Scribd.py 项目： bthomp24/D2D-Test-Bookstore

    def __get_book_isbn_13(self, content):
        """
        args:
            content:
                page content that is forwarded unchanged to
                Par_Scrape.parse.
        returns:
            isbn_13 (String):
                the first 'books:isbn' meta value (located
                relative to self.meta_Father_Type).
            None:
                when nothing matches or anything goes wrong while
                scraping.
        synopsis:
            Reads the book's isbn_13 from the page's 'books:isbn'
            meta tag.
        """

        try:
            isbn_xpath = (self.meta_Father_Type +
                          "/following-sibling::meta[@property='books:isbn']/@content")
            isbns = Par_Scrape.parse(content, isbn_xpath)
            return isbns[0]
        except:
            return None