Example #1
        def inner(link, index) -> "Optional[Tuple[MetaRecord, int]]":
            """Fetch one Amazon product page and scrape book metadata.

            Returns a ``(MetaRecord, index)`` tuple so concurrent callers
            can restore the original search-result order, or ``None`` when
            the page cannot be fetched or is not a book page.
            """
            # NOTE(review): using the shared session as a context manager
            # closes it on __exit__; if inner() is invoked more than once
            # per provider instance this may invalidate the session --
            # confirm this is intended.
            with self.session as session:
                try:
                    r = session.get(f"https://www.amazon.com/{link}")
                    r.raise_for_status()
                except Exception as ex:
                    # Network/HTTP failure: log and skip this result.
                    log.warning(ex)
                    return None
                long_soup = BS(r.text, "lxml")  # ~4sec :/
                # Wrapper div that only appears on book product pages.
                soup2 = long_soup.find(
                    "div",
                    attrs={
                        "cel_widget_id":
                        "dpx-books-ppd_csm_instrumentation_wrapper"
                    })
                if soup2 is None:
                    return None
                try:
                    match = MetaRecord(
                        title="",
                        # FIX: authors is a list on the success path below;
                        # start with [] instead of "" for type consistency.
                        authors=[],
                        source=MetaSourceInfo(id=self.__id__,
                                              description="Amazon Books",
                                              link="https://amazon.com/"),
                        url=f"https://www.amazon.com{link}",
                        # The more searches the slower; these fields are too
                        # hard to find in reasonable time or may not exist.
                        publisher="",  # very unreliable
                        publishedDate="",  # very unreliable
                        id=None,  # ?
                        tags=[]  # dont exist on amazon
                    )

                    try:
                        # Join the description paragraphs; [:-9] trims a
                        # trailing "Read more" marker -- TODO confirm the
                        # fixed offset still matches Amazon's markup.
                        match.description = "\n".join(
                            soup2.find("div", attrs={"data-feature-name": "bookDescription"}).stripped_strings)\
                                                .replace("\xa0"," ")[:-9].strip().strip("\n")
                    except (AttributeError, TypeError):
                        # No description -> not a book; ignore this hit.
                        return None
                    try:
                        match.title = soup2.find("span",
                                                 attrs={
                                                     "id": "productTitle"
                                                 }).text
                    except (AttributeError, TypeError):
                        match.title = ""
                    try:
                        # First text node per author span that is not pure
                        # whitespace and not inline script ("{...}").
                        match.authors = [
                            next(
                                filter(
                                    lambda i: i != " " and i != "\n"
                                    and not i.startswith("{"),
                                    x.findAll(text=True))).strip()
                            for x in soup2.findAll("span",
                                                   attrs={"class": "author"})
                        ]
                    except (AttributeError, TypeError, StopIteration):
                        # FIX: fall back to an empty list (was ""), matching
                        # the list produced by the comprehension above.
                        match.authors = []
                    try:
                        match.rating = int(
                            soup2.find("span",
                                       class_="a-icon-alt").text.split(" ")
                            [0].split(".")[0])  # first number in string
                    except (AttributeError, ValueError):
                        match.rating = 0
                    try:
                        match.cover = soup2.find(
                            "img",
                            attrs={"class":
                                   "a-dynamic-image frontImage"})["src"]
                    except (AttributeError, TypeError):
                        match.cover = ""
                    return match, index
                except Exception as e:
                    # Catch-all boundary: any unexpected scraping error is
                    # logged and the hit is dropped.
                    log.error_or_exception(e)
                    return None
Example #2
    def _parse_single_book(self,
                           id: str,
                           generic_cover: str = "") -> Optional[MetaRecord]:
        """Fetch one Douban book page and parse it into a MetaRecord.

        :param id: Douban subject id (note: parameter shadows the builtin).
        :param generic_cover: fallback cover URL when the page has none.
        :return: populated record, or ``None`` when the request fails.
        """
        url = f"https://book.douban.com/subject/{id}/"

        try:
            r = self.session.get(url)
            r.raise_for_status()
        except Exception as e:
            # Network/HTTP failure: log and report "no result".
            log.warning(e)
            return None

        match = MetaRecord(
            id=id,
            title="",
            authors=[],
            url=url,
            source=MetaSourceInfo(
                id=self.__id__,
                description=self.DESCRIPTION,
                link=self.META_URL,
            ),
        )

        html = etree.HTML(r.content.decode("utf8"))

        match.title = html.xpath(self.TITTLE_XPATH)[0].text
        match.cover = html.xpath(
            self.COVER_XPATH)[0].attrib["href"] or generic_cover
        try:
            rating_num = float(html.xpath(self.RATING_XPATH)[0].text.strip())
        except Exception:
            rating_num = 0
        # ceil(rating_num / 2): the double negation turns floor division
        # into ceiling division, mapping Douban's 10-point score to 5 stars.
        match.rating = int(-1 * rating_num // 2 * -1) if rating_num else 0

        tag_elements = html.xpath(self.TAGS_XPATH)
        if len(tag_elements):
            match.tags = [tag_element.text for tag_element in tag_elements]

        description_element = html.xpath(self.DESCRIPTION_XPATH)
        if len(description_element):
            # Serialize the last matching description node and strip its
            # HTML via html2text.
            match.description = html2text(
                etree.tostring(description_element[-1],
                               encoding="utf8").decode("utf8"))

        info = html.xpath(self.INFO_XPATH)

        # The info block is a flat run of label elements; dispatch on the
        # label text and read the value from the element tail or siblings.
        for element in info:
            text = element.text
            if self.AUTHORS_PATTERN.search(text):
                # NOTE: local name shadows the next() builtin.
                next = element.getnext()
                # Author links follow the label until the first <br>.
                while next is not None and next.tag != "br":
                    match.authors.append(next.text)
                    next = next.getnext()
            elif self.PUBLISHER_PATTERN.search(text):
                match.publisher = element.tail.strip()
            elif self.SUBTITLE_PATTERN.search(text):
                match.title = f'{match.title}:' + element.tail.strip()
            elif self.PUBLISHED_DATE_PATTERN.search(text):
                match.publishedDate = self._clean_date(element.tail.strip())
            elif self.SUBTITLE_PATTERN.search(text):
                # NOTE(review): duplicate SUBTITLE_PATTERN test -- this
                # branch is unreachable (the earlier identical condition
                # always wins); a series pattern was likely intended.
                match.series = element.getnext().text
            elif i_type := self.IDENTIFIERS_PATTERN.search(text):
                match.identifiers[i_type.group()] = element.tail.strip()