Exemplo n.º 1
0
    def _parse_search_result(self, result: Dict, generic_cover: str,
                             locale: str) -> MetaRecord:
        match = MetaRecord(
            id=result["id"],
            title=result["volumeInfo"]["title"],
            authors=result["volumeInfo"].get("authors", []),
            url=Google.BOOK_URL + result["id"],
            source=MetaSourceInfo(
                id=self.__id__,
                description=Google.DESCRIPTION,
                link=Google.META_URL,
            ),
        )

        match.cover = self._parse_cover(result=result,
                                        generic_cover=generic_cover)
        match.description = result["volumeInfo"].get("description", "")
        match.languages = self._parse_languages(result=result, locale=locale)
        match.publisher = result["volumeInfo"].get("publisher", "")
        match.publishedDate = result["volumeInfo"].get("publishedDate", "")
        match.rating = result["volumeInfo"].get("averageRating", 0)
        match.series, match.series_index = "", 1
        match.tags = result["volumeInfo"].get("categories", [])

        match.identifiers = {"google": match.id}
        match = self._parse_isbn(result=result, match=match)
        return match
Exemplo n.º 2
0
 def parse_single_book(self, match: MetaRecord, generic_cover: str,
                       locale: str) -> MetaRecord:
     response = requests.get(match.url)
     self.root = fromstring(response.text)
     match.cover = self._parse_cover(generic_cover=generic_cover)
     match.description = self._parse_description()
     match.languages = self._parse_languages(locale=locale)
     match.publisher = self._parse_publisher()
     match.publishedDate = self._parse_from_summary(
         attribute_name="datePublished")
     match.rating = self._parse_rating()
     match.series, match.series_index = self._parse_series()
     match.tags = self._parse_tags()
     match.identifiers = {
         "isbn": self._parse_isbn(),
         "lubimyczytac": match.id,
     }
     return match
Exemplo n.º 3
0
    def _parse_single_book(self,
                           id: str,
                           generic_cover: str = "") -> Optional[MetaRecord]:
        url = f"https://book.douban.com/subject/{id}/"

        try:
            r = self.session.get(url)
            r.raise_for_status()
        except Exception as e:
            log.warning(e)
            return None

        match = MetaRecord(
            id=id,
            title="",
            authors=[],
            url=url,
            source=MetaSourceInfo(
                id=self.__id__,
                description=self.DESCRIPTION,
                link=self.META_URL,
            ),
        )

        html = etree.HTML(r.content.decode("utf8"))

        match.title = html.xpath(self.TITTLE_XPATH)[0].text
        match.cover = html.xpath(
            self.COVER_XPATH)[0].attrib["href"] or generic_cover
        try:
            rating_num = float(html.xpath(self.RATING_XPATH)[0].text.strip())
        except Exception:
            rating_num = 0
        match.rating = int(-1 * rating_num // 2 * -1) if rating_num else 0

        tag_elements = html.xpath(self.TAGS_XPATH)
        if len(tag_elements):
            match.tags = [tag_element.text for tag_element in tag_elements]

        description_element = html.xpath(self.DESCRIPTION_XPATH)
        if len(description_element):
            match.description = html2text(
                etree.tostring(description_element[-1],
                               encoding="utf8").decode("utf8"))

        info = html.xpath(self.INFO_XPATH)

        for element in info:
            text = element.text
            if self.AUTHORS_PATTERN.search(text):
                next = element.getnext()
                while next is not None and next.tag != "br":
                    match.authors.append(next.text)
                    next = next.getnext()
            elif self.PUBLISHER_PATTERN.search(text):
                match.publisher = element.tail.strip()
            elif self.SUBTITLE_PATTERN.search(text):
                match.title = f'{match.title}:' + element.tail.strip()
            elif self.PUBLISHED_DATE_PATTERN.search(text):
                match.publishedDate = self._clean_date(element.tail.strip())
            elif self.SUBTITLE_PATTERN.search(text):
                match.series = element.getnext().text
            elif i_type := self.IDENTIFIERS_PATTERN.search(text):
                match.identifiers[i_type.group()] = element.tail.strip()