def _parse_search_result(
    self, result: Dict, generic_cover: str, locale: str
) -> MetaRecord:
    """Map a single Google Books API volume payload onto a MetaRecord.

    Pulls the commonly present fields out of ``result["volumeInfo"]``,
    falling back to empty values where the API omits them, and delegates
    cover, language and ISBN extraction to the dedicated helpers.
    """
    volume_info = result["volumeInfo"]
    book_id = result["id"]

    record = MetaRecord(
        id=book_id,
        title=volume_info["title"],
        authors=volume_info.get("authors", []),
        url=Google.BOOK_URL + book_id,
        source=MetaSourceInfo(
            id=self.__id__,
            description=Google.DESCRIPTION,
            link=Google.META_URL,
        ),
    )
    record.cover = self._parse_cover(result=result, generic_cover=generic_cover)
    record.description = volume_info.get("description", "")
    record.languages = self._parse_languages(result=result, locale=locale)
    record.publisher = volume_info.get("publisher", "")
    record.publishedDate = volume_info.get("publishedDate", "")
    record.rating = volume_info.get("averageRating", 0)
    # Google Books has no series concept; keep the neutral defaults.
    record.series, record.series_index = "", 1
    record.tags = volume_info.get("categories", [])
    record.identifiers = {"google": book_id}
    # _parse_isbn augments the identifiers dict and hands the record back.
    record = self._parse_isbn(result=result, match=record)
    return record
def _parse_search_result(
    self, result: Dict, generic_cover: str, locale: str
) -> MetaRecord:
    """Convert one ComicVine issue payload into a MetaRecord.

    The display title is synthesized as ``<volume>#<issue> - <issue name>``;
    the volume name doubles as the series and is also added to the tags.
    """
    volume_name = result["volume"].get("name", "")
    issue_number = result.get("issue_number", 0)
    issue_title = result.get("name", "")

    record = MetaRecord(
        id=result["id"],
        title=f"{volume_name}#{issue_number} - {issue_title}",
        authors=result.get("authors", []),
        url=result.get("site_detail_url", ""),
        source=MetaSourceInfo(
            id=self.__id__,
            description=ComicVine.DESCRIPTION,
            link=ComicVine.META_URL,
        ),
        series=volume_name,
    )
    record.cover = result["image"].get("original_url", generic_cover)
    record.description = result.get("description", "")
    # Prefer the store date; fall back to when ComicVine first saw the issue.
    record.publishedDate = result.get("store_date", result.get("date_added"))
    record.series_index = issue_number
    record.tags = ["Comics", volume_name]
    record.identifiers = {"comicvine": record.id}
    return record
def _parse_search_result(
    self, result: Dict, generic_cover: str, locale: str
) -> MetaRecord:
    """Map one Google Scholar result onto a MetaRecord.

    Scholar has no stable record id, so the publication URL (or eprint
    URL) is used both as the record id and the link. Most bibliographic
    data lives under ``result["bib"]``.
    """
    match = MetaRecord(
        id=result.get("pub_url", result.get("eprint_url", "")),
        title=result["bib"].get("title"),
        authors=result["bib"].get("author", []),
        url=result.get("pub_url", result.get("eprint_url", "")),
        source=MetaSourceInfo(
            id=self.__id__, description=self.__name__, link=scholar.META_URL
        ),
    )
    match.cover = result.get("image", {}).get("original_url", generic_cover)
    # Abstracts arrive URL-encoded; decode for display.
    match.description = unquote(result["bib"].get("abstract", ""))
    match.publisher = result["bib"].get("venue", "")
    # BUGFIX: pub_year may be absent; the old code did `None + "-01-01"`
    # and raised TypeError. Only synthesize a date when a year is present.
    pub_year = result["bib"].get("pub_year")
    match.publishedDate = f"{pub_year}-01-01" if pub_year else ""
    match.identifiers = {"scholar": match.id}
    return match
def parse_search_results(self) -> List[MetaRecord]:
    """Extract search hits from the already-parsed result page.

    Walks every node matching BOOK_SEARCH_RESULT_XPATH, reads the title,
    book URL and author list via the shared xpath helper, and skips any
    entry where one of the three is missing. The numeric book id is the
    first path segment after ``/ksiazka/`` in the book URL.

    Returns:
        A list of partially-populated MetaRecord objects (details are
        filled in later per record).
    """
    matches = []
    results = self.root.xpath(LubimyCzytac.BOOK_SEARCH_RESULT_XPATH)
    for result in results:
        title = self._parse_xpath_node(
            root=result,
            xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
                  f"{LubimyCzytac.TITLE_TEXT_PATH}",
        )
        book_url = self._parse_xpath_node(
            root=result,
            xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
                  f"{LubimyCzytac.URL_PATH}",
        )
        authors = self._parse_xpath_node(
            root=result,
            xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
                  f"{LubimyCzytac.AUTHORS_PATH}",
            take_first=False,
        )
        # All three fields are required for a usable record.
        if not all((title, book_url, authors)):
            continue
        matches.append(
            MetaRecord(
                # FIX: dropped the pointless f-prefix on the plain
                # "/ksiazka/" literal (f-string without placeholders).
                id=book_url.replace("/ksiazka/", "").split("/")[0],
                title=title,
                authors=[strip_accents(author) for author in authors],
                url=LubimyCzytac.BASE_URL + book_url,
                source=MetaSourceInfo(
                    id=self.metadata.__id__,
                    description=self.metadata.__name__,
                    link=LubimyCzytac.BASE_URL,
                ),
            )
        )
    return matches
def inner(link, index) -> [dict, int]:
    # Fetch one Amazon product page and scrape it into a MetaRecord.
    # Returns (MetaRecord, index) on success so the caller can restore
    # result ordering; returns None (implicitly or explicitly) when the
    # page can't be fetched, isn't a book page, or parsing blows up.
    # NOTE(review): the annotation `[dict, int]` is a list literal, not a
    # typing construct — Tuple[MetaRecord, int] is what is meant.
    with self.session as session:
        try:
            r = session.get(f"https://www.amazon.com/{link}")
            r.raise_for_status()
        except Exception as ex:
            log.warning(ex)
            return
        long_soup = BS(r.text, "lxml")  #~4sec :/
        # The book-details widget; absent on non-book product pages.
        soup2 = long_soup.find(
            "div",
            attrs={
                "cel_widget_id": "dpx-books-ppd_csm_instrumentation_wrapper"
            })
        if soup2 is None:
            return
        try:
            match = MetaRecord(
                title="",
                authors="",
                source=MetaSourceInfo(id=self.__id__,
                                      description="Amazon Books",
                                      link="https://amazon.com/"),
                url=f"https://www.amazon.com{link}",
                #the more searches the slower, these are too hard to find in reasonable time or might not even exist
                publisher="",  # very unreliable
                publishedDate="",  # very unreliable
                id=None,  # ?
                tags=[]  # dont exist on amazon
            )
            try:
                # Join the description strings; the slice [:-9] trims a
                # trailing "Read more" widget suffix — presumably; the
                # magic number depends on Amazon's markup (TODO confirm).
                match.description = "\n".join(
                    soup2.find("div", attrs={"data-feature-name": "bookDescription"}).stripped_strings)\
                    .replace("\xa0"," ")[:-9].strip().strip("\n")
            except (AttributeError, TypeError):
                return None  # if there is no description it is not a book and therefore should be ignored
            try:
                match.title = soup2.find("span", attrs={
                    "id": "productTitle"
                }).text
            except (AttributeError, TypeError):
                match.title = ""
            try:
                # First non-blank, non-JSON text node inside each author
                # span; skips the "{...}" script payloads Amazon embeds.
                match.authors = [
                    next(
                        filter(
                            lambda i: i != " " and i != "\n" and not i.startswith("{"),
                            x.findAll(text=True))).strip()
                    for x in soup2.findAll("span", attrs={"class": "author"})
                ]
            except (AttributeError, TypeError, StopIteration):
                match.authors = ""
            try:
                match.rating = int(
                    soup2.find("span", class_="a-icon-alt").text.split(" ")
                    [0].split(".")[0])  # first number in string
            except (AttributeError, ValueError):
                match.rating = 0
            try:
                match.cover = soup2.find(
                    "img",
                    attrs={"class": "a-dynamic-image frontImage"})["src"]
            except (AttributeError, TypeError):
                match.cover = ""
            return match, index
        except Exception as e:
            # Catch-all boundary: log and drop this result rather than
            # aborting the whole search.
            log.error_or_exception(e)
            return
def _parse_single_book(self, id: str, generic_cover: str = "") -> Optional[MetaRecord]:
    """Fetch one Douban book page and parse it into a MetaRecord.

    Args:
        id: Douban subject id used to build the book URL.
        generic_cover: fallback cover URL when the page has none.

    Returns:
        The populated MetaRecord, or None if the page fetch fails.
    """
    url = f"https://book.douban.com/subject/{id}/"
    try:
        r = self.session.get(url)
        r.raise_for_status()
    except Exception as e:
        log.warning(e)
        return None
    match = MetaRecord(
        id=id,
        title="",
        authors=[],
        url=url,
        source=MetaSourceInfo(
            id=self.__id__,
            description=self.DESCRIPTION,
            link=self.META_URL,
        ),
    )
    html = etree.HTML(r.content.decode("utf8"))
    match.title = html.xpath(self.TITTLE_XPATH)[0].text
    match.cover = html.xpath(
        self.COVER_XPATH)[0].attrib["href"] or generic_cover
    try:
        rating_num = float(html.xpath(self.RATING_XPATH)[0].text.strip())
    except Exception:
        rating_num = 0
    # Douban rates 0-10; ceil-divide by 2 to map onto a 5-star scale.
    match.rating = int(-1 * rating_num // 2 * -1) if rating_num else 0
    tag_elements = html.xpath(self.TAGS_XPATH)
    if len(tag_elements):
        match.tags = [tag_element.text for tag_element in tag_elements]
    description_element = html.xpath(self.DESCRIPTION_XPATH)
    if len(description_element):
        match.description = html2text(
            etree.tostring(
                description_element[-1], encoding="utf8").decode("utf8"))
    # The info panel is a flat label/value run; values sit in the tail
    # text of each label element, authors as following siblings up to <br>.
    info = html.xpath(self.INFO_XPATH)
    for element in info:
        text = element.text
        if self.AUTHORS_PATTERN.search(text):
            # Renamed from `next` — it shadowed the builtin.
            sibling = element.getnext()
            while sibling is not None and sibling.tag != "br":
                match.authors.append(sibling.text)
                sibling = sibling.getnext()
        elif self.PUBLISHER_PATTERN.search(text):
            match.publisher = element.tail.strip()
        elif self.SUBTITLE_PATTERN.search(text):
            match.title = f'{match.title}:' + element.tail.strip()
        elif self.PUBLISHED_DATE_PATTERN.search(text):
            match.publishedDate = self._clean_date(element.tail.strip())
        elif self.SUBTITLE_PATTERN.search(text):
            # NOTE(review): unreachable — this condition duplicates the
            # subtitle branch above, so the series assignment can never
            # run. It most likely should test a dedicated series pattern
            # (e.g. self.SERIES_PATTERN); left as-is pending confirmation
            # of the class attributes.
            match.series = element.getnext().text
        elif i_type := self.IDENTIFIERS_PATTERN.search(text):
            match.identifiers[i_type.group()] = element.tail.strip()
    # BUGFIX: the original fell off the end and implicitly returned None,
    # discarding the fully-built record despite the Optional[MetaRecord]
    # annotation.
    return match