def build_meta(log, issue_id):
    """Build metadata record based on comicvine issue_id"""
    issue = pycomicvine.Issue(
        issue_id,
        field_list=[
            "id",
            "name",
            "volume",
            "issue_number",
            "person_credits",
            "description",
            "store_date",
            "cover_date",
        ],
    )
    if not issue or not issue.volume:
        log.warn("Unable to load Issue(%d)" % issue_id)
        return None
    title = "%s #%s" % (issue.volume.name, issue.issue_number)
    if issue.name:
        title = title + ": %s" % issue.name
    authors = [p.name for p in issue.person_credits]
    meta = Metadata(title, authors)
    meta.series = issue.volume.name
    meta.series_index = str(issue.issue_number)
    meta.set_identifier("comicvine", str(issue.id))
    meta.comments = issue.description
    meta.has_cover = False
    if issue.volume.publisher:
        meta.publisher = issue.volume.publisher.name
    meta.pubdate = issue.store_date or issue.cover_date
    return meta
def run(self):
    try:
        self.log.info('Worker parsing url: %r' % self.url)
        book = Book.from_url(self.browser, self.url, self.timeout, self.log)
        if not book.get("title") or not book.get("authors"):
            self.log.error('Insufficient metadata found for %r' % self.url)
            return
        title = book["title"].encode('utf-8')
        authors = [a.encode('utf-8') for a in book["authors"]]
        mi = Metadata(title, authors)
        isbn = book.get("ean") or book.get("isbn")
        if isbn:
            mi.set_identifier("isbn", isbn)
        for attr in ("pubdate", "rating", "languages"):
            if attr in book:
                setattr(mi, attr, book[attr])
        if book.get("publisher"):
            mi.publisher = book["publisher"].encode('utf-8')
        if book.get("cover_url"):
            self.plugin.cache_identifier_to_cover_url(isbn, book["cover_url"])
            mi.has_cover = True
        self.plugin.clean_downloaded_metadata(mi)
        self.result_queue.put(mi)
    except Exception as e:
        self.log.exception('Worker failed to fetch and parse url %r with error %r' % (self.url, e))
def _get_bookdetails(self, url):
    u = self.BASE_URL + url["url"]
    print("_get_bookdetails:: fetching book from %s" % u)
    resp = urllib2.urlopen(u)
    contents = resp.read()
    tree = etree.HTML(contents)
    authors = self._get_authors(tree)
    publisher = self._get_details(tree, self.details_publisher)
    year = self._get_year(tree)
    pages = self._get_details(tree, self.details_pages)
    isbn = self._get_details(tree, self.details_isbn)
    description = self._get_description(tree)
    cover = self._get_cover_url(tree)
    tags = self._get_tags(tree)
    mi = Metadata(url["title"], authors)
    mi.set_identifier("isbn", isbn)
    mi.comments = description
    mi.language = "LT"
    mi.tags = tags
    try:
        mi.set("publisher", publisher)
    except Exception:
        print(u"_get_bookdetails:: failed to set the publisher")
    try:
        mi.set("pubdate", datetime.datetime(year, 1, 2))
    except Exception:
        print(u"_get_bookdetails:: failed to set the publication date")
    try:
        if self.gui:
            # Page count goes into a custom column, which only exists
            # when running inside the GUI.
            col = {}
            col["#value#"] = pages
            mi.set_user_metadata("#count", col)
    except Exception:
        print(u"_get_bookdetails:: failed to set the page count")
    if cover and isbn:
        print(u"_get_bookdetails:: caching cover:", cover)
        self.cache_isbn_to_identifier(isbn, isbn)
        self.cache_identifier_to_cover_url(isbn, cover)
        mi.has_cover = True
        print(self.cached_identifier_to_cover_url(isbn))
    return mi
def build_meta(log, issue_id):
    """Build metadata record based on comicvine issue_id."""
    issue = PyComicvineWrapper(log).lookup_issue(issue_id)
    if issue:
        meta = Metadata(issue.get_full_title(), issue.get_authors())
        meta.series = issue.volume_name
        meta.series_index = issue.issue_number
        meta.set_identifier('comicvine', str(issue.id))
        meta.set_identifier('comicvine-volume', str(issue.volume_id))
        meta.comments = issue.description
        meta.has_cover = False
        meta.publisher = issue.publisher_name
        meta.pubdate = issue.date
        return meta
    else:
        return None
def parse(self, raw):
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.utils.date import UNDEFINED_DATE
    root = parse_html(raw)
    mi = Metadata(self.basic_data['title'], self.basic_data['authors'])
    # Identifiers
    if self.basic_data['isbns']:
        mi.isbn = self.basic_data['isbns'][0]
    mi.set_identifier('edelweiss', self.sku)
    # Tags
    if self.basic_data['tags']:
        mi.tags = self.basic_data['tags']
        mi.tags = [t[1:].strip() if t.startswith('&') else t for t in mi.tags]
    # Publisher
    mi.publisher = self.basic_data['publisher']
    # Pubdate
    if self.basic_data['pubdate'] and self.basic_data['pubdate'].year != UNDEFINED_DATE:
        mi.pubdate = self.basic_data['pubdate']
    # Rating
    if self.basic_data['rating']:
        mi.rating = self.basic_data['rating']
    # Comments
    comments = ''
    for cid in ('summary', 'contributorbio', 'quotes_reviews'):
        cid = 'desc_{}{}-content'.format(cid, self.sku)
        div = root.xpath('//*[@id="{}"]'.format(cid))
        if div:
            comments += self.render_comments(div[0])
    if comments:
        mi.comments = comments
    mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None
    return mi
def build_meta(log, issue_id):
    '''Build metadata record based on comicvine issue_id'''
    issue = pycomicvine.Issue(issue_id, field_list=[
        'id', 'name', 'volume', 'issue_number', 'person_credits',
        'description', 'store_date', 'cover_date'])
    if not issue or not issue.volume:
        log.warn('Unable to load Issue(%d)' % issue_id)
        return None
    title = '%s #%s' % (issue.volume.name, issue.issue_number)
    if issue.name:
        title = title + ': %s' % issue.name
    authors = [p.name for p in issue.person_credits]
    meta = Metadata(title, authors)
    meta.series = issue.volume.name
    meta.series_index = str(issue.issue_number)
    meta.set_identifier('comicvine', str(issue.id))
    meta.set_identifier('comicvine-volume', str(issue.volume.id))
    meta.comments = issue.description
    meta.has_cover = False
    if issue.volume.publisher:
        meta.publisher = issue.volume.publisher.name
    meta.pubdate = issue.store_date or issue.cover_date
    return meta
def parse_details(self, root):
    search_data = ''
    isbn = None
    try:
        self.log.info('Parse details:%s' % self.url)
        databazeknih_id = self.parse_databazeknih_id(self.url)
        self.log.info('Parsed DK identifier:%s' % databazeknih_id)
    except:
        self.log.exception('Error parsing databazeknih id for url: %r' % self.url)
        databazeknih_id = None
    try:
        title = self.parse_title(root)
        self.log.info('Parsed title:%s' % title)
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        title = None
    try:
        authors = self.parse_authors(root)
        self.log.info('Parsed authors:%s' % authors)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []
    if not title or not authors or not databazeknih_id:
        self.log.error('Could not find title/authors/databazeknih id for %r' % self.url)
        self.log.error('DK id: %r Title: %r Authors: %r' % (databazeknih_id, title, authors))
        return
    mi = Metadata(title, authors)
    self.log.info('dbki:%s' % databazeknih_id)
    mi.set_identifier('databazeknih', databazeknih_id)
    self.databazeknih_id = databazeknih_id
    try:
        (mi.series, mi.series_index) = self.parse_series(root)
        self.log.info('Parsed series:%s' % mi.series)
        self.log.info('Parsed series index:%s' % mi.series_index)
    except:
        self.log.exception('Error parsing series for url: %r' % self.url)
        series = None
    try:
        mi.comments = self.parse_comments(root)
        self.log.info('Parsed comments:%s' % mi.comments)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
    try:
        self.cover_url = self.parse_cover(root)
        self.log.info('Parsed URL for cover:%r' % self.cover_url)
        self.plugin.cache_identifier_to_cover_url(self.databazeknih_id, self.cover_url)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_url)
    try:
        mi.tags = self.parse_tags(root)
        self.log.info('Parsed tags:%s' % mi.tags)
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)
    try:
        mi.publisher = self.parse_publisher(root)
        self.log.info('Parsed publisher:%s' % mi.publisher)
    except:
        self.log.exception('Error parsing publisher for url: %r' % self.url)
    try:
        mi.pubdate = self.parse_pubdate(root)
        self.log.info('Parsed pubdate:%s' % mi.pubdate)
    except:
        self.log.exception('Error parsing pubdate for url: %r' % self.url)
    try:
        mi.rating = self.parse_rating(root)
        self.log.info('Parsed rating:%s' % mi.rating)
    except:
        self.log.exception('Error parsing rating for url: %r' % self.url)
    mi.source_relevance = self.relevance
    try:
        isbn = self.parse_isbn(root)
        if isbn:
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception('Error parsing ISBN for url: %r' % self.url)
    if self.databazeknih_id:
        self.plugin.cache_isbn_to_identifier(self.isbn, self.databazeknih_id)
    self.log.info(mi)
    self.result_queue.put(mi)
def _get_metadata(self, book_id, get_user_categories=True):  # {{{
    mi = Metadata(None, template_cache=self.formatter_template_cache)
    author_ids = self._field_ids_for('authors', book_id)
    aut_list = [self._author_data(i) for i in author_ids]
    aum = []
    aus = {}
    aul = {}
    for rec in aut_list:
        aut = rec['name']
        aum.append(aut)
        aus[aut] = rec['sort']
        aul[aut] = rec['link']
    mi.title = self._field_for('title', book_id, default_value=_('Unknown'))
    mi.authors = aum
    mi.author_sort = self._field_for('author_sort', book_id, default_value=_('Unknown'))
    mi.author_sort_map = aus
    mi.author_link_map = aul
    mi.comments = self._field_for('comments', book_id)
    mi.publisher = self._field_for('publisher', book_id)
    n = nowf()
    mi.timestamp = self._field_for('timestamp', book_id, default_value=n)
    mi.pubdate = self._field_for('pubdate', book_id, default_value=n)
    mi.uuid = self._field_for('uuid', book_id, default_value='dummy')
    mi.title_sort = self._field_for('sort', book_id, default_value=_('Unknown'))
    mi.book_size = self._field_for('size', book_id, default_value=0)
    mi.ondevice_col = self._field_for('ondevice', book_id, default_value='')
    mi.last_modified = self._field_for('last_modified', book_id, default_value=n)
    formats = self._field_for('formats', book_id)
    mi.format_metadata = {}
    mi.languages = list(self._field_for('languages', book_id))
    if not formats:
        good_formats = None
    else:
        mi.format_metadata = FormatMetadata(self, book_id, formats)
        good_formats = FormatsList(formats, mi.format_metadata)
    mi.formats = good_formats
    mi.has_cover = _('Yes') if self._field_for('cover', book_id, default_value=False) else ''
    mi.tags = list(self._field_for('tags', book_id, default_value=()))
    mi.series = self._field_for('series', book_id)
    if mi.series:
        mi.series_index = self._field_for('series_index', book_id, default_value=1.0)
    mi.rating = self._field_for('rating', book_id)
    mi.set_identifiers(self._field_for('identifiers', book_id, default_value={}))
    mi.application_id = book_id
    mi.id = book_id
    composites = []
    for key, meta in self.field_metadata.custom_iteritems():
        mi.set_user_metadata(key, meta)
        if meta['datatype'] == 'composite':
            composites.append(key)
        else:
            val = self._field_for(key, book_id)
            if isinstance(val, tuple):
                val = list(val)
            extra = self._field_for(key + '_index', book_id)
            mi.set(key, val=val, extra=extra)
    for key in composites:
        mi.set(key, val=self._composite_for(key, book_id, mi))
    user_cat_vals = {}
    if get_user_categories:
        user_cats = self.backend.prefs['user_categories']
        for ucat in user_cats:
            res = []
            for name, cat, ign in user_cats[ucat]:
                v = mi.get(cat, None)
                if isinstance(v, list):
                    if name in v:
                        res.append([name, cat])
                elif name == v:
                    res.append([name, cat])
            user_cat_vals[ucat] = res
    mi.user_categories = user_cat_vals
    return mi
def parse(self, raw):
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.utils.date import parse_only_date, UNDEFINED_DATE
    from css_selectors import Select
    root = parse_html(raw)
    selector = Select(root)
    sku = next(selector('div.sku.attGroup'))
    info = sku.getparent()
    top = info.getparent().getparent()
    banner = top.find('div')
    spans = banner.findall('span')
    title = ''
    for i, span in enumerate(spans):
        if i == 0 or '12pt' in span.get('style', ''):
            title += astext(span)
        else:
            break
    authors = [re.sub(r'\(.*\)', '', x).strip() for x in astext(spans[-1]).split(',')]
    mi = Metadata(title.strip(), authors)

    # Identifiers
    isbns = [check_isbn(x.strip()) for x in astext(sku).split(',')]
    for isbn in isbns:
        if isbn:
            self.plugin.cache_isbn_to_identifier(isbn, self.sku)
    isbns = sorted(isbns, key=lambda x: len(x) if x else 0, reverse=True)
    if isbns and isbns[0]:
        mi.isbn = isbns[0]
    mi.set_identifier('edelweiss', self.sku)

    # Tags
    bisac = tuple(selector('div.bisac.attGroup'))
    if bisac:
        bisac = astext(bisac[0])
        mi.tags = [x.strip() for x in bisac.split(',')]
        mi.tags = [t[1:].strip() if t.startswith('&') else t for t in mi.tags]

    # Publisher
    pub = tuple(selector('div.supplier.attGroup'))
    if pub:
        pub = astext(pub[0])
        mi.publisher = pub

    # Pubdate
    pub = tuple(selector('div.shipDate.attGroupItem'))
    if pub:
        pub = astext(pub[0])
        parts = pub.partition(':')[0::2]
        pub = parts[1] or parts[0]
        try:
            if ', Ship Date:' in pub:
                pub = pub.partition(', Ship Date:')[0]
            q = parse_only_date(pub, assume_utc=True)
            if q.year != UNDEFINED_DATE:
                mi.pubdate = q
        except:
            self.log.exception('Error parsing published date: %r' % pub)

    # Comments
    comm = ''
    general = tuple(selector('div#pd-general-overview-content'))
    if general:
        q = self.render_comments(general[0])
        if q != '<p>No title summary available. </p>':
            comm += q
    general = tuple(selector('div#pd-general-contributor-content'))
    if general:
        comm += self.render_comments(general[0])
    general = tuple(selector('div#pd-general-quotes-content'))
    if general:
        comm += self.render_comments(general[0])
    if comm:
        mi.comments = comm

    # Cover
    img = tuple(selector('img.title-image[src]'))
    if img:
        href = img[0].get('src').replace('jacket_covers/medium/', 'jacket_covers/flyout/')
        self.plugin.cache_identifier_to_cover_url(self.sku, href)
    mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None
    return mi
def parse_book_page(self, url):
    # TODO: Support for login-based rating fetching
    # TODO: Move all parsing logic to methods in order to avoid dangling variables
    # TODO: Saving metadata in custom columns
    # TODO: Configurable embedding metadata in comment
    # TODO: missing items
    #       original language, first polish publish date, publisher serie, form
    self.log.info('INFO: Downloading book page: {}'.format(url))
    root_tag = self.get_lxml_root(url)
    if not root_tag:
        return None
    book_tag = self.get_book_tag(root_tag)
    if self.prefs['title']:
        book_title = self.parse_title(root_tag, book_tag, url)
    else:
        book_title = self.title
    if self.prefs['authors']:
        book_authors = self.parse_authors(root_tag, book_tag, url)
    else:
        book_authors = self.authors
    mi = Metadata(book_title, book_authors)
    additional_meta = {}
    if self.enabled('languages'):
        languages = self.parse_languages(root_tag, book_tag, url)
        if languages:
            mi.languages = languages
    if self.enabled('rating'):
        rating = self.parse_rating(root_tag, book_tag, url)
        if rating is not None:
            mi.rating = rating
    if self.enabled('tags'):
        tags = self.parse_tags(root_tag, book_tag, url)
        if tags:
            mi.tags = tags
    if self.enabled('identifier'):
        identifier = self.parse_identifier(root_tag, book_tag, url)
        if identifier:
            mi.set_identifier(IDENTIFIER, identifier)
    if self.enabled('pubdate'):
        pubdate = self.parse_pubdate(root_tag, book_tag, url)
        if pubdate:
            mi.pubdate = pubdate
    if self.enabled('covers'):
        covers = self.parse_covers(root_tag, book_tag, url)
        if covers:
            mi.has_cover = True
            self.plugin.cached_identifier_to_cover_url('urls').extend(covers)
        else:
            self.plugin.cache_identifier_to_cover_url('nocover', True)  # TODO: is this necessary?
    if self.enabled('series'):
        series = self.parse_series(root_tag, book_tag, url)
        if series:
            additional_meta['series'] = [self.get_series_string(name, index)
                                         for name, index in series]
            name, index = series[0]
            mi.series = name
            if index is not None:
                mi.series_index = index
    if self.enabled('translators'):
        translators = self.parse_translators(root_tag, book_tag, url)
        if translators:
            additional_meta['translators'] = translators
    if self.enabled('original_title'):
        original_title = self.parse_original_title(root_tag, book_tag, url)
        if original_title:
            additional_meta['original_title'] = original_title
    if self.enabled('categories'):
        categories = self.parse_categories(root_tag, book_tag, url)
        if categories:
            additional_meta['categories'] = categories
    if self.enabled('genres'):
        genres = self.parse_genres(root_tag, book_tag, url)
        if genres:
            additional_meta['genres'] = genres
    if self.enabled('comments'):
        comments = self.parse_comments(root_tag, book_tag, url) or ''
        additional_comments = self.format_additional_comment(additional_meta)
        if comments or additional_comments:
            mi.comments = comments + additional_comments
    self.log.info('INFO: Parsing book page completed')
    return mi
def parse_details(self, root):
    try:
        legie_id = self.parse_legie_id(self.url)
    except:
        self.log.exception('Error parsing Legie id for url: %r' % self.url)
        legie_id = None
    try:
        title = self.parse_title(root)
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        title = None
    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []
    if not title or not authors or not legie_id:
        self.log.error('Could not find title/authors/Legie id for %r' % self.url)
        self.log.error('Legie: %r Title: %r Authors: %r' % (legie_id, title, authors))
        return
    self.legie_id = legie_id
    rating = comments = series = series_index = None
    # Initialize edition fields so the no-editions branch below cannot
    # reference undefined names.
    year = cover_url = publisher = isbn = None
    try:
        rating = self.parse_rating(root)
    except:
        self.log.exception('Error parsing ratings for url: %r' % self.url)
    try:
        comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
    try:
        (series, series_index) = self.parse_series(root)
    except:
        self.log.info('Series not found.')
    try:
        tags = self.parse_tags(root)
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)
        tags = None
    if legie_id:
        editions = self.get_editions()
        if editions:
            num_editions = len(editions)
            self.log.info('Found %d editions' % num_editions)
            for edition in editions:
                (year, cover_url, publisher, isbn) = edition
                mi = Metadata(title, authors)
                self.legie_id = "%s#%s" % (legie_id, year)
                mi.set_identifier('legie', self.legie_id)
                mi.source_relevance = self.relevance
                mi.rating = rating
                mi.comments = comments
                mi.series = series
                mi.series_index = series_index
                if cover_url:
                    mi.cover_url = self.cover_url = cover_url
                    self.plugin.cache_identifier_to_cover_url(self.legie_id, self.cover_url)
                if tags:
                    mi.tags = tags
                mi.has_cover = bool(self.cover_url)
                mi.publisher = publisher
                mi.isbn = isbn
                mi.pubdate = self.prepare_date(int(year))
                mi.language = "ces"
                self.result_queue.put(mi)
        else:
            mi = Metadata(title, authors)
            mi.set_identifier('legie', self.legie_id)
            mi.source_relevance = self.relevance
            mi.rating = rating
            mi.comments = comments
            mi.series = series
            mi.series_index = series_index
            try:
                self.cover_url = self.parse_cover(root)
            except:
                self.log.exception('Error parsing cover for url: %r' % self.url)
            if tags:
                mi.tags = tags
            mi.has_cover = bool(self.cover_url)
            mi.publisher = publisher
            mi.isbn = isbn
            if year:
                mi.pubdate = self.prepare_date(int(year))
            mi.language = "ces"
            self.result_queue.put(mi)
    if self.legie_id:
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.legie_id, self.cover_url)
def load_details(self, url, timeout):
    def _format_item(str):
        return re.sub('^"(.*)"$', r'\1', unescape(str))

    def _format_list(str):
        return [_.strip() for _ in _format_item(str).split(',')]

    def _find_meta(node, property):
        return [_.get('content') for _ in node if _.get('property') == property][0]

    def _format_date(date_text):
        year = int(date_text[0:4])
        month = int(date_text[4:6])
        day = int(date_text[6:])
        return datetime.datetime(year, month, day, tzinfo=utc_tz)

    try:
        response = self.browser.open(url, timeout=timeout)
        root = lxml.html.fromstring(response.read())
        # Fields taken from the <meta> tags:
        # book ID, title, ISBN, image URL, rating
        meta = root.xpath('//meta[starts-with(@property, "og") or starts-with(@property, "books")]')
        # Fields taken from the schema.org JSON:
        # title, authors, description, publisher
        ld_json = root.xpath('//script[@type="application/ld+json"]/text()')
        ld = [json.loads(_) for _ in ld_json]
        book_info = [_ for _ in ld if _['@type'] == 'Book'][0]
    except Exception as e:
        self.log.exception(e)
        return

    ridibooks_id = re.search('id=([0-9]+)', url).group(1)
    isbn = _find_meta(meta, 'books:isbn')
    cover_url = _find_meta(meta, 'og:image')
    title = _find_meta(meta, 'og:title')
    authors = _format_list(book_info['author']['name'])
    if 'translator' in book_info:
        authors.extend([_ + u'(역자)' for _ in _format_list(book_info['translator']['name'])])

    mi = Metadata(title, authors)
    mi.set_identifier('ridibooks', ridibooks_id)
    mi.cover_url = cover_url
    mi.has_cover = bool(cover_url)
    mi.publisher = _format_item(book_info['publisher']['name'])
    mi.pubdate = _format_date(book_info['datePublished'])
    mi.comments = _format_item(book_info['description'])
    mi.rating = float(_find_meta(meta, 'books:rating:normalized_value'))

    series = re.search(u'(.*)\\s*(\\d+)권', title)
    if series:
        mi.series = series.group(1)
        mi.series_index = float(series.group(2))

    mi.language = 'Korean'
    mi.source_relevance = self.relevance

    if ridibooks_id:
        if isbn:
            self.plugin.cache_isbn_to_identifier(isbn, ridibooks_id)
        if cover_url:
            self.plugin.cache_identifier_to_cover_url(ridibooks_id, cover_url)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse_details(self, raw, root):
    # Parse the individual metadata fields
    try:
        asin = self.parse_asin(root)
    except:
        self.log.exception('Error parsing asin for url: %r' % self.url)
        asin = None
    if self.testing:
        import tempfile, uuid
        with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4())) + '_',
                                         suffix='.html', delete=False) as f:
            f.write(raw)
        print('Downloaded html for', asin, 'saved in', f.name)
    # Parse the title
    try:
        title = self.parse_title(root)
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        title = None
    # Parse the authors
    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []
    if not title or not authors or not asin:
        self.log.error('Could not find title/authors/asin for %r' % self.url)
        self.log.error('ASIN: %r Title: %r Authors: %r' % (asin, title, authors))
        return
    # Build the metadata object mi from the title and authors
    mi = Metadata(title, authors)
    # Set the book id
    idtype = '17k'
    mi.set_identifier(idtype, asin)
    self.k17k_id = asin
    # Set the comments (description)
    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
    # Set the series
    try:
        series, series_index = self.parse_series(root)
        if series:
            mi.series, mi.series_index = series, series_index
        elif self.testing:
            mi.series, mi.series_index = 'Dummy series for testing', 1
    except:
        self.log.exception('Error parsing series for url: %r' % self.url)
    # Set the tags
    try:
        mi.tags = self.parse_tags(root)
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)
    # Set the cover
    try:
        self.cover_url = self.parse_cover(root, raw)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_url)
    mi.source_relevance = self.relevance
    mi.languages = [u'中文']
    if self.k17k_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.k17k_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.k17k_id, self.cover_url)
    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse_details(self, root):
    try:
        isbn = self.extract_isbn(self.url)
    except:
        self.log.exception('No ISBN in URL: %r' % self.url)
        isbn = None
    try:
        (title, series, series_index) = self.parse_title_series(root)
    except:
        self.log.exception('Error parsing title and series for url: %r' % self.url)
        title = series = series_index = None
    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []
    if not title or not authors or not isbn:
        self.log.error('Could not find title/authors/Aladin id for %r' % self.url)
        self.log.error('Aladin: %r Title: %r Authors: %r' % (isbn, title, authors))
        return
    mi = Metadata(title, authors)
    if series:
        mi.series = series
        mi.series_index = series_index
    mi.isbn = isbn
    self.isbn = isbn
    # ISBN-13
    try:
        isbn = self.parse_isbn(root)
        if isbn:
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception('Error parsing ISBN for url: %r' % self.url)
    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
    try:
        self.cover_url = self.parse_cover(root)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_url)
    mi.cover_url = self.cover_url  # This is purely so we can run a test for it!!!
    if mi.has_cover:
        self.log.info('Cover URL: ' + mi.cover_url)
    try:
        mi.publisher = self.parse_publisher(root)
    except:
        self.log.exception('Error parsing publisher for url: %r' % self.url)
    try:
        mi.pubdate = self.parse_published_date(root)
    except:
        self.log.exception('Error parsing published date for url: %r' % self.url)
    mi.language = 'ko'
    mi.source_relevance = self.relevance
    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse_details(self, root):
    try:
        antik_id = self.parse_antik_id(root)
        self.log.info('Parsed Antikvarium identifier: %s' % antik_id)
    except:
        self.log.exception('Error parsing Antikvarium id for url: %r' % self.url)
        antik_id = None
    try:
        title = self.parse_title(root)
        self.log.info('Parsed title: %s' % title)
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        title = None
    try:
        authors = self.parse_authors(root)
        self.log.info('Parsed authors: %s' % authors)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []
    if not title or not authors or not antik_id:
        self.log.error('Could not find title/authors/Antikvarium.hu id for %r' % self.url)
        self.log.error('Antikvarium.hu id: %r Title: %r Authors: %r' % (antik_id, title, authors))
        return
    mi = Metadata(title, authors)
    mi.set_identifier('antik_hu', antik_id)
    self.antik_id = antik_id
    try:
        isbn = self.parse_isbn(root)
        self.log.info('Parsed ISBN: %s' % isbn)
        if isbn:
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception('Error parsing ISBN for url: %r' % self.url)
    try:
        series = self.parse_series(root)
        self.log.info('Parsed series: %s' % series)
    except:
        self.log.exception('Error parsing series for url: %r' % self.url)
        series = None
    try:
        mi.series_index = self.parse_series_index(root)
        self.log.info('Parsed series index: %s' % mi.series_index)
    except:
        self.log.exception('Error parsing series index for url: %r' % self.url)
        mi.series_index = None
    try:
        mi.comments = self.parse_comments(root)
        self.log.info('Parsed comments: %s' % mi.comments)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
    try:
        self.cover_url = self.parse_cover(root)
        self.log.info('Parsed URL for cover: %r' % self.cover_url)
        self.plugin.cache_identifier_to_cover_url(self.antik_id, self.cover_url)
        mi.has_cover = bool(self.cover_url)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    try:
        mi.publisher = self.parse_publisher(root)
        self.log.info('Parsed publisher: %s' % mi.publisher)
    except:
        self.log.exception('Error parsing publisher for url: %r' % self.url)
    try:
        mi.tags = self.parse_tags(root)
        self.log.info('Parsed tags: %s' % mi.tags)
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)
    try:
        mi.pubdate = self.parse_published_date(root)
        self.log.info('Parsed publication date: %s' % mi.pubdate)
    except:
        self.log.exception('Error parsing published date for url: %r' % self.url)
    try:
        mi.languages = self.parse_languages(root)
        self.log.info('Parsed languages: %r' % mi.languages)
    except:
        self.log.exception('Error parsing languages for url: %r' % self.url)
    mi.source_relevance = self.relevance
    if series:
        mi.series = series
    if self.antik_id and self.isbn:
        self.plugin.cache_isbn_to_identifier(self.isbn, self.antik_id)
    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse_details(self, root):
    try:
        kyobobook_id = self.parse_kyobobook_id(self.url)
    except:
        self.log.exception('Error parsing Kyobobook id for url: %r' % self.url)
        kyobobook_id = None
    try:
        (title, series, series_index) = self.parse_title_series(root)
    except:
        self.log.exception('Error parsing title and series for url: %r' % self.url)
        title = series = series_index = None
    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []
    if not title or not authors or not kyobobook_id:
        self.log.error('Could not find title/authors/kyobobook id for %r' % self.url)
        self.log.error('Kyobobook: %r Title: %r Authors: %r' % (kyobobook_id, title, authors))
        return
    mi = Metadata(title, authors)
    if series:
        mi.series = series
        mi.series_index = series_index
    mi.set_identifier('kyobobook', kyobobook_id)
    self.kyobobook_id = kyobobook_id
    try:
        isbn = self.parse_isbn(root)
        if isbn:
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception('Error parsing ISBN for url: %r' % self.url)
    try:
        mi.rating = self.parse_rating(root)
    except:
        self.log.exception('Error parsing ratings for url: %r' % self.url)
    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
    try:
        self.cover_url = self.parse_cover(root)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_url)
    try:
        tags = self.parse_tags(root)
        if tags:
            mi.tags = tags
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)
    try:
        mi.publisher, mi.pubdate = self.parse_publisher_and_date(root)
    except:
        self.log.exception('Error parsing publisher and date for url: %r' % self.url)
    try:
        lang = self._parse_language(root)
        if lang:
            mi.language = lang
    except:
        self.log.exception('Error parsing language for url: %r' % self.url)
    mi.source_relevance = self.relevance
    if self.kyobobook_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.kyobobook_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.kyobobook_id, self.cover_url)
    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse_details(self, root):
    try:
        goodreads_id = self.parse_goodreads_id(self.url)
    except:
        self.log.exception("Error parsing goodreads id for url: %r" % self.url)
        goodreads_id = None
    try:
        (title, series, series_index) = self.parse_title_series(root)
    except:
        self.log.exception("Error parsing title and series for url: %r" % self.url)
        title = series = series_index = None
    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception("Error parsing authors for url: %r" % self.url)
        authors = []
    if not title or not authors or not goodreads_id:
        self.log.error("Could not find title/authors/goodreads id for %r" % self.url)
        self.log.error("Goodreads: %r Title: %r Authors: %r" % (goodreads_id, title, authors))
        return
    mi = Metadata(title, authors)
    if series:
        mi.series = series
        mi.series_index = series_index
    mi.set_identifier("goodreads", goodreads_id)
    self.goodreads_id = goodreads_id
    try:
        isbn = self.parse_isbn(root)
        if isbn:
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception("Error parsing ISBN for url: %r" % self.url)
    try:
        mi.rating = self.parse_rating(root)
    except:
        self.log.exception("Error parsing ratings for url: %r" % self.url)
    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception("Error parsing comments for url: %r" % self.url)
    try:
        self.cover_url = self.parse_cover(root)
    except:
        self.log.exception("Error parsing cover for url: %r" % self.url)
    mi.has_cover = bool(self.cover_url)
    try:
        tags = self.parse_tags(root)
        if tags:
            mi.tags = tags
    except:
        self.log.exception("Error parsing tags for url: %r" % self.url)
    try:
        mi.publisher, mi.pubdate = self.parse_publisher_and_date(root)
    except:
        self.log.exception("Error parsing publisher and date for url: %r" % self.url)
    mi.source_relevance = self.relevance
    if self.goodreads_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.goodreads_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.goodreads_id, self.cover_url)
    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse(self, raw, desc_raw):
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.utils.date import parse_date, utcnow
    import json

    root = parse_html(raw.decode('gb18030'))
    title = root.xpath('//*[@id="name"]/div[1]/text()')
    title = title[0].strip()
    authors = []
    for i in root.xpath('//*[@id="p-author"]/a'):
        authors.append(i.text.strip())
    mi = Metadata(title, authors)

    information = root.xpath('//*[@id="parameter2"]/li')
    info = dict()
    for i in information:
        tmp = etree.tostring(i, method='text', encoding='utf-8').split(u':')
        info[tmp[0].strip()] = tmp[1].strip()

    # Identifiers
    mi.identifiers = self.plugin.identifiers
    mi.identifiers['jd'] = self.sku
    isbn = info['ISBN']
    self.log.error(isbn)
    if isbn:
        mi.isbn = isbn
        self.plugin.cache_isbn_to_identifier(isbn, self.sku)
        mi.identifiers['isbn'] = isbn

    # Publisher
    mi.publisher = info.get(u'出版社')

    # Pubdate
    pubdate = info.get(u'出版时间')
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            self.log.error('Failed to parse pubdate %r' % pubdate)

    # Series
    mi.series = info.get(u'丛书名')

    # Cover
    img = root.xpath('//*[@id="spec-n1"]/img')
    cover = img[0].get('src')
    if cover:
        if not cover.startswith('http'):
            cover = 'https:' + cover
        self.plugin.cache_identifier_to_cover_url(self.sku, cover)
        self.log.error(cover)
    mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None

    # Comments
    # The description arrives as JSONP:
    # showdesc({"date":1583588455348,"content":" ... "})
    try:
        desc = json.loads(desc_raw[9:-1].decode('gb18030'))
        desc_root = parse_html(desc['content'])
        div = desc_root.xpath('//*[@id="detail-tag-id-3"]/div[2]/div/text()')
        comments = div[0]
        mi.comments = comments
    finally:
        return mi
def parse_details(self, raw, root):
    dang_id = parse_dang_id(root, self.log, self.url)
    if not dang_id and root.xpath('//form[@action="/errors/validateCaptcha"]'):
        raise CaptchaError(
            'Amazon returned a CAPTCHA page, probably because you downloaded too many books. Wait for some time and try again.')
    if self.testing:
        import tempfile, uuid
        with tempfile.NamedTemporaryFile(prefix=(dang_id or str(uuid.uuid4())) + '_',
                                         suffix='.html', delete=False) as f:
            f.write(raw)
        print('Downloaded html for', dang_id, 'saved in', f.name)
    try:
        title = self.parse_title(root)
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        title = None
    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []
    if not title or not authors or not dang_id:
        self.log.error('Could not find title/authors/dang_id for %r' % self.url)
        self.log.error('ASIN: %r Title: %r Authors: %r' % (dang_id, title, authors))
        return
    mi = Metadata(title, authors)
    idtype = 'dang'
    mi.set_identifier(idtype, dang_id)
    self.dang_id = dang_id
    try:
        mi.comments = self.parse_comments(root, raw)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
    try:
        series, series_index = self.parse_series(root)
        if series:
            mi.series, mi.series_index = series, series_index
        elif self.testing:
            mi.series, mi.series_index = 'Dummy series for testing', 1
    except:
        self.log.exception('Error parsing series for url: %r' % self.url)
    try:
        mi.tags = self.parse_tags(root)
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)
    try:
        self.cover_url = self.parse_cover(root, raw)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_url)
    pd_info = root.xpath(self.pd_info_xpath)
    pd_info_store = root.xpath(self.pd_info_store_xpath)
    pd_desc = root.xpath(self.pd_desc_xpath)
    if pd_info or pd_info_store:
        try:
            isbn = self.parse_isbn(pd_info, pd_info_store, pd_desc)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception('Error parsing ISBN for url: %r' % self.url)
        if pd_info:
            pd_info = pd_info[0]
        else:
            pd_info = pd_info_store[0]
        try:
            mi.publisher = self.parse_publisher(pd_info)
        except:
            self.log.exception('Error parsing publisher for url: %r' % self.url)
        try:
            mi.pubdate = self.parse_pubdate(pd_info)
        except:
            self.log.exception('Error parsing publish date for url: %r' % self.url)
    else:
        self.log.warning('Failed to find product description for url: %r' % self.url)
    mi.source_relevance = self.relevance
    if self.dang_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.dang_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.dang_id, self.cover_url)
    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse_details(self, raw, root):
    try:
        asin = self.parse_asin(root)
    except:
        self.log.exception('Error parsing asin for url: %r' % self.url)
        asin = None
    if self.testing:
        import tempfile, uuid
        with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4())) + '_',
                                         suffix='.html', delete=False) as f:
            f.write(raw)
        print('Downloaded html for', asin, 'saved in', f.name)
    try:
        title = self.parse_title(root)
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        title = None
    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []
    if not title or not authors or not asin:
        self.log.error('Could not find title/authors/asin for %r' % self.url)
        self.log.error('ASIN: %r Title: %r Authors: %r' % (asin, title, authors))
        return
    mi = Metadata(title, authors)
    idtype = 'amazon' if self.domain == 'com' else 'amazon_' + self.domain
    mi.set_identifier(idtype, asin)
    self.amazon_id = asin
    try:
        mi.rating = self.parse_rating(root)
    except:
        self.log.exception('Error parsing ratings for url: %r' % self.url)
    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
    try:
        series, series_index = self.parse_series(root)
        if series:
            mi.series, mi.series_index = series, series_index
        elif self.testing:
            mi.series, mi.series_index = 'Dummy series for testing', 1
    except:
        self.log.exception('Error parsing series for url: %r' % self.url)
    try:
        mi.tags = self.parse_tags(root)
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)
    try:
        self.cover_url = self.parse_cover(root, raw)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_url)
    non_hero = CSSSelect('div#bookDetails_container_div div#nonHeroSection')(root)
    if non_hero:
        # New style markup
        try:
            self.parse_new_details(root, mi, non_hero[0])
        except:
            self.log.exception('Failed to parse new-style book details section')
    else:
        pd = root.xpath(self.pd_xpath)
        if pd:
            pd = pd[0]
            try:
                isbn = self.parse_isbn(pd)
                if isbn:
                    self.isbn = mi.isbn = isbn
            except:
                self.log.exception('Error parsing ISBN for url: %r' % self.url)
            try:
                mi.publisher = self.parse_publisher(pd)
            except:
                self.log.exception('Error parsing publisher for url: %r' % self.url)
            try:
                mi.pubdate = self.parse_pubdate(pd)
            except:
                self.log.exception('Error parsing publish date for url: %r' % self.url)
            try:
                lang = self.parse_language(pd)
                if lang:
                    mi.language = lang
            except:
                self.log.exception('Error parsing language for url: %r' % self.url)
        else:
            self.log.warning('Failed to find product description for url: %r' % self.url)
    mi.source_relevance = self.relevance
    if self.amazon_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.amazon_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.amazon_id, self.cover_url)
    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse(self, raw):
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.utils.date import parse_only_date, UNDEFINED_DATE
    root = parse_html(raw)
    sku = CSSSelect("div.sku.attGroup")(root)[0]
    info = sku.getparent()
    top = info.getparent().getparent()
    banner = top.find("div")
    spans = banner.findall("span")
    title = ""
    for i, span in enumerate(spans):
        if i == 0 or "12pt" in span.get("style", ""):
            title += astext(span)
        else:
            break
    authors = [re.sub(r"\(.*\)", "", x).strip() for x in astext(spans[-1]).split(",")]
    mi = Metadata(title.strip(), authors)

    # Identifiers
    isbns = [check_isbn(x.strip()) for x in astext(sku).split(",")]
    for isbn in isbns:
        if isbn:
            self.plugin.cache_isbn_to_identifier(isbn, self.sku)
    isbns = sorted(isbns, key=lambda x: len(x) if x else 0, reverse=True)
    if isbns and isbns[0]:
        mi.isbn = isbns[0]
    mi.set_identifier("edelweiss", self.sku)

    # Tags
    bisac = CSSSelect("div.bisac.attGroup")(root)
    if bisac:
        bisac = astext(bisac[0])
        mi.tags = [x.strip() for x in bisac.split(",")]
        mi.tags = [t[1:].strip() if t.startswith("&") else t for t in mi.tags]

    # Publisher
    pub = CSSSelect("div.supplier.attGroup")(root)
    if pub:
        pub = astext(pub[0])
        mi.publisher = pub

    # Pubdate
    pub = CSSSelect("div.shipDate.attGroupItem")(root)
    if pub:
        pub = astext(pub[0])
        parts = pub.partition(":")[0::2]
        pub = parts[1] or parts[0]
        try:
            if ", Ship Date:" in pub:
                pub = pub.partition(", Ship Date:")[0]
            q = parse_only_date(pub, assume_utc=True)
            if q.year != UNDEFINED_DATE:
                mi.pubdate = q
        except:
            self.log.exception("Error parsing published date: %r" % pub)

    # Comments
    comm = ""
    general = CSSSelect("div#pd-general-overview-content")(root)
    if general:
        q = self.render_comments(general[0])
        if q != "<p>No title summary available. </p>":
            comm += q
    general = CSSSelect("div#pd-general-contributor-content")(root)
    if general:
        comm += self.render_comments(general[0])
    general = CSSSelect("div#pd-general-quotes-content")(root)
    if general:
        comm += self.render_comments(general[0])
    if comm:
        mi.comments = comm

    # Cover
    img = CSSSelect("img.title-image[src]")(root)
    if img:
        href = img[0].get("src").replace("jacket_covers/medium/", "jacket_covers/flyout/")
        self.plugin.cache_identifier_to_cover_url(self.sku, href)
    mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None
    return mi
def parse_details(self, root):
    try:
        CBDB_id = self.parse_CBDB_id(self.url)
    except:
        self.log.exception('Error parsing CBDB id for url: %r' % self.url)
        CBDB_id = None
    try:
        (title, series, series_index) = self.parse_title_series(root)
    except:
        self.log.exception('Error parsing title and series for url: %r' % self.url)
        title = series = series_index = None
    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []
    if not title or not authors or not CBDB_id:
        self.log.error('Could not find title/authors/CBDB id for %r' % self.url)
        self.log.error('CBDB: %r Title: %r Authors: %r' % (CBDB_id, title, authors))
        return
    mi = Metadata(title, authors)
    if series:
        mi.series = series
        mi.series_index = series_index
    mi.set_identifier('cbdb', CBDB_id)
    self.CBDB_id = CBDB_id
    try:
        mi.rating = self.parse_rating(root)
    except:
        self.log.exception('Error parsing ratings for url: %r' % self.url)
    # Summary
    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
    try:
        self.cover_urls = self.parse_covers(root)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_urls)
    try:
        tags = self.parse_tags(root)
        if tags:
            mi.tags = tags
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)
    try:
        mi.publisher, mi.pubdate, isbn = self.parse_editions(root)
        if isbn:
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception('Error parsing publisher and date for url: %r' % self.url)
    mi.source_relevance = self.relevance
    mi.language = 'Czech'
    if self.CBDB_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.CBDB_id)
        if self.cover_urls:
            self.plugin.cache_identifier_to_cover_url(self.CBDB_id, self.cover_urls)
    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse_details(self, root):
    isfdb_id = None
    title = None
    authors = []
    isbn = None
    publisher = None
    pubdate = None
    try:
        isfdb_id = re.search(r'(\d+)$', self.url).groups(0)[0]
    except:
        self.log.exception('Error parsing ISFDB ID for url: %r' % self.url)

    detail_nodes = root.xpath('//div[@id="content"]//td[@class="pubheader"]/ul/li')
    if not detail_nodes:
        # No table (on records with no image)
        detail_nodes = root.xpath('//div[@id="content"]/div/ul/li')

    for detail_node in detail_nodes:
        section = detail_node[0].text_content().strip().rstrip(':')
        try:
            if section == 'Publication':
                title = detail_node[0].tail.strip()
                if not title:
                    # Assume an extra span with a transliterated title tooltip
                    title = detail_node[1].text_content().strip()
            elif section == 'Authors' or section == 'Editors':
                for a in detail_node.xpath('.//a'):
                    author = a.text_content().strip()
                    if section.startswith('Editors'):
                        authors.append(author + ' (Editor)')
                    else:
                        authors.append(author)
            elif section == 'ISBN':
                isbn = detail_node[0].tail.strip('[] \n')
            elif section == 'Publisher':
                publisher = detail_node.xpath('a')[0].text_content().strip()
            elif section == 'Date':
                pubdate = self._convert_date_text(detail_node[0].tail.strip())
        except:
            self.log.exception('Error parsing section %r for url: %r' % (section, self.url))

    if not title or not authors or not isfdb_id:
        self.log.error('Could not find title/authors/ISFDB ID for %r' % self.url)
        self.log.error('ISFDB: %r Title: %r Authors: %r' % (isfdb_id, title, authors))
        return

    mi = Metadata(title, authors)
    mi.set_identifier('isfdb', isfdb_id)
    self.isfdb_id = isfdb_id
    if isbn:
        self.isbn = mi.isbn = isbn
    if publisher:
        mi.publisher = publisher
    if pubdate:
        mi.pubdate = pubdate
    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
    try:
        self.cover_url = self.parse_cover(root)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_url)
    mi.cover_url = self.cover_url  # This is purely so we can run a test for it!!!
    mi.source_relevance = self.relevance
    if self.isfdb_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.isfdb_id)
    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse_details(self, root): try: yes24_id = self.parse_yes24_id(self.url) except: self.log.exception('Error parsing YES24 id for url: %r'%self.url) yes24_id = None try: (title, series, series_index) = self.parse_title_series(root) except: self.log.exception('Error parsing title and series for url: %r'%self.url) title = series = series_index = None try: authors = self.parse_authors(root) except: self.log.exception('Error parsing authors for url: %r'%self.url) authors = [] if not title or not authors or not yes24_id: self.log.error('Could not find title/authors/YES24 id for %r'%self.url) self.log.error('YES24: %r Title: %r Authors: %r'%(yes24_id, title, authors)) return mi = Metadata(title, authors) if series: mi.series = series mi.series_index = series_index mi.set_identifier('yes24', yes24_id) self.yes24_id = yes24_id try: isbn = self.parse_isbn(root) if isbn: self.isbn = mi.isbn = isbn except: self.log.exception('Error parsing ISBN for url: %r'%self.url) try: mi.comments = self.parse_comments(root) except: self.log.exception('Error parsing comments for url: %r'%self.url) try: self.cover_url = self.parse_cover(root) except: self.log.exception('Error parsing cover for url: %r'%self.url) mi.has_cover = bool(self.cover_url) mi.cover_url = self.cover_url # This is purely so we can run a test for it!!! try: mi.publisher = self.parse_publisher(root) except: self.log.exception('Error parsing publisher for url: %r'%self.url) try: mi.pubdate = self.parse_published_date(root) except: self.log.exception('Error parsing published date for url: %r'%self.url) mi.language = 'ko' mi.source_relevance = self.relevance if self.yes24_id: if self.isbn: self.plugin.cache_isbn_to_identifier(self.isbn, self.yes24_id) self.plugin.clean_downloaded_metadata(mi) self.result_queue.put(mi)
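These workers all end with the same handshake: map the ISBN to the site identifier, and when available map the identifier to a cover URL, so calibre can reconnect identifiers and covers in later download phases. A minimal sketch of that step, assuming plugin is a calibre Source instance; cache_isbn_to_identifier and cache_identifier_to_cover_url are the plugin APIs the workers above already call:

def cache_lookup_keys(plugin, site_id, isbn=None, cover_url=None):
    # Record ISBN -> site identifier so a later search by ISBN can be
    # redirected to the already-parsed record.
    if isbn:
        plugin.cache_isbn_to_identifier(isbn, site_id)
    # Record identifier -> cover URL so the cover download phase can
    # skip re-scraping the detail page.
    if cover_url:
        plugin.cache_identifier_to_cover_url(site_id, cover_url)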
def parse_details(self, root): try: moly_id = self.parse_moly_id(self.url) self.log.info('Parsed moly.hu identifier: %s'%moly_id) except: self.log.exception('Error parsing moly.hu id for url: %r'%self.url) moly_id = None try: title = self.parse_title(root) self.log.info('Parsed title: %s'%title) except: self.log.exception('Error parsing title for url: %r'%self.url) title = None try: authors = self.parse_authors(root) self.log.info('Parsed authors: %s'%authors) except: self.log.exception('Error parsing authors for url: %r'%self.url) authors = [] if not title or not authors or not moly_id: self.log.error('Could not find title/authors/moly.hu id for %r'%self.url) self.log.error('Moly.hu id: %r Title: %r Authors: %r'%(moly_id, title, authors)) return mi = Metadata(title, authors) mi.set_identifier('moly_hu', moly_id) self.moly_id = moly_id try: isbn = self.parse_isbn(root) self.log.info('Parsed ISBN: %s'%isbn) if isbn: self.isbn = mi.isbn = isbn except: self.log.exception('Error parsing ISBN for url: %r'%self.url) try: series_info = self.parse_series(root) if series_info is not None: mi.series = series_info[0] mi.series_index = int(series_info[1]) self.log.info('Parsed series: %s, series index: %f'%(mi.series,mi.series_index)) except: self.log.exception('Error parsing series for url: %r'%self.url) try: mi.comments = self.parse_comments(root) self.log.info('Parsed comments: %s'%mi.comments) except: self.log.exception('Error parsing comments for url: %r'%self.url) try: self.cover_url = self.parse_covers(root) self.log.info('Parsed URL for cover: %r'%self.cover_url) self.plugin.cache_identifier_to_cover_url(self.moly_id, self.cover_url) mi.has_cover = bool(self.cover_url) except: self.log.exception('Error parsing cover for url: %r'%self.url) try: mi.tags = self.parse_tags(root) self.log.info('Parsed tags: %s'%mi.tags) except: self.log.exception('Error parsing tags for url: %r'%self.url) try: mi.languages = self.parse_languages(mi.tags) self.log.info('Parsed languages: %r'%mi.languages) except: self.log.exception('Error parsing language for url: %r'%self.url) try: mi.publisher = self.parse_publisher(root) self.log.info('Parsed publisher: %s'%mi.publisher) except: self.log.exception('Error parsing publisher for url: %r'%self.url) try: mi.pubdate = self.parse_published_date(root) self.log.info('Parsed publication date: %s'%mi.pubdate) except: self.log.exception('Error parsing published date for url: %r'%self.url) try: mi.rating = self.parse_rating(root) self.log.info('Parsed rating: %s\n\n'%mi.rating) except: self.log.exception('Error parsing rating for url: %r\n\n'%self.url) mi.source_relevance = self.relevance if self.moly_id and self.isbn: self.plugin.cache_isbn_to_identifier(self.isbn, self.moly_id) self.plugin.clean_downloaded_metadata(mi) self.result_queue.put(mi)
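The moly.hu worker derives mi.languages from the already-parsed tags rather than from a dedicated page field. A sketch of one way parse_languages could work under that contract; the tag markers in LANG_TAGS are illustrative guesses, not the plugin's actual mapping:

# Hypothetical mapping from moly.hu tag text to calibre language codes.
LANG_TAGS = {
    'magyar': 'hu',
    'angol': 'en',
    'német': 'de',
}

def languages_from_tags(tags):
    # Collect every language whose marker appears in the record's tags,
    # preserving first-seen order and avoiding duplicates.
    found = []
    for tag in tags or []:
        for marker, code in LANG_TAGS.items():
            if marker in tag.lower() and code not in found:
                found.append(code)
    return found

# e.g. languages_from_tags(['angol irodalom', 'regény']) -> ['en']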
def parse(self, raw): from calibre.ebooks.metadata.book.base import Metadata from calibre.utils.date import parse_only_date, UNDEFINED_DATE from css_selectors import Select root = parse_html(raw) selector = Select(root) sku = next(selector('div.sku.attGroup')) info = sku.getparent() top = info.getparent().getparent() banner = top.find('div') spans = banner.findall('span') title = '' for i, span in enumerate(spans): if i == 0 or '12pt' in span.get('style', ''): title += astext(span) else: break authors = [ re.sub(r'\(.*\)', '', x).strip() for x in astext(spans[-1]).split(',') ] mi = Metadata(title.strip(), authors) # Identifiers isbns = [check_isbn(x.strip()) for x in astext(sku).split(',')] for isbn in isbns: if isbn: self.plugin.cache_isbn_to_identifier(isbn, self.sku) isbns = sorted(isbns, key=lambda x: len(x) if x else 0, reverse=True) if isbns and isbns[0]: mi.isbn = isbns[0] mi.set_identifier('edelweiss', self.sku) # Tags bisac = tuple(selector('div.bisac.attGroup')) if bisac: bisac = astext(bisac[0]) mi.tags = [x.strip() for x in bisac.split(',')] mi.tags = [ t[1:].strip() if t.startswith('&') else t for t in mi.tags ] # Publisher pub = tuple(selector('div.supplier.attGroup')) if pub: pub = astext(pub[0]) mi.publisher = pub # Pubdate pub = tuple(selector('div.shipDate.attGroupItem')) if pub: pub = astext(pub[0]) parts = pub.partition(':')[0::2] pub = parts[1] or parts[0] try: if ', Ship Date:' in pub: pub = pub.partition(', Ship Date:')[0] q = parse_only_date(pub, assume_utc=True) if q.year != UNDEFINED_DATE.year: mi.pubdate = q except: self.log.exception('Error parsing published date: %r' % pub) # Comments comm = '' general = tuple(selector('div#pd-general-overview-content')) if general: q = self.render_comments(general[0]) if q != '<p>No title summary available. </p>': comm += q general = tuple(selector('div#pd-general-contributor-content')) if general: comm += self.render_comments(general[0]) general = tuple(selector('div#pd-general-quotes-content')) if general: comm += self.render_comments(general[0]) if comm: mi.comments = comm # Cover img = tuple(selector('img.title-image[src]')) if img: href = img[0].get('src').replace('jacket_covers/medium/', 'jacket_covers/flyout/') self.plugin.cache_identifier_to_cover_url(self.sku, href) mi.has_cover = self.plugin.cached_identifier_to_cover_url( self.sku) is not None return mi
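The Edelweiss parser above can find several ISBNs in the sku field; it validates each candidate and keeps the longest so an ISBN-13 wins over an ISBN-10. A compact sketch of that selection, assuming calibre's check_isbn (which returns None for invalid candidates) is importable as shown; the helper name is mine:

from calibre.ebooks.metadata import check_isbn

def pick_best_isbn(sku_text):
    # Validate each comma-separated candidate; check_isbn() returns None
    # for anything that is not a plausible ISBN.
    candidates = [check_isbn(x.strip()) for x in sku_text.split(',')]
    candidates = [c for c in candidates if c]
    # Prefer ISBN-13 over ISBN-10 by taking the longest survivor.
    return max(candidates, key=len) if candidates else None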
def extract_vol_details(self, vol_url): # Here we extract and format the information from the chosen volume. # - The first name and last name to populate author and author sort : vol_auteur_prenom and vol_auteur_nom # - The title of the volume : vol_title # - The series name the volume is part of : vol_serie # - The sequence number in the series : vol_serie_seq # missing # - The publisher of this volume : vol_editor # - The publisher's collection of this volume : vol_coll # - The collection serial code of this volume : vol_coll_srl # - The "dépôt légal" (legal deposit) date (the actual publication date is mostly unknown) : vol_dp_lgl # date format to be computed # - The ISBN number associated with the volume : vol_isbn # - The volume tags : vol_genre # - The url pointer to the volume cover image : vol_cover_index # - The comments include various info about the book : vol_comment_soup # . reference, a url pointer to noosfere # . couverture, a url pointer to noosfere; the cover may be really small, but it is accurate to the volume # . first edition information # . series (cycle) name and number # . this volume's publisher info # . Résumé (the back-cover blurb, "quatrième de couverture") # . Critiques (reviews) # . Sommaire (table of contents) detailing which novels are in the volume when it is an anthology # . reviews of the series and/or of another volume of the book # debug = self.dbg_lvl & 2 self.log.info(self.who, "\nIn extract_vol_details(soup)") if debug: self.log.info(self.who, "vol_url : ", vol_url) if debug: self.log.info( self.who, "calling ret_soup(log, dbg_lvl, br, url, rkt=None, who='[__init__]')" ) self.log.info(self.who, "vol_url : ", vol_url, "who : ", self.who) rsp = ret_soup(self.log, self.dbg_lvl, self.br, vol_url, who=self.who) soup = rsp[0] url_vrai = rsp[1].replace("&Tri=3", "") # if debug: self.log.info(self.who,soup.prettify()) # useful but too big... self.nsfr_id = self.nsfr_id + "$vl$" + url_vrai.replace( '?', '&').replace('=', '&').split('&')[2] # self.nsfr_id = (self.nfsr_id).strip("$") # If I use this form, it gives this error: 'Worker' object has no attribute 'nfsr_id' ??? (the attribute name is misspelled: nfsr_id instead of nsfr_id) 
tmp = self.nsfr_id self.nsfr_id = tmp.strip('$') if debug: self.log.info(self.who, "self.nsfr_id, type() : ", self.nsfr_id, type(self.nsfr_id)) tmp_lst = [] vol_info = {} vol_title = "" vol_auteur = "" vol_auteur_prenom = "" vol_auteur_nom = "" vol_serie = "" vol_serie_seq = "" vol_editor = "" vol_coll = "" vol_coll_srl = "" vol_dp_lgl = "" vol_isbn = "" vol_genre = "" vol_cover_index = "" comment_generic = None comment_resume = None comment_Critiques = None comment_Sommaire = None comment_AutresCritique = None comment_cover = None comment_decoupage_annexe = None # add volume address as a reference in the comment vol_comment_soup = BS( '<div><p>Référence: <a href="' + url_vrai + '">' + url_vrai + '</a></p></div>', "lxml") if debug: self.log.info(self.who, "vol reference processed") if soup.select("span[class='TitreNiourf']"): vol_title = soup.select( "span[class='TitreNiourf']")[0].text.strip() if debug: self.log.info(self.who, "vol_title processed : ", vol_title) if soup.select("span[class='AuteurNiourf']"): vol_auteur = soup.select( "span[class='AuteurNiourf']")[0].text.replace("\n", "").strip() if debug: self.log.info(self.who, "vol_auteur processed : ", vol_auteur) for i in range(len(vol_auteur.split())): if not vol_auteur.split()[i].isupper(): vol_auteur_prenom += " " + vol_auteur.split()[i] else: vol_auteur_nom += " " + vol_auteur.split()[i].title() vol_auteur = vol_auteur.title() vol_auteur_prenom = vol_auteur_prenom.strip() if debug: self.log.info(self.who, "vol_auteur_prenom processed : ", vol_auteur_prenom) vol_auteur_nom = vol_auteur_nom.strip() if debug: self.log.info(self.who, "vol_auteur_nom processed : ", vol_auteur_nom) if soup.select("a[href*='serie.asp']"): if soup.select("a[href*='serie.asp']")[0].find_parent( "span", {"class": "ficheNiourf"}): vol_serie = soup.select("a[href*='serie.asp']")[0].text tmp_vss = [ x for x in soup.select("a[href*='serie.asp']") [0].parent.stripped_strings ] for i in range(len(tmp_vss)): if "vol." 
in tmp_vss[i]: if not vol_serie_seq: vol_serie_seq = tmp_vss[i].replace("vol.", "").strip() if "découpage" in tmp_vss[i]: dec_anx_url = "https://www.noosfere.org/livres/" + soup.select( "a[href*='serie.asp']")[0]['href'] comment_pre_decoupage_annexe = BS( '<div><p> </p><p style="font-weight: 600; font-size: 18px"> Découpage annexe</p><hr style="color:CCC;"/></div>', "lxml") comment_decoupage_annexe = self.get_decoupage_annexe( dec_anx_url) if debug: self.log.info(self.who, "vol_serie, vol_serie_seq processed : ", vol_serie, ",", vol_serie_seq) comment_generic = soup.select("span[class='ficheNiourf']")[0] new_div = soup.new_tag('div') comment_generic = comment_generic.wrap(new_div) if debug: self.log.info(self.who, "comment_generic processed") if soup.select("a[href*='editeur.asp']"): vol_editor = soup.select("a[href*='editeur.asp']")[0].text if debug: self.log.info(self.who, "vol_editor processed : ", vol_editor) if soup.select("a[href*='collection.asp']"): vol_coll = soup.select("a[href*='collection.asp']")[0].text if debug: self.log.info(self.who, "vol_coll : ", vol_coll) for i in comment_generic.stripped_strings: tmp_lst.append(str(i)) vol_coll_srl = tmp_lst[len(tmp_lst) - 1] if "n°" in vol_coll_srl: for k in ["n°", "(", ")"]: if k in vol_coll_srl: vol_coll_srl = vol_coll_srl.replace(k, "") vol_coll_srl = vol_coll_srl.strip() vol_coll_srl = vol_coll_srl.split("/")[0] if vol_coll_srl[0].isnumeric(): vol_coll_srl = ("0" * 5 + vol_coll_srl)[-6:] else: vol_coll_srl = "" if debug: self.log.info(self.who, "vol_coll_srl processed : ", vol_coll_srl) # publication date is largely ignored in noosfere, but we have the "dépôt légal" date and I use it instead # note that I 'calculate' the missing day of the month and even sometimes the missing month ms = ("janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août", "septembre", "octobre", "novembre", "décembre") for elemnt in soup.select_one( "span[class='sousFicheNiourf']").stripped_strings: if debug: self.log.info(self.who, "elemnt : ", elemnt) if not vol_dp_lgl: elemn = (elemnt.replace("Dépôt légal :", "").split(','))[0].strip() if elemn: if elemn.isnumeric() and len(elemn) == 4: vol_dp_lgl = datetime.datetime.strptime( "175 " + elemn, "%j %Y") elif "semestre" in elemn: ele = elemn.split() vol_dp_lgl = datetime.datetime.strptime( ("000" + str((int(ele[0][0]) - 1) * 175 + 97))[-3:] + " " + ele[2], "%j %Y") elif "trimestre" in elemn: ele = elemn.split() vol_dp_lgl = datetime.datetime.strptime( ("000" + str((int(ele[0][0]) - 1) * 91 + 47))[-3:] + " " + ele[2], "%j %Y") else: for i in range(len(ms)): if ms[i] in elemn: ele = elemn.split() vol_dp_lgl = datetime.datetime.strptime( ("000" + str(10 + 31 * i))[-3:] + " " + ele[1], "%j %Y") break if debug: self.log.info(self.who, "vol_dp_lgl : ", vol_dp_lgl) if "ISBN" in elemnt: vol_isbn = elemnt.lower().replace(" ", "").replace('isbn:', '') if "néant" in vol_isbn: vol_isbn = "" if debug: self.log.info(self.who, "vol_isbn processed : ", vol_isbn) if "Genre" in elemnt: # str.lstrip("Genre : ") strips a character set, not a prefix, and could eat the genre's leading letters vol_genre = elemnt.replace("Genre :", "").strip() if debug: self.log.info(self.who, "vol_genre processed : ", vol_genre) if soup.select("img[name='couverture']"): for elemnt in repr( soup.select("img[name='couverture']")[0]).split('"'): if "http" in elemnt: if not vol_cover_index: vol_cover_index = elemnt if debug: self.log.info(self.who, "vol_cover_index processed : ", vol_cover_index) # add cover image address as a reference in the comment if vol_cover_index: comment_cover = BS( '<div><p>Couverture: <a href="' + vol_cover_index + '">' + 
vol_cover_index + '</a></p></div>', "lxml") # select the fields I want... More exist, such as film adaptations or reading recommendations # but they are not quite consistent across all the books (noosfere is a shared database maintained by many people) # and besides I have enough info as it is AND I do NOT want to take away noosfere's business tmp_comm_lst = soup.select("span[class='AuteurNiourf']") if debug: self.log.info(self.who, tmp_comm_lst) # useful but too long for i in range(len(tmp_comm_lst)): if "Quatrième de couverture" in str(tmp_comm_lst[i]): comment_resume = tmp_comm_lst[i].find_parents( "div", {'class': 'sousbloc'})[0] if debug: self.log.info(self.who, "comment_resume processed") if "Critiques" in str(tmp_comm_lst[i]): if not "autres" in str(tmp_comm_lst[i]): comment_Critiques = tmp_comm_lst[i].find_parents( "div", {'class': 'sousbloc'})[0] if debug: self.log.info(self.who, "comment_Critiques processed") if "Sommaire" in str(tmp_comm_lst[i]): comment_Sommaire = tmp_comm_lst[i].find_parents( "div", {'class': 'sousbloc'})[0] if debug: self.log.info(self.who, "comment_Sommaire processed") if "Critiques des autres" in str(tmp_comm_lst[i]): comment_AutresCritique = tmp_comm_lst[i].find_parents( "div", {'class': 'sousbloc'})[0] if comment_AutresCritique.select('a[href*="serie.asp"]') and ( "Critique de la série" in comment_AutresCritique. select('a[href*="serie.asp"]')[0].text): critic_url = "https://www.noosfere.org/livres/" + comment_AutresCritique.select( 'a[href*="serie.asp"]')[0]['href'] try: more_comment_AutresCritique = self.get_Critique_de_la_serie( critic_url) comment_AutresCritique.append( more_comment_AutresCritique) except: self.log.exception( "get_Critique_de_la_serie failed for url: ", critic_url) if debug: self.log.info(self.who, "comment_AutresCritique processed") # group all the fields I think I want into one big bundle... (It is difficult not to include more... :-)) if comment_cover: vol_comment_soup.append(comment_cover) if comment_generic: vol_comment_soup.append(comment_generic) if comment_resume: vol_comment_soup.append(comment_resume) if comment_Critiques: vol_comment_soup.append(comment_Critiques) if comment_Sommaire: vol_comment_soup.append(comment_Sommaire) if comment_AutresCritique: vol_comment_soup.append(comment_AutresCritique) if comment_decoupage_annexe: vol_comment_soup.append( comment_pre_decoupage_annexe) # this is the title vol_comment_soup.append(comment_decoupage_annexe) # # Make a minimum of "repair" on vol_comment_soup so that it displays correctly (the way I like it) in the comments and in my catalogs # - I hate justified text when it makes the margins "float" around the correct position (that is, when spaces are used instead of absolute positioning) # - I like to have working urls when they exist # - I like to find the next and/or previous books in a series (the simulated arrows are links :-) ) for elemnt in vol_comment_soup.select('[align="justify"]'): del elemnt['align'] # remove all double or triple 'br' to improve presentation. # Note: tmp1 and tmp2 must contain a different value from any possible first elemnt. (yes, I am lrp and I am unique :-) ) # # yeah, but then: if I modify comment_generic AFTER it has been integrated into vol_comment_soup, there is only one copy in memory... # so vol_comment_soup is modified as well... 
# tmp1 = tmp2 = "lrp_the_unique" for elemnt in vol_comment_soup.findAll(): tmp1, tmp2 = tmp2, elemnt if tmp1 == tmp2: elemnt.extract() # create a fresh <br/> inside the loop; reusing a single tag would only move it to the last heading for elemnt in vol_comment_soup.select('.AuteurNiourf'): br = soup.new_tag('br') elemnt.insert(0, br) elemnt["style"] = "font-weight: 600; font-size: 18px" if debug: for elemnt in vol_comment_soup.select("a[href*='.asp']"): if 'http' not in elemnt.get('href'): self.log.info(self.who, "incomplete url before correction: ", elemnt) for elemnt in vol_comment_soup.select("a[href*='/livres/auteur.asp']"): if 'http' not in elemnt.get('href'): elemnt["href"] = elemnt["href"].replace( "/livres/auteur.asp", "https://www.noosfere.org/livres/auteur.asp") for elemnt in vol_comment_soup.select("a[href*='/livres/niourf.asp']"): if 'http' not in elemnt.get('href'): elemnt["href"] = elemnt["href"].replace( "/livres/niourf.asp", "https://www.noosfere.org/livres/niourf.asp") for elemnt in vol_comment_soup.select("a[href*='/heberg/']"): if 'http' not in elemnt.get('href'): elemnt["href"] = elemnt["href"].replace( "/heberg/", "https://www.noosfere.org/heberg/") for elemnt in vol_comment_soup.select( "a[href*='./EditionsLivre.asp']"): if 'http' not in elemnt.get('href'): elemnt["href"] = elemnt["href"].replace( "./EditionsLivre.asp", "https://www.noosfere.org/livres/EditionsLivre.asp") for elemnt in vol_comment_soup.select("a[href*='./niourf.asp']"): if 'http' not in elemnt.get('href'): elemnt["href"] = elemnt["href"].replace( "./niourf.asp", "https://www.noosfere.org/livres/niourf.asp") for elemnt in vol_comment_soup.select("a[href*='heberg']"): if 'http' not in elemnt.get('href'): elemnt["href"] = elemnt["href"].replace( "../../heberg", "https://www.noosfere.org/heberg") for elemnt in vol_comment_soup.select("a[href*='../bd']"): if 'http' not in elemnt.get('href'): elemnt["href"] = elemnt["href"].replace( "../bd", "https://www.noosfere.org/bd") for elemnt in vol_comment_soup.select("a[href*='auteur.asp']"): if 'http' not in elemnt.get('href'): elemnt["href"] = elemnt["href"].replace( "auteur.asp", "https://www.noosfere.org/livres/auteur.asp") for elemnt in vol_comment_soup.select("a[href*='collection.asp']"): if 'http' not in elemnt.get('href'): elemnt["href"] = elemnt["href"].replace( "collection.asp", "https://www.noosfere.org/livres/collection.asp") for elemnt in vol_comment_soup.select("a[href*='critsign.asp']"): if 'http' not in elemnt.get('href'): elemnt["href"] = elemnt["href"].replace( "critsign.asp", "https://www.noosfere.org/livres/critsign.asp") for elemnt in vol_comment_soup.select("a[href*='EditionsLivre.asp']"): if 'http' not in elemnt.get('href'): elemnt["href"] = elemnt["href"].replace( "EditionsLivre.asp", "https://www.noosfere.org/livres/EditionsLivre.asp") for elemnt in vol_comment_soup.select("a[href*='editeur.asp']"): if 'http' not in elemnt.get('href'): elemnt["href"] = elemnt["href"].replace( "editeur.asp", "https://www.noosfere.org/livres/editeur.asp") for elemnt in vol_comment_soup.select("a[href*='editionslivre.asp']"): if 'http' not in elemnt.get('href'): elemnt["href"] = elemnt["href"].replace( "editionslivre.asp", "https://www.noosfere.org/livres/editionslivre.asp") for elemnt in vol_comment_soup.select("a[href*='niourf.asp']"): if 'http' not in elemnt.get('href'): elemnt["href"] = elemnt["href"].replace( "niourf.asp", "https://www.noosfere.org/livres/niourf.asp") for elemnt in vol_comment_soup.select("a[href*='serie.asp']"): if 'http' not in elemnt.get('href'): elemnt["href"] = elemnt["href"].replace( "serie.asp", 
"https://www.noosfere.org/livres/serie.asp") if debug: for elemnt in vol_comment_soup.select("a[href*='.asp']"): if 'http' not in elemnt.get('href'): self.log.info(self.who, "url incomplet apres correction: ", elemnt) fg, fd = "<<==", "==>>" #chr(0x21D0),chr(0x21D2) #chr(0x27f8),chr(0x27f9) for elemnt in vol_comment_soup.select("img[src*='arrow_left']"): elemnt.replace_with(fg) for elemnt in vol_comment_soup.select("img[src*='arrow_right']"): elemnt.replace_with(fd) # depending on the tick box, make a fat publisher using seperators that have a very low probability to pop up (§ and €) # only set vol_coll_srl if vol_coll exists # the idea is to use search and replace in the edit Metadata in bulk window. if self.extended_publisher: if debug: self.log.info( self.who, """flag : "Ajoute collection et son numéro d'ordre au champ èditeur" set""" ) if vol_coll: if debug: self.log.info(self.who, 'add collection') vol_editor = vol_editor + ('§') + vol_coll if vol_coll_srl: if debug: self.log.info(self.who, 'add collection number') vol_editor = vol_editor + ('€') + vol_coll_srl if vol_serie: if vol_serie_seq.isnumeric(): vol_serie_seq = float(vol_serie_seq) else: vol_serie_seq = 1.0 # UTF-8 characters may be serialized different ways, only xmlcharrefreplace produces xml compatible strings # any other non ascii character with another utf-8 byte representation will make calibre behave with the messsage: # ValueError: All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters # Side note: # I have no real good url structure(i once got html 3 times, div a sibling of html...), but calibre does not seems to care (nice :-) ) # # Ca m'a pris un temps fou pour trouver, par hazard, que encode('ascii','xmlcharrefreplace') aidait bien... # (enfin, quasi par hazard, j' ai essayé tout ce qui pouvait ameliorer la compatibilité avec xml... 
mais je # lisais mal et je pensais à une incompatibilité avec la structure xml), # vol_comment_soup = vol_comment_soup.encode('ascii', 'xmlcharrefreplace') self.log.info(self.who, "+++" * 25) self.log.info(self.who, "nsfr_id, type() : ", self.nsfr_id, type(self.nsfr_id)) # must be <class 'str'> self.log.info(self.who, "relevance, type() : ", self.relevance, type(self.relevance)) # must be <class 'float'> self.log.info(self.who, "vol_title, type() : ", vol_title, type(vol_title)) # must be <class 'str'> self.log.info( self.who, "vol_auteur, type() : ", vol_auteur, type(vol_auteur)) # must be <class 'list'> of <class 'str'> self.log.info(self.who, "vol_auteur_prenom, type() : ", vol_auteur_prenom, type(vol_auteur_prenom)) # must be <class 'str'> self.log.info(self.who, "vol_auteur_nom, type() : ", vol_auteur_nom, type(vol_auteur_nom)) # must be <class 'str'> if vol_serie: self.log.info(self.who, "vol_serie, type() : ", vol_serie, type(vol_serie)) # must be <class 'str'> self.log.info(self.who, "vol_serie_seq, type() : ", vol_serie_seq, type(vol_serie_seq)) # must be <class 'float'> self.log.info(self.who, "vol_editor, type() : ", vol_editor, type(vol_editor)) # must be <class 'str'> self.log.info(self.who, "vol_coll, type() : ", vol_coll, type(vol_coll)) # must be <class 'str'> self.log.info(self.who, "vol_coll_srl, type() : ", vol_coll_srl, type(vol_coll_srl)) # must be <class 'str'> self.log.info( self.who, "vol_dp_lgl, type() : ", vol_dp_lgl, type(vol_dp_lgl) ) # must be <class 'datetime.datetime'> ('renderer=isoformat') self.log.info(self.who, "vol_isbn, type() : ", vol_isbn, type(vol_isbn)) # must be <class 'str'> self.log.info( self.who, "vol_genre, type() : ", vol_genre, type(vol_genre)) # must be <class 'list'> of <class 'str'> self.log.info(self.who, "vol_cover_index, type() : ", vol_cover_index, type(vol_cover_index)) # must be self.log.info(self.who, "type(vol_comment_soup) : ", type(vol_comment_soup) ) # must be byte encoded (start with b'blablabla... # self.log.info(self.who,"vol_comment_soup :\n",vol_comment_soup) # Maybe a bit long sometimes # language must be <class 'str'> if vol_cover_index: self.plugin.cache_identifier_to_cover_url(self.nsfr_id, vol_cover_index) if vol_isbn: self.plugin.cache_isbn_to_identifier(vol_isbn, self.nsfr_id) mi = Metadata(vol_title, [vol_auteur]) mi.set_identifier('nsfr_id', self.nsfr_id) mi.publisher = vol_editor mi.isbn = vol_isbn mi.tags = [vol_genre] mi.source_relevance = self.relevance mi.has_cover = bool(vol_cover_index) if vol_dp_lgl: mi.pubdate = vol_dp_lgl if vol_serie: mi.series = vol_serie mi.series_index = vol_serie_seq mi.language = "fra" mi.comments = vol_comment_soup if debug: self.log.info(self.who, "mi\n", mi, "\n") self.plugin.clean_downloaded_metadata(mi) self.result_queue.put(mi)
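Since noosfere records usually carry only a partial "dépôt légal" (legal deposit) date, the worker above approximates a full datetime from whatever granularity is present: a bare year lands mid-year, a semester or trimester lands near its middle, and a bare month gets an arbitrary day inside it. A standalone sketch of that arithmetic, extracted from the parsing loop above; the function name is mine:

import datetime

# Month names as they appear on noosfere (same tuple as in the worker above).
MONTHS = ("janvier", "février", "mars", "avril", "mai", "juin",
          "juillet", "août", "septembre", "octobre", "novembre", "décembre")

def approximate_depot_legal(text):
    # 'text' is the cleaned dépôt légal fragment, e.g. "2001",
    # "2e semestre 2001", "3e trimestre 2001" or "mars 2001".
    parts = text.split()
    if text.isnumeric() and len(text) == 4:
        # bare year: pin to day-of-year 175, roughly mid-year
        return datetime.datetime.strptime("175 " + text, "%j %Y")
    if "semestre" in text:
        # middle of the 1st or 2nd half-year: day 97 or 272
        day = (int(parts[0][0]) - 1) * 175 + 97
        return datetime.datetime.strptime("%03d %s" % (day, parts[2]), "%j %Y")
    if "trimestre" in text:
        # middle of the quarter: day 47, 138, 229 or 320
        day = (int(parts[0][0]) - 1) * 91 + 47
        return datetime.datetime.strptime("%03d %s" % (day, parts[1 + 1]), "%j %Y")
    for i, month in enumerate(MONTHS):
        if month in text:
            # known month, arbitrary day inside it: day 10 + 31 * month index
            day = 10 + 31 * i
            return datetime.datetime.strptime("%03d %s" % (day, parts[1]), "%j %Y")
    return None

# e.g. approximate_depot_legal("2e semestre 2001") -> datetime(2001, 9, 29, 0, 0)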