def parse(self, xml_detail):
    """Assemble a calibre Metadata object from the parsed detail XML.

    All field parsers run unconditionally (they may log); the record is
    only built when both a title and authors were found, otherwise None
    is returned.
    """
    title = self.parse_title(xml_detail)
    authors = self.parse_authors(xml_detail)
    comments = self.parse_comments(xml_detail)
    rating = self.parse_rating(xml_detail)
    isbn = self.parse_isbn(xml_detail)
    publisher = self.parse_publisher(xml_detail)
    pub_year = self.parse_pubdate(xml_detail)
    tags = self.parse_tags(xml_detail)
    serie, serie_index = self.parse_serie(xml_detail)
    cover = self.parse_cover(xml_detail)

    # Guard clause: without title and authors there is no usable record.
    if title is None or authors is None:
        return None

    mi = Metadata(title, authors)
    mi.languages = {'ces'}
    mi.comments = as_unicode(comments)
    mi.identifiers = {self.plugin.name: str(self.number)}
    mi.rating = rating
    mi.tags = tags
    mi.publisher = publisher
    mi.pubdate = pub_year
    mi.isbn = isbn
    mi.series = serie
    mi.series_index = serie_index
    mi.cover_url = cover
    if cover:
        self.plugin.cache_identifier_to_cover_url(str(self.number), cover)
    return mi
def _parse_work(self, work):
    """Convert a list of works returned in CrossRef JSON to Calibre Metadata objects."""
    # A title and authors are mandatory for a Metadata record.
    mi = Metadata(work.get('title')[0], self._parse_authors(work))

    doi = work.get('DOI')
    if doi:
        mi.set_identifier('doi', doi)

    pubdate = self._parse_pubdate(work)
    if pubdate:
        mi.pubdate = pubdate

    publisher = self._parse_publisher(work)
    if publisher:
        mi.publisher = publisher

    series = self._parse_series(work)
    if series:
        # series is a (name, index) pair.
        mi.series, mi.series_index = series[0], series[1]
    return mi
def test(scale=0.5):
    """Show a grid previewing every color theme x cover style combination."""
    from PyQt5.Qt import QLabel, QApplication, QPixmap, QMainWindow, QWidget, QScrollArea, QGridLayout
    app = QApplication([])
    mi = Metadata('xxx', ['Kovid Goyal', 'John Q. Doe', 'Author'])
    mi.series = 'A series of styles'
    m = QMainWindow()
    sa = QScrollArea(m)
    w = QWidget(m)
    sa.setWidget(w)
    grid = QGridLayout(w)
    w.setLayout(grid)
    grid.setSpacing(30)
    labels = []
    for row, color in enumerate(sorted(default_color_themes)):
        for col, style in enumerate(sorted(all_styles())):
            mi.series_index = col + 1
            mi.title = 'An algorithmic cover [%s]' % color
            prefs = override_prefs(cprefs, override_color_theme=color, override_style=style)
            # Shrink all size-related prefs so the full grid fits on screen.
            for key in ('cover_width', 'cover_height', 'title_font_size',
                        'subtitle_font_size', 'footer_font_size'):
                prefs[key] = int(scale * prefs[key])
            img = generate_cover(mi, prefs=prefs, as_qimage=True)
            la = QLabel()
            la.setPixmap(QPixmap.fromImage(img))
            grid.addWidget(la, row, col)
            labels.append(la)
    m.setCentralWidget(sa)
    w.resize(w.sizeHint())
    m.show()
    app.exec_()
def build_meta(log, issue_id): """Build metadata record based on comicvine issue_id""" issue = pycomicvine.Issue( issue_id, field_list=[ "id", "name", "volume", "issue_number", "person_credits", "description", "store_date", "cover_date", ], ) if not issue or not issue.volume: log.warn("Unable to load Issue(%d)" % issue_id) return None title = "%s #%s" % (issue.volume.name, issue.issue_number) if issue.name: title = title + ": %s" % (issue.name) authors = [p.name for p in issue.person_credits] meta = Metadata(title, authors) meta.series = issue.volume.name meta.series_index = str(issue.issue_number) meta.set_identifier("comicvine", str(issue.id)) meta.comments = issue.description meta.has_cover = False if issue.volume.publisher: meta.publisher = issue.volume.publisher.name meta.pubdate = issue.store_date or issue.cover_date return meta
def parse(self, xml_detail, xml_more_info):
    """Build a Metadata object from the detail page and the more-info page.

    Returns None (after logging) when title or authors are missing.
    """
    title = self.parse_title(xml_detail)
    authors = self.parse_authors(xml_detail)
    comments = self.parse_comments(xml_detail)
    rating = self.parse_rating(xml_detail)
    isbn = self.parse_isbn(xml_more_info)
    publisher = self.parse_publisher(xml_detail)
    tags = self.parse_tags(xml_detail, xml_more_info)
    serie, serie_index = self.parse_serie(xml_detail)
    pub_year = self.parse_pub_year(xml_detail, xml_more_info)
    cover = self.parse_cover(xml_detail)

    # Guard clause: a record needs at least a title and authors.
    if title is None or authors is None:
        self.log('Result skipped for because title or authors not found')
        return None

    mi = Metadata(as_unicode(title), authors)
    mi.languages = {'ces'}
    mi.comments = as_unicode(comments)
    mi.identifiers = {self.plugin.name: self.ident}
    mi.rating = rating
    mi.tags = tags
    mi.publisher = publisher
    mi.pubdate = pub_year
    mi.isbn = isbn
    mi.series = serie
    mi.series_index = serie_index
    mi.cover_url = cover
    if cover:
        self.plugin.cache_identifier_to_cover_url(self.ident, cover)
    return mi
def test(scale=0.25):
    """Show a grid previewing every color theme x cover style combination."""
    from PyQt5.Qt import QLabel, QApplication, QPixmap, QMainWindow, QWidget, QScrollArea, QGridLayout
    app = QApplication([])
    mi = Metadata('xxx', ['Kovid Goyal', 'John & Doe', 'Author'])
    mi.series = 'A series of styles'
    m = QMainWindow()
    sa = QScrollArea(m)
    w = QWidget(m)
    sa.setWidget(w)
    grid = QGridLayout(w)
    w.setLayout(grid)
    grid.setSpacing(30)
    labels = []
    for row, color in enumerate(sorted(default_color_themes)):
        for col, style in enumerate(sorted(all_styles())):
            mi.series_index = col + 1
            mi.title = 'An algorithmic cover [%s]' % color
            prefs = override_prefs(cprefs, override_color_theme=color, override_style=style)
            scale_cover(prefs, scale)
            img = generate_cover(mi, prefs=prefs, as_qimage=True)
            la = QLabel()
            la.setPixmap(QPixmap.fromImage(img))
            grid.addWidget(la, row, col)
            labels.append(la)
    m.setCentralWidget(sa)
    w.resize(w.sizeHint())
    m.show()
    app.exec_()
def test(scale=0.25):
    """Show a HiDPI-aware preview grid of every color theme x cover style."""
    from PyQt5.Qt import QLabel, QPixmap, QMainWindow, QWidget, QScrollArea, QGridLayout
    from calibre.gui2 import Application
    app = Application([])
    mi = Metadata('Unknown', ['Kovid Goyal', 'John & Doe', 'Author'])
    mi.series = 'A series & styles'
    m = QMainWindow()
    sa = QScrollArea(m)
    w = QWidget(m)
    sa.setWidget(w)
    grid = QGridLayout(w)
    w.setLayout(grid)
    grid.setSpacing(30)
    # Render at device-pixel resolution so covers stay sharp on HiDPI screens.
    scale *= w.devicePixelRatioF()
    labels = []
    for row, color in enumerate(sorted(default_color_themes)):
        for col, style in enumerate(sorted(all_styles())):
            mi.series_index = col + 1
            mi.title = 'An algorithmic cover [%s]' % color
            prefs = override_prefs(cprefs, override_color_theme=color, override_style=style)
            scale_cover(prefs, scale)
            img = generate_cover(mi, prefs=prefs, as_qimage=True)
            img.setDevicePixelRatio(w.devicePixelRatioF())
            la = QLabel()
            la.setPixmap(QPixmap.fromImage(img))
            grid.addWidget(la, row, col)
            labels.append(la)
    m.setCentralWidget(sa)
    w.resize(w.sizeHint())
    m.show()
    app.exec_()
def get_series(title, authors, timeout=60):
    """Look up series name/index for (title, authors) on the KDL server.

    Always returns a Metadata object; the series fields are only filled in
    when the lookup succeeds. Raises on network errors (with a friendlier
    message for server timeouts).
    """
    mi = Metadata(title, authors)
    if title and title[0] in _ignore_starts:
        title = title[1:]
    title = re.sub(r'^(A|The|An)\s+', '', title).strip()
    if not title:
        return mi
    if isinstance(title, unicode):
        title = title.encode('utf-8')
    title = urllib.quote_plus(title)
    author = authors[0].strip()
    if not author:
        return mi
    # Query with the surname only ("Last, First" or "First Last" forms).
    if ',' in author:
        author = author.split(',')[0]
    else:
        author = author.split()[-1]
    url = URL.format(author, title)
    br = browser()
    try:
        raw = br.open_novisit(url, timeout=timeout).read()
    except URLError as e:
        if isinstance(e.reason, socket.timeout):
            raise Exception('KDL Server busy, try again later')
        raise
    if 'see the full results' not in raw:
        return mi
    raw = xml_to_unicode(raw)[0]
    soup = BeautifulSoup(raw)
    searcharea = soup.find('div', attrs={'class':'searcharea'})
    if searcharea is None:
        return mi
    ss = searcharea.find('div', attrs={'class':'seriessearch'})
    if ss is None:
        return mi
    a = ss.find('a', href=True)
    if a is None:
        return mi
    href = a['href'].partition('?')[-1]
    data = urlparse.parse_qs(href)
    series = data.get('SeriesName', [])
    if not series:
        return mi
    series = series[0]
    series = re.sub(r' series$', '', series).strip()
    if series:
        mi.series = series
    ns = ss.nextSibling
    # FIX: guard against ss being the last sibling -- ns can be None, and
    # ns.contents would raise AttributeError.
    if ns is not None and ns.contents:
        raw = unicode(ns.contents[0])
        raw = raw.partition('.')[0].strip()
        try:
            mi.series_index = int(raw)
        except (TypeError, ValueError):
            # Best effort: non-numeric text just leaves series_index unset.
            # (Was a bare except, which also swallowed KeyboardInterrupt.)
            pass
    return mi
def set_mi(self, mi, fm):
    '''
    This sets the metadata for the test result books table. It doesn't reset
    the contents of the field selectors for editing rules.

    mi may be a single Metadata object, a list of them, or None (in which
    case a fully populated sample Metadata is synthesized). fm is the field
    metadata used to validate custom column names, or None.
    '''
    self.fm = fm
    if mi:
        # Normalize a single Metadata object into a 1-tuple.
        if not isinstance(mi, list):
            mi = (mi, )
    else:
        # No metadata supplied: build a representative sample book.
        mi = Metadata(_('Title'), [_('Author')])
        mi.author_sort = _('Author Sort')
        mi.series = ngettext('Series', 'Series', 1)
        mi.series_index = 3
        mi.rating = 4.0
        mi.tags = [_('Tag 1'), _('Tag 2')]
        mi.languages = ['eng']
        mi.id = 1
        if self.fm is not None:
            mi.set_all_user_metadata(self.fm.custom_field_metadata())
        else:
            # No field metadata. Grab a copy from the current library so
            # that we can validate any custom column names. The values for
            # the columns will all be empty, which in some very unusual
            # cases might cause formatter errors. We can live with that.
            from calibre.gui2.ui import get_gui
            mi.set_all_user_metadata(get_gui(
                ).current_db.new_api.field_metadata.custom_field_metadata())
        for col in mi.get_all_user_metadata(False):
            mi.set(col, (col, ), 0)
        mi = (mi, )
    self.mi = mi
    tv = self.template_value
    tv.setColumnCount(2)
    tv.setHorizontalHeaderLabels((_('Book title'), _('Template value')))
    tv.horizontalHeader().setStretchLastSection(True)
    tv.horizontalHeader().sectionResized.connect(self.table_column_resized)
    tv.setRowCount(len(mi))
    # Set the height of the table
    h = tv.rowHeight(0) * min(len(mi), 5)
    h += 2 * tv.frameWidth() + tv.horizontalHeader().height()
    tv.setMinimumHeight(h)
    tv.setMaximumHeight(h)
    # Set the size of the title column
    if self.table_column_widths:
        tv.setColumnWidth(0, self.table_column_widths[0])
    else:
        tv.setColumnWidth(0, tv.fontMetrics().averageCharWidth() * 10)
    tv.setSelectionBehavior(QAbstractItemView.SelectionBehavior.SelectRows)
    tv.setRowCount(len(mi))
    # Use our own widget to get rid of elision. setTextElideMode() doesn't work
    for r in range(0, len(mi)):
        w = QLineEdit(tv)
        w.setReadOnly(True)
        tv.setCellWidget(r, 0, w)
        w = QLineEdit(tv)
        w.setReadOnly(True)
        tv.setCellWidget(r, 1, w)
    self.display_values('')
def get_series(title, authors, timeout=60):
    """Look up series name/index for (title, authors) on the KDL server.

    Always returns a Metadata object; the series fields are only filled in
    when the lookup succeeds. Raises on network errors (with a friendlier
    message for server timeouts).
    """
    mi = Metadata(title, authors)
    if title and title[0] in _ignore_starts:
        title = title[1:]
    title = re.sub(r'^(A|The|An)\s+', '', title).strip()
    if not title:
        return mi
    if isinstance(title, unicode):
        title = title.encode('utf-8')
    title = urllib.quote_plus(title)
    author = authors[0].strip()
    if not author:
        return mi
    # Query with the surname only ("Last, First" or "First Last" forms).
    if ',' in author:
        author = author.split(',')[0]
    else:
        author = author.split()[-1]
    url = URL.format(author, title)
    br = browser()
    try:
        raw = br.open_novisit(url, timeout=timeout).read()
    except URLError as e:
        if isinstance(e.reason, socket.timeout):
            raise Exception('KDL Server busy, try again later')
        raise
    if 'see the full results' not in raw:
        return mi
    raw = xml_to_unicode(raw)[0]
    soup = BeautifulSoup(raw)
    searcharea = soup.find('div', attrs={'class': 'searcharea'})
    if searcharea is None:
        return mi
    ss = searcharea.find('div', attrs={'class': 'seriessearch'})
    if ss is None:
        return mi
    a = ss.find('a', href=True)
    if a is None:
        return mi
    href = a['href'].partition('?')[-1]
    data = urlparse.parse_qs(href)
    series = data.get('SeriesName', [])
    if not series:
        return mi
    series = series[0]
    series = re.sub(r' series$', '', series).strip()
    if series:
        mi.series = series
    ns = ss.nextSibling
    # FIX: guard against ss being the last sibling -- ns can be None, and
    # ns.contents would raise AttributeError.
    if ns is not None and ns.contents:
        raw = unicode(ns.contents[0])
        raw = raw.partition('.')[0].strip()
        try:
            mi.series_index = int(raw)
        except (TypeError, ValueError):
            # Best effort: non-numeric text just leaves series_index unset.
            # (Was a bare except, which also swallowed KeyboardInterrupt.)
            pass
    return mi
def data2mi(self, item): """Converts a single metadata answer in the form of a dict to a MetadataInformation object""" mi = Metadata(_('Unknown')) # Regular metadata mi.title = item.get('title', None) mi.authors = item.get('authors', []) mi.publisher = item.get('publisher', None) if 'id' in item.keys(): mi.set_identifier(self.idkey, item['id']) if 'doi' in item.keys(): mi.set_identifier('doi', item['doi']) if 'isbn' in item.keys(): mi.set_identifier('isbn', item['isbn']) if 'updated' in item.keys(): mi.pubdate = parse_date(item['updated'], assume_utc=True) if 'series' in item.keys(): mi.series = item['series'] mi.series_index = self.format_series_index( item.get('series_index'), None) if 'year' in item.keys(): mi.pubdate = parse_date(item['year'], assume_utc=True) if 'abstract' in item.keys(): mi.comments = self.format_abstract(item['abstract']) if 'language' in item.keys(): mi.language = item['language'] if 'journal' in item.keys(): mi.series = item['journal'] mi.series_index = self.format_series_index(item.get('volume'), item.get('number')) if 'subject' in item.keys(): tags = set([]) for s in item['subject']: tags.update(msc_tags(s)) tags.update(arxiv_tags(s)) mi.tags = list(sorted(tags)) return mi
def create_cover(title, authors, series=None, series_index=1, prefs=None, as_qimage=False):
    """Create a cover from the specified title, author and series.

    Any user set templates are ignored, to ensure that the specified
    metadata is used.
    """
    mi = Metadata(title, authors)
    if series:
        mi.series = series
        mi.series_index = series_index
    defaults = cprefs.defaults
    # Force the default templates so user customizations cannot leak in.
    prefs = override_prefs(
        prefs or cprefs,
        title_template=defaults['title_template'],
        subtitle_template=defaults['subtitle_template'],
        footer_template=defaults['footer_template'])
    return generate_cover(mi, prefs=prefs, as_qimage=as_qimage)
def create_cover(title, authors, series=None, series_index=1, prefs=None, as_qimage=False):
    """Create a cover from the specified title, author and series.

    Any user set templates are ignored, to ensure that the specified
    metadata is used.
    """
    mi = Metadata(title, authors)
    if series:
        mi.series = series
        mi.series_index = series_index
    defaults = cprefs.defaults
    # Force the default templates so user customizations cannot leak in.
    prefs = override_prefs(
        prefs or cprefs,
        title_template=defaults['title_template'],
        subtitle_template=defaults['subtitle_template'],
        footer_template=defaults['footer_template'])
    return generate_cover(mi, prefs=prefs, as_qimage=as_qimage)
def default_mi(self):
    """Return a fully populated sample Metadata object for previews."""
    from calibre.ebooks.metadata.book.base import Metadata
    mi = Metadata(_('A sample book'), [_('Author One'), _('Author Two')])
    mi.series = _('A series of samples')
    mi.series_index = 4
    mi.rating = 4
    mi.tags = [_('Tag One'), _('Tag Two')]
    mi.publisher = _('Some publisher')
    mi.identifiers = {'isbn': '123456789', 'url': 'http://calibre-ebook.com'}
    mi.languages = ['eng', 'fra']
    # Use the current moment for both dates so previews look fresh.
    mi.pubdate = mi.timestamp = now()
    return mi
def default_mi(self):
    """Return a fully populated sample Metadata object for previews."""
    from calibre.ebooks.metadata.book.base import Metadata
    mi = Metadata(_('A sample book'), [_('Author One'), _('Author Two')])
    mi.series = _('A series of samples')
    mi.series_index = 4
    mi.rating = 4
    mi.tags = [_('Tag One'), _('Tag Two')]
    mi.publisher = _('Some publisher')
    mi.identifiers = {'isbn': '123456789', 'url': 'https://calibre-ebook.com'}
    mi.languages = ['eng', 'fra']
    # Use the current moment for both dates so previews look fresh.
    mi.pubdate = mi.timestamp = now()
    return mi
def read_metadata(root, ver=None, return_extra_data=False):
    """Read all metadata from a parsed OPF tree into a Metadata object.

    When return_extra_data is True, returns a 4-tuple of
    (metadata, ver, raster cover item, first spine item) instead.
    """
    ans = Metadata(_('Unknown'), [_('Unknown')])
    prefixes, refines = read_prefixes(root), read_refines(root)
    identifiers = read_identifiers(root, prefixes, refines)
    ids = {}
    for key, vals in iteritems(identifiers):
        # calibre/uuid identifiers map to dedicated attributes, the rest
        # are collected as ordinary identifiers.
        if key == 'calibre':
            ans.application_id = vals[0]
        elif key == 'uuid':
            ans.uuid = vals[0]
        else:
            ids[key] = vals[0]
    ans.set_identifiers(ids)
    # Each field keeps its default when the OPF has no value for it.
    ans.title = read_title(root, prefixes, refines) or ans.title
    ans.title_sort = read_title_sort(root, prefixes, refines) or ans.title_sort
    ans.languages = read_languages(root, prefixes, refines) or ans.languages
    auts, aus = [], []
    for a in read_authors(root, prefixes, refines):
        auts.append(a.name), aus.append(a.sort)
    ans.authors = auts or ans.authors
    ans.author_sort = authors_to_string(aus) or ans.author_sort
    bkp = read_book_producers(root, prefixes, refines)
    if bkp:
        if bkp[0]:
            ans.book_producer = bkp[0]
    # Dates are only set when actually defined in the OPF.
    pd = read_pubdate(root, prefixes, refines)
    if not is_date_undefined(pd):
        ans.pubdate = pd
    ts = read_timestamp(root, prefixes, refines)
    if not is_date_undefined(ts):
        ans.timestamp = ts
    lm = read_last_modified(root, prefixes, refines)
    if not is_date_undefined(lm):
        ans.last_modified = lm
    ans.comments = read_comments(root, prefixes, refines) or ans.comments
    ans.publisher = read_publisher(root, prefixes, refines) or ans.publisher
    ans.tags = read_tags(root, prefixes, refines) or ans.tags
    ans.rating = read_rating(root, prefixes, refines) or ans.rating
    s, si = read_series(root, prefixes, refines)
    if s:
        ans.series, ans.series_index = s, si
    ans.author_link_map = read_author_link_map(root, prefixes, refines) or ans.author_link_map
    ans.user_categories = read_user_categories(root, prefixes, refines) or ans.user_categories
    for name, fm in iteritems((read_user_metadata(root, prefixes, refines) or {})):
        ans.set_user_metadata(name, fm)
    if return_extra_data:
        ans = ans, ver, read_raster_cover(root, prefixes, refines), first_spine_item(
            root, prefixes, refines)
    return ans
def data2mi(self, item): """Converts a single metadata answer in the form of a dict to a MetadataInformation object""" mi = Metadata(_('Unknown')) # Regular metadata mi.title = item.get('title', None) mi.authors = item.get('authors', []) mi.publisher = item.get('publisher', None) if 'id' in item.keys(): mi.set_identifier(self.idkey, item['id']) if 'doi' in item.keys(): mi.set_identifier('doi', item['doi']) if 'isbn' in item.keys(): mi.set_identifier('isbn', item['isbn']) if 'updated' in item.keys(): mi.pubdate = parse_date(item['updated'], assume_utc=True) if 'series' in item.keys(): mi.series = item['series'] mi.series_index = self.format_series_index(item.get('series_index'), None) if 'year' in item.keys(): mi.pubdate = parse_date(item['year'], assume_utc=True) if 'abstract' in item.keys(): mi.comments = self.format_abstract(item['abstract']) if 'language' in item.keys(): mi.language = item['language'] if 'journal' in item.keys(): mi.series = item['journal'] mi.series_index = self.format_series_index(item.get('volume'), item.get('number')) if 'subject' in item.keys(): tags = set([]) for s in item['subject']: tags.update(msc_tags(s)) tags.update(arxiv_tags(s)) mi.tags = list(sorted(tags)) return mi
def parse(self, xml_detail):
    """Walk the MARC-like record table and assemble a Metadata object.

    Returns None (after logging) unless a title, at least one author and a
    system identifier were all found.
    """
    sys_ident = title = isbn = publisher = pub_year = serie = serie_index = cover = None
    authors = []
    tags = []
    for row in self.XPath('//table[@id="record"]//tr')(xml_detail):
        ch = row.getchildren()
        txt = ch[0].text.strip()
        data = self.normalize(ch[1].text)
        # The MARC field number in the first cell selects the parser.
        if txt.startswith('245') and title is None:
            title = self.parse_title(data)
        if txt.startswith('246'):
            title = self.parse_title(data)
        elif txt.startswith('100') or txt.startswith('700'):
            res = self.parse_author(data)
            if res is not None:
                authors.append(res)
        elif txt == 'SYS':
            sys_ident = data.strip()
        elif txt == '020':
            isbn = self.parse_isbn(data)
        elif txt == '260':
            publisher, pub_year = self.parse_publisher(data)
        elif txt.startswith('490') and serie is None:
            serie, serie_index = self.parse_serie(data)
        elif txt == '655 7':
            tags.append(self.parse_tags(data))

    if isbn is not None and isbn != '':
        cover = self.parse_cover(isbn)

    # Guard clause: all three of title, authors and sys_ident are required.
    if title is None or not authors or sys_ident is None:
        self.log('Data not found')
        return None

    mi = Metadata(title, authors)
    mi.languages = {'ces'}
    mi.identifiers = {self.plugin.name: sys_ident}
    mi.tags = tags
    mi.publisher = publisher
    mi.pubdate = pub_year
    mi.isbn = isbn
    mi.series = serie
    mi.series_index = serie_index
    mi.cover_url = cover
    if cover:
        self.plugin.cache_identifier_to_cover_url(sys_ident, cover)
    return mi
def test_input_meta_single(self):
    """Single-valued <meta> tags: parsed stream must match the canonical record."""
    stream_meta = get_metadata(self.get_stream('meta_single'))
    canon_meta = Metadata('A Meta Tag & Title Ⓒ', ['George Washington'])
    canon_meta.publisher = 'Publisher A'
    canon_meta.languages = ['English']
    canon_meta.pubdate = parse_date('2019-01-01')
    canon_meta.timestamp = parse_date('2018-01-01')
    canon_meta.series = 'Meta Series'
    canon_meta.series_index = float(1)
    # canon_meta.rating = float(0)
    # canon_meta.comments = ''
    canon_meta.tags = ['tag a', 'tag b']
    canon_meta.set_identifiers({'isbn': '1234567890'})
    self.compare_metadata(stream_meta, canon_meta)
def read_metadata(root, ver=None, return_extra_data=False):
    """Read all metadata from a parsed OPF tree into a Metadata object.

    When return_extra_data is True, returns a 4-tuple of
    (metadata, ver, raster cover item, first spine item) instead.
    """
    ans = Metadata(_('Unknown'), [_('Unknown')])
    prefixes, refines = read_prefixes(root), read_refines(root)
    identifiers = read_identifiers(root, prefixes, refines)
    ids = {}
    for key, vals in identifiers.iteritems():
        # calibre/uuid identifiers map to dedicated attributes, the rest
        # are collected as ordinary identifiers.
        if key == 'calibre':
            ans.application_id = vals[0]
        elif key == 'uuid':
            ans.uuid = vals[0]
        else:
            ids[key] = vals[0]
    ans.set_identifiers(ids)
    # Each field keeps its default when the OPF has no value for it.
    ans.title = read_title(root, prefixes, refines) or ans.title
    ans.title_sort = read_title_sort(root, prefixes, refines) or ans.title_sort
    ans.languages = read_languages(root, prefixes, refines) or ans.languages
    auts, aus = [], []
    for a in read_authors(root, prefixes, refines):
        auts.append(a.name), aus.append(a.sort)
    ans.authors = auts or ans.authors
    ans.author_sort = authors_to_string(aus) or ans.author_sort
    bkp = read_book_producers(root, prefixes, refines)
    if bkp:
        if bkp[0]:
            ans.book_producer = bkp[0]
    # Dates are only set when actually defined in the OPF.
    pd = read_pubdate(root, prefixes, refines)
    if not is_date_undefined(pd):
        ans.pubdate = pd
    ts = read_timestamp(root, prefixes, refines)
    if not is_date_undefined(ts):
        ans.timestamp = ts
    lm = read_last_modified(root, prefixes, refines)
    if not is_date_undefined(lm):
        ans.last_modified = lm
    ans.comments = read_comments(root, prefixes, refines) or ans.comments
    ans.publisher = read_publisher(root, prefixes, refines) or ans.publisher
    ans.tags = read_tags(root, prefixes, refines) or ans.tags
    ans.rating = read_rating(root, prefixes, refines) or ans.rating
    s, si = read_series(root, prefixes, refines)
    if s:
        ans.series, ans.series_index = s, si
    ans.author_link_map = read_author_link_map(root, prefixes, refines) or ans.author_link_map
    ans.user_categories = read_user_categories(root, prefixes, refines) or ans.user_categories
    for name, fm in (read_user_metadata(root, prefixes, refines) or {}).iteritems():
        ans.set_user_metadata(name, fm)
    if return_extra_data:
        ans = ans, ver, read_raster_cover(root, prefixes, refines), first_spine_item(root, prefixes, refines)
    return ans
def build_meta(log, issue_id): """Build metadata record based on comicvine issue_id.""" issue = PyComicvineWrapper(log).lookup_issue(issue_id) if issue: meta = Metadata(issue.get_full_title(), issue.get_authors()) meta.series = issue.volume_name meta.series_index = issue.issue_number meta.set_identifier('comicvine', str(issue.id)) meta.set_identifier('comicvine-volume', str(issue.volume_id)) meta.comments = issue.description meta.has_cover = False meta.publisher = issue.publisher_name meta.pubdate = issue.date return meta else: return None
def test_input_comment_single(self):
    """Single comment-style metadata block: parsed stream must match the canon."""
    stream_meta = get_metadata(self.get_stream('comment_single'))
    canon_meta = Metadata('A Comment Tag & Title Ⓒ',
                          ['James Madison', 'James Monroe'])
    canon_meta.publisher = 'Publisher C'
    canon_meta.languages = ['French']
    canon_meta.pubdate = parse_date('2015-01-01')
    canon_meta.timestamp = parse_date('2014-01-01')
    canon_meta.series = 'Comment Series'
    canon_meta.series_index = float(3)
    canon_meta.rating = float(0)
    canon_meta.comments = 'comment "comments" ♥ HTML -- too &'
    canon_meta.tags = ['tag d']
    canon_meta.set_identifiers({
        'isbn': '3456789012',
        'url': 'http://google.com/search?q=calibre'
    })
    self.compare_metadata(stream_meta, canon_meta)
def test_input_meta_multi(self):
    """Multi-valued <meta> tags: parsed stream must match the canonical record."""
    stream_meta = get_metadata(self.get_stream('meta_multi'))
    canon_meta = Metadata(
        'A Meta Tag & Title Ⓒ',
        ['George Washington', 'John Adams', 'Thomas Jefferson'])
    canon_meta.publisher = 'Publisher A'
    canon_meta.languages = ['English', 'Spanish']
    canon_meta.pubdate = parse_date('2019-01-01')
    canon_meta.timestamp = parse_date('2018-01-01')
    canon_meta.series = 'Meta Series'
    canon_meta.series_index = float(1)
    canon_meta.rating = float(8)
    canon_meta.comments = 'meta "comments" ♥ HTML &'
    canon_meta.tags = ['tag a', 'tag b', 'tag c']
    canon_meta.set_identifiers({
        'isbn': '1234567890',
        'url': 'http://google.com/search?q=calibre'
    })
    self.compare_metadata(stream_meta, canon_meta)
def parse(self, xml_detail):
    """Build a Metadata object (title, authors, comments, rating, tags,
    series) from the detail XML, or return None if title/authors missing."""
    title = self.parse_title(xml_detail)
    authors = self.parse_authors(xml_detail)
    comments = self.parse_comments(xml_detail)
    rating = self.parse_rating(xml_detail)
    tags = self.parse_tags(xml_detail)
    serie, serie_index = self.parse_serie(xml_detail)

    # Guard clause: a record needs at least a title and authors.
    if title is None or authors is None:
        return None

    mi = Metadata(title, authors)
    mi.languages = {'ces'}
    mi.comments = as_unicode(comments)
    mi.identifiers = {self.plugin.name: self.ident}
    mi.rating = rating
    mi.tags = tags
    mi.series = serie
    mi.series_index = serie_index
    return mi
def build_meta(log, issue_id):
    '''Build metadata record based on comicvine issue_id'''
    fields = ['id', 'name', 'volume', 'issue_number', 'person_credits',
              'description', 'store_date', 'cover_date']
    issue = pycomicvine.Issue(issue_id, field_list=fields)
    if not issue or not issue.volume:
        log.warn('Unable to load Issue(%d)' % issue_id)
        return None
    title = '%s #%s' % (issue.volume.name, issue.issue_number)
    if issue.name:
        title += ': %s' % (issue.name)
    meta = Metadata(title, [p.name for p in issue.person_credits])
    meta.series = issue.volume.name
    meta.series_index = str(issue.issue_number)
    meta.set_identifier('comicvine', str(issue.id))
    meta.set_identifier('comicvine-volume', str(issue.volume.id))
    meta.comments = issue.description
    meta.has_cover = False
    if issue.volume.publisher:
        meta.publisher = issue.volume.publisher.name
    # Prefer the store date; fall back to the cover date.
    meta.pubdate = issue.store_date or issue.cover_date
    return meta
def _metadata(self, book):
    """Convert a douban book record (dict) into a calibre Metadata object,
    optionally downloading the cover image eagerly."""
    authors = []
    if book['author']:
        for author in book['author']:
            # Strip unwanted decorations from each author name.
            for pattern in REMOVES:
                author = pattern.sub("", author)
            authors.append(author)
    if not authors:
        authors = [u'佚名']

    from calibre.ebooks.metadata.book.base import Metadata
    mi = Metadata(book['title'])
    mi.authors = authors
    mi.author_sort = mi.authors[0]
    mi.publisher = book['publisher']
    mi.comments = book['summary']
    mi.isbn = book.get('isbn13', None)
    mi.series = book.get('serials', None)
    mi.tags = [t['name'] for t in book['tags']][:8]
    mi.rating = int(float(book['rating']['average']))
    mi.pubdate = self.str2date(book['pubdate'])
    mi.timestamp = datetime.datetime.now()
    mi.douban_author_intro = book['author_intro']
    mi.douban_subtitle = book.get('subtitle', None)
    mi.website = "https://book.douban.com/isbn/%s" % mi.isbn
    mi.source = u'豆瓣'
    mi.provider_key = KEY
    mi.provider_value = book['id']
    mi.cover_url = book['images']['large']
    if self.copy_image:
        # Download the cover bytes now so they can be embedded later.
        img = io.BytesIO(
            urlopen(Request(mi.cover_url, headers=CHROME_HEADERS)).read())
        img_fmt = mi.cover_url.split(".")[-1]
        mi.cover_data = (img_fmt, img)
    logging.debug("=================\ndouban metadata:\n%s" % mi)
    return mi
def __init__(self, parent, text, mi=None, fm=None, color_field=None,
             icon_field_key=None, icon_rule_kind=None, doing_emblem=False,
             text_is_placeholder=False, dialog_is_st_editor=False,
             global_vars=None, all_functions=None, builtin_functions=None):
    """Template editor dialog.

    mi may be a Metadata object, a list of them, or None (a sample book is
    synthesized). fm is field metadata for custom column validation. Exactly
    one of color_field / icon_field_key / doing_emblem selects the mode.
    """
    QDialog.__init__(self, parent)
    Ui_TemplateDialog.__init__(self)
    self.setupUi(self)

    self.coloring = color_field is not None
    self.iconing = icon_field_key is not None
    self.embleming = doing_emblem
    self.dialog_is_st_editor = dialog_is_st_editor
    if global_vars is None:
        self.global_vars = {}
    else:
        self.global_vars = global_vars

    # Build the (name, key) list of displayable columns for the pickers.
    cols = []
    if fm is not None:
        for key in sorted(displayable_columns(fm),
                          key=lambda k: sort_key(fm[k]['name'] if k != color_row_key else 0)):
            if key == color_row_key and not self.coloring:
                continue
            from calibre.gui2.preferences.coloring import all_columns_string
            name = all_columns_string if key == color_row_key else fm[key]['name']
            if name:
                cols.append((name, key))

    self.color_layout.setVisible(False)
    self.icon_layout.setVisible(False)

    if self.coloring:
        self.color_layout.setVisible(True)
        for n1, k1 in cols:
            self.colored_field.addItem(n1 + (' (' + k1 + ')' if k1 != color_row_key else ''), k1)
        self.colored_field.setCurrentIndex(self.colored_field.findData(color_field))
    elif self.iconing or self.embleming:
        self.icon_layout.setVisible(True)
        if self.embleming:
            self.icon_kind_label.setVisible(False)
            self.icon_kind.setVisible(False)
            self.icon_chooser_label.setVisible(False)
            self.icon_field.setVisible(False)
        for n1, k1 in cols:
            self.icon_field.addItem('{} ({})'.format(n1, k1), k1)
        self.icon_file_names = []
        d = os.path.join(config_dir, 'cc_icons')
        if os.path.exists(d):
            for icon_file in os.listdir(d):
                icon_file = icu_lower(icon_file)
                if os.path.exists(os.path.join(d, icon_file)):
                    if icon_file.endswith('.png'):
                        self.icon_file_names.append(icon_file)
        self.icon_file_names.sort(key=sort_key)
        self.update_filename_box()

    if self.iconing:
        dex = 0
        from calibre.gui2.preferences.coloring import icon_rule_kinds
        for i, tup in enumerate(icon_rule_kinds):
            txt, val = tup
            self.icon_kind.addItem(txt, userData=(val))
            if val == icon_rule_kind:
                dex = i
        self.icon_kind.setCurrentIndex(dex)
        self.icon_field.setCurrentIndex(self.icon_field.findData(icon_field_key))

    if dialog_is_st_editor:
        self.buttonBox.setVisible(False)
    else:
        self.new_doc_label.setVisible(False)
        self.new_doc.setVisible(False)
        self.template_name_label.setVisible(False)
        self.template_name.setVisible(False)

    if mi:
        if not isinstance(mi, list):
            mi = (mi, )
    else:
        mi = Metadata(_('Title'), [_('Author')])
        mi.author_sort = _('Author Sort')
        mi.series = ngettext('Series', 'Series', 1)
        mi.series_index = 3
        mi.rating = 4.0
        mi.tags = [_('Tag 1'), _('Tag 2')]
        mi.languages = ['eng']
        mi.id = 1
        if fm is not None:
            # FIX: was self.mi.set_all_user_metadata(...), but self.mi is not
            # assigned until later in __init__, so that raised AttributeError.
            mi.set_all_user_metadata(fm.custom_field_metadata())
        else:
            # No field metadata. Grab a copy from the current library so
            # that we can validate any custom column names. The values for
            # the columns will all be empty, which in some very unusual
            # cases might cause formatter errors. We can live with that.
            from calibre.gui2.ui import get_gui
            mi.set_all_user_metadata(
                get_gui().current_db.new_api.field_metadata.custom_field_metadata())
        for col in mi.get_all_user_metadata(False):
            mi.set(col, (col,), 0)
        mi = (mi, )
    self.mi = mi

    # Set up the display table
    self.table_column_widths = None
    try:
        self.table_column_widths = \
            gprefs.get('template_editor_table_widths', None)
    except:
        pass
    tv = self.template_value
    tv.setRowCount(len(mi))
    tv.setColumnCount(2)
    tv.setHorizontalHeaderLabels((_('Book title'), _('Template value')))
    tv.horizontalHeader().setStretchLastSection(True)
    tv.horizontalHeader().sectionResized.connect(self.table_column_resized)
    # Set the height of the table
    h = tv.rowHeight(0) * min(len(mi), 5)
    h += 2 * tv.frameWidth() + tv.horizontalHeader().height()
    tv.setMinimumHeight(h)
    tv.setMaximumHeight(h)
    # Set the size of the title column
    if self.table_column_widths:
        tv.setColumnWidth(0, self.table_column_widths[0])
    else:
        tv.setColumnWidth(0, tv.fontMetrics().averageCharWidth() * 10)
    # Use our own widget to get rid of elision. setTextElideMode() doesn't work
    for r in range(0, len(mi)):
        w = QLineEdit(tv)
        w.setReadOnly(True)
        tv.setCellWidget(r, 0, w)
        w = QLineEdit(tv)
        w.setReadOnly(True)
        tv.setCellWidget(r, 1, w)

    # Remove help icon on title bar
    icon = self.windowIcon()
    self.setWindowFlags(self.windowFlags()&(~Qt.WindowType.WindowContextHelpButtonHint))
    self.setWindowIcon(icon)

    self.all_functions = all_functions if all_functions else formatter_functions().get_functions()
    self.builtins = (builtin_functions if builtin_functions else
                     formatter_functions().get_builtins_and_aliases())

    self.last_text = ''
    self.highlighter = TemplateHighlighter(self.textbox.document(), builtin_functions=self.builtins)
    self.textbox.cursorPositionChanged.connect(self.text_cursor_changed)
    self.textbox.textChanged.connect(self.textbox_changed)
    self.textbox.setTabStopWidth(10)
    self.source_code.setTabStopWidth(10)
    self.documentation.setReadOnly(True)
    self.source_code.setReadOnly(True)

    if text is not None:
        if text_is_placeholder:
            self.textbox.setPlaceholderText(text)
            self.textbox.clear()
            text = ''
        else:
            self.textbox.setPlainText(text)
    else:
        text = ''
    self.buttonBox.button(QDialogButtonBox.StandardButton.Ok).setText(_('&OK'))
    self.buttonBox.button(QDialogButtonBox.StandardButton.Cancel).setText(_('&Cancel'))

    self.color_copy_button.clicked.connect(self.color_to_clipboard)
    self.filename_button.clicked.connect(self.filename_button_clicked)
    self.icon_copy_button.clicked.connect(self.icon_to_clipboard)

    try:
        with open(P('template-functions.json'), 'rb') as f:
            self.builtin_source_dict = json.load(f, encoding='utf-8')
    except:
        self.builtin_source_dict = {}

    func_names = sorted(self.all_functions)
    self.function.clear()
    self.function.addItem('')
    for f in func_names:
        self.function.addItem('{} -- {}'.format(f,
                              self.function_type_string(f, longform=False)), f)
    self.function.setCurrentIndex(0)
    self.function.currentIndexChanged.connect(self.function_changed)
    self.display_values(text)
    self.rule = (None, '')

    tt = _('Template language tutorial')
    self.template_tutorial.setText(
        '<a href="%s">%s</a>' % (
            localize_user_manual_link('https://manual.calibre-ebook.com/template_lang.html'), tt))
    tt = _('Template function reference')
    self.template_func_reference.setText(
        '<a href="%s">%s</a>' % (
            localize_user_manual_link('https://manual.calibre-ebook.com/generated/en/template_ref.html'), tt))

    self.set_up_font_boxes()
    self.textbox.setFocus()
    # Now geometry
    try:
        geom = gprefs.get('template_editor_dialog_geometry', None)
        if geom is not None:
            QApplication.instance().safe_restore_geometry(self, QByteArray(geom))
    except Exception:
        pass
#!/usr/bin/env python
def metadata_from_xmp_packet(raw_bytes):
    """Parse an Adobe XMP metadata packet and return a calibre Metadata
    object populated with whatever fields the packet contains.

    Parsing is best-effort: individually malformed fields (dates, JSON
    blobs) are silently skipped rather than aborting the whole read.
    """
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata(_('Unknown'))
    title = first_alt('//dc:title', root)
    if title:
        mi.title = title
    authors = multiple_sequences('//dc:creator', root)
    if authors:
        mi.authors = authors
    tags = multiple_sequences('//dc:subject', root) or multiple_sequences(
        '//pdf:Keywords', root)
    if tags:
        mi.tags = tags
    comments = first_alt('//dc:description', root)
    if comments:
        mi.comments = comments
    publishers = multiple_sequences('//dc:publisher', root)
    if publishers:
        mi.publisher = publishers[0]
    try:
        pubdate = parse_date(first_sequence('//dc:date', root) or
                             first_simple('//xmp:CreateDate', root), assume_utc=False)
    except:
        pass
    else:
        mi.pubdate = pubdate
    bkp = first_simple('//xmp:CreatorTool', root)
    if bkp:
        mi.book_producer = bkp
    md = first_simple('//xmp:MetadataDate', root)
    if md:
        try:
            mi.metadata_date = parse_date(md)
        except:
            pass
    rating = first_simple('//calibre:rating', root)
    if rating is not None:
        try:
            rating = float(rating)
            if 0 <= rating <= 10:
                mi.rating = rating
        except (ValueError, TypeError):
            pass
    series, series_index = read_series(root)
    if series:
        mi.series, mi.series_index = series, series_index
    for x in ('title_sort', 'author_sort'):
        for elem in XPath('//calibre:' + x)(root):
            val = read_simple_property(elem)
            if val:
                setattr(mi, x, val)
                break
    for x in ('author_link_map', 'user_categories'):
        val = first_simple('//calibre:' + x, root)
        if val:
            try:
                setattr(mi, x, json.loads(val))
            except:
                pass
    languages = multiple_sequences('//dc:language', root)
    if languages:
        # list() so the truthiness test below works on Python 3, where
        # filter() returns a lazy iterator that is always truthy
        languages = list(filter(None, map(canonicalize_lang, languages)))
        if languages:
            mi.languages = languages
    identifiers = {}
    for xmpid in XPath('//xmp:Identifier')(root):
        for scheme, value in read_xmp_identifers(xmpid):
            if scheme and value:
                identifiers[scheme.lower()] = value
    for namespace in ('prism', 'pdfx'):
        for scheme in KNOWN_ID_SCHEMES:
            if scheme not in identifiers:
                val = first_simple('//%s:%s' % (namespace, scheme), root)
                scheme = scheme.lower()
                if scheme == 'isbn':
                    val = check_isbn(val)
                elif scheme == 'doi':
                    val = check_doi(val)
                if val:
                    identifiers[scheme] = val
    # Check Dublin Core for recognizable identifier types.
    # .items() instead of the Python 2-only .iteritems()
    for scheme, check_func in {'doi': check_doi, 'isbn': check_isbn}.items():
        if scheme not in identifiers:
            val = check_func(first_simple('//dc:identifier', root))
            if val:
                # store under the scheme being probed (was always 'doi',
                # which mislabelled ISBNs found in dc:identifier)
                identifiers[scheme] = val
    if identifiers:
        mi.set_identifiers(identifiers)
    read_user_metadata(mi, root)
    return mi
def metadata_from_xmp_packet(raw_bytes):
    """Parse an Adobe XMP metadata packet and return a calibre Metadata
    object.

    Raises ValueError for the corrupted packets generated by Nitro PDF;
    everything else is best-effort (bad individual fields are skipped).
    """
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata(_('Unknown'))
    title = first_alt('//dc:title', root)
    if title:
        # Only probe the title once we know it is not None (calling
        # startswith() on a missing title raised AttributeError before)
        if title.startswith(r'\376\377'):
            # corrupted XMP packet generated by Nitro PDF. See
            # https://bugs.launchpad.net/calibre/+bug/1541981
            raise ValueError('Corrupted XMP metadata packet detected, probably generated by Nitro PDF')
        mi.title = title
    authors = multiple_sequences('//dc:creator', root)
    if authors:
        mi.authors = authors
    tags = multiple_sequences('//dc:subject', root) or multiple_sequences('//pdf:Keywords', root)
    if tags:
        mi.tags = tags
    comments = first_alt('//dc:description', root)
    if comments:
        mi.comments = comments
    publishers = multiple_sequences('//dc:publisher', root)
    if publishers:
        mi.publisher = publishers[0]
    try:
        pubdate = parse_date(first_sequence('//dc:date', root) or
                             first_simple('//xmp:CreateDate', root), assume_utc=False)
    except:
        pass
    else:
        mi.pubdate = pubdate
    bkp = first_simple('//xmp:CreatorTool', root)
    if bkp:
        mi.book_producer = bkp
    # Use the more recent of MetadataDate and ModifyDate as the
    # metadata timestamp
    md = safe_parse_date(first_simple('//xmp:MetadataDate', root))
    mod = safe_parse_date(first_simple('//xmp:ModifyDate', root))
    fd = more_recent(md, mod)
    if fd is not None:
        mi.metadata_date = fd
    rating = first_simple('//calibre:rating', root)
    if rating is not None:
        try:
            rating = float(rating)
            if 0 <= rating <= 10:
                mi.rating = rating
        except (ValueError, TypeError):
            pass
    series, series_index = read_series(root)
    if series:
        mi.series, mi.series_index = series, series_index
    for x in ('title_sort', 'author_sort'):
        for elem in XPath('//calibre:' + x)(root):
            val = read_simple_property(elem)
            if val:
                setattr(mi, x, val)
                break
    for x in ('author_link_map', 'user_categories'):
        val = first_simple('//calibre:'+x, root)
        if val:
            try:
                setattr(mi, x, json.loads(val))
            except:
                pass
    languages = multiple_sequences('//dc:language', root)
    if languages:
        # list() so the truthiness test works on Python 3 (filter() is lazy)
        languages = list(filter(None, map(canonicalize_lang, languages)))
        if languages:
            mi.languages = languages
    identifiers = {}
    for xmpid in XPath('//xmp:Identifier')(root):
        for scheme, value in read_xmp_identifers(xmpid):
            if scheme and value:
                identifiers[scheme.lower()] = value
    for namespace in ('prism', 'pdfx'):
        for scheme in KNOWN_ID_SCHEMES:
            if scheme not in identifiers:
                val = first_simple('//%s:%s' % (namespace, scheme), root)
                scheme = scheme.lower()
                if scheme == 'isbn':
                    val = check_isbn(val)
                elif scheme == 'doi':
                    val = check_doi(val)
                if val:
                    identifiers[scheme] = val
    # Check Dublin Core for recognizable identifier types.
    # .items() instead of the Python 2-only .iteritems()
    for scheme, check_func in {'doi': check_doi, 'isbn': check_isbn}.items():
        if scheme not in identifiers:
            val = check_func(first_simple('//dc:identifier', root))
            if val:
                # store under the probed scheme (was always 'doi')
                identifiers[scheme] = val
    if identifiers:
        mi.set_identifiers(identifiers)
    read_user_metadata(mi, root)
    return mi
def metadata_from_xmp_packet(raw_bytes):
    """Parse an Adobe XMP metadata packet and return a calibre Metadata
    object. Best-effort: individually malformed fields are skipped.
    """
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata(_('Unknown'))
    title = first_alt('//dc:title', root)
    if title:
        mi.title = title
    authors = multiple_sequences('//dc:creator', root)
    if authors:
        mi.authors = authors
    tags = multiple_sequences('//dc:subject', root) or multiple_sequences('//pdf:Keywords', root)
    if tags:
        mi.tags = tags
    comments = first_alt('//dc:description', root)
    if comments:
        mi.comments = comments
    publishers = multiple_sequences('//dc:publisher', root)
    if publishers:
        mi.publisher = publishers[0]
    try:
        pubdate = parse_date(first_sequence('//dc:date', root) or
                             first_simple('//xmp:CreateDate', root), assume_utc=False)
    except:
        pass
    else:
        mi.pubdate = pubdate
    bkp = first_simple('//xmp:CreatorTool', root)
    if bkp:
        mi.book_producer = bkp
    rating = first_simple('//calibre:rating', root)
    if rating is not None:
        try:
            rating = float(rating)
            if 0 <= rating <= 10:
                mi.rating = rating
        except (ValueError, TypeError):
            pass
    series, series_index = read_series(root)
    if series:
        mi.series, mi.series_index = series, series_index
    for x in ('title_sort', 'author_sort'):
        for elem in XPath('//calibre:' + x)(root):
            val = read_simple_property(elem)
            if val:
                setattr(mi, x, val)
                break
    languages = multiple_sequences('//dc:language', root)
    if languages:
        # list() so the truthiness test works on Python 3 (filter() is lazy)
        languages = list(filter(None, map(canonicalize_lang, languages)))
        if languages:
            mi.languages = languages
    identifiers = {}
    for xmpid in XPath('//xmp:Identifier')(root):
        for scheme, value in read_xmp_identifers(xmpid):
            if scheme and value:
                identifiers[scheme.lower()] = value
    for namespace in ('prism', 'pdfx'):
        for scheme in KNOWN_ID_SCHEMES:
            if scheme not in identifiers:
                val = first_simple('//%s:%s' % (namespace, scheme), root)
                scheme = scheme.lower()
                if scheme == 'isbn':
                    val = check_isbn(val)
                elif scheme == 'doi':
                    val = check_doi(val)
                if val:
                    identifiers[scheme] = val
    # Check Dublin Core for recognizable identifier types.
    # .items() instead of the Python 2-only .iteritems()
    for scheme, check_func in {'doi': check_doi, 'isbn': check_isbn}.items():
        if scheme not in identifiers:
            val = check_func(first_simple('//dc:identifier', root))
            if val:
                # store under the probed scheme (was always 'doi')
                identifiers[scheme] = val
    if identifiers:
        mi.set_identifiers(identifiers)
    return mi
def parse_details(self, root): try: yes24_id = self.parse_yes24_id(self.url) except: self.log.exception('Error parsing YES24 id for url: %r'%self.url) yes24_id = None try: (title, series, series_index) = self.parse_title_series(root) except: self.log.exception('Error parsing title and series for url: %r'%self.url) title = series = series_index = None try: authors = self.parse_authors(root) except: self.log.exception('Error parsing authors for url: %r'%self.url) authors = [] if not title or not authors or not yes24_id: self.log.error('Could not find title/authors/YES24 id for %r'%self.url) self.log.error('YES24: %r Title: %r Authors: %r'%(yes24_id, title, authors)) return mi = Metadata(title, authors) if series: mi.series = series mi.series_index = series_index mi.set_identifier('yes24', yes24_id) self.yes24_id = yes24_id try: isbn = self.parse_isbn(root) if isbn: self.isbn = mi.isbn = isbn except: self.log.exception('Error parsing ISBN for url: %r'%self.url) try: mi.comments = self.parse_comments(root) except: self.log.exception('Error parsing comments for url: %r'%self.url) try: self.cover_url = self.parse_cover(root) except: self.log.exception('Error parsing cover for url: %r'%self.url) mi.has_cover = bool(self.cover_url) mi.cover_url = self.cover_url # This is purely so we can run a test for it!!! try: mi.publisher = self.parse_publisher(root) except: self.log.exception('Error parsing publisher for url: %r'%self.url) try: mi.pubdate = self.parse_published_date(root) except: self.log.exception('Error parsing published date for url: %r'%self.url) mi.language = 'ko' mi.source_relevance = self.relevance if self.yes24_id: if self.isbn: self.plugin.cache_isbn_to_identifier(self.isbn, self.yes24_id) self.plugin.clean_downloaded_metadata(mi) self.result_queue.put(mi)
def _get_metadata(self, book_id, get_user_categories=True):  # {{{
    # Assemble a full Metadata object for book_id straight from the
    # in-memory field cache. NOTE(review): statement order matters here —
    # format_metadata must be set before FormatsList is built, and custom
    # composite columns are evaluated only after all standard and
    # non-composite custom fields are on mi.
    mi = Metadata(None, template_cache=self.formatter_template_cache)
    author_ids = self._field_ids_for('authors', book_id)
    aut_list = [self._author_data(i) for i in author_ids]
    aum = []   # author display names, in order
    aus = {}   # author name -> sort value
    aul = {}   # author name -> link
    for rec in aut_list:
        aut = rec['name']
        aum.append(aut)
        aus[aut] = rec['sort']
        aul[aut] = rec['link']
    mi.title = self._field_for('title', book_id, default_value=_('Unknown'))
    mi.authors = aum
    mi.author_sort = self._field_for('author_sort', book_id, default_value=_('Unknown'))
    mi.author_sort_map = aus
    mi.author_link_map = aul
    mi.comments = self._field_for('comments', book_id)
    mi.publisher = self._field_for('publisher', book_id)
    n = nowf()  # single "now" used as the default for all date fields
    mi.timestamp = self._field_for('timestamp', book_id, default_value=n)
    mi.pubdate = self._field_for('pubdate', book_id, default_value=n)
    mi.uuid = self._field_for('uuid', book_id, default_value='dummy')
    mi.title_sort = self._field_for('sort', book_id, default_value=_('Unknown'))
    mi.book_size = self._field_for('size', book_id, default_value=0)
    mi.ondevice_col = self._field_for('ondevice', book_id, default_value='')
    mi.last_modified = self._field_for('last_modified', book_id, default_value=n)
    formats = self._field_for('formats', book_id)
    mi.format_metadata = {}
    mi.languages = list(self._field_for('languages', book_id))
    if not formats:
        good_formats = None
    else:
        mi.format_metadata = FormatMetadata(self, book_id, formats)
        good_formats = FormatsList(formats, mi.format_metadata)
    mi.formats = good_formats
    # has_cover is the translated 'Yes' string (not a bool) for display
    mi.has_cover = _('Yes') if self._field_for('cover', book_id, default_value=False) else ''
    mi.tags = list(self._field_for('tags', book_id, default_value=()))
    mi.series = self._field_for('series', book_id)
    if mi.series:
        mi.series_index = self._field_for('series_index', book_id, default_value=1.0)
    mi.rating = self._field_for('rating', book_id)
    mi.set_identifiers(self._field_for('identifiers', book_id, default_value={}))
    mi.application_id = book_id
    mi.id = book_id
    composites = []
    # First pass: attach custom-column metadata and plain custom values;
    # composites are deferred because they may reference other fields.
    for key, meta in self.field_metadata.custom_iteritems():
        mi.set_user_metadata(key, meta)
        if meta['datatype'] == 'composite':
            composites.append(key)
        else:
            val = self._field_for(key, book_id)
            if isinstance(val, tuple):
                val = list(val)
            extra = self._field_for(key + '_index', book_id)
            mi.set(key, val=val, extra=extra)
    # Second pass: evaluate composite columns against the populated mi
    for key in composites:
        mi.set(key, val=self._composite_for(key, book_id, mi))
    user_cat_vals = {}
    if get_user_categories:
        # Map each user category to the [name, column] pairs that match
        # this book's values
        user_cats = self.backend.prefs['user_categories']
        for ucat in user_cats:
            res = []
            for name, cat, ign in user_cats[ucat]:
                v = mi.get(cat, None)
                if isinstance(v, list):
                    if name in v:
                        res.append([name, cat])
                elif name == v:
                    res.append([name, cat])
            user_cat_vals[ucat] = res
    mi.user_categories = user_cat_vals
    return mi
def metadata_from_xmp_packet(raw_bytes):
    """Parse an Adobe XMP metadata packet and return a calibre Metadata
    object. Best-effort: individually malformed fields are skipped.
    """
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata(_("Unknown"))
    title = first_alt("//dc:title", root)
    if title:
        mi.title = title
    authors = multiple_sequences("//dc:creator", root)
    if authors:
        mi.authors = authors
    tags = multiple_sequences("//dc:subject", root) or multiple_sequences("//pdf:Keywords", root)
    if tags:
        mi.tags = tags
    comments = first_alt("//dc:description", root)
    if comments:
        mi.comments = comments
    publishers = multiple_sequences("//dc:publisher", root)
    if publishers:
        mi.publisher = publishers[0]
    try:
        pubdate = parse_date(
            first_sequence("//dc:date", root) or first_simple("//xmp:CreateDate", root),
            assume_utc=False
        )
    except:
        pass
    else:
        mi.pubdate = pubdate
    bkp = first_simple("//xmp:CreatorTool", root)
    if bkp:
        mi.book_producer = bkp
    md = first_simple("//xmp:MetadataDate", root)
    if md:
        try:
            mi.metadata_date = parse_date(md)
        except:
            pass
    rating = first_simple("//calibre:rating", root)
    if rating is not None:
        try:
            rating = float(rating)
            if 0 <= rating <= 10:
                mi.rating = rating
        except (ValueError, TypeError):
            pass
    series, series_index = read_series(root)
    if series:
        mi.series, mi.series_index = series, series_index
    for x in ("title_sort", "author_sort"):
        for elem in XPath("//calibre:" + x)(root):
            val = read_simple_property(elem)
            if val:
                setattr(mi, x, val)
                break
    for x in ("author_link_map", "user_categories"):
        val = first_simple("//calibre:" + x, root)
        if val:
            try:
                setattr(mi, x, json.loads(val))
            except:
                pass
    languages = multiple_sequences("//dc:language", root)
    if languages:
        # list() so the truthiness test works on Python 3 (filter() is lazy)
        languages = list(filter(None, map(canonicalize_lang, languages)))
        if languages:
            mi.languages = languages
    identifiers = {}
    for xmpid in XPath("//xmp:Identifier")(root):
        for scheme, value in read_xmp_identifers(xmpid):
            if scheme and value:
                identifiers[scheme.lower()] = value
    for namespace in ("prism", "pdfx"):
        for scheme in KNOWN_ID_SCHEMES:
            if scheme not in identifiers:
                val = first_simple("//%s:%s" % (namespace, scheme), root)
                scheme = scheme.lower()
                if scheme == "isbn":
                    val = check_isbn(val)
                elif scheme == "doi":
                    val = check_doi(val)
                if val:
                    identifiers[scheme] = val
    # Check Dublin Core for recognizable identifier types.
    # .items() instead of the Python 2-only .iteritems()
    for scheme, check_func in {"doi": check_doi, "isbn": check_isbn}.items():
        if scheme not in identifiers:
            val = check_func(first_simple("//dc:identifier", root))
            if val:
                # store under the probed scheme (was always "doi")
                identifiers[scheme] = val
    if identifiers:
        mi.set_identifiers(identifiers)
    read_user_metadata(mi, root)
    return mi
def load_details(self, url, timeout):
    """Fetch a ridibooks.com detail page, scrape metadata from the
    OpenGraph <meta> tags and the schema.org JSON-LD block, and put the
    resulting Metadata on the plugin's result queue."""

    def _format_item(text):
        # Strip surrounding double quotes and unescape HTML entities
        return re.sub('^"(.*)"$', '\\1', unescape(text))

    def _format_list(text):
        # Comma-separated string -> list of stripped items
        return [_.strip() for _ in _format_item(text).split(',')]

    def _find_meta(node, prop):
        # First <meta property=prop content=...> value
        return [_.get('content') for _ in node if _.get('property') == prop][0]

    def _format_date(date_text):
        # Dates are compact YYYYMMDD strings
        year = int(date_text[0:4])
        month = int(date_text[4:6])
        day = int(date_text[6:])
        return datetime.datetime(year, month, day, tzinfo=utc_tz)

    try:
        response = self.browser.open(url, timeout=timeout)
        root = lxml.html.fromstring(response.read())
        # Items loaded from <meta> tags:
        # book id, title, ISBN, cover image URL, rating
        meta = root.xpath('//meta[starts-with(@property, "og") or starts-with(@property, "books")]')
        # Items loaded from the schema.org JSON:
        # title, authors, description, publisher
        ld_json = root.xpath('//script[@type="application/ld+json"]/text()')
        ld = [json.loads(_) for _ in ld_json]
        book_info = [_ for _ in ld if _['@type'] == 'Book'][0]
    except Exception as e:
        # NOTE(review): execution deliberately continues after logging;
        # if the fetch itself failed, the lookups below will raise.
        self.log.exception(e)

    ridibooks_id = re.search('id=([0-9]+)', url).group(1)
    isbn = _find_meta(meta, 'books:isbn')
    cover_url = _find_meta(meta, 'og:image')
    title = _find_meta(meta, 'og:title')
    authors = _format_list(book_info['author']['name'])
    # 'in' test instead of the Python 2-only dict.has_key()
    if 'translator' in book_info:
        authors.extend([_ + u'(역자)' for _ in _format_list(book_info['translator']['name'])])

    mi = Metadata(title, authors)
    mi.set_identifier('ridibooks', ridibooks_id)
    mi.cover_url = cover_url
    mi.has_cover = bool(cover_url)
    mi.publisher = _format_item(book_info['publisher']['name'])
    mi.pubdate = _format_date(book_info['datePublished'])
    mi.comments = _format_item(book_info['description'])
    mi.rating = float(_find_meta(meta, 'books:rating:normalized_value'))
    # Series: "<name> <N>권" in the title; escapes doubled so \s and \d
    # are not invalid escape sequences in a non-raw literal
    series = re.search(u'(.*)\\s*(\\d+)권', title)
    if series:
        mi.series = series.group(1)
        mi.series_index = float(series.group(2))
    mi.language = 'Korean'
    mi.source_relevance = self.relevance

    if ridibooks_id:
        if isbn:
            self.plugin.cache_isbn_to_identifier(isbn, ridibooks_id)
        if cover_url:
            self.plugin.cache_identifier_to_cover_url(ridibooks_id, cover_url)
    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def get_metadata_(src, encoding=None):
    """Read book metadata embedded in an HTML source string.

    :param src: HTML source, as str or bytes
    :param encoding: encoding of src when it is bytes; auto-detected if None
    :return: a Metadata object populated from comment/meta tags
    """
    # Meta data definitions as in
    # https://www.mobileread.com/forums/showpost.php?p=712544&postcount=9
    if isbytestring(src):
        if not encoding:
            src = xml_to_unicode(src)[0]
        else:
            src = src.decode(encoding, 'replace')
    src = src[:150000]  # Searching shouldn't take too long
    comment_tags, meta_tags, meta_tag_ids, title_tag = parse_metadata(src)

    def get_all(field):
        # All values for field, comment tags taking precedence over meta tags
        ans = comment_tags.get(field, meta_tags.get(field, None))
        if ans:
            ans = [x.strip() for x in ans if x.strip()]
        if not ans:
            ans = None
        return ans

    def get(field):
        # First value for field, or None
        ans = get_all(field)
        if ans:
            ans = ans[0]
        return ans

    # Title
    title = get('title') or title_tag.strip() or _('Unknown')
    # Author
    authors = authors_to_string(get_all('authors')) or _('Unknown')
    # Create MetaInformation with Title and Author
    mi = Metadata(title, string_to_authors(authors))
    # Single-value text fields
    for field in ('publisher', 'isbn'):
        val = get(field)
        if val:
            setattr(mi, field, val)
    # Multi-value text fields
    for field in ('languages',):
        val = get_all(field)
        if val:
            setattr(mi, field, val)
    # HTML fields: escape the raw text so it is safe to embed as HTML
    # (the entity references had been corrupted into identity replacements)
    for field in ('comments',):
        val = get(field)
        if val:
            setattr(mi, field, val.replace('&', '&amp;').replace('<', '&lt;').replace(
                '>', '&gt;').replace('"', '&quot;').replace("'", '&#39;'))
    # Date fields
    for field in ('pubdate', 'timestamp'):
        try:
            val = parse_date(get(field))
        except:
            pass
        else:
            if not is_date_undefined(val):
                setattr(mi, field, val)
    # SERIES: a trailing "[N]" in the series name is the series index
    series = get('series')
    if series:
        pat = re.compile(r'\[([.0-9]+)\]$')
        match = pat.search(series)
        series_index = None
        if match is not None:
            try:
                series_index = float(match.group(1))
            except:
                pass
            series = series.replace(match.group(), '').strip()
        mi.series = series
        if series_index is None:
            series_index = get('series_index')
            try:
                series_index = float(series_index)
            except:
                # discard non-numeric values instead of assigning a raw
                # string to mi.series_index below
                series_index = None
        if series_index is not None:
            mi.series_index = series_index
    # RATING: out-of-range values are reset to 0
    rating = get('rating')
    if rating:
        try:
            mi.rating = float(rating)
            if mi.rating < 0:
                mi.rating = 0
            if mi.rating > 10:
                mi.rating = 0
        except:
            pass
    # TAGS: comma-separated within each value
    tags = get_all('tags')
    if tags:
        tags = [x.strip() for s in tags for x in s.split(',') if x.strip()]
        if tags:
            mi.tags = tags
    # IDENTIFIERS
    for (k, v) in iteritems(meta_tag_ids):
        v = [x.strip() for x in v if x.strip()]
        if v:
            mi.set_identifier(k, v[0])
    return mi
def load_details(self, url, timeout):
    """Fetch a ridibooks.com detail page, scrape metadata from the
    OpenGraph <meta> tags and the schema.org JSON-LD block, and put the
    resulting Metadata on the plugin's result queue."""

    def _format_item(text):
        # Strip surrounding double quotes and unescape HTML entities
        return re.sub('^"(.*)"$', '\\1', unescape(text))

    def _format_list(text):
        # Comma-separated string -> list of stripped items
        return [_.strip() for _ in _format_item(text).split(',')]

    def _find_meta(node, prop):
        # First <meta property=prop content=...> value
        return [
            _.get('content') for _ in node if _.get('property') == prop
        ][0]

    def _format_date(date_text):
        # Dates are compact YYYYMMDD strings
        year = int(date_text[0:4])
        month = int(date_text[4:6])
        day = int(date_text[6:])
        return datetime.datetime(year, month, day, tzinfo=utc_tz)

    try:
        response = self.browser.open(url, timeout=timeout)
        root = lxml.html.fromstring(response.read())
        # Items loaded from <meta> tags:
        # book id, title, ISBN, cover image URL, rating
        meta = root.xpath(
            '//meta[starts-with(@property, "og") or starts-with(@property, "books")]'
        )
        # Items loaded from the schema.org JSON:
        # title, authors, description, publisher
        ld_json = root.xpath(
            '//script[@type="application/ld+json"]/text()')
        ld = [json.loads(_) for _ in ld_json]
        book_info = [_ for _ in ld if _['@type'] == 'Book'][0]
    except Exception as e:
        # NOTE(review): execution deliberately continues after logging;
        # if the fetch itself failed, the lookups below will raise.
        self.log.exception(e)

    ridibooks_id = re.search('id=([0-9]+)', url).group(1)
    isbn = _find_meta(meta, 'books:isbn')
    cover_url = _find_meta(meta, 'og:image')
    title = _find_meta(meta, 'og:title')
    authors = _format_list(book_info['author']['name'])
    # 'in' test instead of the Python 2-only dict.has_key()
    if 'translator' in book_info:
        authors.extend([
            _ + u'(역자)'
            for _ in _format_list(book_info['translator']['name'])
        ])

    mi = Metadata(title, authors)
    mi.set_identifier('ridibooks', ridibooks_id)
    mi.cover_url = cover_url
    mi.has_cover = bool(cover_url)
    mi.publisher = _format_item(book_info['publisher']['name'])
    mi.pubdate = _format_date(book_info['datePublished'])
    mi.comments = _format_item(book_info['description'])
    mi.rating = float(_find_meta(meta, 'books:rating:normalized_value'))
    # Series: "<name> <N>권" in the title; escapes doubled so \s and \d
    # are not invalid escape sequences in a non-raw literal
    series = re.search(u'(.*)\\s*(\\d+)권', title)
    if series:
        mi.series = series.group(1)
        mi.series_index = float(series.group(2))
    mi.language = 'Korean'
    mi.source_relevance = self.relevance

    if ridibooks_id:
        if isbn:
            self.plugin.cache_isbn_to_identifier(isbn, ridibooks_id)
        if cover_url:
            self.plugin.cache_identifier_to_cover_url(
                ridibooks_id, cover_url)
    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse_details(self, root):
    """Scrape metadata from a parsed moly.hu detail page and push the
    resulting Metadata object onto the plugin's result queue.

    Best-effort: each field is parsed in its own try block so one bad
    field does not lose the rest.
    """
    try:
        moly_id = self.parse_moly_id(self.url)
        self.log.info('Parsed moly.hu identifier: %s'%moly_id)
    except:
        self.log.exception('Error parsing moly.hu id for url: %r'%self.url)
        moly_id = None
    try:
        title = self.parse_title(root)
        self.log.info('Parsed title: %s'%title)
    except:
        self.log.exception('Error parsing title for url: %r'%self.url)
        title = None
    try:
        authors = self.parse_authors(root)
        self.log.info('Parsed authors: %s'%authors)
    except:
        self.log.exception('Error parsing authors for url: %r'%self.url)
        authors = []
    # Title, authors and the site identifier are mandatory; bail out
    # without queueing a result if any is missing.
    if not title or not authors or not moly_id:
        self.log.error('Could not find title/authors/moly.hu id for %r'%self.url)
        self.log.error('Moly.hu id: %r Title: %r Authors: %r'%(moly_id, title, authors))
        return
    mi = Metadata(title, authors)
    mi.set_identifier('moly_hu', moly_id)
    self.moly_id = moly_id
    try:
        isbn = self.parse_isbn(root)
        self.log.info('Parsed ISBN: %s'%isbn)
        if isbn:
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception('Error parsing ISBN for url: %r'%self.url)
    try:
        series_info = self.parse_series(root)
        if series_info is not None:
            mi.series = series_info[0]
            mi.series_index = int(series_info[1])
            self.log.info('Parsed series: %s, series index: %f'%(mi.series,mi.series_index))
    except:
        self.log.exception('Error parsing series for url: %r'%self.url)
    try:
        mi.comments = self.parse_comments(root)
        self.log.info('Parsed comments: %s'%mi.comments)
    except:
        self.log.exception('Error parsing comments for url: %r'%self.url)
    try:
        self.cover_url = self.parse_covers(root)
        self.log.info('Parsed URL for cover: %r'%self.cover_url)
        self.plugin.cache_identifier_to_cover_url(self.moly_id, self.cover_url)
        mi.has_cover = bool(self.cover_url)
    except:
        self.log.exception('Error parsing cover for url: %r'%self.url)
    try:
        mi.tags = self.parse_tags(root)
        self.log.info('Parsed tags: %s'%mi.tags)
    except:
        self.log.exception('Error parsing tags for url: %r'%self.url)
    try:
        # Languages are derived from the parsed tags, not the page itself
        mi.languages = self.parse_languages(mi.tags)
        self.log.info('Parsed languages: %r'%mi.languages)
    except:
        self.log.exception('Error parsing language for url: %r'%self.url)
    try:
        mi.publisher = self.parse_publisher(root)
        self.log.info('Parsed publisher: %s'%mi.publisher)
    except:
        self.log.exception('Error parsing publisher for url: %r'%self.url)
    try:
        mi.pubdate = self.parse_published_date(root)
        self.log.info('Parsed publication date: %s'%mi.pubdate)
    except:
        self.log.exception('Error parsing published date for url: %r'%self.url)
    try:
        mi.rating = self.parse_rating(root)
        self.log.info('Parsed rating: %s\n\n'%mi.rating)
    except:
        # Fixed copy/paste error: this failure was logged as 'Error parsing tags'
        self.log.exception('Error parsing rating for url: %r\n\n'%self.url)
    mi.source_relevance = self.relevance
    if self.moly_id and self.isbn:
        self.plugin.cache_isbn_to_identifier(self.isbn, self.moly_id)
    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse_details(self, root):
    # Scrape metadata from a parsed CBDB (Czech book database) detail page
    # and push the resulting Metadata object onto the result queue.
    # Best-effort: each field is parsed in its own try block.
    try:
        CBDB_id = self.parse_CBDB_id(self.url)
    except:
        self.log.exception('Error parsing CBDB id for url: %r' % self.url)
        CBDB_id = None
    try:
        (title, series, series_index) = self.parse_title_series(root)
    except:
        self.log.exception('Error parsing title and series for url: %r' % self.url)
        title = series = series_index = None
    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []
    # Title, authors and the site identifier are mandatory; without them
    # no result is queued at all.
    if not title or not authors or not CBDB_id:
        self.log.error('Could not find title/authors/CBDB id for %r' % self.url)
        self.log.error('CBDB: %r Title: %r Authors: %r' % (CBDB_id, title, authors))
        return
    mi = Metadata(title, authors)
    if series:
        mi.series = series
        mi.series_index = series_index
    #mi.identifiers['cbdb'] = CBDB_id
    mi.set_identifier('cbdb', CBDB_id)
    #self.log.info(CBDB_id)
    #self.log.info(mi.identifiers.get('cbdb', None))
    self.CBDB_id = CBDB_id
    try:
        mi.rating = self.parse_rating(root)
    except:
        self.log.exception('Error parsing ratings for url: %r' % self.url)
    # summary
    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
    try:
        self.cover_urls = self.parse_covers(root)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_urls)
    #self.log.info('covers')
    #self.log.info(self.cover_urls)
    try:
        tags = self.parse_tags(root)
        if tags:
            mi.tags = tags
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)
    try:
        # Publisher, publication date and ISBN all come from the
        # editions table
        mi.publisher, mi.pubdate, isbn = self.parse_editions(root)
        if isbn:
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception('Error parsing publisher and date for url: %r' % self.url)
    mi.source_relevance = self.relevance
    mi.language = 'Czech'
    #self.log.info('self.CBDB_id = ' + str(self.CBDB_id ))
    if self.CBDB_id:
        if self.isbn:
            # Remember the ISBN -> CBDB-id mapping for future lookups
            self.plugin.cache_isbn_to_identifier(self.isbn, self.CBDB_id)
        if self.cover_urls:
            self.plugin.cache_identifier_to_cover_url(self.CBDB_id, self.cover_urls)
    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse_details(self, root): try: isbn = self.extract_isbn(self.url) except: self.log.exception('No ISBN in URL: %r'%self.url) isbn = None try: (title, series, series_index) = self.parse_title_series(root) except: self.log.exception('Error parsing title and series for url: %r'%self.url) title = series = series_index = None try: authors = self.parse_authors(root) except: self.log.exception('Error parsing authors for url: %r'%self.url) authors = [] if not title or not authors or not isbn: self.log.error('Could not find title/authors/Aladin id for %r'%self.url) self.log.error('Aladin: %r Title: %r Authors: %r'%(isbn, title, authors)) return mi = Metadata(title, authors) if series: mi.series = series mi.series_index = series_index #mi.set_identifier('isbn', isbn) mi.isbn = isbn self.isbn = isbn # ISBN-13 try: isbn = self.parse_isbn(root) if isbn: self.isbn = mi.isbn = isbn except: self.log.exception('Error parsing ISBN for url: %r'%self.url) try: mi.comments = self.parse_comments(root) except: self.log.exception('Error parsing comments for url: %r'%self.url) try: self.cover_url = self.parse_cover(root) except: self.log.exception('Error parsing cover for url: %r'%self.url) mi.has_cover = bool(self.cover_url) mi.cover_url = self.cover_url # This is purely so we can run a test for it!!! if mi.has_cover: self.log.info('Cover URL: '+mi.cover_url) try: mi.publisher = self.parse_publisher(root) except: self.log.exception('Error parsing publisher for url: %r'%self.url) try: mi.pubdate = self.parse_published_date(root) except: self.log.exception('Error parsing published date for url: %r'%self.url) mi.language = 'ko' mi.source_relevance = self.relevance self.plugin.clean_downloaded_metadata(mi) self.result_queue.put(mi)
def metadata_from_xmp_packet(raw_bytes):
    """Parse an Adobe XMP metadata packet and return a calibre Metadata
    object.

    Raises ValueError for the corrupted packets generated by Nitro PDF;
    everything else is best-effort (bad individual fields are skipped).
    """
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata(_('Unknown'))
    title = first_alt('//dc:title', root)
    if title:
        if title.startswith(r'\376\377'):
            # corrupted XMP packet generated by Nitro PDF. See
            # https://bugs.launchpad.net/calibre/+bug/1541981
            raise ValueError(
                'Corrupted XMP metadata packet detected, probably generated by Nitro PDF'
            )
        mi.title = title
    authors = multiple_sequences('//dc:creator', root)
    if authors:
        # Each sequence entry may itself contain several author names
        mi.authors = [au for aus in authors for au in string_to_authors(aus)]
    tags = multiple_sequences('//dc:subject', root) or multiple_sequences(
        '//pdf:Keywords', root)
    if tags:
        mi.tags = tags
    comments = first_alt('//dc:description', root)
    if comments:
        mi.comments = comments
    publishers = multiple_sequences('//dc:publisher', root)
    if publishers:
        mi.publisher = publishers[0]
    try:
        pubdate = parse_date(first_sequence('//dc:date', root) or
                             first_simple('//xmp:CreateDate', root), assume_utc=False)
    except:
        pass
    else:
        mi.pubdate = pubdate
    bkp = first_simple('//xmp:CreatorTool', root)
    if bkp:
        mi.book_producer = bkp
    # Use the more recent of MetadataDate and ModifyDate as the
    # metadata timestamp
    md = safe_parse_date(first_simple('//xmp:MetadataDate', root))
    mod = safe_parse_date(first_simple('//xmp:ModifyDate', root))
    fd = more_recent(md, mod)
    if fd is not None:
        mi.metadata_date = fd
    rating = first_simple('//calibre:rating', root)
    if rating is not None:
        try:
            rating = float(rating)
            if 0 <= rating <= 10:
                mi.rating = rating
        except (ValueError, TypeError):
            pass
    series, series_index = read_series(root)
    if series:
        mi.series, mi.series_index = series, series_index
    for x in ('title_sort', 'author_sort'):
        for elem in XPath('//calibre:' + x)(root):
            val = read_simple_property(elem)
            if val:
                setattr(mi, x, val)
                break
    for x in ('author_link_map', 'user_categories'):
        val = first_simple('//calibre:' + x, root)
        if val:
            try:
                setattr(mi, x, json.loads(val))
            except:
                pass
    languages = multiple_sequences('//dc:language', root)
    if languages:
        languages = list(filter(None, map(canonicalize_lang, languages)))
        if languages:
            mi.languages = languages
    identifiers = {}
    for xmpid in XPath('//xmp:Identifier')(root):
        for scheme, value in read_xmp_identifers(xmpid):
            if scheme and value:
                identifiers[scheme.lower()] = value
    for namespace in ('prism', 'pdfx'):
        for scheme in KNOWN_ID_SCHEMES:
            if scheme not in identifiers:
                val = first_simple('//%s:%s' % (namespace, scheme), root)
                scheme = scheme.lower()
                if scheme == 'isbn':
                    val = check_isbn(val)
                elif scheme == 'doi':
                    val = check_doi(val)
                if val:
                    identifiers[scheme] = val
    # Check Dublin Core for recognizable identifier types
    for scheme, check_func in iteritems({
            'doi': check_doi,
            'isbn': check_isbn
    }):
        if scheme not in identifiers:
            val = check_func(first_simple('//dc:identifier', root))
            if val:
                # store under the scheme being probed (was always 'doi',
                # which mislabelled ISBNs found in dc:identifier)
                identifiers[scheme] = val
    if identifiers:
        mi.set_identifiers(identifiers)
    read_user_metadata(mi, root)
    return mi
def parse_details(self, raw, root):
    """Parse an Amazon product page into a ``Metadata`` object and put it
    on ``self.result_queue``.

    ``raw`` is the raw downloaded HTML (bytes), ``root`` the parsed lxml
    tree of the same page.  Each field is parsed inside its own try/except
    so a failure in one parser never aborts the whole record; errors are
    logged and the field is simply left unset.
    """
    try:
        asin = self.parse_asin(root)
    except:
        self.log.exception('Error parsing asin for url: %r'%self.url)
        asin = None
    if self.testing:
        # Keep the downloaded page on disk so failing test runs can be
        # inspected afterwards.
        import tempfile, uuid
        with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4()))+ '_',
                suffix='.html', delete=False) as f:
            f.write(raw)
        print ('Downloaded html for', asin, 'saved in', f.name)
    try:
        title = self.parse_title(root)
    except:
        self.log.exception('Error parsing title for url: %r'%self.url)
        title = None
    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r'%self.url)
        authors = []
    # Title, authors and ASIN are mandatory; without them no result is queued.
    if not title or not authors or not asin:
        self.log.error('Could not find title/authors/asin for %r'%self.url)
        self.log.error('ASIN: %r Title: %r Authors: %r'%(asin, title,
            authors))
        return
    mi = Metadata(title, authors)
    # Non-.com domains get their own identifier namespace (amazon_de, ...)
    idtype = 'amazon' if self.domain == 'com' else 'amazon_'+self.domain
    mi.set_identifier(idtype, asin)
    self.amazon_id = asin
    try:
        mi.rating = self.parse_rating(root)
    except:
        self.log.exception('Error parsing ratings for url: %r'%self.url)
    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r'%self.url)
    try:
        series, series_index = self.parse_series(root)
        if series:
            mi.series, mi.series_index = series, series_index
        elif self.testing:
            mi.series, mi.series_index = 'Dummy series for testing', 1
    except:
        self.log.exception('Error parsing series for url: %r'%self.url)
    try:
        mi.tags = self.parse_tags(root)
    except:
        self.log.exception('Error parsing tags for url: %r'%self.url)
    try:
        self.cover_url = self.parse_cover(root, raw)
    except:
        self.log.exception('Error parsing cover for url: %r'%self.url)
    mi.has_cover = bool(self.cover_url)
    non_hero = CSSSelect('div#bookDetails_container_div div#nonHeroSection')(root)
    if non_hero:
        # New style markup
        try:
            self.parse_new_details(root, mi, non_hero[0])
        except:
            self.log.exception('Failed to parse new-style book details section')
    else:
        # Old style markup: the product-details block holds ISBN,
        # publisher, publication date and language.
        pd = root.xpath(self.pd_xpath)
        if pd:
            pd = pd[0]
            try:
                isbn = self.parse_isbn(pd)
                if isbn:
                    self.isbn = mi.isbn = isbn
            except:
                self.log.exception('Error parsing ISBN for url: %r'%self.url)
            try:
                mi.publisher = self.parse_publisher(pd)
            except:
                self.log.exception('Error parsing publisher for url: %r'%self.url)
            try:
                mi.pubdate = self.parse_pubdate(pd)
            except:
                self.log.exception('Error parsing publish date for url: %r'%self.url)
            try:
                lang = self.parse_language(pd)
                if lang:
                    mi.language = lang
            except:
                self.log.exception('Error parsing language for url: %r'%self.url)
        else:
            self.log.warning('Failed to find product description for url: %r'%self.url)
    mi.source_relevance = self.relevance
    if self.amazon_id:
        # Cache cross-references so later queries can skip the download.
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.amazon_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.amazon_id,
                self.cover_url)
    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse_details(self, root):
    """Parse a Goodreads book page (parsed lxml tree ``root``) into a
    ``Metadata`` object and put it on ``self.result_queue``.

    Every field parser runs in its own try/except: a failure is logged and
    the corresponding field is left unset rather than aborting the record.
    """
    try:
        goodreads_id = self.parse_goodreads_id(self.url)
    except:
        self.log.exception("Error parsing goodreads id for url: %r" % self.url)
        goodreads_id = None
    try:
        # Title and series come from the same page element.
        (title, series, series_index) = self.parse_title_series(root)
    except:
        self.log.exception("Error parsing title and series for url: %r" % self.url)
        title = series = series_index = None
    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception("Error parsing authors for url: %r" % self.url)
        authors = []
    # Title, authors and the goodreads id are mandatory.
    if not title or not authors or not goodreads_id:
        self.log.error("Could not find title/authors/goodreads id for %r" % self.url)
        self.log.error("Goodreads: %r Title: %r Authors: %r" % (goodreads_id, title,
            authors))
        return
    mi = Metadata(title, authors)
    if series:
        mi.series = series
        mi.series_index = series_index
    mi.set_identifier("goodreads", goodreads_id)
    self.goodreads_id = goodreads_id
    try:
        isbn = self.parse_isbn(root)
        if isbn:
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception("Error parsing ISBN for url: %r" % self.url)
    try:
        mi.rating = self.parse_rating(root)
    except:
        self.log.exception("Error parsing ratings for url: %r" % self.url)
    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception("Error parsing comments for url: %r" % self.url)
    try:
        self.cover_url = self.parse_cover(root)
    except:
        self.log.exception("Error parsing cover for url: %r" % self.url)
    mi.has_cover = bool(self.cover_url)
    try:
        tags = self.parse_tags(root)
        if tags:
            mi.tags = tags
    except:
        self.log.exception("Error parsing tags for url: %r" % self.url)
    try:
        mi.publisher, mi.pubdate = self.parse_publisher_and_date(root)
    except:
        self.log.exception("Error parsing publisher and date for url: %r" % self.url)
    mi.source_relevance = self.relevance
    if self.goodreads_id:
        # Cache cross-references so later queries can skip the download.
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.goodreads_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.goodreads_id,
                self.cover_url)
    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse_details(self, raw, root):
    """Parse a 17k.com book page into a ``Metadata`` object and put it on
    ``self.result_queue``.  ``raw`` is the raw HTML bytes, ``root`` the
    parsed lxml tree.  Field parsers each run in their own try/except so a
    single failure is logged and skipped instead of aborting the record.
    """
    # Parse each metadata field from the page.
    #self.log.info("=====")
    try:
        asin = self.parse_asin(root)
    except:
        self.log.exception('Error parsing asin for url: %r' % self.url)
        asin = None
    if self.testing:
        # Keep the downloaded page for later inspection of test failures.
        import tempfile, uuid
        with tempfile.NamedTemporaryFile(
                prefix=(asin or str(uuid.uuid4())) + '_',
                suffix='.html',
                delete=False) as f:
            f.write(raw)
        print('Downloaded html for', asin, 'saved in', f.name)
    # Extract the title
    try:
        title = self.parse_title(root)
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        title = None
    # Extract the authors
    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []
    if not title or not authors or not asin:
        self.log.error('Could not find title/authors/asin for %r' % self.url)
        self.log.error('ASIN: %r Title: %r Authors: %r' %
                       (asin, title, authors))
        return
    # Build the Metadata object from title and authors; the remaining
    # fields are filled in below.
    mi = Metadata(title, authors)
    # Set the book id
    idtype = '17k'
    mi.set_identifier(idtype, asin)
    self.k17k_id = asin
    # Comments (book description)
    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
    # Series
    try:
        series, series_index = self.parse_series(root)
        if series:
            mi.series, mi.series_index = series, series_index
        elif self.testing:
            mi.series, mi.series_index = 'Dummy series for testing', 1
    except:
        self.log.exception('Error parsing series for url: %r' % self.url)
    # Tags
    try:
        mi.tags = self.parse_tags(root)
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)
    # Last-modified date (currently disabled)
    # try:
    #     mi.last_modified = self.parse_last_modified(root)
    # except:
    #     self.log.exception('Error parsing last_modified for url: %r'%self.url)
    # Cover
    try:
        self.cover_url = self.parse_cover(root, raw)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_url)
    mi.source_relevance = self.relevance
    mi.languages = [
        u'中文',
    ]
    if self.k17k_id:
        # NOTE(review): self.isbn is never assigned in this method —
        # presumably initialized elsewhere on the worker; verify.
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.k17k_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(
                self.k17k_id, self.cover_url)
    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse_details(self, raw, root):
    """Parse a Dangdang product page into a ``Metadata`` object and put it
    on ``self.result_queue``.

    ``raw`` is the raw HTML bytes, ``root`` the parsed lxml tree.  Each
    field parser runs in its own try/except so one failure is logged and
    skipped instead of aborting the record.

    :raises CaptchaError: when no dang id is found and the page is a
        CAPTCHA challenge form
    """
    dang_id = parse_dang_id(root, self.log, self.url)
    if not dang_id and root.xpath(
            '//form[@action="/errors/validateCaptcha"]'):
        raise CaptchaError(
            'Amazon returned a CAPTCHA page, probably because you downloaded too many books. Wait for some time and try again.'
        )
    if self.testing:
        # Keep the downloaded page for later inspection of test failures.
        import tempfile, uuid
        with tempfile.NamedTemporaryFile(
                prefix=(dang_id or str(uuid.uuid4())) + '_',
                suffix='.html',
                delete=False) as f:
            f.write(raw)
        print('Downloaded html for', dang_id, 'saved in', f.name)
    try:
        title = self.parse_title(root)
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        title = None
    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []
    # Title, authors and the dang id are mandatory.
    if not title or not authors or not dang_id:
        self.log.error('Could not find title/authors/dang_id for %r' %
                       self.url)
        self.log.error('ASIN: %r Title: %r Authors: %r' %
                       (dang_id, title, authors))
        return
    mi = Metadata(title, authors)
    idtype = 'dang'
    mi.set_identifier(idtype, dang_id)
    self.dang_id = dang_id
    try:
        mi.comments = self.parse_comments(root, raw)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
    try:
        series, series_index = self.parse_series(root)
        if series:
            mi.series, mi.series_index = series, series_index
        elif self.testing:
            mi.series, mi.series_index = 'Dummy series for testing', 1
    except:
        self.log.exception('Error parsing series for url: %r' % self.url)
    try:
        mi.tags = self.parse_tags(root)
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)
    try:
        self.cover_url = self.parse_cover(root, raw)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_url)
    # Product-details containers; ISBN/publisher/pubdate may live in any of
    # them.  (FIX: removed the unused `pd = root.xpath(self.pd_desc_xpath)`
    # local, which duplicated the pd_desc evaluation and was never read.)
    pd_info = root.xpath(self.pd_info_xpath)
    pd_info_store = root.xpath(self.pd_info_store_xpath)
    pd_desc = root.xpath(self.pd_desc_xpath)
    if pd_info or pd_info_store:
        try:
            isbn = self.parse_isbn(pd_info, pd_info_store, pd_desc)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception('Error parsing ISBN for url: %r' % self.url)
        # Prefer the regular info block; fall back to the store variant.
        if pd_info:
            pd_info = pd_info[0]
        else:
            pd_info = pd_info_store[0]
        try:
            mi.publisher = self.parse_publisher(pd_info)
        except:
            self.log.exception('Error parsing publisher for url: %r' %
                               self.url)
        try:
            mi.pubdate = self.parse_pubdate(pd_info)
        except:
            self.log.exception('Error parsing publish date for url: %r' %
                               self.url)
    else:
        self.log.warning('Failed to find product description for url: %r' %
                         self.url)
    mi.source_relevance = self.relevance
    if self.dang_id:
        # Cache cross-references so later queries can skip the download.
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.dang_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(
                self.dang_id, self.cover_url)
    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def _get_metadata(self, book_id, get_user_categories=True):  # {{{
    """Assemble a full ``Metadata`` object for ``book_id`` from the
    database field cache.

    :param book_id: the database id of the book
    :param get_user_categories: when True, also compute the mapping of
        user categories that this book belongs to
    :return: a populated ``Metadata`` instance
    """
    mi = Metadata(None, template_cache=self.formatter_template_cache)
    # Author data: display names plus per-author sort and link maps.
    author_ids = self._field_ids_for('authors', book_id)
    aut_list = [self._author_data(i) for i in author_ids]
    aum = []
    aus = {}
    aul = {}
    for rec in aut_list:
        aut = rec['name']
        aum.append(aut)
        aus[aut] = rec['sort']
        aul[aut] = rec['link']
    mi.title = self._field_for('title', book_id,
                               default_value=_('Unknown'))
    mi.authors = aum
    mi.author_sort = self._field_for('author_sort', book_id,
                                     default_value=_('Unknown'))
    mi.author_sort_map = aus
    mi.author_link_map = aul
    mi.comments = self._field_for('comments', book_id)
    mi.publisher = self._field_for('publisher', book_id)
    # Missing date fields default to "now".
    n = nowf()
    mi.timestamp = self._field_for('timestamp', book_id, default_value=n)
    mi.pubdate = self._field_for('pubdate', book_id, default_value=n)
    mi.uuid = self._field_for('uuid', book_id, default_value='dummy')
    mi.title_sort = self._field_for('sort', book_id,
                                    default_value=_('Unknown'))
    mi.book_size = self._field_for('size', book_id, default_value=0)
    mi.ondevice_col = self._field_for('ondevice', book_id, default_value='')
    mi.last_modified = self._field_for('last_modified', book_id,
                                       default_value=n)
    formats = self._field_for('formats', book_id)
    mi.format_metadata = {}
    mi.languages = list(self._field_for('languages', book_id))
    if not formats:
        good_formats = None
    else:
        mi.format_metadata = FormatMetadata(self, book_id, formats)
        good_formats = FormatsList(formats, mi.format_metadata)
    mi.formats = good_formats
    # has_cover is a display string, not a boolean.
    mi.has_cover = _('Yes') if self._field_for('cover', book_id,
                                               default_value=False) else ''
    mi.tags = list(self._field_for('tags', book_id, default_value=()))
    mi.series = self._field_for('series', book_id)
    if mi.series:
        mi.series_index = self._field_for('series_index', book_id,
                                          default_value=1.0)
    mi.rating = self._field_for('rating', book_id)
    mi.set_identifiers(self._field_for('identifiers', book_id,
                                       default_value={}))
    mi.application_id = book_id
    mi.id = book_id
    # Custom columns: plain columns are set immediately; composite columns
    # are deferred until all other fields are in place, since their
    # templates may reference those fields.
    composites = []
    for key, meta in self.field_metadata.custom_iteritems():
        mi.set_user_metadata(key, meta)
        if meta['datatype'] == 'composite':
            composites.append(key)
        else:
            val = self._field_for(key, book_id)
            if isinstance(val, tuple):
                val = list(val)
            extra = self._field_for(key+'_index', book_id)
            mi.set(key, val=val, extra=extra)
    for key in composites:
        mi.set(key, val=self._composite_for(key, book_id, mi))
    # User categories: collect the (name, category) pairs whose underlying
    # field value on this book matches the category item.
    user_cat_vals = {}
    if get_user_categories:
        user_cats = self.backend.prefs['user_categories']
        for ucat in user_cats:
            res = []
            for name,cat,ign in user_cats[ucat]:
                v = mi.get(cat, None)
                if isinstance(v, list):
                    if name in v:
                        res.append([name,cat])
                elif name == v:
                    res.append([name,cat])
            user_cat_vals[ucat] = res
    mi.user_categories = user_cat_vals
    return mi
def get_metadata_(src, encoding=None):
    """Extract ``Metadata`` from HTML source text.

    Fields are looked up first in comment tags, then in <meta> tags, with a
    regex fallback on <title> for the book title.

    :param src: the HTML source, bytes or unicode
    :param encoding: optional encoding of ``src``; auto-detected when None
    :return: a populated ``Metadata`` instance
    """
    # Meta data definitions as in
    # https://www.mobileread.com/forums/showpost.php?p=712544&postcount=9
    if isbytestring(src):
        if not encoding:
            src = xml_to_unicode(src)[0]
        else:
            src = src.decode(encoding, 'replace')
    src = src[:150000]  # Searching shouldn't take too long

    comment_tags = parse_comment_tags(src)
    meta_tags = parse_meta_tags(src)

    def get(field):
        # Comment tags take precedence over meta tags; empty or
        # whitespace-only values are normalized to None.
        ans = comment_tags.get(field, meta_tags.get(field, None))
        if ans:
            ans = ans.strip()
        if not ans:
            ans = None
        return ans

    # Title
    title = get('title')
    if not title:
        # Fall back to the <title> element.
        pat = re.compile('<title>([^<>]+?)</title>', re.IGNORECASE)
        match = pat.search(src)
        if match:
            title = replace_entities(match.group(1))

    # Author
    authors = get('authors') or _('Unknown')

    # Create MetaInformation with Title and Author
    mi = Metadata(title or _('Unknown'), string_to_authors(authors))
    for field in ('publisher', 'isbn', 'language', 'comments'):
        val = get(field)
        if val:
            setattr(mi, field, val)
    for field in ('pubdate', 'timestamp'):
        try:
            val = parse_date(get(field))
        except:
            pass
        else:
            if not is_date_undefined(val):
                setattr(mi, field, val)

    # SERIES
    series = get('series')
    if series:
        # A trailing "[1.5]" in the series value holds the series index.
        pat = re.compile(r'\[([.0-9]+)\]$')
        match = pat.search(series)
        series_index = None
        if match is not None:
            try:
                series_index = float(match.group(1))
            except:
                pass
            series = series.replace(match.group(), '').strip()
        mi.series = series
        if series_index is None:
            # Fall back to an explicit series_index field.
            series_index = get('series_index')
            try:
                series_index = float(series_index)
            except:
                pass
        if series_index is not None:
            mi.series_index = series_index

    # RATING
    rating = get('rating')
    if rating:
        try:
            mi.rating = float(rating)
            if mi.rating < 0:
                mi.rating = 0
            # Ratings above 5 are assumed to be on a 10-point scale.
            if mi.rating > 5:
                mi.rating /= 2.
            if mi.rating > 5:
                mi.rating = 0
        except:
            pass

    # TAGS
    tags = get('tags')
    if tags:
        tags = [x.strip() for x in tags.split(',') if x.strip()]
        if tags:
            mi.tags = tags

    return mi
def merge(self, results, min_year, do_asr=True): ans = Metadata(_('Unknown')) # We assume the shortest title has the least cruft in it ans.title = self.length_merge('title', results, null_value=ans.title) # No harm in having extra authors, maybe something useful like an # editor or translator ans.authors = self.length_merge('authors', results, null_value=ans.authors, shortest=False) # We assume the shortest publisher has the least cruft in it ans.publisher = self.length_merge('publisher', results, null_value=ans.publisher) # We assume the smallest set of tags has the least cruft in it ans.tags = self.length_merge('tags', results, null_value=ans.tags, shortest=msprefs['fewer_tags']) # We assume the longest series has the most info in it ans.series = self.length_merge('series', results, null_value=ans.series, shortest=False) for r in results: if r.series and r.series == ans.series: ans.series_index = r.series_index break # Average the rating over all sources ratings = [] for r in results: rating = r.rating if rating and rating > 0 and rating <= 5: ratings.append(rating) if ratings: ans.rating = int(round(sum(ratings)/len(ratings))) # Smallest language is likely to be valid ans.language = self.length_merge('language', results, null_value=ans.language) # Choose longest comments ans.comments = self.length_merge('comments', results, null_value=ans.comments, shortest=False) # Published date if min_year: for r in results: year = getattr(r.pubdate, 'year', None) if year == min_year: ans.pubdate = r.pubdate break if getattr(ans.pubdate, 'year', None) == min_year: min_date = datetime(min_year, ans.pubdate.month, ans.pubdate.day, tzinfo=utc_tz) else: min_date = datetime(min_year, 1, 2, tzinfo=utc_tz) ans.pubdate = min_date else: min_date = datetime(3001, 1, 1, tzinfo=utc_tz) for r in results: if r.pubdate is not None: candidate = as_utc(r.pubdate) if candidate < min_date: min_date = candidate if min_date.year < 3000: ans.pubdate = min_date # Identifiers for r in results: 
ans.identifiers.update(r.identifiers) # Cover URL ans.has_cached_cover_url = bool([r for r in results if getattr(r, 'has_cached_cover_url', False)]) # Merge any other fields with no special handling (random merge) touched_fields = set() for r in results: if hasattr(r, 'identify_plugin'): touched_fields |= r.identify_plugin.touched_fields for f in touched_fields: if f.startswith('identifier:') or not ans.is_null(f): continue setattr(ans, f, self.random_merge(f, results, null_value=getattr(ans, f))) if do_asr: avg = [x.relevance_in_source for x in results] avg = sum(avg)/len(avg) ans.average_source_relevance = avg return ans
def identify(self, log, result_queue, abort, title=None, authors=None,
             identifiers={}, timeout=30):
    """Query the DNB (Deutsche Nationalbibliothek) SRU interface and put
    one ``Metadata`` object per matching MARC21 record on ``result_queue``.

    :param log: calibre logger
    :param result_queue: queue receiving Metadata results
    :param abort: abort event (unused here, part of the Source API)
    :param title: book title, may be None
    :param authors: list of author names, may be None
    :param identifiers: dict that may carry 'dnb-idn' and/or 'isbn'
    :param timeout: network timeout in seconds
    """
    self.load_config()

    # get identifying tags from book
    idn = identifiers.get('dnb-idn', None)
    isbn = check_isbn(identifiers.get('isbn', None))

    # Ignore placeholder author names.  FIX: the original compared the
    # authors *list* against string literals with "is", which is never
    # true; filter the placeholder names out instead.
    if authors:
        authors = [a for a in authors
                   if a not in ('V. A.', 'V.A.', 'Unknown', 'Unbekannt')] or None

    if (isbn is None) and (idn is None) and (title is None) and (authors is None):
        log.info(
            "This plugin requires at least either ISBN, IDN, Title or Author(s)."
        )
        return None

    queries = []
    # DNB does not do an exact search when searching for a idn or isbn, so
    # we have to filter the results
    exact_search = {}
    if idn is not None:
        queries.append('num=' + idn)
        exact_search['idn'] = idn
    else:
        # Build several title/author variants, from most to least specific.
        authors_v = []
        title_v = []
        if authors is not None:
            authors_v.append(' '.join(authors))
            authors_v.append(' '.join(
                self.get_author_tokens(authors, only_first_author=False)))
            authors_v.append(' '.join(
                self.get_author_tokens(authors, only_first_author=True)))
        if title is not None:
            title_v.append(title)
            title_v.append(' '.join(
                self.get_title_tokens(title, strip_joiners=False,
                                      strip_subtitle=False)))
            title_v.append(' '.join(
                self.get_title_tokens(title, strip_joiners=False,
                                      strip_subtitle=True)))
        if isbn is not None:
            exact_search['isbn'] = isbn

        # title and author
        if authors is not None and title is not None:
            for a in authors_v:
                for t in title_v:
                    if isbn is not None:
                        queries.append('tit="' + t + '" AND per="' + a +
                                       '" AND num="' + isbn + '"')
                    else:
                        queries.append('tit="' + t + '" AND per="' + a + '"')
            # try with author and title swapped
            if isbn is not None:
                queries.append('per="' + title + '" AND tit="' + authors[0] +
                               '" AND num="' + isbn + '"')
            else:
                queries.append('per="' + title + '" AND tit="' + authors[0] + '"')
        # author but no title (FIX: comment was swapped with the branch below)
        elif authors is not None and title is None:
            for i in authors_v:
                if isbn is not None:
                    queries.append('per="' + i + '" AND num="' + isbn + '"')
                else:
                    queries.append('per="' + i + '"')
            # try with author and title swapped
            if isbn is not None:
                queries.append('tit="' + authors[0] + '" AND num="' + isbn + '"')
            else:
                queries.append('tit="' + authors[0] + '"')
        # title but no author (FIX: comment was swapped with the branch above)
        elif authors is None and title is not None:
            for i in title_v:
                if isbn is not None:
                    queries.append('tit="' + i + '" AND num="' + isbn + '"')
                else:
                    queries.append('tit="' + i + '"')
            # try with author and title swapped
            if isbn is not None:
                queries.append('per="' + title + '" AND num="' + isbn + '"')
            else:
                queries.append('per="' + title + '"')
        # as last resort only use isbn
        if isbn is not None:
            queries.append('num=' + isbn)

        # Sort queries descending by length (assumption: longer query -> less but better results)
        #queries.sort(key=len)
        #queries.reverse()

    # remove duplicate queries while preserving their order
    uniqueQueries = []
    for i in queries:
        if i not in uniqueQueries:
            uniqueQueries.append(i)

    # Process queries
    results = None
    for query in uniqueQueries:
        # exclude non-book material
        query = query + ' NOT (mat=film OR mat=music OR mat=microfiches)'
        log.info(query)
        if self.cfg_dnb_token is None:
            results = self.getSearchResultsByScraping(log, query, timeout)
        else:
            results = self.getSearchResults(log, query, timeout)
        if results is None:
            continue

        log.info("Parsing records")
        ns = {'marc21': 'http://www.loc.gov/MARC21/slim'}
        for record in results:
            series = None
            series_index = None
            publisher = None
            pubdate = None
            languages = []
            title = None
            title_sort = None
            edition = None
            comments = None
            idn = None
            urn = None
            isbn = None
            ddc = []
            subjects_gnd = []
            subjects_non_gnd = []

            # Title: Field 245
            title_parts = []
            # if a,n,p exist: series = a, series_index = n, title = p
            for i in record.xpath(
                    ".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]/../marc21:subfield[@code='n' and string-length(text())>0]/../marc21:subfield[@code='p' and string-length(text())>0]/..",
                    namespaces=ns):
                series_index = i.xpath(".//marc21:subfield[@code='n']",
                                       namespaces=ns)[0].text.strip()
                match = re.search(r"(\d+[,\.\d+]?)", series_index)
                if match:
                    series_index = match.group(1)
                else:
                    # looks like sometimes DNB does not know the series
                    # index and uses something like "[...]"
                    series_index = "0"
                series_index = series_index.replace(',', '.')
                series = i.xpath(".//marc21:subfield[@code='a']",
                                 namespaces=ns)[0].text.strip()
                title_parts.append(
                    i.xpath(".//marc21:subfield[@code='p']",
                            namespaces=ns)[0].text.strip())
                log.info("Extracted Series: %s" % series)
                log.info("Extracted Series Index: %s" % series_index)
                break
            # otherwise: title = a
            if len(title_parts) == 0:
                for i in record.xpath(
                        ".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    title_parts.append(i.text.strip())
                    break
            # subtitle 1
            for i in record.xpath(
                    ".//marc21:datafield[@tag='245']/marc21:subfield[@code='b' and string-length(text())>0]",
                    namespaces=ns):
                title_parts.append(i.text.strip())
                break
            # subtitle 2
            #for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='c' and string-length(text())>0]",namespaces=ns):
            #    title = title + " / " + i.text.strip()
            #    break
            title = " : ".join(title_parts)
            log.info("Extracted Title: %s" % title)

            # Title_Sort: move a leading sort word (marked by control
            # characters 152/156) to the end.
            # FIX: guard against records with no 245$a at all (title_parts
            # empty), which crashed on title_parts[0].
            if title_parts:
                title_sort_parts = list(title_parts)
                title_sort_regex = re.match(
                    '^(.*?)(' + chr(152) + '.*' + chr(156) + ')?(.*?)$',
                    title_parts[0])
                sortword = title_sort_regex.group(2)
                if sortword:
                    title_sort_parts[0] = ''.join(
                        filter(None, [
                            title_sort_regex.group(1).strip(),
                            title_sort_regex.group(3).strip(),
                            ", " + sortword
                        ]))
                title_sort = " : ".join(title_sort_parts)
            log.info("Extracted Title_Sort: %s" % title_sort)

            # Authors
            authors = []
            author_sort = None
            for i in record.xpath(
                    ".//marc21:datafield[@tag='100']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]",
                    namespaces=ns):  # primary authors
                name = re.sub(r" \[.*\]$", "", i.text.strip())
                authors.append(name)
            for i in record.xpath(
                    ".//marc21:datafield[@tag='700']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]",
                    namespaces=ns):  # secondary authors
                name = re.sub(r" \[.*\]$", "", i.text.strip())
                authors.append(name)
            if len(authors) == 0:
                # if no "real" autor was found take all persons involved
                for i in record.xpath(
                        ".//marc21:datafield[@tag='700']/marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    name = re.sub(r" \[.*\]$", "", i.text.strip())
                    authors.append(name)
            if len(authors) > 0:
                author_sort = authors[0]
            log.info("Extracted Authors: %s" % " & ".join(authors))

            # Comments: download the blurb DNB deposits for some books.
            for i in record.xpath(
                    ".//marc21:datafield[@tag='856']/marc21:subfield[@code='u' and string-length(text())>0]",
                    namespaces=ns):
                if i.text.startswith("http://deposit.dnb.de/"):
                    br = self.browser
                    log.info('Downloading Comments from: %s' % i.text)
                    try:
                        comments = br.open_novisit(i.text, timeout=30).read()
                        comments = sanitize_comments_html(comments)
                        log.info('Comments: %s' % comments)
                        break
                    except:
                        log.info("Could not download Comments from %s" % i)

            # Publisher Name and Location (field 264); prefer records that
            # carry both, then name only, then location only.
            publisher_name = None
            publisher_location = None
            fields = record.xpath(
                ".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..",
                namespaces=ns)
            if len(fields) > 0:
                publisher_name = fields[0].xpath(
                    ".//marc21:subfield[@code='b' and string-length(text())>0]",
                    namespaces=ns)[0].text.strip()
                publisher_location = fields[0].xpath(
                    ".//marc21:subfield[@code='a' and string-length(text())>0]",
                    namespaces=ns)[0].text.strip()
            else:
                fields = record.xpath(
                    ".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../..",
                    namespaces=ns)
                if len(fields) > 0:
                    publisher_name = fields[0].xpath(
                        ".//marc21:subfield[@code='b' and string-length(text())>0]",
                        namespaces=ns)[0].text.strip()
                else:
                    fields = record.xpath(
                        ".//marc21:datafield[@tag='264']/marc21:subfield[@code='a' and string-length(text())>0]/../..",
                        namespaces=ns)
                    if len(fields) > 0:
                        publisher_location = fields[0].xpath(
                            ".//marc21:subfield[@code='a' and string-length(text())>0]",
                            namespaces=ns)[0].text.strip()
            log.info("Extracted Publisher: %s" % publisher_name)
            log.info("Extracted Publisher Location: %s" % publisher_location)

            # Publishing Date (first 4-digit year in 264$c; day 2 avoids
            # timezone underflow into the previous year)
            for i in record.xpath(
                    ".//marc21:datafield[@tag='264']/marc21:subfield[@code='c' and string-length(text())>=4]",
                    namespaces=ns):
                match = re.search(r"(\d{4})", i.text.strip())
                if match is not None:
                    year = match.group(1)
                    pubdate = datetime.datetime(int(year), 1, 2)
                    break
            log.info("Extracted Publication Year: %s" % pubdate)

            # ID: IDN
            for i in record.xpath(
                    ".//marc21:datafield[@tag='016']/marc21:subfield[@code='a' and string-length(text())>0]",
                    namespaces=ns):
                idn = i.text.strip()
                break
            log.info("Extracted ID IDN: %s" % idn)
            if "idn" in exact_search:
                if idn != exact_search["idn"]:
                    log.info(
                        "Extracted IDN does not match book's IDN, skipping record"
                    )
                    continue

            # ID: URN
            for i in record.xpath(
                    ".//marc21:datafield[@tag='024']/marc21:subfield[@code='2' and text()='urn']/../marc21:subfield[@code='a' and string-length(text())>0]",
                    namespaces=ns):
                urn = i.text.strip()
                break
            log.info("Extracted ID URN: %s" % urn)

            # ID: ISBN
            for i in record.xpath(
                    ".//marc21:datafield[@tag='020']/marc21:subfield[@code='a' and string-length(text())>0]",
                    namespaces=ns):
                isbn_regex = r"(?:ISBN(?:-1[03])?:? )?(?=[-0-9 ]{17}|[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?(?:[0-9]+[- ]?){2}[0-9X]"
                match = re.search(isbn_regex, i.text.strip())
                # FIX: guard the match — the original crashed with
                # AttributeError on a 020$a that does not contain an ISBN.
                if match is not None:
                    isbn = match.group()
                    isbn = isbn.replace('-', '')
                break
            log.info("Extracted ID ISBN: %s" % isbn)
            if "isbn" in exact_search:
                if isbn != exact_search["isbn"]:
                    log.info(
                        "Extracted ISBN does not match book's ISBN, skipping record"
                    )
                    continue

            # ID: Sachgruppe (DDC)
            for i in record.xpath(
                    ".//marc21:datafield[@tag='082']/marc21:subfield[@code='a' and string-length(text())>0]",
                    namespaces=ns):
                ddc.append(i.text.strip())
            log.info("Extracted ID DDC: %s" % ",".join(ddc))

            # Series and Series_Index from field 830 (unless already taken
            # from field 245 above)
            if series is None and series_index is None:
                for i in record.xpath(
                        ".//marc21:datafield[@tag='830']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..",
                        namespaces=ns):
                    # Series Index
                    series_index = i.xpath(".//marc21:subfield[@code='v']",
                                           namespaces=ns)[0].text.strip()
                    match = re.search(r"(\d+[,\.\d+]?)", series_index)
                    if match is not None:
                        series_index = match.group(1)
                    else:
                        series_index = "0"
                    series_index = series_index.replace(',', '.')
                    log.info("Extracted Series Index: %s" % series_index)
                    # Series
                    series = i.xpath(".//marc21:subfield[@code='a']",
                                     namespaces=ns)[0].text.strip()
                    log.info("Extracted Series: %s" % series)
                    break

            # Try to extract Series, Series Index and Title from the fetched title.
            # Caution: This overwrites DNB's series/series_index and modifies the title!
            if self.cfg_guess_series is True:
                guessed_series = None
                guessed_series_index = None
                parts = re.split("[:]", self.removeSortingCharacters(title))
                if len(parts) == 2:
                    if bool(re.search(r"\d", parts[0])) != bool(
                            re.search(r"\d", parts[1])):
                        # figure out which part contains the index
                        if bool(re.search(r"\d", parts[0])):
                            indexpart = parts[0]
                            textpart = parts[1]
                        else:
                            indexpart = parts[1]
                            textpart = parts[0]
                        # remove odd characters from start and end of the text part
                        match = re.match(r"^[\s\-–:]*(.+?)[\s\-–:]*$",
                                         textpart)
                        if match:
                            textpart = match.group(1)
                        # from Titleparts like: "Name of the series - Episode 2"
                        match = re.match(
                            r"^\s*(\S.*?)[\(\/\.,\s\-–:]*(?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",
                            indexpart)
                        if match:
                            guessed_series_index = match.group(2)
                            guessed_series = match.group(1)
                            if guessed_series is None:
                                guessed_series = textpart
                                title = textpart + " : Band " + guessed_series_index
                            else:
                                title = textpart
                        else:
                            # from Titleparts like: "Episode 2 Name of the series"
                            match = re.match(
                                r"^\s*(?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*(\S.*?)[\/\.,\-–\s]*$",
                                indexpart)
                            if match:
                                guessed_series_index = match.group(1)
                                guessed_series = match.group(2)
                                if guessed_series is None:
                                    guessed_series = textpart
                                    title = textpart + " : Band " + guessed_series_index
                                else:
                                    title = textpart
                elif len(parts) == 1:
                    # from Titles like: "Name of the series - Title (Episode 2)"
                    match = re.match(
                        r"^\s*(\S.+?) \- (\S.+?) [\(\/\.,\s\-–:](?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",
                        parts[0])
                    if match:
                        guessed_series_index = match.group(3)
                        guessed_series = match.group(1)
                        title = match.group(2)
                    else:
                        # from Titles like: "Name of the series - Episode 2"
                        match = re.match(
                            r"^\s*(\S.+?)[\(\/\.,\s\-–:]*(?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",
                            parts[0])
                        if match:
                            guessed_series_index = match.group(2)
                            guessed_series = match.group(1)
                            title = guessed_series + " : Band " + guessed_series_index
                if guessed_series is not None and guessed_series_index is not None:
                    series = guessed_series
                    series_index = guessed_series_index
                    log.info("Guessed Series: %s" % series)
                    log.info("Guessed Series Index: %s" % series_index)

            # GND Subjects from 689
            for i in record.xpath(
                    ".//marc21:datafield[@tag='689']/marc21:subfield[@code='a' and string-length(text())>0]",
                    namespaces=ns):
                subjects_gnd.append(i.text.strip())
            # GND Subjects from 600-655
            for f in range(600, 656):
                for i in record.xpath(
                        ".//marc21:datafield[@tag='" + str(f) +
                        "']/marc21:subfield[@code='2' and text()='gnd']/../marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    if i.text.startswith("("):
                        continue
                    subjects_gnd.append(i.text)
            log.info("Extracted GND Subjects: %s" % " ".join(subjects_gnd))

            # Non-GND subjects from 600-655
            for f in range(600, 656):
                for i in record.xpath(
                        ".//marc21:datafield[@tag='" + str(f) +
                        "']/marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    # ignore entries starting with "(":
                    if i.text.startswith("("):
                        continue
                    subjects_non_gnd.extend(re.split(',|;', i.text))
            # remove one-character subjects.  FIX: the original removed
            # items from the list while iterating it, which skips elements.
            subjects_non_gnd = [s for s in subjects_non_gnd if len(s) >= 2]
            log.info("Extracted non-GND Subjects: %s" %
                     " ".join(subjects_non_gnd))

            # Edition
            for i in record.xpath(
                    ".//marc21:datafield[@tag='250']/marc21:subfield[@code='a' and string-length(text())>0]",
                    namespaces=ns):
                edition = i.text.strip()
                break
            log.info("Extracted Edition: %s" % edition)

            # Languages
            for i in record.xpath(
                    ".//marc21:datafield[@tag='041']/marc21:subfield[@code='a' and string-length(text())>0]",
                    namespaces=ns):
                languages.append(i.text.strip())
            if languages:
                log.info("Extracted Languages: %s" % ",".join(languages))

            # Put it all together
            if self.cfg_append_edition_to_title is True and edition is not None:
                title = title + " : " + edition

            # FIX: pass a real list of authors; the original passed a
            # lazy map object, which is not a list under Python 3.
            mi = Metadata(
                self.removeSortingCharacters(title),
                [self.removeSortingCharacters(a) for a in authors])
            mi.title_sort = self.removeSortingCharacters(title_sort)
            mi.author_sort = self.removeSortingCharacters(author_sort)
            mi.languages = languages
            mi.pubdate = pubdate
            mi.publisher = " : ".join(
                filter(None, [
                    publisher_location,
                    self.removeSortingCharacters(publisher_name)
                ]))
            mi.series = self.removeSortingCharacters(series)
            mi.series_index = series_index
            mi.comments = comments
            mi.isbn = isbn  # also required for cover download
            mi.set_identifier('urn', urn)
            mi.set_identifier('dnb-idn', idn)
            mi.set_identifier('ddc', ",".join(ddc))

            # cfg_fetch_subjects selects which subject pool feeds the tags:
            # 0 GND only, 1 GND preferred, 2 both, 3 non-GND preferred,
            # 4 non-GND only, 5 none.
            if self.cfg_fetch_subjects == 0:
                mi.tags = self.uniq(subjects_gnd)
            elif self.cfg_fetch_subjects == 1:
                if len(subjects_gnd) > 0:
                    mi.tags = self.uniq(subjects_gnd)
                else:
                    mi.tags = self.uniq(subjects_non_gnd)
            elif self.cfg_fetch_subjects == 2:
                mi.tags = self.uniq(subjects_gnd + subjects_non_gnd)
            elif self.cfg_fetch_subjects == 3:
                if len(subjects_non_gnd) > 0:
                    mi.tags = self.uniq(subjects_non_gnd)
                else:
                    mi.tags = self.uniq(subjects_gnd)
            elif self.cfg_fetch_subjects == 4:
                mi.tags = self.uniq(subjects_non_gnd)
            elif self.cfg_fetch_subjects == 5:
                mi.tags = []

            # put current result's metdata into result queue
            log.info("Final formatted result: %s" % mi)
            result_queue.put(mi)
def parse_details(self, raw, root):
    """Extract all available metadata from a downloaded Amazon product page.

    ``raw`` is the downloaded HTML (bytes); ``root`` is its parsed tree.
    Each field is parsed inside its own try/except so that one broken
    section of the page cannot prevent the remaining fields from being
    extracted.  On success a populated Metadata object is put on
    ``self.result_queue``; if title, authors or ASIN cannot be found the
    method returns early and queues nothing.
    """
    try:
        asin = self.parse_asin(root)
    except:
        self.log.exception('Error parsing asin for url: %r'%self.url)
        asin = None
    if self.testing:
        # Save the downloaded page to a temp file so failing test runs
        # can be inspected by hand.
        import tempfile, uuid
        with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4()))+ '_',
                suffix='.html', delete=False) as f:
            f.write(raw)
        print ('Downloaded html for', asin, 'saved in', f.name)
    try:
        title = self.parse_title(root)
    except:
        self.log.exception('Error parsing title for url: %r'%self.url)
        title = None
    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r'%self.url)
        authors = []
    if not title or not authors or not asin:
        # Without all three essentials the record is useless; bail out.
        self.log.error('Could not find title/authors/asin for %r'%self.url)
        self.log.error('ASIN: %r Title: %r Authors: %r'%(asin, title, authors))
        return
    mi = Metadata(title, authors)
    # The identifier type is domain specific, e.g. 'amazon_de' for amazon.de;
    # plain 'amazon' is reserved for the .com store.
    idtype = 'amazon' if self.domain == 'com' else 'amazon_'+self.domain
    mi.set_identifier(idtype, asin)
    self.amazon_id = asin
    try:
        mi.rating = self.parse_rating(root)
    except:
        self.log.exception('Error parsing ratings for url: %r'%self.url)
    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r'%self.url)
    try:
        series, series_index = self.parse_series(root)
        if series:
            mi.series, mi.series_index = series, series_index
        elif self.testing:
            # Give tests a deterministic series to assert against.
            mi.series, mi.series_index = 'Dummy series for testing', 1
    except:
        self.log.exception('Error parsing series for url: %r'%self.url)
    try:
        mi.tags = self.parse_tags(root)
    except:
        self.log.exception('Error parsing tags for url: %r'%self.url)
    try:
        self.cover_url = self.parse_cover(root, raw)
    except:
        self.log.exception('Error parsing cover for url: %r'%self.url)
    mi.has_cover = bool(self.cover_url)
    # Amazon serves two different product-details layouts; detect which one
    # this page uses and parse the publication details accordingly.
    non_hero = CSSSelect('div#bookDetails_container_div div#nonHeroSection')(root)
    if non_hero:
        # New style markup
        try:
            self.parse_new_details(root, mi, non_hero[0])
        except:
            self.log.exception('Failed to parse new-style book details section')
    else:
        # Old style markup: locate the product-description block and pull
        # ISBN, publisher, pubdate and language out of it.
        pd = root.xpath(self.pd_xpath)
        if pd:
            pd = pd[0]
            try:
                isbn = self.parse_isbn(pd)
                if isbn:
                    self.isbn = mi.isbn = isbn
            except:
                self.log.exception('Error parsing ISBN for url: %r'%self.url)
            try:
                mi.publisher = self.parse_publisher(pd)
            except:
                self.log.exception('Error parsing publisher for url: %r'%self.url)
            try:
                mi.pubdate = self.parse_pubdate(pd)
            except:
                self.log.exception('Error parsing publish date for url: %r'%self.url)
            try:
                lang = self.parse_language(pd)
                if lang:
                    mi.language = lang
            except:
                self.log.exception('Error parsing language for url: %r'%self.url)
        else:
            self.log.warning('Failed to find product description for url: %r'%self.url)
    mi.source_relevance = self.relevance
    # Prime the plugin caches so later ISBN/cover lookups can be answered
    # without re-downloading the page.
    if self.amazon_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.amazon_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.amazon_id,
                    self.cover_url)
    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def to_metadata(self, log, entry_, timeout):  # {{{
    """Convert one Douban API entry (a dict) into a calibre Metadata object.

    Entries missing either an id or a title are silently discarded
    (None is returned).  ``timeout`` is accepted for interface
    compatibility but not used here.
    """
    from calibre.utils.date import parse_date, utcnow

    log.info("to_metadata")

    douban_id = entry_.get("id")
    title = entry_.get("title")
    if not douban_id or not title:
        # Silently discard this entry
        return None

    authors = entry_.get("author") or ["Unknown"]

    mi = Metadata(title, authors)
    mi.identifiers = {"douban": douban_id}
    mi.publisher = entry_.get("publisher")
    mi.comments = entry_.get("summary")
    # NOTE: the API also has a 'subtitle' field, but std metadata does not.

    # ISBN -- the API may hand back a single string or a list of candidates.
    raw_isbn = entry_.get("isbn13")  # ISBN11 is obsolete, use ISBN13
    if isinstance(raw_isbn, (str, bytes)):
        valid_isbns = [raw_isbn] if check_isbn(raw_isbn) else []
    else:
        valid_isbns = [x for x in raw_isbn if check_isbn(x)]
    if valid_isbns:
        # Prefer the longest candidate (ISBN-13 over ISBN-10).
        mi.isbn = sorted(valid_isbns, key=len)[-1]
    mi.all_isbns = valid_isbns

    # Tags
    mi.tags = entry_.get("tags")

    # Publication date
    pubdate = entry_.get("pubdate")
    if pubdate:
        try:
            mi.pubdate = parse_date(
                pubdate, assume_utc=True,
                default=utcnow().replace(day=15))
        except BaseException:
            log.error("Failed to parse pubdate %r" % pubdate)

    # Rating: Douban rates on a 0-10 scale, calibre on 0-5.
    rating = entry_.get("rating")
    if rating:
        try:
            mi.rating = rating / 2.0
        except BaseException:
            log.exception("Failed to parse rating")
            mi.rating = 0

    # Cover -- a URL containing "book-default" is Douban's placeholder,
    # meaning the book has no real cover.
    mi.has_douban_cover = None
    cover_url = entry_.get("cover")
    if cover_url and "book-default" not in cover_url:
        mi.has_douban_cover = cover_url

    series = entry_.get("series")
    if series:
        mi.series = series

    return mi
def to_metadata(self, browser, log, entry_, timeout):  # {{{
    """Convert one Douban API entry (a dict) into a calibre Metadata object.

    Returns None when the entry lacks a douban id or a title (such
    entries are silently discarded).  ``browser`` and ``timeout`` are
    accepted for interface compatibility but not used here.

    Fixes over the previous version: bare ``except:`` clauses (which
    also swallowed KeyboardInterrupt/SystemExit) are narrowed to
    ``except Exception:``, and missing 'isbn13'/'tags' fields no longer
    raise TypeError.
    """
    from calibre.utils.date import parse_date, utcnow

    douban_id = entry_.get('id')
    title = entry_.get('title')
    description = entry_.get('summary')
    # subtitle = entry_.get('subtitle')  # TODO: std metadata doesn't have this field
    publisher = entry_.get('publisher')
    isbn = entry_.get('isbn13')  # ISBN11 is obsolete, use ISBN13
    pubdate = entry_.get('pubdate')
    authors = entry_.get('author')
    book_tags = entry_.get('tags')
    rating = entry_.get('rating')
    cover_url = entry_.get('images', {}).get('large')
    series = entry_.get('series')

    if not authors:
        authors = [_('Unknown')]
    if not douban_id or not title:
        # Silently discard this entry
        return None

    mi = Metadata(title, authors)
    mi.identifiers = {'douban': douban_id}
    mi.publisher = publisher
    mi.comments = description
    # mi.subtitle = subtitle

    # ISBN: the API may return a single string or a list of candidates.
    # Guard against a missing field (None), which used to crash the loop.
    isbns = []
    if isinstance(isbn, (str, bytes)):
        if check_isbn(isbn):
            isbns.append(isbn)
    elif isbn:
        isbns.extend(x for x in isbn if check_isbn(x))
    if isbns:
        # Prefer the longest candidate (ISBN-13 over ISBN-10).
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags: each entry is a dict with a 'name' key; tolerate a missing field.
    mi.tags = [tag['name'] for tag in (book_tags or [])]

    # pubdate
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except Exception:
            log.error('Failed to parse pubdate %r' % pubdate)

    # Ratings: Douban's average is on a 0-10 scale, calibre uses 0-5.
    if rating:
        try:
            mi.rating = float(rating['average']) / 2.0
        except Exception:
            log.exception('Failed to parse rating')
            mi.rating = 0

    # Cover: a URL containing "book-default" is Douban's placeholder image,
    # i.e. the book has no real cover.
    mi.has_douban_cover = None
    u = cover_url
    if u and u.find('book-default') == -1:
        mi.has_douban_cover = u

    # Series
    if series:
        mi.series = series['title']

    return mi
def parse_details(self, root):
    """Extract all available metadata from a parsed Kyobobook product page.

    ``root`` is the parsed HTML tree of the page at ``self.url``.  Each
    field is parsed inside its own try/except so that one broken section
    of the page cannot prevent the remaining fields from being extracted.
    On success a populated Metadata object is put on
    ``self.result_queue``; if title, authors or the kyobobook id cannot
    be found the method returns early and queues nothing.
    """
    try:
        # The kyobobook id is embedded in the URL, not the page body.
        kyobobook_id = self.parse_kyobobook_id(self.url)
    except:
        self.log.exception('Error parsing Kyobobook id for url: %r'%self.url)
        kyobobook_id = None

    try:
        (title, series, series_index) = self.parse_title_series(root)
    except:
        self.log.exception('Error parsing title and series for url: %r'%self.url)
        title = series = series_index = None

    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r'%self.url)
        authors = []

    if not title or not authors or not kyobobook_id:
        # Without all three essentials the record is useless; bail out.
        self.log.error('Could not find title/authors/kyobobook id for %r'%self.url)
        self.log.error('Kyobobook: %r Title: %r Authors: %r'%(kyobobook_id,
            title, authors))
        return

    mi = Metadata(title, authors)
    if series:
        mi.series = series
        mi.series_index = series_index
    mi.set_identifier('kyobobook', kyobobook_id)
    self.kyobobook_id = kyobobook_id

    try:
        isbn = self.parse_isbn(root)
        if isbn:
            # Remember the ISBN on self as well, for the cache priming below.
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception('Error parsing ISBN for url: %r'%self.url)

    try:
        mi.rating = self.parse_rating(root)
    except:
        self.log.exception('Error parsing ratings for url: %r'%self.url)

    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r'%self.url)

    try:
        self.cover_url = self.parse_cover(root)
    except:
        self.log.exception('Error parsing cover for url: %r'%self.url)
    mi.has_cover = bool(self.cover_url)

    try:
        tags = self.parse_tags(root)
        if tags:
            mi.tags = tags
    except:
        self.log.exception('Error parsing tags for url: %r'%self.url)

    try:
        mi.publisher, mi.pubdate = self.parse_publisher_and_date(root)
    except:
        self.log.exception('Error parsing publisher and date for url: %r'%self.url)

    try:
        lang = self._parse_language(root)
        if lang:
            mi.language = lang
    except:
        self.log.exception('Error parsing language for url: %r'%self.url)

    mi.source_relevance = self.relevance

    # Prime the plugin caches so later ISBN/cover lookups can be answered
    # without re-downloading the page.
    if self.kyobobook_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.kyobobook_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.kyobobook_id,
                    self.cover_url)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)