def field_from_string(field, raw, field_metadata):
    '''
    Convert the string ``raw`` into a value that can be passed to set() on a
    Metadata object for ``field``.

    The conversion is selected by ``field_metadata['datatype']``. Datatypes
    that are not handled here, and text fields that are not multi-valued,
    return ``raw`` unchanged. An unrecognised boolean string raises
    ValueError.
    '''
    datatype = field_metadata['datatype']
    if datatype == 'int':
        return int(raw)
    if datatype == 'float':
        return float(raw)
    if datatype == 'rating':
        # The value given in the string is doubled before being returned
        return float(raw) * 2
    if datatype == 'datetime':
        from calibre.utils.date import parse_only_date
        return parse_only_date(raw)
    if datatype == 'bool':
        lowered = raw.lower()
        if lowered in {'true', 'yes', 'y'}:
            return True
        if lowered in {'false', 'no', 'n'}:
            return False
        raise ValueError('Unknown value for %s: %s'%(field, raw))
    if datatype == 'text':
        multiple = field_metadata['is_multiple']
        if multiple:
            items = [part.strip() for part in raw.split(multiple['ui_to_list'])]
            if field == 'identifiers':
                # "scheme:value" -> {scheme: value}, split on the first colon
                return {item.partition(':')[0]:item.partition(':')[-1] for item in items}
            if field == 'languages':
                from calibre.utils.localization import canonicalize_lang
                codes = [canonicalize_lang(item) for item in items]
                return [code for code in codes if code]
            return items
    return raw
def convert_markdown_with_metadata(txt, title='', extensions=DEFAULT_MD_EXTENSIONS):
    '''
    Convert the markdown text ``txt`` to HTML and collect the metadata found
    by the markdown 'meta' extension.

    :param txt: The markdown source.
    :param title: Title used for the Metadata object when the document does
        not supply one. Falls back to a translated 'Unknown'.
    :param extensions: Markdown extension names. 'meta' is added if missing;
        the passed in sequence itself is never modified.
    :return: A tuple ``(mi, html)`` of the Metadata object and the rendered
        HTML wrapped in HTML_TEMPLATE.
    '''
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.utils.date import parse_only_date
    from calibre.db.write import get_series_values
    # Work on a private copy: appending to the argument would permanently
    # alter the shared default (or the caller's list), and would fail
    # outright if a tuple was passed.
    extensions = list(extensions)
    if 'meta' not in extensions:
        extensions.append('meta')
    md = create_markdown_object(extensions)
    html = md.convert(txt)
    mi = Metadata(title or _('Unknown'))
    m = md.Meta
    # Accept 'date' and 'summary' as aliases, unless the real key is present
    for k, v in iteritems({'date':'pubdate', 'summary':'comments'}):
        if v not in m and k in m:
            m[v] = m.pop(k)
    for k in 'title authors series tags pubdate comments publisher rating'.split():
        val = m.get(k)
        if val:
            mf = mi.metadata_for_field(k)
            if not mf.get('is_multiple'):
                # Single valued field: only the first entry is used
                val = val[0]
            if k == 'series':
                val, si = get_series_values(val)
                mi.series_index = 1 if si is None else si
            if k == 'rating':
                try:
                    # Clamp to the 0-10 range
                    val = max(0, min(int(float(val)), 10))
                except Exception:
                    continue
            if mf.get('datatype') == 'datetime':
                try:
                    val = parse_only_date(val, assume_utc=False)
                except Exception:
                    continue
            setattr(mi, k, val)
    return mi, HTML_TEMPLATE % (mi.title, html)
def convert_markdown_with_metadata(txt, title='', extensions=DEFAULT_MD_EXTENSIONS):
    '''
    Convert the markdown text ``txt`` to HTML and collect the metadata found
    by the markdown 'meta' extension.

    :param txt: The markdown source.
    :param title: Title used for the Metadata object when the document does
        not supply one. Falls back to a translated 'Unknown'.
    :param extensions: Markdown extension names. 'meta' is added if missing;
        the passed in sequence itself is never modified.
    :return: A tuple ``(mi, html)`` of the Metadata object and the rendered
        HTML wrapped in HTML_TEMPLATE.
    '''
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.utils.date import parse_only_date
    from calibre.db.write import get_series_values
    # Copy before appending so that neither the shared default nor a list
    # owned by the caller is modified (a tuple argument also works this way).
    extensions = list(extensions)
    if 'meta' not in extensions:
        extensions.append('meta')
    md = create_markdown_object(extensions)
    html = md.convert(txt)
    mi = Metadata(title or _('Unknown'))
    m = md.Meta
    # Accept 'date' and 'summary' as aliases, unless the real key is present.
    # .items() is used as it exists on both python 2 and python 3 dicts.
    for k, v in {'date':'pubdate', 'summary':'comments'}.items():
        if v not in m and k in m:
            m[v] = m.pop(k)
    for k in 'title authors series tags pubdate comments publisher rating'.split():
        val = m.get(k)
        if not val:
            continue
        mf = mi.metadata_for_field(k)
        if not mf.get('is_multiple'):
            # Single valued field: only the first entry is used
            val = val[0]
        if k == 'series':
            val, si = get_series_values(val)
            mi.series_index = 1 if si is None else si
        if k == 'rating':
            try:
                # Clamp to the 0-10 range
                val = max(0, min(int(float(val)), 10))
            except Exception:
                continue
        if mf.get('datatype') == 'datetime':
            try:
                val = parse_only_date(val, assume_utc=False)
            except Exception:
                continue
        setattr(mi, k, val)
    return mi, HTML_TEMPLATE % (mi.title, html)
def parse_pubdate(self, pd):
    '''
    Return the publication date taken from the text that follows the last
    node matched by self.publisher_xpath which has a non-empty tail.

    The date is the text after the final '(' with any ')' removed. Returns
    None when no matched node has a tail.
    '''
    candidates = pd.xpath(self.publisher_xpath)
    for node in reversed(candidates):
        if not node.tail:
            continue
        text = node.tail.rpartition('(')[-1]
        text = text.replace(')', '').strip()
        return parse_only_date(self.delocalize_datestr(text), assume_utc=True)
def parse_new_details(self, root, mi, non_hero):
    '''
    Read language, publisher, publication date and ISBN from the first
    table found under ``non_hero`` and store them on ``mi``.

    Only rows with exactly two cells (name, value) and a non-empty value
    are considered. A publication date that cannot be parsed is logged and
    otherwise ignored. ``root`` is not used here.
    '''
    table = non_hero.xpath('descendant::table')[0]
    for tr in table.xpath('descendant::tr'):
        cells = tr.xpath('descendant::td')
        if len(cells) != 2:
            continue
        name = self.totext(cells[0])
        val = self.totext(cells[1])
        if not val:
            continue
        if name in self.language_names:
            ans = self.lang_map.get(val, None)
            if not ans:
                ans = canonicalize_lang(val)
            if ans:
                mi.language = ans
        elif name in self.publisher_names:
            # The publisher is the text before any ';' or '('
            pub = val.partition(';')[0].partition('(')[0].strip()
            if pub:
                mi.publisher = pub
            # The date is the text inside the last pair of parentheses
            date = val.rpartition('(')[-1].replace(')', '').strip()
            try:
                from calibre.utils.date import parse_only_date
                date = self.delocalize_datestr(date)
                mi.pubdate = parse_only_date(date, assume_utc=True)
            except Exception:
                # Best effort only, but do not swallow KeyboardInterrupt
                # or SystemExit as a bare except would
                self.log.exception('Failed to parse pubdate: %s' % val)
        elif name in {'ISBN', 'ISBN-10', 'ISBN-13'}:
            ans = check_isbn(val)
            if ans:
                self.isbn = mi.isbn = ans
def field_from_string(field, raw, field_metadata):
    '''
    Parse the string ``raw`` and return an object suitable for calling set()
    on a Metadata object for the field named ``field``.

    ``field_metadata['datatype']`` decides how the string is interpreted;
    when no conversion applies the string itself is returned. Raises
    ValueError if a bool field has an unrecognised value.
    '''
    kind = field_metadata['datatype']
    result = raw  # returned as is when no conversion below applies
    if kind == 'int':
        result = int(raw)
    elif kind == 'float':
        result = float(raw)
    elif kind == 'rating':
        result = 2 * float(raw)
    elif kind == 'datetime':
        from calibre.utils.date import parse_only_date
        result = parse_only_date(raw)
    elif kind == 'bool':
        word = raw.lower()
        if word in {'true', 'yes', 'y'}:
            result = True
        elif word in {'false', 'no', 'n'}:
            result = False
        else:
            raise ValueError('Unknown value for %s: %s' % (field, raw))
    elif kind == 'text':
        ism = field_metadata['is_multiple']
        if ism:
            separator = ism['ui_to_list']
            result = [chunk.strip() for chunk in raw.split(separator)]
            if field == 'identifiers':
                pairs = {}
                for chunk in result:
                    scheme, _sep, value = chunk.partition(':')
                    pairs[scheme] = value
                result = pairs
            elif field == 'languages':
                from calibre.utils.localization import canonicalize_lang
                canonical = [canonicalize_lang(chunk) for chunk in result]
                result = [lang for lang in canonical if lang]
    return result
def read_metadata_kfx(stream, read_cover=True):
    '''
    Read the metadata.kfx file that is found in the sdr book folder for KFX
    files and return its contents as a Metadata object.

    If read_cover is True and cover data is present, the decoded cover is
    stored in mi.cover_data when it can be identified as an image.
    '''
    container = Container(stream.read())
    m = extract_metadata(container.decode())
    # dump_metadata(m)

    def has(key):
        values = m[key]
        return values and values[0]

    def get(key, single=True):
        values = m[key]
        if not single:
            return [clean_xml_chars(v) for v in values]
        if values:
            return clean_xml_chars(values[0])
        return ''

    auth_pat = re.compile(r'([^,]+?)\s*,\s+([^,]+)$')

    def fix_author(name):
        # Turn "A, B" into "B A", unless the tweak is set to 'copy'
        if tweaks['author_sort_copy_method'] == 'copy':
            return name
        found = auth_pat.match(name.strip())
        if found is None:
            return name
        return found.group(2) + ' ' + found.group(1)

    title = get('title') or _('Unknown')
    authors = get('authors', False) or [_('Unknown')]
    mi = Metadata(title, [fix_author(a) for a in authors])

    if has('author'):
        mi.author_sort = get('author')

    # ASIN is preferred, content_id is only consulted when there is no ASIN
    for id_key in ('ASIN', 'content_id'):
        if has(id_key):
            mi.set_identifier('mobi-asin', get(id_key))
            break

    if has('languages'):
        langs = [lc for lc in map(canonicalize_lang, get('languages', False)) if lc]
        if langs:
            mi.languages = langs

    if has('issue_date'):
        try:
            mi.pubdate = parse_only_date(get('issue_date'))
        except Exception:
            pass

    if has('publisher') and get('publisher') != 'Unknown':
        mi.publisher = get('publisher')

    if not (read_cover and m[COVER_KEY]):
        return mi
    try:
        data = base64.standard_b64decode(m[COVER_KEY])
        fmt, w, h = identify(bytes(data))
    except Exception:
        w, h, fmt = 0, 0, None
    if fmt and w > -1 and h > -1:
        mi.cover_data = (fmt, data)
    return mi
def toPubdate(log, yearAsString):  # {{{
    '''
    Convert a year given as a string into a date, by parsing the first of
    January of that year with parse_only_date.

    :param log: Object with an error() method, used to report bad input.
    :param yearAsString: The year, or an empty value.
    :return: The parsed date, or None if yearAsString is empty or could not
        be parsed.
    '''
    if not yearAsString:
        return None
    try:
        return parse_only_date(u"01.01." + yearAsString)
    except Exception:
        # Exception rather than a bare except, so that KeyboardInterrupt
        # and SystemExit are not swallowed
        log.error('cannot parse to date %s' % yearAsString)
    return None
def toPubdate(log, yearAsString):  # {{{
    '''
    Return the date obtained by parsing "01.01.<year>" with
    parse_only_date, or None.

    None is returned when yearAsString is empty, or when parsing fails, in
    which case the failure is reported via log.error().
    '''
    res = None
    if yearAsString:
        try:
            res = parse_only_date(u"01.01." + yearAsString)
        except Exception:
            # Narrowed from a bare except: interpreter exit requests and
            # Ctrl-C must still propagate
            log.error('cannot parse to date %s'%yearAsString)
    return res
def toPubdate(log, yearAsString):  # {{{
    '''
    Convert a year given as a string into a date, by parsing the first of
    January of that year with parse_only_date.

    :param log: Object with an error() method, used to report bad input.
    :param yearAsString: The year, or an empty value.
    :return: The parsed date, or None if yearAsString is empty or could not
        be parsed.
    '''
    if not yearAsString:
        return None
    # Imported only once there is something to parse
    from calibre.utils.date import parse_only_date
    try:
        return parse_only_date(u"01.01." + yearAsString)
    except Exception:
        # Exception rather than a bare except, so that KeyboardInterrupt
        # and SystemExit are not swallowed
        log.error('cannot parse to date %s' % yearAsString)
    return None
def parse_pubdate(self, pd):
    '''
    Extract the publication date from the tail text of the nodes matched by
    self.publisher_xpath.

    The nodes are examined last to first and the first non-empty tail is
    used: the text after its final "(" is taken, ")" is removed, and the
    result is parsed as a date. Returns None if no node has a tail.
    '''
    tails = (el.tail for el in reversed(pd.xpath(self.publisher_xpath)))
    raw = next((tail for tail in tails if tail), None)
    if raw is None:
        return None
    from calibre.utils.date import parse_only_date
    inside = raw.rpartition("(")[-1]
    cleaned = inside.replace(")", "").strip()
    return parse_only_date(self.delocalize_datestr(cleaned), assume_utc=True)
def read_metadata_kfx(stream, read_cover=True):
    '''
    Read the metadata.kfx file that is found in the sdr book folder for KFX
    files.

    Returns a Metadata object. The cover is only looked at when read_cover
    is True, and only kept when identify_data() reports a format and a
    non-zero width and height for it.
    '''
    m = extract_metadata(Container(stream.read()).decode())
    # dump_metadata(m)

    def has(field):
        return m[field] and m[field][0]

    def get(field, single=True):
        raw_values = m[field]
        if single:
            return clean_xml_chars(raw_values[0]) if raw_values else ''
        return [clean_xml_chars(item) for item in raw_values]

    book_title = get('title') or _('Unknown')
    book_authors = get('authors', False) or [_('Unknown')]
    auth_pat = re.compile(r'([^,]+?)\s*,\s+([^,]+)$')

    def fix_author(author):
        # "A, B" is rewritten as "B A" unless the tweak is set to 'copy'
        if tweaks['author_sort_copy_method'] != 'copy':
            hit = auth_pat.match(author.strip())
            if hit is not None:
                return '%s' % hit.group(2) + ' ' + hit.group(1)
        return author

    mi = Metadata(book_title, list(map(fix_author, book_authors)))

    if has('author'):
        mi.author_sort = get('author')

    if has('ASIN'):
        asin = get('ASIN')
        mi.set_identifier('mobi-asin', asin)
    elif has('content_id'):
        content_id = get('content_id')
        mi.set_identifier('mobi-asin', content_id)

    if has('languages'):
        langs = []
        for code in get('languages', False):
            canonical = canonicalize_lang(code)
            if canonical:
                langs.append(canonical)
        if langs:
            mi.languages = langs

    if has('issue_date'):
        try:
            mi.pubdate = parse_only_date(get('issue_date'))
        except Exception:
            pass

    if has('publisher') and get('publisher') != 'Unknown':
        mi.publisher = get('publisher')

    if read_cover and m[COVER_KEY]:
        try:
            data = base64.standard_b64decode(m[COVER_KEY])
            w, h, fmt = identify_data(data)
        except Exception:
            w, h, fmt = 0, 0, None
        cover_ok = fmt and w and h
        if cover_ok:
            mi.cover_data = (fmt, data)
    return mi
def read_metadata_kfx(stream, read_cover=True):
    """
    Read the metadata.kfx file that is found in the sdr book folder for KFX
    files and build a Metadata object from it.

    The cover is decoded only if read_cover is True and cover data exists.
    """
    decoded = Container(stream.read()).decode()
    m = extract_metadata(decoded)
    # dump_metadata(m)

    def has(name):
        entries = m[name]
        return entries and entries[0]

    def get(name, single=True):
        entries = m[name]
        if single:
            if entries:
                return clean_xml_chars(entries[0])
            return ""
        return [clean_xml_chars(entry) for entry in entries]

    title = get("title")
    if not title:
        title = _("Unknown")
    authors = get("authors", False)
    if not authors:
        authors = [_("Unknown")]
    auth_pat = re.compile(r"([^,]+?)\s*,\s+([^,]+)$")

    def fix_author(raw_author):
        # Leave the name alone when the tweak is 'copy' or it is not "A, B"
        if tweaks["author_sort_copy_method"] == "copy":
            return raw_author
        parsed = auth_pat.match(raw_author.strip())
        if parsed is None:
            return raw_author
        last, first = parsed.group(1), parsed.group(2)
        return first + " " + last

    mi = Metadata(title, [fix_author(x) for x in authors])

    if has("author"):
        mi.author_sort = get("author")

    if has("ASIN"):
        mi.set_identifier("mobi-asin", get("ASIN"))
    elif has("content_id"):
        mi.set_identifier("mobi-asin", get("content_id"))

    if has("languages"):
        canonical = (canonicalize_lang(x) for x in get("languages", False))
        langs = [lang for lang in canonical if lang]
        if langs:
            mi.languages = langs

    if has("issue_date"):
        try:
            mi.pubdate = parse_only_date(get("issue_date"))
        except Exception:
            pass

    if has("publisher"):
        if get("publisher") != "Unknown":
            mi.publisher = get("publisher")

    if read_cover and m[COVER_KEY]:
        try:
            data = base64.standard_b64decode(m[COVER_KEY])
            fmt, w, h = identify(bytes(data))
        except Exception:
            w, h, fmt = 0, 0, None
        if fmt and w > -1 and h > -1:
            mi.cover_data = (fmt, data)
    return mi
def get_comic_book_info(d, mi, series_index='volume'):
    '''
    Copy the fields of a ComicBookInfo dictionary ``d`` onto the metadata
    object ``mi``.

    :param d: Dictionary with the comic book info entries.
    :param mi: Object whose attributes are set; fields missing from ``d``
        are left untouched.
    :param series_index: Which key ('volume' or 'issue') to prefer as the
        series index. The other one is used as a fallback.
    '''
    # See http://code.google.com/p/comicbookinfo/wiki/Example
    series = d.get('series', '')
    if series.strip():
        mi.series = series
        si = d.get(series_index, None)
        if si is None:
            si = d.get('issue' if series_index == 'volume' else 'volume', None)
        if si is not None:
            try:
                mi.series_index = float(si)
            except Exception:
                mi.series_index = 1
    # The value must be read from the same key that is tested: reading
    # 'lang' here meant the language was never picked up
    language = d.get('language', None)
    if language:
        lang = canonicalize_lang(language)
        if lang:
            mi.languages = [lang]
    if d.get('rating', -1) > -1:
        mi.rating = d['rating']
    for x in ('title', 'publisher'):
        y = d.get(x, '').strip()
        if y:
            setattr(mi, x, y)
    tags = d.get('tags', [])
    if tags:
        mi.tags = tags
    authors = []
    for credit in d.get('credits', []):
        if credit.get('role', '') in ('Writer', 'Artist', 'Cartoonist', 'Creator'):
            x = credit.get('person', '')
            if x:
                # "Last, First" -> "First Last"
                x = ' '.join(reversed(x.split(', ')))
                authors.append(x)
    if authors:
        mi.authors = authors
    comments = d.get('comments', '')
    if comments and comments.strip():
        mi.comments = comments.strip()
    pubm, puby = d.get('publicationMonth', None), d.get('publicationYear', None)
    if puby is not None:
        from calibre.utils.date import parse_only_date
        from datetime import date
        try:
            # No day is available, the 15th is used (June if no month either)
            dt = date(puby, 6 if pubm is None else pubm, 15)
            dt = parse_only_date(str(dt))
            mi.pubdate = dt
        except Exception:
            pass
def parse_response(cls, response, isbn_initial, log):
    '''
    Build a list of Metadata objects, one per 'b-result' entry found in the
    HTML of ``response``.

    :param response: Object whose ``text`` attribute holds the HTML page.
    :param isbn_initial: ISBN used for results that do not carry their own.
    :param log: Log object, every candidate found is reported via info().
    :return: List of Metadata objects, possibly empty.
    '''
    metadata_items = []
    page_soup = BeautifulSoup(response.text)
    for idx, candidate in enumerate(cls.find(page_soup, 'b-result'), 1):
        title = cls.find(candidate, 'b-result__name-wrap', True)
        author = map(
            unicode.strip,
            cls.find(candidate, 'b-result__author', True).split(','))
        comments = cls.find(candidate, 'b-result__desc__full', True).replace(u'Скрыть', '').strip()
        # Text after the last ':' and before the first ','
        isbn = cls.find(candidate, 'b-result__isbn', True).split(':')[-1].split(',')[0].strip()
        log.info(u'Found candidate %s: %s' % (idx, title))
        publisher = None
        pubdate = None
        other_info = cls.find(candidate, 'b-result__years', True).strip()
        if other_info:
            # "key: value" pairs separated by ';'
            for entry in other_info.split(';'):
                k, sep, v = entry.partition(':')
                if not sep:
                    # No colon (for example the empty piece after a trailing
                    # ';'): skip instead of failing the whole page
                    continue
                k = k.strip()
                if k == u'Год':
                    pubdate = parse_only_date('1.1.%s' % v.split(',')[0].strip())
                elif k == u'Издательство':
                    publisher = v.strip()
        metadata_item = Metadata(title, author)
        metadata_item.isbn = isbn or isbn_initial
        if comments:
            metadata_item.comments = comments
        if publisher is not None:
            metadata_item.publisher = publisher
        if pubdate is not None:
            metadata_item.pubdate = pubdate
        metadata_items.append(metadata_item)
    return metadata_items
def get_comic_book_info(d, mi, series_index='volume'):
    '''
    Copy the fields of a ComicBookInfo dictionary ``d`` onto the metadata
    object ``mi``.

    :param d: Dictionary with the comic book info entries.
    :param mi: Object whose attributes are set; fields missing from ``d``
        are left untouched.
    :param series_index: Which key ('volume' or 'issue') to prefer as the
        series index. The other one is used as a fallback.
    '''
    # See http://code.google.com/p/comicbookinfo/wiki/Example
    series = d.get('series', '')
    if series.strip():
        mi.series = series
        si = d.get(series_index, None)
        if si is None:
            si = d.get('issue' if series_index == 'volume' else 'volume', None)
        if si is not None:
            try:
                mi.series_index = float(si)
            except Exception:
                mi.series_index = 1
    if d.get('rating', -1) > -1:
        mi.rating = d['rating']
    for x in ('title', 'publisher'):
        y = d.get(x, '').strip()
        if y:
            setattr(mi, x, y)
    tags = d.get('tags', [])
    if tags:
        mi.tags = tags
    authors = []
    for credit in d.get('credits', []):
        if credit.get('role', '') in ('Writer', 'Artist', 'Cartoonist', 'Creator'):
            x = credit.get('person', '')
            if x:
                # "Last, First" -> "First Last"
                x = ' '.join(reversed(x.split(', ')))
                authors.append(x)
    if authors:
        mi.authors = authors
    comments = d.get('comments', '')
    if comments and comments.strip():
        mi.comments = comments.strip()
    pubm, puby = d.get('publicationMonth', None), d.get('publicationYear', None)
    if puby is not None:
        from calibre.utils.date import parse_only_date
        from datetime import date
        try:
            # No day is available, the 15th is used (June if no month either)
            dt = date(puby, 6 if pubm is None else pubm, 15)
            dt = parse_only_date(str(dt))
            mi.pubdate = dt
        except Exception:
            # A bad date is ignored, but unlike a bare except this lets
            # KeyboardInterrupt and SystemExit through
            pass
def parse_response(cls, response, isbn_initial, log):
    '''
    Parse the search results page in ``response.text`` and return a list
    with one Metadata object for every 'b-result' entry on it.

    ``isbn_initial`` is used as the ISBN of results that have none of their
    own. Each candidate is reported through ``log.info()``.
    '''
    metadata_items = []
    page_soup = BeautifulSoup(response.text)
    for idx, candidate in enumerate(cls.find(page_soup, 'b-result'), 1):
        title = cls.find(candidate, 'b-result__name-wrap', True)
        author = map(unicode.strip, cls.find(candidate, 'b-result__author', True).split(','))
        comments = cls.find(candidate, 'b-result__desc__full', True).replace(u'Скрыть', '').strip()
        # Text after the last ':' and before the first ','
        isbn = cls.find(candidate, 'b-result__isbn', True).split(':')[-1].split(',')[0].strip()
        log.info(u'Found candidate %s: %s' % (idx, title))

        publisher = None
        pubdate = None
        other_info = cls.find(candidate, 'b-result__years', True).strip()
        # "key: value" pairs separated by ';'. Pieces without a ':' (such as
        # the empty one after a trailing ';') are skipped, unpacking them
        # used to raise ValueError and abort the whole page.
        pairs = (entry.partition(':') for entry in other_info.split(';'))
        for k, sep, v in pairs:
            if not sep:
                continue
            k = k.strip()
            if k == u'Год':
                pubdate = parse_only_date('1.1.%s' % v.split(',')[0].strip())
            elif k == u'Издательство':
                publisher = v.strip()

        metadata_item = Metadata(title, author)
        metadata_item.isbn = isbn or isbn_initial
        if comments:
            metadata_item.comments = comments
        if publisher is not None:
            metadata_item.publisher = publisher
        if pubdate is not None:
            metadata_item.pubdate = pubdate
        metadata_items.append(metadata_item)
    return metadata_items
def parse_pubdate(self, pd):
    '''
    Find the publication date in ``pd`` and return it parsed as a date.

    The first match of self.publish_date_xpath is used. If there is none,
    the second div of the fallback xpath is tried instead. Returns None
    when no date text is found.
    '''
    found = pd.xpath(self.publish_date_xpath)
    text = None
    if found:
        text = self.totext(found[0])
    else:
        found = pd.xpath(
            '//div[@class="show_info_left" and contains(text(), "出版时间")]/../div'
        )
        if len(found) > 1:
            text = self.totext(found[1])
    if not text:
        return None

    from calibre.utils.date import parse_only_date
    # Normalise to a '/' separated date. The order of the replacements is
    # the same as it has always been.
    for old, new in (('年', '/'), ('月', '/'), ('日', ''), ('-', '/'),
                     ('出版时间', ''), (':', '')):
        text = text.replace(old, new)
    text = text.strip()
    if text.endswith('/'):
        # Nothing follows the last separator, use 1
        text = "%s1" % text
    return parse_only_date(self.delocalize_datestr(text), assume_utc=True)
def get_basic_data(browser, log, *skus):
    '''
    Fetch the list view for the given SKUs from edelweiss.plus and yield
    one dictionary of basic data per item found in it.

    :param browser: Browser object, its open_novisit() is used for the request.
    :param log: Log object, used to report dates that cannot be parsed.
    :param skus: The SKUs to query.
    :return: Generator of dicts with the keys sku, cover, publisher, title,
        authors, isbns, tags, pubdate, format and rating.
    '''
    from calibre.utils.date import parse_only_date
    from mechanize import Request
    zeroes = ','.join('0' for sku in skus)
    data = {
        'skus': ','.join(skus),
        'drc': zeroes,
        'startPosition': '0',
        'sequence': '1',
        'selected': zeroes,
        'itemID': '0',
        'orderID': '0',
        'mailingID': '',
        'tContentWidth': '926',
        'originalOrder': ','.join(str(i) for i in range(len(skus))),
        'selectedOrderID': '0',
        'selectedSortColumn': '0',
        'listType': '1',
        'resultType': '32',
        'blockView': '1',
    }
    items_data_url = 'https://www.edelweiss.plus/GetTreelineControl.aspx?controlName=/uc/listviews/ListView_Title_Multi.ascx'
    req = Request(items_data_url, data)
    response = browser.open_novisit(req)
    raw = response.read()
    root = parse_html(raw)
    # Raw string: in a normal string literal \d is an invalid escape
    # sequence. Compiled once here instead of once per style attribute.
    width_pat = re.compile(r'width: (\d+)px;.*max-width: (\d+)px')
    for item in root.xpath('//div[@data-priority]'):
        row = item.getparent().getparent()
        sku = item.get('id').split('-')[-1]
        isbns = [
            x.strip() for x in row.xpath(
                'descendant::*[contains(@class, "pev_sku")]/text()')[0].split(',')
            if check_isbn(x.strip())
        ]
        # Longest first
        isbns.sort(key=len, reverse=True)
        try:
            tags = [
                x.strip() for x in astext(
                    row.xpath('descendant::*[contains(@class, "pev_categories")]')[0]
                ).split('/')
            ]
        except IndexError:
            tags = []
        rating = 0
        for bar in row.xpath(
                'descendant::*[contains(@class, "bgdColorCommunity")]/@style'):
            m = width_pat.search(bar)
            if m is not None:
                # Width of the bar as a fraction of its maximum width
                rating = float(m.group(1)) / float(m.group(2))
                break
        try:
            pubdate = parse_only_date(astext(
                row.xpath('descendant::*[contains(@class, "pev_shipDate")]')[0]
            ).split(':')[-1].split(u'\xa0')[-1].strip(), assume_utc=True)
        except Exception:
            log.exception('Error parsing published date')
            pubdate = None
        authors = []
        for x in [
                x.strip() for x in row.xpath(
                    'descendant::*[contains(@class, "pev_contributor")]/@title')
        ]:
            authors.extend(a.strip() for a in x.split(','))
        entry = {
            'sku': sku,
            'cover': row.xpath('descendant::img/@src')[0].split('?')[0],
            'publisher': astext(
                row.xpath('descendant::*[contains(@class, "headerPublisher")]')[0]),
            'title': astext(row.xpath('descendant::*[@id="title_{}"]'.format(sku))[0]),
            'authors': authors,
            'isbns': isbns,
            'tags': tags,
            'pubdate': pubdate,
            'format': ' '.join(
                row.xpath(
                    'descendant::*[contains(@class, "pev_format")]/text()')).strip(),
            'rating': rating,
        }
        # Covers whose URL starts with '/' are reported as missing
        if entry['cover'].startswith('/'):
            entry['cover'] = None
        yield entry
def adapt_date(x):
    '''
    Return ``x`` as a date value: strings are parsed with parse_only_date,
    and None or an undefined date becomes UNDEFINED_DATE.
    '''
    value = parse_only_date(x) if isinstance(x, (unicode, bytes)) else x
    if value is None or is_date_undefined(value):
        return UNDEFINED_DATE
    return value
def parse(self, raw):
    """
    Parse the HTML of a product page and return a Metadata object with the
    title, authors, identifiers, tags, publisher, publication date and
    comments found in it.

    The cover URL, if any, is cached in self.plugin rather than stored in
    the returned object.
    """
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.utils.date import parse_only_date, UNDEFINED_DATE
    root = parse_html(raw)
    sku = CSSSelect("div.sku.attGroup")(root)[0]
    info = sku.getparent()
    top = info.getparent().getparent()
    banner = top.find("div")
    spans = banner.findall("span")
    # The title is the first span plus any directly following "12pt" spans
    title = ""
    for i, span in enumerate(spans):
        if i == 0 or "12pt" in span.get("style", ""):
            title += astext(span)
        else:
            break
    # The last span lists the authors, parenthesised notes are dropped
    authors = [re.sub(r"\(.*\)", "", x).strip() for x in astext(spans[-1]).split(",")]
    mi = Metadata(title.strip(), authors)

    # Identifiers
    isbns = [check_isbn(x.strip()) for x in astext(sku).split(",")]
    for isbn in isbns:
        if isbn:
            self.plugin.cache_isbn_to_identifier(isbn, self.sku)
    isbns = sorted(isbns, key=lambda x: len(x) if x else 0, reverse=True)
    if isbns and isbns[0]:
        mi.isbn = isbns[0]
    mi.set_identifier("edelweiss", self.sku)

    # Tags
    bisac = CSSSelect("div.bisac.attGroup")(root)
    if bisac:
        bisac = astext(bisac[0])
        mi.tags = [x.strip() for x in bisac.split(",")]
        mi.tags = [t[1:].strip() if t.startswith("&") else t for t in mi.tags]

    # Publisher
    pub = CSSSelect("div.supplier.attGroup")(root)
    if pub:
        pub = astext(pub[0])
        mi.publisher = pub

    # Pubdate
    pub = CSSSelect("div.shipDate.attGroupItem")(root)
    if pub:
        pub = astext(pub[0])
        parts = pub.partition(":")[0::2]
        pub = parts[1] or parts[0]
        try:
            if ", Ship Date:" in pub:
                pub = pub.partition(", Ship Date:")[0]
            q = parse_only_date(pub, assume_utc=True)
            # Compare year with year: comparing q.year with the date object
            # itself was always unequal, so the check never rejected anything
            if q.year != UNDEFINED_DATE.year:
                mi.pubdate = q
        except Exception:
            self.log.exception("Error parsing published date: %r" % pub)

    # Comments
    comm = ""
    general = CSSSelect("div#pd-general-overview-content")(root)
    if general:
        q = self.render_comments(general[0])
        if q != "<p>No title summary available. </p>":
            comm += q
    general = CSSSelect("div#pd-general-contributor-content")(root)
    if general:
        comm += self.render_comments(general[0])
    general = CSSSelect("div#pd-general-quotes-content")(root)
    if general:
        comm += self.render_comments(general[0])
    if comm:
        mi.comments = comm

    # Cover
    img = CSSSelect("img.title-image[src]")(root)
    if img:
        href = img[0].get("src").replace("jacket_covers/medium/", "jacket_covers/flyout/")
        self.plugin.cache_identifier_to_cover_url(self.sku, href)
    mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None
    return mi
def metadata_from_filename(name, pat=None, fallback_pat=None):
    """
    Extract metadata from a file name using a regular expression with named
    groups (title, author, series, series_index, isbn, publisher, published).

    :param name: The file name, the text after the last "." is dropped.
    :param pat: Compiled pattern to use. Defaults to the pattern stored in
        the "filename_pattern" preference.
    :param fallback_pat: Compiled pattern tried when ``pat`` does not match.
    :return: A MetaInformation object. If no title was found, the file name
        itself is used as the title.
    """
    if isbytestring(name):
        name = name.decode(filesystem_encoding, "replace")
    name = name.rpartition(".")[0]
    mi = MetaInformation(None, None)
    if pat is None:
        pat = re.compile(prefs.get("filename_pattern"))
    name = name.replace("_", " ")
    match = pat.search(name)
    if match is None and fallback_pat is not None:
        match = fallback_pat.search(name)
    if match is not None:
        # match.group() raises IndexError for groups the pattern lacks
        try:
            mi.title = match.group("title")
        except IndexError:
            pass
        try:
            au = match.group("author")
            aus = string_to_authors(au)
            if aus:
                mi.authors = aus
                if prefs["swap_author_names"] and mi.authors:
                    def swap(a):
                        # Move the part after the first comma (or the first
                        # run of whitespace) to the front
                        if "," in a:
                            parts = a.split(",", 1)
                        else:
                            parts = a.split(None, 1)
                        if len(parts) > 1:
                            t = parts[-1]
                            parts = parts[:-1]
                            parts.insert(0, t)
                        return " ".join(parts)
                    mi.authors = [swap(x) for x in mi.authors]
        except (IndexError, ValueError):
            pass
        try:
            mi.series = match.group("series")
        except IndexError:
            pass
        try:
            si = match.group("series_index")
            mi.series_index = float(si)
        except (IndexError, ValueError, TypeError):
            pass
        try:
            si = match.group("isbn")
            mi.isbn = si
        except (IndexError, ValueError):
            pass
        try:
            publisher = match.group("publisher")
            mi.publisher = publisher
        except (IndexError, ValueError):
            pass
        try:
            pubdate = match.group("published")
            if pubdate:
                from calibre.utils.date import parse_only_date
                mi.pubdate = parse_only_date(pubdate)
        except Exception:
            # An unparseable date is ignored. Exception instead of a bare
            # except, so KeyboardInterrupt and SystemExit still propagate.
            pass
    if mi.is_null("title"):
        mi.title = name
    return mi
def metadata_from_filename(name, pat=None, fallback_pat=None):
    '''
    Extract metadata from a file name using a regular expression with named
    groups (title, author, series, series_index, isbn, publisher, published,
    comments).

    :param name: The file name, the text after the last '.' is dropped.
    :param pat: Compiled pattern to use. Defaults to the pattern stored in
        the 'filename_pattern' preference.
    :param fallback_pat: Compiled pattern tried when ``pat`` does not match.
    :return: A MetaInformation object. If no title was found, the file name
        itself is used as the title.
    '''
    if isbytestring(name):
        name = name.decode(filesystem_encoding, 'replace')
    name = name.rpartition('.')[0]
    mi = MetaInformation(None, None)
    if pat is None:
        pat = re.compile(prefs.get('filename_pattern'))
    name = name.replace('_', ' ')
    match = pat.search(name)
    if match is None and fallback_pat is not None:
        match = fallback_pat.search(name)
    if match is not None:
        # match.group() raises IndexError for groups the pattern lacks
        try:
            mi.title = match.group('title')
        except IndexError:
            pass
        try:
            au = match.group('author')
            aus = string_to_authors(au)
            if aus:
                mi.authors = aus
                if prefs['swap_author_names'] and mi.authors:
                    def swap(a):
                        # Move the part after the first comma (or the first
                        # run of whitespace) to the front
                        if ',' in a:
                            parts = a.split(',', 1)
                        else:
                            parts = a.split(None, 1)
                        if len(parts) > 1:
                            t = parts[-1]
                            parts = parts[:-1]
                            parts.insert(0, t)
                        return ' '.join(parts)
                    mi.authors = [swap(x) for x in mi.authors]
        except (IndexError, ValueError):
            pass
        try:
            mi.series = match.group('series')
        except IndexError:
            pass
        try:
            si = match.group('series_index')
            mi.series_index = float(si)
        except (IndexError, ValueError, TypeError):
            pass
        try:
            si = match.group('isbn')
            mi.isbn = si
        except (IndexError, ValueError):
            pass
        try:
            publisher = match.group('publisher')
            mi.publisher = publisher
        except (IndexError, ValueError):
            pass
        try:
            pubdate = match.group('published')
            if pubdate:
                from calibre.utils.date import parse_only_date
                mi.pubdate = parse_only_date(pubdate)
        except Exception:
            # An unparseable date is ignored. Exception instead of a bare
            # except, so KeyboardInterrupt and SystemExit still propagate.
            pass
        try:
            comments = match.group('comments')
            mi.comments = comments
        except (IndexError, ValueError):
            pass
    if mi.is_null('title'):
        mi.title = name
    return mi
def _parse_pubdate(root, mi, ctx):
    '''
    Set mi.pubdate from the fb:year element of fb:publish-info, if it holds
    a whole number. Otherwise mi is left untouched.
    '''
    read_year = ctx.XPath('number(//fb:publish-info/fb:year/text())')
    year = read_year(root)
    if not float.is_integer(year):
        return
    # only year is available, so the rest of the date is whatever
    # parse_only_date fills in (2nd of June according to the original
    # note -- TODO confirm)
    year_text = type(u'')(int(year))
    mi.pubdate = parse_only_date(year_text)
def convert_comic_md_to_calibre_md(self, comic_metadata):
    '''
    Maps the entries in the comic_metadata to calibre metadata

    The result is stored in self.comic_md_in_calibre_format. Nothing is
    done if that attribute is already set.
    '''
    import unicodedata
    from calibre.ebooks.metadata import MetaInformation
    from calibre.utils.date import parse_only_date
    from datetime import date
    from calibre.utils.localization import calibre_langcode_to_name

    if self.comic_md_in_calibre_format:
        return

    # synonyms for artists
    WRITER = ['writer', 'plotter', 'scripter']
    PENCILLER = ['artist', 'penciller', 'penciler', 'breakdowns']
    INKER = ['inker', 'artist', 'finishes']
    COLORIST = ['colorist', 'colourist', 'colorer', 'colourer']
    LETTERER = ['letterer']
    COVER_ARTIST = ['cover', 'covers', 'coverartist', 'cover artist']
    EDITOR = ['editor']

    # start with a fresh calibre metadata
    mi = MetaInformation(None, None)
    co = comic_metadata

    # shorten some functions
    role = partial(get_role, credits=co.credits)
    update_field = partial(update_calibre_field, target=mi)

    # Get title, if no title, try to assign series infos
    if co.title:
        mi.title = co.title
    elif co.series:
        mi.title = co.series
        if co.issue:
            mi.title += " " + str(co.issue)
    else:
        mi.title = ""

    # tags
    if co.tags != [] and prefs['import_tags']:
        if prefs['overwrite_calibre_tags']:
            mi.tags = co.tags
        else:
            mi.tags = list(set(self.calibre_metadata.tags + co.tags))

    # simple metadata
    update_field("authors", role(WRITER))
    update_field("series", co.series)
    update_field("rating", co.criticalRating)
    update_field("publisher", co.publisher)

    # special cases
    if co.language:
        update_field("language", calibre_langcode_to_name(co.language))
    if co.comments:
        update_field("comments", co.comments.strip())

    # issue
    if co.issue:
        # float() handles numbers and numeric strings of any length.
        # unicodedata.numeric() only accepts a single character, so it is
        # kept as a fallback for characters such as a vulgar fraction. An
        # issue that is not a number at all no longer aborts the conversion.
        try:
            mi.series_index = float(co.issue)
        except (TypeError, ValueError):
            try:
                mi.series_index = unicodedata.numeric(co.issue)
            except (TypeError, ValueError):
                pass

    # pub date
    puby = co.year
    pubm = co.month
    if puby is not None:
        try:
            # No day is available, the 15th is used (June if no month either)
            dt = date(int(puby), 6 if pubm is None else int(pubm), 15)
            dt = parse_only_date(str(dt))
            mi.pubdate = dt
        except Exception:
            # A bad date is ignored, other metadata is still converted
            pass

    # custom columns
    custom_cols = self.db.field_metadata.custom_field_metadata()
    update_column = partial(update_custom_column, calibre_metadata=mi, custom_cols=custom_cols)
    # artists
    update_column(prefs['penciller_column'], role(PENCILLER))
    update_column(prefs['inker_column'], role(INKER))
    update_column(prefs['colorist_column'], role(COLORIST))
    update_column(prefs['letterer_column'], role(LETTERER))
    update_column(prefs['cover_artist_column'], role(COVER_ARTIST))
    update_column(prefs['editor_column'], role(EDITOR))
    # others
    update_column(prefs['storyarc_column'], co.storyArc)
    update_column(prefs['characters_column'], co.characters)
    update_column(prefs['teams_column'], co.teams)
    update_column(prefs['locations_column'], co.locations)
    update_column(prefs['volume_column'], co.volume)
    update_column(prefs['genre_column'], co.genre)

    self.comic_md_in_calibre_format = mi
def convert_comic_md_to_calibre_md(self, comic_metadata):
    '''
    Maps the entries in the comic_metadata to calibre metadata

    The result is stored in self.comic_md_in_calibre_format. Nothing is
    done if that attribute is already set.
    '''
    import unicodedata
    from calibre.ebooks.metadata import MetaInformation
    from calibre.utils.date import parse_only_date
    from datetime import date
    from calibre.utils.localization import calibre_langcode_to_name

    if self.comic_md_in_calibre_format:
        return

    # start with a fresh calibre metadata
    mi = MetaInformation(None, None)
    co = comic_metadata

    # shorten some functions
    role = partial(get_role, credits=co.credits)
    update_field = partial(update_calibre_field, target=mi)

    # Get title, if no title, try to assign series infos
    if co.title:
        mi.title = co.title
    elif co.series:
        mi.title = co.series
        if co.issue:
            mi.title += " " + str(co.issue)
    else:
        mi.title = ""

    # tags
    if co.tags != [] and prefs['import_tags']:
        if prefs['overwrite_calibre_tags']:
            mi.tags = co.tags
        else:
            mi.tags = list(set(self.calibre_metadata.tags + co.tags))

    # simple metadata
    update_field("authors", role(WRITER))
    update_field("series", co.series)
    update_field("rating", co.criticalRating)
    update_field("publisher", co.publisher)

    # special cases
    if co.language:
        update_field("language", calibre_langcode_to_name(co.language))
    if co.comments:
        update_field("comments", co.comments.strip())

    # issue
    if co.issue:
        try:
            if not python3 and isinstance(co.issue, unicode):
                mi.series_index = unicodedata.numeric(co.issue)
            else:
                mi.series_index = float(co.issue)
        except (TypeError, ValueError):
            # TypeError is what unicodedata.numeric() raises for anything
            # longer than one character, and float() for non-numeric types
            pass

    # pub date
    puby = co.year
    pubm = co.month
    if puby is not None:
        try:
            # No day is available, the 15th is used (June if no month either)
            dt = date(int(puby), 6 if pubm is None else int(pubm), 15)
            dt = parse_only_date(str(dt))
            mi.pubdate = dt
        except Exception:
            # A bad date is ignored, other metadata is still converted
            pass

    # custom columns
    update_column = partial(
        update_custom_column, calibre_metadata=mi,
        custom_cols=self.db.field_metadata.custom_field_metadata())
    # artists
    update_column(prefs['penciller_column'], role(PENCILLER))
    update_column(prefs['inker_column'], role(INKER))
    update_column(prefs['colorist_column'], role(COLORIST))
    update_column(prefs['letterer_column'], role(LETTERER))
    update_column(prefs['cover_artist_column'], role(COVER_ARTIST))
    update_column(prefs['editor_column'], role(EDITOR))
    # others
    update_column(prefs['storyarc_column'], co.storyArc)
    update_column(prefs['characters_column'], co.characters)
    update_column(prefs['teams_column'], co.teams)
    update_column(prefs['locations_column'], co.locations)
    update_column(prefs['genre_column'], co.genre)
    ensure_int(co.issueCount, update_column, prefs['count_column'], co.issueCount)
    ensure_int(co.volume, update_column, prefs['volume_column'], co.volume)
    if prefs['auto_count_pages']:
        update_column(prefs['pages_column'], self.count_pages())
    else:
        update_column(prefs['pages_column'], co.pageCount)
    if prefs['get_image_sizes']:
        update_column(prefs['image_size_column'], self.get_picture_size())
    update_column(prefs['comicvine_column'], '<a href="{}">Comic Vine</a>'.format(co.webLink))
    update_column(prefs['manga_column'], co.manga)

    self.comic_md_in_calibre_format = mi
def _parse_pubdate(root, mi, ctx):
    '''
    Read the year from fb:publish-info/fb:year and, when it is a whole
    number, store the date parsed from it in mi.pubdate.
    '''
    year = ctx.XPath('number(//fb:publish-info/fb:year/text())')(root)
    is_whole_year = float.is_integer(year)
    if is_whole_year:
        # only year is available, the remaining date parts come from
        # parse_only_date (2nd of June according to the original note --
        # TODO confirm)
        as_text = unicode_type(int(year))
        mi.pubdate = parse_only_date(as_text)
def parse(self, raw):
    '''
    Parse the HTML of a product page and return a Metadata object with the
    title, authors, identifiers, tags, publisher, publication date and
    comments found in it.

    The cover URL, if any, is cached in self.plugin rather than stored in
    the returned object.
    '''
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.utils.date import parse_only_date, UNDEFINED_DATE
    from css_selectors import Select
    root = parse_html(raw)
    selector = Select(root)
    sku = next(selector('div.sku.attGroup'))
    info = sku.getparent()
    top = info.getparent().getparent()
    banner = top.find('div')
    spans = banner.findall('span')
    # The title is the first span plus any directly following '12pt' spans
    title = ''
    for i, span in enumerate(spans):
        if i == 0 or '12pt' in span.get('style', ''):
            title += astext(span)
        else:
            break
    # The last span lists the authors, parenthesised notes are dropped
    authors = [
        re.sub(r'\(.*\)', '', x).strip()
        for x in astext(spans[-1]).split(',')
    ]
    mi = Metadata(title.strip(), authors)

    # Identifiers
    isbns = [check_isbn(x.strip()) for x in astext(sku).split(',')]
    for isbn in isbns:
        if isbn:
            self.plugin.cache_isbn_to_identifier(isbn, self.sku)
    isbns = sorted(isbns, key=lambda x: len(x) if x else 0, reverse=True)
    if isbns and isbns[0]:
        mi.isbn = isbns[0]
    mi.set_identifier('edelweiss', self.sku)

    # Tags
    bisac = tuple(selector('div.bisac.attGroup'))
    if bisac:
        bisac = astext(bisac[0])
        mi.tags = [x.strip() for x in bisac.split(',')]
        mi.tags = [
            t[1:].strip() if t.startswith('&') else t for t in mi.tags
        ]

    # Publisher
    pub = tuple(selector('div.supplier.attGroup'))
    if pub:
        pub = astext(pub[0])
        mi.publisher = pub

    # Pubdate
    pub = tuple(selector('div.shipDate.attGroupItem'))
    if pub:
        pub = astext(pub[0])
        parts = pub.partition(':')[0::2]
        pub = parts[1] or parts[0]
        try:
            if ', Ship Date:' in pub:
                pub = pub.partition(', Ship Date:')[0]
            q = parse_only_date(pub, assume_utc=True)
            # Compare year with year: comparing q.year with the date object
            # itself was always unequal, so the check never rejected anything
            if q.year != UNDEFINED_DATE.year:
                mi.pubdate = q
        except Exception:
            self.log.exception('Error parsing published date: %r' % pub)

    # Comments
    comm = ''
    general = tuple(selector('div#pd-general-overview-content'))
    if general:
        q = self.render_comments(general[0])
        if q != '<p>No title summary available. </p>':
            comm += q
    general = tuple(selector('div#pd-general-contributor-content'))
    if general:
        comm += self.render_comments(general[0])
    general = tuple(selector('div#pd-general-quotes-content'))
    if general:
        comm += self.render_comments(general[0])
    if comm:
        mi.comments = comm

    # Cover
    img = tuple(selector('img.title-image[src]'))
    if img:
        href = img[0].get('src').replace('jacket_covers/medium/',
                                         'jacket_covers/flyout/')
        self.plugin.cache_identifier_to_cover_url(self.sku, href)
    mi.has_cover = self.plugin.cached_identifier_to_cover_url(
        self.sku) is not None
    return mi
def get_basic_data(browser, log, *skus):
    '''
    Request the multi-title list view for skus using browser and yield a
    dict of basic metadata for every item found in the response.

    Each yielded dict has the keys: sku, cover, publisher, title, authors,
    isbns, tags, pubdate, format and rating. cover is None when the image
    URL starts with '/'. pubdate is None when the date cannot be parsed,
    in which case the error is reported via log.exception().
    '''
    from calibre.utils.date import parse_only_date
    from mechanize import Request

    count = len(skus)
    # One '0' per requested sku
    zeroes = ','.join(['0'] * count)
    data = {
        'skus': ','.join(skus),
        'drc': zeroes,
        'startPosition': '0',
        'sequence': '1',
        'selected': zeroes,
        'itemID': '0',
        'orderID': '0',
        'mailingID': '',
        'tContentWidth': '926',
        'originalOrder': ','.join(map(str, range(count))),
        'selectedOrderID': '0',
        'selectedSortColumn': '0',
        'listType': '1',
        'resultType': '32',
        'blockView': '1',
    }
    items_data_url = 'https://www.edelweiss.plus/GetTreelineControl.aspx?controlName=/uc/listviews/ListView_Title_Multi.ascx'
    raw = browser.open_novisit(Request(items_data_url, data)).read()
    root = parse_html(raw)

    for item in root.xpath('//div[@data-priority]'):
        row = item.getparent().getparent()
        # The sku is the last dash separated component of the element id
        sku = item.get('id').split('-')[-1]

        # Valid ISBNs only, longest first
        sku_text = row.xpath('descendant::*[contains(@class, "pev_sku")]/text()')[0]
        stripped = (part.strip() for part in sku_text.split(','))
        isbns = sorted((part for part in stripped if check_isbn(part)), key=len, reverse=True)

        try:
            categories = astext(row.xpath('descendant::*[contains(@class, "pev_categories")]')[0])
            tags = [tag.strip() for tag in categories.split('/')]
        except IndexError:
            tags = []

        # The rating is the ratio of width to max-width in the first style
        # attribute that carries both values
        rating = 0
        for style in row.xpath('descendant::*[contains(@class, "bgdColorCommunity")]/@style'):
            match = re.search(r'width: (\d+)px;.*max-width: (\d+)px', style)
            if match is None:
                continue
            width, max_width = match.groups()
            rating = float(width) / float(max_width)
            break

        try:
            ship_date = astext(row.xpath('descendant::*[contains(@class, "pev_shipDate")]')[0])
            # Keep what follows the last ':' and the last non-breaking space
            ship_date = ship_date.split(':')[-1].split(u'\xa0')[-1].strip()
            pubdate = parse_only_date(ship_date, assume_utc=True)
        except Exception:
            log.exception('Error parsing published date')
            pubdate = None

        # Every contributor title attribute may list several comma
        # separated names
        authors = [
            name.strip()
            for contributor in row.xpath('descendant::*[contains(@class, "pev_contributor")]/@title')
            for name in contributor.strip().split(',')
        ]

        cover = row.xpath('descendant::img/@src')[0].split('?')[0]
        if cover.startswith('/'):
            cover = None
        publisher = astext(row.xpath('descendant::*[contains(@class, "headerPublisher")]')[0])
        title = astext(row.xpath('descendant::*[@id="title_{}"]'.format(sku))[0])
        fmt = ' '.join(row.xpath('descendant::*[contains(@class, "pev_format")]/text()')).strip()

        yield {
            'sku': sku,
            'cover': cover,
            'publisher': publisher,
            'title': title,
            'authors': authors,
            'isbns': isbns,
            'tags': tags,
            'pubdate': pubdate,
            'format': fmt,
            'rating': rating,
        }
def parse(self, raw):
    '''
    Parse the HTML in raw and return a Metadata object built from it.

    Title, authors, ISBN, tags, publisher, publication date, comments and
    the has_cover flag are read from the page. The identifier 'edelweiss'
    is set to self.sku. As side effects, every valid ISBN found is cached
    against self.sku and the cover URL (when an image is present) is cached
    through self.plugin.

    Raises StopIteration if the page has no 'div.sku.attGroup' element.
    '''
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.utils.date import parse_only_date, UNDEFINED_DATE
    from css_selectors import Select
    root = parse_html(raw)
    selector = Select(root)
    sku = next(selector('div.sku.attGroup'))
    info = sku.getparent()
    top = info.getparent().getparent()
    banner = top.find('div')
    spans = banner.findall('span')

    # The title is the first span plus every directly following span that
    # is styled with 12pt; the first span without it ends the title.
    title_parts = []
    for i, span in enumerate(spans):
        if i != 0 and '12pt' not in span.get('style', ''):
            break
        title_parts.append(astext(span))
    title = ''.join(title_parts)
    # Authors come from the last span, comma separated, with any
    # parenthesised text removed.
    authors = [re.sub(r'\(.*\)', '', x).strip() for x in astext(spans[-1]).split(',')]
    mi = Metadata(title.strip(), authors)

    # Identifiers
    candidates = [check_isbn(x.strip()) for x in astext(sku).split(',')]
    isbns = [isbn for isbn in candidates if isbn]
    for isbn in isbns:
        self.plugin.cache_isbn_to_identifier(isbn, self.sku)
    if isbns:
        # Prefer the longest ISBN; max() returns the first one on ties,
        # which matches a stable descending sort by length.
        mi.isbn = max(isbns, key=len)
    mi.set_identifier('edelweiss', self.sku)

    # Tags
    bisac = tuple(selector('div.bisac.attGroup'))
    if bisac:
        tags = [x.strip() for x in astext(bisac[0]).split(',')]
        # Drop a leading '&' left over from splitting on commas
        mi.tags = [t[1:].strip() if t.startswith('&') else t for t in tags]

    # Publisher
    pub = tuple(selector('div.supplier.attGroup'))
    if pub:
        mi.publisher = astext(pub[0])

    # Pubdate
    pub = tuple(selector('div.shipDate.attGroupItem'))
    if pub:
        pub = astext(pub[0])
        # Use the text after the first ':' if there is any, otherwise the
        # whole text.
        parts = pub.partition(':')[0::2]
        pub = parts[1] or parts[0]
        try:
            if ', Ship Date:' in pub:
                pub = pub.partition(', Ship Date:')[0]
            q = parse_only_date(pub, assume_utc=True)
            # Compare with the year of the UNDEFINED_DATE sentinel: q.year
            # is an int, so comparing it with the sentinel object itself
            # could never be equal and the check would always pass.
            if q.year != UNDEFINED_DATE.year:
                mi.pubdate = q
        except Exception:
            self.log.exception('Error parsing published date: %r' % pub)

    # Comments
    comm = ''
    general = tuple(selector('div#pd-general-overview-content'))
    if general:
        q = self.render_comments(general[0])
        # Skip the placeholder text shown when there is no summary
        if q != '<p>No title summary available. </p>':
            comm += q
    general = tuple(selector('div#pd-general-contributor-content'))
    if general:
        comm += self.render_comments(general[0])
    general = tuple(selector('div#pd-general-quotes-content'))
    if general:
        comm += self.render_comments(general[0])
    if comm:
        mi.comments = comm

    # Cover
    img = tuple(selector('img.title-image[src]'))
    if img:
        # Swap the medium jacket image path for the flyout one
        href = img[0].get('src').replace('jacket_covers/medium/', 'jacket_covers/flyout/')
        self.plugin.cache_identifier_to_cover_url(self.sku, href)

    mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None
    return mi