Пример #1
0
 def test_rtf_metadata(self):
     # Round-trip check: metadata handed to set_metadata() on an in-memory
     # RTF stream must come back unchanged from get_metadata(). The values
     # include non-ASCII characters.
     expected = Metadata('Test ø̄title', ['Author One', 'Author БTwo'])
     expected.tags = 'tag1 見tag2'.split()
     expected.comments = '<p>some ⊹comments</p>'
     expected.publisher = 'publiSher'

     buf = BytesIO(br'{\rtf1\ansi\ansicpg1252}')
     set_metadata(buf, expected)
     # Rewind so the reader starts from the beginning of the stream.
     buf.seek(0)
     actual = get_metadata(buf)

     for field in ('title', 'authors', 'publisher', 'comments', 'tags'):
         self.assertEqual(getattr(expected, field), getattr(actual, field))
Пример #2
0
 def test_input_comment_multi(self):
     # Compare the metadata read from the 'comment_multi' fixture stream
     # against a hand-built expected Metadata object.
     source = self.get_stream('comment_multi')
     parsed = get_metadata(source)

     expected = Metadata(
         'A Comment Tag &amp; Title Ⓒ',
         ['James Madison', 'James Monroe', 'John Quincy Adams'],
     )
     expected.publisher = 'Publisher C'
     expected.languages = ['French', 'Japanese']
     expected.pubdate = parse_date('2015-01-01')
     expected.timestamp = parse_date('2014-01-01')
     expected.series = 'Comment Series'
     expected.series_index = 3.0
     expected.rating = 0.0
     expected.comments = 'comment &quot;comments&quot; ♥ HTML -- too &amp;amp;'
     expected.tags = ['tag d', 'tag e', 'tag f']
     ids = {
         'isbn': '3456789012',
         'url': 'http://google.com/search?q=calibre',
     }
     expected.set_identifiers(ids)

     self.compare_metadata(parsed, expected)
Пример #3
0
 def test_input_meta_multi(self):
     # Compare the metadata read from the 'meta_multi' fixture stream
     # against a hand-built expected Metadata object.
     source = self.get_stream('meta_multi')
     parsed = get_metadata(source)

     expected = Metadata(
         'A Meta Tag &amp; Title Ⓒ',
         ['George Washington', 'John Adams', 'Thomas Jefferson'],
     )
     expected.publisher = 'Publisher A'
     expected.languages = ['English', 'Spanish']
     expected.pubdate = parse_date('2019-01-01')
     expected.timestamp = parse_date('2018-01-01')
     expected.series = 'Meta Series'
     expected.series_index = 1.0
     expected.rating = 8.0
     expected.comments = 'meta &quot;comments&quot; ♥ HTML &amp;amp;'
     expected.tags = ['tag a', 'tag b', 'tag c']
     ids = {
         'isbn': '1234567890',
         'url': 'http://google.com/search?q=calibre',
     }
     expected.set_identifiers(ids)

     self.compare_metadata(parsed, expected)
Пример #4
0
 def test_input_meta_single(self):
     # Compare the metadata read from the 'meta_single' fixture stream
     # against a hand-built expected Metadata object.
     source = self.get_stream('meta_single')
     parsed = get_metadata(source)

     expected = Metadata('A Meta Tag &amp; Title Ⓒ', ['George Washington'])
     expected.publisher = 'Publisher A'
     expected.languages = ['English']
     expected.pubdate = parse_date('2019-01-01')
     expected.timestamp = parse_date('2018-01-01')
     expected.series = 'Meta Series'
     expected.series_index = 1.0
     # NOTE: rating and comments are not assigned on the expected object;
     # the disabled assignments were float(0) and '' respectively.
     expected.tags = ['tag a', 'tag b']
     expected.set_identifiers({'isbn': '1234567890'})

     self.compare_metadata(parsed, expected)
Пример #5
0
def metadata_from_xmp_packet(raw_bytes):
    """Build a Metadata object from the contents of an XMP packet.

    The packet is parsed with parse_xmp_packet() and the resulting tree is
    queried for Dublin Core, XMP, PDF, PRISM, pdfx and calibre properties.
    Fields that are absent, empty, or fail to parse are skipped, so the
    returned object only carries what could actually be read.

    :param raw_bytes: the raw XMP packet, passed unchanged to
        parse_xmp_packet().
    :return: a Metadata instance whose title starts out as 'Unknown' and is
        replaced if the packet supplies one.
    :raises ValueError: if the title starts with the literal text
        ``\\376\\377``, which marks a corrupted packet (see the bug link in
        the body).
    """
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata('Unknown')
    title = first_alt('//dc:title', root)
    if title:
        if title.startswith(r'\376\377'):
            # corrupted XMP packet generated by Nitro PDF. See
            # https://bugs.launchpad.net/calibre/+bug/1541981
            raise ValueError('Corrupted XMP metadata packet detected, '
                             'probably generated by Nitro PDF')
        mi.title = title
    authors = multiple_sequences('//dc:creator', root)
    if authors:
        mi.authors = authors
    # dc:subject takes precedence; pdf:Keywords is only a fallback.
    tags = multiple_sequences('//dc:subject', root) or multiple_sequences(
        '//pdf:Keywords', root)
    if tags:
        mi.tags = tags
    comments = first_alt('//dc:description', root)
    if comments:
        mi.comments = comments
    publishers = multiple_sequences('//dc:publisher', root)
    if publishers:
        mi.publisher = publishers[0]
    # Best effort: a missing or unparseable date simply leaves pubdate unset.
    try:
        pubdate = (parse_date(first_sequence('//dc:date', root)
                              or first_simple('//xmp:CreateDate', root),
                              assume_utc=False))
    except Exception:
        pass
    else:
        mi.pubdate = pubdate
    bkp = first_simple('//xmp:CreatorTool', root)
    if bkp:
        mi.book_producer = bkp
    # metadata_date comes from xmp:MetadataDate and xmp:ModifyDate, combined
    # by more_recent() (presumably the later of the two -- confirm there).
    md = safe_parse_date(first_simple('//xmp:MetadataDate', root))
    mod = safe_parse_date(first_simple('//xmp:ModifyDate', root))
    fd = more_recent(md, mod)
    if fd is not None:
        mi.metadata_date = fd
    rating = first_simple('//calibre:rating', root)
    if rating is not None:
        try:
            rating = float(rating)
            # Only ratings within the inclusive range 0..10 are accepted.
            if 0 <= rating <= 10:
                mi.rating = rating
        except (ValueError, TypeError):
            pass
    series, series_index = read_series(root)
    if series:
        mi.series, mi.series_index = series, series_index
    # For each sort field, use the first element that yields a value.
    for x in ('title_sort', 'author_sort'):
        for elem in XPath('//calibre:' + x)(root):
            val = read_simple_property(elem)
            if val:
                setattr(mi, x, val)
                break
    # These two properties are stored as JSON text; bad JSON is ignored.
    for x in ('author_link_map', 'user_categories'):
        val = first_simple('//calibre:' + x, root)
        if val:
            try:
                setattr(mi, x, json.loads(val))
            except Exception:
                pass

    languages = multiple_sequences('//dc:language', root)
    if languages:
        # Drop any language that canonicalize_lang() maps to a falsy value.
        languages = list(filter(None, map(canonicalize_lang, languages)))
        if languages:
            mi.languages = languages

    # Identifier sources, in order of precedence: xmp:Identifier entries,
    # then prism/pdfx properties, then dc:identifier.
    identifiers = {}
    for xmpid in XPath('//xmp:Identifier')(root):
        for scheme, value in read_xmp_identifers(xmpid):
            if scheme and value:
                identifiers[scheme.lower()] = value

    for namespace in ('prism', 'pdfx'):
        for scheme in KNOWN_ID_SCHEMES:
            if scheme not in identifiers:
                val = first_simple('//%s:%s' % (namespace, scheme), root)
                scheme = scheme.lower()
                if scheme == 'isbn':
                    val = check_isbn(val)
                elif scheme == 'doi':
                    val = check_doi(val)
                if val:
                    identifiers[scheme] = val

    # Check Dublin Core for recognizable identifier types
    for scheme, check_func in (('doi', check_doi), ('isbn', check_isbn)):
        if scheme not in identifiers:
            val = check_func(first_simple('//dc:identifier', root))
            if val:
                # Store under the scheme that validated the value. This was
                # previously hard-coded to 'doi', so an ISBN was recorded as
                # a DOI and could overwrite a real one.
                identifiers[scheme] = val

    if identifiers:
        mi.set_identifiers(identifiers)

    read_user_metadata(mi, root)

    return mi