Example No. 1
 def test_markdown(self):
     from calibre.ebooks.txt.processor import create_markdown_object
     from calibre.ebooks.conversion.plugins.txt_input import MD_EXTENSIONS
     create_markdown_object(sorted(MD_EXTENSIONS))
     from calibre.library.comments import sanitize_comments_html
     sanitize_comments_html(
         b'''<script>moo</script>xxx<img src="http://moo.com/x.jpg">''')
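
A minimal standalone sketch of the same call (illustrative only, not part of the example above; it assumes calibre's Python environment so that calibre.library.comments is importable):

# Hedged sketch: feed raw HTML through sanitize_comments_html and inspect the result.
from calibre.library.comments import sanitize_comments_html

raw = b'<script>moo</script>xxx<img src="http://moo.com/x.jpg">'
cleaned = sanitize_comments_html(raw)
print(cleaned)  # the <script> block is expected to be stripped from the output
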
Example No. 2
    def parse_comments(self, root):
        description_nodes = root.xpath(
            "//*[preceding-sibling::comment()[. = ' *** s:%s *** '] and following-sibling::comment()[. = ' *** //e:%s *** ']]"
            % ('책소개', '책소개'))

        default_append_toc = cfg.DEFAULT_STORE_VALUES[cfg.KEY_APPEND_TOC]
        append_toc = cfg.plugin_prefs[cfg.STORE_NAME].get(
            cfg.KEY_APPEND_TOC, default_append_toc)

        comments = ''
        if description_nodes:
            for description_node in description_nodes:
                comments += tostring(description_node,
                                     method='html',
                                     encoding=str).strip()
            while comments.find('  ') >= 0:
                comments = comments.replace('  ', ' ')
            comments = sanitize_comments_html(comments)

        if append_toc:
            toc_node = root.xpath(
                '//div[@class="box_detail_content"]/h2[@class="title_detail_basic" and contains(text(),"%s")]/following-sibling::div'
                % "목차")
            if toc_node:
                toc = tostring(toc_node[0], method='html')
                toc = sanitize_comments_html(toc)
                comments += '<h3>[목차]</h3><div id="toc">' + toc + "</div>"

        if comments:
            comments += "<hr />" + '<div><div style="float:right">[kyobobook]</div></div>'
        return comments
Example No. 3
    def _render_comments(self, desc):
        from calibre.library.comments import sanitize_comments_html

        for c in desc.xpath('descendant::noscript'):
            c.getparent().remove(c)
        for c in desc.xpath('descendant::*[@class="seeAll" or'
                ' @class="emptyClear" or @id="collapsePS" or'
                ' @id="expandPS"]'):
            c.getparent().remove(c)

        for a in desc.xpath('descendant::a[@href]'):
            del a.attrib['href']
            a.tag = 'span'
        desc = self.tostring(desc, method='html', encoding=unicode).strip()

        # Encoding bug in Amazon data U+fffd (replacement char)
        # in some examples it is present in place of '
        desc = desc.replace('\ufffd', "'")
        # remove all attributes from tags
        desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
        # Collapse whitespace
        # desc = re.sub('\n+', '\n', desc)
        # desc = re.sub(' +', ' ', desc)
        # Remove the notice about text referring to out of print editions
        desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
        # Remove comments
        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
        return sanitize_comments_html(desc)
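
The attribute-stripping substitution used in this and several later examples can be exercised on its own; a small plain-Python sketch with a made-up sample string:

import re

# Keep only the tag name of each opening tag, as in the examples above.
sample = '<p class="x" id="y">hello <a href="http://example.com">link</a></p>'
print(re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', sample))
# -> <p>hello <a>link</a></p>
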
Example No. 4
    def _render_comments(self, desc):
        from calibre.library.comments import sanitize_comments_html

        for c in desc.xpath('descendant::noscript'):
            c.getparent().remove(c)
        for c in desc.xpath('descendant::*[@class="seeAll" or'
                ' @class="emptyClear" or @id="collapsePS" or'
                ' @id="expandPS"]'):
            c.getparent().remove(c)

        for a in desc.xpath('descendant::a[@href]'):
            del a.attrib['href']
            a.tag = 'span'
        desc = self.tostring(desc, method='html', encoding=unicode).strip()

        # Encoding bug in Amazon data U+fffd (replacement char)
        # in some examples it is present in place of '
        desc = desc.replace('\ufffd', "'")
        # remove all attributes from tags
        desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
        # Collapse whitespace
        # desc = re.sub('\n+', '\n', desc)
        # desc = re.sub(' +', ' ', desc)
        # Remove the notice about text referring to out of print editions
        desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
        # Remove comments
        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
        return sanitize_comments_html(desc)
Example No. 5
    def get_book_detail(self, br, metadata_url, mi, ovrdrv_id, log):
        from html5_parser import parse
        from lxml import html
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.library.comments import sanitize_comments_html

        try:
            raw = br.open_novisit(metadata_url).read()
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                return False
            raise
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
                resolve_entities=True)[0]

        try:
            root = parse(raw, maybe_xhtml=False, sanitize_names=True)
        except Exception:
            return False

        pub_date = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()")
        lang = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()")
        subjects = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()")
        ebook_isbn = root.xpath("//td/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()")
        desc = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]")

        if pub_date:
            from calibre.utils.date import parse_date
            try:
                mi.pubdate = parse_date(pub_date[0].strip())
            except:
                pass
        if lang:
            lang = lang[0].strip().lower()
            lang = {'english':'eng', 'french':'fra', 'german':'deu',
                    'spanish':'spa'}.get(lang, None)
            if lang:
                mi.language = lang

        if ebook_isbn:
            # print "ebook isbn is "+str(ebook_isbn[0])
            isbn = check_isbn(ebook_isbn[0].strip())
            if isbn:
                self.cache_isbn_to_identifier(isbn, ovrdrv_id)
                mi.isbn = isbn
        if subjects:
            mi.tags = [tag.strip() for tag in subjects[0].split(',')]

        if desc:
            desc = desc[0]
            desc = html.tostring(desc, method='html', encoding='unicode').strip()
            # remove all attributes from tags
            desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
            # Remove comments
            desc = re.sub(r'(?s)<!--.*?-->', '', desc)
            mi.comments = sanitize_comments_html(desc)

        return None
Example No. 6
 def parse_comments(self, root):
     description_node = root.xpath('//div[@class="ugc nonTruncatedSum"]/p')
     if description_node:
         desc = description_node[0]
         comments = tostring(desc, method='html', encoding=unicode).strip()
         while comments.find('  ') >= 0:
             comments = comments.replace('  ',' ')
         comments = sanitize_comments_html(comments)
         return comments
Example No. 7
 def parse_comments(self, root):
     description_node = root.xpath('//div[@class="ugc nonTruncatedSum"]/p')
     if description_node:
         desc = description_node[0]
         comments = tostring(desc, method='html', encoding=unicode).strip()
         while comments.find('  ') >= 0:
             comments = comments.replace('  ', ' ')
         comments = sanitize_comments_html(comments)
         return comments
Example No. 8
 def parse_comments(self, root):
     # Look for description in a second span that gets expanded when interactively displayed [@id="display:none"]
     description_node = root.xpath('//div[@id="annotation"]')
     if description_node:
         desc = description_node[0].text_content().strip()
         comments = sanitize_comments_html(desc)
         while comments.find('  ') >= 0:
             comments = comments.replace('  ', ' ')
         return comments
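
Several of these parse_comments variants collapse runs of spaces with the same replace loop; a tiny illustration of that loop on its own (plain Python, invented sample string):

comments = 'a    b  c'
while comments.find('  ') >= 0:
    comments = comments.replace('  ', ' ')
print(comments)  # -> 'a b c'
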
Example No. 9
 def parse_comments(self, root):
     # Look for description in a second span that gets expanded when interactively displayed [@id="display:none"]
     description_node = root.xpath('//div[@id="metacol"]/div[@id="description"]/span')
     if description_node:
         desc = description_node[0] if len(description_node) == 1 else description_node[1]
         less_link = desc.xpath('a[@class="actionLinkLite"]')
         if less_link is not None and len(less_link):
             desc.remove(less_link[0])
         comments = tostring(desc, method="html", encoding=unicode).strip()
         while comments.find("  ") >= 0:
             comments = comments.replace("  ", " ")
         comments = sanitize_comments_html(comments)
         return comments
Example No. 10
 def parse_comments(self, root):
     # Look for description in a second span that gets expanded when interactively displayed [@id="display:none"]
     description_node = root.xpath('//div[@id="bookIntroContent"]')
     if description_node:
         desc = description_node[0] if len(description_node) == 1 else description_node[1]
         less_link = desc.xpath('div[@class="section_open more_btn_t2"]')
         if less_link is not None and len(less_link):
             desc.remove(less_link[0])
         comments = tostring(desc, method='html', encoding=unicode).strip()
         while comments.find('  ') >= 0:
             comments = comments.replace('  ',' ')
         comments = sanitize_comments_html(comments)
         return comments
Example No. 11
 def parse_comments(self, root):
     description_nodes = root.xpath("//*[preceding-sibling::comment()[. = ' *** s:%s *** '] and following-sibling::comment()[. = ' *** //e:%s *** ']]" % (u'책소개',u'책소개'))
     
     default_append_toc = cfg.DEFAULT_STORE_VALUES[cfg.KEY_APPEND_TOC]
     append_toc = cfg.plugin_prefs[cfg.STORE_NAME].get(cfg.KEY_APPEND_TOC, default_append_toc)
     
     comments = ''
     if description_nodes:
         for description_node in description_nodes:
             comments += tostring(description_node, method='html', encoding=unicode).strip()
         while comments.find('  ') >= 0:
             comments = comments.replace('  ',' ')
         comments = sanitize_comments_html(comments)
         
     if append_toc:
         toc_node = root.xpath('//div[@class="box_detail_content"]/h2[@class="title_detail_basic" and contains(text(),"%s")]/following-sibling::div' % u"목차")
         if toc_node:
             toc = tostring(toc_node[0], method='html')
             toc = sanitize_comments_html(toc)
             comments += '<h3>[목차]</h3><div id="toc">' + toc + "</div>"
         
     if comments:
         comments += "<hr />" + '<div><div style="float:right">[kyobobook]</div></div>'
     return comments
Example No. 12
    def _render_comments(self, desc):
        from calibre.library.comments import sanitize_comments_html
        import html5lib
        # html5lib parsed noscript as CDATA

        desc = html5lib.parseFragment('<div>%s</div>' % (self.totext(desc).replace('textarea', 'div')), \
                                      treebuilder='lxml', namespaceHTMLElements=False)[0]
        matches = desc.xpath('descendant::*[contains(text(), "内容提要") \
            or contains(text(), "内容推荐") or contains(text(), "编辑推荐") \
            or contains(text(), "内容简介") or contains(text(), "基本信息")]/../*[self::p or self::div or self::span]'
                             )

        if matches:
            if len(matches) > 1:
                desc = matches[-1]
                for item in matches:
                    content_len = len(self.totext(item))
                    if content_len > 50 and content_len < 200:
                        desc = item
                        break

        for c in desc.xpath('descendant::noscript'):
            c.getparent().remove(c)
        for c in desc.xpath('descendant::*[@class="seeAll" or'
                            ' @class="emptyClear" or @id="collapsePS" or'
                            ' @id="expandPS"]'):
            c.getparent().remove(c)
        #
        for a in desc.xpath('descendant::a[@href]'):
            del a.attrib['href']
            a.tag = 'span'
        desc = self.tostring(desc, method='text', encoding=unicode).strip()
        # return desc
        # Encoding bug in Amazon data U+fffd (replacement char)
        # in some examples it is present in place of '
        desc = desc.replace('\ufffd', "'")
        # remove all attributes from tags
        desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
        # Collapse whitespace
        desc = re.sub('\n+', '\n', desc)
        desc = re.sub(' +', ' ', desc)
        # Remove the notice about text referring to out of print editions
        desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
        # Remove comments
        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
        return sanitize_comments_html(desc)
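
A minimal sketch of the html5lib.parseFragment call used above (assuming html5lib and lxml are installed; it mirrors the keyword arguments from the example and is not a definitive usage):

import html5lib
from lxml import etree

# parseFragment with the lxml treebuilder returns the fragment's children;
# [0] is the wrapping <div> element, as in the example above.
frag = html5lib.parseFragment('<div><p>hello</p></div>',
                              treebuilder='lxml', namespaceHTMLElements=False)[0]
print(etree.tostring(frag, method='html', encoding='unicode'))
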
Example No. 13
    def parse_comments(self, root, mi):
        """
        """
        try:
            comments_node = root.xpath(
                "//p[@itemprop='description']/span/text()")
            if not comments_node:
                comments_node = root.xpath(
                    "//p[@itemprop='description']/text()")
            self.log.info("        Comments node: %s" % comments_node)

            if comments_node:
                mi.comments = self.comments = sanitize_comments_html(
                    "".join(comments_node))
            self.log.info("        Parsed comments: %s" % mi.comments)
        except:
            self.log.exception("Error parsing comments for url: %r" % self.url)
Example No. 14
    def render_comments(self, desc):
        from lxml import etree
        from calibre.library.comments import sanitize_comments_html
        for c in desc.xpath('descendant::noscript'):
            c.getparent().remove(c)
        for a in desc.xpath('descendant::a[@href]'):
            del a.attrib['href']
            a.tag = 'span'
        desc = etree.tostring(desc, method='html', encoding=unicode).strip()

        # remove all attributes from tags
        desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
        # Collapse whitespace
        # desc = re.sub('\n+', '\n', desc)
        # desc = re.sub(' +', ' ', desc)
        # Remove comments
        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
        return sanitize_comments_html(desc)
Example No. 15
    def render_comments(self, desc):
        from lxml import etree
        from calibre.library.comments import sanitize_comments_html
        for c in desc.xpath('descendant::noscript'):
            c.getparent().remove(c)
        for a in desc.xpath('descendant::a[@href]'):
            del a.attrib['href']
            a.tag = 'span'
        desc = etree.tostring(desc, method='html', encoding='unicode').strip()

        # remove all attributes from tags
        desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
        # Collapse whitespace
        # desc = re.sub('\n+', '\n', desc)
        # desc = re.sub(' +', ' ', desc)
        # Remove comments
        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
        return sanitize_comments_html(desc)
Example No. 16
    def render_comments(self, desc):
        from lxml import etree
        from calibre.library.comments import sanitize_comments_html

        for c in desc.xpath("descendant::noscript"):
            c.getparent().remove(c)
        for a in desc.xpath("descendant::a[@href]"):
            del a.attrib["href"]
            a.tag = "span"
        desc = etree.tostring(desc, method="html", encoding=unicode).strip()

        # remove all attributes from tags
        desc = re.sub(r"<([a-zA-Z0-9]+)\s[^>]+>", r"<\1>", desc)
        # Collapse whitespace
        # desc = re.sub('\n+', '\n', desc)
        # desc = re.sub(' +', ' ', desc)
        # Remove comments
        desc = re.sub(r"(?s)<!--.*?-->", "", desc)
        return sanitize_comments_html(desc)
Example No. 17
    def _render_comments(self, desc):
        # Generate the comments?
        from calibre.library.comments import sanitize_comments_html

        desc = self.tostring(desc, method='html', encoding=unicode).strip()

        # Encoding bug in 17k.com data U+fffd (replacement char)
        # in some examples it is present in place of '
        desc = desc.replace('\ufffd', "'")
        # remove all attributes from tags
        desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
        # Collapse whitespace
        # desc = re.sub('\n+', '\n', desc)
        # desc = re.sub(' +', ' ', desc)
        # Remove the notice about text referring to out of print editions
        desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
        # Remove comments
        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
        return sanitize_comments_html(desc)
Example No. 18
    def parse_comments(self, root):
        description_nodes = root.xpath(
            '//div[@id="anotace"]/strong/following-sibling::p')
        if not description_nodes:
            description_nodes = root.xpath(
                '//div[@id="nic"]/strong/following-sibling::p')

        if description_nodes:
            comments = []
            for node in description_nodes:
                node_text = node.text_content()
                if node_text is not None:
                    comments.append("<p>" + node_text + "</p>")

            #comments = tostring(description_node, method='html')
            comments = sanitize_comments_html("".join(comments))
            return comments
        else:
            self.log.info('No comment node was found.')
Example No. 19
def parse_comments(root):
    '''
    Function for parsing comments and cleaning them up a little.
    Rewritten from the Goodreads script.
    '''
    # Look for description
    description_node = root.xpath('(//div[@class="product-page-block"]//p)[1]')
    if description_node:
        desc = description_node[0] if len(
            description_node) == 1 else description_node[1]
        less_link = desc.xpath('a[@class="actionLinkLite"]')
        if less_link is not None and len(less_link):
            desc.remove(less_link[0])
        comments = tostring(desc, method='html', encoding=unicode).strip()
        while comments.find('  ') >= 0:
            comments = comments.replace('  ', ' ')
        if "Fil størrelse:" in comments:
            comments = comments.replace(comments.split(".")[-1], "</p>")
        comments = sanitize_comments_html(comments)
        return comments
Example No. 20
 def get_sanitized_description(self):
     '''
     For calibre version so this code can be consolidated between
     fff_plugin.py and jobs.py
     '''
     orig = description = self.getMetadata("description")
     # logger.debug("description:%s"%description)
     if not description:
         description = ''
     else:
         if self.getConfig('keep_summary_html'):
             ## Handles desc with (supposed) html without html->MD
             ## text->html dance that sanitize_comments_html does.
             description = sanitize_html(description)
             # logger.debug("desc using sanitize_html")
         else:
             ## because of the html->MD text->html dance, text only
             ## (or MD/MD-like) descs come out better.
             description = sanitize_comments_html(description)
             # logger.debug("desc using sanitize_comments_html")
     # if orig != description:
     #     logger.debug("\nchanged description\n%s\n%s"%(orig,description))
     return description
Example No. 21
 def get_sanitized_description(self):
     '''
     For calibre version so this code can be consolidated between
     fff_plugin.py and jobs.py
     '''
     orig = description = self.getMetadata("description")
     # logger.debug("description:%s"%description)
     if not description:
         description = ''
     else:
         if self.getConfig('keep_summary_html'):
             ## Handles desc with (supposed) html without html->MD
             ## text->html dance that sanitize_comments_html does.
             description = sanitize_html(description)
             # logger.debug("desc using sanitize_html")
         else:
             ## because of the html->MD text->html dance, text only
             ## (or MD/MD-like) descs come out better.
             description = sanitize_comments_html(description)
             # logger.debug("desc using sanitize_comments_html")
     # if orig != description:
     #     logger.debug("\nchanged description\n%s\n%s"%(orig,description))
     return description
Example No. 22
    def identify(self, log, result_queue, abort, title=None, authors=[], identifiers={}, timeout=30):
	self.load_config()

	if authors is None:
	    authors=[]

	# get identifying tags from book
	idn = identifiers.get('dnb-idn', None)
	isbn = check_isbn(identifiers.get('isbn', None))

	# ignore unknown authors
	ignored_authors = [ "V. A.", "V.A.", "Unknown", "Unbekannt" ]
	for i in ignored_authors:
	    authors = [ x for x in authors if x != i ]

	if (isbn is None) and (idn is None) and (title is None) and (authors is None):
	    log.info("This plugin requires at least either ISBN, IDN, Title or Author(s).")
	    return None


	queries=[]
	# DNB does not do an exact search when searching for a idn or isbn, so we have to filter the results
	exact_search = {}

	if idn is not None:
	    exact_search['idn'] = idn
	    # in case look for a IDN only search for the IDN and skip all the other stuff
	    queries.append('num='+idn)

	else:
	    authors_v = []
	    title_v = []

	    # create some variants of given authors
	    if authors != []:
		authors_v.append(' '.join(self.get_author_tokens(authors,only_first_author=False)))	# concat all author names ("Peter Meier Luise Stark")
		authors_v.append(' '.join(self.get_author_tokens(authors,only_first_author=True)))	# use only first author
		for a in authors:
		    authors_v.append(a)	# use all authors, one by one

		# remove duplicates
		unique_authors_v = []
		for i in authors_v:
		    if i not in unique_authors_v:
			unique_authors_v.append(i)


	    # create some variants of given title
	    if title is not None:
		title_v.append(title)	# simply use given title
		title_v.append(' '.join(self.get_title_tokens(title,strip_joiners=False,strip_subtitle=False)))	# remove some punctation characters
		title_v.append(' '.join(self.get_title_tokens(title,strip_joiners=False,strip_subtitle=True)))	# remove subtitle (everything after " : ")
		title_v.append(' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=False)))	# remove some punctation characters and joiners ("and", "&", ...)
		title_v.append(' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)))	# remove subtitle (everything after " : ") and joiners ("and", "&", ...)
		# TODO: remove subtitle after " - "

		# remove duplicates
		unique_title_v = []
		for i in title_v:
		    if i not in unique_title_v:
			unique_title_v.append(i)


	    # title and author
	    if authors_v != [] and title_v != []:
		for a in authors_v:
		    for t in title_v:
			if isbn is not None:
			    queries.append('tit="' + t + '" AND per="' + a + '" AND num="' + isbn + '"')
			else:
			    queries.append('tit="' + t + '" AND per="' + a + '"')

		# try with first author as title and title (without subtitle) as author
		if isbn is not None:
		    queries.append('per="' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '" AND tit="' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '" AND num="'+isbn+'"')
		else:
		    queries.append('per="' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '" AND tit="' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '"')

		# try with author and title (without subtitle) in any index
		if isbn is not None:
		    queries.append('"' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '" AND "' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '" AND num="'+isbn+'"')
		else:
		    queries.append('"' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '" AND "' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '"')


	    # author but no title
	    elif authors_v != [] and title_v == []:
		for i in authors_v:
		    if isbn is not None:
			queries.append('per="'+ i +'" AND num="' + isbn + '"')
		    else:
			queries.append('per="'+ i +'"')

		# try with author as title
		if isbn is not None:
		    queries.append('tit="' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '" AND num="' + isbn + '"')
		else:
		    queries.append('tit="' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '"')


	    # title but no author
	    elif authors_v == [] and title_v != []:
		for i in title_v:
		    if isbn is not None:
			queries.append('tit="' + i + '" AND num="' + isbn + '"')
		    else:
			queries.append('tit="' + i + '"')

		# try with title as author
		if isbn is not None:
		    queries.append('per="' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '" AND num="' + isbn + '"')
		else:
		    queries.append('per="' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '"')


	    # as last resort only use isbn
	    if isbn is not None:
		queries.append('num=' + isbn)


	# remove duplicate queries
	uniqueQueries=[]
	for i in queries:
	    if i not in uniqueQueries:
		uniqueQueries.append(i)


	# Process queries
	results = None

	for query in uniqueQueries:
	    # SRU does not work with "+" or "?" characters in query, so we simply remove them
	    query = re.sub('[\+\?]','',query)

	    query = query + ' NOT (mat=film OR mat=music OR mat=microfiches OR cod=tt)'
	    log.info(query)

	    if self.cfg_dnb_token is None:
		results = self.getSearchResultsByScraping(log, query, timeout)
	    else:
		results = self.getSearchResults(log, query, timeout)

	    if results is None:
		continue

	    log.info("Parsing records")

	    ns = { 'marc21' : 'http://www.loc.gov/MARC21/slim' }
	    for record in results:
		series = None
		series_index = None
		publisher = None
		pubdate = None
		languages = []
		title = None
		title_sort = None
		authors = []
		author_sort = None
		edition = None
		comments = None
		idn = None
		urn = None
		isbn = None
		ddc = []
		subjects_gnd = []
		subjects_non_gnd = []
		publisher_name = None
		publisher_location = None


		##### Field 264 #####
		# Publisher Name and Location
		fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..",namespaces=ns)
		if len(fields)>0:
		    publisher_name = fields[0].xpath(".//marc21:subfield[@code='b' and string-length(text())>0]",namespaces=ns)[0].text.strip();
		    publisher_location = fields[0].xpath(".//marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns)[0].text.strip();
		else:
		    fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../..",namespaces=ns)
		    if len(fields)>0:
			publisher_name = fields[0].xpath(".//marc21:subfield[@code='b' and string-length(text())>0]",namespaces=ns)[0].text.strip();
		    else:
			fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='a' and string-length(text())>0]/../..",namespaces=ns)
			if len(fields)>0:
			    publisher_location = fields[0].xpath(".//marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns)[0].text.strip();

		# Publishing Date
		for i in record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='c' and string-length(text())>=4]",namespaces=ns):
		    match = re.search("(\d{4})", i.text.strip())
		    if match is not None:
			year = match.group(1)
			pubdate = datetime.datetime(int(year), 1, 1, 12 , 30, 0)
			break

		# Log
		if publisher_name is not None:
		    log.info("Extracted Publisher: %s" % publisher_name)
		if publisher_location is not None:
		    log.info("Extracted Publisher Location: %s" % publisher_location)
		if pubdate is not None:
		    log.info("Extracted Publication Year: %s" % pubdate)


		##### Field 245 ####
		# Title/Series/Series_Index
		title_parts = []
		for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]/..",namespaces=ns):
		    # if a,n,p,n,p,n,p exist:		series = a + n0 + " - " + p0 + n1 + " - " + p1,	series_index = n2,	title = p2
		    # if a,n,p,n,p exist:		series = a + n0 + " - " + p0, 			series_index = n1,	title = p1	(Example: dnb-id 1008774839)
		    # if a,n,p exist:			series = a,					series_index = n,	title = p
		    # if a exist:												title = a
		    # TODO: a,n,p,n (i.e. 956375146)

		    code_p = []
		    code_n = []
		    code_a = []

		    for j in i.xpath(".//marc21:subfield[@code='p']",namespaces=ns):
			code_p.append(j.text.strip())

		    for j in i.xpath(".//marc21:subfield[@code='n']",namespaces=ns):
			match = re.search("(\d+[,\.\d+]?)", j.text.strip())
			if match:
			    code_n.append(match.group(1))
			else:
			    code_n.append("0")	# looks like sometimes DNB does not know the series index and uses something like "[...]"

		    for j in i.xpath(".//marc21:subfield[@code='a']",namespaces=ns):
			code_a.append(j.text.strip())

		    if len(code_p) == 0:
			title_parts = title_parts + code_a

		    elif len(code_p)>0 and len(code_p) == len(code_n):
			series = " : ".join(code_a)	# I've never seen more than one code_a, but who knows...
			for i in range (0,len(code_p)-1):
			    series = series + " " + code_n[i] + " " + code_p[i]
			series_index = code_n[-1]
			title_parts.append(code_p[-1])


		# subtitle 1: Field 245
		for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='b' and string-length(text())>0]",namespaces=ns):
		    title_parts.append(i.text.strip())
		    break
		
		# subtitle 2
		#for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='c' and string-length(text())>0]",namespaces=ns):
		#    title = title + " / " + i.text.strip()
		#    break

		title = " : ".join(title_parts)

		# Log
		if series_index is not None:
		    log.info("Extracted Series_Index from Field 245: %s" % series_index)
		if series is not None:
		    log.info("Extracted Series from Field 245: %s" % series)
		    series = self.cleanUpSeries(log, series, publisher_name)
		if title is not None:
		    log.info("Extracted Title: %s" % title)
		    title = self.cleanUpTitle(log, title)

		# Title_Sort
		if len(title_parts)>0:
		    title_sort_parts = list(title_parts)
		    title_sort_regex = re.match('^(.*?)('+chr(152)+'.*'+chr(156)+')?(.*?)$',title_parts[0])
		    sortword = title_sort_regex.group(2)
		    if sortword:
			title_sort_parts[0] = ''.join(filter(None,[title_sort_regex.group(1).strip(),title_sort_regex.group(3).strip(),", "+sortword]))
		    title_sort = " : ".join(title_sort_parts)

		# Log
		if title_sort is not None:
		    log.info("Extracted Title_Sort: %s" % title_sort)


		##### Field 100 and Field 700 #####
		# Authors
		for i in record.xpath(".//marc21:datafield[@tag='100']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):	# primary authors
		    name = re.sub(" \[.*\]$","",i.text.strip());
		    authors.append(name)
		for i in record.xpath(".//marc21:datafield[@tag='700']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):	# secondary authors
		    name = re.sub(" \[.*\]$","",i.text.strip());
		    authors.append(name)
		if len(authors)==0:	# if no "real" autor was found take all persons involved
		    for i in record.xpath(".//marc21:datafield[@tag='700']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):	# secondary authors
			name = re.sub(" \[.*\]$","",i.text.strip());
			authors.append(name)
		if len(authors)>0:
		    author_sort = authors[0]

		# Log
		if len(authors)>0:
		    log.info("Extracted Authors: %s" % " & ".join(authors))
		if author_sort is not None:
		    log.info("Extracted Author_Sort: %s" % " & ".join(authors))


		##### Field 856 #####
		# Comments
		for i in record.xpath(".//marc21:datafield[@tag='856']/marc21:subfield[@code='u' and string-length(text())>0]",namespaces=ns):
		    if i.text.startswith("http://deposit.dnb.de/"):
			br = self.browser
			log.info('Downloading Comments from: %s' % i.text)
			try:
			    comments = br.open_novisit(i.text, timeout=30).read()
			    comments = re.sub('(\s|<br>|<p>|\n)*Angaben aus der Verlagsmeldung(\s|<br>|<p>|\n)*(<h3>.*?</h3>)?(\s|<br>|<p>|\n)*','',comments,flags=re.IGNORECASE)
			    comments = sanitize_comments_html(comments)
			    break
			except:
			    log.info("Could not download Comments from %s" % i)

		# Log
		if comments is not None:
		    log.info('Comments: %s' % comments)

		# If no comments are found for this edition, look at other editions of this book (Fields 776)
		# TODO: Make this configurable (default: yes)
		if comments is None:
		    # get all other issues
		    for i in record.xpath(".//marc21:datafield[@tag='776']/marc21:subfield[@code='w' and string-length(text())>0]",namespaces=ns):
			other_idn = re.sub("^\(.*\)","",i.text.strip());
			subquery = 'num='+other_idn+' NOT (mat=film OR mat=music OR mat=microfiches OR cod=tt)'
			log.info(subquery)

			if self.cfg_dnb_token is None:
			    subresults = self.getSearchResultsByScraping(log, subquery, timeout)
			else:
			    subresults = self.getSearchResults(log, subquery, timeout)

			if subresults is None:
			    continue

			for subrecord in subresults:
			    for i in subrecord.xpath(".//marc21:datafield[@tag='856']/marc21:subfield[@code='u' and string-length(text())>0]",namespaces=ns):
				if i.text.startswith("http://deposit.dnb.de/"):
				    br = self.browser
				    log.info('Downloading Comments from: %s' % i.text)
				    try:
					comments = br.open_novisit(i.text, timeout=30).read()
					comments = re.sub('(\s|<br>|<p>|\n)*Angaben aus der Verlagsmeldung(\s|<br>|<p>|\n)*(<h3>.*?</h3>)?(\s|<br>|<p>|\n)*','',comments,flags=re.IGNORECASE)
					comments = sanitize_comments_html(comments)
					break
				    except:
					log.info("Could not download Comments from %s" % i)
			    if comments is not None:
				log.info('Comments from other issue: %s' % comments)
				break


		##### Field 16 #####
		# ID: IDN
		for i in record.xpath(".//marc21:datafield[@tag='016']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
		    idn = i.text.strip()
		    break
		# Log
		if idn is not None:
		    log.info("Extracted ID IDN: %s" % idn)


		##### Field 24 #####
		# ID: URN
		for i in record.xpath(".//marc21:datafield[@tag='024']/marc21:subfield[@code='2' and text()='urn']/../marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
		    urn = i.text.strip()
		    break

		# Log
		if urn is not None:
		    log.info("Extracted ID URN: %s" % urn)


		##### Field 20 #####
		# ID: ISBN
		for i in record.xpath(".//marc21:datafield[@tag='020']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
		    isbn_regex = "(?:ISBN(?:-1[03])?:? )?(?=[-0-9 ]{17}|[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?(?:[0-9]+[- ]?){2}[0-9X]"
		    match = re.search(isbn_regex, i.text.strip())
		    isbn = match.group()
		    isbn = isbn.replace('-','')
		    break

		# Log
		if isbn is not None:
		    log.info("Extracted ID ISBN: %s" % isbn)

		# When doing an exact search for a given ISBN skip books with wrong ISBNs
		if isbn is not None and "isbn" in exact_search:
		    if isbn != exact_search["isbn"]:
			log.info("Extracted ISBN does not match book's ISBN, skipping record")
			continue


		##### Field 82 #####
		# ID: Sachgruppe (DDC)
		for i in record.xpath(".//marc21:datafield[@tag='082']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
		    ddc.append(i.text.strip())

		# Log
		if len(ddc)>0:
		    log.info("Extracted ID DDC: %s" % ",".join(ddc))


		##### Field 490 #####
		# In theory this field is not used for "real" book series, use field 830 instead. But it is used.
		# Series and Series_Index
		if series is None or (series is not None and series_index == "0"):
		    for i in record.xpath(".//marc21:datafield[@tag='490']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..",namespaces=ns):
			# "v" either "Nr. 220" or "This great Seriestitle : Nr. 220" - if available use this instead of attribute a
			attr_v = i.xpath(".//marc21:subfield[@code='v']",namespaces=ns)[0].text.strip()
			parts = re.split(" : ",attr_v)
			if len(parts)==2:
			    if bool(re.search("\d",parts[0])) != bool(re.search("\d",parts[1])):
				# figure out which part contains the index
				if bool(re.search("\d",parts[0])):
				    indexpart = parts[0]
				    textpart = parts[1]
				else:
				    indexpart = parts[1]
				    textpart = parts[0]

				match = re.search("(\d+[,\.\d+]?)", indexpart)
				if match is not None:
				    series_index = match.group(1)
				    series = textpart.strip()

			else:
			    match = re.search("(\d+[,\.\d+]?)", attr_v)
			    if match is not None:
				series_index = match.group(1)
			    else:
				series_index = "0"

			series_index = series_index.replace(',','.')

			# Use Series Name from attribute "a" if not already found in attribute "v"
			if series is None:
			    series = i.xpath(".//marc21:subfield[@code='a']",namespaces=ns)[0].text.strip()

			# Log
			if series_index is not None:
			    log.info("Extracted Series Index from Field 490: %s" % series_index)
			if series is not None:
			    log.info("Extracted Series from Field 490: %s" % series)
			    series = self.cleanUpSeries(log, series, publisher_name)
			if series is not None:
			    break


		##### Field 246 #####
		# Series and Series_Index
		if series is None or (series is not None and series_index == "0"):
		    for i in record.xpath(".//marc21:datafield[@tag='246']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
			match = re.search("^(.+?) ; (\d+[,\.\d+]?)$", i.text.strip())
			if match is not None:
			    series = match.group(1)
			    series_index = match.group(2)

			    # Log
			    if series_index is not None:
				log.info("Extracted Series Index from Field 246: %s" % series_index)
			    if series is not None:
				log.info("Extracted Series from Field 246: %s" % series)
				series = self.cleanUpSeries(log, series, publisher_name)
			    if series is not None:
				break

		##### Field 800 #####
		# Series and Series_Index
		if series is None or (series is not None and series_index == "0"):
		    for i in record.xpath(".//marc21:datafield[@tag='800']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='t' and string-length(text())>0]/..",namespaces=ns):
			# Series Index
			series_index = i.xpath(".//marc21:subfield[@code='v']",namespaces=ns)[0].text.strip()
			match = re.search("(\d+[,\.\d+]?)", series_index)
			if match is not None:
			    series_index = match.group(1)
			else:
			    series_index = "0"
			series_index = series_index.replace(',','.')
			# Series
			series = i.xpath(".//marc21:subfield[@code='t']",namespaces=ns)[0].text.strip()

			# Log
			if series_index is not None:
			    log.info("Extracted Series Index from Field 800: %s" % series_index)
			if series is not None:
			    log.info("Extracted Series from Field 800: %s" % series)
			    series = self.cleanUpSeries(log, series, publisher_name)
			if series is not None:
			    break


		##### Field 830 #####
		# Series and Series_Index
		if series is None or (series is not None and series_index == "0"):
		    for i in record.xpath(".//marc21:datafield[@tag='830']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..",namespaces=ns):
			# Series Index
			series_index = i.xpath(".//marc21:subfield[@code='v']",namespaces=ns)[0].text.strip()
			match = re.search("(\d+[,\.\d+]?)", series_index)
			if match is not None:
			    series_index = match.group(1)
			else:
			    series_index = "0"
			series_index = series_index.replace(',','.')
			# Series
			series = i.xpath(".//marc21:subfield[@code='a']",namespaces=ns)[0].text.strip()

			# Log
			if series_index is not None:
			    log.info("Extracted Series Index from Field 830: %s" % series_index)
			if series is not None:
			    log.info("Extracted Series from Field 830: %s" % series)
			    series = self.cleanUpSeries(log, series, publisher_name)
			if series is not None:
			    break


		##### Field 689 #####
		# GND Subjects
		for i in record.xpath(".//marc21:datafield[@tag='689']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
		    subjects_gnd.append(i.text.strip())
		for f in range(600,656):
		    for i in record.xpath(".//marc21:datafield[@tag='"+str(f)+"']/marc21:subfield[@code='2' and text()='gnd']/../marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
			if i.text.startswith("("):
			    continue
			subjects_gnd.append(i.text)

		# Log
		if len(subjects_gnd)>0:
		    log.info("Extracted GND Subjects: %s" % " ".join(subjects_gnd))


		##### Fields 600-655 #####
		# Non-GND subjects
		for f in range(600,656):
		    for i in record.xpath(".//marc21:datafield[@tag='"+str(f)+"']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
			# ignore entries starting with "(":
			if i.text.startswith("("):
			    continue
			subjects_non_gnd.extend(re.split(',|;',i.text))
		# remove one-character subjects:
		for i in subjects_non_gnd:
		    if len(i)<2:
			subjects_non_gnd.remove(i)

		# Log
		if len(subjects_non_gnd)>0:
		    log.info("Extracted non-GND Subjects: %s" % " ".join(subjects_non_gnd))


		##### Field 250 #####
		# Edition
		for i in record.xpath(".//marc21:datafield[@tag='250']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
		    edition = i.text.strip()
		    break

		# Log
		if edition is not None:
		    log.info("Extracted Edition: %s" % edition)


		##### Field 41 #####
		# Languages
		for i in record.xpath(".//marc21:datafield[@tag='041']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
		    languages.append(i.text.strip())

		# Log
		if languages is not None:
		    log.info("Extracted Languages: %s" % ",".join(languages))


		##### If configured: Try to separate Series, Series Index and Title from the fetched title #####
		#if self.cfg_guess_series is True:
		if (series is None or (series is not None and series_index == "0")) and self.cfg_guess_series is True:
		    guessed_series = None
		    guessed_series_index = None
		    guessed_title = None

		    log.info("Starting Series Guesser")

		    parts = re.split("[:]",self.removeSortingCharacters(title))

		    if len(parts)==2:
			log.info("Title has two parts")
			# make sure only one part of the two parts contains digits
			if bool(re.search("\d",parts[0])) != bool(re.search("\d",parts[1])):
			    log.info("only one title part contains digits")
			    # figure out which part contains the index
			    if bool(re.search("\d",parts[0])):
				indexpart = parts[0]
				textpart = parts[1]
			    else:
				indexpart = parts[1]
				textpart = parts[0]

			    # Look at the part without digits:
			    match = re.match("^[\s\-–:]*(.+?)[\s\-–:]*$",textpart)	# remove odd characters from start and end of the text part
			    if match:
				textpart = match.group(1)

			    # Look at the part with digits:
			    # for Titleparts like: "Name of the series - Episode 2"
			    match = re.match("^\s*(\S\D*?[a-zA-Z]\D*?)\W[\(\/\.,\s\-–:]*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",indexpart)
			    if match:
				guessed_series_index = match.group(2)
				guessed_series = match.group(1)
				if guessed_series is None:
				    guessed_series = textpart
				    guessed_title = textpart + " : Band " + guessed_series_index
				else:
				    guessed_title = textpart

				#log.info("ALGO1: guessed_title: " + guessed_title)
				#log.info("ALGO1: guessed_series: " + guessed_series)
				#log.info("ALGO1: guessed_series_index: " + guessed_series_index)

			    else:
				# for Titleparts like: "Episode 2 Name of the series"
				match = re.match("^\s*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*(\S\D*?[a-zA-Z]\D*?)[\/\.,\-–\s]*$",indexpart)
				if match:
				    guessed_series_index = match.group(1)
				    guessed_series = match.group(2)

				    if guessed_series is None:
					# sometimes books with multiple volumes are detected as series without name -> Add the volume to the title 
					guessed_series = textpart
					guessed_title = textpart + " : Band " + guessed_series_index
				    else:
					guessed_title = textpart

				    #log.info("ALGO2: guessed_title: " + guessed_title)
				    #log.info("ALGO2: guessed_series: " + guessed_series)
				    #log.info("ALGO2: guessed_series_index: " + guessed_series_index)

				else:
				    # for titleparts like: "Band 2"
				    match = re.match("^[\s\(]*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*[\/\.,\-–\s]*$",indexpart)
				    if match:
					guessed_series_index = match.group(1)
					# ...with textpart like NAME OF SERIES\s[\-\.;:]\sNAME OF TITLE
					# some false positives
					match = re.match("^\s*(\w+.+?)\s?[\.;\-–:]+\s(\w+.+)\s*$",textpart)
					if match:
					    guessed_series = match.group(1)
					    guessed_title = match.group(2)

					    log.info("ALGO3: guessed_title: " + guessed_title)
					    log.info("ALGO3: guessed_series: " + guessed_series)
					    log.info("ALGO3: guessed_series_index: " + guessed_series_index)


		    elif len(parts)==1:
			log.info("Title has one part")
			# for Titles like: "Name of the series - Title (Episode 2)"
			match = re.match("^\s*(\S.+?) \- (\S.+?) [\(\/\.,\s\-–:](?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",parts[0])
			if match:
			    guessed_series_index = match.group(3)
			    guessed_series = match.group(1)
			    guessed_title = match.group(2)

			    #log.info("ALGO4: guessed_title: " + guessed_title)
			    #log.info("ALGO4: guessed_series: " + guessed_series)
			    #log.info("ALGO4: guessed_series_index: " + guessed_series_index)

			else:
			    # for Titles like: "Name of the series - Episode 2"
			    match = re.match("^\s*(\S.+?)[\(\/\.,\s\-–:]*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",parts[0])
			    if match:
				guessed_series_index = match.group(2)
				guessed_series = match.group(1)
				guessed_title = guessed_series + " : Band " + guessed_series_index

				#log.info("ALGO5: guessed_title: " + guessed_title)
				#log.info("ALGO5: guessed_series: " + guessed_series)
				#log.info("ALGO5: guessed_series_index: " + guessed_series_index)

		    # Log
		    if guessed_series is not None:
			log.info("Guessed Series: %s" % guessed_series)
			#guessed_series = self.cleanUpSeries(log, guessed_series, publisher_name)
		    if guessed_series_index is not None:
			log.info("Guessed Series Index: %s" % guessed_series_index)
		    if guessed_title is not None:
			log.info("Guessed Title: %s" % guessed_title)
			guessed_title = self.cleanUpTitle(log, guessed_title)

		    if guessed_series is not None and guessed_series_index is not None and guessed_title is not None:
			title = guessed_title
			series = guessed_series
			series_index = guessed_series_index


		##### Filter exact searches #####
		# When doing an exact search for a given IDN skip books with wrong IDNs
		# TODO: Currently exact_search for ISBN is not implemented. Would require ISBN-10 and ISBN-13 conversions
		if idn is not None and "idn" in exact_search:
		    if idn != exact_search["idn"]:
			log.info("Extracted IDN does not match book's IDN, skipping record")
			continue

		##### Put it all together #####
		if self.cfg_append_edition_to_title == True and edition is not None:
		    title = title + " : " + edition

		mi = Metadata(self.removeSortingCharacters(title), map(lambda i: self.removeSortingCharacters(i), authors))
		mi.title_sort = self.removeSortingCharacters(title_sort)
		mi.author_sort = self.removeSortingCharacters(author_sort)
		mi.languages = languages
		mi.pubdate = pubdate
		mi.publisher = " ; ".join(filter(None,[publisher_location, self.removeSortingCharacters(publisher_name)]))
		mi.series = self.removeSortingCharacters(series)
		mi.series_index = series_index
		mi.comments = comments
		mi.isbn = isbn # also required for cover download
		mi.set_identifier('urn',urn)
		mi.set_identifier('dnb-idn',idn)
		mi.set_identifier('ddc', ",".join(ddc))

		# cfg_subjects:
		# 0: use only subjects_gnd
		if self.cfg_fetch_subjects == 0:
		    mi.tags = self.uniq(subjects_gnd)
		# 1: use only subjects_gnd if found, else subjects_non_gnd
		elif self.cfg_fetch_subjects == 1:
		    if len(subjects_gnd)>0:
			mi.tags = self.uniq(subjects_gnd)
		    else:
			mi.tags = self.uniq(subjects_non_gnd)
		# 2: subjects_gnd and subjects_non_gnd
		elif self.cfg_fetch_subjects == 2:
		    mi.tags = self.uniq(subjects_gnd + subjects_non_gnd)
		# 3: use only subjects_non_gnd if found, else subjects_gnd
		elif self.cfg_fetch_subjects == 3:
		    if len(subjects_non_gnd)>0:
			mi.tags = self.uniq(subjects_non_gnd)
		    else:
			mi.tags = self.uniq(subjects_gnd)
		# 4: use only subjects_non_gnd
		elif self.cfg_fetch_subjects == 4:
		    mi.tags = self.uniq(subjects_non_gnd)
		# 5: use no subjects at all
		elif self.cfg_fetch_subjects == 5:
		    mi.tags = []

		# put current result's metdata into result queue
		log.info("Final formatted result: \n%s" % mi)
		result_queue.put(mi)
Example No. 23
#!/usr/bin/env python
Example No. 24
def do_download_for_worker(book,options,merge,notification=lambda x,y:x):
    '''
    Child job, to download story when run as a worker job
    '''

    from calibre_plugins.fanficfare_plugin import FanFicFareBase
    fffbase = FanFicFareBase(options['plugin_path'])
    with fffbase:
        
        from calibre_plugins.fanficfare_plugin.dialogs import (NotGoingToDownload,
                OVERWRITE, OVERWRITEALWAYS, UPDATE, UPDATEALWAYS, ADDNEW, SKIP, CALIBREONLY, CALIBREONLYSAVECOL)
        from calibre_plugins.fanficfare_plugin.fanficfare import adapters, writers, exceptions
        from calibre_plugins.fanficfare_plugin.fanficfare.epubutils import get_update_data
        
        from calibre_plugins.fanficfare_plugin.fff_util import (get_fff_adapter, get_fff_config)
        
        try:
            book['comment'] = _('Download started...')
            
            configuration = get_fff_config(book['url'],
                                            options['fileform'],
                                            options['personal.ini'])

            if configuration.getConfig('use_ssl_unverified_context'):
                ## monkey patch to avoid SSL bug.  duplicated from
                ## fff_plugin.py because bg jobs run in own process
                ## space.
                import ssl
                if hasattr(ssl, '_create_unverified_context'):
                    ssl._create_default_https_context = ssl._create_unverified_context
    
            if not options['updateepubcover'] and 'epub_for_update' in book and options['collision'] in (UPDATE, UPDATEALWAYS):
                configuration.set("overrides","never_make_cover","true")
    
            # images only for epub, html, even if the user mistakenly
            # turned it on elsewhere.
            if options['fileform'] not in ("epub","html"):
                configuration.set("overrides","include_images","false")
            
            adapter = adapters.getAdapter(configuration,book['url'])
            adapter.is_adult = book['is_adult'] 
            adapter.username = book['username'] 
            adapter.password = book['password']
            adapter.setChaptersRange(book['begin'],book['end'])
            
            adapter.load_cookiejar(options['cookiejarfile'])
            #logger.debug("cookiejar:%s"%adapter.cookiejar)
            adapter.set_pagecache(options['pagecache'])
            
            story = adapter.getStoryMetadataOnly()
            if 'calibre_series' in book:
                adapter.setSeries(book['calibre_series'][0],book['calibre_series'][1])
                
            # set PI version instead of default.
            if 'version' in options:
                story.setMetadata('version',options['version'])
                
            book['title'] = story.getMetadata("title", removeallentities=True)
            book['author_sort'] = book['author'] = story.getList("author", removeallentities=True)
            book['publisher'] = story.getMetadata("site")
            book['url'] = story.getMetadata("storyUrl")
            book['tags'] = story.getSubjectTags(removeallentities=True)
            if story.getMetadata("description"):
                book['comments'] = sanitize_comments_html(story.getMetadata("description"))
            else:
                book['comments']=''
            book['series'] = story.getMetadata("series", removeallentities=True)
    
            if story.getMetadataRaw('datePublished'):
                book['pubdate'] = story.getMetadataRaw('datePublished').replace(tzinfo=local_tz)
            if story.getMetadataRaw('dateUpdated'):
                book['updatedate'] = story.getMetadataRaw('dateUpdated').replace(tzinfo=local_tz)
            if story.getMetadataRaw('dateCreated'):
                book['timestamp'] = story.getMetadataRaw('dateCreated').replace(tzinfo=local_tz)
            else:
                book['timestamp'] = None # need *something* there for calibre.
                
            writer = writers.getWriter(options['fileform'],configuration,adapter)
    
            outfile = book['outfile']
    
            ## No need to download at all.  Shouldn't ever get down here.
            if options['collision'] in (CALIBREONLY, CALIBREONLYSAVECOL):
                logger.info("Skipping CALIBREONLY 'update' down inside worker--this shouldn't be happening...")
                book['comment'] = 'Metadata collected.'
                book['all_metadata'] = story.getAllMetadata(removeallentities=True)
                if options['savemetacol'] != '':
                    book['savemetacol'] = story.dump_html_metadata()
                
            ## checks were done earlier, it's new or not dup or newer--just write it.
            elif options['collision'] in (ADDNEW, SKIP, OVERWRITE, OVERWRITEALWAYS) or \
                    ('epub_for_update' not in book and options['collision'] in (UPDATE, UPDATEALWAYS)):
    
                # preserve logfile even on overwrite.
                if 'epub_for_update' in book:
                    adapter.logfile = get_update_data(book['epub_for_update'])[6]
                    # change the existing entries id to notid so
                    # write_epub writes a whole new set to indicate overwrite.
                    if adapter.logfile:
                        adapter.logfile = adapter.logfile.replace("span id","span notid")

                if options['collision'] == OVERWRITE and 'fileupdated' in book:
                    lastupdated=story.getMetadataRaw('dateUpdated')
                    fileupdated=book['fileupdated']

                    # updated doesn't have time (or is midnight), use dates only.
                    # updated does have time, use full timestamps.
                    if (lastupdated.time() == time.min and fileupdated.date() > lastupdated.date()) or \
                            (lastupdated.time() != time.min and fileupdated > lastupdated):
                        raise NotGoingToDownload(_("Not Overwriting, web site is not newer."),'edit-undo.png')

                
                logger.info("write to %s"%outfile)
                inject_cal_cols(book,story,configuration)
                writer.writeStory(outfilename=outfile, forceOverwrite=True)
                
                book['comment'] = 'Download %s completed, %s chapters.'%(options['fileform'],story.getMetadata("numChapters"))
                book['all_metadata'] = story.getAllMetadata(removeallentities=True)
                if options['savemetacol'] != '':
                    book['savemetacol'] = story.dump_html_metadata()
                
            ## checks were done earlier, just update it.
            elif 'epub_for_update' in book and options['collision'] in (UPDATE, UPDATEALWAYS):
    
                # update now handled by pre-populating the old images and
                # chapters in the adapter rather than merging epubs.
                urlchaptercount = int(story.getMetadata('numChapters').replace(',',''))
                (url,
                 chaptercount,
                 adapter.oldchapters,
                 adapter.oldimgs,
                 adapter.oldcover,
                 adapter.calibrebookmark,
                 adapter.logfile,
                 adapter.oldchaptersmap,
                 adapter.oldchaptersdata) = get_update_data(book['epub_for_update'])[0:9]
    
                # dup handling from fff_plugin needed for anthology updates.
                if options['collision'] == UPDATE:
                    if chaptercount == urlchaptercount:
                        if merge:
                            book['comment']=_("Already contains %d chapters.  Reuse as is.")%chaptercount
                            book['all_metadata'] = story.getAllMetadata(removeallentities=True)
                            if options['savemetacol'] != '':
                                book['savemetacol'] = story.dump_html_metadata()
                            book['outfile'] = book['epub_for_update'] # for anthology merge ops.
                            return book
                        else: # not merge,
                            raise NotGoingToDownload(_("Already contains %d chapters.")%chaptercount,'edit-undo.png')
                    elif chaptercount > urlchaptercount:
                        raise NotGoingToDownload(_("Existing epub contains %d chapters, web site only has %d. Use Overwrite to force update.") % (chaptercount,urlchaptercount),'dialog_error.png')
                    elif chaptercount == 0:
                        raise NotGoingToDownload(_("FanFicFare doesn't recognize chapters in existing epub, epub is probably from a different source. Use Overwrite to force update."),'dialog_error.png')
                    
                if not (options['collision'] == UPDATEALWAYS and chaptercount == urlchaptercount) \
                        and adapter.getConfig("do_update_hook"):
                    chaptercount = adapter.hookForUpdates(chaptercount)
    
                logger.info("Do update - epub(%d) vs url(%d)" % (chaptercount, urlchaptercount))
                logger.info("write to %s"%outfile)
    
                inject_cal_cols(book,story,configuration)
                writer.writeStory(outfilename=outfile, forceOverwrite=True)
                
                book['comment'] = _('Update %s completed, added %s chapters for %s total.')%\
                    (options['fileform'],(urlchaptercount-chaptercount),urlchaptercount)
                book['all_metadata'] = story.getAllMetadata(removeallentities=True)
                if options['savemetacol'] != '':
                    book['savemetacol'] = story.dump_html_metadata()

            if options['do_wordcount'] == SAVE_YES or (
                options['do_wordcount'] == SAVE_YES_UNLESS_SITE and not story.getMetadataRaw('numWords') ):
                wordcount = get_word_count(outfile)
                logger.info("get_word_count:%s"%wordcount)
                story.setMetadata('numWords',wordcount)
                writer.writeStory(outfilename=outfile, forceOverwrite=True)
                book['all_metadata'] = story.getAllMetadata(removeallentities=True)
                if options['savemetacol'] != '':
                    book['savemetacol'] = story.dump_html_metadata()
                
            if options['smarten_punctuation'] and options['fileform'] == "epub" \
                    and calibre_version >= (0, 9, 39):
                # for smarten punc
                from calibre.ebooks.oeb.polish.main import polish, ALL_OPTS
                from calibre.utils.logging import Log
                from collections import namedtuple

                # do smarten_punctuation from calibre's polish feature
                data = {'smarten_punctuation':True}
                opts = ALL_OPTS.copy()
                opts.update(data)
                O = namedtuple('Options', ' '.join(ALL_OPTS.iterkeys()))
                opts = O(**opts)
                
                log = Log(level=Log.DEBUG)
                polish({outfile:outfile}, opts, log, logger.info)
            
        except NotGoingToDownload as d:
            book['good']=False
            book['comment']=unicode(d)
            book['icon'] = d.icon
    
        except Exception as e:
            book['good']=False
            book['comment']=unicode(e)
            book['icon']='dialog_error.png'
            book['status'] = 'Error'
            logger.info("Exception: %s:%s"%(book,unicode(e)))
            traceback.print_exc()
            
        #time.sleep(10)
    return book
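
The OVERWRITE branch above compares the site's last-update timestamp against the existing file's timestamp, falling back to bare dates when the site value has no time-of-day component. A minimal sketch of that rule with a hypothetical helper and sample values (not part of the plugin):

from datetime import datetime, time

def overwrite_allowed(lastupdated, fileupdated):
    # Hypothetical helper mirroring the OVERWRITE check above: when the
    # site's dateUpdated carries no time-of-day (midnight), compare dates
    # only; otherwise compare the full timestamps.
    if lastupdated.time() == time.min:
        return lastupdated.date() >= fileupdated.date()
    return lastupdated >= fileupdated

# A date-only (midnight) site timestamp on the same day as the local file
# still allows the overwrite:
print(overwrite_allowed(datetime(2020, 5, 1), datetime(2020, 5, 1, 18, 30)))  # True
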
Exemplo n.º 25
0
 def test_markdown(self):
     from calibre.ebooks.markdown import Markdown
     Markdown(extensions=['extra'])
     from calibre.library.comments import sanitize_comments_html
     sanitize_comments_html(b'''<script>moo</script>xxx<img src="http://moo.com/x.jpg">''')
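
The test above only exercises sanitize_comments_html with bytes containing markup that should not survive sanitization. As a hedged usage sketch (the exact output markup differs between calibre versions), the same call inside a calibre-debug session looks like:

from calibre.library.comments import sanitize_comments_html

raw = b'<script>alert(1)</script><p>A short description.</p>'
clean = sanitize_comments_html(raw)
# The sanitizer is expected to drop active content such as the <script>
# element and return an HTML fragment that is safe to store in mi.comments.
print(clean)
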
    def parse_comments(self, root):
        # <!-- 책소개-->
        # aladin uses other request for description and toc.
        # http://www.aladin.co.kr/shop/product/getContents.aspx?ISBN=8970122648&name=Introduce&type=0&date=16

        urlDesc = "http://www.aladin.co.kr/shop/product/getContents.aspx?ISBN=%s&name=Introduce&type=0&date=%s" % (
            self.isbn,
            datetime.datetime.now().hour,
        )

        # TODO: foreign book description
        # Publisher-provided description (출판사 제공 책소개) - foreign books,
        # http://www.aladin.co.kr/shop/product/getContents.aspx?ISBN=0385340583&name=PublisherDesc&type=0&date=15

        comments = ""
        toc = ""

        try:
            self.browser.addheaders = [("Referer", self.url)]
            rawDesc = self.browser.open_novisit(urlDesc, timeout=self.timeout).read().strip()
        except Exception as e:
            if callable(getattr(e, "getcode", None)) and e.getcode() == 404:
                self.log.error("URL malformed: %r" % urlDesc)
            else:
                attr = getattr(e, "args", [None])
                attr = attr if attr else [None]
                if isinstance(attr[0], socket.timeout):
                    msg = "Aladin timed out. Try again later."
                    self.log.error(msg)
                else:
                    msg = "Failed to make Descrpitions query: %r" % urlDesc
                    self.log.exception(msg)

        if rawDesc:
            try:
                # rawDesc = rawDesc.decode('euc-kr', errors='replace')
                # 2015-03-19 22:26:51
                rawDesc = rawDesc.decode("utf-8", errors="replace")
                rootDesc = fromstring(clean_ascii_chars(rawDesc))
                nodeDesc = rootDesc.xpath('//div[@class="p_textbox"]')
                if nodeDesc:
                    self._removeTags(nodeDesc[0], ["object", "script", "style"])
                    comments = tostring(nodeDesc[0], method="html")
            except:
                msg = "Failed to parse aladin details page: %r" % urlDesc
                self.log.exception(msg)

            default_append_toc = cfg.DEFAULT_STORE_VALUES[cfg.KEY_APPEND_TOC]
            append_toc = cfg.plugin_prefs[cfg.STORE_NAME].get(cfg.KEY_APPEND_TOC, default_append_toc)

            if rootDesc and append_toc:
                toc_node = rootDesc.xpath('//div[@id="div_TOC_All"]//p')
                if not toc_node:
                    toc_node = rootDesc.xpath('//div[@id="div_TOC_Short"]//p')
                if toc_node:
                    toc = tostring(toc_node[0], method="html")
                    toc = sanitize_comments_html(toc)
        if not comments:
            # Look for description in a meta
            description_node = root.xpath('//meta[@name="Description"]/@content')
            if description_node:
                # return description_node[0]
                comments = description_node[0]
        if comments:
            comments = '<div id="comments">' + comments + "</div>"
        if toc:
            comments += '<h3>[목차]</h3><div id="toc">' + toc + "</div>"
        if comments:
            comments_suffix = cfg.DEFAULT_STORE_VALUES[cfg.KEY_COMMENTS_SUFFIX]
            comments_suffix = cfg.plugin_prefs[cfg.STORE_NAME].get(cfg.KEY_COMMENTS_SUFFIX, comments_suffix)
            # comments += '<hr /><div><div style="float:right">[aladin.co.kr]</div></div>'
            if comments_suffix:
                comments += comments_suffix
        return comments
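
_removeTags above is a helper of the Aladin plugin itself and is not shown in this listing. A minimal lxml sketch of what such a helper is assumed to do (the function name here is made up for illustration):

from lxml.html import fromstring, tostring

def remove_tags(node, tag_names):
    # Assumed stand-in for the plugin's self._removeTags(): drop whole
    # subtrees for the given tag names before serialising the description.
    for tag in tag_names:
        for el in node.xpath('.//%s' % tag):
            el.getparent().remove(el)

root = fromstring('<div class="p_textbox"><p>Intro</p><script>x()</script></div>')
remove_tags(root, ["object", "script", "style"])
print(tostring(root, method='html'))  # the <script> subtree is gone
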
Exemplo n.º 27
0
            # print "ebook isbn is "+str(ebook_isbn[0])
            isbn = check_isbn(ebook_isbn[0].strip())
            if isbn:
                self.cache_isbn_to_identifier(isbn, ovrdrv_id)
                mi.isbn = isbn
        if subjects:
            mi.tags = [tag.strip() for tag in subjects[0].split(',')]

        if desc:
            desc = desc[0]
            desc = html.tostring(desc, method='html', encoding=unicode).strip()
            # remove all attributes from tags
            desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
            # Remove comments
            desc = re.sub(r'(?s)<!--.*?-->', '', desc)
            mi.comments = sanitize_comments_html(desc)

        return None


if __name__ == '__main__':
    # To run these tests use:
    # calibre-debug -e src/calibre/ebooks/metadata/sources/overdrive.py
    from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
                                                      title_test, authors_test)
    test_identify_plugin(OverDrive.name, [
        ({
            'title': 'The Sea Kings Daughter',
            'authors': ['Elizabeth Peters']
        }, [
            title_test('The Sea Kings Daughter', exact=False),
Exemplo n.º 28
0
            # print "ebook isbn is "+str(ebook_isbn[0])
            isbn = check_isbn(ebook_isbn[0].strip())
            if isbn:
                self.cache_isbn_to_identifier(isbn, ovrdrv_id)
                mi.isbn = isbn
        if subjects:
            mi.tags = [tag.strip() for tag in subjects[0].split(',')]

        if desc:
            desc = desc[0]
            desc = html.tostring(desc, method='html', encoding=unicode).strip()
            # remove all attributes from tags
            desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
            # Remove comments
            desc = re.sub(r'(?s)<!--.*?-->', '', desc)
            mi.comments = sanitize_comments_html(desc)

        return None


if __name__ == '__main__':
    # To run these tests use:
    # calibre-debug -e src/calibre/ebooks/metadata/sources/overdrive.py
    from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
            title_test, authors_test)
    test_identify_plugin(OverDrive.name,
        [

            (
                {'title':'The Sea Kings Daughter',
                    'authors':['Elizabeth Peters']},
Exemplo n.º 29
0
def do_download_for_worker(book, options, merge, notification=lambda x, y: x):
    '''
    Child job, to download story when run as a worker job
    '''

    from calibre_plugins.fanficfare_plugin import FanFicFareBase
    fffbase = FanFicFareBase(options['plugin_path'])
    with fffbase:

        from calibre_plugins.fanficfare_plugin.dialogs import (
            NotGoingToDownload, OVERWRITE, OVERWRITEALWAYS, UPDATE,
            UPDATEALWAYS, ADDNEW, SKIP, CALIBREONLY, CALIBREONLYSAVECOL)
        from calibre_plugins.fanficfare_plugin.fanficfare import adapters, writers, exceptions
        from calibre_plugins.fanficfare_plugin.fanficfare.epubutils import get_update_data

        from calibre_plugins.fanficfare_plugin.fff_util import (
            get_fff_adapter, get_fff_config)

        try:
            book['comment'] = _('Download started...')

            configuration = get_fff_config(book['url'], options['fileform'],
                                           options['personal.ini'])

            if configuration.getConfig('use_ssl_unverified_context'):
                ## monkey patch to avoid SSL bug.  duplicated from
                ## fff_plugin.py because bg jobs run in own process
                ## space.
                import ssl
                if hasattr(ssl, '_create_unverified_context'):
                    ssl._create_default_https_context = ssl._create_unverified_context

            if not options[
                    'updateepubcover'] and 'epub_for_update' in book and options[
                        'collision'] in (UPDATE, UPDATEALWAYS):
                configuration.set("overrides", "never_make_cover", "true")

            # images only for epub, html, even if the user mistakenly
            # turned it on elsewhere.
            if options['fileform'] not in ("epub", "html"):
                configuration.set("overrides", "include_images", "false")

            adapter = adapters.getAdapter(configuration, book['url'])
            adapter.is_adult = book['is_adult']
            adapter.username = book['username']
            adapter.password = book['password']
            adapter.setChaptersRange(book['begin'], book['end'])

            adapter.load_cookiejar(options['cookiejarfile'])
            logger.debug("cookiejar:%s" % adapter.cookiejar)
            adapter.set_pagecache(options['pagecache'])

            story = adapter.getStoryMetadataOnly()
            if 'calibre_series' in book:
                adapter.setSeries(book['calibre_series'][0],
                                  book['calibre_series'][1])

            # set PI version instead of default.
            if 'version' in options:
                story.setMetadata('version', options['version'])

            book['title'] = story.getMetadata("title", removeallentities=True)
            book['author_sort'] = book['author'] = story.getList(
                "author", removeallentities=True)
            book['publisher'] = story.getMetadata("site")
            book['url'] = story.getMetadata("storyUrl")
            book['tags'] = story.getSubjectTags(removeallentities=True)
            if story.getMetadata("description"):
                book['comments'] = sanitize_comments_html(
                    story.getMetadata("description"))
            else:
                book['comments'] = ''
            book['series'] = story.getMetadata("series",
                                               removeallentities=True)

            if story.getMetadataRaw('datePublished'):
                book['pubdate'] = story.getMetadataRaw(
                    'datePublished').replace(tzinfo=local_tz)
            if story.getMetadataRaw('dateUpdated'):
                book['updatedate'] = story.getMetadataRaw(
                    'dateUpdated').replace(tzinfo=local_tz)
            if story.getMetadataRaw('dateCreated'):
                book['timestamp'] = story.getMetadataRaw(
                    'dateCreated').replace(tzinfo=local_tz)
            else:
                book['timestamp'] = None  # need *something* there for calibre.

            writer = writers.getWriter(options['fileform'], configuration,
                                       adapter)

            outfile = book['outfile']

            ## No need to download at all.  Shouldn't ever get down here.
            if options['collision'] in (CALIBREONLY, CALIBREONLYSAVECOL):
                logger.info(
                    "Skipping CALIBREONLY 'update' down inside worker--this shouldn't be happening..."
                )
                book['comment'] = 'Metadata collected.'
                book['all_metadata'] = story.getAllMetadata(
                    removeallentities=True)
                if options['savemetacol'] != '':
                    book['savemetacol'] = story.dump_html_metadata()

            ## checks were done earlier, it's new or not dup or newer--just write it.
            elif options['collision'] in (ADDNEW, SKIP, OVERWRITE, OVERWRITEALWAYS) or \
                    ('epub_for_update' not in book and options['collision'] in (UPDATE, UPDATEALWAYS)):

                # preserve logfile even on overwrite.
                if 'epub_for_update' in book:
                    adapter.logfile = get_update_data(
                        book['epub_for_update'])[6]
                    # change the existing entries id to notid so
                    # write_epub writes a whole new set to indicate overwrite.
                    if adapter.logfile:
                        adapter.logfile = adapter.logfile.replace(
                            "span id", "span notid")

                if options['collision'] == OVERWRITE and 'fileupdated' in book:
                    lastupdated = story.getMetadataRaw('dateUpdated')
                    fileupdated = book['fileupdated']

                    # updated doesn't have time (or is midnight), use dates only.
                    # updated does have time, use full timestamps.
                    if (lastupdated.time() == time.min and fileupdated.date() > lastupdated.date()) or \
                            (lastupdated.time() != time.min and fileupdated > lastupdated):
                        raise NotGoingToDownload(
                            _("Not Overwriting, web site is not newer."),
                            'edit-undo.png')

                logger.info("write to %s" % outfile)
                inject_cal_cols(book, story, configuration)
                writer.writeStory(outfilename=outfile, forceOverwrite=True)
                book['comment'] = 'Download %s completed, %s chapters.' % (
                    options['fileform'], story.getMetadata("numChapters"))
                book['all_metadata'] = story.getAllMetadata(
                    removeallentities=True)
                if options['savemetacol'] != '':
                    book['savemetacol'] = story.dump_html_metadata()

            ## checks were done earlier, just update it.
            elif 'epub_for_update' in book and options['collision'] in (
                    UPDATE, UPDATEALWAYS):

                # update now handled by pre-populating the old images and
                # chapters in the adapter rather than merging epubs.
                urlchaptercount = int(
                    story.getMetadata('numChapters').replace(',', ''))
                (url, chaptercount, adapter.oldchapters, adapter.oldimgs,
                 adapter.oldcover, adapter.calibrebookmark, adapter.logfile,
                 adapter.oldchaptersmap,
                 adapter.oldchaptersdata) = get_update_data(
                     book['epub_for_update'])[0:9]

                # dup handling from fff_plugin needed for anthology updates.
                if options['collision'] == UPDATE:
                    if chaptercount == urlchaptercount:
                        if merge:
                            book['comment'] = _(
                                "Already contains %d chapters.  Reuse as is."
                            ) % chaptercount
                            book['all_metadata'] = story.getAllMetadata(
                                removeallentities=True)
                            if options['savemetacol'] != '':
                                book['savemetacol'] = story.dump_html_metadata(
                                )
                            book['outfile'] = book[
                                'epub_for_update']  # for anthology merge ops.
                            return book
                        else:  # not merge,
                            raise NotGoingToDownload(
                                _("Already contains %d chapters.") %
                                chaptercount, 'edit-undo.png')
                    elif chaptercount > urlchaptercount:
                        raise NotGoingToDownload(
                            _("Existing epub contains %d chapters, web site only has %d. Use Overwrite to force update."
                              ) % (chaptercount, urlchaptercount),
                            'dialog_error.png')
                    elif chaptercount == 0:
                        raise NotGoingToDownload(
                            _("FanFicFare doesn't recognize chapters in existing epub, epub is probably from a different source. Use Overwrite to force update."
                              ), 'dialog_error.png')

                if not (options['collision'] == UPDATEALWAYS and chaptercount == urlchaptercount) \
                        and adapter.getConfig("do_update_hook"):
                    chaptercount = adapter.hookForUpdates(chaptercount)

                logger.info("Do update - epub(%d) vs url(%d)" %
                            (chaptercount, urlchaptercount))
                logger.info("write to %s" % outfile)

                inject_cal_cols(book, story, configuration)
                writer.writeStory(outfilename=outfile, forceOverwrite=True)

                book['comment'] = _('Update %s completed, added %s chapters for %s total.')%\
                    (options['fileform'],(urlchaptercount-chaptercount),urlchaptercount)
                book['all_metadata'] = story.getAllMetadata(
                    removeallentities=True)
                if options['savemetacol'] != '':
                    book['savemetacol'] = story.dump_html_metadata()

            if options['smarten_punctuation'] and options['fileform'] == "epub" \
                    and calibre_version >= (0, 9, 39):
                # for smarten punc
                from calibre.ebooks.oeb.polish.main import polish, ALL_OPTS
                from calibre.utils.logging import Log
                from collections import namedtuple

                # do smarten_punctuation from calibre's polish feature
                data = {'smarten_punctuation': True}
                opts = ALL_OPTS.copy()
                opts.update(data)
                O = namedtuple('Options', ' '.join(ALL_OPTS.iterkeys()))
                opts = O(**opts)

                log = Log(level=Log.DEBUG)
                # report = []
                polish({outfile: outfile}, opts, log,
                       logger.info)  # report.append

        except NotGoingToDownload as d:
            book['good'] = False
            book['comment'] = unicode(d)
            book['icon'] = d.icon

        except Exception as e:
            book['good'] = False
            book['comment'] = unicode(e)
            book['icon'] = 'dialog_error.png'
            book['status'] = 'Error'
            logger.info("Exception: %s:%s" % (book, unicode(e)))
            traceback.print_exc()

        #time.sleep(10)
    return book
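
The use_ssl_unverified_context branch near the top of this worker monkey-patches the standard library so every HTTPS request in the background process skips certificate verification. The same patch in isolation, outside the plugin:

import ssl

# Same monkey patch as in the worker above: after this, urllib-based HTTPS
# requests in this process no longer verify server certificates.  It weakens
# security, so the plugin confines it to the background job process.
if hasattr(ssl, '_create_unverified_context'):
    ssl._create_default_https_context = ssl._create_unverified_context
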
Exemplo n.º 30
0
 def test_markdown(self):
     from calibre.ebooks.txt.processor import create_markdown_object
     from calibre.ebooks.conversion.plugins.txt_input import MD_EXTENSIONS
     create_markdown_object(sorted(MD_EXTENSIONS))
     from calibre.library.comments import sanitize_comments_html
     sanitize_comments_html(b'''<script>moo</script>xxx<img src="http://moo.com/x.jpg">''')
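
The test above only builds the converter. Assuming the object returned by create_markdown_object behaves like a standard python-markdown Markdown instance (as the txt input plugin relies on), a usage sketch would be:

from calibre.ebooks.txt.processor import create_markdown_object
from calibre.ebooks.conversion.plugins.txt_input import MD_EXTENSIONS

md = create_markdown_object(sorted(MD_EXTENSIONS))
# Assuming the usual python-markdown convert() API, this turns Markdown
# source into an HTML fragment:
print(md.convert('# Heading\n\nSome *emphasised* text.'))
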
Exemplo n.º 31
0
    def parse_comments(self, root):
        # <!-- 책소개-->
        # aladin uses other request for description and toc.
        # http://www.aladin.co.kr/shop/product/getContents.aspx?ISBN=8970122648&name=Introduce&type=0&date=16

        urlDesc = "http://www.aladin.co.kr/shop/product/getContents.aspx?ISBN=%s&name=Introduce&type=0&date=%s" % (
            self.isbn, datetime.datetime.now().hour)

        # TODO: foreign book description
        # Publisher-provided description (출판사 제공 책소개) - foreign books,
        # http://www.aladin.co.kr/shop/product/getContents.aspx?ISBN=0385340583&name=PublisherDesc&type=0&date=15

        comments = ''
        toc = ''
        # initialise so the checks below don't raise NameError when the
        # request or the parse fails
        rawDesc = None
        rootDesc = None

        try:
            self.browser.addheaders = [('Referer', self.url)]
            rawDesc = self.browser.open_novisit(
                urlDesc, timeout=self.timeout).read().strip()
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
                self.log.error('URL malformed: %r' % urlDesc)
            else:
                attr = getattr(e, 'args', [None])
                attr = attr if attr else [None]
                if isinstance(attr[0], socket.timeout):
                    msg = 'Aladin timed out. Try again later.'
                    self.log.error(msg)
                else:
                    msg = 'Failed to make Descriptions query: %r' % urlDesc
                    self.log.exception(msg)

        if rawDesc:
            try:
                #rawDesc = rawDesc.decode('euc-kr', errors='replace')
                # 2015-03-19 22:26:51
                rawDesc = rawDesc.decode('utf-8', errors='replace')
                rootDesc = fromstring(clean_ascii_chars(rawDesc))
                nodeDesc = rootDesc.xpath('//div[@class="p_textbox"]')
                if nodeDesc:
                    self._removeTags(nodeDesc[0],
                                     ["object", "script", "style"])
                    comments = tostring(nodeDesc[0], method='html')
            except:
                msg = 'Failed to parse aladin details page: %r' % urlDesc
                self.log.exception(msg)

            default_append_toc = cfg.DEFAULT_STORE_VALUES[cfg.KEY_APPEND_TOC]
            append_toc = cfg.plugin_prefs[cfg.STORE_NAME].get(
                cfg.KEY_APPEND_TOC, default_append_toc)

            if rootDesc and append_toc:
                toc_node = rootDesc.xpath('//div[@id="div_TOC_All"]//p')
                if not toc_node:
                    toc_node = rootDesc.xpath('//div[@id="div_TOC_Short"]//p')
                if toc_node:
                    toc = tostring(toc_node[0], method='html')
                    toc = sanitize_comments_html(toc)
        if not comments:
            # Look for description in a meta
            description_node = root.xpath(
                '//meta[@name="Description"]/@content')
            if description_node:
                # return description_node[0]
                comments = description_node[0]
        if comments:
            comments = '<div id="comments">' + comments + '</div>'
        if toc:
            comments += '<h3>[목차]</h3><div id="toc">' + toc + "</div>"
        if comments:
            comments_suffix = cfg.DEFAULT_STORE_VALUES[cfg.KEY_COMMENTS_SUFFIX]
            comments_suffix = cfg.plugin_prefs[cfg.STORE_NAME].get(
                cfg.KEY_COMMENTS_SUFFIX, comments_suffix)
            # comments += '<hr /><div><div style="float:right">[aladin.co.kr]</div></div>'
            if comments_suffix:
                comments += comments_suffix
        return comments
Exemplo n.º 32
0
    def get_book_detail(self, br, metadata_url, mi, ovrdrv_id, log):
        from html5_parser import parse
        from lxml import html
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.library.comments import sanitize_comments_html

        try:
            raw = br.open_novisit(metadata_url).read()
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                return False
            raise
        raw = xml_to_unicode(raw,
                             strip_encoding_pats=True,
                             resolve_entities=True)[0]

        try:
            root = parse(raw, maybe_xhtml=False, sanitize_names=True)
        except Exception:
            return False

        pub_date = root.xpath(
            "//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()")
        lang = root.xpath(
            "//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()")
        subjects = root.xpath(
            "//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()")
        ebook_isbn = root.xpath(
            "//td/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()")
        desc = root.xpath(
            "//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]"
        )

        if pub_date:
            from calibre.utils.date import parse_date
            try:
                mi.pubdate = parse_date(pub_date[0].strip())
            except:
                pass
        if lang:
            lang = lang[0].strip().lower()
            lang = {
                'english': 'eng',
                'french': 'fra',
                'german': 'deu',
                'spanish': 'spa'
            }.get(lang, None)
            if lang:
                mi.language = lang

        if ebook_isbn:
            # print("ebook isbn is "+type('')(ebook_isbn[0]))
            isbn = check_isbn(ebook_isbn[0].strip())
            if isbn:
                self.cache_isbn_to_identifier(isbn, ovrdrv_id)
                mi.isbn = isbn
        if subjects:
            mi.tags = [tag.strip() for tag in subjects[0].split(',')]

        if desc:
            desc = desc[0]
            desc = html.tostring(desc, method='html',
                                 encoding='unicode').strip()
            # remove all attributes from tags
            desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
            # Remove comments
            desc = re.sub(r'(?s)<!--.*?-->', '', desc)
            mi.comments = sanitize_comments_html(desc)

        return None
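
The two regular expressions at the end of get_book_detail do most of the cleanup before sanitize_comments_html: one strips every attribute from opening tags, the other removes HTML comments. A small self-contained demonstration:

import re

desc = '<div class="a" id="b"><!-- tracking --><p style="x">Blurb</p></div>'
# remove all attributes from tags
desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
# remove HTML comments
desc = re.sub(r'(?s)<!--.*?-->', '', desc)
print(desc)  # -> <div><p>Blurb</p></div>
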
Exemplo n.º 33
0
    def identify(self,
                 log,
                 result_queue,
                 abort,
                 title=None,
                 authors=None,
                 identifiers={},
                 timeout=30):
        self.load_config()

        # get identifying tags from book
        idn = identifiers.get('dnb-idn', None)
        isbn = check_isbn(identifiers.get('isbn', None))

        # ignore unknown authors
        if authors is "V. A." or authors is "V.A." or authors is "Unknown" or authors is "Unbekannt":
            authors = None

        if (isbn is None) and (idn is None) and (title is None) and (authors is
                                                                     None):
            log.info(
                "This plugin requires at least either ISBN, IDN, Title or Author(s)."
            )
            return None

        queries = []
        # DNB does not do an exact search when searching for an idn or isbn, so we have to filter the results
        exact_search = {}

        if idn is not None:
            queries.append('num=' + idn)
            exact_search['idn'] = idn

        else:
            authors_v = []
            title_v = []

            if authors is not None:
                authors_v.append(' '.join(authors))
                authors_v.append(' '.join(
                    self.get_author_tokens(authors, only_first_author=False)))
                authors_v.append(' '.join(
                    self.get_author_tokens(authors, only_first_author=True)))

            if title is not None:
                title_v.append(title)
                title_v.append(' '.join(
                    self.get_title_tokens(title,
                                          strip_joiners=False,
                                          strip_subtitle=False)))
                title_v.append(' '.join(
                    self.get_title_tokens(title,
                                          strip_joiners=False,
                                          strip_subtitle=True)))

            if isbn is not None:
                exact_search['isbn'] = isbn

            # title and author
            if authors is not None and title is not None:
                for a in authors_v:
                    for t in title_v:
                        if isbn is not None:
                            queries.append('tit="' + t + '" AND per="' + a +
                                           '" AND num="' + isbn + '"')
                        else:
                            queries.append('tit="' + t + '" AND per="' + a +
                                           '"')

                # try with author and title swapped
                if isbn is not None:
                    queries.append('per="' + title + '" AND tit="' +
                                   authors[0] + '" AND num="' + isbn + '"')
                else:
                    queries.append('per="' + title + '" AND tit="' +
                                   authors[0] + '"')

            # title but no author
            elif authors is not None and title is None:
                for i in authors_v:
                    if isbn is not None:
                        queries.append('per="' + i + '" AND num="' + isbn +
                                       '"')
                    else:
                        queries.append('per="' + i + '"')

                # try with author and title swapped
                if isbn is not None:
                    queries.append('tit="' + authors[0] + '" AND num="' +
                                   isbn + '"')
                else:
                    queries.append('tit="' + authors[0] + '"')

            # author but no title
            elif authors is None and title is not None:
                for i in title_v:
                    if isbn is not None:
                        queries.append('tit="' + i + '" AND num="' + isbn +
                                       '"')
                    else:
                        queries.append('tit="' + i + '"')

                # try with author and title swapped
                if isbn is not None:
                    queries.append('per="' + title + '" AND num="' + isbn +
                                   '"')
                else:
                    queries.append('per="' + title + '"')

            # as last resort only use isbn
            if isbn is not None:
                queries.append('num=' + isbn)

            # Sort queries descending by length (assumption: longer query -> fewer but better results)
            #queries.sort(key=len)
            #queries.reverse()

        # remove duplicate queries
        uniqueQueries = []
        for i in queries:
            if i not in uniqueQueries:
                uniqueQueries.append(i)

        # Process queries
        results = None

        for query in uniqueQueries:
            query = query + ' NOT (mat=film OR mat=music OR mat=microfiches)'
            log.info(query)

            if self.cfg_dnb_token is None:
                results = self.getSearchResultsByScraping(log, query, timeout)
            else:
                results = self.getSearchResults(log, query, timeout)

            if results is None:
                continue

            log.info("Parsing records")

            ns = {'marc21': 'http://www.loc.gov/MARC21/slim'}
            for record in results:
                series = None
                series_index = None
                publisher = None
                pubdate = None
                languages = []
                title = None
                title_sort = None
                edition = None
                comments = None
                idn = None
                urn = None
                isbn = None
                ddc = []
                subjects_gnd = []
                subjects_non_gnd = []

                # Title: Field 245
                title_parts = []
                # if a,n,p exist: series = a, series_index = n, title = p
                for i in record.xpath(
                        ".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]/../marc21:subfield[@code='n' and string-length(text())>0]/../marc21:subfield[@code='p' and string-length(text())>0]/..",
                        namespaces=ns):
                    series_index = i.xpath(".//marc21:subfield[@code='n']",
                                           namespaces=ns)[0].text.strip()
                    match = re.search("(\d+[,\.\d+]?)", series_index)
                    if match:
                        series_index = match.group(1)
                    else:
                        series_index = "0"  # looks like sometimes DNB does not know the series index and uses something like "[...]"
                    series_index = series_index.replace(',', '.')
                    series = i.xpath(".//marc21:subfield[@code='a']",
                                     namespaces=ns)[0].text.strip()
                    title_parts.append(
                        i.xpath(".//marc21:subfield[@code='p']",
                                namespaces=ns)[0].text.strip())
                    log.info("Extracted Series: %s" % series)
                    log.info("Extracted Series Index: %s" % series_index)
                    break
                # otherwise: title = a
                if len(title_parts) == 0:
                    for i in record.xpath(
                            ".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]",
                            namespaces=ns):
                        title_parts.append(i.text.strip())
                        break

                # subtitle 1
                for i in record.xpath(
                        ".//marc21:datafield[@tag='245']/marc21:subfield[@code='b' and string-length(text())>0]",
                        namespaces=ns):
                    title_parts.append(i.text.strip())
                    break

                # subtitle 2
                #for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='c' and string-length(text())>0]",namespaces=ns):
                #    title = title + " / " + i.text.strip()
                #    break

                title = " : ".join(title_parts)
                log.info("Extracted Title: %s" % title)

                # Title_Sort
                title_sort_parts = list(title_parts)
                title_sort_regex = re.match(
                    '^(.*?)(' + chr(152) + '.*' + chr(156) + ')?(.*?)$',
                    title_parts[0])
                sortword = title_sort_regex.group(2)
                if sortword:
                    title_sort_parts[0] = ''.join(
                        filter(None, [
                            title_sort_regex.group(1).strip(),
                            title_sort_regex.group(3).strip(), ", " + sortword
                        ]))
                title_sort = " : ".join(title_sort_parts)
                log.info("Extracted Title_Sort: %s" % title_sort)

                # Authors
                authors = []
                author_sort = None
                for i in record.xpath(
                        ".//marc21:datafield[@tag='100']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):  # primary authors
                    name = re.sub(" \[.*\]$", "", i.text.strip())
                    authors.append(name)
                for i in record.xpath(
                        ".//marc21:datafield[@tag='700']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):  # secondary authors
                    name = re.sub(" \[.*\]$", "", i.text.strip())
                    authors.append(name)
                if len(
                        authors
                ) == 0:  # if no "real" author was found, take all persons involved
                    for i in record.xpath(
                            ".//marc21:datafield[@tag='700']/marc21:subfield[@code='a' and string-length(text())>0]",
                            namespaces=ns):  # secondary authors
                        name = re.sub(" \[.*\]$", "", i.text.strip())
                        authors.append(name)
                if len(authors) > 0:
                    author_sort = authors[0]
                log.info("Extracted Authors: %s" % " & ".join(authors))

                # Comments
                for i in record.xpath(
                        ".//marc21:datafield[@tag='856']/marc21:subfield[@code='u' and string-length(text())>0]",
                        namespaces=ns):
                    if i.text.startswith("http://deposit.dnb.de/"):
                        br = self.browser
                        log.info('Downloading Comments from: %s' % i.text)
                        try:
                            comments = br.open_novisit(i.text,
                                                       timeout=30).read()
                            comments = sanitize_comments_html(comments)
                            log.info('Comments: %s' % comments)
                            break
                        except:
                            log.info("Could not download Comments from %s" % i)

                # Publisher Name and Location
                publisher_name = None
                publisher_location = None
                fields = record.xpath(
                    ".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..",
                    namespaces=ns)
                if len(fields) > 0:
                    publisher_name = fields[0].xpath(
                        ".//marc21:subfield[@code='b' and string-length(text())>0]",
                        namespaces=ns)[0].text.strip()
                    publisher_location = fields[0].xpath(
                        ".//marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns)[0].text.strip()
                else:
                    fields = record.xpath(
                        ".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../..",
                        namespaces=ns)
                    if len(fields) > 0:
                        publisher_name = fields[0].xpath(
                            ".//marc21:subfield[@code='b' and string-length(text())>0]",
                            namespaces=ns)[0].text.strip()
                    else:
                        fields = record.xpath(
                            ".//marc21:datafield[@tag='264']/marc21:subfield[@code='a' and string-length(text())>0]/../..",
                            namespaces=ns)
                        if len(fields) > 0:
                            publisher_location = fields[0].xpath(
                                ".//marc21:subfield[@code='a' and string-length(text())>0]",
                                namespaces=ns)[0].text.strip()

                log.info("Extracted Publisher: %s" % publisher_name)
                log.info("Extracted Publisher Location: %s" %
                         publisher_location)

                # Publishing Date
                for i in record.xpath(
                        ".//marc21:datafield[@tag='264']/marc21:subfield[@code='c' and string-length(text())>=4]",
                        namespaces=ns):
                    match = re.search("(\d{4})", i.text.strip())
                    if match is not None:
                        year = match.group(1)
                        pubdate = datetime.datetime(int(year), 1, 2)
                        break
                log.info("Extracted Publication Year: %s" % pubdate)

                # ID: IDN
                for i in record.xpath(
                        ".//marc21:datafield[@tag='016']/marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    idn = i.text.strip()
                    break
                log.info("Extracted ID IDN: %s" % idn)
                if "idn" in exact_search:
                    if idn != exact_search["idn"]:
                        log.info(
                            "Extracted IDN does not match book's IDN, skipping record"
                        )
                        continue

                # ID: URN
                for i in record.xpath(
                        ".//marc21:datafield[@tag='024']/marc21:subfield[@code='2' and text()='urn']/../marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    urn = i.text.strip()
                    break
                log.info("Extracted ID URN: %s" % urn)

                # ID: ISBN
                for i in record.xpath(
                        ".//marc21:datafield[@tag='020']/marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    isbn_regex = "(?:ISBN(?:-1[03])?:? )?(?=[-0-9 ]{17}|[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?(?:[0-9]+[- ]?){2}[0-9X]"
                    match = re.search(isbn_regex, i.text.strip())
                    isbn = match.group()
                    isbn = isbn.replace('-', '')
                    break
                log.info("Extracted ID ISBN: %s" % isbn)
                if "isbn" in exact_search:
                    if isbn != exact_search["isbn"]:
                        log.info(
                            "Extracted ISBN does not match book's ISBN, skipping record"
                        )
                        continue

                # ID: Sachgruppe (DDC)
                for i in record.xpath(
                        ".//marc21:datafield[@tag='082']/marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    ddc.append(i.text.strip())
                log.info("Extracted ID DDC: %s" % ",".join(ddc))

                # Series and Series_Index
                if series is None and series_index is None:
                    for i in record.xpath(
                            ".//marc21:datafield[@tag='830']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..",
                            namespaces=ns):
                        # Series Index
                        series_index = i.xpath(".//marc21:subfield[@code='v']",
                                               namespaces=ns)[0].text.strip()
                        match = re.search("(\d+[,\.\d+]?)", series_index)
                        if match is not None:
                            series_index = match.group(1)
                        else:
                            series_index = "0"
                        series_index = series_index.replace(',', '.')
                        log.info("Extracted Series Index: %s" % series_index)
                        # Series
                        series = i.xpath(".//marc21:subfield[@code='a']",
                                         namespaces=ns)[0].text.strip()
                        log.info("Extracted Series: %s" % series)
                        break

                # Try to extract Series, Series Index and Title from the fetched title.
                # Caution: This overwrites DNB's series/series_index and modifies the title!
                if self.cfg_guess_series is True:
                    guessed_series = None
                    guessed_series_index = None
                    parts = re.split("[:]",
                                     self.removeSortingCharacters(title))
                    if len(parts) == 2:
                        if bool(re.search("\d", parts[0])) != bool(
                                re.search("\d", parts[1])):
                            # figure out which part contains the index
                            if bool(re.search("\d", parts[0])):
                                indexpart = parts[0]
                                textpart = parts[1]
                            else:
                                indexpart = parts[1]
                                textpart = parts[0]

                            match = re.match(
                                "^[\s\-–:]*(.+?)[\s\-–:]*$", textpart
                            )  # remove odd characters from start and end of the text part
                            if match:
                                textpart = match.group(1)

                            # from Titleparts like: "Name of the series - Episode 2"	OK
                            match = re.match(
                                "^\s*(\S.*?)[\(\/\.,\s\-–:]*(?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",
                                indexpart)
                            if match:
                                guessed_series_index = match.group(2)
                                guessed_series = match.group(1)
                                if guessed_series is None:
                                    guessed_series = textpart
                                    title = textpart + " : Band " + guessed_series_index
                                else:
                                    title = textpart
                            else:
                                # from Titleparts like: "Episode 2 Name of the series"
                                match = re.match(
                                    "^\s*(?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*(\S.*?)[\/\.,\-–\s]*$",
                                    indexpart)
                                if match:
                                    guessed_series_index = match.group(1)
                                    guessed_series = match.group(2)
                                    if guessed_series is None:
                                        guessed_series = textpart
                                        title = textpart + " : Band " + guessed_series_index
                                    else:
                                        title = textpart
                    elif len(parts) == 1:
                        # from Titles like: "Name of the series - Title (Episode 2)"
                        match = re.match(
                            "^\s*(\S.+?) \- (\S.+?) [\(\/\.,\s\-–:](?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",
                            parts[0])
                        if match:
                            guessed_series_index = match.group(3)
                            guessed_series = match.group(1)
                            title = match.group(2)

                        else:
                            # from Titles like: "Name of the series - Episode 2"
                            match = re.match(
                                "^\s*(\S.+?)[\(\/\.,\s\-–:]*(?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",
                                parts[0])
                            if match:
                                guessed_series_index = match.group(2)
                                guessed_series = match.group(1)
                                title = guessed_series + " : Band " + guessed_series_index

                    if guessed_series is not None and guessed_series_index is not None:
                        series = guessed_series
                        series_index = guessed_series_index
                        log.info("Guessed Series: %s" % series)
                        log.info("Guessed Series Index: %s" % series_index)

                # GND Subjects from 689
                for i in record.xpath(
                        ".//marc21:datafield[@tag='689']/marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    subjects_gnd.append(i.text.strip())
                # GND Subjects from 600-655
                for f in range(600, 656):
                    for i in record.xpath(".//marc21:datafield[@tag='" + str(
                            f
                    ) + "']/marc21:subfield[@code='2' and text()='gnd']/../marc21:subfield[@code='a' and string-length(text())>0]",
                                          namespaces=ns):
                        if i.text.startswith("("):
                            continue
                        subjects_gnd.append(i.text)
                log.info("Extracted GND Subjects: %s" % " ".join(subjects_gnd))

                # Non-GND subjects from 600-655
                for f in range(600, 656):
                    for i in record.xpath(".//marc21:datafield[@tag='" + str(
                            f
                    ) + "']/marc21:subfield[@code='a' and string-length(text())>0]",
                                          namespaces=ns):
                        # ignore entries starting with "(":
                        if i.text.startswith("("):
                            continue
                        subjects_non_gnd.extend(re.split(',|;', i.text))
                # remove one-character subjects:
                for i in subjects_non_gnd:
                    if len(i) < 2:
                        subjects_non_gnd.remove(i)
                log.info("Extracted non-GND Subjects: %s" %
                         " ".join(subjects_non_gnd))

                # Edition
                for i in record.xpath(
                        ".//marc21:datafield[@tag='250']/marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    edition = i.text.strip()
                    break
                log.info("Extracted Edition: %s" % edition)

                # Languages
                for i in record.xpath(
                        ".//marc21:datafield[@tag='041']/marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    languages.append(i.text.strip())
                if languages is not None:
                    log.info("Extracted Languages: %s" % ",".join(languages))

                # Put it all together
                if self.cfg_append_edition_to_title == True and edition is not None:
                    title = title + " : " + edition

                mi = Metadata(
                    self.removeSortingCharacters(title),
                    map(lambda i: self.removeSortingCharacters(i), authors))
                mi.title_sort = self.removeSortingCharacters(title_sort)
                mi.author_sort = self.removeSortingCharacters(author_sort)
                mi.languages = languages
                mi.pubdate = pubdate
                mi.publisher = " : ".join(
                    filter(None, [
                        publisher_location,
                        self.removeSortingCharacters(publisher_name)
                    ]))
                mi.series = self.removeSortingCharacters(series)
                mi.series_index = series_index
                mi.comments = comments
                mi.isbn = isbn  # also required for cover download
                mi.set_identifier('urn', urn)
                mi.set_identifier('dnb-idn', idn)
                mi.set_identifier('ddc', ",".join(ddc))

                if self.cfg_fetch_subjects == 0:
                    mi.tags = self.uniq(subjects_gnd)
                elif self.cfg_fetch_subjects == 1:
                    if len(subjects_gnd) > 0:
                        mi.tags = self.uniq(subjects_gnd)
                    else:
                        mi.tags = self.uniq(subjects_non_gnd)
                elif self.cfg_fetch_subjects == 2:
                    mi.tags = self.uniq(subjects_gnd + subjects_non_gnd)
                elif self.cfg_fetch_subjects == 3:
                    if len(subjects_non_gnd) > 0:
                        mi.tags = self.uniq(subjects_non_gnd)
                    else:
                        mi.tags = self.uniq(subjects_gnd)
                elif self.cfg_fetch_subjects == 4:
                    mi.tags = self.uniq(subjects_non_gnd)
                elif self.cfg_fetch_subjects == 5:
                    mi.tags = []

                # put current result's metadata into result queue
                log.info("Final formatted result: %s" % mi)
                result_queue.put(mi)