Example #1
    def connect(self):
        if not self.is_configured():
            return False
        br = self.browser
        try:
            url = "%snew/"%self.BASE_URL
            clean = clean_ascii_chars(br.open(url).read().strip())
            parser = etree.HTMLParser(recover=True)            
            feed = fromstring(clean, parser=parser)
            
            formUrl = feed.xpath('//form[@id="form"]/@action')
            self.log('formUrl %s'%formUrl[0])
            
            url = self.BASE_URL + formUrl[0]

            parameters = {
                "sendform":"1",
                "login_name":self.prefs['login'],
                "login_password":self.prefs['password']
            }
            data = urllib.urlencode(parameters)
            self.log(url)
            self.log(data)
            clean = clean_ascii_chars(br.open(url,data).read().strip())
            parser = etree.HTMLParser(recover=True)
            feed = fromstring(clean, parser=parser)
            self.log(clean)
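            # Login succeeded if the returned page no longer contains the login_name input field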
            return len(feed.xpath('//input[@id="login_name"]/@name')) == 0
        except Exception as e:
            self.log.exception(e)
            return False
Example #2
    def get_details(self):
        try:
            self.log.info('YES24 url: %r'%self.url)
            raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                self.log.error('URL malformed: %r'%self.url)
                return
            attr = getattr(e, 'args', [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = 'YES24 timed out. Try again later.'
                self.log.error(msg)
            else:
                msg = 'Failed to make details query: %r'%self.url
                self.log.exception(msg)
            return

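        # YES24 serves its pages in EUC-KR; decode before checking and parsing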
        raw = raw.decode('euc-kr', errors='replace')
        #open('P:\\yes24.html', 'wb').write(raw)

        if 'HTTP 404.' in raw:
            self.log.error('URL malformed: %r'%self.url)
            return

        try:
            root = fromstring(clean_ascii_chars(raw))
        except:
            msg = 'Failed to parse YES24 details page: %r'%self.url
            self.log.exception(msg)
            return

        self.parse_details(root)
Example #3
    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        from calibre.ebooks.txt.txtml import TXTMLizer
        from calibre.utils.cleantext import clean_ascii_chars
        from calibre.ebooks.txt.newlines import specified_newlines, TxtNewlines

        if opts.txt_output_formatting.lower() == 'markdown':
            from calibre.ebooks.txt.markdownml import MarkdownMLizer
            self.writer = MarkdownMLizer(log)
        elif opts.txt_output_formatting.lower() == 'textile':
            from calibre.ebooks.txt.textileml import TextileMLizer
            self.writer = TextileMLizer(log)
        else:
            self.writer = TXTMLizer(log)

        txt = self.writer.extract_content(oeb_book, opts)
        txt = clean_ascii_chars(txt)

        log.debug('\tReplacing newlines with selected type...')
        txt = specified_newlines(TxtNewlines(opts.newline).newline, txt)

        close = False
        if not hasattr(output_path, 'write'):
            close = True
            if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '':
                os.makedirs(os.path.dirname(output_path))
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        out_stream.seek(0)
        out_stream.truncate()
        out_stream.write(txt.encode(opts.txt_output_encoding, 'replace'))

        if close:
            out_stream.close()
Example #4
 def get_image_urls(self, title, author, log, abort, timeout):
     from calibre.utils.cleantext import clean_ascii_chars
     from urllib import urlencode
     import html5lib
     import json
     from collections import OrderedDict
     ans = OrderedDict()
     br = self.browser
     q = urlencode({
         'as_q': ('%s %s' % (title, author)).encode('utf-8')
     }).decode('utf-8')
     sz = self.prefs['size']
     if sz == 'any':
         sz = ''
     elif sz == 'l':
         sz = 'isz:l,'
     else:
         sz = 'isz:lt,islt:%s,' % sz
     # See https://www.google.com/advanced_image_search to understand this
     # URL scheme
     url = 'https://www.google.com/search?as_st=y&tbm=isch&{}&as_epq=&as_oq=&as_eq=&cr=&as_sitesearch=&safe=images&tbs={}iar:t,ift:jpg'.format(
         q, sz)
     log('Search URL: ' + url)
     raw = br.open(url).read().decode('utf-8')
     root = html5lib.parse(clean_ascii_chars(raw),
                           treebuilder='lxml',
                           namespaceHTMLElements=False)
     for div in root.xpath('//div[@class="rg_meta"]'):
         try:
             data = json.loads(div.text)
         except Exception:
             continue
         if 'ou' in data:
             ans[data['ou']] = True
     return list(ans.iterkeys())
Example #5
    def run(self):
        if self.xml is None:
            raw = None
            url = None
            try:
                url = self.plugin.create_query(self.title, self.number)
                self.log('download page search %s'%url)
                raw = self.plugin.browser.open(url, timeout=self.timeout).read().strip()
            except Exception as e:
                self.log.exception('Failed to make identify query: %r'%url)
                return as_unicode(e)

            if raw is not None:
                try:
                    parser = etree.XMLParser(recover=True)
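                    # Replace HTML-only entities and self-close <br> tags so the recovering XML parser can cope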
                    clean = clean_ascii_chars(raw)
                    clean = re.sub("<br>", "<br/>", clean)
                    clean = re.sub("&nbsp;", " ", clean)
                    clean = re.sub("&hellip;", "...", clean)
                    self.xml = fromstring(clean, parser=parser)
                    if len(parser.error_log) > 0: #some errors while parsing
                        self.log('some errors occurred while parsing the page:')
                        self.log(parser.error_log)

                except Exception as e:
                    self.log.exception('Failed to parse xml for url: %s'%url)

        self.parse()
Example #6
    def get_details(self):
        try:
            self.log.info('Legie url: %r' % self.url)
            raw = self.browser.open_novisit(
                self.url, timeout=self.timeout).read().strip()
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                self.log.error('URL malformed: %r' % self.url)
                return
            attr = getattr(e, 'args', [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = 'Legie timed out. Try again later.'
                self.log.error(msg)
            else:
                msg = 'Failed to make details query: %r' % self.url
                self.log.exception(msg)
            return

        raw = raw.decode('utf-8', errors='replace')
        #open('E:\\t3.html', 'wb').write(raw)

        if '<title>404 - ' in raw:
            self.log.error('URL malformed: %r' % self.url)
            return

        try:
            root = fromstring(clean_ascii_chars(raw))
        except:
            msg = 'Failed to parse Legie details page: %r' % self.url
            self.log.exception(msg)
            return

        self.parse_details(root)
Example #7
def parse_html(raw):
    import html5lib
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.cleantext import clean_ascii_chars

    raw = clean_ascii_chars(xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True, assume_utf8=True)[0])
    return html5lib.parse(raw, treebuilder="lxml", namespaceHTMLElements=False).getroot()
Example #8
def clean_txt(txt):
    '''
    Run transformations on the text to put it into a
    consistent state.
    '''
    if isbytestring(txt):
        txt = txt.decode('utf-8', 'replace')
    # Strip whitespace from the end of the line. Also replace
    # all line breaks with \n.
    txt = '\n'.join([line.rstrip() for line in txt.splitlines()])

    # Replace whitespace at the beginning of the line with &nbsp;
    txt = re.sub('(?m)(?<=^)([ ]{2,}|\t+)(?=.)', '&nbsp;' * 4, txt)

    # Condense redundant spaces
    txt = re.sub('[ ]{2,}', ' ', txt)

    # Remove blank space from the beginning and end of the document.
    txt = re.sub(r'^\s+(?=.)', '', txt)
    txt = re.sub(r'(?<=.)\s+$', '', txt)
    # Remove excessive line breaks.
    txt = re.sub('\n{5,}', '\n\n\n\n', txt)
    # remove ASCII invalid chars : 0 to 8 and 11-14 to 24
    txt = clean_ascii_chars(txt)

    return txt
Example #9
    def _parse_editions_for_book(self, log, editions_url, matches, timeout,
                                 title_tokens):
        def ismatch(title):
            title = lower(title)
            match = not title_tokens
            for t in title_tokens:
                if lower(t) in title:
                    match = True
                    break
            return match

        br = self.browser
        try:
            raw = br.open_novisit(editions_url, timeout=timeout).read().strip()
        except Exception as e:
            err = 'Failed identify editions query: %r' % editions_url
            log.exception(err)
            return as_unicode(e)
        try:
            raw = raw.decode('utf-8', errors='replace')
            if not raw:
                log.error('Failed to get raw result for query: %r' %
                          editions_url)
                return
            # open('E:\\s.html', 'wb').write(raw)
            root = fromstring(clean_ascii_chars(raw))
        except:
            msg = 'Failed to parse CBDB page for query: %r' % editions_url
            log.exception(msg)
            return msg

        first_non_valid = None
        for div_link in root.xpath(
                '//div[@class="editionData"]/div[1]/a[@class="bookTitle"]'):
            title = tostring(div_link, 'text').strip().lower()
            if title:
                # Verify it is not an audio edition
                valid_title = True
                for exclusion in [
                        '(audio cd)', '(compact disc)', '(audio cassette)'
                ]:
                    if exclusion in title:
                        log.info('Skipping audio edition: %s' % title)
                        valid_title = False
                        if first_non_valid is None:
                            first_non_valid = BASE_URL + div_link.get('href')
                        break
                if valid_title:
                    # Verify it is not a foreign language edition
                    if not ismatch(title):
                        log.info('Skipping alternate title:', title)
                        continue
                    matches.append(BASE_URL + div_link.get('href'))
                    if len(matches) >= CBDB.MAX_EDITIONS:
                        return
        if len(matches) == 0 and first_non_valid:
            # We have found only audio editions; in that case return the first match
            # rather than telling the user there are no matches.
            log.info('Choosing the first audio edition as no others found.')
            matches.append(first_non_valid)
Example #10
 def get_image_urls(self, title, author, log, abort, timeout):
     from calibre.utils.cleantext import clean_ascii_chars
     from urllib import urlencode
     import html5lib
     import json
     from collections import OrderedDict
     ans = OrderedDict()
     br = self.browser
     q = urlencode({'as_q': ('%s %s'%(title, author)).encode('utf-8')}).decode('utf-8')
     sz = self.prefs['size']
     if sz == 'any':
         sz = ''
     elif sz == 'l':
         sz = 'isz:l,'
     else:
         sz = 'isz:lt,islt:%s,' % sz
     # See https://www.google.com/advanced_image_search to understand this
     # URL scheme
     url = 'https://www.google.com/search?as_st=y&tbm=isch&{}&as_epq=&as_oq=&as_eq=&cr=&as_sitesearch=&safe=images&tbs={}iar:t,ift:jpg'.format(q, sz)
     log('Search URL: ' + url)
     raw = br.open(url).read().decode('utf-8')
     root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml', namespaceHTMLElements=False)
     for div in root.xpath('//div[@class="rg_meta"]'):
         try:
             data = json.loads(div.text)
         except Exception:
             continue
         if 'ou' in data:
             ans[data['ou']] = True
     return list(ans.iterkeys())
Example #11
 def root_from_url(cls, browser, url, timeout, log):
     log.info('Fetching: %s' % url)
     response = browser.open_novisit(url, timeout=timeout)
     raw = response.read()
     parser = XMLParser(recover=True, no_network=True)
     return fromstring(xml_to_unicode(clean_ascii_chars(raw),
         strip_encoding_pats=True)[0], parser=parser)
Example #12
 def key_press_event(self, ev, which=0):
     code = ev.key()
     if self.capture == 0 or code in (0, Qt.Key_unknown,
             Qt.Key_Shift, Qt.Key_Control, Qt.Key_Alt, Qt.Key_Meta,
             Qt.Key_AltGr, Qt.Key_CapsLock, Qt.Key_NumLock, Qt.Key_ScrollLock):
         return QWidget.keyPressEvent(self, ev)
     button = getattr(self, 'button%d'%which)
     button.setStyleSheet('QPushButton { font-weight: normal}')
     mods = int(ev.modifiers()) & ~Qt.KeypadModifier
     # for some reason qt sometimes produces ascii control codes in text,
     # for example ctrl+shift+u will give text == '\x15' on linux
     txt = clean_ascii_chars(ev.text())
     if txt and txt.lower() == txt.upper():
         # We have a symbol like ! or > etc. In this case the value of code
         # already includes Shift, so remove it
         mods &= ~Qt.ShiftModifier
     sequence = QKeySequence(code|mods)
     button.setText(sequence.toString(QKeySequence.NativeText))
     self.capture = 0
     dup_desc = self.dup_check(sequence)
     if dup_desc is not None:
         error_dialog(self, _('Already assigned'),
                 unicode(sequence.toString(QKeySequence.NativeText)) + ' ' +
                 _('already assigned to') + ' ' + dup_desc, show=True)
         self.clear_clicked(which=which)
Example #13
    def make_query(self,
                   q,
                   abort,
                   title=None,
                   authors=None,
                   identifiers={},
                   max_pages=10,
                   timeout=30):
        from lxml import etree
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.utils.cleantext import clean_ascii_chars

        page_num = 1
        parser = etree.XMLParser(recover=True, no_network=True)
        br = self.browser

        seen = set()

        candidates = []
        total_found = 0
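        # Fetch result pages until enough candidates are collected, the feed is exhausted, or the search is aborted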
        while page_num <= max_pages and not abort.is_set():
            url = q.replace('&page_number=1&', '&page_number=%d&' % page_num)
            page_num += 1
            raw = br.open_novisit(url, timeout=timeout).read()
            feed = etree.fromstring(xml_to_unicode(
                clean_ascii_chars(raw), strip_encoding_pats=True)[0],
                                    parser=parser)
            total, found, results = self.parse_feed(feed, seen, title, authors,
                                                    identifiers)
            total_found += found
            candidates += results
            if total_found >= total or len(candidates) > 9:
                break

        return candidates
Example #14
    def run(self):
        if self.xml is None:
            raw = None
            url = None
            try:
                url = self.plugin.create_query(self.title, self.authors, self.number)
                self.log('download page search %s'%url)
                raw = self.plugin.browser.open(url, timeout=self.timeout).read().strip()
            except Exception as e:
                self.log.exception('Failed to make identify query: %r'%url)
                return as_unicode(e)

            if raw is not None:
                try:
                    parser = etree.HTMLParser()
                    clean = clean_ascii_chars(raw)
                    self.xml = fromstring(clean, parser=parser)
#                     if len(parser.error_log) > 0: #some errors while parsing
#                         self.log('while parsing page occus some errors:')
#                         self.log(parser.error_log)

                except Exception as e:
                    self.log.exception('Failed to parse xml for url: %s'%url)

        self.parse()
Example #15
    def download_parse(self, query, timeout):
#         self.downloads_count += 1
#         number = self.downloads_count
        br = self.browser
        try:
            self.log('download page search %s'%query)
            data = urllib.urlencode(query[1])
            raw = br.open(query[0],data,timeout=timeout).read().strip()
        except Exception as e:
            self.log.exception('Failed to make identify query: %r'%query)
            return as_unicode(e)

        try:
            parser = etree.HTMLParser(recover=True)
            clean = clean_ascii_chars(raw)

#             self.log.filelog(clean, "\\tmp\\test.html")
            feed = fromstring(clean, parser=parser)

#             if len(parser.error_log) > 0: #some errors while parsing
#                 self.log('while parsing page occus some errors:')
#                 self.log(parser.error_log)

            return feed
        except Exception as e:
            self.log.exception('Failed to parse identify results')
            return as_unicode(e)
Example #16
    def make_query(self, q, abort, title=None, authors=None, identifiers={},
            max_pages=10, timeout=30):
        from lxml import etree
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.utils.cleantext import clean_ascii_chars

        page_num = 1
        parser = etree.XMLParser(recover=True, no_network=True)
        br = self.browser

        seen = set()

        candidates = []
        total_found = 0
        while page_num <= max_pages and not abort.is_set():
            url = q.replace('&page_number=1&', '&page_number=%d&'%page_num)
            page_num += 1
            raw = br.open_novisit(url, timeout=timeout).read()
            feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
                strip_encoding_pats=True)[0], parser=parser)
            total, found, results = self.parse_feed(
                    feed, seen, title, authors, identifiers)
            total_found += found
            candidates += results
            if total_found >= total or len(candidates) > 9:
                break

        return candidates
Example #17
    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        from calibre.ebooks.txt.txtml import TXTMLizer
        from calibre.utils.cleantext import clean_ascii_chars
        from calibre.ebooks.txt.newlines import specified_newlines, TxtNewlines

        if opts.txt_output_formatting.lower() == 'markdown':
            from calibre.ebooks.txt.markdownml import MarkdownMLizer
            self.writer = MarkdownMLizer(log)
        elif opts.txt_output_formatting.lower() == 'textile':
            from calibre.ebooks.txt.textileml import TextileMLizer
            self.writer = TextileMLizer(log)
        else:
            self.writer = TXTMLizer(log)

        txt = self.writer.extract_content(oeb_book, opts)
        txt = clean_ascii_chars(txt)

        log.debug('\tReplacing newlines with selected type...')
        txt = specified_newlines(TxtNewlines(opts.newline).newline, txt)

        close = False
        if not hasattr(output_path, 'write'):
            close = True
            if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '':
                os.makedirs(os.path.dirname(output_path))
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        out_stream.seek(0)
        out_stream.truncate()
        out_stream.write(txt.encode(opts.txt_output_encoding, 'replace'))

        if close:
            out_stream.close()
Example #18
def clean_txt(txt):
    '''
    Run transformations on the text to put it into a
    consistent state.
    '''
    if isbytestring(txt):
        txt = txt.decode('utf-8', 'replace')
    # Strip whitespace from the end of the line. Also replace
    # all line breaks with \n.
    txt = '\n'.join([line.rstrip() for line in txt.splitlines()])

    # Replace whitespace at the beginning of the line with &nbsp;
    txt = re.sub('(?m)(?<=^)([ ]{2,}|\t+)(?=.)', '&nbsp;' * 4, txt)

    # Condense redundant spaces
    txt = re.sub('[ ]{2,}', ' ', txt)

    # Remove blank space from the beginning and end of the document.
    txt = re.sub(r'^\s+(?=.)', '', txt)
    txt = re.sub(r'(?<=.)\s+$', '', txt)
    # Remove excessive line breaks.
    txt = re.sub('\n{5,}', '\n\n\n\n', txt)
    # remove ASCII invalid chars : 0 to 8 and 11-14 to 24
    txt = clean_ascii_chars(txt)

    return txt
Example #19
    def identify(  # {{{
            self,
            log,
            result_queue,
            abort,
            title=None,
            authors=None,
            identifiers={},
            timeout=30):
        from lxml import etree
        entry = XPath('//atom:entry')

        query = self.create_query(log,
                                  title=title,
                                  authors=authors,
                                  identifiers=identifiers)
        if not query:
            log.error('Insufficient metadata to construct query')
            return
        br = self.browser
        log('Making query:', query)
        try:
            raw = br.open_novisit(query, timeout=timeout).read()
        except Exception as e:
            log.exception('Failed to make identify query: %r' % query)
            return as_unicode(e)

        try:
            parser = etree.XMLParser(recover=True, no_network=True)
            feed = etree.fromstring(xml_to_unicode(
                clean_ascii_chars(raw), strip_encoding_pats=True)[0],
                                    parser=parser)
            entries = entry(feed)
        except Exception as e:
            log.exception('Failed to parse identify results')
            return as_unicode(e)

        if not entries and title and not abort.is_set():
            if identifiers:
                log('No results found, retrying without identifiers')
                return self.identify(log,
                                     result_queue,
                                     abort,
                                     title=title,
                                     authors=authors,
                                     timeout=timeout)
            ntitle = cleanup_title(title)
            if ntitle and ntitle != title:
                log('No results found, retrying without sub-title')
                return self.identify(log,
                                     result_queue,
                                     abort,
                                     title=ntitle,
                                     authors=authors,
                                     timeout=timeout)

        # There is no point running these queries in threads as google
        # throttles requests returning 403 Forbidden errors
        self.get_all_details(br, log, entries, abort, result_queue, timeout)
Example #20
    def get_details(self):
        try:
            self.log.info('Shelfari book url: %r' % self.url)
            raw = self.browser.open_novisit(
                self.url, timeout=self.timeout).read().strip()
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                self.log.error('URL malformed: %r' % self.url)
                return
            attr = getattr(e, 'args', [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = 'Shelfari timed out. Try again later.'
                self.log.error(msg)
            else:
                msg = 'Failed to make details query: %r' % self.url
                self.log.exception(msg)
            return

        raw = raw.decode('utf-8', errors='replace')
        #open('c:\\shelfari.html', 'wb').write(raw)

        if '<title>404 - ' in raw:
            self.log.error('URL malformed: %r' % self.url)
            return

        try:
            root = fromstring(clean_ascii_chars(raw))
        except:
            msg = 'Failed to parse shelfari details page: %r' % self.url
            self.log.exception(msg)
            return

        try:
            # Look at the <title> attribute for page to make sure that we were actually returned
            # a details page for a book. If the user had specified an invalid ISBN, then the results
            # page will just do a textual search.
            title_node = root.xpath('//title')
            if title_node:
                page_title = title_node[0].text_content().strip()
                if page_title is None:
                    self.log.error(
                        'Failed to see search results in page title: %r' %
                        self.url)
                    return
        except:
            msg = 'Failed to read shelfari page title: %r' % self.url
            self.log.exception(msg)
            return

        errmsg = root.xpath('//*[@id="errorMessage"]')
        if errmsg:
            msg = 'Failed to parse shelfari details page: %r' % self.url
            msg += tostring(errmsg, method='text', encoding=unicode).strip()
            self.log.error(msg)
            return

        self.parse_details(root)
Example #21
    def get_details(self):
        try:
            raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                self.log.error('URL malformed: %r'%self.url)
                return
            attr = getattr(e, 'args', [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = 'Kyobobook timed out. Try again later.'
                self.log.error(msg)
            else:
                msg = 'Failed to make details query: %r'%self.url
                self.log.exception(msg)
            return

        # open('c:\\Kyobobook1.html', 'wb').write(raw)
        # raw = raw.decode('utf-8', errors='replace') #00
        # open('c:\\Kyobobook2.html', 'wb').write(raw)

        # if '<title>404 - ' in raw:
            # self.log.error('URL malformed: %r'%self.url)
            # return

        try:
            root = fromstring(clean_ascii_chars(raw))
        except:
            msg = 'Failed to parse Kyobobook details page: %r'%self.url
            self.log.exception(msg)
            return

        try:
            # Look at the <title> attribute for page to make sure that we were actually returned
            # a details page for a book. If the user had specified an invalid ISBN, then the results
            # page will just do a textual search.
            title_node = root.xpath('//title')
            if title_node:
                page_title = title_node[0].text_content().strip()
                
                # search success : "나의 문화유산답사기 1 - 인터넷교보문고"
                # search fail : " - 인터넷교보문고"
                if page_title is None or page_title == " - 인터넷교보문고":
                    self.log.error('Failed to see search results in page title: %r'%self.url)
                    return
        except:
            msg = 'Failed to read Kyobobook page title: %r'%self.url
            self.log.exception(msg)
            return

        errmsg = root.xpath('//*[@id="errorMessage"]')
        if errmsg:
            msg = 'Failed to parse Kyobobook details page: %r'%self.url
            msg += tostring(errmsg, method='text', encoding=unicode).strip()
            self.log.error(msg)
            return

        self.parse_details(root)
Example #22
def clean_html(raw):
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.cleantext import clean_ascii_chars
    return clean_ascii_chars(
        xml_to_unicode(raw,
                       strip_encoding_pats=True,
                       resolve_entities=True,
                       assume_utf8=True)[0])
Example #23
def parse_html(raw):
    import html5lib
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.cleantext import clean_ascii_chars
    raw = clean_ascii_chars(xml_to_unicode(raw, strip_encoding_pats=True,
                                resolve_entities=True, assume_utf8=True)[0])
    return html5lib.parse(raw, treebuilder='lxml',
                              namespaceHTMLElements=False).getroot()
Example #24
    def fetch_raw(
            self,
            log,
            url,
            br,
            testing,  # {{{
            identifiers={},
            timeout=30):
        from calibre.utils.cleantext import clean_ascii_chars
        from calibre.ebooks.chardet import xml_to_unicode
        from lxml.html import tostring
        import html5lib
        try:
            raw = br.open_novisit(
                url, timeout=timeout).read().decode('gb18030').strip()
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and \
                            e.getcode() == 404:
                log.error('Query malformed: %r' % url)
                return
            attr = getattr(e, 'args', [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = _('DangDang timed out. Try again later.')
                log.error(msg)
            else:
                msg = 'Failed to make identify query: %r' % url
                log.exception(msg)
            return as_unicode(msg)

        raw = clean_ascii_chars(
            xml_to_unicode(raw,
                           strip_encoding_pats=True,
                           resolve_entities=True)[0])

        if testing:
            import tempfile
            with tempfile.NamedTemporaryFile(prefix='dangdang_results_',
                                             suffix='.html',
                                             delete=False) as f:
                f.write(raw.encode('utf-8'))
            print('Downloaded html for results page saved in', f.name)

        matches = []
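        # The title string below is DangDang's "page not found" message; its absence means the query returned results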
        found = '<title>对不起,您要访问的页面暂时没有找到' not in raw

        root = None
        if found:
            try:
                root = html5lib.parse(raw,
                                      treebuilder='lxml',
                                      namespaceHTMLElements=False)
            except:
                msg = 'Failed to parse DangDang page for query: %r' % url
                log.exception(msg)
                return msg

        return found, root
Example #25
    def _parse_editions_for_book(self, log, editions_url, matches, timeout, title_tokens):

        def ismatch(title):
            title = lower(title)
            match = not title_tokens
            for t in title_tokens:
                if lower(t) in title:
                    match = True
                    break
            return match

        br = self.browser
        try:
            raw = br.open_novisit(editions_url, timeout=timeout).read().strip()
        except Exception as e:
            err = 'Failed identify editions query: %r'%editions_url
            log.exception(err)
            return as_unicode(e)
        try:
            raw = raw.decode('utf-8', errors='replace')
            if not raw:
                log.error('Failed to get raw result for query: %r'%editions_url)
                return
            #open('E:\\s.html', 'wb').write(raw)
            root = fromstring(clean_ascii_chars(raw))
        except:
            msg = 'Failed to parse goodreads page for query: %r'%editions_url
            log.exception(msg)
            return msg

        first_non_valid = None
        for div_link in root.xpath('//div[@class="editionData"]/div[1]/a[@class="bookTitle"]'):
            title = tostring(div_link, 'text').strip().lower()
            if title:
                # Verify it is not an audio edition
                valid_title = True
                for exclusion in ['(audio cd)', '(compact disc)', '(audio cassette)']:
                    if exclusion in title:
                        log.info('Skipping audio edition: %s'%title)
                        valid_title = False
                        if first_non_valid is None:
                            first_non_valid = Goodreads.BASE_URL + div_link.get('href')
                        break
                if valid_title:
                    # Verify it is not a foreign language edition
                    if not ismatch(title):
                        log.info('Skipping alternate title:', title)
                        continue
                    matches.append(Goodreads.BASE_URL + div_link.get('href'))
                    if len(matches) >= Goodreads.MAX_EDITIONS:
                        return
        if len(matches) == 0 and first_non_valid:
            # We have found only audio editions; in that case return the first match
            # rather than telling the user there are no matches.
            log.info('Choosing the first audio edition as no others found.')
            matches.append(first_non_valid)
Example #26
    def get_details(self):
        try:
            raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
        except Exception as e:
            if callable(getattr(e, "getcode", None)) and e.getcode() == 404:
                self.log.error("URL malformed: %r" % self.url)
                return
            attr = getattr(e, "args", [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = "Aladin timed out. Try again later."
                self.log.error(msg)
            else:
                msg = "Failed to make details query: %r" % self.url
                self.log.exception(msg)
            return

        # raw = raw.decode('utf-8', errors='replace') #00

        # if '<title>404 - ' in raw:
        # self.log.error('URL malformed: %r'%self.url)
        # return

        try:
            root = fromstring(clean_ascii_chars(raw))
        except:
            msg = "Failed to parse aladin details page: %r" % self.url
            self.log.exception(msg)
            return

        try:
            # Look at the <title> attribute for page to make sure that we were actually returned
            # a details page for a book. If the user had specified an invalid ISBN, then the results
            # page will just do a textual search.
            title_node = root.xpath("//title")
            if title_node:
                page_title = title_node[0].text_content().strip()

                # search success : '[알라딘]나의 문화유산답사기 1 - 남도답사 일번지, 개정판'
                # search fail : '[알라딘] "좋은 책을 고르는 방법, 알라딘"'
                if page_title is None or page_title.find("좋은 책을 고르는 방법, 알라딘") > -1:
                    self.log.error("Failed to see search results in page title: %r" % self.url)
                    return
        except:
            msg = "Failed to read aladin page title: %r" % self.url
            self.log.exception(msg)
            return

        errmsg = root.xpath('//*[@id="errorMessage"]')
        if errmsg:
            msg = "Failed to parse aladin details page: %r" % self.url
            msg += tostring(errmsg, method="text", encoding=unicode).strip()
            self.log.error(msg)
            return

        self.parse_details(root)
Example #27
    def convert_new(self, stream, accelerators):
        from calibre.ebooks.pdf.pdftohtml import pdftohtml
        from calibre.utils.cleantext import clean_ascii_chars
        from calibre.ebooks.pdf.reflow import PDFDocument

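        # pdftohtml writes its XML output (index.xml) into the current working directory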
        pdftohtml(os.getcwdu(), stream.name, self.opts.no_images, as_xml=True)
        with open(u'index.xml', 'rb') as f:
            xml = clean_ascii_chars(f.read())
        PDFDocument(xml, self.opts, self.log)
        return os.path.join(os.getcwdu(), u'metadata.opf')
Example #28
    def convert_new(self, stream, accelerators):
        from calibre.ebooks.pdf.pdftohtml import pdftohtml
        from calibre.utils.cleantext import clean_ascii_chars
        from calibre.ebooks.pdf.reflow import PDFDocument

        pdftohtml(os.getcwd(), stream.name, self.opts.no_images, as_xml=True)
        with lopen('index.xml', 'rb') as f:
            xml = clean_ascii_chars(f.read())
        PDFDocument(xml, self.opts, self.log)
        return os.path.join(os.getcwd(), 'metadata.opf')
Example #29
    def get_details(self):
        try:
            self.log.info("Goodreads book url: %r" % self.url)
            raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
        except Exception as e:
            if callable(getattr(e, "getcode", None)) and e.getcode() == 404:
                self.log.error("URL malformed: %r" % self.url)
                return
            attr = getattr(e, "args", [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = "Goodreads timed out. Try again later."
                self.log.error(msg)
            else:
                msg = "Failed to make details query: %r" % self.url
                self.log.exception(msg)
            return

        raw = raw.decode("utf-8", errors="replace")
        # open('c:\\goodreads.html', 'wb').write(raw)

        if "<title>404 - " in raw:
            self.log.error("URL malformed: %r" % self.url)
            return

        try:
            root = fromstring(clean_ascii_chars(raw))
        except:
            msg = "Failed to parse goodreads details page: %r" % self.url
            self.log.exception(msg)
            return

        try:
            # Look at the <title> attribute for page to make sure that we were actually returned
            # a details page for a book. If the user had specified an invalid ISBN, then the results
            # page will just do a textual search.
            title_node = root.xpath("//title")
            if title_node:
                page_title = title_node[0].text_content().strip()
                if page_title is None or page_title.find("search results for") != -1:
                    self.log.error("Failed to see search results in page title: %r" % self.url)
                    return
        except:
            msg = "Failed to read goodreads page title: %r" % self.url
            self.log.exception(msg)
            return

        errmsg = root.xpath('//*[@id="errorMessage"]')
        if errmsg:
            msg = "Failed to parse goodreads details page: %r" % self.url
            msg += tostring(errmsg, method="text", encoding=unicode).strip()
            self.log.error(msg)
            return

        self.parse_details(root)
Example #30
    def get_details(self):
        from calibre.utils.cleantext import clean_ascii_chars
        from calibre.ebooks.chardet import xml_to_unicode
        import html5lib

        try:
            raw = self.browser.open_novisit(
                self.url, timeout=self.timeout).read().strip()
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                self.log.error('URL malformed: %r' % self.url)
                return
            attr = getattr(e, 'args', [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = 'Amazon timed out. Try again later.'
                self.log.error(msg)
            else:
                msg = 'Failed to make details query: %r' % self.url
                self.log.exception(msg)
            return

        oraw = raw
        raw = xml_to_unicode(raw,
                             strip_encoding_pats=True,
                             resolve_entities=True)[0]
        if '<title>404 - ' in raw:
            self.log.error('URL malformed: %r' % self.url)
            return

        try:
            root = html5lib.parse(clean_ascii_chars(raw),
                                  treebuilder='lxml',
                                  namespaceHTMLElements=False)
        except:
            msg = 'Failed to parse amazon details page: %r' % self.url
            self.log.exception(msg)
            return
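        # Amazon JP sometimes serves a 'black curtain' interstitial page; follow its redirect to the real product page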
        if self.domain == 'jp':
            for a in root.xpath('//a[@href]'):
                if 'black-curtain-redirect.html' in a.get('href'):
                    self.url = 'http://amazon.co.jp' + a.get('href')
                    self.log('Black curtain redirect found, following')
                    return self.get_details()

        errmsg = root.xpath('//*[@id="errorMessage"]')
        if errmsg:
            msg = 'Failed to parse amazon details page: %r' % self.url
            msg += self.tostring(errmsg, method='text',
                                 encoding=unicode).strip()
            self.log.error(msg)
            return

        self.parse_details(oraw, root)
Example #31
def parse_details_page(url, log, timeout, browser, domain):
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
    import html5lib
    from lxml.html import tostring
    try:
        raw = browser.open_novisit(url, timeout=timeout).read().strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and \
                e.getcode() == 404:
            log.error('URL malformed: %r'%url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Amazon timed out. Try again later.'
            log.error(msg)
        else:
            msg = 'Failed to make details query: %r'%url
            log.exception(msg)
        return

    oraw = raw
    if 'amazon.com.br' in url:
        raw = raw.decode('utf-8')  # amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag
    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
    if '<title>404 - ' in raw:
        log.error('URL malformed: %r'%url)
        return

    try:
        root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml',
                namespaceHTMLElements=False)
    except:
        msg = 'Failed to parse amazon details page: %r'%url
        log.exception(msg)
        return
    if domain == 'jp':
        for a in root.xpath('//a[@href]'):
            if 'black-curtain-redirect.html' in a.get('href'):
                url = 'http://amazon.co.jp'+a.get('href')
                log('Black curtain redirect found, following')
                return parse_details_page(url, log, timeout, browser, domain)

    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        msg = 'Failed to parse amazon details page: %r'%url
        msg += tostring(errmsg, method='text', encoding=unicode).strip()
        log.error(msg)
        return

    from css_selectors import Select
    selector = Select(root)
    return oraw, root, selector
Example #32
def parse_details_page(url, log, timeout, browser, domain):
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
    import html5lib
    from lxml.html import tostring
    try:
        raw = browser.open_novisit(url, timeout=timeout).read().strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and \
                e.getcode() == 404:
            log.error('URL malformed: %r'%url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Amazon timed out. Try again later.'
            log.error(msg)
        else:
            msg = 'Failed to make details query: %r'%url
            log.exception(msg)
        return

    oraw = raw
    if 'amazon.com.br' in url:
        raw = raw.decode('utf-8')  # amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag
    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
    if '<title>404 - ' in raw:
        log.error('URL malformed: %r'%url)
        return

    try:
        root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml',
                namespaceHTMLElements=False)
    except:
        msg = 'Failed to parse amazon details page: %r'%url
        log.exception(msg)
        return
    if domain == 'jp':
        for a in root.xpath('//a[@href]'):
            if 'black-curtain-redirect.html' in a.get('href'):
                url = 'http://amazon.co.jp'+a.get('href')
                log('Black curtain redirect found, following')
                return parse_details_page(url, log, timeout, browser, domain)

    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        msg = 'Failed to parse amazon details page: %r'%url
        msg += tostring(errmsg, method='text', encoding=unicode).strip()
        log.error(msg)
        return

    from css_selectors import Select
    selector = Select(root)
    return oraw, root, selector
Example #33
 def populate_article_metadata(self, article, soup, first):
     els = soup.findAll(name=['span', 'p'],
                        attrs={'class': ['flytitle-and-title__title', 'blog-post__rubric']})
     result = []
     for el in els[0:2]:
         if el is not None and el.contents:
             for descendant in el.contents:
                 if isinstance(descendant, NavigableString):
                     result.append(type(u'')(descendant))
     article.summary = u'. '.join(result) + u'.'
     article.text_summary = clean_ascii_chars(article.summary)
Example #34
 def replace_illegals(self):
     """
     """
     with open(self.__file, 'r') as read_obj:
         with open(self.__write_to, 'w') as write_obj:
             for line in read_obj:
                 write_obj.write(clean_ascii_chars(line))
     copy_obj = copy.Copy()
     if self.__copy:
         copy_obj.copy_file(self.__write_to, "replace_illegals.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
Example #35
 def download_short_story_list(self, url):
     query_short_stories = self.plugin.BASE_URL + url
     try:
         self.log('download page with short stories list %s'%query_short_stories)
         data = self.browser.open(query_short_stories, timeout=self.timeout).read().strip()
         parser = etree.XMLParser(recover=True)
         clean = clean_ascii_chars(data)
         xml = fromstring(clean,  parser=parser)
         return xml
     except Exception as e:
         self.log.exception('Failed to make download : %r'%query_short_stories)
         return None
Example #36
 def replace_illegals(self):
     """
     """
     with open_for_read(self.__file) as read_obj:
         with open_for_write(self.__write_to) as write_obj:
             for line in read_obj:
                 write_obj.write(clean_ascii_chars(line))
     copy_obj = copy.Copy()
     if self.__copy:
         copy_obj.copy_file(self.__write_to, "replace_illegals.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
Example #37
 def __init__(self, id, title, url, author, summary, published, content):
     from lxml import html
     self.downloaded = False
     self.id = id
     if not title or not isinstance(title, string_or_bytes):
         title = _('Unknown')
     title = force_unicode(title, 'utf-8')
     self._title = clean_xml_chars(title).strip()
     try:
         self._title = re.sub(r'&(\S+?);',
             entity_to_unicode, self._title)
     except:
         pass
     self._title = clean_ascii_chars(self._title)
     self.url = url
     self.author = author
     self.toc_thumbnail = None
     self.internal_toc_entries = ()
     if author and not isinstance(author, str):
         author = author.decode('utf-8', 'replace')
     if summary and not isinstance(summary, str):
         summary = summary.decode('utf-8', 'replace')
     summary = clean_xml_chars(summary) if summary else summary
     self.summary = summary
     if summary and '<' in summary:
         try:
             s = html.fragment_fromstring(summary, create_parent=True)
             summary = html.tostring(s, method='text', encoding='unicode')
         except:
             print('Failed to process article summary, deleting:')
             print(summary.encode('utf-8'))
             traceback.print_exc()
             summary = ''
     self.text_summary = clean_ascii_chars(summary)
     self.author = author
     self.content = content
     self.date = published
     self.utctime = dt_factory(self.date, assume_utc=True, as_utc=True)
     self.localtime = self.utctime.astimezone(local_tz)
     self._formatted_date = None
Example #38
    def identify(
            self,
            log,
            result_queue,
            abort,
            title=None,
            authors=None,  # {{{
            identifiers={},
            timeout=30):
        from lxml import etree
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.utils.cleantext import clean_ascii_chars

        XPath = partial(etree.XPath, namespaces=NAMESPACES)
        entry = XPath('//atom:entry')

        query = self.create_query(log,
                                  title=title,
                                  authors=authors,
                                  identifiers=identifiers)
        if not query:
            log.error('Insufficient metadata to construct query')
            return
        br = self.browser
        try:
            raw = br.open_novisit(query, timeout=timeout).read()
        except Exception as e:
            log.exception('Failed to make identify query: %r' % query)
            return as_unicode(e)
        try:
            parser = etree.XMLParser(recover=True, no_network=True)
            feed = etree.fromstring(xml_to_unicode(
                clean_ascii_chars(raw), strip_encoding_pats=True)[0],
                                    parser=parser)
            entries = entry(feed)
        except Exception as e:
            log.exception('Failed to parse identify results')
            return as_unicode(e)
        if not entries and identifiers and title and authors and \
                not abort.is_set():
            return self.identify(log,
                                 result_queue,
                                 abort,
                                 title=title,
                                 authors=authors,
                                 timeout=timeout)

        # There is no point running these queries in threads as douban
        # throttles requests returning 403 Forbidden errors
        self.get_all_details(br, log, entries, abort, result_queue, timeout)

        return None
Example #39
    def get_details(self):
        '''
        Fetch the book's details from its details page.
        '''
        from calibre.utils.cleantext import clean_ascii_chars
        from calibre.ebooks.chardet import xml_to_unicode
        import html5lib

        try:
            raw = self.browser.open_novisit(
                self.url, timeout=self.timeout).read().strip()
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                self.log.error('URL malformed: %r' % self.url)
                return
            attr = getattr(e, 'args', [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = '17k.com timed out. Try again later.'
                self.log.error(msg)
            else:
                msg = 'Failed to make details query: %r' % self.url
                self.log.exception(msg)
            return

        oraw = raw
        raw = xml_to_unicode(raw,
                             strip_encoding_pats=True,
                             resolve_entities=True)[0]
        if '<title>404 - ' in raw:
            self.log.error('URL malformed: %r' % self.url)
            return

        try:
            root = html5lib.parse(clean_ascii_chars(raw),
                                  treebuilder='lxml',
                                  namespaceHTMLElements=False)
        except:
            msg = 'Failed to parse 17k.com details page: %r' % self.url
            self.log.exception(msg)
            return

        errmsg = root.xpath('//*[@id="errorMessage"]')
        if errmsg:
            msg = 'Failed to parse 17k.com details page: %r' % self.url
            msg += self.tostring(errmsg, method='text',
                                 encoding=unicode).strip()
            self.log.error(msg)
            return

        self.parse_details(oraw, root)
Example #40
 def download_detail(self):
     query = self.plugin.BASE_URL + self.ident
     br = self.browser
     try:
         self.log('download page detail %s'%query)
         data = br.open(query, timeout=self.timeout).read().strip()
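          # Parse with a recovering XML parser since the downloaded page may not be well-formed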
         parser = etree.XMLParser(recover=True)
         clean = clean_ascii_chars(data)
         xml = fromstring(clean,  parser=parser)
         return xml
     except Exception as e:
         self.log.exception('Failed to make download : %r'%query)
         return None
Example #41
 def __init__(self, id, title, url, author, summary, published, content):
     from lxml import html
     self.downloaded = False
     self.id = id
     if not title or not isinstance(title, string_or_bytes):
         title = _('Unknown')
     title = force_unicode(title, 'utf-8')
     self._title = clean_xml_chars(title).strip()
     try:
         self._title = re.sub(r'&(\S+?);',
             entity_to_unicode, self._title)
     except:
         pass
     self._title = clean_ascii_chars(self._title)
     self.url = url
     self.author = author
     self.toc_thumbnail = None
     if author and not isinstance(author, unicode_type):
         author = author.decode('utf-8', 'replace')
     if summary and not isinstance(summary, unicode_type):
         summary = summary.decode('utf-8', 'replace')
     summary = clean_xml_chars(summary) if summary else summary
     self.summary = summary
     if summary and '<' in summary:
         try:
             s = html.fragment_fromstring(summary, create_parent=True)
             summary = html.tostring(s, method='text', encoding=unicode_type)
         except:
             print('Failed to process article summary, deleting:')
             print(summary.encode('utf-8'))
             traceback.print_exc()
             summary = u''
     self.text_summary = clean_ascii_chars(summary)
     self.author = author
     self.content = content
     self.date = published
     self.utctime = dt_factory(self.date, assume_utc=True, as_utc=True)
     self.localtime = self.utctime.astimezone(local_tz)
     self._formatted_date = None
Example #42
    def get_details(self):
        from calibre.utils.cleantext import clean_ascii_chars
        from calibre.ebooks.chardet import xml_to_unicode
        import html5lib

        try:
            raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                self.log.error('URL malformed: %r'%self.url)
                return
            attr = getattr(e, 'args', [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = 'Amazon timed out. Try again later.'
                self.log.error(msg)
            else:
                msg = 'Failed to make details query: %r'%self.url
                self.log.exception(msg)
            return

        oraw = raw
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
                resolve_entities=True)[0]
        if '<title>404 - ' in raw:
            self.log.error('URL malformed: %r'%self.url)
            return

        try:
            root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml',
                    namespaceHTMLElements=False)
        except:
            msg = 'Failed to parse amazon details page: %r'%self.url
            self.log.exception(msg)
            return
        if self.domain == 'jp':
            for a in root.xpath('//a[@href]'):
                if 'black-curtain-redirect.html' in a.get('href'):
                    self.url = 'http://amazon.co.jp'+a.get('href')
                    self.log('Black curtain redirect found, following')
                    return self.get_details()

        errmsg = root.xpath('//*[@id="errorMessage"]')
        if errmsg:
            msg = 'Failed to parse amazon details page: %r'%self.url
            msg += self.tostring(errmsg, method='text', encoding=unicode).strip()
            self.log.error(msg)
            return

        self.parse_details(oraw, root)
Exemplo n.º 43
0
    def get_details(self):
        try:
            self.log.info('Naver book url: %r'%self.url)
            raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                self.log.error('URL malformed: %r'%self.url)
                return
            attr = getattr(e, 'args', [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = 'Naver timed out. Try again later.'
                self.log.error(msg)
            else:
                msg = 'Failed to make details query: %r'%self.url
                self.log.exception(msg)
            return

        raw = raw.decode('utf-8', errors='replace')
        #open('c:\\naverbook.html', 'wb').write(raw)

        try:
            root = fromstring(clean_ascii_chars(raw))
        except:
            msg = 'Failed to parse Naver details page: %r'%self.url
            self.log.exception(msg)
            return

        try:
            # Look at the og:title of the page to make sure that we were actually
            # returned a details page for a book. If the user had specified an
            # invalid ISBN, then the results page will just do a textual search.
            title = root.xpath('//meta[@property="og:title"]/@content')
            if not title:
                self.log.error('Failed to find a book details title in page: %r'%self.url)
                return
        except:
            msg = 'Failed to read naverbook page title: %r'%self.url
            self.log.exception(msg)
            return

        errmsg = root.xpath('//*[@id="errorMessage"]')
        if errmsg:
            msg = 'Failed to parse naverbook details page: %r'%self.url
            msg += tostring(errmsg, method='text', encoding=unicode).strip()
            self.log.error(msg)
            return

        self.parse_details(root)
Exemplo n.º 44
0
 def download_moreinfo(self):
     query_more_info = '%shelpful/ajax/more_binfo.php?bid=%d'%(self.plugin.BASE_URL, self.number)
     try:
         self.log('download page moreinfo %s'%query_more_info)
         data = self.browser.open(query_more_info, timeout=self.timeout).read().strip()
          # fix - the ajax response is not valid XML, so wrap it in a root element
         data = '<html>%s</html>'%data
         parser = etree.XMLParser(recover=True)
         clean = clean_ascii_chars(data)
         xml = fromstring(clean,  parser=parser)
         return xml
     except Exception as e:
         self.log.exception('Failed to download more info page: %r'%query_more_info)
         return None
Exemplo n.º 45
0
 def download_detail(self):
     query = "%snew/?mainpage=pub&subpage=detail&id=%s"%(self.plugin.BASE_URL, self.ident)
     br = self.browser
     try:
         self.log('download page detail %s'%query)
         data = br.open(query, timeout=self.timeout).read().strip()
         parser = etree.HTMLParser(recover=True)
         clean = clean_ascii_chars(data)
         xml = fromstring(clean,  parser=parser)
         self.log.filelog(clean, "\\tmp\\worker-%s.html"%self.ident)
         return xml
     except Exception as e:
         self.log.exception('Failed to download detail page: %r'%query)
         return None
Exemplo n.º 46
0
    def get_editions(self):
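        # the url may carry a preferred edition year after a '#' fragment;
        # split it off before requesting the /vydani (editions) page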
        url_parts = self.url.split('#')
        if len(url_parts) == 2:
            base_url, edition_year = url_parts
        else:
            base_url = url_parts[0]
            edition_year = None
        url = '%s/vydani' % (base_url)
        try:
            self.log.info('Legie url: %r' % url)
            raw = self.browser.open_novisit(
                url, timeout=self.timeout).read().strip()
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                self.log.error('URL malformed: %r' % url)
                return
            attr = getattr(e, 'args', [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = 'Legie timed out. Try again later.'
                self.log.error(msg)
            else:
                msg = 'Failed to make details query: %r' % url
                self.log.exception(msg)
            return

        raw = raw.decode('utf-8', errors='replace')
        #open('E:\\t3.html', 'wb').write(raw)

        if '<title>404 - ' in raw:
            self.log.error('URL malformed: %r' % url)
            return

        try:
            root = fromstring(clean_ascii_chars(raw))
        except:
            msg = 'Failed to parse Legie details page: %r' % url
            self.log.exception(msg)
            return

        self.log.info('Trying to parse editions')
        try:
            editions = self.parse_editions(root, edition_year)
        except:
            self.log.exception('Failed to parse editions page')
            editions = []

        return editions
Exemplo n.º 47
0
    def download_detail(self):
        query = self.plugin.BASE_DETAIL_URL + self.ident
        br = self.browser
        try:
            self.log('download page detail %s'%query)
            data = br.open(query, timeout=self.timeout).read().strip()
            parser = etree.HTMLParser(recover=True)
            clean = clean_ascii_chars(data)
            self.log.filelog(clean, 'D:\\tmp\\file' + self.ident +'.html')
            xml = fromstring(clean, parser=parser)
#             for error in parser.error_log:
#                 self.log(error.message)
            return xml
        except Exception as e:
            self.log.exception('Failed to download detail page: %r'%query)
            return None
Exemplo n.º 48
0
 def fix_endings(self):
     # read
     with open(self.__file, 'r') as read_obj:
         input_file = read_obj.read()
     # calibre: convert win and mac line endings to unix
     input_file = input_file.replace('\r\n', '\n')
     input_file = input_file.replace('\r', '\n')
     # remove invalid ASCII control characters
     if self.__replace_illegals:
         input_file = clean_ascii_chars(input_file)
     # write (text mode, since input_file is a str here)
     with open(self.__write_to, 'w') as write_obj:
         write_obj.write(input_file)
     # copy
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         copy_obj.copy_file(self.__write_to, "line_endings.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
Exemplo n.º 49
0
 def fix_endings(self):
     # read
     with open(self.__file, 'rb') as read_obj:
         input_file = read_obj.read()
     # calibre: convert win and mac line endings to unix
     input_file = input_file.replace(b'\r\n', b'\n')
     input_file = input_file.replace(b'\r', b'\n')
     # remove invalid ASCII control characters
     if self.__replace_illegals:
         input_file = clean_ascii_chars(input_file)
     # write
     with open(self.__write_to, 'wb') as write_obj:
         write_obj.write(input_file)
     # copy
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         copy_obj.copy_file(self.__write_to, "line_endings.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
Exemplo n.º 50
0
    def download_detail(self):
        url = "%shtml/csv_txt_export_hledani.php?dotaz=%s,0"%(self.plugin.BASE_URL, self.ident)
        parameters = {
            "vystup":"csv",
            "oddelovac":"pipe",
            "rozsah":"vse",
            "odeslano":"true"
        }
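        # the export endpoint returns a pipe-delimited CSV in cp1250 encoding
        # (see the 'vystup'/'oddelovac' parameters and the decode below)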
        query = [url, parameters]

        try:
            self.log('download detail export %s'%query)
            data = urllib.urlencode(query[1])
            raw = self.browser.open(query[0],data,timeout=self.timeout).read().strip()
            clean = clean_ascii_chars(raw)
            return unicode(clean, 'cp1250')
        except Exception as e:
            self.log.exception('Failed to download detail export: %r'%query)
            return as_unicode(e)
Exemplo n.º 51
0
        def make_query(query):
            log('Making query:', query)
            try:
                raw = br.open_novisit(query, timeout=timeout).read()
            except Exception as e:
                log.exception('Failed to make identify query: %r' % query)
                return False, as_unicode(e)

            try:
                feed = etree.fromstring(
                    xml_to_unicode(clean_ascii_chars(raw),
                                   strip_encoding_pats=True)[0],
                    parser=etree.XMLParser(recover=True,
                                           no_network=True,
                                           resolve_entities=False))
                return True, entry(feed)
            except Exception as e:
                log.exception('Failed to parse identify results')
                return False, as_unicode(e)
Exemplo n.º 52
0
    def _get_details(self):
        try:
            print('Goodreads book url: %r'%self.url)
            br = browser()
            raw = br.open_novisit(self.url, timeout=self.timeout).read().strip()
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                print('URL malformed: %r'%self.url)
                return
            attr = getattr(e, 'args', [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = 'Goodreads timed out. Try again later.'
                print(msg)
            else:
                msg = 'Failed to make details query: %r'%self.url
                print(msg)
            return

        raw = raw.decode('utf-8', errors='replace')
        #open('E:\\t.html', 'wb').write(raw)

        if '<title>404 - ' in raw:
            print('URL malformed: %r'%self.url)
            return

        try:
            root = fromstring(clean_ascii_chars(raw))
        except:
            msg = 'Failed to parse goodreads details page: %r'%self.url
            print(msg)
            return

        errmsg = root.xpath('//*[@id="errorMessage"]')
        if errmsg:
            msg = 'Failed to parse goodreads details page: %r'%self.url
            msg += tostring(errmsg, method='text', encoding=unicode).strip()
            print(msg)
            return

        self._parse_page_count(root)
Exemplo n.º 53
0
    def run(self):
        if self.xml is None:
            raw = None
            url = None
            try:
                self.log([self.title, self.number])
                url = self.plugin.create_query(self.title, self.number)
                self.log('download page search %s'%url)
                raw = self.plugin.browser.open(url, timeout=self.timeout).read().strip()
            except Exception as e:
                self.log.exception('Failed to make identify query: %r'%url)
                return as_unicode(e)

            if raw is not None:
                try:
                    parser = etree.XMLParser(recover=True)
                    clean = clean_ascii_chars(raw)
                    self.xml = fromstring(clean, parser=parser)
                except Exception as e:
                    self.log.exception('Failed to parse xml for url: %s'%url)

        self.parse()
Exemplo n.º 54
0
	def get_details(self):
		try:
			raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
			raw = raw.decode('utf-8', errors='replace')
			if not raw:
				self.log.error('Failed to get raw result for query: %r'%self.url)
				return
		except Exception as e:
			if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
				self.log.error('URL malformed: %r'%self.url)
				return
			attr = getattr(e, 'args', [None])
			attr = attr if attr else [None]
			if isinstance(attr[0], socket.timeout):
				msg = 'Moly.hu timed out. Try again later.'
				self.log.error(msg)
			else:
				msg = 'Failed to make details query: %r'%self.url
				self.log.exception(msg)
			return

		root = fromstring(clean_ascii_chars(raw))
		self.parse_details(root)
Exemplo n.º 55
0
 def save_history(self):
     items = []
     ct = str(self.currentText())
     if ct:
         items.append(ct)
     for i in range(self.count()):
         item = str(self.itemText(i))
         if item not in items:
             items.append(item)
     self.blockSignals(True)
     self.clear()
     self.addItems(items)
     self.setEditText(ct)
     self.blockSignals(False)
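     # if history.set() rejects the items (ValueError), retry once with
     # control characters stripped via clean_ascii_chars()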
     try:
         history.set(self.store_name, items)
     except ValueError:
         from calibre.utils.cleantext import clean_ascii_chars
         items = [clean_ascii_chars(force_unicode(x)) for x in items]
         try:
             history.set(self.store_name, items)
         except ValueError:
             pass
Exemplo n.º 56
0
 def get_image_urls(self, title, author, log, abort, timeout):
     from calibre.utils.cleantext import clean_ascii_chars
     try:
         from urllib.parse import urlencode
     except ImportError:
         from urllib import urlencode
     from collections import OrderedDict
     ans = OrderedDict()
     br = self.browser
     q = urlencode({'as_q': ('%s %s' % (title, author)).encode('utf-8')})
     if isinstance(q, bytes):
         q = q.decode('utf-8')
     sz = self.prefs['size']
     if sz == 'any':
         sz = ''
     elif sz == 'l':
         sz = 'isz:l,'
     else:
         sz = 'isz:lt,islt:%s,' % sz
     # See https://www.google.com/advanced_image_search to understand this
     # URL scheme
     url = 'https://www.google.com/search?as_st=y&tbm=isch&{}&as_epq=&as_oq=&as_eq=&cr=&as_sitesearch=&safe=images&tbs={}iar:t,ift:jpg'.format(
         q, sz)
     log('Search URL: ' + url)
     raw = clean_ascii_chars(br.open(url).read().decode('utf-8'))
     root = parse_html(raw)
     results = root.xpath('//div/@data-tbnid')  # could also use data-id
     # from calibre.utils.ipython import ipython
     # ipython({'root': root, 'raw': raw, 'url': url, 'results': results})
     for tbnid in results:
         try:
             imgurl = imgurl_from_id(raw, tbnid)
         except Exception:
             continue
         if imgurl:
             ans[imgurl] = True
     return list(ans)
Exemplo n.º 57
0
			if isinstance(attr[0], socket.timeout):
				msg = 'ISFDB.org timed out. Try again later.'
				self.log.error(msg)
			else:
				msg = 'Failed to make details query: %r'%self.url
				self.log.exception(msg)
			return

		raw = raw.decode('cp1252', errors='replace')

		if '<title>404 - ' in raw:
			self.log.error('URL malformed: %r'%self.url)
			return

		try:
			root = fromstring(clean_ascii_chars(raw))
		except:
			msg = 'Failed to parse ISFDB details page: %r'%self.url
			self.log.exception(msg)
			return

		self.parse_details(root)

	def parse_details(self, root):
		isfdb_id = None
		title = None
		authors = []
		isbn = None
		publisher = None
		pubdate = None
		
Exemplo n.º 58
0
def to_metadata(browser, log, entry_, timeout):  # {{{
    from lxml import etree
    XPath = partial(etree.XPath, namespaces=NAMESPACES)
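    # the XPath helpers below are pre-bound to the feed namespace map
    # (atom, dc, openSearch and gd prefixes)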

    # total_results  = XPath('//openSearch:totalResults')
    # start_index    = XPath('//openSearch:startIndex')
    # items_per_page = XPath('//openSearch:itemsPerPage')
    entry          = XPath('//atom:entry')
    entry_id       = XPath('descendant::atom:id')
    creator        = XPath('descendant::dc:creator')
    identifier     = XPath('descendant::dc:identifier')
    title          = XPath('descendant::dc:title')
    date           = XPath('descendant::dc:date')
    publisher      = XPath('descendant::dc:publisher')
    subject        = XPath('descendant::dc:subject')
    description    = XPath('descendant::dc:description')
    language       = XPath('descendant::dc:language')
    rating         = XPath('descendant::gd:rating[@average]')

    def get_text(extra, x):
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except:
            log.exception('Programming error:')
        return None

    id_url = entry_id(entry_)[0].text
    google_id = id_url.split('/')[-1]
    title_ = ': '.join([x.text for x in title(entry_)]).strip()
    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]
    if not id_url or not title_:
        # Silently discard this entry
        return None

    mi = Metadata(title_, authors)
    mi.identifiers = {'google':google_id}
    try:
        raw = get_details(browser, id_url, timeout)
        feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
            strip_encoding_pats=True)[0])
        extra = entry(feed)[0]
    except:
        log.exception('Failed to get additional details for', mi.title)
        return mi

    mi.comments = get_text(extra, description)
    lang = canonicalize_lang(get_text(extra, language))
    if lang:
        mi.language = lang
    mi.publisher = get_text(extra, publisher)

    # ISBN
    isbns = []
    for x in identifier(extra):
        t = str(x.text).strip()
        if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'):
            if t[:5].upper() == 'ISBN:':
                t = check_isbn(t[5:])
                if t:
                    isbns.append(t)
    if isbns:
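        # prefer the longest ISBN so an ISBN-13 wins over an ISBN-10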
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags
    try:
        btags = [x.text for x in subject(extra) if x.text]
        tags = []
        for t in btags:
            atags = [y.strip() for y in t.split('/')]
            for tag in atags:
                if tag not in tags:
                    tags.append(tag)
    except:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate
    pubdate = get_text(extra, date)
    if pubdate:
        from calibre.utils.date import parse_date, utcnow
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            log.error('Failed to parse pubdate %r'%pubdate)

    # Ratings
    for x in rating(extra):
        try:
            mi.rating = float(x.get('average'))
            if mi.rating > 5:
                mi.rating /= 2
        except:
            log.exception('Failed to parse rating')

    # Cover
    mi.has_google_cover = None
    for x in extra.xpath(
            '//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'):
        mi.has_google_cover = x.get('href')
        break

    return mi
Exemplo n.º 59
0
def sanitize(s):
    return unicodedata.normalize(
        'NFC', clean_xml_chars(clean_ascii_chars(force_unicode(s or ''))))
Exemplo n.º 60
0
    def identify(self, log, result_queue, abort, title=None, authors=None,
            identifiers={}, timeout=30):
        '''
        Note this method will retry without identifiers automatically if no
        match is found with identifiers.
        '''
        matches = []

        # If we have a Legie id then we do not need to fire a "search".
        # Instead we will go straight to the URL for that book.
        legie_id = identifiers.get('legie', None)
        br = self.browser
        if legie_id:
            matches.append('%s/kniha/%s'%(Legie.BASE_URL, legie_id))
        else:
            query = self.create_title_query(log, title=title)
            if query is None:
                log.error('Insufficient metadata to construct query')
                return
            try:
                log.info('Querying: %s'%query)
                response = br.open_novisit(query, timeout=timeout)
                raw = response.read()
                redirected = response.geturl()
            except Exception as e:
                err = 'Failed to make identify query: %r'%query
                log.exception(err)
                return as_unicode(e)
            root = fromstring(clean_ascii_chars(raw))
            # Now grab the match from the search result, provided the
            # title appears to be for the same book
            if redirected == query:
                log.info('No direct link for book, need to parse the search results page')
                self._parse_search_results(log, title, root, matches, timeout, query)
            else:
                matches.append(redirected)

        if abort.is_set():
            return

        if not matches:
            log.error('No matches found with query: %r'%query)
            return

        from calibre_plugins.legie.worker import Worker
        author_tokens = list(self.get_author_tokens(authors))
        workers = [Worker(url, author_tokens, result_queue, br, log, i, self) for i, url in
                enumerate(matches)]

        for w in workers:
            w.start()
            # Don't send all requests at the same time
            time.sleep(0.1)

        while not abort.is_set():
            a_worker_is_alive = False
            for w in workers:
                w.join(0.2)
                if abort.is_set():
                    break
                if w.is_alive():
                    a_worker_is_alive = True
            if not a_worker_is_alive:
                break

        return None