def connect(self):
    if not self.is_configured():
        return False
    br = self.browser
    try:
        url = "%snew/" % self.BASE_URL
        clean = clean_ascii_chars(br.open(url).read().strip())
        parser = etree.HTMLParser(recover=True)
        feed = fromstring(clean, parser=parser)
        formUrl = feed.xpath('//form[@id="form"]/@action')
        self.log('formUrl %s' % formUrl[0])
        url = self.BASE_URL + formUrl[0]
        parameters = {
            "sendform": "1",
            "login_name": self.prefs['login'],
            "login_password": self.prefs['password']
        }
        data = urllib.urlencode(parameters)
        self.log(url)
        self.log(data)
        clean = clean_ascii_chars(br.open(url, data).read().strip())
        parser = etree.HTMLParser(recover=True)
        feed = fromstring(clean, parser=parser)
        self.log(clean)
        return len(feed.xpath('//input[@id="login_name"]/@name')) == 0
    except Exception as e:
        self.log.exception(e)
        return False
def get_details(self):
    try:
        self.log.info('YES24 url: %r' % self.url)
        raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            self.log.error('URL malformed: %r' % self.url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'YES24 timed out. Try again later.'
            self.log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % self.url
            self.log.exception(msg)
        return

    raw = raw.decode('euc-kr', errors='replace')
    # open('P:\\yes24.html', 'wb').write(raw)
    if 'HTTP 404.' in raw:
        self.log.error('URL malformed: %r' % self.url)
        return

    try:
        root = fromstring(clean_ascii_chars(raw))
    except:
        msg = 'Failed to parse YES24 details page: %r' % self.url
        self.log.exception(msg)
        return

    self.parse_details(root)
def convert(self, oeb_book, output_path, input_plugin, opts, log):
    from calibre.ebooks.txt.txtml import TXTMLizer
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.txt.newlines import specified_newlines, TxtNewlines

    if opts.txt_output_formatting.lower() == 'markdown':
        from calibre.ebooks.txt.markdownml import MarkdownMLizer
        self.writer = MarkdownMLizer(log)
    elif opts.txt_output_formatting.lower() == 'textile':
        from calibre.ebooks.txt.textileml import TextileMLizer
        self.writer = TextileMLizer(log)
    else:
        self.writer = TXTMLizer(log)

    txt = self.writer.extract_content(oeb_book, opts)
    txt = clean_ascii_chars(txt)

    log.debug('\tReplacing newlines with selected type...')
    txt = specified_newlines(TxtNewlines(opts.newline).newline, txt)

    close = False
    if not hasattr(output_path, 'write'):
        close = True
        if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '':
            os.makedirs(os.path.dirname(output_path))
        out_stream = open(output_path, 'wb')
    else:
        out_stream = output_path

    out_stream.seek(0)
    out_stream.truncate()
    out_stream.write(txt.encode(opts.txt_output_encoding, 'replace'))

    if close:
        out_stream.close()
def get_image_urls(self, title, author, log, abort, timeout):
    from calibre.utils.cleantext import clean_ascii_chars
    from urllib import urlencode
    import html5lib
    import json
    from collections import OrderedDict

    ans = OrderedDict()
    br = self.browser
    q = urlencode({'as_q': ('%s %s' % (title, author)).encode('utf-8')}).decode('utf-8')
    sz = self.prefs['size']
    if sz == 'any':
        sz = ''
    elif sz == 'l':
        sz = 'isz:l,'
    else:
        sz = 'isz:lt,islt:%s,' % sz
    # See https://www.google.com/advanced_image_search to understand this
    # URL scheme
    url = 'https://www.google.com/search?as_st=y&tbm=isch&{}&as_epq=&as_oq=&as_eq=&cr=&as_sitesearch=&safe=images&tbs={}iar:t,ift:jpg'.format(q, sz)
    log('Search URL: ' + url)
    raw = br.open(url).read().decode('utf-8')
    root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml',
                          namespaceHTMLElements=False)
    for div in root.xpath('//div[@class="rg_meta"]'):
        try:
            data = json.loads(div.text)
        except Exception:
            continue
        if 'ou' in data:
            ans[data['ou']] = True
    return list(ans.iterkeys())
def run(self):
    if self.xml is None:
        raw = None
        url = None
        try:
            url = self.plugin.create_query(self.title, self.number)
            self.log('download page search %s' % url)
            raw = self.plugin.browser.open(url, timeout=self.timeout).read().strip()
        except Exception as e:
            self.log.exception('Failed to make identify query: %r' % url)
            return as_unicode(e)
        if raw is not None:
            try:
                parser = etree.XMLParser(recover=True)
                clean = clean_ascii_chars(raw)
                clean = re.sub("<br>", "<br/>", clean)
                clean = re.sub("&nbsp;", " ", clean)
                clean = re.sub("&hellip;", "...", clean)
                self.xml = fromstring(clean, parser=parser)
                if len(parser.error_log) > 0:  # some errors while parsing
                    self.log('while parsing page occurs some errors:')
                    self.log(parser.error_log)
            except Exception as e:
                self.log.exception('Failed to parse xml for url: %s' % url)
    self.parse()
def get_details(self):
    try:
        self.log.info('Legie url: %r' % self.url)
        raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            self.log.error('URL malformed: %r' % self.url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Legie timed out. Try again later.'
            self.log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % self.url
            self.log.exception(msg)
        return

    raw = raw.decode('utf-8', errors='replace')
    # open('E:\\t3.html', 'wb').write(raw)
    if '<title>404 - ' in raw:
        self.log.error('URL malformed: %r' % self.url)
        return

    try:
        root = fromstring(clean_ascii_chars(raw))
    except:
        msg = 'Failed to parse Legie details page: %r' % self.url
        self.log.exception(msg)
        return

    self.parse_details(root)
def parse_html(raw):
    import html5lib
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.cleantext import clean_ascii_chars

    raw = clean_ascii_chars(xml_to_unicode(raw, strip_encoding_pats=True,
                                           resolve_entities=True, assume_utf8=True)[0])
    return html5lib.parse(raw, treebuilder="lxml",
                          namespaceHTMLElements=False).getroot()
def clean_txt(txt):
    '''
    Run transformations on the text to put it into
    consistent state.
    '''
    if isbytestring(txt):
        txt = txt.decode('utf-8', 'replace')
    # Strip whitespace from the end of the line. Also replace
    # all line breaks with \n.
    txt = '\n'.join([line.rstrip() for line in txt.splitlines()])

    # Replace whitespace at the beginning of the line with &nbsp;
    txt = re.sub('(?m)(?<=^)([ ]{2,}|\t+)(?=.)', '&nbsp;' * 4, txt)

    # Condense redundant spaces
    txt = re.sub('[ ]{2,}', ' ', txt)

    # Remove blank space from the beginning and end of the document.
    txt = re.sub(r'^\s+(?=.)', '', txt)
    txt = re.sub(r'(?<=.)\s+$', '', txt)

    # Remove excessive line breaks.
    txt = re.sub('\n{5,}', '\n\n\n\n', txt)

    # remove ASCII invalid chars : 0 to 8 and 11-14 to 24
    txt = clean_ascii_chars(txt)

    return txt
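# Illustrative usage sketch (not from the original sources). It assumes the
# clean_txt() function above is importable together with its helpers (re,
# isbytestring, clean_ascii_chars) from the surrounding calibre module.
if __name__ == '__main__':
    raw = b'Line one\x00\r\nLine two   has   extra spaces   \t\r\n\r\n\r\n\r\n\r\n\r\nEnd'
    cleaned = clean_txt(raw)
    assert '\x00' not in cleaned        # invalid control characters are stripped
    assert '\n\n\n\n\n' not in cleaned  # runs of 5+ newlines are capped at 4
    print(cleaned)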
def _parse_editions_for_book(self, log, editions_url, matches, timeout, title_tokens):

    def ismatch(title):
        title = lower(title)
        match = not title_tokens
        for t in title_tokens:
            if lower(t) in title:
                match = True
                break
        return match

    br = self.browser
    try:
        raw = br.open_novisit(editions_url, timeout=timeout).read().strip()
    except Exception as e:
        err = 'Failed identify editions query: %r' % editions_url
        log.exception(err)
        return as_unicode(e)
    try:
        raw = raw.decode('utf-8', errors='replace')
        if not raw:
            log.error('Failed to get raw result for query: %r' % editions_url)
            return
        # open('E:\\s.html', 'wb').write(raw)
        root = fromstring(clean_ascii_chars(raw))
    except:
        msg = 'Failed to parse CBDB page for query: %r' % editions_url
        log.exception(msg)
        return msg

    first_non_valid = None
    for div_link in root.xpath('//div[@class="editionData"]/div[1]/a[@class="bookTitle"]'):
        title = tostring(div_link, 'text').strip().lower()
        if title:
            # Verify it is not an audio edition
            valid_title = True
            for exclusion in ['(audio cd)', '(compact disc)', '(audio cassette)']:
                if exclusion in title:
                    log.info('Skipping audio edition: %s' % title)
                    valid_title = False
                    if first_non_valid is None:
                        first_non_valid = BASE_URL + div_link.get('href')
                    break
            if valid_title:
                # Verify it is not a foreign language edition
                if not ismatch(title):
                    log.info('Skipping alternate title:', title)
                    continue
                matches.append(BASE_URL + div_link.get('href'))
                if len(matches) >= CBDB.MAX_EDITIONS:
                    return
    if len(matches) == 0 and first_non_valid:
        # We have found only audio editions. In which case return the first match
        # rather than tell the user there are no matches.
        log.info('Choosing the first audio edition as no others found.')
        matches.append(first_non_valid)
def get_image_urls(self, title, author, log, abort, timeout):
    from calibre.utils.cleantext import clean_ascii_chars
    from urllib import urlencode
    import html5lib
    import json
    from collections import OrderedDict

    ans = OrderedDict()
    br = self.browser
    q = urlencode({'as_q': ('%s %s' % (title, author)).encode('utf-8')}).decode('utf-8')
    sz = self.prefs['size']
    if sz == 'any':
        sz = ''
    elif sz == 'l':
        sz = 'isz:l,'
    else:
        sz = 'isz:lt,islt:%s,' % sz
    # See https://www.google.com/advanced_image_search to understand this
    # URL scheme
    url = 'https://www.google.com/search?as_st=y&tbm=isch&{}&as_epq=&as_oq=&as_eq=&cr=&as_sitesearch=&safe=images&tbs={}iar:t,ift:jpg'.format(q, sz)
    log('Search URL: ' + url)
    raw = br.open(url).read().decode('utf-8')
    root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml',
                          namespaceHTMLElements=False)
    for div in root.xpath('//div[@class="rg_meta"]'):
        try:
            data = json.loads(div.text)
        except Exception:
            continue
        if 'ou' in data:
            ans[data['ou']] = True
    return list(ans.iterkeys())
def root_from_url(cls, browser, url, timeout, log):
    log.info('Fetching: %s' % url)
    response = browser.open_novisit(url, timeout=timeout)
    raw = response.read()
    parser = XMLParser(recover=True, no_network=True)
    return fromstring(xml_to_unicode(clean_ascii_chars(raw),
                                     strip_encoding_pats=True)[0], parser=parser)
def key_press_event(self, ev, which=0):
    code = ev.key()
    if self.capture == 0 or code in (0, Qt.Key_unknown,
            Qt.Key_Shift, Qt.Key_Control, Qt.Key_Alt, Qt.Key_Meta,
            Qt.Key_AltGr, Qt.Key_CapsLock, Qt.Key_NumLock, Qt.Key_ScrollLock):
        return QWidget.keyPressEvent(self, ev)
    button = getattr(self, 'button%d' % which)
    button.setStyleSheet('QPushButton { font-weight: normal}')
    mods = int(ev.modifiers()) & ~Qt.KeypadModifier
    # for some reason qt sometimes produces ascii control codes in text,
    # for example ctrl+shift+u will give text == '\x15' on linux
    txt = clean_ascii_chars(ev.text())
    if txt and txt.lower() == txt.upper():
        # We have a symbol like ! or > etc. In this case the value of code
        # already includes Shift, so remove it
        mods &= ~Qt.ShiftModifier
    sequence = QKeySequence(code | mods)
    button.setText(sequence.toString(QKeySequence.NativeText))
    self.capture = 0
    dup_desc = self.dup_check(sequence)
    if dup_desc is not None:
        error_dialog(self, _('Already assigned'),
                unicode(sequence.toString(QKeySequence.NativeText)) + ' ' +
                _('already assigned to') + ' ' + dup_desc, show=True)
        self.clear_clicked(which=which)
def make_query(self, q, abort, title=None, authors=None, identifiers={},
               max_pages=10, timeout=30):
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.cleantext import clean_ascii_chars

    page_num = 1
    parser = etree.XMLParser(recover=True, no_network=True)
    br = self.browser

    seen = set()
    candidates = []
    total_found = 0
    while page_num <= max_pages and not abort.is_set():
        url = q.replace('&page_number=1&', '&page_number=%d&' % page_num)
        page_num += 1
        raw = br.open_novisit(url, timeout=timeout).read()
        feed = etree.fromstring(
            xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
            parser=parser)
        total, found, results = self.parse_feed(feed, seen, title, authors, identifiers)
        total_found += found
        candidates += results
        if total_found >= total or len(candidates) > 9:
            break

    return candidates
def run(self):
    if self.xml is None:
        raw = None
        url = None
        try:
            url = self.plugin.create_query(self.title, self.authors, self.number)
            self.log('download page search %s' % url)
            raw = self.plugin.browser.open(url, timeout=self.timeout).read().strip()
        except Exception as e:
            self.log.exception('Failed to make identify query: %r' % url)
            return as_unicode(e)
        if raw is not None:
            try:
                parser = etree.HTMLParser()
                clean = clean_ascii_chars(raw)
                self.xml = fromstring(clean, parser=parser)
                # if len(parser.error_log) > 0:  # some errors while parsing
                #     self.log('while parsing page occurs some errors:')
                #     self.log(parser.error_log)
            except Exception as e:
                self.log.exception('Failed to parse xml for url: %s' % url)
    self.parse()
def download_parse(self, query, timeout):
    # self.downloads_count += 1
    # number = self.downloads_count
    br = self.browser
    try:
        self.log('download page search %s' % query)
        data = urllib.urlencode(query[1])
        raw = br.open(query[0], data, timeout=timeout).read().strip()
    except Exception as e:
        self.log.exception('Failed to make identify query: %r' % query)
        return as_unicode(e)

    try:
        parser = etree.HTMLParser(recover=True)
        clean = clean_ascii_chars(raw)
        # self.log.filelog(clean, "\\tmp\\test.html")
        feed = fromstring(clean, parser=parser)
        # if len(parser.error_log) > 0:  # some errors while parsing
        #     self.log('while parsing page occurs some errors:')
        #     self.log(parser.error_log)
        return feed
    except Exception as e:
        self.log.exception('Failed to parse identify results')
        return as_unicode(e)
def make_query(self, q, abort, title=None, authors=None, identifiers={},
               max_pages=10, timeout=30):
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.cleantext import clean_ascii_chars

    page_num = 1
    parser = etree.XMLParser(recover=True, no_network=True)
    br = self.browser

    seen = set()
    candidates = []
    total_found = 0
    while page_num <= max_pages and not abort.is_set():
        url = q.replace('&page_number=1&', '&page_number=%d&' % page_num)
        page_num += 1
        raw = br.open_novisit(url, timeout=timeout).read()
        feed = etree.fromstring(
            xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
            parser=parser)
        total, found, results = self.parse_feed(feed, seen, title, authors, identifiers)
        total_found += found
        candidates += results
        if total_found >= total or len(candidates) > 9:
            break

    return candidates
def clean_txt(txt):
    '''
    Run transformations on the text to put it into
    consistent state.
    '''
    if isbytestring(txt):
        txt = txt.decode('utf-8', 'replace')
    # Strip whitespace from the end of the line. Also replace
    # all line breaks with \n.
    txt = '\n'.join([line.rstrip() for line in txt.splitlines()])

    # Replace whitespace at the beginning of the line with &nbsp;
    txt = re.sub('(?m)(?<=^)([ ]{2,}|\t+)(?=.)', '&nbsp;' * 4, txt)

    # Condense redundant spaces
    txt = re.sub('[ ]{2,}', ' ', txt)

    # Remove blank space from the beginning and end of the document.
    txt = re.sub('^\s+(?=.)', '', txt)
    txt = re.sub('(?<=.)\s+$', '', txt)

    # Remove excessive line breaks.
    txt = re.sub('\n{5,}', '\n\n\n\n', txt)

    # remove ASCII invalid chars : 0 to 8 and 11-14 to 24
    txt = clean_ascii_chars(txt)

    return txt
def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{
             identifiers={}, timeout=30):
    from lxml import etree
    entry = XPath('//atom:entry')

    query = self.create_query(log, title=title, authors=authors,
                              identifiers=identifiers)
    if not query:
        log.error('Insufficient metadata to construct query')
        return
    br = self.browser
    log('Making query:', query)
    try:
        raw = br.open_novisit(query, timeout=timeout).read()
    except Exception as e:
        log.exception('Failed to make identify query: %r' % query)
        return as_unicode(e)
    try:
        parser = etree.XMLParser(recover=True, no_network=True)
        feed = etree.fromstring(
            xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
            parser=parser)
        entries = entry(feed)
    except Exception as e:
        log.exception('Failed to parse identify results')
        return as_unicode(e)
    if not entries and title and not abort.is_set():
        if identifiers:
            log('No results found, retrying without identifiers')
            return self.identify(log, result_queue, abort, title=title,
                                 authors=authors, timeout=timeout)
        ntitle = cleanup_title(title)
        if ntitle and ntitle != title:
            log('No results found, retrying without sub-title')
            return self.identify(log, result_queue, abort, title=ntitle,
                                 authors=authors, timeout=timeout)

    # There is no point running these queries in threads as google
    # throttles requests returning 403 Forbidden errors
    self.get_all_details(br, log, entries, abort, result_queue, timeout)
def get_details(self):
    try:
        self.log.info('Shelfari book url: %r' % self.url)
        raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            self.log.error('URL malformed: %r' % self.url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Shelfari timed out. Try again later.'
            self.log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % self.url
            self.log.exception(msg)
        return

    raw = raw.decode('utf-8', errors='replace')
    # open('c:\\shelfari.html', 'wb').write(raw)
    if '<title>404 - ' in raw:
        self.log.error('URL malformed: %r' % self.url)
        return
    try:
        root = fromstring(clean_ascii_chars(raw))
    except:
        msg = 'Failed to parse shelfari details page: %r' % self.url
        self.log.exception(msg)
        return

    try:
        # Look at the <title> attribute for page to make sure that we were actually returned
        # a details page for a book. If the user had specified an invalid ISBN, then the results
        # page will just do a textual search.
        title_node = root.xpath('//title')
        if title_node:
            page_title = title_node[0].text_content().strip()
            if page_title is None:
                self.log.error('Failed to see search results in page title: %r' % self.url)
                return
    except:
        msg = 'Failed to read shelfari page title: %r' % self.url
        self.log.exception(msg)
        return

    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        msg = 'Failed to parse shelfari details page: %r' % self.url
        msg += tostring(errmsg, method='text', encoding=unicode).strip()
        self.log.error(msg)
        return

    self.parse_details(root)
def get_details(self):
    try:
        raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            self.log.error('URL malformed: %r' % self.url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Kyobobook timed out. Try again later.'
            self.log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % self.url
            self.log.exception(msg)
        return

    # open('c:\\Kyobobook1.html', 'wb').write(raw)
    # raw = raw.decode('utf-8', errors='replace') #00
    # open('c:\\Kyobobook2.html', 'wb').write(raw)
    # if '<title>404 - ' in raw:
    #     self.log.error('URL malformed: %r' % self.url)
    #     return

    try:
        root = fromstring(clean_ascii_chars(raw))
    except:
        msg = 'Failed to parse Kyobobook details page: %r' % self.url
        self.log.exception(msg)
        return

    try:
        # Look at the <title> attribute for page to make sure that we were actually returned
        # a details page for a book. If the user had specified an invalid ISBN, then the results
        # page will just do a textual search.
        title_node = root.xpath('//title')
        if title_node:
            page_title = title_node[0].text_content().strip()
            # search success : "나의 문화유산답사기 1 - 인터넷교보문고"
            # search fail    : " - 인터넷교보문고"
            if page_title is None or page_title == " - 인터넷교보문고":
                self.log.error('Failed to see search results in page title: %r' % self.url)
                return
    except:
        msg = 'Failed to read Kyobobook page title: %r' % self.url
        self.log.exception(msg)
        return

    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        msg = 'Failed to parse Kyobobook details page: %r' % self.url
        msg += tostring(errmsg, method='text', encoding=unicode).strip()
        self.log.error(msg)
        return

    self.parse_details(root)
def clean_html(raw):
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.cleantext import clean_ascii_chars
    return clean_ascii_chars(xml_to_unicode(raw, strip_encoding_pats=True,
                                            resolve_entities=True, assume_utf8=True)[0])
def parse_html(raw):
    import html5lib
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.cleantext import clean_ascii_chars

    raw = clean_ascii_chars(xml_to_unicode(raw, strip_encoding_pats=True,
                                           resolve_entities=True, assume_utf8=True)[0])
    return html5lib.parse(raw, treebuilder='lxml',
                          namespaceHTMLElements=False).getroot()
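# Illustrative usage sketch (not from the original sources); assumes the
# parse_html() helper above and a working calibre + html5lib installation.
if __name__ == '__main__':
    markup = b'<html><body><p id="x">Hello &amp;\x02 world</p></body></html>'
    root = parse_html(markup)                 # lxml root element
    print(root.xpath('//p[@id="x"]/text()'))  # entity resolved, \x02 removed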
def fetch_raw(self, log, url, br, testing,  # {{{
              identifiers={}, timeout=30):
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
    from lxml.html import tostring
    import html5lib

    try:
        raw = br.open_novisit(url, timeout=timeout).read().decode('gb18030').strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            log.error('Query malformed: %r' % url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = _('DangDang timed out. Try again later.')
            log.error(msg)
        else:
            msg = 'Failed to make identify query: %r' % url
            log.exception(msg)
        return as_unicode(msg)

    raw = clean_ascii_chars(
        xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0])

    if testing:
        import tempfile
        with tempfile.NamedTemporaryFile(prefix='dangdang_results_',
                                         suffix='.html', delete=False) as f:
            f.write(raw.encode('utf-8'))
        print('Downloaded html for results page saved in', f.name)

    matches = []
    found = '<title>对不起,您要访问的页面暂时没有找到' not in raw
    if found:
        try:
            root = html5lib.parse(raw, treebuilder='lxml',
                                  namespaceHTMLElements=False)
        except:
            msg = 'Failed to parse DangDang page for query: %r' % url
            log.exception(msg)
            return msg

    return found, root
def _parse_editions_for_book(self, log, editions_url, matches, timeout, title_tokens):

    def ismatch(title):
        title = lower(title)
        match = not title_tokens
        for t in title_tokens:
            if lower(t) in title:
                match = True
                break
        return match

    br = self.browser
    try:
        raw = br.open_novisit(editions_url, timeout=timeout).read().strip()
    except Exception as e:
        err = 'Failed identify editions query: %r' % editions_url
        log.exception(err)
        return as_unicode(e)
    try:
        raw = raw.decode('utf-8', errors='replace')
        if not raw:
            log.error('Failed to get raw result for query: %r' % editions_url)
            return
        # open('E:\\s.html', 'wb').write(raw)
        root = fromstring(clean_ascii_chars(raw))
    except:
        msg = 'Failed to parse goodreads page for query: %r' % editions_url
        log.exception(msg)
        return msg

    first_non_valid = None
    for div_link in root.xpath('//div[@class="editionData"]/div[1]/a[@class="bookTitle"]'):
        title = tostring(div_link, 'text').strip().lower()
        if title:
            # Verify it is not an audio edition
            valid_title = True
            for exclusion in ['(audio cd)', '(compact disc)', '(audio cassette)']:
                if exclusion in title:
                    log.info('Skipping audio edition: %s' % title)
                    valid_title = False
                    if first_non_valid is None:
                        first_non_valid = Goodreads.BASE_URL + div_link.get('href')
                    break
            if valid_title:
                # Verify it is not a foreign language edition
                if not ismatch(title):
                    log.info('Skipping alternate title:', title)
                    continue
                matches.append(Goodreads.BASE_URL + div_link.get('href'))
                if len(matches) >= Goodreads.MAX_EDITIONS:
                    return
    if len(matches) == 0 and first_non_valid:
        # We have found only audio editions. In which case return the first match
        # rather than tell the user there are no matches.
        log.info('Choosing the first audio edition as no others found.')
        matches.append(first_non_valid)
def get_details(self): try: raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip() except Exception as e: if callable(getattr(e, "getcode", None)) and e.getcode() == 404: self.log.error("URL malformed: %r" % self.url) return attr = getattr(e, "args", [None]) attr = attr if attr else [None] if isinstance(attr[0], socket.timeout): msg = "Aladin timed out. Try again later." self.log.error(msg) else: msg = "Failed to make details query: %r" % self.url self.log.exception(msg) return # raw = raw.decode('utf-8', errors='replace') #00 # if '<title>404 - ' in raw: # self.log.error('URL malformed: %r'%self.url) # return try: root = fromstring(clean_ascii_chars(raw)) except: msg = "Failed to parse aladin details page: %r" % self.url self.log.exception(msg) return try: # Look at the <title> attribute for page to make sure that we were actually returned # a details page for a book. If the user had specified an invalid ISBN, then the results # page will just do a textual search. title_node = root.xpath("//title") if title_node: page_title = title_node[0].text_content().strip() # search success : '[알라딘]나의 문화유산답사기 1 - 남도답사 일번지, 개정판' # search fail : '[알라딘] "좋은 책을 고르는 방법, 알라딘"' if page_title is None or page_title.find("좋은 책을 고르는 방법, 알라딘") > -1: self.log.error("Failed to see search results in page title: %r" % self.url) return except: msg = "Failed to read aladin page title: %r" % self.url self.log.exception(msg) return errmsg = root.xpath('//*[@id="errorMessage"]') if errmsg: msg = "Failed to parse aladin details page: %r" % self.url msg += tostring(errmsg, method="text", encoding=unicode).strip() self.log.error(msg) return self.parse_details(root)
def convert_new(self, stream, accelerators):
    from calibre.ebooks.pdf.pdftohtml import pdftohtml
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.pdf.reflow import PDFDocument

    pdftohtml(os.getcwdu(), stream.name, self.opts.no_images, as_xml=True)
    with open(u'index.xml', 'rb') as f:
        xml = clean_ascii_chars(f.read())
    PDFDocument(xml, self.opts, self.log)
    return os.path.join(os.getcwdu(), u'metadata.opf')
def convert_new(self, stream, accelerators):
    from calibre.ebooks.pdf.pdftohtml import pdftohtml
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.pdf.reflow import PDFDocument

    pdftohtml(os.getcwd(), stream.name, self.opts.no_images, as_xml=True)
    with lopen('index.xml', 'rb') as f:
        xml = clean_ascii_chars(f.read())
    PDFDocument(xml, self.opts, self.log)
    return os.path.join(os.getcwd(), 'metadata.opf')
def get_details(self): try: self.log.info("Goodreads book url: %r" % self.url) raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip() except Exception as e: if callable(getattr(e, "getcode", None)) and e.getcode() == 404: self.log.error("URL malformed: %r" % self.url) return attr = getattr(e, "args", [None]) attr = attr if attr else [None] if isinstance(attr[0], socket.timeout): msg = "Goodreads timed out. Try again later." self.log.error(msg) else: msg = "Failed to make details query: %r" % self.url self.log.exception(msg) return raw = raw.decode("utf-8", errors="replace") # open('c:\\goodreads.html', 'wb').write(raw) if "<title>404 - " in raw: self.log.error("URL malformed: %r" % self.url) return try: root = fromstring(clean_ascii_chars(raw)) except: msg = "Failed to parse goodreads details page: %r" % self.url self.log.exception(msg) return try: # Look at the <title> attribute for page to make sure that we were actually returned # a details page for a book. If the user had specified an invalid ISBN, then the results # page will just do a textual search. title_node = root.xpath("//title") if title_node: page_title = title_node[0].text_content().strip() if page_title is None or page_title.find("search results for") != -1: self.log.error("Failed to see search results in page title: %r" % self.url) return except: msg = "Failed to read goodreads page title: %r" % self.url self.log.exception(msg) return errmsg = root.xpath('//*[@id="errorMessage"]') if errmsg: msg = "Failed to parse goodreads details page: %r" % self.url msg += tostring(errmsg, method="text", encoding=unicode).strip() self.log.error(msg) return self.parse_details(root)
def get_details(self):
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
    import html5lib

    try:
        raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            self.log.error('URL malformed: %r' % self.url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Amazon timed out. Try again later.'
            self.log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % self.url
            self.log.exception(msg)
        return

    oraw = raw
    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
    if '<title>404 - ' in raw:
        self.log.error('URL malformed: %r' % self.url)
        return
    try:
        root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml',
                              namespaceHTMLElements=False)
    except:
        msg = 'Failed to parse amazon details page: %r' % self.url
        self.log.exception(msg)
        return
    if self.domain == 'jp':
        for a in root.xpath('//a[@href]'):
            if 'black-curtain-redirect.html' in a.get('href'):
                self.url = 'http://amazon.co.jp' + a.get('href')
                self.log('Black curtain redirect found, following')
                return self.get_details()

    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        msg = 'Failed to parse amazon details page: %r' % self.url
        msg += self.tostring(errmsg, method='text', encoding=unicode).strip()
        self.log.error(msg)
        return

    self.parse_details(oraw, root)
def parse_details_page(url, log, timeout, browser, domain):
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
    import html5lib
    from lxml.html import tostring

    try:
        raw = browser.open_novisit(url, timeout=timeout).read().strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            log.error('URL malformed: %r' % url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Amazon timed out. Try again later.'
            log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % url
            log.exception(msg)
        return

    oraw = raw
    if 'amazon.com.br' in url:
        raw = raw.decode('utf-8')  # amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag
    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
    if '<title>404 - ' in raw:
        log.error('URL malformed: %r' % url)
        return
    try:
        root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml',
                              namespaceHTMLElements=False)
    except:
        msg = 'Failed to parse amazon details page: %r' % url
        log.exception(msg)
        return
    if domain == 'jp':
        for a in root.xpath('//a[@href]'):
            if 'black-curtain-redirect.html' in a.get('href'):
                url = 'http://amazon.co.jp' + a.get('href')
                log('Black curtain redirect found, following')
                return parse_details_page(url, log, timeout, browser, domain)

    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        msg = 'Failed to parse amazon details page: %r' % url
        msg += tostring(errmsg, method='text', encoding=unicode).strip()
        log.error(msg)
        return

    from css_selectors import Select
    selector = Select(root)
    return oraw, root, selector
def populate_article_metadata(self, article, soup, first):
    els = soup.findAll(name=['span', 'p'],
                       attrs={'class': ['flytitle-and-title__title', 'blog-post__rubric']})
    result = []
    for el in els[0:2]:
        if el is not None and el.contents:
            for descendant in el.contents:
                if isinstance(descendant, NavigableString):
                    result.append(type(u'')(descendant))
    article.summary = u'. '.join(result) + u'.'
    article.text_summary = clean_ascii_chars(article.summary)
def replace_illegals(self):
    """
    """
    with open(self.__file, 'r') as read_obj:
        with open(self.__write_to, 'w') as write_obj:
            for line in read_obj:
                write_obj.write(clean_ascii_chars(line))
    copy_obj = copy.Copy()
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "replace_illegals.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
def download_short_story_list(self, url):
    query_short_stories = self.plugin.BASE_URL + url
    try:
        self.log('download page with short stories list %s' % query_short_stories)
        data = self.browser.open(query_short_stories, timeout=self.timeout).read().strip()
        parser = etree.XMLParser(recover=True)
        clean = clean_ascii_chars(data)
        xml = fromstring(clean, parser=parser)
        return xml
    except Exception as e:
        self.log.exception('Failed to make download : %r' % query_short_stories)
        return None
def replace_illegals(self):
    """
    """
    with open_for_read(self.__file) as read_obj:
        with open_for_write(self.__write_to) as write_obj:
            for line in read_obj:
                write_obj.write(clean_ascii_chars(line))
    copy_obj = copy.Copy()
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "replace_illegals.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
def __init__(self, id, title, url, author, summary, published, content):
    from lxml import html
    self.downloaded = False
    self.id = id
    if not title or not isinstance(title, string_or_bytes):
        title = _('Unknown')
    title = force_unicode(title, 'utf-8')
    self._title = clean_xml_chars(title).strip()
    try:
        self._title = re.sub(r'&(\S+?);', entity_to_unicode, self._title)
    except:
        pass
    self._title = clean_ascii_chars(self._title)
    self.url = url
    self.author = author
    self.toc_thumbnail = None
    self.internal_toc_entries = ()
    if author and not isinstance(author, str):
        author = author.decode('utf-8', 'replace')
    if summary and not isinstance(summary, str):
        summary = summary.decode('utf-8', 'replace')
    summary = clean_xml_chars(summary) if summary else summary
    self.summary = summary
    if summary and '<' in summary:
        try:
            s = html.fragment_fromstring(summary, create_parent=True)
            summary = html.tostring(s, method='text', encoding='unicode')
        except:
            print('Failed to process article summary, deleting:')
            print(summary.encode('utf-8'))
            traceback.print_exc()
            summary = ''
    self.text_summary = clean_ascii_chars(summary)
    self.author = author
    self.content = content
    self.date = published
    self.utctime = dt_factory(self.date, assume_utc=True, as_utc=True)
    self.localtime = self.utctime.astimezone(local_tz)
    self._formatted_date = None
def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{
             identifiers={}, timeout=30):
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.cleantext import clean_ascii_chars

    XPath = partial(etree.XPath, namespaces=NAMESPACES)
    entry = XPath('//atom:entry')

    query = self.create_query(log, title=title, authors=authors,
                              identifiers=identifiers)
    if not query:
        log.error('Insufficient metadata to construct query')
        return
    br = self.browser
    try:
        raw = br.open_novisit(query, timeout=timeout).read()
    except Exception as e:
        log.exception('Failed to make identify query: %r' % query)
        return as_unicode(e)
    try:
        parser = etree.XMLParser(recover=True, no_network=True)
        feed = etree.fromstring(
            xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
            parser=parser)
        entries = entry(feed)
    except Exception as e:
        log.exception('Failed to parse identify results')
        return as_unicode(e)
    if not entries and identifiers and title and authors and \
            not abort.is_set():
        return self.identify(log, result_queue, abort, title=title,
                             authors=authors, timeout=timeout)

    # There is no point running these queries in threads as douban
    # throttles requests returning 403 Forbidden errors
    self.get_all_details(br, log, entries, abort, result_queue, timeout)

    return None
def get_details(self):
    '''
    Get the book's detail information from its details page.
    '''
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
    import html5lib

    try:
        raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            self.log.error('URL malformed: %r' % self.url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = '17k.com timed out. Try again later.'
            self.log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % self.url
            self.log.exception(msg)
        return

    oraw = raw
    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
    if '<title>404 - ' in raw:
        self.log.error('URL malformed: %r' % self.url)
        return
    try:
        root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml',
                              namespaceHTMLElements=False)
    except:
        msg = 'Failed to parse 17k.com details page: %r' % self.url
        self.log.exception(msg)
        return

    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        msg = 'Failed to parse 17k.com details page: %r' % self.url
        msg += self.tostring(errmsg, method='text', encoding=unicode).strip()
        self.log.error(msg)
        return

    self.parse_details(oraw, root)
def download_detail(self):
    query = self.plugin.BASE_URL + self.ident
    br = self.browser
    try:
        self.log('download page detail %s' % query)
        data = br.open(query, timeout=self.timeout).read().strip()
        parser = etree.XMLParser(recover=True)
        clean = clean_ascii_chars(data)
        xml = fromstring(clean, parser=parser)
        return xml
    except Exception as e:
        self.log.exception('Failed to make download : %r' % query)
        return None
def __init__(self, id, title, url, author, summary, published, content):
    from lxml import html
    self.downloaded = False
    self.id = id
    if not title or not isinstance(title, string_or_bytes):
        title = _('Unknown')
    title = force_unicode(title, 'utf-8')
    self._title = clean_xml_chars(title).strip()
    try:
        self._title = re.sub(r'&(\S+?);', entity_to_unicode, self._title)
    except:
        pass
    self._title = clean_ascii_chars(self._title)
    self.url = url
    self.author = author
    self.toc_thumbnail = None
    if author and not isinstance(author, unicode_type):
        author = author.decode('utf-8', 'replace')
    if summary and not isinstance(summary, unicode_type):
        summary = summary.decode('utf-8', 'replace')
    summary = clean_xml_chars(summary) if summary else summary
    self.summary = summary
    if summary and '<' in summary:
        try:
            s = html.fragment_fromstring(summary, create_parent=True)
            summary = html.tostring(s, method='text', encoding=unicode_type)
        except:
            print('Failed to process article summary, deleting:')
            print(summary.encode('utf-8'))
            traceback.print_exc()
            summary = u''
    self.text_summary = clean_ascii_chars(summary)
    self.author = author
    self.content = content
    self.date = published
    self.utctime = dt_factory(self.date, assume_utc=True, as_utc=True)
    self.localtime = self.utctime.astimezone(local_tz)
    self._formatted_date = None
def get_details(self):
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
    import html5lib

    try:
        raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            self.log.error('URL malformed: %r' % self.url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Amazon timed out. Try again later.'
            self.log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % self.url
            self.log.exception(msg)
        return

    oraw = raw
    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
    if '<title>404 - ' in raw:
        self.log.error('URL malformed: %r' % self.url)
        return
    try:
        root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml',
                              namespaceHTMLElements=False)
    except:
        msg = 'Failed to parse amazon details page: %r' % self.url
        self.log.exception(msg)
        return
    if self.domain == 'jp':
        for a in root.xpath('//a[@href]'):
            if 'black-curtain-redirect.html' in a.get('href'):
                self.url = 'http://amazon.co.jp' + a.get('href')
                self.log('Black curtain redirect found, following')
                return self.get_details()

    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        msg = 'Failed to parse amazon details page: %r' % self.url
        msg += self.tostring(errmsg, method='text', encoding=unicode).strip()
        self.log.error(msg)
        return

    self.parse_details(oraw, root)
def get_details(self):
    try:
        self.log.info('Naver book url: %r' % self.url)
        raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            self.log.error('URL malformed: %r' % self.url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Naver timed out. Try again later.'
            self.log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % self.url
            self.log.exception(msg)
        return

    raw = raw.decode('utf-8', errors='replace')
    # open('c:\\naverbook.html', 'wb').write(raw)
    try:
        root = fromstring(clean_ascii_chars(raw))
    except:
        msg = 'Failed to parse Naver details page: %r' % self.url
        self.log.exception(msg)
        return

    try:
        # Look at the <title> attribute for page to make sure that we were actually returned
        # a details page for a book. If the user had specified an invalid ISBN, then the results
        # page will just do a textual search.
        title = root.xpath('//meta[@property="og:title"]/@content')
        if title:
            if title is None:
                self.log.error('Failed to see search results in page title: %r' % self.url)
                return
    except:
        msg = 'Failed to read naverbook page title: %r' % self.url
        self.log.exception(msg)
        return

    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        msg = 'Failed to parse naverbook details page: %r' % self.url
        msg += tostring(errmsg, method='text', encoding=unicode).strip()
        self.log.error(msg)
        return

    self.parse_details(root)
def download_moreinfo(self):
    query_more_info = '%shelpful/ajax/more_binfo.php?bid=%d' % (self.plugin.BASE_URL, self.number)
    try:
        self.log('download page moreinfo %s' % query_more_info)
        data = self.browser.open(query_more_info, timeout=self.timeout).read().strip()
        # fix - the ajax response is not valid XML, so wrap it
        data = '<html>%s</html>' % data
        parser = etree.XMLParser(recover=True)
        clean = clean_ascii_chars(data)
        xml = fromstring(clean, parser=parser)
        return xml
    except Exception as e:
        self.log.exception('Failed to make download : %r' % query_more_info)
        return None
def download_detail(self): query = "%snew/?mainpage=pub&subpage=detail&id=%s"%(self.plugin.BASE_URL, self.ident) br = self.browser try: self.log('download page detail %s'%query) data = br.open(query, timeout=self.timeout).read().strip() parser = etree.HTMLParser(recover=True) clean = clean_ascii_chars(data) xml = fromstring(clean, parser=parser) self.log.filelog(clean, "\\tmp\\worker-%s.html"%self.ident) return xml except Exception as e: self.log.exception('Failed to make download : %r'%query) return None
def get_editions(self):
    url_parts = self.url.split('#')
    if len(url_parts) == 2:
        base_url, edition_year = url_parts
    else:
        base_url = url_parts[0]
        edition_year = None
    url = '%s/vydani' % (base_url)
    try:
        self.log.info('Legie url: %r' % url)
        raw = self.browser.open_novisit(url, timeout=self.timeout).read().strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            self.log.error('URL malformed: %r' % url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Legie timed out. Try again later.'
            self.log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % url
            self.log.exception(msg)
        return

    raw = raw.decode('utf-8', errors='replace')
    # open('E:\\t3.html', 'wb').write(raw)
    if '<title>404 - ' in raw:
        self.log.error('URL malformed: %r' % url)
        return

    try:
        root = fromstring(clean_ascii_chars(raw))
    except:
        msg = 'Failed to parse Legie details page: %r' % url
        self.log.exception(msg)
        return

    self.log.info('Trying to parse editions')
    try:
        editions = self.parse_editions(root, edition_year)
    except:
        self.log.exception('Failed to parse editions page')
        editions = []
    return editions
def download_detail(self):
    query = self.plugin.BASE_DETAIL_URL + self.ident
    br = self.browser
    try:
        self.log('download page detail %s' % query)
        data = br.open(query, timeout=self.timeout).read().strip()
        parser = etree.HTMLParser(recover=True)
        clean = clean_ascii_chars(data)
        self.log.filelog(clean, 'D:\\tmp\\file' + self.ident + '.html')
        xml = fromstring(clean, parser=parser)
        # for error in parser.error_log:
        #     self.log(error.message)
        return xml
    except Exception as e:
        self.log.exception('Failed to make download : %r' % query)
        return None
def fix_endings(self):
    # read
    with open(self.__file, 'r') as read_obj:
        input_file = read_obj.read()
    # calibre go from win and mac to unix
    input_file = input_file.replace('\r\n', '\n')
    input_file = input_file.replace('\r', '\n')
    # remove ASCII invalid chars : 0 to 8 and 11-14 to 24-26-27
    if self.__replace_illegals:
        input_file = clean_ascii_chars(input_file)
    # write
    with open(self.__write_to, 'wb') as write_obj:
        write_obj.write(input_file)
    # copy
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "line_endings.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
def fix_endings(self):
    # read
    with open(self.__file, 'rb') as read_obj:
        input_file = read_obj.read()
    # calibre go from win and mac to unix
    input_file = input_file.replace(b'\r\n', b'\n')
    input_file = input_file.replace(b'\r', b'\n')
    # remove ASCII invalid chars : 0 to 8 and 11-14 to 24-26-27
    if self.__replace_illegals:
        input_file = clean_ascii_chars(input_file)
    # write
    with open(self.__write_to, 'wb') as write_obj:
        write_obj.write(input_file)
    # copy
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "line_endings.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
def download_detail(self): url = "%shtml/csv_txt_export_hledani.php?dotaz=%s,0"%(self.plugin.BASE_URL, self.ident) parameters = { "vystup":"csv", "oddelovac":"pipe", "rozsah":"vse", "odeslano":"true" } query= [url, parameters] try: self.log('download page search %s'%query) data = urllib.urlencode(query[1]) raw = self.browser.open(query[0],data,timeout=self.timeout).read().strip() clean = clean_ascii_chars(raw) return unicode(clean, 'cp1250') except Exception as e: self.log.exception('Failed to make identify query: %r'%query) return as_unicode(e)
def make_query(query):
    log('Making query:', query)
    try:
        raw = br.open_novisit(query, timeout=timeout).read()
    except Exception as e:
        log.exception('Failed to make identify query: %r' % query)
        return False, as_unicode(e)
    try:
        feed = etree.fromstring(
            xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
            parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
        return True, entry(feed)
    except Exception as e:
        log.exception('Failed to parse identify results')
        return False, as_unicode(e)
def _get_details(self):
    try:
        print('Goodreads book url: %r' % self.url)
        br = browser()
        raw = br.open_novisit(self.url, timeout=self.timeout).read().strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            print('URL malformed: %r' % self.url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Goodreads timed out. Try again later.'
            print(msg)
        else:
            msg = 'Failed to make details query: %r' % self.url
            print(msg)
        return

    raw = raw.decode('utf-8', errors='replace')
    # open('E:\\t.html', 'wb').write(raw)
    if '<title>404 - ' in raw:
        print('URL malformed: %r' % self.url)
        return
    try:
        root = fromstring(clean_ascii_chars(raw))
    except:
        msg = 'Failed to parse goodreads details page: %r' % self.url
        print(msg)
        return

    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        msg = 'Failed to parse goodreads details page: %r' % self.url
        msg += tostring(errmsg, method='text', encoding=unicode).strip()
        print(msg)
        return

    self._parse_page_count(root)
def run(self):
    if self.xml is None:
        raw = None
        url = None
        try:
            self.log([self.title, self.number])
            url = self.plugin.create_query(self.title, self.number)
            self.log('download page search %s' % url)
            raw = self.plugin.browser.open(url, timeout=self.timeout).read().strip()
        except Exception as e:
            self.log.exception('Failed to make identify query: %r' % url)
            return as_unicode(e)
        if raw is not None:
            try:
                parser = etree.XMLParser(recover=True)
                clean = clean_ascii_chars(raw)
                self.xml = fromstring(clean, parser=parser)
            except Exception as e:
                self.log.exception('Failed to parse xml for url: %s' % self.url)
    self.parse()
def get_details(self):
    try:
        raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
        raw = raw.decode('utf-8', errors='replace')
        if not raw:
            self.log.error('Failed to get raw result for query: %r' % self.url)
            return
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            self.log.error('URL malformed: %r' % self.url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Moly.hu timed out. Try again later.'
            self.log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % self.url
            self.log.exception(msg)
        return

    root = fromstring(clean_ascii_chars(raw))
    self.parse_details(root)
def save_history(self):
    items = []
    ct = str(self.currentText())
    if ct:
        items.append(ct)
    for i in range(self.count()):
        item = str(self.itemText(i))
        if item not in items:
            items.append(item)
    self.blockSignals(True)
    self.clear()
    self.addItems(items)
    self.setEditText(ct)
    self.blockSignals(False)
    try:
        history.set(self.store_name, items)
    except ValueError:
        from calibre.utils.cleantext import clean_ascii_chars
        items = [clean_ascii_chars(force_unicode(x)) for x in items]
        try:
            history.set(self.store_name, items)
        except ValueError:
            pass
def get_image_urls(self, title, author, log, abort, timeout):
    from calibre.utils.cleantext import clean_ascii_chars
    try:
        from urllib.parse import urlencode
    except ImportError:
        from urllib import urlencode
    from collections import OrderedDict

    ans = OrderedDict()
    br = self.browser
    q = urlencode({'as_q': ('%s %s' % (title, author)).encode('utf-8')})
    if isinstance(q, bytes):
        q = q.decode('utf-8')
    sz = self.prefs['size']
    if sz == 'any':
        sz = ''
    elif sz == 'l':
        sz = 'isz:l,'
    else:
        sz = 'isz:lt,islt:%s,' % sz
    # See https://www.google.com/advanced_image_search to understand this
    # URL scheme
    url = 'https://www.google.com/search?as_st=y&tbm=isch&{}&as_epq=&as_oq=&as_eq=&cr=&as_sitesearch=&safe=images&tbs={}iar:t,ift:jpg'.format(q, sz)
    log('Search URL: ' + url)
    raw = clean_ascii_chars(br.open(url).read().decode('utf-8'))
    root = parse_html(raw)
    results = root.xpath('//div/@data-tbnid')  # could also use data-id
    # from calibre.utils.ipython import ipython
    # ipython({'root': root, 'raw': raw, 'url': url, 'results': results})
    for tbnid in results:
        try:
            imgurl = imgurl_from_id(raw, tbnid)
        except Exception:
            continue
        if imgurl:
            ans[imgurl] = True
    return list(ans)
        if isinstance(attr[0], socket.timeout):
            msg = 'ISFDB.org timed out. Try again later.'
            self.log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % self.url
            self.log.exception(msg)
        return

    raw = raw.decode('cp1252', errors='replace')
    if '<title>404 - ' in raw:
        self.log.error('URL malformed: %r' % self.url)
        return
    try:
        root = fromstring(clean_ascii_chars(raw))
    except:
        msg = 'Failed to parse ISFDB details page: %r' % self.url
        self.log.exception(msg)
        return
    self.parse_details(root)

def parse_details(self, root):
    isfdb_id = None
    title = None
    authors = []
    isbn = None
    publisher = None
    pubdate = None
def to_metadata(browser, log, entry_, timeout):  # {{{
    from lxml import etree
    XPath = partial(etree.XPath, namespaces=NAMESPACES)

    # total_results = XPath('//openSearch:totalResults')
    # start_index = XPath('//openSearch:startIndex')
    # items_per_page = XPath('//openSearch:itemsPerPage')
    entry = XPath('//atom:entry')
    entry_id = XPath('descendant::atom:id')
    creator = XPath('descendant::dc:creator')
    identifier = XPath('descendant::dc:identifier')
    title = XPath('descendant::dc:title')
    date = XPath('descendant::dc:date')
    publisher = XPath('descendant::dc:publisher')
    subject = XPath('descendant::dc:subject')
    description = XPath('descendant::dc:description')
    language = XPath('descendant::dc:language')
    rating = XPath('descendant::gd:rating[@average]')

    def get_text(extra, x):
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except:
            log.exception('Programming error:')
        return None

    id_url = entry_id(entry_)[0].text
    google_id = id_url.split('/')[-1]
    title_ = ': '.join([x.text for x in title(entry_)]).strip()
    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]
    if not id_url or not title:
        # Silently discard this entry
        return None

    mi = Metadata(title_, authors)
    mi.identifiers = {'google': google_id}
    try:
        raw = get_details(browser, id_url, timeout)
        feed = etree.fromstring(
            xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0])
        extra = entry(feed)[0]
    except:
        log.exception('Failed to get additional details for', mi.title)
        return mi

    mi.comments = get_text(extra, description)
    lang = canonicalize_lang(get_text(extra, language))
    if lang:
        mi.language = lang
    mi.publisher = get_text(extra, publisher)

    # ISBN
    isbns = []
    for x in identifier(extra):
        t = str(x.text).strip()
        if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'):
            if t[:5].upper() == 'ISBN:':
                t = check_isbn(t[5:])
                if t:
                    isbns.append(t)
    if isbns:
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags
    try:
        btags = [x.text for x in subject(extra) if x.text]
        tags = []
        for t in btags:
            atags = [y.strip() for y in t.split('/')]
            for tag in atags:
                if tag not in tags:
                    tags.append(tag)
    except:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate
    pubdate = get_text(extra, date)
    if pubdate:
        from calibre.utils.date import parse_date, utcnow
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            log.error('Failed to parse pubdate %r' % pubdate)

    # Ratings
    for x in rating(extra):
        try:
            mi.rating = float(x.get('average'))
            if mi.rating > 5:
                mi.rating /= 2
        except:
            log.exception('Failed to parse rating')

    # Cover
    mi.has_google_cover = None
    for x in extra.xpath(
            '//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'):
        mi.has_google_cover = x.get('href')
        break

    return mi
def sanitize(s):
    return unicodedata.normalize(
        'NFC', clean_xml_chars(clean_ascii_chars(force_unicode(s or ''))))
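# Illustrative usage sketch (not from the original sources); assumes the
# sanitize() helper above plus its imports (unicodedata, force_unicode,
# clean_xml_chars, clean_ascii_chars) from the surrounding module.
if __name__ == '__main__':
    messy = 'Cafe\u0301\x00 name\x01'  # decomposed accent plus stray control chars
    print(sanitize(messy))             # -> 'Café name' (NFC-composed, controls removed)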
def identify(self, log, result_queue, abort, title=None, authors=None,
             identifiers={}, timeout=30):
    '''
    Note this method will retry without identifiers automatically if no
    match is found with identifiers.
    '''
    matches = []
    # If we have a Legie id then we do not need to fire a "search".
    # Instead we will go straight to the URL for that book.
    legie_id = identifiers.get('legie', None)
    br = self.browser
    if legie_id:
        matches.append('%s/kniha/%s' % (Legie.BASE_URL, legie_id))
    else:
        query = self.create_title_query(log, title=title)
        if query is None:
            log.error('Insufficient metadata to construct query')
            return
        try:
            log.info('Querying: %s' % query)
            response = br.open_novisit(query, timeout=timeout)
            raw = response.read()
            redirected = response.geturl()
        except Exception as e:
            err = 'Failed to make identify query: %r' % query
            log.exception(err)
            return as_unicode(e)
        root = fromstring(clean_ascii_chars(raw))
        # Now grab the match from the search result, provided the
        # title appears to be for the same book
        if redirected == query:
            log.info('No direct link for book, needed to search results page')
            self._parse_search_results(log, title, root, matches, timeout, query)
        else:
            matches.append(redirected)

    if abort.is_set():
        return

    if not matches:
        log.error('No matches found with query: %r' % query)
        return

    from calibre_plugins.legie.worker import Worker
    author_tokens = list(self.get_author_tokens(authors))
    workers = [Worker(url, author_tokens, result_queue, br, log, i, self)
               for i, url in enumerate(matches)]

    for w in workers:
        w.start()
        # Don't send all requests at the same time
        time.sleep(0.1)

    while not abort.is_set():
        a_worker_is_alive = False
        for w in workers:
            w.join(0.2)
            if abort.is_set():
                break
            if w.is_alive():
                a_worker_is_alive = True
        if not a_worker_is_alive:
            break

    return None