def run(self):
    """Download and cache the MobileRead book list, emitting progress signals.

    Skips the download entirely when the cached list is less than one
    week old. On any network or parse failure the existing cache is kept.
    """
    url = 'http://www.mobileread.com/forums/ebooks.php?do=getlist&type=html'
    self.update_details.emit(_('Checking last download date.'))
    last_download = self.config.get('last_download', None)
    # Don't update the book list if our cache is less than one week old.
    if last_download and (time.time() - last_download) < 604800:
        return
    self.update_details.emit(_('Downloading book list from MobileRead.'))
    # Download the book list HTML file from MobileRead.
    br = browser()
    raw_data = None
    try:
        with closing(br.open(url, timeout=self.timeout)) as f:
            raw_data = f.read()
    except Exception:
        # Best effort: on network failure keep the previously cached list.
        return
    if not raw_data or not self._run:
        return
    self.update_details.emit(_('Processing books.'))
    # Turn books listed in the HTML file into SearchResults.
    books = []
    try:
        data = html.fromstring(raw_data)
        raw_books = data.xpath('//ul/li')
        self.total_changed.emit(len(raw_books))
        for i, book_data in enumerate(raw_books):
            self.update_details.emit(
                _('%(num)s of %(tot)s books processed.') % dict(
                    num=i, tot=len(raw_books)))
            book = SearchResult()
            book.detail_item = ''.join(book_data.xpath('.//a/@href'))
            book.formats = ''.join(book_data.xpath('.//i/text()'))
            book.formats = book.formats.strip()
            text = ''.join(book_data.xpath('.//a/text()'))
            # Link text is "Author: Title"; split on the first colon.
            if ':' in text:
                book.author, q, text = text.partition(':')
                book.author = book.author.strip()
            book.title = text.strip()
            books.append(book)
            if not self._run:
                # Cancelled: discard partial results so we don't cache them.
                books = []
                break
            else:
                self.update_progress.emit(i)
    except Exception:
        # Malformed HTML: fall through with whatever was parsed so far.
        pass
    # Save the book list and its creation time.
    if books:
        # NOTE: 'seralize_books' (sic) is the method name as defined
        # elsewhere in this class; keep the spelling to match.
        self.config['book_list'] = self.seralize_books(books)
        self.config['last_download'] = time.time()
def run(self):
    """Download and cache the MobileRead book list, emitting progress signals.

    Skips the download entirely when the cached list is less than one
    week old. On any network or parse failure the existing cache is kept.
    """
    url = 'http://www.mobileread.com/forums/ebooks.php?do=getlist&type=html'
    self.update_details.emit(_('Checking last download date.'))
    last_download = self.config.get('last_download', None)
    # Don't update the book list if our cache is less than one week old.
    if last_download and (time.time() - last_download) < 604800:
        return
    self.update_details.emit(_('Downloading book list from MobileRead.'))
    # Download the book list HTML file from MobileRead.
    br = browser()
    raw_data = None
    try:
        with closing(br.open(url, timeout=self.timeout)) as f:
            raw_data = f.read()
    except Exception:
        # Best effort: on network failure keep the previously cached list.
        return
    if not raw_data or not self._run:
        return
    self.update_details.emit(_('Processing books.'))
    # Turn books listed in the HTML file into SearchResults.
    books = []
    try:
        data = html.fromstring(raw_data)
        raw_books = data.xpath('//ul/li')
        self.total_changed.emit(len(raw_books))
        for i, book_data in enumerate(raw_books):
            self.update_details.emit(
                _('%(num)s of %(tot)s books processed.') % dict(
                    num=i, tot=len(raw_books)))
            book = SearchResult()
            book.detail_item = ''.join(book_data.xpath('.//a/@href'))
            book.formats = ''.join(book_data.xpath('.//i/text()'))
            book.formats = book.formats.strip()
            text = ''.join(book_data.xpath('.//a/text()'))
            # Link text is "Author: Title"; split on the first colon.
            if ':' in text:
                book.author, q, text = text.partition(':')
                book.author = book.author.strip()
            book.title = text.strip()
            books.append(book)
            if not self._run:
                # Cancelled: discard partial results so we don't cache them.
                books = []
                break
            else:
                self.update_progress.emit(i)
    except Exception:
        # Malformed HTML: fall through with whatever was parsed so far.
        pass
    # Save the book list and its creation time.
    if books:
        # NOTE: 'seralize_books' (sic) is the method name as defined
        # elsewhere in this class; keep the spelling to match.
        self.config['book_list'] = self.seralize_books(books)
        self.config['last_download'] = time.time()
def search(self, query, max_results=10, timeout=60):
    """Search woblink.com and yield SearchResult objects.

    DRM status and formats depend on the format icons found on each
    result (epub_drm/pdf are locked, everything else is watermarked).
    """
    url = 'http://woblink.com/katalog-e-book?query=' + urllib.quote_plus(query.encode('utf-8'))
    # The site supports page sizes of 10 (default), 20 and 30.
    if max_results > 10:
        if max_results > 20:
            url += '&limit=30'
        else:
            url += '&limit=20'
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[@class="book-item backgroundmix"]'):
            if counter <= 0:
                break
            id = ''.join(data.xpath('.//td[@class="w10 va-t mYHaveItYes"]/a[1]/@href'))
            if not id:
                continue
            cover_url = ''.join(data.xpath('.//td[@class="w10 va-t mYHaveItYes"]/a[1]/img/@src'))
            title = ''.join(data.xpath('.//h2[@class="title"]/a[1]/text()'))
            author = ', '.join(data.xpath('.//td[@class="va-t"]/h3/a/text()'))
            price = ''.join(data.xpath('.//div[@class="prices"]/span[1]/strong/span/text()'))
            # Polish price formatting uses a decimal comma. str.replace
            # avoids the invalid '\.' escape the old re.sub call used.
            price = price.replace('.', ',')
            formats = [
                form[8:-4].split('.')[0]
                for form in data.xpath('.//p[3]/img/@src')]
            s = SearchResult()
            s.cover_url = 'http://woblink.com' + cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price + ' zł'
            s.detail_item = id.strip()
            if 'epub_drm' in formats:
                s.drm = SearchResult.DRM_LOCKED
                s.formats = 'EPUB'
                counter -= 1
                yield s
            elif 'pdf' in formats:
                s.drm = SearchResult.DRM_LOCKED
                s.formats = 'PDF'
                counter -= 1
                yield s
            else:
                s.drm = SearchResult.DRM_UNLOCKED
                # Normalize the watermarked-MOBI icon name.
                if 'MOBI_nieb' in formats:
                    formats.remove('MOBI_nieb')
                    formats.append('MOBI')
                s.formats = ', '.join(formats).upper()
                counter -= 1
                yield s
def search(self, query, max_results=10, timeout=60):
    """Search woblink.com and yield SearchResult objects.

    DRM status and formats depend on the format icons found on each
    result (epub_drm/pdf are locked, everything else is watermarked).
    """
    url = 'http://woblink.com/katalog-e-book?query=' + urllib.quote_plus(query.encode('utf-8'))
    # The site supports page sizes of 10 (default), 20 and 30.
    if max_results > 10:
        if max_results > 20:
            url += '&limit=30'
        else:
            url += '&limit=20'
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[@class="book-item backgroundmix"]'):
            if counter <= 0:
                break
            id = ''.join(data.xpath('.//td[@class="w10 va-t mYHaveItYes"]/a[1]/@href'))
            if not id:
                continue
            cover_url = ''.join(data.xpath('.//td[@class="w10 va-t mYHaveItYes"]/a[1]/img/@src'))
            title = ''.join(data.xpath('.//h2[@class="title"]/a[1]/text()'))
            author = ', '.join(data.xpath('.//td[@class="va-t"]/h3/a/text()'))
            price = ''.join(data.xpath('.//div[@class="prices"]/span[1]/strong/span/text()'))
            # Polish price formatting uses a decimal comma. str.replace
            # avoids the invalid '\.' escape the old re.sub call used.
            price = price.replace('.', ',')
            formats = [
                form[8:-4].split('.')[0]
                for form in data.xpath('.//p[3]/img/@src')]
            s = SearchResult()
            s.cover_url = 'http://woblink.com' + cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price + ' zł'
            s.detail_item = id.strip()
            if 'epub_drm' in formats:
                s.drm = SearchResult.DRM_LOCKED
                s.formats = 'EPUB'
                counter -= 1
                yield s
            elif 'pdf' in formats:
                s.drm = SearchResult.DRM_LOCKED
                s.formats = 'PDF'
                counter -= 1
                yield s
            else:
                s.drm = SearchResult.DRM_UNLOCKED
                # Normalize the watermarked-MOBI icon name.
                if 'MOBI_nieb' in formats:
                    formats.remove('MOBI_nieb')
                    formats.append('MOBI')
                s.formats = ', '.join(formats).upper()
                counter -= 1
                yield s
def search_kobo(query, max_results=10, timeout=60, write_html_to=None):
    """Search kobobooks.com and yield at most max_results SearchResult objects.

    If write_html_to is given, the raw results page is also saved to that
    path (useful when debugging the scraper).
    """
    from css_selectors import Select
    url = 'https://www.kobobooks.com/search/search.html?q=' + quote_plus(query)
    raw = read_url(url, timeout=timeout)
    if write_html_to is not None:
        with open(write_html_to, 'w') as f:
            f.write(raw)
    doc = html.fromstring(raw)
    select = Select(doc)
    for i, item in enumerate(select('.result-items .item-wrapper.book')):
        if i == max_results:
            break
        # Cover: first <img src> inside the item; protocol-relative URLs
        # are normalized to https. for/else leaves cover_url = None when
        # no image exists.
        for img in select('.item-image img[src]', item):
            cover_url = img.get('src')
            if cover_url.startswith('//'):
                cover_url = 'https:' + cover_url
            break
        else:
            cover_url = None
        # Title and detail URL both come from the first h2.title element.
        for p in select('h2.title', item):
            title = etree.tostring(p, method='text', encoding='unicode').strip()
            for a in select('a[href]', p):
                url = a.get('href')
                break
            else:
                url = None
            break
        else:
            title = None
        if title:
            # Append the subtitle (when present) to the title.
            for p in select('p.subtitle', item):
                title += ' - ' + etree.tostring(
                    p, method='text', encoding='unicode').strip()
        authors = []
        for a in select('.contributors a.contributor-name', item):
            authors.append(
                etree.tostring(a, method='text', encoding='unicode').strip())
        authors = authors_to_string(authors)
        for p in select('p.price', item):
            price = etree.tostring(p, method='text', encoding='unicode').strip()
            break
        else:
            price = None
        # Only yield entries with the minimum required fields.
        if title and authors and url:
            s = SearchResult()
            s.cover_url = cover_url
            s.title = title
            s.author = authors
            s.price = price
            s.detail_item = url
            s.formats = 'EPUB'
            s.drm = SearchResult.DRM_UNKNOWN
            yield s
def search(self, query, max_results=12, timeout=60):
    """Search virtualo.pl (pre-filtered to EPUB/MOBI/PDF) and yield results."""
    url = 'http://virtualo.pl/?q=' + urllib.quote(
        query) + '&f=format_id:4,6,3'
    br = browser()
    # Polish for "watermark" / "none" -- either means the book has no DRM.
    no_drm_pattern = re.compile(r'Znak wodny|Brak')
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath(
                '//div[@id="content"]//div[@class="list_box list_box_border"]'
                ):
            if counter <= 0:
                break
            # Strip any trailing '?q=' query fragment from the detail URL.
            id = ''.join(
                data.xpath('.//div[@class="list_middle_left"]//a/@href')
                ).split(r'?q=')[0]
            if not id:
                continue
            price = ''.join(
                data.xpath(
                    './/span[@class="price"]/text() | .//span[@class="price abbr"]/text()'
                    ))
            cover_url = ''.join(
                data.xpath(
                    './/div[@class="list_middle_left"]//a//img/@src'))
            title = ''.join(
                data.xpath(
                    './/div[@class="list_title list_text_left"]/a/text()'))
            author = ', '.join(
                data.xpath(
                    './/div[@class="list_authors list_text_left"]/a/text()'
                    ))
            # Format icons are named like '..._epub.png'; keep the format part.
            formats = [
                form.split('_')[-1].replace('.png', '')
                for form in data.xpath(
                    './/div[@style="width:55%;float:left;text-align:left;height:18px;"]//a/span/img/@src'
                    )
            ]
            nodrm = no_drm_pattern.search(''.join(
                data.xpath(
                    './/div[@style="width:45%;float:right;text-align:right;height:18px;"]//span[@class="prompt_preview"]/text()'
                    )))
            counter -= 1
            s = SearchResult()
            # Normalize the cover URL to a plain .jpg (drops resize suffixes).
            s.cover_url = cover_url.split('.jpg')[0] + '.jpg'
            s.title = title.strip()
            s.author = author.strip()
            s.price = price + ' zł'
            # Guard against the href already being absolute.
            s.detail_item = 'http://virtualo.pl' + id.strip().split(
                'http://')[0]
            s.formats = ', '.join(formats).upper()
            s.drm = SearchResult.DRM_UNLOCKED if nodrm else SearchResult.DRM_LOCKED
            yield s
def search(self, query, max_results=20, timeout=60):
    """Search publio.pl, following pagination until max_results are found
    or no "next" link remains."""
    br = browser()
    counter = max_results
    page = 1
    while counter:
        with closing(
            br.open(
                "http://www.publio.pl/e-booki,strona" + str(page) + ".html?q=" + urllib.quote(query),
                timeout=timeout,
            )
        ) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//div[@class="item"]'):
                if counter <= 0:
                    break
                id = "".join(data.xpath('.//div[@class="img"]/a/@href'))
                if not id:
                    continue
                cover_url = "".join(data.xpath('.//div[@class="img"]/a/img/@data-original'))
                title = "".join(data.xpath('.//div[@class="img"]/a/@title'))
                # Append the subtitle (h5) to the title, when present.
                title2 = "".join(data.xpath('.//div[@class="desc"]/h5//text()'))
                if title2:
                    title = title + ". " + title2
                # When the last detail row is labelled "Seria:" (series),
                # append the series name to the title.
                if (
                    "".join(
                        data.xpath('./div[@class="desc"]/div[@class="detailShortList"]/div[last()]/span/text()')
                    ).strip()
                    == "Seria:"
                ):
                    series = "".join(
                        data.xpath('./div[@class="desc"]/div[@class="detailShortList"]/div[last()]/a/@title')
                    )
                    title = title + " (seria " + series + ")"
                author = ", ".join(
                    data.xpath('./div[@class="desc"]/div[@class="detailShortList"]/div[@class="row"][1]/a/@title')
                )
                # Discounted price lives in <ins>; fall back to plain text.
                price = "".join(data.xpath('.//div[@class="priceBox tk-museo-slab"]/ins/text()'))
                if not price:
                    price = "".join(data.xpath('.//div[@class="priceBox tk-museo-slab"]/text()')).strip()
                formats = ", ".join(data.xpath('.//div[@class="formats"]/a/img/@alt'))
                counter -= 1
                s = SearchResult()
                s.cover_url = "http://www.publio.pl" + cover_url
                s.title = title.strip()
                s.author = author
                s.price = price
                s.detail_item = "http://www.publio.pl" + id.strip()
                s.drm = SearchResult.DRM_LOCKED if "DRM" in formats else SearchResult.DRM_UNLOCKED
                s.formats = formats.replace(" DRM", "").strip()
                yield s
            # Stop when there is no "next page" link.
            if not doc.xpath('boolean(//a[@class="next"])'):
                break
            page += 1
def search(self, query, max_results=10, timeout=60):
    """Search the WHSmith e-book department; all results are DRM-locked ePubs."""
    search_url = ('http://www.whsmith.co.uk/search?keywordCategoryId=wc_dept_ebooks'
                  '&results=60&page=1&keywords=' + urllib2.quote(query))
    remaining = max_results
    br = browser()
    with closing(br.open(search_url, timeout=timeout)) as response:
        root = html.fromstring(response.read())
        for node in root.xpath('//li[@class="product"]'):
            if remaining <= 0:
                break
            href = ''.join(node.xpath('./a[@class="product_image_wrap"]/@href'))
            if not href:
                continue
            result = SearchResult()
            result.detail_item = 'http://www.whsmith.co.uk' + href
            result.cover_url = ''.join(node.xpath('.//img[@class="product_image"]/@src'))
            result.title = ''.join(node.xpath('.//h4[@class="product_title"]/text()')).strip()
            result.author = ', '.join(node.xpath('.//span[@class="product_second"]/text()')).strip()
            result.price = ''.join(node.xpath('.//span[@class="price"]/text()'))
            result.drm = SearchResult.DRM_LOCKED
            result.formats = 'ePub'
            remaining -= 1
            yield result
def search(self, query, max_results=10, timeout=60):
    """Search Amazon's Kindle store and yield SearchResult objects.

    Non-Kindle results (e.g. author pages) and entries without an ASIN
    are skipped.
    """
    # Percent-encode the query by hand: non-ASCII bytes become '\xNN'
    # escapes, which are then rewritten to '%NN'.
    url = self.search_url + query.encode(
        'ascii', 'backslashreplace').replace('%', '%25').replace(
        '\\x', '%').replace(' ', '+')
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read().decode('latin-1', 'replace'))
        data_xpath = '//div[contains(@class, "prod")]'
        format_xpath = './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
        asin_xpath = '@name'
        cover_xpath = './/img[@class="productImage"]/@src'
        title_xpath = './/h3[@class="newaps"]/a//text()'
        author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
        price_xpath = './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
        for data in doc.xpath(data_xpath):
            if counter <= 0:
                break
            # Even though we are searching digital-text only Amazon will still
            # put in results for non Kindle books (author pages). So we need
            # to explicitly check if the item is a Kindle book and ignore it
            # if it isn't.
            format = ''.join(data.xpath(format_xpath))
            if 'kindle' not in format.lower():
                continue
            # We must have an asin otherwise we can't easily reference the
            # book later.
            asin = data.xpath(asin_xpath)
            if asin:
                asin = asin[0]
            else:
                continue
            cover_url = ''.join(data.xpath(cover_xpath))
            title = ''.join(data.xpath(title_xpath))
            author = ''.join(data.xpath(author_xpath))
            try:
                author = author.split('by ', 1)[1].split(" (")[0]
            except IndexError:
                # No 'by ' marker in the author text; keep it unchanged.
                pass
            price = ''.join(data.xpath(price_xpath))
            counter -= 1
            s = SearchResult()
            s.cover_url = cover_url.strip()
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = asin.strip()
            s.formats = 'Kindle'
            yield s
def search(self, query, max_results=10, timeout=60):
    """Query the bubok.es calibre reseller endpoint (a DRM-free store)."""
    url = 'http://www.bubok.es/resellers/calibre_search/' + urllib.quote_plus(query)
    remaining = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as response:
        root = html.fromstring(response.read())
        for node in root.xpath('//div[contains(@class, "libro")]'):
            if remaining <= 0:
                break
            result = SearchResult()
            result.detail_item = ''.join(node.xpath('.//div[@class="url"]/text()')).strip()
            result.title = ''.join(node.xpath('.//div[@class="titulo"]/text()')).strip()
            result.author = ''.join(node.xpath('.//div[@class="autor"]/text()')).strip()
            result.price = ''.join(node.xpath('.//div[@class="precio"]/text()')).strip()
            result.formats = ''.join(node.xpath('.//div[@class="formatos"]/text()')).strip()
            result.cover_url = ''.join(node.xpath('.//div[@class="portada"]/text()')).strip()
            result.drm = SearchResult.DRM_UNLOCKED
            remaining -= 1
            yield result
def search(self, query, max_results=10, timeout=60): """Searches LibGen for Books. Since the mirror links are not direct downloads, it should not provide these as `s.downloads`. """ debug_print = partial(module_debug_print, 'LibgenStore:search:') debug_print('search:query = ', query) libgen_results = self.libgen.search(query) for result in libgen_results.results[:min(max_results, len(libgen_results.results))]: debug_print('result.title = ', result.title) for mirror in result.mirrors[0:1]: # Calibre only shows 1 anyway debug_print('result.mirror.url = ', mirror.url) s = SearchResult() s.store_name = PLUGIN_NAME s.cover_url = result.image_url s.title = '{} ({}, {}{})'.format( result.title, result.language, mirror.size, mirror.unit) s.author = result.authors s.price = '0.00' s.detail_item = result.md5 s.drm = SearchResult.DRM_UNLOCKED s.formats = mirror.format s.plugin_author = PLUGIN_AUTHORS debug_print('s = ', s) yield s
def search(self, query, max_results=10, timeout=60):
    """Search barnesandnoble.com for Nook ebooks."""
    url = ('http://www.barnesandnoble.com/s/%s?keyword=%s&store=ebook'
           % (query.replace(' ', '-'), urllib.quote_plus(query)))
    remaining = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as response:
        root = html.fromstring(response.read())
        for node in root.xpath('//ul[contains(@class, "result-set")]/li[contains(@class, "result")]'):
            if remaining <= 0:
                break
            href = ''.join(node.xpath('.//div[contains(@class, "image-bounding-box")]/a/@href'))
            if not href:
                continue
            result = SearchResult()
            result.cover_url = ''.join(node.xpath('.//img[contains(@class, "product-image")]/@src'))
            result.title = ''.join(node.xpath('.//a[@class="title"]//text()')).strip()
            result.author = ', '.join(node.xpath('.//a[@class="contributor"]//text()')).strip()
            result.price = ''.join(node.xpath('.//div[@class="price-format"]//span[contains(@class, "price")]/text()')).strip()
            result.detail_item = href.strip()
            result.drm = SearchResult.DRM_UNKNOWN
            result.formats = 'Nook'
            remaining -= 1
            yield result
def search(self, query, max_results=10, timeout=60):
    """Query the haodoo scraper view and yield free downloadable volumes."""
    print("search!")
    q = query.decode('utf-8')
    url = "https://views.scraperwiki.com/run/haodooscraperview/?" + urlencode(
        {"q": q})
    print(url)
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        json_doc = f.read()
        if len(json_doc) > 0:
            result = json.loads(json_doc)
            for volume in result:
                s = SearchResult()
                s.title = volume['title']
                s.detail_item = volume['url']
                s.price = '$0.00'
                s.drm = SearchResult.DRM_UNLOCKED
                # 'in' works on Python 2 and 3; dict.has_key() was removed
                # in Python 3.
                if 'type' in volume and len(volume["type"]):
                    for t in volume["type"]:
                        s.downloads[t['type']] = t['link']
                s.formats = ', '.join(s.downloads.keys())
                yield s
        else:
            print("scrape nothing.")
def search(self, query, max_results=10, timeout=60):
    """Query the haodoo scraper view and yield free downloadable volumes."""
    print("search!")
    q = query.decode('utf-8')
    url = "https://views.scraperwiki.com/run/haodooscraperview/?" + urlencode(
        {"q": q})
    print(url)
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        json_doc = f.read()
        if len(json_doc) > 0:
            result = json.loads(json_doc)
            for volume in result:
                s = SearchResult()
                s.title = volume['title']
                s.detail_item = volume['url']
                s.price = '$0.00'
                s.drm = SearchResult.DRM_UNLOCKED
                # 'in' works on Python 2 and 3; dict.has_key() was removed
                # in Python 3.
                if 'type' in volume and len(volume["type"]):
                    for t in volume["type"]:
                        s.downloads[t['type']] = t['link']
                s.formats = ', '.join(s.downloads.keys())
                yield s
        else:
            print("scrape nothing.")
def search(self, query, max_results=25, timeout=60):
    """Search ebookpoint.pl (in-stock, e-book formats only); DRM-free store."""
    encoded = quote_plus(query.decode('utf-8').encode('iso-8859-2'))
    url = ('http://ebookpoint.pl/search?qa=&szukaj=' + encoded +
           '&serwisyall=0&wprzyg=0&wsprzed=1&wyczerp=0&formaty=em-p')
    remaining = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as response:
        root = html.fromstring(response.read())
        for node in root.xpath('//ul[@class="list"]/li'):
            if remaining <= 0:
                break
            detail = ''.join(node.xpath('./a/@href'))
            if not detail:
                continue
            result = SearchResult()
            result.detail_item = detail.strip()
            result.cover_url = ''.join(node.xpath('.//p[@class="cover"]/img/@data-src'))
            result.title = ''.join(node.xpath('.//div[@class="book-info"]/h3/a/text()')).strip()
            result.author = ''.join(node.xpath('.//p[@class="author"]//text()')).strip()
            raw_price = ''.join(node.xpath(
                './/p[@class="price price-incart"]/a/ins/text()|.//p[@class="price price-add"]/a/text()'))
            # Decimal comma, Polish convention.
            result.price = raw_price.replace('.', ',')
            result.formats = ', '.join(node.xpath(
                './/ul[@class="book-type book-type-points"]//span[@class="popup"]/span/text()')).upper()
            result.drm = SearchResult.DRM_UNLOCKED
            remaining -= 1
            yield result
def search(self, query, max_results=20, timeout=60):
    """Search escapemagazine.pl; every result is a DRM-free PDF."""
    url = ('http://www.escapemagazine.pl/wyszukiwarka?query=' +
           urllib.quote_plus(query))
    remaining = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as response:
        root = html.fromstring(response.read())
        for node in root.xpath('//div[@class="item item_short"]'):
            if remaining <= 0:
                break
            href = ''.join(node.xpath('.//h2[@class="title"]/a[1]/@href'))
            if not href:
                continue
            result = SearchResult()
            result.title = ''.join(node.xpath('.//h2[@class="title"]/a[1]/text()')).strip()
            result.author = ''.join(node.xpath('.//div[@class="author"]/text()')).strip()
            result.price = ''.join(node.xpath('.//span[@class="price_now"]/strong/text()')) + ' zł'
            result.cover_url = ''.join(node.xpath('.//img[@class="cover"]/@src'))
            result.detail_item = 'http://www.escapemagazine.pl' + href.strip()
            result.drm = SearchResult.DRM_UNLOCKED
            result.formats = 'PDF'
            remaining -= 1
            yield result
def search(self, query, max_results=10, timeout=60):
    """Search millsandboon.co.uk for ebooks (all results are DRM-locked)."""
    base_url = 'https://www.millsandboon.co.uk'
    # BUG FIX: the path previously read '/search.aspx??format=ebook...' --
    # the doubled '?' made 'format=ebook' part of a bogus first parameter
    # name instead of a proper query-string key.
    url = base_url + '/search.aspx?format=ebook&searchText=' + urllib2.quote(query)
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//article[contains(@class, "group")]'):
            if counter <= 0:
                break
            id_ = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/@href')).strip()
            if not id_:
                continue
            cover_url = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/img/@src'))
            title = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/img/@alt')).strip()
            author = ''.join(data.xpath('.//a[@class="author"]/text()'))
            price = ''.join(data.xpath('.//div[@class="type-wrapper"]/ul/li[child::span[text()="eBook"]]/a/text()'))
            format_ = ''.join(data.xpath('.//p[@class="doc-meta-format"]/span[last()]/text()'))
            drm = SearchResult.DRM_LOCKED
            counter -= 1
            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = id_
            s.drm = drm
            s.formats = format_
            yield s
def search_amazon(query, max_results=10, timeout=60, write_html_to=None,
                  base_url=SEARCH_BASE_URL, base_query=SEARCH_BASE_QUERY,
                  field_keywords='k'
                  ):
    """Search Amazon's Kindle store and yield SearchResult objects.

    If write_html_to is given, the raw results page is saved there for
    debugging. base_url/base_query/field_keywords allow per-site overrides.
    """
    uquery = base_query.copy()
    uquery[field_keywords] = query

    def asbytes(x):
        # urlencode wants byte strings on Python 2.
        if isinstance(x, type('')):
            x = x.encode('utf-8')
        return x
    uquery = {asbytes(k):asbytes(v) for k, v in uquery.items()}
    url = base_url + '?' + urlencode(uquery)
    br = browser(user_agent=get_user_agent())
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
    if write_html_to is not None:
        with open(write_html_to, 'wb') as f:
            f.write(raw)
    doc = html.fromstring(raw)
    for result in doc.xpath('//div[contains(@class, "s-result-list")]//div[@data-index and @data-asin]'):
        kformat = ''.join(result.xpath('.//a[contains(text(), "{}")]//text()'.format(KINDLE_EDITION)))
        # Even though we are searching digital-text only Amazon will still
        # put in results for non Kindle books (author pages). So we need
        # to explicitly check if the item is a Kindle book and ignore it
        # if it isn't.
        if 'kindle' not in kformat.lower():
            continue
        asin = result.get('data-asin')
        if not asin:
            continue
        cover_url = ''.join(result.xpath('.//img/@src'))
        title = etree.tostring(result.xpath('.//h2')[0], method='text', encoding='unicode')
        adiv = result.xpath('.//div[contains(@class, "a-color-secondary")]')[0]
        aparts = etree.tostring(adiv, method='text', encoding='unicode').split()
        # NOTE(review): aparts.index(BY) raises ValueError when the BY
        # marker is absent from the byline -- confirm upstream guarantees
        # it is always present for Kindle results.
        idx = aparts.index(BY)
        author = ' '.join(aparts[idx+1:]).split('|')[0].strip()
        price = ''
        # Take the first non-empty offscreen price span.
        for span in result.xpath('.//span[contains(@class, "a-price")]/span[contains(@class, "a-offscreen")]'):
            q = ''.join(span.xpath('./text()'))
            if q:
                price = q
                break
        counter -= 1
        s = SearchResult()
        s.cover_url = cover_url.strip()
        s.title = title.strip()
        s.author = author.strip()
        s.detail_item = asin.strip()
        s.price = price.strip()
        s.formats = 'Kindle'
        yield s
def open_search(url, query, max_results=10, timeout=60):
    """Run an OpenSearch/OPDS query against *url* and yield SearchResult objects.

    The OpenSearch description document at *url* supplies the URL template;
    when no usable template is found nothing is yielded.
    """
    description = Description(url)
    url_template = description.get_best_template()
    if not url_template:
        return
    oquery = Query(url_template)

    # set up initial values
    oquery.searchTerms = query
    oquery.count = max_results
    url = oquery.url()

    counter = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        doc = etree.fromstring(f.read())
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break
            counter -= 1
            s = SearchResult()
            s.detail_item = ''.join(data.xpath('./*[local-name() = "id"]/text()')).strip()
            # Each <link> may carry the cover thumbnail, the buy page, a
            # sample (ignored) or an acquisition (download) URL.
            for link in data.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                type = link.get('type')
                if rel and href and type:
                    if 'http://opds-spec.org/thumbnail' in rel:
                        s.cover_url = href
                    elif 'http://opds-spec.org/image/thumbnail' in rel:
                        s.cover_url = href
                    elif 'http://opds-spec.org/acquisition/buy' in rel:
                        s.detail_item = href
                    elif 'http://opds-spec.org/acquisition/sample' in rel:
                        pass
                    elif 'http://opds-spec.org/acquisition' in rel:
                        # Map the MIME type to an upper-cased extension,
                        # e.g. 'application/epub+zip' -> 'EPUB'.
                        if type:
                            ext = guess_extension(type)
                            if ext:
                                ext = ext[1:].upper().strip()
                                s.downloads[ext] = href
            s.formats = ', '.join(s.downloads.keys()).strip()
            s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(data.xpath('./*[local-name() = "author"]//*[local-name() = "name"]//text()')).strip()
            price_e = data.xpath('.//*[local-name() = "price"][1]')
            if price_e:
                price_e = price_e[0]
                currency_code = price_e.get('currencycode', '')
                price = ''.join(price_e.xpath('.//text()')).strip()
                s.price = currency_code + ' ' + price
                s.price = s.price.strip()
            yield s
def search(self, query, max_results=10, timeout=60):
    """Search Amazon's Kindle store and yield SearchResult objects.

    Non-Kindle results (e.g. author pages) and entries without an ASIN
    are skipped.
    """
    # Percent-encode the query by hand: non-ASCII bytes become '\xNN'
    # escapes, which are then rewritten to '%NN'.
    url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+')
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())  # .decode('latin-1', 'replace') intentionally disabled
        data_xpath = '//div[contains(@class, "prod")]'
        format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
        asin_xpath = '@name'
        cover_xpath = './/img[@class="productImage"]/@src'
        title_xpath = './/h3[@class="newaps"]/a//text()'
        author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]/text()'
        price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
        for data in doc.xpath(data_xpath):
            if counter <= 0:
                break
            # Even though we are searching digital-text only Amazon will still
            # put in results for non Kindle books (author pages). So we need
            # to explicitly check if the item is a Kindle book and ignore it
            # if it isn't.
            format_ = ''.join(data.xpath(format_xpath))
            if 'kindle' not in format_.lower():
                continue
            # We must have an asin otherwise we can't easily reference the
            # book later.
            asin = data.xpath(asin_xpath)
            if asin:
                asin = asin[0]
            else:
                continue
            cover_url = ''.join(data.xpath(cover_xpath))
            title = ''.join(data.xpath(title_xpath))
            author = ''.join(data.xpath(author_xpath))
            try:
                if self.author_article:
                    author = author.split(self.author_article, 1)[1].split(" (")[0]
            except IndexError:
                # The author article marker was not found; keep as-is.
                pass
            price = ''.join(data.xpath(price_xpath))
            counter -= 1
            s = SearchResult()
            s.cover_url = cover_url.strip()
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = asin.strip()
            s.drm = SearchResult.DRM_UNKNOWN
            s.formats = 'Kindle'
            yield s
def search(self, query, max_results=10, timeout=60):
    """Search bookoteka.pl (ebooks only); every result is DRM-free."""
    url = ('http://bookoteka.pl/list?search=' + urllib.quote_plus(query) +
           '&cat=1&hp=1&type=1')
    remaining = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as response:
        root = html.fromstring(response.read())
        for node in root.xpath('//li[@class="EBOOK"]'):
            if remaining <= 0:
                break
            href = ''.join(node.xpath('.//a[@class="item_link"]/@href'))
            if not href:
                continue
            result = SearchResult()
            result.cover_url = 'http://bookoteka.pl' + ''.join(
                node.xpath('.//a[@class="item_link"]/img/@src'))
            result.title = ''.join(node.xpath('.//div[@class="shelf_title"]/a/text()')).strip()
            result.author = ''.join(node.xpath('.//div[@class="shelf_authors"][1]/text()')).strip()
            # Decimal comma, Polish convention.
            result.price = ''.join(node.xpath('.//span[@class="EBOOK"]/text()')).replace('.', ',')
            result.detail_item = 'http://bookoteka.pl' + href.strip()
            result.drm = SearchResult.DRM_UNLOCKED
            result.formats = ', '.join(
                node.xpath('.//a[@class="fancybox protected"]/text()')).strip()
            remaining -= 1
            yield result
def search(self, query, max_results=10, timeout=60):
    """Query the bubok.pt calibre reseller endpoint (a DRM-free store)."""
    url = 'http://www.bubok.pt/resellers/calibre_search/' + urllib.quote_plus(query)
    remaining = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as response:
        root = html.fromstring(response.read())
        for node in root.xpath('//div[contains(@class, "libro")]'):
            if remaining <= 0:
                break
            result = SearchResult()
            result.detail_item = ''.join(node.xpath('.//div[@class="url"]/text()')).strip()
            result.title = ''.join(node.xpath('.//div[@class="titulo"]/text()')).strip()
            result.author = ''.join(node.xpath('.//div[@class="autor"]/text()')).strip()
            result.price = ''.join(node.xpath('.//div[@class="precio"]/text()')).strip()
            result.formats = ''.join(node.xpath('.//div[@class="formatos"]/text()')).strip()
            result.cover_url = ''.join(node.xpath('.//div[@class="portada"]/text()')).strip()
            result.drm = SearchResult.DRM_UNLOCKED
            remaining -= 1
            yield result
def search(self, query, max_results=10, timeout=60):
    """Search bookoteka.pl (ebooks only); every result is DRM-free."""
    url = ('http://bookoteka.pl/list?search=' + urllib.quote_plus(query) +
           '&cat=1&hp=1&type=1')
    remaining = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as response:
        root = html.fromstring(response.read())
        for node in root.xpath('//li[@class="EBOOK"]'):
            if remaining <= 0:
                break
            href = ''.join(node.xpath('.//a[@class="item_link"]/@href'))
            if not href:
                continue
            result = SearchResult()
            result.cover_url = 'http://bookoteka.pl' + ''.join(
                node.xpath('.//a[@class="item_link"]/img/@src'))
            result.title = ''.join(node.xpath('.//div[@class="shelf_title"]/a/text()')).strip()
            result.author = ''.join(node.xpath('.//div[@class="shelf_authors"][1]/text()')).strip()
            # Decimal comma, Polish convention.
            result.price = ''.join(node.xpath('.//span[@class="EBOOK"]/text()')).replace('.', ',')
            result.detail_item = 'http://bookoteka.pl' + href.strip()
            result.drm = SearchResult.DRM_UNLOCKED
            result.formats = ', '.join(
                node.xpath('.//a[@class="fancybox protected"]/text()')).strip()
            remaining -= 1
            yield result
def search(self, query, max_results=10, timeout=60):
    """Search kobobooks.com (legacy flow-view layout) for EPUBs."""
    url = 'http://www.kobobooks.com/search/search.html?q=' + urllib.quote_plus(query)
    remaining = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as response:
        root = html.fromstring(response.read())
        for node in root.xpath('//ul[contains(@class, "flowview-items")]/li'):
            if remaining <= 0:
                break
            href = ''.join(node.xpath('./a[contains(@class, "block-link")]/@href'))
            if not href:
                continue
            href = href[1:]  # drop the leading '/'
            result = SearchResult()
            result.price = ''.join(
                node.xpath('.//a[contains(@class, "primary-button")]//text()')).strip()
            # Image src is protocol-relative; pin it to http.
            result.cover_url = 'http:%s' % ''.join(node.xpath('.//img[1]/@src'))
            result.title = ''.join(
                node.xpath('.//p[contains(@class, "flowview-item-title")]//text()')).strip()
            result.detail_item = 'http://store.kobobooks.com/' + href.strip()
            result.formats = 'EPUB'
            result.drm = SearchResult.DRM_UNKNOWN
            remaining -= 1
            yield result
def search(self, query, max_results=10, timeout=60):
    """Search the Harlequin ebook store; results are EPUBs."""
    url = ('http://ebooks.eharlequin.com/BANGSearch.dll?Type=FullText'
           '&FullTextField=All&FullTextCriteria=' + urllib2.quote(query))
    remaining = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as response:
        root = html.fromstring(response.read())
        for node in root.xpath('//table[not(.//@class="sidelink")]/tr[.//ul[@id="details"]]'):
            if remaining <= 0:
                break
            href = ''.join(node.xpath('.//ul[@id="details"]/li[@id="title-results"]/a/@href'))
            if not href:
                continue
            result = SearchResult()
            result.title = ''.join(
                node.xpath('.//ul[@id="details"]/li[@id="title-results"]/a/text()')).strip()
            result.author = ''.join(
                node.xpath('.//ul[@id="details"]/li[@id="author"][1]//a/text()')).strip()
            result.price = ''.join(node.xpath('.//div[@class="ourprice"]/font/text()')).strip()
            # The cover image sits inside the anchor with the same href.
            result.cover_url = ''.join(node.xpath('.//a[@href="%s"]/img/@src' % href))
            result.detail_item = 'http://ebooks.eharlequin.com/' + href.strip()
            result.formats = 'EPUB'
            remaining -= 1
            yield result
def search(self, query, max_results=25, timeout=60):
    """Search ebookpoint.pl (in-stock, e-book formats only); DRM-free store."""
    encoded = quote_plus(query.decode('utf-8').encode('iso-8859-2'))
    url = ('http://ebookpoint.pl/search?qa=&szukaj=' + encoded +
           '&serwisyall=0&wprzyg=0&wsprzed=1&wyczerp=0&formaty=em-p')
    remaining = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as response:
        root = html.fromstring(response.read())
        for node in root.xpath('//ul[@class="list"]/li'):
            if remaining <= 0:
                break
            detail = ''.join(node.xpath('./a/@href'))
            if not detail:
                continue
            result = SearchResult()
            result.detail_item = detail.strip()
            result.cover_url = ''.join(node.xpath('.//p[@class="cover"]/img/@data-src'))
            result.title = ''.join(node.xpath('.//div[@class="book-info"]/h3/a/text()')).strip()
            result.author = ''.join(node.xpath('.//p[@class="author"]//text()')).strip()
            raw_price = ''.join(node.xpath(
                './/p[@class="price price-incart"]/a/ins/text()|.//p[@class="price price-add"]/a/text()'))
            # Decimal comma, Polish convention.
            result.price = raw_price.replace('.', ',')
            result.formats = ', '.join(node.xpath(
                './/ul[@class="book-type book-type-points"]//span[@class="popup"]/span/text()')).upper()
            result.drm = SearchResult.DRM_UNLOCKED
            remaining -= 1
            yield result
def search(self, query, max_results=10, timeout=60):
    """Search the WHSmith e-book department and yield SearchResult objects."""
    url = ('https://www.whsmith.co.uk/search?keywordCategoryId=wc_dept_ebooks&results=60'
           '&page=1&keywords=' + quote(query))
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//li[@class="product"]'):
            if counter <= 0:
                break
            id_ = ''.join(data.xpath('./a[@class="product_image_wrap"]/@href'))
            if not id_:
                continue
            # Product links are site-relative; make them absolute.
            id_ = 'https://www.whsmith.co.uk' + id_
            cover_url = ''.join(data.xpath('.//img[@class="product_image"]/@src'))
            title = ''.join(data.xpath('.//h4[@class="product_title"]/text()'))
            author = ', '.join(data.xpath('.//span[@class="product_second"]/text()'))
            price = ''.join(data.xpath('.//span[@class="price"]/text()'))
            counter -= 1
            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.drm = SearchResult.DRM_LOCKED
            s.detail_item = id_
            s.formats = 'ePub'
            yield s
def search(self, query, max_results=10, timeout=60):
    """Search the Foyles e-book catalogue and yield SearchResult objects."""
    url = 'http://ebooks.foyles.co.uk/catalog/search/?query=' + urllib2.quote(query)
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[@class="doc-item"]'):
            if counter <= 0:
                break
            id_ = ''.join(data.xpath('.//p[@class="doc-cover"]/a/@href')).strip()
            if not id_:
                continue
            # Detail links are site-relative; make them absolute.
            id_ = 'http://ebooks.foyles.co.uk' + id_
            cover_url = ''.join(data.xpath('.//p[@class="doc-cover"]/a/img/@src'))
            title = ''.join(data.xpath('.//span[@class="title"]/a/text()'))
            author = ', '.join(data.xpath('.//span[@class="author"]/span[@class="author"]/text()'))
            price = ''.join(data.xpath('.//span[@itemprop="price"]/text()')).strip()
            # The last span in the format row holds the file format name.
            format_ = ''.join(data.xpath('.//p[@class="doc-meta-format"]/span[last()]/text()'))
            counter -= 1
            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = id_
            s.drm = SearchResult.DRM_LOCKED
            s.formats = format_
            yield s
def search(self, query, max_results=10, timeout=60):
    """Search cdp.pl for e-books, following paginated result pages as needed."""
    br = browser()
    page = 1
    counter = max_results
    while counter:
        with closing(
                br.open(u'https://cdp.pl/ksiazki/e-book.html?q=' +
                        urllib.quote_plus(query) + '&p=' + str(page),
                        timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//ul[@class="grid-of-products"]/li'):
                if counter <= 0:
                    break
                id = ''.join(
                    data.xpath('.//a[@class="product-image"]/@href'))
                if not id:
                    continue
                # Skip non-book products that share the same listing markup.
                if 'ksiazki' not in id:
                    continue
                cover_url = ''.join(
                    data.xpath('.//a[@class="product-image"]/img/@data-src'))
                title = ''.join(data.xpath('.//h3[1]/a/@title'))
                # Integer and fractional price parts are separate elements;
                # join them with the Polish decimal comma.
                price = ''.join(
                    data.xpath('.//span[@class="custom_price"]/text()')
                ) + ',' + ''.join(
                    data.xpath('.//span[@class="custom_price"]/sup/text()'))
                author = ''
                formats = ''
                # Author and format are only available on the detail page,
                # so fetch it with a shorter timeout.
                with closing(br.open(id.strip(), timeout=timeout / 4)) as nf:
                    idata = html.fromstring(nf.read())
                    author = ', '.join(
                        idata.xpath('.//ul[@class="film-data"]/li[1]/p/text()'))
                    formats = idata.xpath(
                        '//div[@class="product-attributes-container"][2]/ul/li/span/text()'
                    )[-1]
                counter -= 1
                s = SearchResult()
                s.cover_url = cover_url
                s.title = title.strip()
                s.author = author
                s.price = price + ' zł'
                s.detail_item = id.strip()
                s.drm = SearchResult.DRM_UNLOCKED
                s.formats = formats.upper().strip()
                yield s
        # Stop when there is no "next page" link.
        if not doc.xpath('//span[@class="next-page"]/a'):
            break
        page += 1
def search(self, query, max_results=10, timeout=60):
    """Search the Sony Reader store and yield SearchResult objects."""
    url = 'http://ebookstore.sony.com/search?keyword=%s' % urllib.quote_plus(
        query)
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for item in doc.xpath(
                '//div[contains(@class, "searchResult")]/'
                'descendant::li[contains(@class, "hreview")]'):
            if counter <= 0:
                break
            # Currency symbol and amount are separate elements in the markup.
            curr = ''.join(
                item.xpath(
                    'descendant::div[@class="pricing"]/descendant::*[@class="currency"]/@title'
                )).strip()
            amt = ''.join(
                item.xpath(
                    'descendant::div[@class="pricing"]/descendant::*[@class="amount"]/text()'
                )).strip()
            s = SearchResult()
            s.price = (curr + ' ' + amt) if (curr and amt) else _('Not Available')
            title = item.xpath('descendant::h3[@class="item"]')
            if not title:
                continue
            # Use text serialisation so nested markup inside the title is flattened.
            title = etree.tostring(title[0], method='text', encoding=unicode)
            if not title:
                continue
            s.title = title.strip()
            s.author = ''.join(
                item.xpath('descendant::li[contains(@class, "author")]/'
                           'a[@class="fn"]/text()')).strip()
            if not s.author:
                continue
            detail_url = ''.join(
                item.xpath('descendant::h3[@class="item"]'
                           '/descendant::a[@class="fn" and @href]/@href'))
            if not detail_url:
                continue
            s.detail_item = detail_url
            counter -= 1
            cover_url = ''.join(
                item.xpath('descendant::li[@class="coverart"]/'
                           'descendant::img[@src]/@src'))
            if cover_url:
                # Normalise protocol-relative and site-relative image URLs.
                if cover_url.startswith('//'):
                    cover_url = 'http:' + cover_url
                elif cover_url.startswith('/'):
                    cover_url = 'http://ebookstore.sony.com' + cover_url
                s.cover_url = url_slash_cleaner(cover_url)
            s.drm = SearchResult.DRM_UNKNOWN
            s.formats = 'Sony'
            yield s
def search(self, query, max_results=10, timeout=60):
    """Search woblink.com and yield SearchResult objects.

    A result carrying a MOBI edition is yielded twice: once as a
    DRM-free MOBI entry, then once more for the remaining formats
    (which are DRM locked).
    """
    url = 'http://woblink.com/publication?query=' + urllib.quote_plus(query.encode('utf-8'))
    # The site supports page sizes of 10, 20 and 30; pick the smallest
    # one that can hold max_results.
    if max_results > 10:
        if max_results > 20:
            url += '&limit=30'
        else:
            url += '&limit=20'
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[@class="book-item"]'):
            if counter <= 0:
                break
            id = ''.join(data.xpath('.//td[@class="w10 va-t"]/a[1]/@href'))
            if not id:
                continue
            cover_url = ''.join(data.xpath('.//td[@class="w10 va-t"]/a[1]/img/@src'))
            title = ''.join(data.xpath('.//h2[@class="title"]/a[1]/text()'))
            author = ', '.join(data.xpath('.//p[@class="author"]/a/text()'))
            price = ''.join(data.xpath('.//div[@class="prices"]/span[1]/span/text()'))
            # Literal replace; the old re.sub('\.', ...) used a non-raw
            # pattern (an invalid escape sequence in Python 3) for what is
            # a plain decimal-comma substitution.
            price = price.replace('.', ',')
            # Format icons are named like '/img/...<format>_....png'.
            formats = [
                form[8:-4].split('_')[0]
                for form in data.xpath('.//p[3]/img/@src')]
            s = SearchResult()
            s.cover_url = 'http://woblink.com' + cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price + ' zł'
            s.detail_item = id.strip()
            # MOBI should be sent first,
            if 'MOBI' in formats:
                t = copy.copy(s)
                t.title += ' MOBI'
                t.drm = SearchResult.DRM_UNLOCKED
                t.formats = 'MOBI'
                formats.remove('MOBI')
                counter -= 1
                yield t
            # and the remaining formats (if any) next
            if formats:
                if 'epub' in formats:
                    # epub here means the site's own WOBLINK container;
                    # an extra EPUB entry exists only for E Ink devices.
                    formats.remove('epub')
                    formats.append('WOBLINK')
                    if 'E Ink' in data.xpath('.//div[@class="prices"]/img/@title'):
                        formats.insert(0, 'EPUB')
                s.drm = SearchResult.DRM_LOCKED
                s.formats = ', '.join(formats).upper()
                counter -= 1
                yield s
def search(query, max_results=10, timeout=60, write_raw_to=None):
    """Search the Project Gutenberg mobile OPDS feed.

    query        -- user supplied keyword string
    max_results  -- maximum number of SearchResult objects to yield
    timeout      -- network timeout in seconds (detail feeds use timeout/4)
    write_raw_to -- optional file path; when given, the raw OPDS response
                    is dumped there for debugging

    Yields SearchResult objects with per-format direct download links.
    """
    url = 'http://m.gutenberg.org/ebooks/search.opds/?query=' + urllib.quote_plus(query)
    counter = max_results
    br = browser(user_agent='calibre/'+__version__)
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
    if write_raw_to is not None:
        with open(write_raw_to, 'wb') as f:
            f.write(raw)
    doc = etree.fromstring(raw)
    for data in doc.xpath('//*[local-name() = "entry"]'):
        if counter <= 0:
            break
        counter -= 1
        s = SearchResult()
        # We could use the <link rel="alternate" type="text/html" ...> tag from the
        # detail odps page but this is easier.
        id = ''.join(data.xpath('./*[local-name() = "id"]/text()')).strip()
        # Keep only the numeric book id from the atom id; use a raw string so
        # \d is a regex class rather than a (deprecated) string escape.
        s.detail_item = fix_url(url_slash_cleaner('%s/ebooks/%s' % (web_url, re.sub(r'[^\d]', '', id))))
        if not s.detail_item:
            continue
        s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
        s.author = ', '.join(data.xpath('./*[local-name() = "content"]//text()')).strip()
        if not s.title or not s.author:
            continue
        # Get the formats and direct download links.
        with closing(br.open(id, timeout=timeout/4)) as nf:
            ndoc = etree.fromstring(nf.read())
            for link in ndoc.xpath('//*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'):
                type = link.get('type')
                href = link.get('href')
                if type:
                    # Map the MIME type to a file-extension label.
                    ext = mimetypes.guess_extension(type)
                    if ext:
                        ext = ext[1:].upper().strip()
                        s.downloads[ext] = fix_url(href)
        s.formats = ', '.join(s.downloads.keys())
        if not s.formats:
            continue
        for link in data.xpath('./*[local-name() = "link"]'):
            rel = link.get('rel')
            href = link.get('href')
            type = link.get('type')
            if rel and href and type:
                href = fix_url(href)
                if rel in ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail'):
                    # Covers arrive inline as base64 data URIs.
                    if href.startswith('data:image/png;base64,'):
                        s.cover_data = base64.b64decode(href.replace('data:image/png;base64,', ''))
        yield s
def search(query, max_results=10, timeout=60, write_raw_to=None):
    """Search the Project Gutenberg mobile OPDS feed; yields SearchResult objects.

    write_raw_to, when given, is a file path where the raw OPDS response
    is dumped for debugging.
    """
    url = 'http://m.gutenberg.org/ebooks/search.opds/?query=' + quote_plus(query)
    counter = max_results
    br = browser(user_agent='calibre/'+__version__)
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
    if write_raw_to is not None:
        with open(write_raw_to, 'wb') as f:
            f.write(raw)
    doc = etree.fromstring(raw)
    for data in doc.xpath('//*[local-name() = "entry"]'):
        if counter <= 0:
            break
        counter -= 1
        s = SearchResult()
        # We could use the <link rel="alternate" type="text/html" ...> tag from the
        # detail odps page but this is easier.
        id = fix_url(''.join(data.xpath('./*[local-name() = "id"]/text()')).strip())
        # Keep only the numeric book id to build the website detail URL.
        s.detail_item = url_slash_cleaner('%s/ebooks/%s' % (web_url, re.sub(r'[^\d]', '', id)))
        s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
        s.author = ', '.join(data.xpath('./*[local-name() = "content"]//text()')).strip()
        if not s.title or not s.author:
            continue
        # Get the formats and direct download links.
        with closing(br.open(id, timeout=timeout/4)) as nf:
            ndoc = etree.fromstring(nf.read())
            for link in ndoc.xpath('//*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'):
                type = link.get('type')
                href = link.get('href')
                if type:
                    # Map the MIME type to a file-extension label.
                    ext = mimetypes.guess_extension(type)
                    if ext:
                        ext = ext[1:].upper().strip()
                        s.downloads[ext] = fix_url(href)
        s.formats = ', '.join(s.downloads.keys())
        if not s.formats:
            continue
        for link in data.xpath('./*[local-name() = "link"]'):
            rel = link.get('rel')
            href = link.get('href')
            type = link.get('type')
            if rel and href and type:
                href = fix_url(href)
                if rel in ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail'):
                    # Covers arrive inline as base64 data URIs;
                    # b64decode wants bytes on Python 3.
                    if href.startswith('data:image/png;base64,'):
                        cdata = href.replace('data:image/png;base64,', '')
                        if not isinstance(cdata, bytes):
                            cdata = cdata.encode('ascii')
                        s.cover_data = base64.b64decode(cdata)
        yield s
def search(self, query, max_results=10, timeout=60):
    """Search nexto.pl, following paginated result pages as needed."""
    url = 'http://www.nexto.pl/szukaj.xml?search-clause=' + urllib.parse.quote_plus(
        query) + '&scid=1015'
    br = browser()
    offset = 0
    counter = max_results
    while counter:
        with closing(
                br.open(url + '&_offset=' + str(offset),
                        timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//ul[@class="productslist"]/li'):
                if counter <= 0:
                    break
                id = ''.join(data.xpath('.//div[@class="col-2"]/a/@href'))
                if not id:
                    continue
                price = ''.join(
                    data.xpath('.//strong[@class="nprice"]/text()'))
                cover_url = ''.join(
                    data.xpath('.//img[@class="cover"]/@src'))
                # Un-escape the path and request a smaller thumbnail size.
                cover_url = re.sub(r'%2F', '/', cover_url)
                cover_url = re.sub(r'widthMax=120&heightMax=200',
                                   'widthMax=64&heightMax=64', cover_url)
                title = ''.join(data.xpath('.//a[@class="title"]/text()'))
                # Drop the " – ebook" suffix the site appends to titles.
                title = re.sub(r' – ebook', '', title)
                author = ', '.join(
                    data.xpath('.//div[@class="col-7"]//h4//a/text()'))
                formats = ', '.join(
                    data.xpath('.//ul[@class="formats"]/li//b/text()'))
                # A watermark ("znak wodny") marker in the format tooltips
                # indicates a DRM-free edition.
                DrmFree = re.search(
                    r'znak',
                    str(data.xpath(
                        './/ul[@class="formats"]/li//b/@title')))
                counter -= 1
                s = SearchResult()
                s.cover_url = cover_url if cover_url[:4] == 'http' else 'http://www.nexto.pl' + cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price.strip()
                s.detail_item = id.strip()
                s.drm = SearchResult.DRM_UNLOCKED if DrmFree else SearchResult.DRM_LOCKED
                s.formats = formats.upper().strip()
                yield s
        # Stop when there is no "next" pager link.
        if not doc.xpath('//div[@class="listnavigator"]//a[@class="next"]'):
            break
        offset += 10
def search(self, query, max_results=10, timeout=60):
    """Search the Amazon.fr Kindle store and yield SearchResult objects."""
    search_url = 'http://www.amazon.fr/s/?url=search-alias%3Ddigital-text&field-keywords='
    # Hand-rolled percent encoding of non-ASCII query characters.
    url = search_url + query.encode('ascii', 'backslashreplace').replace(
        '%', '%25').replace('\\x', '%').replace(' ', '+')
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        # doc = html.fromstring(f.read().decode('latin-1', 'replace'))
        # Apparently amazon Europe is responding in UTF-8 now
        doc = html.fromstring(f.read())
        data_xpath = '//div[contains(@class, "result") and contains(@class, "product")]'
        format_xpath = './/span[@class="format"]/text()'
        cover_xpath = './/img[@class="productImage"]/@src'
        for data in doc.xpath(data_xpath):
            if counter <= 0:
                break
            # Even though we are searching digital-text only Amazon will still
            # put in results for non Kindle books (author pages). So we need
            # to explicitly check if the item is a Kindle book and ignore it
            # if it isn't.
            format = ''.join(data.xpath(format_xpath))
            if 'kindle' not in format.lower():
                continue
            # We must have an asin otherwise we can't easily reference the
            # book later.
            asin = ''.join(data.xpath("@name"))
            cover_url = ''.join(data.xpath(cover_xpath))
            title = ''.join(data.xpath('.//a[@class="title"]/text()'))
            price = ''.join(
                data.xpath(
                    './/div[@class="newPrice"]/span[contains(@class, "price")]/text()'
                ))
            author = unicode(''.join(
                data.xpath(
                    './/h3[@class="title"]/span[@class="ptBrand"]/text()')))
            # Strip the French "de " (by) prefix from the author line.
            if author.startswith('de '):
                author = author[3:]
            counter -= 1
            s = SearchResult()
            s.cover_url = cover_url.strip()
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = asin.strip()
            s.formats = 'Kindle'
            s.drm = SearchResult.DRM_UNKNOWN
            yield s
def search(self, query, max_results=10, timeout=60):
    """Search chitanka.info (Bulgarian library); yields DRM-free SearchResult objects."""
    # check for cyrillic symbols before performing search
    # (the site only indexes Cyrillic titles, minimum 3 characters)
    uquery = unicode(query.strip(), 'utf-8')
    reObj = re.search(u'^[а-яА-Я\\d\\s]{3,}$', uquery)
    if not reObj:
        return
    base_url = 'http://chitanka.info'
    url = base_url + '/search?q=' + urllib2.quote(query)
    counter = max_results
    # search for book title
    br = browser()
    try:
        with closing(br.open(url, timeout=timeout)) as f:
            f = unicode(f.read(), 'utf-8')
            doc = html.fromstring(f)
            for data in doc.xpath('//ul[@class="superlist booklist"]/li'):
                if counter <= 0:
                    break
                id = ''.join(
                    data.xpath('.//a[@class="booklink"]/@href')).strip()
                if not id:
                    continue
                counter -= 1
                s = SearchResult()
                s.cover_url = ''.join(
                    data.xpath('.//a[@class="booklink"]/img/@src')).strip()
                s.title = ''.join(
                    data.xpath('.//a[@class="booklink"]/i/text()')).strip()
                s.author = ''.join(
                    data.xpath('.//span[@class="bookauthor"]/a/text()')).strip()
                s.detail_item = id
                s.drm = SearchResult.DRM_UNLOCKED
                # Direct download links; drop the .zip suffix to get the
                # bare file for each format.
                s.downloads['FB2'] = base_url + ''.join(
                    data.xpath('.//a[@class="dl dl-fb2"]/@href')).strip(
                    ).replace('.zip', '')
                s.downloads['EPUB'] = base_url + ''.join(
                    data.xpath('.//a[@class="dl dl-epub"]/@href')).strip(
                    ).replace('.zip', '')
                s.downloads['TXT'] = base_url + ''.join(
                    data.xpath('.//a[@class="dl dl-txt"]/@href')).strip(
                    ).replace('.zip', '')
                s.formats = 'FB2, EPUB, TXT, SFB'
                yield s
    except urllib2.HTTPError, e:
        # A 404 simply means no results; anything else is a real error.
        if e.code == 404:
            return
        else:
            raise
def search(self, query, max_results=10, timeout=60):
    """Search the Barnes & Noble Nook store and yield SearchResult objects.

    Cover URLs are not in the result markup; they are recovered from an
    inline JavaScript blob keyed by the product image element's id.
    """
    url = 'http://www.barnesandnoble.com/s/%s?keyword=%s&store=ebook&view=list' % (
        query.replace(' ', '-'), urllib.parse.quote_plus(query))
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
        doc = html.fromstring(raw)
        for data in doc.xpath(
                '//ol[contains(@class, "result-set")]/li[contains(@class, "result")]'
        ):
            if counter <= 0:
                break
            id = ''.join(
                data.xpath(
                    './/div[contains(@class, "image-block")]/a/@href'))
            if not id:
                continue
            cover_url = ''
            cover_id = ''.join(
                data.xpath(
                    './/img[contains(@class, "product-image")]/@id'))
            # Look the cover up in the page's inline JS. re.escape guards
            # against regex metacharacters in the element id — previously
            # the id was interpolated unescaped into the pattern.
            m = re.search(r"%s'.*?srcUrl: '(?P<iurl>.*?)'.*?}" % re.escape(cover_id),
                          raw)
            if m:
                cover_url = m.group('iurl')
            title = ''.join(
                data.xpath(
                    'descendant::p[@class="title"]//span[@class="name"]//text()'
                )).strip()
            if not title:
                continue
            author = ', '.join(
                data.xpath(
                    './/ul[contains(@class, "contributors")]//a[contains(@class, "subtle")]//text()'
                )).strip()
            price = ''.join(
                data.xpath('.//a[contains(@class, "bn-price")]//text()'))
            counter -= 1
            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = id.strip()
            s.drm = SearchResult.DRM_UNKNOWN
            s.formats = 'Nook'
            yield s
def search(self, query, max_results=10, timeout=60):
    """Search Waterstones for e-books and yield SearchResult objects."""
    # Prefix the query with "ebook " to bias results toward e-books.
    url = 'http://www.waterstones.com/waterstonesweb/simpleSearch.do?simpleSearchString=ebook+' + urllib2.quote(
        query)
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[contains(@class, "results-pane")]'):
            if counter <= 0:
                break
            id = ''.join(data.xpath('./div/div/h2/a/@href')).strip()
            if not id:
                continue
            cover_url = ''.join(
                data.xpath('.//div[@class="image"]/a/img/@src'))
            if not cover_url.startswith("http"):
                cover_url = 'http://www.waterstones.com' + cover_url
            title = ''.join(data.xpath('./div/div/h2/a/text()'))
            author = ', '.join(
                data.xpath('.//p[@class="byAuthor"]/a/text()'))
            price = ''.join(
                data.xpath(
                    './/p[@class="price"]/span[@class="priceRed2"]/text()'))
            # Probe the format table for DRM/format markers; boolean()
            # keeps these cheap single-value XPath queries.
            drm = data.xpath(
                'boolean(.//td[@headers="productFormat" and contains(., "DRM")])'
            )
            pdf = data.xpath(
                'boolean(.//td[@headers="productFormat" and contains(., "PDF")])'
            )
            epub = data.xpath(
                'boolean(.//td[@headers="productFormat" and contains(., "EPUB")])'
            )
            counter -= 1
            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            if drm:
                s.drm = SearchResult.DRM_LOCKED
            else:
                # Absence of a DRM marker is not proof of DRM-free.
                s.drm = SearchResult.DRM_UNKNOWN
            s.detail_item = id
            formats = []
            if epub:
                formats.append('ePub')
            if pdf:
                formats.append('PDF')
            s.formats = ', '.join(formats)
            yield s
def search_kobo(query, max_results=10, timeout=60, write_html_to=None):
    """Search the Kobo store; yields SearchResult objects.

    write_html_to, when given, is a file path where the raw result page
    is dumped for debugging.
    """
    from css_selectors import Select
    url = 'http://www.kobobooks.com/search/search.html?q=' + urllib.quote_plus(query)
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
    if write_html_to is not None:
        with open(write_html_to, 'wb') as f:
            f.write(raw)
    doc = html.fromstring(raw)
    select = Select(doc)
    for i, item in enumerate(select('.result-items .item-wrapper.book')):
        if i == max_results:
            break
        # Each for/else below takes the first matching element, falling
        # back to None when the selector matches nothing.
        for img in select('.item-image img[src]', item):
            cover_url = img.get('src')
            if cover_url.startswith('//'):
                cover_url = 'http:' + cover_url
            break
        else:
            cover_url = None
        for p in select('p.title', item):
            title = etree.tostring(p, method='text', encoding=unicode).strip()
            for a in select('a[href]', p):
                url = 'http://store.kobobooks.com' + a.get('href')
                break
            else:
                url = None
            break
        else:
            title = None
        authors = []
        for a in select('p.author a.contributor', item):
            authors.append(etree.tostring(a, method='text', encoding=unicode).strip())
        authors = authors_to_string(authors)
        for p in select('p.price', item):
            price = etree.tostring(p, method='text', encoding=unicode).strip()
            break
        else:
            price = None
        # Only yield results complete enough to be actionable.
        if title and authors and url:
            s = SearchResult()
            s.cover_url = cover_url
            s.title = title
            s.author = authors
            s.price = price
            s.detail_item = url
            s.formats = 'EPUB'
            s.drm = SearchResult.DRM_UNKNOWN
            yield s
def parse_search_result(self, node):
    """Build a SearchResult from a single listing node on a search page."""
    result = SearchResult()
    result.title = text(node, './/*', 'Item__title', '/a/text()')
    result.author = text(node, './/*', 'Item__authors')
    result.detail_item = text(node, './/*', 'Item__title', '/a/@href')
    result.cover_url = text(node, './/img', 'Item__image', '/@data-src')
    result.formats = text(node, './/*', 'Item__format-as-link')
    # Prices appear bare on the page; append the currency suffix.
    result.price = text(node, './/*', 'pricing__price') + ' kr'
    return result
def search(self, query, max_results=10, timeout=60):
    """Search ebooki.allegro.pl, following paginated result pages as needed."""
    br = browser()
    page = 1
    counter = max_results
    while counter:
        with closing(
                br.open('http://ebooki.allegro.pl/szukaj?fraza=' +
                        urllib.quote(query) + '&strona=' + str(page),
                        timeout=timeout)) as f:
            doc = html.fromstring(f.read().decode('utf-8'))
            for data in doc.xpath(
                    '//div[@class="listing-list"]/div[@class="listing-list-item"]'
            ):
                if counter <= 0:
                    break
                id = ''.join(
                    data.xpath(
                        './/div[@class="listing-cover-wrapper"]/a/@href'))
                if not id:
                    continue
                cover_url = ''.join(
                    data.xpath(
                        './/div[@class="listing-cover-wrapper"]/a/img/@src'))
                title = ''.join(
                    data.xpath(
                        './/div[@class="listing-info"]/div[1]/a/text()'))
                author = ', '.join(
                    data.xpath(
                        './/div[@class="listing-info"]/div[2]/a/text()'))
                price = ''.join(
                    data.xpath('.//div[@class="book-price"]/text()'))
                formats = ', '.join(
                    data.xpath(
                        './/div[@class="listing-buy-formats"]//div[@class="devices-wrapper"]/span[@class="device-label"]/span/text()'
                    ))
                counter -= 1
                s = SearchResult()
                s.cover_url = 'http://ebooki.allegro.pl/' + cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price
                # href starts with '/', drop it before joining with the base.
                s.detail_item = 'http://ebooki.allegro.pl/' + id[1:]
                s.formats = formats.upper()
                s.drm = SearchResult.DRM_UNLOCKED
                yield s
        # Stop when there is no "next page" arrow.
        if not doc.xpath('//a[@class="paging-arrow right-paging-arrow"]'):
            break
        page += 1
def search(self, query, max_results=10, timeout=60):
    """Search Baen Ebooks and yield DRM-free SearchResult objects."""
    url = 'http://www.baenebooks.com/searchadv.aspx?IsSubmit=true&SearchTerm=' + urllib2.quote(
        query)
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//table//table//table//table//tr'):
            if counter <= 0:
                break
            id = ''.join(data.xpath('./td[1]/a/@href'))
            # Only product links (p-<num>-...) are actual book rows.
            if not id or not id.startswith('p-'):
                continue
            title = ''.join(data.xpath('./td[1]/a/text()'))
            author = ''
            cover_url = ''
            price = ''
            # Author, price and cover are only on the detail page; fetch it
            # with a shorter timeout.
            with closing(
                    br.open('http://www.baenebooks.com/' + id.strip(),
                            timeout=timeout / 4)) as nf:
                idata = html.fromstring(nf.read())
                author = ''.join(
                    idata.xpath(
                        '//span[@class="ProductNameText"]/../b/text()'))
                author = author.split('by ')[-1]
                price = ''.join(
                    idata.xpath('//span[@class="variantprice"]/text()'))
                # Normalise the price to start at the dollar sign.
                a, b, price = price.partition('$')
                price = b + price
                # The cover image element id embeds the product number.
                pnum = ''
                mo = re.search(r'p-(?P<num>\d+)-', id.strip())
                if mo:
                    pnum = mo.group('num')
                if pnum:
                    cover_url = 'http://www.baenebooks.com/' + ''.join(
                        idata.xpath(
                            '//img[@id="ProductPic%s"]/@src' % pnum))
            counter -= 1
            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = id.strip()
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = 'RB, MOBI, EPUB, LIT, LRF, RTF, HTML'
            yield s
def parse_book_details(self, node):
    """Build a SearchResult from a book's detail-page node."""
    details = SearchResult()
    details.title = text(node, './/*[@itemprop="name"]')
    details.author = text(node, './/*[@itemprop="author"]')
    details.cover_url = text(node, './/img[@itemprop="image"]', '', '/@src')
    details.price = text(node, './/*', 'price ')
    # The "Mediatyp" table row carries the format description; the same
    # string is reused as the DRM indicator.
    details.formats = text(node, '//th[contains(., "Mediatyp")]/following-sibling::td[1]')
    details.drm = details.formats
    return details
def parse_book_details(self, node):
    """Build a SearchResult from a book's detail-page node."""
    details = SearchResult()
    details.title = text(node, './/*[@itemprop="name"]')
    details.author = text(node, './/*', 'bookdetails__authorname')
    details.cover_url = text(node, './/img[@itemprop="image"]', '', '/@src')
    details.price = text(node, './/*', 'bookdetails__price')
    # Format and DRM each live in the second span of their info block.
    details.formats = text(node, './/*', 'book_info__format', '/span[2]/text()')
    details.drm = text(node, './/*', 'book_info__drm', '/span[2]/text()')
    return details
def search(self, query, max_results=10, timeout=60):
    """Search the Sony Reader store and yield SearchResult objects."""
    url = "http://ebookstore.sony.com/search?keyword=%s" % urllib.quote_plus(query)
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for item in doc.xpath(
            '//div[contains(@class, "searchResult")]/'
            'descendant::li[contains(@class, "hreview")]'
        ):
            if counter <= 0:
                break
            # Currency symbol and amount are separate elements in the markup.
            curr = "".join(
                item.xpath('descendant::div[@class="pricing"]/descendant::*[@class="currency"]/@title')
            ).strip()
            amt = "".join(
                item.xpath('descendant::div[@class="pricing"]/descendant::*[@class="amount"]/text()')
            ).strip()
            s = SearchResult()
            s.price = (curr + " " + amt) if (curr and amt) else _("Not Available")
            title = item.xpath('descendant::h3[@class="item"]')
            if not title:
                continue
            # Text serialisation flattens nested markup inside the title.
            title = etree.tostring(title[0], method="text", encoding=unicode)
            if not title:
                continue
            s.title = title.strip()
            s.author = "".join(
                item.xpath('descendant::li[contains(@class, "author")]/' 'a[@class="fn"]/text()')
            ).strip()
            if not s.author:
                continue
            detail_url = "".join(
                item.xpath('descendant::h3[@class="item"]' '/descendant::a[@class="fn" and @href]/@href')
            )
            if not detail_url:
                continue
            # Normalise protocol-relative detail links.
            if detail_url.startswith("/"):
                detail_url = "http:" + detail_url
            s.detail_item = detail_url
            counter -= 1
            cover_url = "".join(item.xpath('descendant::li[@class="coverart"]/' "descendant::img[@src]/@src"))
            if cover_url:
                # Normalise protocol-relative and site-relative image URLs.
                if cover_url.startswith("//"):
                    cover_url = "http:" + cover_url
                elif cover_url.startswith("/"):
                    cover_url = "http://ebookstore.sony.com" + cover_url
                s.cover_url = url_slash_cleaner(cover_url)
            s.drm = SearchResult.DRM_UNKNOWN
            s.formats = "Sony"
            yield s
def search(self, query, max_results=10, timeout=60):
    '''
    XinXii's open search url is:
    http://www.xinxii.com/catalog-search/query/?keywords={searchTerms}&pw={startPage?}&doc_lang={docLang}&ff={docFormat},{docFormat},{docFormat}

    This url requires the docLang and docFormat. However, the search itself
    sent to XinXii does not require them. They can be ignored. We cannot
    push this into the standard OpenSearchOPDSStore search because of the
    required attributes.

    XinXii doesn't return all info supported by OpenSearchOPDSStore
    search function so this one is modified to remove parts that are used.
    '''
    url = 'http://www.xinxii.com/catalog-search/query/?keywords=' + urllib.quote_plus(query)
    counter = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        doc = etree.fromstring(f.read())
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break
            counter -= 1
            s = SearchResult()
            # Fall back to the atom id; the alternate link below overrides it.
            s.detail_item = ''.join(data.xpath('./*[local-name() = "id"]/text()')).strip()
            for link in data.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                type = link.get('type')
                if rel and href and type:
                    if rel in ('http://opds-spec.org/thumbnail',
                               'http://opds-spec.org/image/thumbnail'):
                        s.cover_url = href
                    if rel == 'alternate':
                        s.detail_item = href
            s.formats = 'EPUB, PDF'
            s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(data.xpath('./*[local-name() = "author"]//*[local-name() = "name"]//text()')).strip()
            # Combine the currency code attribute with the numeric price text.
            price_e = data.xpath('.//*[local-name() = "price"][1]')
            if price_e:
                price_e = price_e[0]
                currency_code = price_e.get('currencycode', '')
                price = ''.join(price_e.xpath('.//text()')).strip()
                s.price = currency_code + ' ' + price
                s.price = s.price.strip()
            yield s
def deseralize_books(self, sbooks):
    """Rebuild SearchResult objects from a list of serialized dicts."""
    results = []
    for record in sbooks:
        book = SearchResult()
        # Missing keys default to the empty string.
        for attr in ('author', 'title', 'detail_item', 'formats'):
            setattr(book, attr, record.get(attr, ''))
        results.append(book)
    return results
def search_amazon(query, max_results=10, timeout=60, write_html_to=None,
                  base_url=SEARCH_BASE_URL,
                  base_query=SEARCH_BASE_QUERY,
                  field_keywords='k'
                  ):
    """Search the Amazon Kindle store and yield SearchResult objects.

    query          -- user supplied keyword string
    max_results    -- maximum number of results to yield (BUG FIX: this
                      was previously initialised but never enforced)
    timeout        -- network timeout in seconds
    write_html_to  -- optional path to dump the raw result page for debugging
    base_url, base_query, field_keywords -- hooks that let country-specific
                      Amazon stores reuse this implementation
    """
    uquery = base_query.copy()
    uquery[field_keywords] = query

    def asbytes(x):
        # urlencode needs byte strings on Python 2.
        if isinstance(x, type('')):
            x = x.encode('utf-8')
        return x
    uquery = {asbytes(k):asbytes(v) for k, v in uquery.items()}
    url = base_url + '?' + urlencode(uquery)
    br = browser(user_agent=get_user_agent())

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
    if write_html_to is not None:
        with open(write_html_to, 'wb') as f:
            f.write(raw)
    doc = html.fromstring(raw)
    for result in doc.xpath('//div[contains(@class, "s-result-list")]//div[@data-index and @data-asin]'):
        # BUG FIX: honour max_results; the counter was never checked before.
        if counter <= 0:
            break
        kformat = ''.join(result.xpath('.//a[contains(text(), "Kindle Edition")]//text()'))
        # Even though we are searching digital-text only Amazon will still
        # put in results for non Kindle books (author pages). So we need
        # to explicitly check if the item is a Kindle book and ignore it
        # if it isn't.
        if 'kindle' not in kformat.lower():
            continue
        asin = result.get('data-asin')
        if not asin:
            continue
        cover_url = ''.join(result.xpath('.//img/@src'))
        title = etree.tostring(result.xpath('.//h5')[0], method='text', encoding='unicode')
        adiv = result.xpath('.//div[contains(@class, "a-color-secondary")]')[0]
        aparts = etree.tostring(adiv, method='text', encoding='unicode').split()
        # The byline looks like "by A. Author | Kindle Edition | date";
        # the author is everything between "by" and the first bar. Guard
        # against bylines that carry no bar at all.
        try:
            idx = aparts.index('|')
        except ValueError:
            idx = len(aparts)
        author = ' '.join(aparts[1:idx])
        price = ''.join(result.xpath('.//span[contains(@class, "a-price")]/span[contains(@class, "a-offscreen")]/text()'))
        counter -= 1
        s = SearchResult()
        s.cover_url = cover_url.strip()
        s.title = title.strip()
        s.author = author.strip()
        s.detail_item = asin.strip()
        s.price = price.strip()
        s.formats = 'Kindle'
        yield s
def search(self, query, max_results=10, timeout=60):
    """Search ebook.nl and yield SearchResult objects."""
    url = ('http://www.ebook.nl/store/advanced_search_result.php?keywords=' +
           urllib2.quote(query))
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//table[contains(@class, "productListing")]/tr'):
            if counter <= 0:
                break
            details = data.xpath('./td/div[@class="prodImage"]/a')
            if not details:
                continue
            details = details[0]
            # Extract the bare product id from the href: strip the path
            # prefix and any trailing query string.
            id = ''.join(details.xpath('./@href')).strip()
            id = id[id.rfind('/')+1:]
            i = id.rfind('?')
            if i > 0:
                id = id[:i]
            if not id:
                continue
            cover_url = 'http://www.ebook.nl/store/' + ''.join(details.xpath('./img/@src'))
            title = ''.join(details.xpath('./img/@title')).strip()
            author = ''.join(data.xpath('./td/div[@class="prodTitle"]/h3/a/text()')).strip()
            price = ''.join(data.xpath('./td/div[@class="prodTitle"]/b/text()'))
            # Dutch product descriptions state the file format and whether
            # the book is watermarked ("watermerk") / DRM-free ("zonder DRM").
            pdf = data.xpath('boolean(./td/div[@class="prodTitle"]/'
                             'p[contains(text(), "Bestandsformaat: Pdf")])')
            epub = data.xpath('boolean(./td/div[@class="prodTitle"]/'
                              'p[contains(text(), "Bestandsformaat: ePub")])')
            nodrm = data.xpath('boolean(./td/div[@class="prodTitle"]/'
                               'p[contains(text(), "zonder DRM") or'
                               ' contains(text(), "watermerk")])')
            counter -= 1
            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            if nodrm:
                s.drm = SearchResult.DRM_UNLOCKED
            else:
                s.drm = SearchResult.DRM_LOCKED
            s.detail_item = id
            formats = []
            if epub:
                formats.append('ePub')
            if pdf:
                formats.append('PDF')
            s.formats = ','.join(formats)
            yield s
def search(self, query, max_results=10, timeout=60):
    """Search ebook.de and yield SearchResult objects."""
    url = ('http://www.ebook.de/de/pathSearch?nav=52122&searchString=' +
           urllib2.quote(query))
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[contains(@class, "articlecontainer")]'):
            if counter <= 0:
                break
            details = data.xpath('./div[contains(@class, "articleinfobox")]')
            if not details:
                continue
            details = details[0]
            # The anchor name attribute holds the product id.
            id_ = ''.join(details.xpath('./a/@name')).strip()
            if not id_:
                continue
            title = ''.join(details.xpath('./h3[@class="title"]/a/text()')).strip()
            author = ''.join(details.xpath('.//div[@class="author"]/text()')).strip()
            # Strip the German "von " (by) prefix from the author line.
            if author.startswith('von'):
                author = author[4:]
            pdf = details.xpath(
                'boolean(.//span[@class="bindername" and contains(text(), "pdf")]/text())')
            epub = details.xpath(
                'boolean(.//span[@class="bindername" and contains(text(), "epub")]/text())')
            mobi = details.xpath(
                'boolean(.//span[@class="bindername" and contains(text(), "mobipocket")]/text())')
            cover_url = ''.join(data.xpath('.//div[@class="coverimg"]/a/img/@src'))
            # Drop the footnote asterisk the site appends to prices.
            price = ''.join(data.xpath('.//div[@class="preis"]/text()')).replace('*', '').strip()
            counter -= 1
            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.drm = SearchResult.DRM_UNKNOWN
            s.detail_item = id_
            formats = []
            if epub:
                formats.append('ePub')
            if pdf:
                formats.append('PDF')
            if mobi:
                formats.append('MOBI')
            s.formats = ', '.join(formats)
            yield s
def search(self, query, max_results=10, timeout=60):
    """Search the Sony Reader Store (AU) and yield SearchResult objects."""
    url = self.SEARCH_URL % urllib.quote_plus(query)
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for item in doc.xpath('//div[@id="searchresult-list"]/descendant::div[contains(@class, "doc-item")]'):
            if counter <= 0:
                break
            s = SearchResult()
            s.price = _('Not Available')
            p = ''.join(item.xpath('descendant::p[@class="doc-price"]/descendant::span[@itemprop="price"]/text()')).strip()
            if p:
                # Keep only the amount after the dollar sign.
                s.price = 'AUD ' + p.split('$')[-1]
            title = item.xpath('descendant::h3[@class="doc-title"]')
            if not title:
                continue
            # Text serialisation flattens nested markup inside the title.
            title = etree.tostring(title[0], method='text', encoding=unicode)
            if not title:
                continue
            # Append the subtitle, when present, to the title.
            st = item.xpath('descendant::p[@class="doc-subtitle"]')
            if st:
                st = etree.tostring(st[0], method='text', encoding=unicode)
                if st and st.strip():
                    title = title.strip() + ': ' + st
            s.title = title.strip()
            aut = item.xpath('descendant::p[@class="doc-author"]')
            if not aut:
                continue
            s.author = etree.tostring(aut[0], method='text', encoding=unicode).strip()
            if not s.author:
                continue
            du = ''.join(item.xpath('descendant::h3[position() = 1 and @class="doc-title"]/descendant::a[position() = 1 and @href]/@href')).strip()
            if not du:
                continue
            detail_url = 'https://au.readerstore.sony.com'+du
            s.detail_item = detail_url
            counter -= 1
            cover_url = ''.join(item.xpath(
                'descendant::p[@class="doc-cover" and position() = 1]/'
                'descendant::img[position() = 1 and @src]/@src'))
            if cover_url:
                s.cover_url = url_slash_cleaner(cover_url)
            s.drm = SearchResult.DRM_UNKNOWN
            s.formats = 'Sony'
            yield s
def search(self, query, max_results=10, timeout=60):
    # Search the UK Nook store; yields up to max_results SearchResult objects.
    url = u'http://uk.nook.com/s/%s?s%%5Bdref%%5D=1&s%%5Bkeyword%%5D=%s' % (
        query.replace(' ', '-'), urllib.quote(query))
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        page = html.fromstring(f.read())
        for entry in page.xpath('//ul[contains(@class, "product_list")]/li'):
            if remaining <= 0:
                break

            detail = ''.join(
                entry.xpath('.//span[contains(@class, "image")]/a/@href'))
            if not detail:
                continue
            # Normalise the relative link: drop the '/gb' locale prefix and
            # anchor it at the UK site root.
            if detail.startswith('/gb'):
                detail = detail[3:]
            detail = 'http://uk.nook.com' + detail.strip()

            image = ''.join(
                entry.xpath('.//span[contains(@class, "image")]//img/@data-src'))

            name = ''.join(
                entry.xpath('.//div[contains(@class, "title")]//text()')).strip()
            if not name:
                continue

            writer = ', '.join(
                entry.xpath('.//div[contains(@class, "contributor")]//a/text()')).strip()

            amount = ''.join(
                entry.xpath('.//div[contains(@class, "action")]//a//text()')).strip()
            # Keep only digits, separators and the pound sign.
            amount = re.sub(r'[^\d.,£]', '', amount)

            remaining -= 1

            s = SearchResult()
            s.cover_url = image
            s.title = name.strip()
            s.author = writer.strip()
            s.price = amount.strip()
            s.detail_item = detail
            s.drm = SearchResult.DRM_UNKNOWN
            s.formats = 'Nook'
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search the Kindle store on amazon.fr.

    Yields up to ``max_results`` SearchResult objects. Non-Kindle hits
    (e.g. author pages) that Amazon mixes into digital-text results are
    skipped, as are items lacking an ASIN.

    :param query: search string.
    :param max_results: maximum number of results to yield.
    :param timeout: network timeout in seconds.
    '''
    search_url = "http://www.amazon.fr/s/?url=search-alias%3Ddigital-text&field-keywords="
    # Percent-encode the query by hand: escape literal '%' first, then turn
    # backslash-escaped non-ASCII bytes ('\xNN') into '%NN', spaces into '+'.
    url = search_url + query.encode("ascii", "backslashreplace").replace("%", "%25").replace("\\x", "%").replace(
        " ", "+"
    )
    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        # doc = html.fromstring(f.read().decode('latin-1', 'replace'))
        # Apparently amazon Europe is responding in UTF-8 now
        doc = html.fromstring(f.read())

        data_xpath = '//div[contains(@class, "result") and contains(@class, "product")]'
        format_xpath = './/span[@class="format"]/text()'
        cover_xpath = './/img[@class="productImage"]/@src'

        for data in doc.xpath(data_xpath):
            if counter <= 0:
                break

            # Even though we are searching digital-text only Amazon will still
            # put in results for non Kindle books (author pages). So we need
            # to explicitly check if the item is a Kindle book and ignore it
            # if it isn't.
            fmt = "".join(data.xpath(format_xpath))  # renamed from 'format': don't shadow the builtin
            if "kindle" not in fmt.lower():
                continue

            # We must have an asin otherwise we can't easily reference the
            # book later.
            asin = "".join(data.xpath("@name"))

            cover_url = "".join(data.xpath(cover_xpath))
            title = "".join(data.xpath('.//a[@class="title"]/text()'))
            price = "".join(data.xpath('.//div[@class="newPrice"]/span[contains(@class, "price")]/text()'))
            author = unicode("".join(data.xpath('.//h3[@class="title"]/span[@class="ptBrand"]/text()')))
            # Strip the French 'de ' ("by ") prefix from the byline.
            if author.startswith("de "):
                author = author[3:]

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url.strip()
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = asin.strip()
            s.formats = "Kindle"
            s.drm = SearchResult.DRM_UNKNOWN

            yield s
def search(self, query, max_results=10, timeout=60):
    # Query beam-ebooks.de; the store sells DRM-free books only.
    url = 'http://www.beam-ebooks.de/suchergebnis.php?Type=&limit={0}&sw={1}'.format(
        max_results, urllib2.quote(query))
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        page = html.fromstring(f.read())
        for row in page.xpath('//table[tr/td/div[@class="stil2"]]'):
            if remaining <= 0:
                break

            link = ''.join(row.xpath('./tr/td[1]/a/@href')).strip()
            if not link:
                continue
            # Drop the leading path component to leave the store item id.
            link = link[7:]

            image = ''.join(row.xpath('./tr/td[1]/a/img/@src'))
            if image:
                image = 'http://www.beam-ebooks.de' + image

            # The cover's alt text reads 'eBook<author>:<title>'; skip rows
            # that do not follow that pattern.
            alt = ''.join(row.xpath('./tr/td[1]/a/img/@alt'))
            head, sep, tail = alt.partition(':')
            if not alt.startswith('eBook') or not sep:
                continue
            writer = head[5:]
            name = tail

            amount = ''.join(row.xpath('./tr/td[3]/text()'))
            has_pdf = row.xpath(
                'boolean(./tr/td[3]/a/img[contains(@alt, "PDF")]/@alt)')
            has_epub = row.xpath(
                'boolean(./tr/td[3]/a/img[contains(@alt, "ePub")]/@alt)')
            has_mobi = row.xpath(
                'boolean(./tr/td[3]/a/img[contains(@alt, "Mobipocket")]/@alt)')

            remaining -= 1

            s = SearchResult()
            s.cover_url = image
            s.title = name.strip()
            s.author = writer.strip()
            s.price = amount
            s.drm = SearchResult.DRM_UNLOCKED
            s.detail_item = link

            fmts = []
            if has_epub:
                fmts.append('ePub')
            if has_pdf:
                fmts.append('PDF')
            if has_mobi:
                fmts.append('MOBI')
            s.formats = ', '.join(fmts)

            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search nexto.pl, paginating through results 10 at a time.

    Yields up to ``max_results`` SearchResult objects. One extra request
    per hit fetches the author from the book's detail page, since the
    listing itself does not carry it.

    :param query: search string.
    :param max_results: maximum number of results to yield.
    :param timeout: network timeout in seconds for the listing request;
        detail-page requests use a quarter of it.
    '''
    url = 'http://www.nexto.pl/szukaj.xml?search-clause=' + urllib.quote_plus(query) + '&scid=1015'
    br = browser()

    offset = 0
    counter = max_results
    while counter:
        with closing(br.open(url + '&_offset=' + str(offset), timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//ul[@class="productslist"]/li'):
                if counter <= 0:
                    break

                # Renamed from 'id' so the builtin is not shadowed.
                book_id = ''.join(data.xpath('.//div[@class="cover_container"]/a[1]/@href'))
                if not book_id:
                    continue

                price = ''.join(data.xpath('.//strong[@class="nprice"]/text()'))

                cover_url = ''.join(data.xpath('.//img[@class="cover"]/@src'))
                cover_url = re.sub(r'%2F', '/', cover_url)
                # Request a thumbnail-sized cover instead of the full image.
                cover_url = re.sub(r'widthMax=120&heightMax=200', 'widthMax=64&heightMax=64', cover_url)

                title = ''.join(data.xpath('.//a[@class="title"]/text()'))
                title = re.sub(r' - ebook$', '', title)

                formats = ', '.join(data.xpath('.//ul[@class="formats_available"]/li//b/text()'))
                # 'znak wodny' (watermark) in the format list means DRM-free.
                DrmFree = re.search(r'znak', formats)
                # Drop parenthesised qualifiers from the format names.
                formats = re.sub(r'\ ?\(.+?\)', '', formats)

                # The author is only available on the book's detail page.
                author = ''
                with closing(br.open('http://www.nexto.pl/' + book_id.strip(), timeout=timeout/4)) as nf:
                    idata = html.fromstring(nf.read())
                    author = ', '.join(idata.xpath('//div[@class="basic_data"]/p[1]/b/a/text()'))

                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url if cover_url[:4] == 'http' else 'http://www.nexto.pl' + cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price
                s.detail_item = book_id.strip()
                s.drm = SearchResult.DRM_UNLOCKED if DrmFree else SearchResult.DRM_LOCKED
                s.formats = formats.upper().strip()

                yield s

            # Stop when there is no "next" link in the pager.
            if not doc.xpath('//div[@class="listnavigator"]//a[@class="next"]'):
                break
        offset += 10