def search(self, query, max_results=15, timeout=60):
    """Search the OZON web service and yield SearchResult objects.

    If the query looks like a numeric OZON item ID (6-9 digits), the
    item-detail endpoint is tried first, before the free-text search.
    """
    search_url = self.shop_url + '/webservice/webservice.asmx/SearchWebService?'\
        'searchText=%s&searchContext=ebook' % urllib2.quote(query)
    search_urls = [ search_url ]
    ## add this as the first try if it looks like an ozon ID
    if re.match("^\d{6,9}$", query):
        ozon_detail = self.shop_url + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % query
        search_urls.insert(0, ozon_detail)
    # normalize-space() XPath applied per result element; {0} is the tag name
    xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'
    counter = max_results
    br = browser()
    for url in search_urls:
        with closing(br.open(url, timeout=timeout)) as f:
            # Force UTF-8 decoding; the service response encoding is unreliable
            raw = xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True)[0]
            doc = etree.fromstring(raw)
            # SearchItems: free-text results; ItemDetail: the ID fast-path result
            for data in doc.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]'):
                if counter <= 0:
                    break
                counter -= 1
                s = SearchResult()
                s.detail_item = data.xpath(xp_template.format('ID'))
                s.title = data.xpath(xp_template.format('Name'))
                s.author = data.xpath(xp_template.format('Author'))
                s.price = data.xpath(xp_template.format('Price'))
                s.cover_url = data.xpath(xp_template.format('Picture'))
                s.price = format_price_in_RUR(s.price)
                yield s
def open_search(url, query, max_results=10, timeout=60):
    """Generic OpenSearch/OPDS search: fetch the description document at
    *url*, fill its best URL template with *query*, and yield SearchResult
    objects parsed from the resulting Atom feed.
    """
    description = Description(url)
    url_template = description.get_best_template()
    if not url_template:
        return
    oquery = Query(url_template)

    # set up initial values
    oquery.searchTerms = query
    oquery.count = max_results
    url = oquery.url()

    counter = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        doc = etree.fromstring(f.read())
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break
            counter -= 1

            s = SearchResult()
            # The atom:id is a fallback detail link; an acquisition/buy
            # link below overrides it when present.
            s.detail_item = ''.join(data.xpath('./*[local-name() = "id"]/text()')).strip()

            for link in data.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                type = link.get('type')

                if rel and href and type:
                    # OPDS relation URIs; substring match, so the order of
                    # these branches matters (thumbnail before acquisition).
                    if 'http://opds-spec.org/thumbnail' in rel:
                        s.cover_url = href
                    elif 'http://opds-spec.org/image/thumbnail' in rel:
                        s.cover_url = href
                    elif 'http://opds-spec.org/acquisition/buy' in rel:
                        s.detail_item = href
                    elif 'http://opds-spec.org/acquisition/sample' in rel:
                        pass
                    elif 'http://opds-spec.org/acquisition' in rel:
                        if type:
                            # Map the MIME type to an upper-case extension key
                            ext = guess_extension(type)
                            if ext:
                                ext = ext[1:].upper().strip()
                                s.downloads[ext] = href
            s.formats = ', '.join(s.downloads.keys()).strip()

            s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(data.xpath('./*[local-name() = "author"]//*[local-name() = "name"]//text()')).strip()

            # Only the first price element is used
            price_e = data.xpath('.//*[local-name() = "price"][1]')
            if price_e:
                price_e = price_e[0]
                currency_code = price_e.get('currencycode', '')
                price = ''.join(price_e.xpath('.//text()')).strip()
                s.price = currency_code + ' ' + price
                s.price = s.price.strip()

            yield s
def search(self, query, max_results=15, timeout=60):
    """Search the OZON web service and yield SearchResult objects.

    Variant without the numeric-ID fast path: only the free-text
    SearchWebService endpoint is queried.
    """
    search_url = (
        self.shop_url + "/webservice/webservice.asmx/SearchWebService?"
        "searchText=%s&searchContext=ebook" % urllib2.quote(query)
    )
    search_urls = [search_url]
    # normalize-space() XPath applied per result element; {0} is the tag name
    xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'
    counter = max_results
    br = browser()
    for url in search_urls:
        with closing(br.open(url, timeout=timeout)) as f:
            # Force UTF-8 decoding; the service response encoding is unreliable
            raw = xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True)[0]
            doc = etree.fromstring(raw)
            for data in doc.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]'):
                if counter <= 0:
                    break
                counter -= 1
                s = SearchResult()
                s.detail_item = data.xpath(xp_template.format("ID"))
                s.title = data.xpath(xp_template.format("Name"))
                s.author = data.xpath(xp_template.format("Author"))
                s.price = data.xpath(xp_template.format("Price"))
                s.cover_url = data.xpath(xp_template.format("Picture"))
                s.price = format_price_in_RUR(s.price)
                yield s
def search(self, query, max_results=10, timeout=60):
    '''
    XinXii's open search url is:
    http://www.xinxii.com/catalog-search/query/?keywords={searchTerms}&pw={startPage?}&doc_lang={docLang}&ff={docFormat},{docFormat},{docFormat}

    This url requires the docLang and docFormat. However, the search
    itself sent to XinXii does not require them. They can be ignored.
    We cannot push this into the standard OpenSearchOPDSStore search
    because of the required attributes.

    XinXii doesn't return all info supported by the OpenSearchOPDSStore
    search function, so this one is modified to remove the parts that
    are not used.
    '''
    url = 'http://www.xinxii.com/catalog-search/query/?keywords=' + urllib.quote_plus(query)

    counter = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        doc = etree.fromstring(f.read())
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break
            counter -= 1

            s = SearchResult()
            # atom:id is a fallback; an 'alternate' link below overrides it
            s.detail_item = ''.join(data.xpath('./*[local-name() = "id"]/text()')).strip()

            for link in data.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                type = link.get('type')

                if rel and href and type:
                    if rel in ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail'):
                        s.cover_url = href
                    if rel == 'alternate':
                        s.detail_item = href

            # XinXii always sells these two formats
            s.formats = 'EPUB, PDF'

            s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(data.xpath('./*[local-name() = "author"]//*[local-name() = "name"]//text()')).strip()

            # Only the first price element is used
            price_e = data.xpath('.//*[local-name() = "price"][1]')
            if price_e:
                price_e = price_e[0]
                currency_code = price_e.get('currencycode', '')
                price = ''.join(price_e.xpath('.//text()')).strip()
                s.price = currency_code + ' ' + price
                s.price = s.price.strip()

            yield s
def search(self, query, max_results=10, timeout=60):
    """Search e-knigi.net (Bulgarian store) and yield SearchResult objects.

    The query must consist of Cyrillic letters, digits and whitespace
    (at least two characters), otherwise no search is performed.
    """
    # check for cyrillic symbols before performing search
    # NOTE(review): the character class omits ё/Ё — queries containing
    # them are rejected; confirm whether that is intentional.
    uquery = unicode(query.strip(), 'utf-8')
    reObj = re.search(u'^[а-яА-Я\\d\\s]{2,}$', uquery)
    if not reObj:
        return

    base_url = 'http://e-knigi.net'
    url = base_url + '/virtuemart?page=shop.browse&search_category=0&search_limiter=anywhere&keyword=' + urllib2.quote(query)

    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())

        # if the store finds only one product, it opens directly detail view
        for data in doc.xpath('//div[@class="prod_details"]'):
            s = SearchResult()
            s.cover_url = ''.join(data.xpath('.//div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@src')).strip()
            s.title = ''.join(data.xpath('.//div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@alt')).strip()
            s.author = ''.join(data.xpath('.//div[@class="td_bg clearfix"]/div[@class="gk_product_tab"]/div/table/tr[3]/td[2]/text()')).strip()
            s.price = ''.join(data.xpath('.//span[@class="productPrice"]/text()')).strip()
            # Detail view: the search URL itself is the product page
            s.detail_item = url
            s.drm = SearchResult.DRM_UNLOCKED

            yield s
            return

        # search in store results
        for data in doc.xpath('//div[@class="browseProductContainer"]'):
            if counter <= 0:
                break
            id = ''.join(data.xpath('.//a[1]/@href')).strip()
            if not id:
                continue

            title = ''.join(data.xpath('.//a[@class="gk_vm_product_image"]/img/@title')).strip()
            author = ''.join(data.xpath('.//div[@style="float:left;width:90%"]/b/text()')).strip().replace('Автор: ', '')

            # Client-side relevance filter: skip hits where the query
            # appears in neither title nor author
            if title.lower().find(query.lower()) == -1 and author.lower().find(query.lower()) == -1:
                continue

            counter -= 1

            s = SearchResult()
            s.cover_url = ''.join(data.xpath('.//a[@class="gk_vm_product_image"]/img/@src')).strip()
            s.title = title
            s.author = author
            s.price = ''.join(data.xpath('.//span[@class="productPrice"]/text()')).strip()
            s.detail_item = base_url + id
            s.drm = SearchResult.DRM_UNLOCKED

            yield s
def search(self, query, max_results=10, timeout=60):
    """Search the Sony Reader Store (Australia) and yield SearchResult
    objects; entries missing title, author or detail link are skipped.
    """
    url = self.SEARCH_URL % urllib.quote_plus(query)
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for item in doc.xpath('//div[@id="searchresult-list"]/descendant::div[contains(@class, "doc-item")]'):
            if counter <= 0:
                break
            s = SearchResult()
            s.price = _('Not Available')
            p = ''.join(item.xpath('descendant::p[@class="doc-price"]/descendant::span[@itemprop="price"]/text()')).strip()
            if p:
                # Site shows e.g. 'A$9.99'; keep only the amount after '$'
                s.price = 'AUD ' + p.split('$')[-1]

            title = item.xpath('descendant::h3[@class="doc-title"]')
            if not title:
                continue
            title = etree.tostring(title[0], method='text', encoding=unicode)
            if not title:
                continue
            # Append the subtitle, when present, as 'Title: Subtitle'
            st = item.xpath('descendant::p[@class="doc-subtitle"]')
            if st:
                st = etree.tostring(st[0], method='text', encoding=unicode)
                if st and st.strip():
                    title = title.strip() + ': ' + st
            s.title = title.strip()

            aut = item.xpath('descendant::p[@class="doc-author"]')
            if not aut:
                continue
            s.author = etree.tostring(aut[0], method='text', encoding=unicode).strip()
            if not s.author:
                continue

            du = ''.join(item.xpath('descendant::h3[position() = 1 and @class="doc-title"]/descendant::a[position() = 1 and @href]/@href')).strip()
            if not du:
                continue
            detail_url = 'https://au.readerstore.sony.com'+du
            s.detail_item = detail_url

            # Only count entries that survived all the skips above
            counter -= 1

            cover_url = ''.join(item.xpath(
                'descendant::p[@class="doc-cover" and position() = 1]/'
                'descendant::img[position() = 1 and @src]/@src'))
            if cover_url:
                s.cover_url = url_slash_cleaner(cover_url)

            s.drm = SearchResult.DRM_UNKNOWN
            s.formats = 'Sony'

            yield s
def search(self, query, max_results=10, timeout=60):
    """Search bewrite.net and yield SearchResult objects.

    The results table provides only a 'Title by Author' heading and a
    link; cover and price are not available from the listing page.
    """
    url = 'http://www.bewrite.net/mm5/merchant.mvc?Search_Code=B&Screen=SRCH&Search=' + urllib2.quote(query)

    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        # First table row is the header, hence position() > 1
        for data in doc.xpath('//div[@id="content"]//table/tr[position() > 1]'):
            if counter <= 0:
                break

            id = ''.join(data.xpath('.//a/@href'))
            if not id:
                continue

            heading = ''.join(data.xpath('./td[2]//text()'))
            # Split 'Title by Author'; author stays '' if 'by ' is absent
            title, q, author = heading.partition('by ')
            cover_url = ''
            price = ''

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url.strip()
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = id.strip()
            s.drm = SearchResult.DRM_UNLOCKED

            yield s
def search(self, query, max_results=20, timeout=60):
    """Search publio.pl and yield SearchResult objects, walking result
    pages until max_results is reached or there is no 'next' link.
    """
    br = browser()

    counter = max_results
    page = 1
    while counter:
        with closing(
            br.open(
                "http://www.publio.pl/e-booki,strona" + str(page) + ".html?q=" + urllib.quote(query),
                timeout=timeout,
            )
        ) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//div[@class="item"]'):
                if counter <= 0:
                    break

                id = "".join(data.xpath('.//div[@class="img"]/a/@href'))
                if not id:
                    continue

                cover_url = "".join(data.xpath('.//div[@class="img"]/a/img/@data-original'))
                title = "".join(data.xpath('.//div[@class="img"]/a/@title'))
                title2 = "".join(data.xpath('.//div[@class="desc"]/h5//text()'))
                if title2:
                    title = title + ". " + title2
                # Append series info, shown as the last detail row
                # labelled 'Seria:'
                if (
                    "".join(
                        data.xpath('./div[@class="desc"]/div[@class="detailShortList"]/div[last()]/span/text()')
                    ).strip()
                    == "Seria:"
                ):
                    series = "".join(
                        data.xpath('./div[@class="desc"]/div[@class="detailShortList"]/div[last()]/a/@title')
                    )
                    title = title + " (seria " + series + ")"
                author = ", ".join(
                    data.xpath('./div[@class="desc"]/div[@class="detailShortList"]/div[@class="row"][1]/a/@title')
                )
                # Discounted price lives in <ins>; fall back to plain text
                price = "".join(data.xpath('.//div[@class="priceBox tk-museo-slab"]/ins/text()'))
                if not price:
                    price = "".join(data.xpath('.//div[@class="priceBox tk-museo-slab"]/text()')).strip()
                formats = ", ".join(data.xpath('.//div[@class="formats"]/a/img/@alt'))

                counter -= 1

                s = SearchResult()
                s.cover_url = "http://www.publio.pl" + cover_url
                s.title = title.strip()
                s.author = author
                s.price = price
                s.detail_item = "http://www.publio.pl" + id.strip()
                # 'DRM' in the format alt-texts marks a locked title
                s.drm = SearchResult.DRM_LOCKED if "DRM" in formats else SearchResult.DRM_UNLOCKED
                s.formats = formats.replace(" DRM", "").strip()

                yield s
            if not doc.xpath('boolean(//a[@class="next"])'):
                break
            page += 1
def search(self, query, max_results=10, timeout=60):
    """Search ebooks.eharlequin.com (full-text) and yield SearchResult
    objects; all results are EPUB.
    """
    url = 'http://ebooks.eharlequin.com/BANGSearch.dll?Type=FullText&FullTextField=All&FullTextCriteria=' + urllib2.quote(query)

    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        # Result rows are table rows containing a ul#details, excluding
        # the sidebar tables
        for data in doc.xpath('//table[not(.//@class="sidelink")]/tr[.//ul[@id="details"]]'):
            if counter <= 0:
                break

            id = ''.join(data.xpath('.//ul[@id="details"]/li[@id="title-results"]/a/@href'))
            if not id:
                continue

            title = ''.join(data.xpath('.//ul[@id="details"]/li[@id="title-results"]/a/text()'))
            author = ''.join(data.xpath('.//ul[@id="details"]/li[@id="author"][1]//a/text()'))
            price = ''.join(data.xpath('.//div[@class="ourprice"]/font/text()'))
            # The cover image sits inside the anchor whose href is the id
            cover_url = ''.join(data.xpath('.//a[@href="%s"]/img/@src' % id))

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            # NOTE(review): id is site-relative; confirm it does not
            # start with '/' or the URL gets a double slash.
            s.detail_item = 'http://ebooks.eharlequin.com/' + id.strip()
            s.formats = 'EPUB'

            yield s
def search(self, query, max_results=20, timeout=60):
    """Search escapemagazine.pl and yield SearchResult objects; the store
    sells DRM-free PDFs priced in PLN.
    """
    url = 'http://www.escapemagazine.pl/wyszukiwarka?query=' + urllib.quote_plus(query)

    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[@class="item item_short"]'):
            if counter <= 0:
                break

            id = ''.join(data.xpath('.//h2[@class="title"]/a[1]/@href'))
            if not id:
                continue

            title = ''.join(data.xpath('.//h2[@class="title"]/a[1]/text()'))
            author = ''.join(data.xpath('.//div[@class="author"]/text()'))
            # Site shows the amount only; append the currency
            price = ''.join(data.xpath('.//span[@class="price_now"]/strong/text()')) + ' zł'
            cover_url = ''.join(data.xpath('.//img[@class="cover"]/@src'))

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = 'http://www.escapemagazine.pl' + id.strip()
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = 'PDF'

            yield s
def search(self, query, max_results=10, timeout=60):
    """Search legimi.com (Polish store) and yield SearchResult objects."""
    url = "http://www.legimi.com/pl/ebooki/?szukaj=" + urllib.quote_plus(query)

    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[@id="listBooks"]/div'):
            if counter <= 0:
                break

            id = "".join(data.xpath('.//a[@class="plainLink"]/@href'))
            if not id:
                continue

            cover_url = "".join(data.xpath(".//img[1]/@src"))
            title = "".join(data.xpath('.//span[@class="bookListTitle ellipsis"]/text()'))
            author = "".join(data.xpath('.//span[@class="bookListAuthor ellipsis"]/text()'))
            price = "".join(data.xpath('.//div[@class="bookListPrice"]/span/text()'))

            counter -= 1

            s = SearchResult()
            # cover_url and id are site-relative paths
            s.cover_url = "http://www.legimi.com/" + cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = "http://www.legimi.com/" + id.strip()

            yield s
def search(self, query, max_results=10, timeout=60):
    """Search Project Gutenberg via its mobile catalogue and yield
    SearchResult objects; every title is free and DRM-free.
    """
    url = 'http://m.gutenberg.org/ebooks/search.mobile/?default_prefix=all&sort_order=title&query=' + urllib.quote_plus(query)

    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//ol[@class="results"]/li[@class="booklink"]'):
            if counter <= 0:
                break

            id = ''.join(data.xpath('./a/@href'))
            # Strip the '.mobile' suffix to get the canonical book URL
            id = id.split('.mobile')[0]

            title = ''.join(data.xpath('.//span[@class="title"]/text()'))
            author = ''.join(data.xpath('.//span[@class="subtitle"]/text()'))

            counter -= 1

            s = SearchResult()
            # No covers in the mobile catalogue listing
            s.cover_url = ''

            s.detail_item = id.strip()
            s.title = title.strip()
            s.author = author.strip()
            s.price = '$0.00'
            s.drm = SearchResult.DRM_UNLOCKED

            yield s
def search(self, query, max_results=10, timeout=60):
    """Search bookoteka.pl (ebooks category) and yield SearchResult
    objects; the store is DRM-free.
    """
    url = 'http://bookoteka.pl/list?search=' + urllib.quote_plus(query) + '&cat=1&hp=1&type=1'

    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//li[@class="EBOOK"]'):
            if counter <= 0:
                break

            id = ''.join(data.xpath('.//a[@class="item_link"]/@href'))
            if not id:
                continue

            cover_url = ''.join(data.xpath('.//a[@class="item_link"]/img/@src'))
            title = ''.join(data.xpath('.//div[@class="shelf_title"]/a/text()'))
            author = ''.join(data.xpath('.//div[@class="shelf_authors"][1]/text()'))
            price = ''.join(data.xpath('.//span[@class="EBOOK"]/text()'))
            # Polish decimal comma
            price = price.replace('.', ',')
            formats = ', '.join(data.xpath('.//a[@class="fancybox protected"]/text()'))

            counter -= 1

            s = SearchResult()
            s.cover_url = 'http://bookoteka.pl' + cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = 'http://bookoteka.pl' + id.strip()
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = formats.strip()

            yield s
def search(self, query, max_results=10, timeout=60):
    """Search the Haodoo library via its ScraperWiki JSON view and yield
    SearchResult objects.

    Every title is free and DRM-free; each entry's 'type' list maps a
    format name to a download link.

    Fixes: replaced the deprecated ``dict.has_key()`` (removed in
    Python 3) with ``dict.get()``, and dropped the stray debug
    ``print`` calls that polluted stdout.
    """
    q = query.decode('utf-8')
    url = "https://views.scraperwiki.com/run/haodooscraperview/?" + urlencode({"q": q})

    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        json_doc = f.read()
        if len(json_doc) > 0:
            result = json.loads(json_doc)
            for volume in result:
                s = SearchResult()
                s.title = volume['title']
                s.detail_item = volume['url']
                s.price = '$0.00'
                s.drm = SearchResult.DRM_UNLOCKED

                # 'type' may be absent or empty; .get() covers both.
                if volume.get('type'):
                    for t in volume['type']:
                        s.downloads[t['type']] = t['link']
                    s.formats = ', '.join(s.downloads.keys())

                yield s
def search(self, query, max_results=10, timeout=60):
    """Search ebook.nl and yield SearchResult objects parsed from the
    schema.org/Book microdata on the results page.
    """
    url = ('http://www.ebook.nl/store/advanced_search_result.php?keywords='
           + urllib2.quote(query))
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[@id="books"]/div[@itemtype="http://schema.org/Book"]'):
            if counter <= 0:
                break
            id = ''.join(data.xpath('./meta[@itemprop="url"]/@content')).strip()
            if not id:
                continue
            cover_url = 'http://www.ebook.nl/store/' + ''.join(data.xpath('.//img[@itemprop="image"]/@src'))
            title = ''.join(data.xpath('./span[@itemprop="name"]/a/text()')).strip()
            author = ''.join(data.xpath('./span[@itemprop="author"]/a/text()')).strip()
            # NOTE(review): author was stripped above, so this branch
            # looks unreachable; kept as-is pending confirmation.
            if author == '&nbsp':
                author = ''
            price = ''.join(data.xpath('.//span[@itemprop="price"]//text()'))
            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.drm = SearchResult.DRM_UNKNOWN
            s.detail_item = id

            yield s
def search(self, query, max_results=10, timeout=60):
    """Search zixo.pl and yield SearchResult objects; the store sells
    DRM-locked titles priced in PLN.

    Fix: ``re.sub('\\.', ',', price)`` used a regex (with a
    deprecated, un-raw escape sequence) for a literal single-character
    substitution; replaced with ``str.replace``.
    """
    url = 'http://zixo.pl/wyszukiwarka/?search=' + urllib.quote(query.encode('utf-8')) + '&product_type=0'

    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[@class="productInline"]'):
            if counter <= 0:
                break

            id = ''.join(data.xpath('.//a[@class="productThumb"]/@href'))
            if not id:
                continue

            cover_url = ''.join(data.xpath('.//a[@class="productThumb"]/img/@src'))
            title = ''.join(data.xpath('.//a[@class="title"]/text()'))
            author = ','.join(data.xpath('.//div[@class="productDescription"]/span[1]/a/text()'))
            price = ''.join(data.xpath('.//div[@class="priceList"]/span/text()'))
            # Polish decimal comma
            price = price.replace('.', ',')

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = 'http://zixo.pl' + id.strip()
            s.drm = SearchResult.DRM_LOCKED

            yield s
def search(self, query, max_results=25, timeout=60):
    """Search ebookpoint.pl and yield SearchResult objects.

    The site expects the query in ISO-8859-2; results are DRM-free.
    """
    url = 'http://ebookpoint.pl/search?qa=&szukaj=' + quote_plus(
        query.decode('utf-8').encode('iso-8859-2')) + '&serwisyall=0&wprzyg=0&wsprzed=1&wyczerp=0&formaty=em-p'

    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//ul[@class="list"]/li'):
            if counter <= 0:
                break

            id = ''.join(data.xpath('./a/@href'))
            if not id:
                continue

            formats = ', '.join(data.xpath('.//ul[@class="book-type book-type-points"]//span[@class="popup"]/span/text()'))
            cover_url = ''.join(data.xpath('.//p[@class="cover"]/img/@data-src'))
            title = ''.join(data.xpath('.//div[@class="book-info"]/h3/a/text()'))
            author = ''.join(data.xpath('.//p[@class="author"]//text()'))
            # Price appears either as a discounted <ins> or plain link text
            price = ''.join(data.xpath('.//p[@class="price price-incart"]/a/ins/text()|.//p[@class="price price-add"]/a/text()'))

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            # Polish decimal comma
            s.price = re.sub(r'\.',',',price)
            s.detail_item = id.strip()
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = formats.upper()

            yield s
def search(self, query, max_results=10, timeout=60):
    """Search beam-shop.de and yield SearchResult objects; the store is
    DRM-free.

    Fix: the original indexed ``xpath(...)[0]`` for author, title and
    price, so any product tile missing one of them raised IndexError
    and aborted the whole result generator. Missing author/price now
    default to ''; a tile without a title is skipped.
    """
    url = 'https://www.beam-shop.de/search?saltFieldLimitation=all&sSearch=' + urllib2.quote(query)
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[contains(@class, "product--box")]'):
            if counter <= 0:
                break

            id_ = ''.join(data.xpath('./div/div[contains(@class, "product--info")]/a/@href')).strip()
            if not id_:
                continue
            # srcset lists several resolutions; take the first candidate
            cover_url = ''.join(data.xpath('./div/div[contains(@class, "product--info")]/a//img/@srcset'))
            if cover_url:
                cover_url = cover_url.split(',')[0].strip()

            author_nodes = data.xpath('.//a[@class="product--author"]/text()')
            author = author_nodes[0].strip() if author_nodes else ''
            title_nodes = data.xpath('.//a[@class="product--title"]/text()')
            if not title_nodes:
                continue
            title = title_nodes[0].strip()
            price_nodes = data.xpath('.//div[@class="product--price"]/span/text()')
            price = price_nodes[0].strip() if price_nodes else ''

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.drm = SearchResult.DRM_UNLOCKED
            s.detail_item = id_
            # s.formats = None

            yield s
def search(self, query, max_results=10, timeout=60):
    """Search bubok.pt via its reseller endpoint built for calibre and
    yield SearchResult objects; the store is DRM-free.

    The endpoint returns one div.libro per hit with all fields as plain
    text (including the cover URL).
    """
    url = 'http://www.bubok.pt/resellers/calibre_search/' + urllib.quote_plus(query)

    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[contains(@class, "libro")]'):
            if counter <= 0:
                break
            id = ''.join(data.xpath('.//div[@class="url"]/text()'))

            title = ''.join(data.xpath('.//div[@class="titulo"]/text()'))

            author = ''.join(data.xpath('.//div[@class="autor"]/text()'))

            price = ''.join(data.xpath('.//div[@class="precio"]/text()'))

            formats = ''.join(data.xpath('.//div[@class="formatos"]/text()'))

            cover = ''.join(data.xpath('.//div[@class="portada"]/text()'))

            counter -= 1

            s = SearchResult()
            s.title = title.strip()
            s.author = author.strip()
            s.detail_item = id.strip()
            s.price = price.strip()
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = formats.strip()
            s.cover_url = cover.strip()

            yield s
def search(self, query, max_results=10, timeout=60):
    """Search kobobooks.com and yield SearchResult objects; all results
    are EPUB. No author is available in the flow view.
    """
    url = 'http://www.kobobooks.com/search/search.html?q=' + urllib.quote_plus(query)

    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//ul[contains(@class, "flowview-items")]/li'):
            if counter <= 0:
                break

            id = ''.join(data.xpath('./a[contains(@class, "block-link")]/@href'))
            if not id:
                continue
            # Drop the leading '/' before joining with the store base URL
            id = id[1:]

            price = ''.join(data.xpath('.//a[contains(@class, "primary-button")]//text()'))

            cover_url = ''.join(data.xpath('.//img[1]/@src'))
            # Covers are served protocol-relative
            cover_url = 'http:%s' % cover_url

            title = ''.join(data.xpath('.//p[contains(@class, "flowview-item-title")]//text()'))

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.price = price.strip()
            s.detail_item = 'http://store.kobobooks.com/' + id.strip()
            s.formats = 'EPUB'
            s.drm = SearchResult.DRM_UNKNOWN

            yield s
def search(self, query, max_results=10, timeout=60):
    """Search barnesandnoble.com (ebook department) and yield
    SearchResult objects in the Nook format.
    """
    # The path segment is the query with spaces dashed; the keyword
    # parameter carries the properly quoted query
    url = 'http://www.barnesandnoble.com/s/%s?keyword=%s&store=ebook' % (query.replace(' ', '-'), urllib.quote_plus(query))

    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//ul[contains(@class, "result-set")]/li[contains(@class, "result")]'):
            if counter <= 0:
                break

            id = ''.join(data.xpath('.//div[contains(@class, "image-bounding-box")]/a/@href'))
            if not id:
                continue

            cover_url = ''.join(data.xpath('.//img[contains(@class, "product-image")]/@src'))

            title = ''.join(data.xpath('.//a[@class="title"]//text()'))
            author = ', '.join(data.xpath('.//a[@class="contributor"]//text()'))
            price = ''.join(data.xpath('.//div[@class="price-format"]//span[contains(@class, "price")]/text()'))

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = id.strip()
            s.drm = SearchResult.DRM_UNKNOWN
            s.formats = 'Nook'

            yield s
def search(self, query, max_results=10, timeout=60):
    """Search whsmith.co.uk (ebook department) and yield SearchResult
    objects; ePub with DRM.
    """
    url = ('http://www.whsmith.co.uk/search?keywordCategoryId=wc_dept_ebooks&results=60'
           '&page=1&keywords=' + urllib2.quote(query))

    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//li[@class="product"]'):
            if counter <= 0:
                break
            id_ = ''.join(data.xpath('./a[@class="product_image_wrap"]/@href'))
            if not id_:
                continue
            # hrefs are site-relative
            id_ = 'http://www.whsmith.co.uk' + id_
            cover_url = ''.join(data.xpath('.//img[@class="product_image"]/@src'))
            title = ''.join(data.xpath('.//h4[@class="product_title"]/text()'))
            author = ', '.join(data.xpath('.//span[@class="product_second"]/text()'))
            price = ''.join(data.xpath('.//span[@class="price"]/text()'))
            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.drm = SearchResult.DRM_LOCKED
            s.detail_item = id_
            s.formats = 'ePub'

            yield s
def search(self, query, max_results=10, timeout=60):
    """Search ebooks.foyles.co.uk and yield SearchResult objects; the
    format is read from the listing's metadata row, DRM is assumed.
    """
    url = 'http://ebooks.foyles.co.uk/catalog/search/?query=' + urllib2.quote(query)

    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[@class="doc-item"]'):
            if counter <= 0:
                break
            id_ = ''.join(data.xpath('.//p[@class="doc-cover"]/a/@href')).strip()
            if not id_:
                continue
            # hrefs are site-relative
            id_ = 'http://ebooks.foyles.co.uk' + id_

            cover_url = ''.join(data.xpath('.//p[@class="doc-cover"]/a/img/@src'))
            title = ''.join(data.xpath('.//span[@class="title"]/a/text()'))
            author = ', '.join(data.xpath('.//span[@class="author"]/span[@class="author"]/text()'))
            price = ''.join(data.xpath('.//span[@itemprop="price"]/text()')).strip()
            format_ = ''.join(data.xpath('.//p[@class="doc-meta-format"]/span[last()]/text()'))
            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = id_
            s.drm = SearchResult.DRM_LOCKED
            s.formats = format_

            yield s
def search(self, query, max_results=10, timeout=60):
    """Search millsandboon.co.uk (ebook format) and yield SearchResult
    objects; DRM is assumed for all titles.

    Fix: the search URL previously contained '/search.aspx??format=ebook'
    — the doubled '?' made the first query parameter's name '?format',
    which the site does not recognise.
    """
    base_url = 'https://www.millsandboon.co.uk'
    url = base_url + '/search.aspx?format=ebook&searchText=' + urllib2.quote(query)

    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//article[contains(@class, "group")]'):
            if counter <= 0:
                break
            id_ = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/@href')).strip()
            if not id_:
                continue

            cover_url = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/img/@src'))
            # The cover's alt text doubles as the title
            title = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/img/@alt')).strip()
            author = ''.join(data.xpath('.//a[@class="author"]/text()'))
            price = ''.join(data.xpath('.//div[@class="type-wrapper"]/ul/li[child::span[text()="eBook"]]/a/text()'))
            format_ = ''.join(data.xpath('.//p[@class="doc-meta-format"]/span[last()]/text()'))
            drm = SearchResult.DRM_LOCKED

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = id_
            s.drm = drm
            s.formats = format_

            yield s
def search(self, query, max_results=10, timeout=60):
    """Search ebookshoppe.com and yield SearchResult objects.

    Author and formats are not in the listing; a follow-up request per
    result (get_author_and_formats) fills them in, and results whose
    author cannot be determined are dropped.
    """
    url = 'http://www.ebookshoppe.com/search.php?search_query=' + quote(query)
    br = browser()
    # The site rejects requests without a Referer header
    br.addheaders = [("Referer", "http://www.ebookshoppe.com/")]

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//ul[@class="ProductList"]/li'):
            if counter <= 0:
                break

            id = ''.join(data.xpath('./div[@class="ProductDetails"]/'
                                    'strong/a/@href')).strip()
            if not id:
                continue

            cover_url = ''.join(data.xpath('./div[@class="ProductImage"]/a/img/@src'))
            title = ''.join(data.xpath('./div[@class="ProductDetails"]/strong/a/text()'))
            price = ''.join(data.xpath('./div[@class="ProductPriceRating"]/em/text()'))
            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.price = price
            s.drm = SearchResult.DRM_UNLOCKED
            s.detail_item = id

            self.get_author_and_formats(s, timeout)
            if not s.author:
                continue

            yield s
def search(self, query, max_results=10, timeout=60):
    """Search an Amazon Kindle store and yield SearchResult objects.

    self.search_url, and self.author_article (e.g. 'by ' / 'von ') are
    provided by the concrete store subclass. Non-Kindle hits (author
    pages etc.) are filtered out.
    """
    # Percent-encode via backslashreplace so non-ASCII query characters
    # become %XX escapes the site accepts
    url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+')
    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())  # .decode('latin-1', 'replace'))

        data_xpath = '//div[contains(@class, "prod")]'
        format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
        asin_xpath = '@name'
        cover_xpath = './/img[@class="productImage"]/@src'
        title_xpath = './/h3[@class="newaps"]/a//text()'
        author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]/text()'
        price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'

        for data in doc.xpath(data_xpath):
            if counter <= 0:
                break

            # Even though we are searching digital-text only Amazon will still
            # put in results for non Kindle books (author pages). Se we need
            # to explicitly check if the item is a Kindle book and ignore it
            # if it isn't.
            format_ = ''.join(data.xpath(format_xpath))
            if 'kindle' not in format_.lower():
                continue

            # We must have an asin otherwise we can't easily reference the
            # book later.
            asin = data.xpath(asin_xpath)
            if asin:
                asin = asin[0]
            else:
                continue

            cover_url = ''.join(data.xpath(cover_xpath))

            title = ''.join(data.xpath(title_xpath))
            author = ''.join(data.xpath(author_xpath))
            try:
                # Strip the localized 'by ' prefix and any ' (...)' suffix.
                # Deliberate best-effort: on any parse failure the raw
                # author string is kept.
                if self.author_article:
                    author = author.split(self.author_article, 1)[1].split(" (")[0]
            except:
                pass

            price = ''.join(data.xpath(price_xpath))

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url.strip()
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = asin.strip()
            s.drm = SearchResult.DRM_UNKNOWN
            s.formats = 'Kindle'

            yield s
def search(self, query, max_results=10, timeout=60):
    """Search woblink.com and yield SearchResult objects.

    A single store item may be yielded twice: once as a standalone
    DRM-free MOBI edition, and once for the remaining formats (EPUB is
    renamed to the store's DRM'd 'WOBLINK' container unless an E Ink
    edition re-exposes plain EPUB).

    Fix: ``re.sub('\\.', ',', price)`` used a regex (with a deprecated
    un-raw escape sequence) for a literal one-character substitution;
    replaced with ``str.replace``.
    """
    url = 'http://woblink.com/publication?query=' + urllib.quote_plus(query.encode('utf-8'))
    # The site pages at 10 results; request a larger page instead of
    # walking pages
    if max_results > 10:
        if max_results > 20:
            url += '&limit=30'
        else:
            url += '&limit=20'

    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[@class="book-item"]'):
            if counter <= 0:
                break

            id = ''.join(data.xpath('.//td[@class="w10 va-t"]/a[1]/@href'))
            if not id:
                continue

            cover_url = ''.join(data.xpath('.//td[@class="w10 va-t"]/a[1]/img/@src'))
            title = ''.join(data.xpath('.//h2[@class="title"]/a[1]/text()'))
            author = ', '.join(data.xpath('.//p[@class="author"]/a/text()'))
            price = ''.join(data.xpath('.//div[@class="prices"]/span[1]/span/text()'))
            # Polish decimal comma
            price = price.replace('.', ',')
            # Format icons are named like '/img/xxx_<FORMAT>_....png'
            formats = [form[8:-4].split('_')[0] for form in data.xpath('.//p[3]/img/@src')]

            s = SearchResult()
            s.cover_url = 'http://woblink.com' + cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price + ' zł'
            s.detail_item = id.strip()

            # MOBI should be send first,
            if 'MOBI' in formats:
                t = copy.copy(s)
                t.title += ' MOBI'
                t.drm = SearchResult.DRM_UNLOCKED
                t.formats = 'MOBI'
                formats.remove('MOBI')

                counter -= 1
                yield t

            # and the remaining formats (if any) next
            if formats:
                if 'epub' in formats:
                    formats.remove('epub')
                    # EPUB is sold wrapped in the store's own container
                    formats.append('WOBLINK')
                    if 'E Ink' in data.xpath('.//div[@class="prices"]/img/@title'):
                        formats.insert(0, 'EPUB')

                s.drm = SearchResult.DRM_LOCKED
                s.formats = ', '.join(formats).upper()

                counter -= 1
                yield s
def search(self, query, max_results=10, timeout=60):
    """Placeholder for a permanently closed store: ignore the query and
    yield a single informational pseudo-result.
    """
    notice = SearchResult()
    notice.title = 'Amazon required that this<br>store be permanently closed.'
    notice.author = ''
    notice.price = ''
    notice.detail_item = ''
    notice.drm = SearchResult.DRM_UNKNOWN
    yield notice
def search(self, query, max_results=100, timeout=180):
    """Search e-knjiga.si and yield SearchResult objects; every book is
    free, with per-format download links scraped from its detail page.

    NOTE(review): ElementPath expressions like './/tr/[0]/td/[1]' are
    unusual ('/[n]' predicates with 0-based-looking indices) — confirm
    they actually select the intended rows/cells. max_results is not
    enforced (counter is commented out).
    """
    url = 'http://www.e-knjiga.si/rezultati_cover.php?query=' + urllib2.quote(query)
    print("will search for: " + urllib2.quote(query) + ":\n " + url)

    br = browser()
    # counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        html=etree.HTML(f.read())
        # get list of books: one zebra table per result
        for book in html.xpath("//table[@class='zebra']"):
            print(etree.tostring(book, pretty_print=True, method="html"))
            author = book.find('.//tr/[0]/td/[1]').text
            title = book.find('.//tr/[0]/td/[2]/a').text
            details = 'http://www.e-knjiga.si/' + book.find('.//tr/[0]/td/[2]/a').get("href")

            ## fetch the detail page for cover, description and downloads
            fo = urllib2.urlopen(details)
            det=etree.HTML(fo.read())
            fo.close()

            table=det.find(".//div[@id='center_container']").find('./table')
            cover='http://www.e-knjiga.si/' + table.find('.//tr/[1]/td/[1]/div/img').get("src")
            description=table.find(".//tr/[6]/td[@class='knjige_spremna']").text
            links=[]
            files=table.find('.//tr/[7]/td/[1]')
            for file in files.iter('a'):
                links.append("http://www.e-knjiga.si/"+file.get("href"))

            s = SearchResult()
            s.title = title
            s.author = author
            s.price = "0.00eur"
            s.drm = SearchResult.DRM_UNLOCKED
            # NOTE(review): detail_item is set to the description text,
            # not the detail URL in `details` — looks like a bug; verify
            # against how callers use detail_item.
            s.detail_item = description
            for f in links:
                # Key downloads by file extension
                ftype = f.split(".")[-1]
                s.downloads[ftype] = f
                # NOTE(review): formats are concatenated with no
                # separator (e.g. 'pdfepub') — confirm intended.
                s.formats += ftype
            s.cover_url = cover

            yield s
def search(self, query, max_results=10, timeout=60):
    """Search the Sony ebook store; yields SearchResult objects."""
    url = 'http://ebookstore.sony.com/search?keyword=%s' % urllib.quote_plus(query)
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for item in doc.xpath(
                '//div[contains(@class, "searchResult")]/'
                'descendant::li[contains(@class, "hreview")]'):
            if counter <= 0:
                break
            curr = ''.join(item.xpath(
                'descendant::div[@class="pricing"]/descendant::*[@class="currency"]/@title')).strip()
            amt = ''.join(item.xpath(
                'descendant::div[@class="pricing"]/descendant::*[@class="amount"]/text()')).strip()
            s = SearchResult()
            s.price = (curr + ' ' + amt) if (curr and amt) else _('Not Available')
            title = item.xpath('descendant::h3[@class="item"]')
            if not title:
                continue
            title = etree.tostring(title[0], method='text', encoding=unicode)
            if not title:
                continue
            s.title = title.strip()
            s.author = ''.join(item.xpath(
                'descendant::li[contains(@class, "author")]/a[@class="fn"]/text()')).strip()
            if not s.author:
                continue
            detail_url = ''.join(item.xpath(
                'descendant::h3[@class="item"]/descendant::a[@class="fn" and @href]/@href'))
            if not detail_url:
                continue
            # Protocol-relative / root-relative links need an explicit scheme.
            if detail_url.startswith('/'):
                detail_url = 'http:' + detail_url
            s.detail_item = detail_url
            counter -= 1
            cover_url = ''.join(item.xpath(
                'descendant::li[@class="coverart"]/descendant::img[@src]/@src'))
            if cover_url:
                if cover_url.startswith('//'):
                    cover_url = 'http:' + cover_url
                elif cover_url.startswith('/'):
                    cover_url = 'http://ebookstore.sony.com' + cover_url
                s.cover_url = url_slash_cleaner(cover_url)
            s.drm = SearchResult.DRM_UNKNOWN
            s.formats = 'Sony'
            yield s
def search(self, query, max_results=10, timeout=60):
    """Search ebook.de; yields SearchResult objects."""
    url = ('http://www.ebook.de/de/pathSearch?nav=52122&searchString=' + urllib2.quote(query))
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for container in doc.xpath('//div[contains(@class, "articlecontainer")]'):
            if counter <= 0:
                break
            info = container.xpath('./div[contains(@class, "articleinfobox")]')
            if not info:
                continue
            info = info[0]
            id_ = ''.join(info.xpath('./a/@name')).strip()
            if not id_:
                continue
            title = ''.join(info.xpath('./h3[@class="title"]/a/text()')).strip()
            author = ''.join(info.xpath('.//div[@class="author"]/text()')).strip()
            # Strip the leading "von " ("by" in German) from the author line.
            if author.startswith('von'):
                author = author[4:]
            has_pdf = info.xpath(
                'boolean(.//span[@class="bindername" and contains(text(), "pdf")]/text())')
            has_epub = info.xpath(
                'boolean(.//span[@class="bindername" and contains(text(), "epub")]/text())')
            has_mobi = info.xpath(
                'boolean(.//span[@class="bindername" and contains(text(), "mobipocket")]/text())')
            cover_url = ''.join(container.xpath('.//div[@class="coverimg"]/a/img/@src'))
            price = ''.join(container.xpath('.//div[@class="preis"]/text()')).replace('*', '').strip()
            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.drm = SearchResult.DRM_UNKNOWN
            s.detail_item = id_
            s.formats = ', '.join(
                name for flag, name in
                ((has_epub, 'ePub'), (has_pdf, 'PDF'), (has_mobi, 'MOBI')) if flag)
            yield s
def search_amazon(query, max_results=10, timeout=60, write_html_to=None,
                  base_url=SEARCH_BASE_URL, base_query=SEARCH_BASE_QUERY,
                  field_keywords='field-keywords'):
    """Search Amazon (legacy "atfResults" layout) for Kindle editions.

    Yields SearchResult objects. Only the modern result-list layout is
    parsed; for any other page layout the generator returns immediately.
    ``write_html_to``, when given, receives the raw response for debugging.
    """
    uquery = base_query.copy()
    uquery[field_keywords] = query

    def asbytes(x):
        if isinstance(x, type('')):
            x = x.encode('utf-8')
        return x

    uquery = {asbytes(k): asbytes(v) for k, v in uquery.iteritems()}
    url = base_url + '?' + urllib.urlencode(uquery).decode('ascii')
    br = browser(user_agent=get_user_agent())
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
        if write_html_to is not None:
            with open(write_html_to, 'wb') as f:
                f.write(raw)
        doc = html.fromstring(raw)
        try:
            results = doc.xpath('//div[@id="atfResults" and @class]')[0]
        except IndexError:
            return
        if 's-result-list-parent-container' not in results.get('class', ''):
            # Unknown layout: bail out rather than mis-parse.
            return
        data_xpath = "descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-result-item ')]"
        format_xpath = './/a[@title="%s"]/@title' % KINDLE_EDITION
        asin_xpath = '@data-asin'
        cover_xpath = "descendant-or-self::img[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-image ')]/@src"
        title_xpath = "descendant-or-self::h2[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-title ')]//text()"
        author_xpath = './/span[starts-with(text(), "%s ")]/following-sibling::span//text()' % BY
        price_xpath = '(.//span[contains(@class, " s-price ")])[last()]//text()'
        for node in doc.xpath(data_xpath):
            if counter <= 0:
                break
            # Amazon mixes in non-Kindle hits (author pages etc.);
            # skip anything that is not explicitly a Kindle edition.
            fmt = ''.join(node.xpath(format_xpath))
            if 'kindle' not in fmt.lower():
                continue
            # Without an ASIN the book cannot be referenced later.
            asin_matches = node.xpath(asin_xpath)
            if not asin_matches:
                continue
            asin = asin_matches[0]
            cover_url = ''.join(node.xpath(cover_xpath))
            title = ''.join(node.xpath(title_xpath))
            author = ''.join(node.xpath(author_xpath))
            try:
                author = author.split('by ', 1)[1].split(" (")[0]
            except:
                pass
            price = ''.join(node.xpath(price_xpath))
            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url.strip()
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = asin.strip()
            s.formats = 'Kindle'
            yield s
def search(self, query, max_results=10, timeout=60):
    """Search the Sony ebook store (price-money markup variant).

    Yields SearchResult objects.
    """
    url = 'http://ebookstore.sony.com/search?keyword=%s' % urllib.quote_plus(query)
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for item in doc.xpath(
                '//div[contains(@class, "searchResult")]/'
                'descendant::li[contains(@class, "hreview")]'):
            if counter <= 0:
                break
            curr = ''.join(item.xpath(
                'descendant::div[@class="pricing"]/p[@class="price money"]/descendant::*[@class="currency"]/@title')).strip()
            amt = ''.join(item.xpath(
                'descendant::div[@class="pricing"]/p[@class="price money"]/descendant::*[@class="amount"]/text()')).strip()
            s = SearchResult()
            s.price = (curr + ' ' + amt) if (curr and amt) else _('Not Available')
            title = item.xpath('descendant::h3[@class="item"]')
            if not title:
                continue
            title = etree.tostring(title[0], method='text', encoding=unicode)
            if not title:
                continue
            s.title = title.strip()
            s.author = ''.join(item.xpath(
                'descendant::li[contains(@class, "author")]/a[@class="fn"]/text()')).strip()
            if not s.author:
                continue
            detail_url = ''.join(item.xpath(
                'descendant::h3[@class="item"]/descendant::a[@class="fn" and @href]/@href'))
            if not detail_url:
                continue
            if detail_url.startswith('/'):
                detail_url = 'http:' + detail_url
            s.detail_item = detail_url
            counter -= 1
            cover_url = ''.join(item.xpath(
                'descendant::li[@class="coverart"]/descendant::img[@src]/@src'))
            if cover_url:
                # Normalize protocol-relative and root-relative cover links.
                if cover_url.startswith('//'):
                    cover_url = 'http:' + cover_url
                elif cover_url.startswith('/'):
                    cover_url = 'http://ebookstore.sony.com' + cover_url
                s.cover_url = url_slash_cleaner(cover_url)
            s.drm = SearchResult.DRM_UNKNOWN
            s.formats = 'Sony'
            yield s
def search(self, query, max_results=10, timeout=60):
    """Search ebook.nl; yields SearchResult objects."""
    url = ('http://www.ebook.nl/store/advanced_search_result.php?keywords=' + urllib2.quote(query))
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for row in doc.xpath('//table[contains(@class, "productListing")]/tr'):
            if counter <= 0:
                break
            details = row.xpath('./td/div[@class="prodImage"]/a')
            if not details:
                continue
            details = details[0]
            # Product id is the last path component, minus any query string.
            book_id = ''.join(details.xpath('./@href')).strip()
            book_id = book_id[book_id.rfind('/') + 1:]
            qpos = book_id.rfind('?')
            if qpos > 0:
                book_id = book_id[:qpos]
            if not book_id:
                continue
            cover_url = 'http://www.ebook.nl/store/' + ''.join(details.xpath('./img/@src'))
            title = ''.join(details.xpath('./img/@title')).strip()
            author = ''.join(row.xpath('./td/div[@class="prodTitle"]/h3/a/text()')).strip()
            price = ''.join(row.xpath('./td/div[@class="prodTitle"]/b/text()'))
            has_pdf = row.xpath(
                'boolean(./td/div[@class="prodTitle"]/'
                'p[contains(text(), "Bestandsformaat: Pdf")])')
            has_epub = row.xpath(
                'boolean(./td/div[@class="prodTitle"]/'
                'p[contains(text(), "Bestandsformaat: ePub")])')
            nodrm = row.xpath('boolean(./td/div[@class="prodTitle"]/'
                              'p[contains(text(), "zonder DRM") or'
                              ' contains(text(), "watermerk")])')
            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.drm = SearchResult.DRM_UNLOCKED if nodrm else SearchResult.DRM_LOCKED
            s.detail_item = book_id
            s.formats = ','.join(
                name for flag, name in ((has_epub, 'ePub'), (has_pdf, 'PDF')) if flag)
            yield s
def search(self, query, max_results=10, timeout=60):
    """Search Library Genesis mirrors; yields SearchResult objects.

    Queries LibGen via ``libgenapi`` and, for each hit, scrapes the
    cover URL and several mirror download links. All books are free
    and DRM-free. Failures while contacting the mirrors propagate to
    the caller after a diagnostic message.

    Fixes vs. the previous version:
    - the ``abort`` flag was set *after* ``raise`` and therefore
      unreachable; the flag and the dead branch are removed,
    - the loop indexed ``results[counter]`` by hand; it now iterates
      directly and honours ``max_results`` (previously unenforced),
    - ``print`` statements use call syntax (same behaviour on py2).
    """
    lg = libgenapi.Libgenapi([
        "http://libgen.io", "http://gen.lib.rus.ec", "http://93.174.95.27/",
        "http://libgen.in", "http://libgen.org"
    ])
    try:
        results = lg.search(query)
        print('Reached LibGen Mirrors.')
    except:  # libgenapi raises assorted exception types; log and re-raise
        print('LibGenAPI crashed. In most cases this is caused by unreachable LibGen Mirrors, try again in a few minutes.')
        raise
    br = browser()
    for r in results[:max_results]:
        s = SearchResult()
        s.title = r['title']
        s.author = r['author']
        s.price = '$0.00'
        s.drm = SearchResult.DRM_UNLOCKED
        extension = r['extension']
        # Cover: scrape the gen.lib.rus.ec book page for the /covers/... jpg.
        coverpage = 'http://gen.lib.rus.ec' + r['mirrors'][1]
        coverpage = coverpage.replace('ads.', 'book/index.')
        with closing(br.open(coverpage, timeout=10)) as f:
            doc = f.read()
        linkpos = doc.find('/covers')
        linkend = doc.find('.jpg') + 4
        s.cover_url = 'http://gen.lib.rus.ec' + doc[linkpos:linkend]
        # Direct download link scraped from the libgen.io mirror page.
        lgpage = 'http://libgen.io' + r['mirrors'][1]
        with closing(br.open(lgpage, timeout=10)) as f:
            doc = f.read()
        linkend = doc.find('><h2>DOWNLOAD') - 1
        doc = doc[linkend - 100:linkend]
        linkpos = doc.find('http')
        libgendl = doc[linkpos:linkend].replace('amp;', '')
        libgenformat = 'libgen: .' + extension
        s.downloads[libgenformat] = libgendl
        # File "location" hash taken from the libgen.pw download endpoint.
        lgpw = r['mirrors'][0].replace('view', 'download')
        with closing(br.open(lgpw, timeout=10)) as f:
            doc = f.read()
        pos = doc.find('location') + 16
        end = doc.find('status') - 27
        location = doc[pos:end]
        # URL-safe "Author_-_Title" filename for the mirror download URLs.
        filename = (r['author'] + ' - ' + r['title']).replace(' ', '_')
        filename = urllib2.quote(filename.encode('utf8'))
        # Alternate download via b-ok (a.k.a. bookzz/boosc/bookza).
        bokdl = 'http://dlx.b-ok.org/genesis/' + location + '/_as/' + filename + '.pdf'
        bokformat = 'b-ok: .' + extension
        s.downloads[bokformat] = bokdl
        # bookfi mirror is advertised in formats but its download stays disabled.
        bookfidl = 'http://dl.lux.bookfi.net/genesis/' + location + '/_as/' + filename + '.pdf'
        bookfiformat = 'bookfi: .' + extension
        # s.downloads[bookfiformat] = bookfidl
        s.formats = libgenformat + ', ' + bokformat + ', ' + bookfiformat + ','
        yield s
def search(self, query, max_results=10, timeout=60):
    """Search woblink.com via a JavaScript-capable worker browser.

    Rendering is delegated to ``js_browser.get_results`` in a forked
    worker because the category page requires JS. Yields SearchResult.
    """
    url = 'http://woblink.com/ebooki-kategorie?query=' + urllib.quote_plus(query.encode('utf-8'))
    if max_results > 10:
        url += '&limit=30' if max_results > 20 else '&limit=20'
    counter = max_results
    try:
        results = fork_job(js_browser, 'get_results', (url, timeout,),
                           module_is_source_code=True)
    except WorkerError as e:
        raise Exception('Could not get results: %s' % e.orig_tb)
    doc = html.fromstring(strip_encoding_declarations(results['result']))
    for entry in doc.xpath('//div[@class="nw_katalog_lista_ksiazka "]'):
        if counter <= 0:
            break
        detail_href = ''.join(entry.xpath(
            './/div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/@href'))
        if not detail_href:
            continue
        cover_url = ''.join(entry.xpath(
            './/div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/img/@src'))
        title = ''.join(entry.xpath(
            './/h3[@class="nw_katalog_lista_ksiazka_detale_tytul"]/a[1]/text()'))
        author = ', '.join(entry.xpath(
            './/p[@class="nw_katalog_lista_ksiazka_detale_autor"]/a/text()'))
        price = ''.join(entry.xpath('.//div[@class="nw_opcjezakupu_cena"]/text()'))
        formats = ', '.join(entry.xpath(
            './/p[@class="nw_katalog_lista_ksiazka_detale_format"]/span/text()'))

        s = SearchResult()
        s.cover_url = 'http://woblink.com' + cover_url
        s.title = title.strip()
        s.author = author.strip()
        s.price = price + ' zł'
        s.detail_item = detail_href.strip()
        s.formats = formats
        s.drm = SearchResult.DRM_LOCKED if 'DRM' in formats else SearchResult.DRM_UNLOCKED
        counter -= 1
        yield s
def search(self, query, max_results=10, timeout=60):
    """Search the woblink.com e-book catalogue; yields SearchResult objects."""
    url = 'http://woblink.com/katalog-e-book?query=' + urllib.quote_plus(query.encode('utf-8'))
    if max_results > 10:
        url += '&limit=30' if max_results > 20 else '&limit=20'
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for entry in doc.xpath('//div[@class="nw_katalog_lista_ksiazka"]'):
            if counter <= 0:
                break
            detail_href = ''.join(entry.xpath(
                './/div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/@href'))
            if not detail_href:
                continue
            cover_url = ''.join(entry.xpath(
                './/div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/img/@src'))
            title = ''.join(entry.xpath(
                './/h2[@class="nw_katalog_lista_ksiazka_detale_tytul"]/a[1]/text()'))
            author = ', '.join(entry.xpath(
                './/h3[@class="nw_katalog_lista_ksiazka_detale_autor"]/a/text()'))
            price = ''.join(entry.xpath(
                './/div[@class="nw_katalog_lista_ksiazka_opcjezakupu_cena"]/span/text()'))
            # Polish price formatting uses a decimal comma.
            price = re.sub('\.', ',', price)
            formats = ', '.join(entry.xpath(
                './/p[@class="nw_katalog_lista_ksiazka_detale_formaty"]/span/text()'))

            s = SearchResult()
            s.cover_url = 'http://woblink.com' + cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price + ' zł'
            s.detail_item = detail_href.strip()
            s.formats = formats
            s.drm = (SearchResult.DRM_LOCKED if 'EPUB DRM' in formats
                     else SearchResult.DRM_UNLOCKED)
            counter -= 1
            yield s
def search_amazon(query, max_results=10, timeout=60, write_html_to=None,
                  base_url=SEARCH_BASE_URL, base_query=SEARCH_BASE_QUERY,
                  field_keywords='k'):
    """Search Amazon (current result-list layout) for Kindle editions.

    Yields up to ``max_results`` SearchResult objects.
    ``write_html_to``, when given, receives the raw response for debugging.

    Fix: ``counter`` was initialized and decremented but never checked,
    so ``max_results`` was not enforced; the loop now breaks once the
    requested number of results has been yielded.
    """
    uquery = base_query.copy()
    uquery[field_keywords] = query

    def asbytes(x):
        if isinstance(x, type('')):
            x = x.encode('utf-8')
        return x

    uquery = {asbytes(k): asbytes(v) for k, v in uquery.items()}
    url = base_url + '?' + urlencode(uquery)
    br = browser(user_agent=get_user_agent())
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
        if write_html_to is not None:
            with open(write_html_to, 'wb') as f:
                f.write(raw)
        doc = html.fromstring(raw)
        for result in doc.xpath(
                '//div[contains(@class, "s-result-list")]//div[@data-index and @data-asin]'):
            if counter <= 0:  # enforce max_results (previously missing)
                break
            kformat = ''.join(result.xpath('.//a[contains(text(), "Kindle Edition")]//text()'))
            # Even though we are searching digital-text only, Amazon still
            # mixes in non-Kindle results (author pages); skip those.
            if 'kindle' not in kformat.lower():
                continue
            # Without an ASIN the book cannot be referenced later.
            asin = result.get('data-asin')
            if not asin:
                continue
            cover_url = ''.join(result.xpath('.//img/@src'))
            title = etree.tostring(result.xpath('.//h2')[0], method='text', encoding='unicode')
            adiv = result.xpath('.//div[contains(@class, "a-color-secondary")]')[0]
            # Author line looks like "by <name> | <date> ..." once flattened.
            aparts = etree.tostring(adiv, method='text', encoding='unicode').split()
            idx = aparts.index('|')
            author = ' '.join(aparts[1:idx])
            price = ''.join(result.xpath(
                './/span[contains(@class, "a-price")]/span[contains(@class, "a-offscreen")]/text()'))
            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url.strip()
            s.title = title.strip()
            s.author = author.strip()
            s.detail_item = asin.strip()
            s.price = price.strip()
            s.formats = 'Kindle'
            yield s
def search_kobo(query, max_results=10, timeout=60, write_html_to=None):
    """Search kobobooks.com; yields up to ``max_results`` SearchResult objects.

    Results missing a title, author or URL are skipped.
    ``write_html_to``, when given, receives the raw response for debugging.
    """
    from css_selectors import Select
    url = 'https://www.kobobooks.com/search/search.html?q=' + urllib.quote_plus(query)
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
    if write_html_to is not None:
        with open(write_html_to, 'wb') as f:
            f.write(raw)
    doc = html.fromstring(raw)
    select = Select(doc)
    for i, item in enumerate(select('.result-items .item-wrapper.book')):
        if i == max_results:
            break
        # Cover (may be protocol-relative).
        for img in select('.item-image img[src]', item):
            cover_url = img.get('src')
            if cover_url.startswith('//'):
                cover_url = 'https:' + cover_url
            break
        else:
            cover_url = None
        # Title and detail URL come from the same <p class="title">.
        for p in select('p.title', item):
            title = etree.tostring(p, method='text', encoding=unicode).strip()
            for a in select('a[href]', p):
                url = a.get('href')
                break
            else:
                url = None
            break
        else:
            title = None
        authors = []
        for a in select('p.contributor-list a.contributor-name', item):
            authors.append(etree.tostring(a, method='text', encoding=unicode).strip())
        authors = authors_to_string(authors)
        for p in select('p.price', item):
            price = etree.tostring(p, method='text', encoding=unicode).strip()
            break
        else:
            price = None
        if title and authors and url:
            s = SearchResult()
            s.cover_url = cover_url
            s.title = title
            s.author = authors
            s.price = price
            s.detail_item = url
            s.formats = 'EPUB'
            s.drm = SearchResult.DRM_UNKNOWN
            yield s
def search(query, max_results=10, timeout=60):
    """Search woblink.com through its AJAX endpoint; yields SearchResult.

    The endpoint is POSTed to with the price-filter form fields it expects
    and returns an HTML fragment, which is wrapped in a document before
    parsing.
    """
    url = 'https://woblink.com/publication/ajax?mode=none&query=' + quote_plus(query)
    if max_results > 10:
        url += '&limit=30' if max_results > 20 else '&limit=20'
    br = browser(user_agent='CalibreCrawler/1.0')
    br.set_handle_gzip(True)
    rq = Request(url, headers={
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'X-Requested-With': 'XMLHttpRequest',
        'Referrer': 'https://woblink.com/ebooki-kategorie',
        'Cache-Control': 'max-age=0',
    }, data=urlencode({
        'nw_filtry_filtr_zakrescen_formularz[min]': '0',
        'nw_filtry_filtr_zakrescen_formularz[max]': '350',
    }))
    response = br.open(rq)
    raw = response.read()
    # The endpoint returns a fragment; wrap it so lxml gets a full document.
    doc = html.fromstring('<html><body>' + raw.decode('utf-8') + '</body></html>')
    counter = max_results
    for entry in doc.xpath(
            '//div[@class="nw_katalog_lista_ksiazka ebook " or @class="nw_katalog_lista_ksiazka ebook promocja"]'):
        if counter <= 0:
            break
        detail_href = ''.join(entry.xpath(
            './/div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/@href'))
        if not detail_href:
            continue
        cover_url = ''.join(entry.xpath(
            './/div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/img/@src'))
        title = ''.join(entry.xpath(
            './/h3[@class="nw_katalog_lista_ksiazka_detale_tytul"]/a[1]/text()'))
        author = ', '.join(entry.xpath(
            './/p[@class="nw_katalog_lista_ksiazka_detale_autor"]/a/text()'))
        price = ''.join(entry.xpath('.//div[@class="nw_opcjezakupu_cena"]/span[2]/text()'))
        formats = ', '.join(entry.xpath(
            './/p[@class="nw_katalog_lista_ksiazka_detale_format"]/span/text()'))

        s = SearchResult()
        s.cover_url = cover_url
        s.title = title.strip()
        s.author = author.strip()
        s.price = price + ' zł'
        s.detail_item = detail_href.strip()
        s.formats = formats
        counter -= 1
        s.drm = SearchResult.DRM_LOCKED if 'DRM' in formats else SearchResult.DRM_UNLOCKED
        yield s
def open_search(url, query, max_results=10, timeout=60):
    """Run an OpenSearch/OPDS query against ``url``; yields SearchResult.

    Resolves the OpenSearch description document to its best URL template,
    fills in the search terms, then walks the Atom entries, mapping OPDS
    link relations to cover, detail and acquisition downloads.
    """
    description = Description(url)
    url_template = description.get_best_template()
    if not url_template:
        return
    oquery = Query(url_template)

    # Populate the template with the user's query.
    oquery.searchTerms = query
    oquery.count = max_results
    url = oquery.url()

    counter = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        doc = safe_xml_fromstring(f.read())
        for entry in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break
            counter -= 1

            s = SearchResult()
            s.detail_item = ''.join(entry.xpath('./*[local-name() = "id"]/text()')).strip()

            for link in entry.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                type = link.get('type')
                if not (rel and href and type):
                    continue
                if 'http://opds-spec.org/thumbnail' in rel:
                    s.cover_url = href
                elif 'http://opds-spec.org/image/thumbnail' in rel:
                    s.cover_url = href
                elif 'http://opds-spec.org/acquisition/buy' in rel:
                    s.detail_item = href
                elif 'http://opds-spec.org/acquisition/sample' in rel:
                    pass
                elif 'http://opds-spec.org/acquisition' in rel:
                    # Generic acquisition: derive a format from the MIME type.
                    if type:
                        ext = guess_extension(type)
                        if ext:
                            ext = ext[1:].upper().strip()
                            s.downloads[ext] = href
            s.formats = ', '.join(s.downloads.keys()).strip()

            s.title = ' '.join(entry.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(entry.xpath(
                './*[local-name() = "author"]//*[local-name() = "name"]//text()')).strip()

            price_e = entry.xpath('.//*[local-name() = "price"][1]')
            if price_e:
                price_e = price_e[0]
                currency_code = price_e.get('currencycode', '')
                price = ''.join(price_e.xpath('.//text()')).strip()
                s.price = currency_code + ' ' + price
                s.price = s.price.strip()

            yield s
def search(self, query, max_results=10, timeout=60):
    """Search gandalf.com.pl, paging through results; yields SearchResult.

    The query is transcoded to ISO-8859-2 as the site expects. Pages are
    fetched until ``max_results`` results have been yielded or a page
    contains no result boxes.

    Fix: the decimal-comma substitution used the non-raw pattern ``'\\.'``
    (an invalid escape sequence that is a SyntaxWarning on modern Python);
    it is now a raw string.
    """
    counter = max_results
    page = 1
    url = 'http://www.gandalf.com.pl/we/' + urllib.quote_plus(
        query.decode('utf-8').encode('iso8859_2')) + '/bdb'
    br = browser()
    while counter:
        # Page 1 is the bare URL; later pages append the zero-based index.
        page_url = (url + str(page - 1) + '/#s') if (page - 1) else (url + '/#s')
        with closing(br.open(page_url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//div[@class="box"]'):
                if counter <= 0:
                    break
                id = ''.join(data.xpath('.//div[@class="info"]/h3/a/@href'))
                if not id:
                    continue
                cover_url = ''.join(data.xpath('.//div[@class="info"]/h3/a/@id'))
                title = ''.join(data.xpath('.//div[@class="info"]/h3/a/@title'))
                formats = ''.join(data.xpath('.//div[@class="info"]/p[1]/text()'))
                # Formats are listed in parentheses, e.g. "(epub, mobi)".
                formats = re.findall(r'\((.*?)\)', formats)[0]
                author = ''.join(data.xpath(
                    './/div[@class="info"]/h4/text() | .//div[@class="info"]/h4/span/text()'))
                price = ''.join(data.xpath('.//div[@class="options"]/h3/text()'))
                price = re.sub('PLN', 'zł', price)
                price = re.sub(r'\.', ',', price)  # decimal comma, raw pattern
                drm = data.xpath(
                    'boolean(.//div[@class="info" and contains(., "Zabezpieczenie: DRM")])')
                counter -= 1

                s = SearchResult()
                s.cover_url = ('http://imguser.gandalf.com.pl/' +
                               re.sub('p', 'p_', cover_url) + '.jpg')
                s.title = title.strip()
                s.author = author.strip()
                s.price = price
                s.detail_item = id.strip()
                s.drm = SearchResult.DRM_LOCKED if drm else SearchResult.DRM_UNLOCKED
                s.formats = formats.upper().strip()
                yield s
            # Stop when the result header contains no more boxes.
            if not doc.xpath(
                    'boolean(//div[@class="wyszukiwanie_podstawowe_header"]//div[@class="box"])'):
                break
            page += 1
def search(query, max_results=10, timeout=60, save_raw=None):
    """Search smashwords.com; yields up to ``max_results`` SearchResult.

    An ``adultOff`` cookie is set to filter erotica when the mechanize
    version in use supports it. ``save_raw``, when given, receives the
    raw response for debugging.
    """
    url = 'https://www.smashwords.com/books/search?query=' + quote(query)
    br = browser()
    try:
        br.set_simple_cookie('adultOff', 'erotica', '.smashwords.com', path='/')
    except AttributeError:
        pass  # old version of mechanize
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
        if save_raw:
            with open(save_raw, 'wb') as r:
                r.write(raw)
        doc = html.fromstring(raw)
        for data in doc.xpath(
                '//div[@id="pageContent"]//div[contains(@class, "library-book")]'):
            if counter <= 0:
                break
            # Re-parse the fragment so absolute // xpaths stay scoped to it.
            data = html.fromstring(html.tostring(data))
            id_a = ''.join(data.xpath(
                '//span[contains(@class, "library-title")]/a/@href'))
            if not id_a:
                continue
            cover_url = ''.join(data.xpath(
                '//img[contains(@class, "book-list-image")]/@src'))
            title = ''.join(data.xpath(
                './/span[contains(@class, "library-title")]//text()'))
            author = ''.join(data.xpath(
                './/span[contains(@class, "library-by-line")]/a//text()'))
            price = ''.join(data.xpath('.//div[@class="subnote"]//text()'))
            if 'Price:' in price:
                try:
                    price = price.partition('Price:')[2]
                    price = re.sub(r'\s', ' ', price).strip()
                    price = price.split(' ')[0].strip()
                except Exception:
                    price = 'Unknown'
            if price == 'Free!':
                price = '$0.00'
            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = id_a
            s.drm = SearchResult.DRM_UNLOCKED
            yield s
def search(self, query, max_results=10, timeout=60):
    """Search e-knigi.net (Bulgarian store); yields SearchResult objects.

    Queries must contain at least two Cyrillic/digit/space characters;
    anything else returns no results. The store may answer with either a
    single-product detail view or a normal result list — both are handled.
    """
    # Require Cyrillic text before hitting the store at all.
    uquery = type(u'')(query.strip(), 'utf-8')
    if not re.search(u'^[а-яА-Я\\d\\s]{2,}$', uquery):
        return

    base_url = 'http://e-knigi.net'
    url = (base_url +
           '/virtuemart?page=shop.browse&search_category=0&search_limiter=anywhere&keyword=' +
           urllib2.quote(query))
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())

        # A single hit redirects straight to the product detail view.
        for data in doc.xpath('//div[@class="prod_details"]'):
            s = SearchResult()
            s.cover_url = ''.join(data.xpath(
                './/div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@src')).strip()
            s.title = ''.join(data.xpath(
                './/div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@alt')).strip()
            s.author = ''.join(data.xpath(
                './/div[@class="td_bg clearfix"]/div[@class="gk_product_tab"]/div/table/tr[3]/td[2]/text()')).strip()
            s.price = ''.join(data.xpath('.//span[@class="productPrice"]/text()')).strip()
            s.detail_item = url
            s.drm = SearchResult.DRM_UNLOCKED
            yield s
            return

        # Otherwise walk the regular result list.
        for data in doc.xpath('//div[@class="browseProductContainer"]'):
            if counter <= 0:
                break
            id = ''.join(data.xpath('.//a[1]/@href')).strip()
            if not id:
                continue
            title = ''.join(data.xpath(
                './/a[@class="gk_vm_product_image"]/img/@title')).strip()
            author = ''.join(data.xpath(
                './/div[@style="float:left;width:90%"]/b/text()')).strip().replace('Автор: ', '')
            # Keep only results that actually mention the query.
            if (title.lower().find(query.lower()) == -1 and
                    author.lower().find(query.lower()) == -1):
                continue
            counter -= 1

            s = SearchResult()
            s.cover_url = ''.join(data.xpath(
                './/a[@class="gk_vm_product_image"]/img/@src')).strip()
            s.title = title
            s.author = author
            s.price = ''.join(data.xpath('.//span[@class="productPrice"]/text()')).strip()
            s.detail_item = base_url + id
            s.drm = SearchResult.DRM_UNLOCKED
            yield s
def search_flibusta(url, query, web_url, max_results=10, timeout=60):
    """Query a Flibusta OPDS catalogue; yields SearchResult objects.

    ``web_url`` is prefixed onto every relative link in the feed. Only
    open-access acquisition links are collected; everything is free and
    DRM-free.
    """
    description = Description(url)
    url_template = description.get_best_template()
    if not url_template:
        return
    oquery = Query(url_template)

    # Fill the OpenSearch template with the user's query.
    oquery.searchTerms = query
    oquery.count = max_results
    url = oquery.url()

    counter = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        doc = etree.fromstring(f.read())
        for entry in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break
            counter -= 1

            s = SearchResult()
            s.detail_item = ''.join(entry.xpath('./*[local-name() = "id"]/text()')).strip()

            for link in entry.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                type = link.get('type')
                if not (rel and href and type):
                    continue
                if 'http://opds-spec.org/thumbnail' in rel:
                    s.cover_url = web_url + href
                elif 'http://opds-spec.org/image/thumbnail' in rel:
                    s.cover_url = web_url + href
                elif 'http://opds-spec.org/acquisition/buy' in rel:
                    s.detail_item = web_url + href
                elif 'http://opds-spec.org/acquisition/sample' in rel:
                    pass
                elif 'http://opds-spec.org/acquisition/open-access' in rel:
                    # Known MIME types map directly to formats; anything
                    # else is resolved through guess_extension, retrying
                    # without the "+zip" suffix if needed.
                    if 'application/fb2+zip' in type:
                        s.downloads['FB2'] = web_url + href
                    elif 'application/txt+zip' in type:
                        s.downloads['TXT'] = web_url + href
                    elif 'application/html+zip' in type:
                        s.downloads['HTML'] = web_url + href
                    elif 'application/x-mobipocket-ebook' in type:
                        s.downloads['MOBI'] = web_url + href
                    elif type:
                        ext = guess_extension(type)
                        ext2 = guess_extension(type.replace("+zip", ""))
                        if ext:
                            ext = ext[1:].upper().strip()
                            s.downloads[ext] = web_url + href
                        elif ext2:
                            ext2 = ext2[1:].upper().strip()
                            s.downloads[ext2] = web_url + href
            s.formats = ', '.join(s.downloads.keys()).strip()

            s.title = ' '.join(entry.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(entry.xpath(
                './*[local-name() = "author"]//*[local-name() = "name"]//text()')).strip()
            s.price = '$0.00'
            s.drm = SearchResult.DRM_UNLOCKED
            yield s
def search(self, query, max_results=10, timeout=60):
    """Search Amazon (store plugin); yields SearchResult objects.

    Amazon serves one of three result layouts; the correct set of XPath
    selectors is picked per page before the items are parsed.
    """
    url = self.search_url + query.encode('ascii', 'backslashreplace').replace(
        '%', '%25').replace('\\x', '%').replace(' ', '+')
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read().decode('latin-1', 'replace'))

        is_shot = doc.xpath('boolean(//div[@id="shotgunMainResults"])')
        if is_shot:
            # Horizontal grid of books. Search "Paolo Bacigalupi"
            data_xpath = '//div[contains(@class, "result")]'
            format_xpath = './/div[@class="productTitle"]//text()'
            asin_xpath = './/div[@class="productTitle"]//a'
            cover_xpath = './/div[@class="productTitle"]//img/@src'
            title_xpath = './/div[@class="productTitle"]/a//text()'
            price_xpath = './/div[@class="newPrice"]/span/text()'
        elif doc.xpath('boolean(//div[@class="image"])'):
            # New style vertical list. Search "Paolo Bacigalupi"
            data_xpath = '//div[contains(@class, "results")]//div[contains(@class, "result")]'
            format_xpath = './/span[@class="binding"]//text()'
            asin_xpath = './/div[@class="image"]/a[1]'
            cover_xpath = './/img[@class="productImage"]/@src'
            title_xpath = './/a[@class="title"]/text()'
            price_xpath = './/span[contains(@class, "price")]/text()'
        else:
            # Old style vertical list. Search "martin"
            data_xpath = '//div[contains(@class, "result")]'
            format_xpath = './/span[@class="format"]//text()'
            asin_xpath = './/div[@class="productImage"]/a[1]'
            cover_xpath = './/div[@class="productImage"]//img/@src'
            title_xpath = './/div[@class="productTitle"]/a/text()'
            price_xpath = './/div[@class="newPrice"]//span//text()'

        for data in doc.xpath(data_xpath):
            if counter <= 0:
                break
            # Amazon also lists non-Kindle hits (author pages etc.) even in
            # digital-text search; keep only explicit Kindle editions.
            format = ''.join(data.xpath(format_xpath))
            if 'kindle' not in format.lower():
                continue
            # An ASIN is mandatory for referencing the book later.
            asin_a = data.xpath(asin_xpath)
            if not asin_a:
                continue
            asin_href = asin_a[0].get('href', '')
            m = re.search(r'/dp/(?P<asin>.+?)(/|$)', asin_href)
            if not m:
                continue
            asin = m.group('asin')
            cover_url = ''.join(data.xpath(cover_xpath))
            title = ''.join(data.xpath(title_xpath))
            price = ''.join(data.xpath(price_xpath))
            if is_shot:
                author = format.split(' by ')[-1]
            else:
                author = ''.join(data.xpath('.//span[@class="ptBrand"]/text()'))
                author = author.split('by ')[-1]
            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url.strip()
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = asin.strip()
            s.formats = 'Kindle'
            yield s
def search(self, query, max_results=10, timeout=60):
    """Search empik.com for ebooks; yields SearchResult objects.

    Each hit's detail page is fetched as well (with a quarter of the
    timeout) to collect additional connected formats.
    """
    url = ('http://www.empik.com/szukaj/produkt?c=ebooki-ebooki&q=' +
           urllib.quote(query) +
           '&qtype=basicForm&start=1&catalogType=pl&searchCategory=3501'
           '&format=epub&format=mobi&format=pdf&resultsPP=' + str(max_results))
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for entry in doc.xpath('//div[@class="productsSet"]/div'):
            if counter <= 0:
                break
            detail_href = ''.join(entry.xpath('.//a[@class="productBox-450Title"]/@href'))
            if not detail_href:
                continue
            cover_url = ''.join(entry.xpath(
                './/div[@class="productBox-450Pic"]/a/img/@data-original'))
            title = ''.join(entry.xpath('.//a[@class="productBox-450Title"]/text()'))
            title = re.sub(r' \(ebook\)', '', title)
            author = ', '.join(entry.xpath('.//div[@class="productBox-450Author"]/a/text()'))
            price = ''.join(entry.xpath('.//span[@class="currentPrice"]/text()'))
            formats = ''.join(entry.xpath('.//div[@class="productBox-450Type"]/text()'))
            formats = re.sub(r'Ebook *,? *', '', formats)
            formats = re.sub(r'\(.*\)', '', formats)
            # The detail page lists further connected formats.
            with closing(br.open('http://empik.com' + detail_href.strip(),
                                 timeout=timeout / 4)) as nf:
                idata = html.fromstring(nf.read())
                crawled = idata.xpath(
                    './/td[(@class="connectedInfo") or (@class="connectedInfo connectedBordered")]/a/text()')
                formats_more = ','.join(
                    [re.sub('ebook, ', '', x) for x in crawled if 'ebook' in x])
                if formats_more:
                    formats += ', ' + formats_more
            drm = entry.xpath(
                'boolean(.//div[@class="productBox-450Type" and contains(text(), "ADE")])')
            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = 'http://empik.com' + detail_href.strip()
            s.formats = formats.upper().strip()
            s.drm = SearchResult.DRM_LOCKED if drm else SearchResult.DRM_UNLOCKED
            yield s
def search(self, query, max_results=10, timeout=60):
    '''
    Search diesel-ebooks.com and yield up to ``max_results``
    SearchResult objects.  The site either lands directly on a single
    product page (no "selection" dropdown present) or returns a normal
    result listing; both layouts are handled.
    '''
    url = 'http://www.diesel-ebooks.com/index.php?page=seek&id[m]=&id[c]=scope%253Dinventory&id[q]=' + urllib.quote_plus(
        query)
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        if doc.xpath('not(boolean(//select[contains(@id, "selection")]))'):
            # Single product page: scrape the one item directly.
            id = ''.join(doc.xpath('//div[@class="price_fat"]//a/@href'))
            mo = re.search('(?<=id=).+?(?=&)', id)
            if not mo:
                # BUG FIX: this used to ``yield None`` and then fall
                # through to ``mo.group()``, which raised AttributeError
                # on the next generator resume.  Bail out cleanly instead.
                return
            id = mo.group()
            cover_url = ''.join(doc.xpath('//div[@class="cover"]/a/@href'))
            title = ''.join(
                doc.xpath('//div[@class="desc_fat"]//h1/text()'))
            author = ''.join(
                doc.xpath(
                    '//div[@class="desc_fat"]//span[@itemprop="author"]/text()'
                ))
            price = ''.join(
                doc.xpath('//div[@class="price_fat"]//h1/text()'))
            formats = ', '.join(
                doc.xpath(
                    '//div[@class="desc_fat"]//p[contains(text(), "Format")]/text()'
                ))
            # Keep only the text after the "Format:" label.
            a, b, formats = formats.partition('Format:')
            drm = SearchResult.DRM_LOCKED
            if 'drm free' in formats.lower():
                drm = SearchResult.DRM_UNLOCKED

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = id.strip()
            s.formats = formats
            s.drm = drm
            yield s
        else:
            # Normal search-result listing.
            for data in doc.xpath('//div[contains(@class, "item")]'):
                if counter <= 0:
                    break
                id = ''.join(data.xpath('div[@class="cover"]/a/@href'))
                if not id or '/item/' not in id:
                    continue
                cover_url = ''.join(
                    data.xpath('div[@class="cover"]//img/@src'))
                title = ''.join(
                    data.xpath('.//div[@class="content"]//h2/a/text()'))
                author = ''.join(
                    data.xpath('.//div[@class="content"]/span//a/text()'))
                price = ''
                price_elem = data.xpath(
                    './/div[@class="price_fat"]//h1/text()')
                if price_elem:
                    price = price_elem[0]
                formats = ', '.join(
                    data.xpath(
                        './/div[@class="book-info"]//text()')).strip()
                a, b, formats = formats.partition('Format:')
                drm = SearchResult.DRM_LOCKED
                if 'drm free' in formats.lower():
                    drm = SearchResult.DRM_UNLOCKED
                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price.strip()
                s.detail_item = id.strip()
                s.formats = formats
                s.drm = drm
                yield s
def search(self, query, max_results=20, timeout=60):
    '''
    Search publio.pl, walking result pages until ``max_results`` items
    have been yielded or there is no "next" page link.
    '''
    br = browser()
    counter = max_results
    page = 1
    while counter:
        with closing(
                br.open(
                    'http://www.publio.pl/szukaj,strona' + str(page) +
                    '.html?q=' + urllib.quote(query) +
                    # Repeated "sections" parameter restricts results to
                    # these product types.  BUG FIX: the '&sect' prefix of
                    # each separator had been mangled into the HTML entity
                    # for the section sign ('§ions='); restored to
                    # '&sections='.
                    '&sections=EMAGAZINE&sections=MINIBOOK&sections=EBOOK',
                    timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//div[@class="item"]'):
                if counter <= 0:
                    break
                # Relative product URL; doubles as the unique item id.
                id = ''.join(data.xpath('.//div[@class="img"]/a/@href'))
                if not id:
                    continue
                cover_url = ''.join(
                    data.xpath(
                        './/div[@class="img"]/a/img/@data-original'))
                title = ''.join(
                    data.xpath('.//div[@class="img"]/a/@title'))
                title2 = ''.join(
                    data.xpath('.//div[@class="desc"]/h5//text()'))
                if title2:
                    title = title + '. ' + title2
                # Append series information when the last detail row is
                # labelled "Seria:".
                if (''.join(
                        data.xpath(
                            './div[@class="desc"]/div[@class="detailShortList"]/div[last()]/span/text()'
                        )).strip() == "Seria:"):
                    series = ''.join(
                        data.xpath(
                            './div[@class="desc"]/div[@class="detailShortList"]/div[last()]/a/@title'
                        ))
                    title = title + ' (seria ' + series + ')'
                author = ', '.join(
                    data.xpath(
                        './div[@class="desc"]/div[@class="detailShortList"]/div[@class="row"][1]/a/@title'
                    ))
                # Discounted price lives in <ins>; fall back to plain text.
                price = ''.join(
                    data.xpath(
                        './/div[@class="priceBox tk-museo-slab"]/ins/text()'
                    ))
                if not price:
                    price = ''.join(
                        data.xpath(
                            './/div[@class="priceBox tk-museo-slab"]/text()'
                        )).strip()
                formats = ', '.join([
                    x.strip() for x in data.xpath(
                        './/div[@class="formats"]/a/text()')
                ])
                counter -= 1

                s = SearchResult()
                s.cover_url = 'http://www.publio.pl' + cover_url
                s.title = title.strip()
                s.author = author
                s.price = price
                s.detail_item = 'http://www.publio.pl' + id.strip()
                s.drm = SearchResult.DRM_LOCKED if 'DRM' in formats else SearchResult.DRM_UNLOCKED
                s.formats = formats.replace(' DRM', '').strip()
                yield s
            if not doc.xpath('boolean(//a[@class="next"])'):
                break
            page += 1
def search_manybooks(query, max_results=10, timeout=60,
                     open_search_url='http://www.manybooks.net/opds/'):
    '''
    Manybooks uses a very strange opds feed. The opds main feed is
    structured like a stanza feed. The search result entries give very
    little information and requires you to go to a detail link. The
    detail link has the wrong type specified (text/html instead of
    application/atom+xml).
    '''
    description = Description(open_search_url)
    url_template = description.get_best_template()
    if not url_template:
        return
    oquery = Query(url_template)

    # set up initial values
    oquery.searchTerms = query
    oquery.count = max_results
    url = oquery.url()

    counter = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        raw_data = f.read()
        raw_data = raw_data.decode('utf-8', 'replace')
        doc = etree.fromstring(raw_data)
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break
            counter -= 1

            s = SearchResult()

            # Skip entries without an HTML detail link; the real metadata
            # lives behind that link.
            detail_links = data.xpath(
                './*[local-name() = "link" and @type = "text/html"]')
            if not detail_links:
                continue
            detail_link = detail_links[0]
            detail_href = detail_link.get('href')
            if not detail_href:
                continue
            # Build the canonical detail URL from the "tid" query value.
            s.detail_item = 'http://manybooks.net/titles/' + detail_href.split(
                'tid=')[-1] + '.html'

            # These can have HTML inside of them. We are going to get them again later
            # just in case.
            s.title = ''.join(
                data.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(
                data.xpath('./*[local-name() = "author"]//text()')).strip()

            # Follow the detail link to get the rest of the info.
            with closing(br.open(detail_href, timeout=timeout / 4)) as df:
                ddoc = etree.fromstring(df.read())
                ddata = ddoc.xpath('//*[local-name() = "entry"][1]')
                if ddata:
                    ddata = ddata[0]

                    # This is the real title and author info we want. We got
                    # it previously just in case it's not specified here for some reason.
                    s.title = ''.join(
                        ddata.xpath(
                            './*[local-name() = "title"]//text()')).strip()
                    s.author = ', '.join(
                        ddata.xpath(
                            './*[local-name() = "author"]//text()')).strip()
                    # Trim stray separators left by the join above.
                    if s.author.startswith(','):
                        s.author = s.author[1:]
                    if s.author.endswith(','):
                        s.author = s.author[:-1]

                    s.cover_url = ''.join(
                        ddata.xpath(
                            './*[local-name() = "link" and @rel = "http://opds-spec.org/thumbnail"][1]/@href'
                        )).strip()

                    # Collect one download URL per file extension, keyed by
                    # the extension guessed from the link's MIME type.
                    for link in ddata.xpath(
                            './*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'
                    ):
                        type = link.get('type')
                        href = link.get('href')
                        if type:
                            ext = mimetypes.guess_extension(type)
                            if ext:
                                ext = ext[1:].upper().strip()
                                s.downloads[ext] = href

            s.price = '$0.00'
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = 'EPUB, PDB (eReader, PalmDoc, zTXT, Plucker, iSilo), FB2, ZIP, AZW, MOBI, PRC, LIT, PKG, PDF, TXT, RB, RTF, LRF, TCR, JAR'

            yield s
def search(self, query, max_results=10, timeout=60):
    '''
    XinXii's open search url is:
    http://www.xinxii.com/catalog-search/query/?keywords={searchTerms}&pw={startPage?}&doc_lang={docLang}&ff={docFormat},{docFormat},{docFormat}

    This url requires the docLang and docFormat. However, the search
    itself sent to XinXii does not require them. They can be ignored.
    We cannot push this into the standard OpenSearchOPDSStore search
    because of the required attributes.

    XinXii doesn't return all info supported by OpenSearchOPDSStore
    search function so this one is modified to remove parts that are
    used.
    '''
    url = 'http://www.xinxii.com/catalog-search/query/?keywords=' + urllib.quote_plus(
        query)

    counter = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        doc = etree.fromstring(f.read())
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break
            counter -= 1

            s = SearchResult()

            # The atom id is the fallback detail link; an "alternate"
            # link below overrides it when present.
            s.detail_item = ''.join(
                data.xpath('./*[local-name() = "id"]/text()')).strip()

            for link in data.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                type = link.get('type')

                if rel and href and type:
                    if rel in ('http://opds-spec.org/thumbnail',
                               'http://opds-spec.org/image/thumbnail'):
                        s.cover_url = href
                    if rel == 'alternate':
                        s.detail_item = href

            # XinXii always offers these two formats.
            s.formats = 'EPUB, PDF'

            s.title = ' '.join(
                data.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(
                data.xpath(
                    './*[local-name() = "author"]//*[local-name() = "name"]//text()'
                )).strip()

            # Price is "<currencycode> <amount>" when a price element exists.
            price_e = data.xpath('.//*[local-name() = "price"][1]')
            if price_e:
                price_e = price_e[0]
                currency_code = price_e.get('currencycode', '')
                price = ''.join(price_e.xpath('.//text()')).strip()
                s.price = currency_code + ' ' + price
                s.price = s.price.strip()

            yield s
def search(self, query, max_results=10, timeout=60):
    '''
    Search the Sony AU reader store and yield up to ``max_results``
    SearchResult objects.  Entries missing a title, author, or detail
    URL are skipped.
    '''
    url = self.SEARCH_URL % urllib.quote_plus(query)
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for item in doc.xpath(
                '//div[@id="searchresult-list"]/descendant::div[contains(@class, "doc-item")]'
        ):
            if counter <= 0:
                break
            s = SearchResult()
            s.price = _('Not Available')
            p = ''.join(
                item.xpath(
                    'descendant::p[@class="doc-price"]/descendant::span[@itemprop="price"]/text()'
                )).strip()
            if p:
                # Keep only the amount after the '$' sign.
                s.price = 'AUD ' + p.split('$')[-1]

            title = item.xpath('descendant::h3[@class="doc-title"]')
            if not title:
                continue
            # etree.tostring(method='text') flattens nested markup.
            title = etree.tostring(title[0], method='text',
                                   encoding=unicode)
            if not title:
                continue
            # Append the subtitle, when present, as "title: subtitle".
            st = item.xpath('descendant::p[@class="doc-subtitle"]')
            if st:
                st = etree.tostring(st[0], method='text',
                                    encoding=unicode)
                if st and st.strip():
                    title = title.strip() + ': ' + st
            s.title = title.strip()
            aut = item.xpath('descendant::p[@class="doc-author"]')
            if not aut:
                continue
            s.author = etree.tostring(aut[0], method='text',
                                      encoding=unicode).strip()
            if not s.author:
                continue
            du = ''.join(
                item.xpath(
                    'descendant::h3[position() = 1 and @class="doc-title"]/descendant::a[position() = 1 and @href]/@href'
                )).strip()
            if not du:
                continue
            detail_url = 'https://au.readerstore.sony.com' + du
            s.detail_item = detail_url
            # Only count results that passed every validity check above.
            counter -= 1

            cover_url = ''.join(
                item.xpath(
                    'descendant::p[@class="doc-cover" and position() = 1]/'
                    'descendant::img[position() = 1 and @src]/@src'))
            if cover_url:
                s.cover_url = url_slash_cleaner(cover_url)

            s.drm = SearchResult.DRM_UNKNOWN
            s.formats = 'Sony'

            yield s
def search_amazon(
        query,
        max_results=10,
        timeout=60,
        write_html_to=None,
        search_url='http://www.amazon.com/s/?url=search-alias%3Ddigital-text&field-keywords='
):
    '''
    Search the Amazon Kindle store and yield up to ``max_results``
    SearchResult objects.  Amazon serves several result-page layouts;
    the correct set of XPath selectors is chosen from the class of the
    "atfResults" container.  When ``write_html_to`` is given, the raw
    result HTML is also written to that path (debugging aid).
    '''
    # Percent-encode the query while preserving backslash-escaped
    # non-ASCII characters as %XX sequences.
    url = search_url + query.encode('ascii', 'backslashreplace').replace(
        '%', '%25').replace('\\x', '%').replace(' ', '+')
    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
    if write_html_to is not None:
        with open(write_html_to, 'wb') as f:
            f.write(raw)
    doc = html.fromstring(raw)
    try:
        results = doc.xpath('//div[@id="atfResults" and @class]')[0]
    except IndexError:
        # No results container: nothing to yield.
        return

    # Pick the selector set matching the page layout variant.
    if 's-result-list-parent-container' in results.get('class', ''):
        data_xpath = "descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-result-item ')]"
        format_xpath = './/a[contains(text(), "Kindle Edition")]//text()'
        asin_xpath = '@data-asin'
        cover_xpath = "descendant-or-self::img[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-image ')]/@src"
        title_xpath = "descendant-or-self::h2[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-title ')]//text()"
        author_xpath = './/span[starts-with(text(), "by ")]/following-sibling::span//text()'
        price_xpath = '(.//span[contains(@class, " s-price ")])[last()]//text()'
    elif 'grid' in results.get('class', ''):
        data_xpath = '//div[contains(@class, "prod")]'
        format_xpath = (
            './/ul[contains(@class, "rsltGridList")]'
            '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
        )
        asin_xpath = '@name'
        cover_xpath = './/img[contains(@class, "productImage")]/@src'
        title_xpath = './/h3[@class="newaps"]/a//text()'
        author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
        price_xpath = (
            './/ul[contains(@class, "rsltGridList")]'
            '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
        )
    elif 'ilresults' in results.get('class', ''):
        data_xpath = '//li[(@class="ilo")]'
        format_xpath = (
            './/ul[contains(@class, "rsltGridList")]'
            '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
        )
        asin_xpath = '@name'
        cover_xpath = './div[@class = "ilf"]/a/img[contains(@class, "ilo")]/@src'
        title_xpath = './/h3[@class="newaps"]/a//text()'
        author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
        # Results can be in a grid (table) or a column
        price_xpath = (
            './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
            '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
        )
    elif 'list' in results.get('class', ''):
        data_xpath = '//div[contains(@class, "prod")]'
        format_xpath = (
            './/ul[contains(@class, "rsltL")]'
            '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
        )
        asin_xpath = '@name'
        cover_xpath = './/img[contains(@class, "productImage")]/@src'
        title_xpath = './/h3[@class="newaps"]/a//text()'
        author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
        price_xpath = (
            './/ul[contains(@class, "rsltL")]'
            '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
        )
    else:
        # Unknown layout: give up rather than scrape garbage.
        return

    for data in doc.xpath(data_xpath):
        if counter <= 0:
            break

        # Even though we are searching digital-text only Amazon will still
        # put in results for non Kindle books (author pages). Se we need
        # to explicitly check if the item is a Kindle book and ignore it
        # if it isn't.
        format = ''.join(data.xpath(format_xpath))
        if 'kindle' not in format.lower():
            continue

        # We must have an asin otherwise we can't easily reference the
        # book later.
        asin = data.xpath(asin_xpath)
        if asin:
            asin = asin[0]
        else:
            continue
        cover_url = ''.join(data.xpath(cover_xpath))
        title = ''.join(data.xpath(title_xpath))
        author = ''.join(data.xpath(author_xpath))
        try:
            # Strip the leading "by " and any trailing " (...)" suffix.
            author = author.split('by ', 1)[1].split(" (")[0]
        except:
            pass
        price = ''.join(data.xpath(price_xpath))

        counter -= 1

        s = SearchResult()
        s.cover_url = cover_url.strip()
        s.title = title.strip()
        s.author = author.strip()
        s.price = price.strip()
        s.detail_item = asin.strip()
        s.formats = 'Kindle'

        yield s
def search(self, query, max_results=10, timeout=60):
    '''
    Search an Amazon EU Kindle store and yield up to ``max_results``
    SearchResult objects.  The correct XPath selector set is chosen
    from the layout class of the "atfResults" container; unknown
    layouts are reported and skipped.
    '''
    # Percent-encode the query while preserving backslash-escaped
    # non-ASCII characters as %XX sequences.
    url = self.search_url + query.encode(
        'ascii', 'backslashreplace').replace('%', '%25').replace(
            '\\x', '%').replace(' ', '+')
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        allText = f.read()
        doc = html.fromstring(allText)  # .decode('latin-1', 'replace'))

        format_xpath2 = ''
        if doc.xpath(
                '//div[@id = "atfResults" and contains(@class, "grid")]'):
            # print('grid form')
            data_xpath = '//div[contains(@class, "prod")]'
            format_xpath = (
                './/ul[contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
            )
            asin_xpath = '@name'
            cover_xpath = './/img[contains(@class, "productImage")]/@src'
            title_xpath = './/h3[@class="newaps"]/a//text()'
            author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
            price_xpath = (
                './/ul[contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
            )
        elif doc.xpath(
                '//div[@id = "atfResults" and contains(@class, "ilresults")]'
        ):
            # print('ilo form')
            data_xpath = '//li[(@class="ilo")]'
            format_xpath = (
                './/ul[contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
            )
            asin_xpath = '@name'
            cover_xpath = './div[@class = "ilf"]/a/img[contains(@class, "ilo")]/@src'
            title_xpath = './/h3[@class="newaps"]/a//text()'
            author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
            # Results can be in a grid (table) or a column
            price_xpath = (
                './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
            )
        elif doc.xpath(
                '//div[@id = "atfResults" and contains(@class, "s-result-list-parent-container")]'
        ):
            # print('new list form')
            data_xpath = '//li[contains(@class, "s-result-item")]'
            format_xpath = './/a[contains(@class, "a-size-small")]/text()'
            format_xpath2 = './/h3[contains(@class, "s-inline")]/text()'
            asin_xpath = '@data-asin'
            cover_xpath = './/img[contains(@class, "cfMarker")]/@src'
            title_xpath = './/h2[contains(@class, "s-access-title")]/text()'
            author_xpath = (
                './/div[contains(@class, "a-fixed-left-grid-col")]'
                '/div/div/span//text()')
            price_xpath = (
                './/div[contains(@class, "a-spacing-none")]/a/span[contains(@class, "s-price")]/text()'
            )
        elif doc.xpath(
                '//div[@id = "atfResults" and contains(@class, "list")]'):
            # print('list form')
            data_xpath = '//li[@class="s-result-item"]'
            format_xpath = './/a[contains(@class, "a-size-small")]/text()'
            format_xpath2 = './/h3[contains(@class, "s-inline")]/text()'
            asin_xpath = '@data-asin'
            cover_xpath = './/img[contains(@class, "cfMarker")]/@src'
            title_xpath = './/h2[contains(@class, "s-access-title")]/text()'
            author_xpath = (
                './/div[contains(@class, "a-fixed-left-grid-col")]'
                '/div/div/span//text()')
            price_xpath = ('.//span[contains(@class, "s-price")]/text()')
        else:
            # URK -- whats this?
            print('unknown result table form for Amazon EU search')
            # with open("c:/amazon_search_results.html", "w") as out:
            #    out.write(allText)
            return

        for data in doc.xpath(data_xpath):
            if counter <= 0:
                break

            # Even though we are searching digital-text only Amazon will still
            # put in results for non Kindle books (authors pages). Se we need
            # to explicitly check if the item is a Kindle book and ignore it
            # if it isn't.
            format_ = ''.join(data.xpath(format_xpath))
            if 'kindle' not in format_.lower():
                if format_xpath2:
                    # New-style pages sometimes carry the format elsewhere.
                    format_ = ''.join(data.xpath(format_xpath2))
                if 'kindle' not in format_.lower():
                    # print(etree.tostring(data, pretty_print=True))
                    continue

            # We must have an asin otherwise we can't easily reference the
            # book later.
            asin = data.xpath(asin_xpath)
            if asin:
                asin = asin[0]
            else:
                continue
            cover_url = ''.join(data.xpath(cover_xpath))
            title = ''.join(data.xpath(title_xpath))
            authors = ''.join(data.xpath(author_xpath))
            # Drop the localised "by"-article prefix and normalise the
            # localised "and" to '&'.
            authors = re.sub('^' + self.author_article, '', authors)
            authors = re.sub(self.and_word, ' & ', authors)
            mo = re.match(r'(.*)(\(\d.*)$', authors)
            if mo:
                authors = mo.group(1).strip()
            # BUG FIX: this previously did ``''.join(xpath(...)[-1])``,
            # which raised IndexError for items without a price node.
            # ``''.join`` of a single string is the identity, so taking
            # the last match directly is behaviour-preserving.
            price_matches = data.xpath(price_xpath)
            price = price_matches[-1] if price_matches else ''

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url.strip()
            s.title = title.strip()
            s.author = authors.strip()
            s.price = price.strip()
            s.detail_item = asin.strip()
            s.drm = SearchResult.DRM_UNKNOWN
            s.formats = 'Kindle'

            yield s
def search(self, query, max_results=10, timeout=60):
    '''
    Run an OpenSearch/OPDS query against ``self.open_search_url`` and
    yield up to ``max_results`` SearchResult objects, following atom
    "next" pagination links until the quota is filled.
    '''
    description = Description(self.open_search_url)
    url_template = description.get_best_template()
    if not url_template:
        return
    oquery = Query(url_template)

    # set up initial values
    oquery.searchTerms = query
    oquery.count = max_results
    url = oquery.url()

    counter = max_results
    br = self.create_browser()
    while url != None and counter > 0:
        with closing(br.open(url, timeout=timeout)) as f:
            s = f.read()
            doc = etree.fromstring(s)

            # Find the next-page link, if any; relative hrefs are
            # resolved against the store's base URL.
            url = None
            for link in doc.xpath('//*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                type = link.get('type')
                if rel and href and type:
                    if rel == 'next' and type == 'application/atom+xml':
                        if href[0] == "/":
                            href = self.base_url + href
                        url = href

            for data in doc.xpath('//*[local-name() = "entry"]'):
                if counter <= 0:
                    break
                counter -= 1

                s = SearchResult()

                # Fallback detail link; an acquisition/buy link below
                # overrides it when present.
                s.detail_item = ''.join(
                    data.xpath('./*[local-name() = "id"]/text()')).strip()

                drm = False
                for link in data.xpath('./*[local-name() = "link"]'):
                    rel = link.get('rel')
                    href = link.get('href')
                    type = link.get('type')
                    if rel and href and type:
                        if 'http://opds-spec.org/thumbnail' in rel:
                            s.cover_url = href
                        elif 'http://opds-spec.org/image/thumbnail' in rel:
                            s.cover_url = href
                        elif 'http://opds-spec.org/acquisition/buy' in rel:
                            s.detail_item = href
                        elif 'http://opds-spec.org/acquisition' in rel:
                            if type:
                                ext = guess_extension(type)
                                # fb2 is not in the mimetypes registry.
                                if type == 'application/fb2+xml':
                                    ext = '.fb2'
                                if ext:
                                    ext = ext[1:].upper().strip()
                                    if href[0] == "/":
                                        href = self.base_url + href
                                    s.downloads[ext] = href
                    # Any encryption_method child means the item is DRMed.
                    for enc in link.xpath(
                            './*[local-name() = "encryption_method"]'):
                        drm = True

                s.formats = ', '.join(s.downloads.keys()).strip()
                s.title = ' '.join(
                    data.xpath(
                        './*[local-name() = "title"]//text()')).strip()
                s.author = ', '.join(
                    data.xpath(
                        './*[local-name() = "author"]//*[local-name() = "name"]//text()'
                    )).strip()
                s.drm = SearchResult.DRM_LOCKED if drm else SearchResult.DRM_UNLOCKED

                price_e = data.xpath('.//*[local-name() = "price"][1]')
                if price_e:
                    price_e = price_e[0]
                    currency_code = price_e.get('currencycode', '')
                    price = ''.join(price_e.xpath('.//text()')).strip()
                    s.price = currency_code + ' ' + price
                    s.price = s.price.strip()

                # Defer cover fetching: stash the URL so the caller can
                # restore it later without blocking the search.
                if s.cover_url:
                    s.cover_bak = s.cover_url
                    s.cover_url = None

                yield s
def search(self, query, max_results=10, timeout=60):
    '''
    Search an Amazon Kindle store (``self.search_url``) and yield up to
    ``max_results`` SearchResult objects.  The XPath selector set is
    chosen from the layout class of the "atfResults" container.
    '''
    # Percent-encode the query while preserving backslash-escaped
    # non-ASCII characters as %XX sequences.
    url = self.search_url + query.encode(
        'ascii', 'backslashreplace').replace('%', '%25').replace(
            '\\x', '%').replace(' ', '+')
    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())  # .decode('latin-1', 'replace'))

        if doc.xpath(
                '//div[@id = "atfResults" and contains(@class, "grid")]'):
            data_xpath = '//div[contains(@class, "prod")]'
            format_xpath = (
                './/ul[contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
            )
            asin_xpath = '@name'
            cover_xpath = './/img[contains(@class, "productImage")]/@src'
            title_xpath = './/h3[@class="newaps"]/a//text()'
            author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
            price_xpath = (
                './/ul[contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
            )
        elif doc.xpath(
                '//div[@id = "atfResults" and contains(@class, "ilresults")]'
        ):
            data_xpath = '//li[(@class="ilo")]'
            format_xpath = (
                './/ul[contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
            )
            asin_xpath = '@name'
            cover_xpath = './div[@class = "ilf"]/a/img[contains(@class, "ilo")]/@src'
            title_xpath = './/h3[@class="newaps"]/a//text()'
            author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
            # Results can be in a grid (table) or a column
            price_xpath = (
                './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
            )
        elif doc.xpath(
                '//div[@id = "atfResults" and contains(@class, "list")]'):
            data_xpath = '//div[contains(@class, "prod")]'
            format_xpath = (
                './/ul[contains(@class, "rsltL")]'
                '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
            )
            asin_xpath = '@name'
            cover_xpath = './/img[contains(@class, "productImage")]/@src'
            title_xpath = './/h3[@class="newaps"]/a//text()'
            author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
            price_xpath = (
                './/ul[contains(@class, "rsltL")]'
                '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
            )
        else:
            # Unknown layout: give up rather than scrape garbage.
            return

        for data in doc.xpath(data_xpath):
            if counter <= 0:
                break

            # Even though we are searching digital-text only Amazon will still
            # put in results for non Kindle books (author pages). Se we need
            # to explicitly check if the item is a Kindle book and ignore it
            # if it isn't.
            format = ''.join(data.xpath(format_xpath))
            if 'kindle' not in format.lower():
                continue

            # We must have an asin otherwise we can't easily reference the
            # book later.
            asin = data.xpath(asin_xpath)
            if asin:
                asin = asin[0]
            else:
                continue
            cover_url = ''.join(data.xpath(cover_xpath))
            title = ''.join(data.xpath(title_xpath))
            author = ''.join(data.xpath(author_xpath))
            try:
                # Strip the leading "by " and any trailing " (...)" part.
                author = author.split('by ', 1)[1].split(" (")[0]
            except:
                pass
            price = ''.join(data.xpath(price_xpath))

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url.strip()
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = asin.strip()
            s.formats = 'Kindle'

            yield s
def search(self, query, max_results=10, timeout=60):
    '''
    Search a localised Amazon Kindle store and yield up to
    ``max_results`` SearchResult objects.  Uses ``self.author_article``
    and ``self.and_word`` to normalise the localised author byline.
    '''
    # Percent-encode the query while preserving backslash-escaped
    # non-ASCII characters as %XX sequences.
    url = self.search_url + query.encode(
        'ascii', 'backslashreplace').replace('%', '%25').replace(
            '\\x', '%').replace(' ', '+')
    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())  # .decode('latin-1', 'replace'))

        data_xpath = '//div[contains(@class, "prod")]'
        # Results can be in a grid (table) or a column
        format_xpath = (
            './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
            '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
        )
        asin_xpath = '@name'
        cover_xpath = './/img[@class="productImage"]/@src'
        title_xpath = './/h3[@class="newaps"]/a//text()'
        author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
        # Results can be in a grid (table) or a column
        price_xpath = (
            './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
            '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
        )

        for data in doc.xpath(data_xpath):
            if counter <= 0:
                break

            # Even though we are searching digital-text only Amazon will still
            # put in results for non Kindle books (authors pages). Se we need
            # to explicitly check if the item is a Kindle book and ignore it
            # if it isn't.
            format_ = ''.join(data.xpath(format_xpath))
            if 'kindle' not in format_.lower():
                continue

            # We must have an asin otherwise we can't easily reference the
            # book later.
            asin = data.xpath(asin_xpath)
            if asin:
                asin = asin[0]
            else:
                continue
            cover_url = ''.join(data.xpath(cover_xpath))
            title = ''.join(data.xpath(title_xpath))
            authors = ''.join(data.xpath(author_xpath))
            # Drop the localised "by"-article prefix and normalise the
            # localised "and" to '&'; trim a trailing "(...)" note.
            authors = re.sub('^' + self.author_article, '', authors)
            authors = re.sub(self.and_word, ' & ', authors)
            mo = re.match(r'(.*)(\(\d.*)$', authors)
            if mo:
                authors = mo.group(1).strip()
            price = ''.join(data.xpath(price_xpath))

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url.strip()
            s.title = title.strip()
            s.author = authors.strip()
            s.price = price.strip()
            s.detail_item = asin.strip()
            s.drm = SearchResult.DRM_UNKNOWN
            s.formats = 'Kindle'

            yield s
def search(self, query, max_results=10, timeout=60):
    '''
    Search nexto.pl, paging through results (10 per page via the
    ``_offset`` parameter) until ``max_results`` items have been
    yielded or there is no "next" link.  Each hit triggers a second
    request to the product page to collect the author.
    '''
    url = 'http://www.nexto.pl/szukaj.xml?search-clause=' + urllib.quote_plus(
        query) + '&scid=1015'

    br = browser()
    offset = 0

    counter = max_results

    while counter:
        with closing(
                br.open(url + '&_offset=' + str(offset),
                        timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//ul[@class="productslist"]/li'):
                if counter <= 0:
                    break

                # Relative product URL; doubles as the unique item id.
                id = ''.join(
                    data.xpath(
                        './/div[@class="cover_container"]/a[1]/@href'))
                if not id:
                    continue

                price = ''.join(
                    data.xpath('.//strong[@class="nprice"]/text()'))

                cover_url = ''.join(
                    data.xpath('.//img[@class="cover"]/@src'))
                # Normalise escaped slashes and request a smaller thumbnail.
                cover_url = re.sub(r'%2F', '/', cover_url)
                cover_url = re.sub(r'widthMax=120&heightMax=200',
                                   'widthMax=64&heightMax=64', cover_url)
                title = ''.join(data.xpath('.//a[@class="title"]/text()'))
                title = re.sub(r' - ebook$', '', title)
                formats = ', '.join(
                    data.xpath(
                        './/ul[@class="formats_available"]/li//b/text()'))
                # "znak" (watermark) in the formats list means DRM-free.
                DrmFree = re.search(r'znak', formats)
                formats = re.sub(r'\ ?\(.+?\)', '', formats)

                # Fetch the product page for the author; shorter timeout
                # since this is a secondary request per result.
                author = ''
                with closing(
                        br.open('http://www.nexto.pl/' + id.strip(),
                                timeout=timeout / 4)) as nf:
                    idata = html.fromstring(nf.read())
                    author = ', '.join(
                        idata.xpath(
                            '//div[@class="basic_data"]/p[1]/b/a/text()'))

                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url if cover_url[
                    :4] == 'http' else 'http://www.nexto.pl' + cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price
                s.detail_item = id.strip()
                s.drm = SearchResult.DRM_UNLOCKED if DrmFree else SearchResult.DRM_LOCKED
                s.formats = formats.upper().strip()

                yield s

            if not doc.xpath(
                    '//div[@class="listnavigator"]//a[@class="next"]'):
                break

        offset += 10
def search_amazon(self, query, max_results=10, timeout=60, write_html_to=None):
    '''
    Search the modern Amazon result page and yield up to
    ``max_results`` SearchResult objects.  The page is fetched through
    ``read_url`` with ``self.scraper_storage``; when ``write_html_to``
    is given, the raw HTML is also written to that path (debugging
    aid).
    '''
    field_keywords = self.FIELD_KEYWORDS
    uquery = self.SEARCH_BASE_QUERY.copy()
    uquery[field_keywords] = query

    def asbytes(x):
        # urlencode needs bytes for non-ASCII safety on Python 2.
        if isinstance(x, type('')):
            x = x.encode('utf-8')
        return x

    uquery = {asbytes(k): asbytes(v) for k, v in uquery.items()}
    url = self.SEARCH_BASE_URL + '?' + urlencode(uquery)

    counter = max_results
    raw = read_url(self.scraper_storage, url, timeout=timeout)
    if write_html_to is not None:
        with open(write_html_to, 'w') as f:
            f.write(raw)
    doc = html.fromstring(raw)
    for result in doc.xpath(
            '//div[contains(@class, "s-result-list")]//div[@data-index and @data-asin]'
    ):
        # BUG FIX: ``counter`` was decremented below but never checked,
        # so ``max_results`` was not actually enforced.
        if counter <= 0:
            break
        kformat = ''.join(
            result.xpath('.//a[contains(text(), "{}")]//text()'.format(
                self.KINDLE_EDITION)))
        # Even though we are searching digital-text only Amazon will still
        # put in results for non Kindle books (author pages). So we need
        # to explicitly check if the item is a Kindle book and ignore it
        # if it isn't.
        if 'kindle' not in kformat.lower():
            continue
        asin = result.get('data-asin')
        if not asin:
            continue

        cover_url = ''.join(result.xpath('.//img/@src'))
        title = etree.tostring(result.xpath('.//h2')[0],
                               method='text',
                               encoding='unicode')
        # The author line reads "<BY> Author Name | date ...": take the
        # words after the localised "by" and before the first '|'.
        adiv = result.xpath(
            './/div[contains(@class, "a-color-secondary")]')[0]
        aparts = etree.tostring(adiv, method='text',
                                encoding='unicode').split()
        idx = aparts.index(self.BY)
        author = ' '.join(aparts[idx + 1:]).split('|')[0].strip()
        # First non-empty offscreen price span wins.
        price = ''
        for span in result.xpath(
                './/span[contains(@class, "a-price")]/span[contains(@class, "a-offscreen")]'
        ):
            q = ''.join(span.xpath('./text()'))
            if q:
                price = q
                break
        counter -= 1

        s = SearchResult()
        s.cover_url = cover_url.strip()
        s.title = title.strip()
        s.author = author.strip()
        s.detail_item = asin.strip()
        s.price = price.strip()
        s.formats = 'Kindle'

        yield s