def create_search_result(self, data):
    '''Build a SearchResult from one <fb2-book> element of the store's XML
    search feed.  Every result is reported as DRM-free.'''
    xp_template = 'normalize-space(@{0})'
    sRes = SearchResult()
    sRes.drm = SearchResult.DRM_UNLOCKED
    sRes.detail_item = data.xpath(xp_template.format('hub_id'))
    sRes.title = data.xpath(
        'string(.//title-info/book-title/text()|.//publish-info/book-name/text())'
    )
    # The author name is assembled from the separate first/middle/last-name
    # elements, joined with single spaces.
    authors = data.xpath('.//title-info/author/first-name/text()|'
                         './/title-info/author/middle-name/text()|'
                         './/title-info/author/last-name/text()')
    sRes.author = u' '.join(map(type(u''), authors))
    sRes.price = data.xpath(xp_template.format('price'))
    # cover vs cover_preview: the smaller preview image is used here.
    sRes.cover_url = data.xpath(xp_template.format('cover_preview'))
    sRes.price = format_price_in_RUR(sRes.price)
    # NOTE(review): '//fb2-book//files/file/@type' is an absolute XPath, so it
    # collects the file types of *every* book in the document, not just this
    # one -- confirm whether './/files/file/@type' was intended.
    types = data.xpath('//fb2-book//files/file/@type')
    fmt_set = _parse_ebook_formats(' '.join(types))
    sRes.formats = ', '.join(fmt_set)
    return sRes
def search(self, query, max_results=10, timeout=60):
    # Query LibGen by title and by author, then resolve the combined id
    # list into full records.  Any failure (typically unreachable mirrors)
    # is reported on stdout and the search yields nothing.
    try:
        ids = lg.search(query, 'title') + lg.search(query, 'author')
        results = lg.lookup(ids)
        print('Reached LibGen Mirrors.')
    except Exception as e:
        print(e)
        print(
            'pylibgen crashed. In most cases this is caused by unreachable LibGen Mirrors, try again in a few minutes.'
        )
        return
    self.num_results = len(results)
    # Every LibGen record is offered as a free, DRM-free download.
    for record in results:
        result = SearchResult()
        result.title = record['title']
        result.author = record['author']
        result.price = '$0.00'
        result.drm = SearchResult.DRM_UNLOCKED
        result.formats = record['extension']
        result.detail_item = record['md5']
        yield result
def search_google(query, max_results=10, timeout=60, write_html_to=None):
    '''Search Google Books and yield SearchResult objects.

    :param query: free-text search terms
    :param max_results: maximum number of results to yield
    :param timeout: network timeout in seconds
    :param write_html_to: optional path; when given, the fetched results
        page is serialised there for debugging
    '''
    url = 'https://www.google.com/search?tbm=bks&q=' + quote_plus(query)
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
        doc = parse_html(raw)
        if write_html_to is not None:
            praw = html.tostring(doc, encoding='utf-8')
            # Use a context manager so the debug dump is always closed
            # (the original leaked the file handle).
            with open(write_html_to, 'wb') as dump:
                dump.write(praw)
        for data in doc.xpath('//div[@id="rso"]//div[@class="g"]'):
            if counter <= 0:
                break
            details_href = ''.join(data.xpath('.//h3/a/@href'))
            if not details_href:
                continue
            title = ''.join(data.xpath('.//h3/a//text()'))
            authors = data.xpath('descendant::div[@class="s"]//a[@class="fl" and @href]//text()')
            # Trailing links like "Preview"/"Read" are not author names.
            while authors and authors[-1].strip().lower() in ('preview', 'read', 'more editions'):
                authors = authors[:-1]
            if not authors:
                continue
            author = ' & '.join(authors)
            counter -= 1
            s = SearchResult()
            s.title = title.strip()
            s.author = author.strip()
            s.detail_item = details_href.strip()
            s.drm = SearchResult.DRM_UNKNOWN
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search the Bubok reseller endpoint and yield DRM-free results.'''
    url = 'http://www.bubok.pt/resellers/calibre_search/' + urllib.quote_plus(
        query)
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[contains(@class, "libro")]'):
            if counter <= 0:
                break
            book_url = ''.join(data.xpath('.//div[@class="url"]/text()'))
            # Skip malformed entries with no detail URL; the original
            # yielded them anyway, unlike every sibling store plugin.
            if not book_url:
                continue
            title = ''.join(data.xpath('.//div[@class="titulo"]/text()'))
            author = ''.join(data.xpath('.//div[@class="autor"]/text()'))
            price = ''.join(data.xpath('.//div[@class="precio"]/text()'))
            formats = ''.join(data.xpath('.//div[@class="formatos"]/text()'))
            cover = ''.join(data.xpath('.//div[@class="portada"]/text()'))
            counter -= 1
            s = SearchResult()
            s.title = title.strip()
            s.author = author.strip()
            s.detail_item = book_url.strip()
            s.price = price.strip()
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = formats.strip()
            s.cover_url = cover.strip()
            yield s
def search(self, query, max_results=10, timeout=60):
    # Scrape the Legimi search listing page.
    search_url = 'http://www.legimi.com/pl/ebooki/?szukaj=' + urllib.quote_plus(
        query)
    br = browser()
    remaining = max_results
    with closing(br.open(search_url, timeout=timeout)) as response:
        doc = html.fromstring(response.read())
        for book in doc.xpath('//div[@id="listBooks"]/div'):
            if remaining <= 0:
                break
            href = ''.join(book.xpath('.//a[1]/@href'))
            if not href:
                continue
            cover = ''.join(book.xpath('.//img[1]/@src'))
            title = ''.join(book.xpath('.//span[@class="bookListTitle ellipsis"]/text()'))
            author = ''.join(book.xpath('.//span[@class="bookListAuthor ellipsis"]/text()'))
            price = ''.join(book.xpath('.//div[@class="bookListPrice"]/span/text()'))
            remaining -= 1
            result = SearchResult()
            # Cover URLs are protocol-relative on the site.
            result.cover_url = 'http:' + cover
            result.title = title.strip()
            result.author = author.strip()
            result.price = price
            result.detail_item = 'http://www.legimi.com/' + href.strip()
            yield result
def search(self, query, max_results=10, timeout=60):
    # Walk the paginated listing until enough books were collected or
    # there is no "next" link any more.
    br = browser()
    page_no = 1
    remaining = max_results
    while remaining:
        page_url = ('https://www.swiatebookow.pl/ebooki/?q=' + quote(query) +
                    '&page=' + str(page_no))
        with closing(br.open(page_url, timeout=timeout)) as response:
            doc = html.fromstring(response.read().decode('utf-8'))
            for book in doc.xpath('//div[@class="category-item-container"]//div[@class="book-large"]'):
                if remaining <= 0:
                    break
                href = ''.join(book.xpath('./a/@href'))
                if not href:
                    continue
                cover = ''.join(book.xpath('.//div[@class="cover-xs"]/img/@src'))
                # Main price and sub-price fragments are concatenated.
                price = ''.join(book.xpath('.//span[@class="item-price"]/text()') +
                                book.xpath('.//span[@class="sub-price"]/text()'))
                title = ''.join(book.xpath('.//h3/text()'))
                author = ', '.join(book.xpath('.//div[@class="details"]/p/a/text()'))
                remaining -= 1
                result = SearchResult()
                result.cover_url = 'https://www.swiatebookow.pl' + cover
                result.title = title.strip()
                result.author = author.strip()
                result.price = price
                result.detail_item = 'https://www.swiatebookow.pl' + href
                result.drm = SearchResult.DRM_UNLOCKED
                yield result
            if not doc.xpath('//div[@class="paging_bootstrap pagination"]//a[@class="next"]'):
                break
        page_no += 1
def search(self, query, max_results=10, timeout=60):
    '''
    Searches LibGen for Books. Since the mirror links are not direct
    downloads, it should not provide these as `s.downloads`.
    '''
    debug_print('Libgen Fiction::__init__.py:LibgenStore:search:query =', query)
    hits = self.libgen.search(query)
    # Slicing clamps automatically, so no explicit min() is needed.
    for hit in hits.results[:max_results]:
        debug_print('Libgen Fiction::__init__.py:LibgenStore:search:result.title =', hit.title)
        for mirror in hit.mirrors[:1]:  # Calibre only shows 1 anyway
            debug_print('Libgen Fiction::__init__.py:LibgenStore:search:result.mirror.url =', mirror.url)
            result = SearchResult()
            result.store_name = PLUGIN_NAME
            result.cover_url = hit.image_url
            result.title = '{} ({}, {}{})'.format(
                hit.title, hit.language, mirror.size, mirror.unit)
            result.author = hit.authors
            result.price = '0.00'
            result.detail_item = hit.md5
            result.drm = SearchResult.DRM_UNLOCKED
            result.formats = mirror.format
            result.plugin_author = PLUGIN_AUTHORS
            debug_print('Libgen Fiction::__init__.py:LibgenStore:search:s =', result)
            yield result
def search(self, query, max_results=10, timeout=60):
    # wolnelektury.pl hosts free books; every format link found on a
    # result is exposed as a direct download.
    url = 'http://wolnelektury.pl/szukaj?q=' + urllib.quote_plus(query.encode('utf-8'))
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as response:
        doc = html.fromstring(response.read())
        for entry in doc.xpath('//li[@class="Book-item"]'):
            if remaining <= 0:
                break
            href = ''.join(entry.xpath('.//div[@class="title"]/a/@href'))
            if not href:
                continue
            cover = ''.join(entry.xpath('.//div[@class="cover-area"]//img/@src'))
            title = ''.join(entry.xpath('.//div[@class="title"]/a[1]/text()'))
            author = ', '.join(entry.xpath('.//div[@class="author"]/a/text()'))
            remaining -= 1
            result = SearchResult()
            for link in entry.xpath('.//div[@class="book-box-formats"]/span/a'):
                fmt = ''.join(link.xpath('./text()'))
                result.downloads[fmt] = 'http://wolnelektury.pl' + link.get('href')
            result.cover_url = 'http://wolnelektury.pl' + cover.strip()
            result.title = title.strip()
            result.author = author
            result.price = '0,00 zł'
            result.detail_item = 'http://wolnelektury.pl' + href
            result.formats = ', '.join(result.downloads.keys())
            result.drm = SearchResult.DRM_UNLOCKED
            yield result
def search(self, query, max_results=20, timeout=60):
    # Iterate publio.pl result pages until enough items were collected
    # or the "next page" link disappears.
    br = browser()
    remaining = max_results
    page_no = 1
    while remaining:
        page_url = ('http://www.publio.pl/e-booki,strona' + str(page_no) +
                    '.html?q=' + quote(query))  # noqa
        with closing(br.open(page_url, timeout=timeout)) as response:
            doc = html.fromstring(response.read())
            for tile in doc.xpath('//div[@class="products-list"]//div[@class="product-tile"]'):
                if remaining <= 0:
                    break
                href = ''.join(tile.xpath('.//a[@class="product-tile-cover"]/@href'))
                if not href:
                    continue
                cover = ''.join(tile.xpath('.//img[@class="product-tile-cover-photo"]/@src'))
                title = ''.join(tile.xpath('.//span[@class="product-tile-title-long"]/text()'))
                author = ', '.join(tile.xpath('.//span[@class="product-tile-author"]/a/text()'))
                price = ''.join(tile.xpath('.//div[@class="product-tile-price-wrapper "]/a/ins/text()'))
                # The cover's alt text has the form "<title> - ebook <formats>".
                formats = ''.join(tile.xpath('.//a[@class="product-tile-cover"]/img/@alt')).split(' - ebook ')[1]
                remaining -= 1
                result = SearchResult()
                result.cover_url = 'http://www.publio.pl' + cover
                result.title = title.strip()
                result.author = author
                result.price = price
                result.detail_item = 'http://www.publio.pl' + href.strip()
                result.formats = formats.upper().strip()
                yield result
            if not doc.xpath('boolean(//a[@class="next"])'):
                break
        page_no += 1
def search(self, query, max_results=10, timeout=60):
    '''Search empik.com.  A second request per hit fetches the available
    format list from the book's detail page.'''
    url = 'http://www.empik.com/ebooki/ebooki,3501,s?resultsPP=' + str(max_results) + '&q=' + urllib.quote(query)
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[@class="search-list-item"]'):
            if counter <= 0:
                break
            id = ''.join(data.xpath('.//div[@class="name"]/a/@href'))
            if not id:
                continue
            cover_url = ''.join(data.xpath('.//a/img[@class="lazy"]/@lazy-img'))
            author = ', '.join(data.xpath('.//div[@class="smartAuthorWrapper"]/a/text()'))
            title = ''.join(data.xpath('.//div[@class="name"]/a/@title'))
            price = ''.join(data.xpath('.//div[@class="price"]/text()'))
            # Fetch the detail page with a shorter timeout to discover the
            # ebook formats; each link text carries an "ebook, " prefix that
            # is stripped via re.sub.
            with closing(br.open('http://empik.com' + id.strip(), timeout=timeout/4)) as nf:
                idata = html.fromstring(nf.read())
                crawled = idata.xpath('.//a[(@class="chosen hrefstyle") or (@class="connectionsLink hrefstyle")]/text()')
                formats = ','.join([re.sub('ebook, ','', x.strip()) for x in crawled if 'ebook' in x])
            counter -= 1
            s = SearchResult()
            s.cover_url = cover_url
            # Titles look like "<title> - ..."; keep the part before ' - '.
            s.title = title.split(' - ')[0]
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = 'http://empik.com' + id.strip()
            s.formats = formats.upper().strip()
            yield s
def search(self, query, max_results=10, timeout=60):
    # Scrape the legimi.pl search results grid.
    search_url = ('https://www.legimi.pl/ebooki/?sort=score&searchphrase=' +
                  quote_plus(query))
    br = browser()
    remaining = max_results
    with closing(br.open(search_url, timeout=timeout)) as response:
        doc = html.fromstring(response.read())
        for entry in doc.xpath('//div[@class="book-search row auto-clear"]/div'):
            if remaining <= 0:
                break
            href = ''.join(entry.xpath('.//div[@class="panel-body"]/a/@href'))
            if not href:
                continue
            cover = ''.join(entry.xpath('.//div[@class="img-content"]/img/@data-src'))
            title = ''.join(entry.xpath('.//a[@class="book-title clampBookTitle"]/text()'))
            author = ' '.join(entry.xpath('.//div[@class="authors-container clampBookAuthors"]/a/text()'))
            remaining -= 1
            result = SearchResult()
            result.cover_url = cover
            result.title = title.strip()
            result.author = author.strip()
            result.detail_item = 'https://www.legimi.pl' + href.strip()
            result.drm = SearchResult.DRM_UNLOCKED
            yield result
def search(self, query, max_results=10, timeout=60):
    # WHSmith results are reported as DRM-locked ePub only.
    url = ('https://www.whsmith.co.uk/search?keywordCategoryId=wc_dept_ebooks&results=60'
           '&page=1&keywords=' + quote(query))
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as response:
        doc = html.fromstring(response.read())
        for product in doc.xpath('//li[@class="product"]'):
            if remaining <= 0:
                break
            href = ''.join(product.xpath('./a[@class="product_image_wrap"]/@href'))
            if not href:
                continue
            detail_url = 'https://www.whsmith.co.uk' + href
            cover = ''.join(product.xpath('.//img[@class="product_image"]/@src'))
            title = ''.join(product.xpath('.//h4[@class="product_title"]/text()'))
            author = ', '.join(product.xpath('.//span[@class="product_second"]/text()'))
            price = ''.join(product.xpath('.//span[@class="price"]/text()'))
            remaining -= 1
            result = SearchResult()
            result.cover_url = cover
            result.title = title.strip()
            result.author = author.strip()
            result.price = price
            result.drm = SearchResult.DRM_LOCKED
            result.detail_item = detail_url
            result.formats = 'ePub'
            yield result
def search(self, query, max_results=20, timeout=60):
    # escapemagazine.pl results are DRM-free PDFs.
    url = ('http://www.escapemagazine.pl/wyszukiwarka?query=' +
           urllib.quote_plus(query))
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as response:
        doc = html.fromstring(response.read())
        for item in doc.xpath('//div[@class="item item_short"]'):
            if remaining <= 0:
                break
            href = ''.join(item.xpath('.//h2[@class="title"]/a[1]/@href'))
            if not href:
                continue
            title = ''.join(item.xpath('.//h2[@class="title"]/a[1]/text()'))
            author = ''.join(item.xpath('.//div[@class="author"]/text()'))
            price = ''.join(item.xpath('.//span[@class="price_now"]/strong/text()')) + ' zł'
            cover = ''.join(item.xpath('.//img[@class="cover"]/@src'))
            remaining -= 1
            result = SearchResult()
            result.cover_url = cover
            result.title = title.strip()
            result.author = author.strip()
            result.price = price
            result.detail_item = 'http://www.escapemagazine.pl' + href.strip()
            result.drm = SearchResult.DRM_UNLOCKED
            result.formats = 'PDF'
            yield result
def search(self, query, max_results=12, timeout=60):
    '''Search virtualo.pl and yield results.

    DRM status is derived from the "protection" label: a watermark or no
    protection ("Watermark"/"Brak") counts as DRM-free.
    '''
    url = 'http://virtualo.pl/?q=' + urllib.quote(query)
    br = browser()
    no_drm_pattern = re.compile(r'Watermark|Brak')
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[@class="products-list-wrapper"]//li[@class="product "]'):
            if counter <= 0:
                break
            # Strip any '?q=' query suffix from the detail link.
            id = ''.join(data.xpath('.//div[@class="cover-wrapper"]//a/@href')).split(r'?q=')[0]
            if not id:
                continue
            price = ''.join(data.xpath('.//div[@class="information"]//div[@class="price"]/text()'))
            cover_url = ''.join(data.xpath('.//img[@class="cover"]/@src'))
            title = ''.join(data.xpath('.//div[@class="title"]/a//text()'))
            author = ', '.join(data.xpath('.//div[@class="information"]//div[@class="authors"]/a//text()'))
            formats = [form.strip() for form in data.xpath('.//div[@class="information"]//div[@class="format"]/a//text()')]
            nodrm = no_drm_pattern.search(''.join(data.xpath('.//div[@class="protection"]/text()')))
            counter -= 1
            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            # Polish decimal comma.  str.replace fixes the original
            # re.sub('\.', ...) which relied on an invalid escape sequence.
            s.price = price.strip().replace('.', ',')
            s.detail_item = 'http://virtualo.pl' + id
            s.formats = ', '.join(formats).upper()
            s.drm = SearchResult.DRM_UNLOCKED if nodrm else SearchResult.DRM_LOCKED
            yield s
def search(self, query, max_results=10, timeout=60):
    # Scrape Google Books search results (legacy page layout).
    url = ('http://www.google.com/search?tbm=bks&q=' +
           urllib.quote_plus(query))
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as response:
        doc = html.fromstring(response.read())
        for entry in doc.xpath('//ol/li'):
            if remaining <= 0:
                break
            href = ''.join(entry.xpath('.//h3/a/@href'))
            if not href:
                continue
            title = ''.join(entry.xpath('.//h3/a//text()'))
            authors = entry.xpath('.//span[contains(@class, "f")]//a//text()')
            # Drop trailing non-author links ("Preview", "Read", ...).
            while authors and authors[-1].strip().lower() in ('preview', 'read', 'more editions'):
                authors = authors[:-1]
            if not authors:
                continue
            remaining -= 1
            result = SearchResult()
            result.title = title.strip()
            result.author = ', '.join(authors).strip()
            result.detail_item = href.strip()
            result.drm = SearchResult.DRM_UNKNOWN
            yield result
def search(self, query, max_results=25, timeout=60):
    '''Search ebookpoint.pl; the site expects the query in ISO-8859-2.'''
    # NOTE(review): query.decode('utf-8') assumes a byte-string query
    # (Python 2 semantics) -- confirm this plugin never runs under Python 3.
    url = 'http://ebookpoint.pl/search.scgi?szukaj=' + urllib.quote_plus(query.decode('utf-8').encode('iso-8859-2')) + '&serwisyall=0&x=0&y=0'
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[@class="book-list"]/ul[2]/li'):
            if counter <= 0:
                break
            id = ''.join(data.xpath('.//a[@class="cover"]/@href'))
            if not id:
                continue
            cover_url = ''.join(data.xpath('.//a[@class="cover"]/img/@src'))
            title = ''.join(data.xpath('.//h3/a/@title'))
            # Strip the "eBook." label.  Note the regex '.' matches any
            # character, so the char after "eBook" is removed whatever it is.
            title = re.sub('eBook.', '', title)
            author = ''.join(data.xpath('.//p[@class="author"]/text()'))
            price = ''.join(data.xpath('.//p[@class="price"]/ins/text()'))
            formats = ', '.join(data.xpath('.//div[@class="ikony"]/span/text()'))
            counter -= 1
            s = SearchResult()
            # Request a smaller cover variant (72x9... -> 65x8...).
            s.cover_url = 'http://ebookpoint.pl' + re.sub('72x9', '65x8',cover_url)
            s.title = title.strip()
            s.author = author.strip()
            # Polish decimal comma instead of a dot.
            s.price = re.sub(r'\.',',',price)
            s.detail_item = id.strip()
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = formats.upper()
            yield s
def search(self, query, max_results=12, timeout=60):
    '''Search virtualo.pl (legacy layout), restricted to format ids 4,6,3.

    DRM-free is detected from the protection hint text: "Znak wodny"
    (watermark) or "Brak" (none).
    '''
    url = 'http://virtualo.pl/?q=' + urllib.quote(query) + '&f=format_id:4,6,3'
    br = browser()
    no_drm_pattern = re.compile(r'Znak wodny|Brak')
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[@id="content"]//div[@class="list_box list_box_border"]'):
            if counter <= 0:
                break
            id = ''.join(data.xpath('.//div[@class="list_middle_left"]//a/@href'))
            if not id:
                continue
            price = ''.join(data.xpath('.//span[@class="price"]/text() | .//span[@class="price abbr"]/text()'))
            cover_url = ''.join(data.xpath('.//div[@class="list_middle_left"]//a//img/@src'))
            title = ''.join(data.xpath('.//div[@class="list_title list_text_left"]/a/text()'))
            author = ', '.join(data.xpath('.//div[@class="list_authors list_text_left"]/a/text()'))
            # Formats are inferred from the icon file names, e.g. "..._epub.png".
            formats = [ form.split('_')[-1].replace('.png', '') for form in data.xpath('.//div[@style="width:55%;float:left;text-align:left;height:18px;"]//a/span/img/@src')]
            nodrm = no_drm_pattern.search(''.join(data.xpath('.//div[@style="width:45%;float:right;text-align:right;height:18px;"]//span[@class="prompt_preview"]/text()')))
            counter -= 1
            s = SearchResult()
            # Drop anything after the first '.jpg' in the cover URL.
            s.cover_url = cover_url.split('.jpg')[0] + '.jpg'
            s.title = title.strip()
            s.author = author.strip()
            s.price = price + ' zł'
            # NOTE(review): split('http://')[0] presumably guards against the
            # href already being absolute -- confirm; for a relative href it
            # is a no-op.
            s.detail_item = 'http://virtualo.pl' + id.strip().split('http://')[0]
            s.formats = ', '.join(formats).upper()
            s.drm = SearchResult.DRM_UNLOCKED if nodrm else SearchResult.DRM_LOCKED
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search millsandboon.co.uk ebooks (all results are DRM-locked).'''
    base_url = 'http://www.millsandboon.co.uk'
    url = base_url + '/search?format=ebook&q=' + urllib2.quote(query)
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//article[contains(@class, "group")]'):
            if counter <= 0:
                break
            id_ = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/@href')).strip()
            # Test for a missing link *before* prefixing the base URL; the
            # original prefixed first, so its emptiness check could never
            # trigger and bogus entries were yielded.
            if not id_:
                continue
            id_ = base_url + id_
            cover_url = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/img/@src'))
            title = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/img/@alt')).strip()
            author = ''.join(data.xpath('.//a[@class="author"]/text()'))
            price = ''.join(data.xpath('.//li[@class="productAttribute" and child::span[text()="eBook"]]/input/@value'))
            format_ = ''.join(data.xpath('.//p[@class="doc-meta-format"]/span[last()]/text()'))
            counter -= 1
            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = id_
            s.drm = SearchResult.DRM_LOCKED
            s.formats = format_
            yield s
def open_search(url, query, max_results=10, timeout=60):
    # Resolve the OpenSearch description, build the query URL, then walk
    # the returned Atom feed.  Only open-access EPUB/AZW3 links become
    # downloads; everything is free and DRM-free.
    description = Description(url)
    template = description.get_best_template()
    if not template:
        return
    oquery = Query(template)
    # set up initial values
    oquery.searchTerms = query
    oquery.count = max_results
    feed_url = oquery.url()
    remaining = max_results
    br = browser()
    with closing(br.open(feed_url, timeout=timeout)) as response:
        feed = safe_xml_fromstring(response.read())
        for entry in feed.xpath('//*[local-name() = "entry"]'):
            if remaining <= 0:
                break
            remaining -= 1
            result = SearchResult()
            result.detail_item = ''.join(
                entry.xpath('./*[local-name() = "id"]/text()')).strip()
            for link in entry.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = BASE_URL + link.get('href')
                mimetype = link.get('type')
                link_title = link.get('title')
                ext = None
                if rel and href and mimetype:
                    if rel == 'http://opds-spec.org/image/thumbnail':
                        result.cover_url = href
                    elif rel == 'http://opds-spec.org/acquisition/open-access':
                        if mimetype == 'application/epub+zip' and link_title == 'Recommended compatible epub':
                            ext = 'EPUB'
                        elif mimetype == 'application/x-mobipocket-ebook':
                            ext = 'AZW3'
                    if ext:
                        result.downloads[ext] = href
            result.formats = ', '.join(result.downloads.keys()).strip()
            result.title = ' '.join(
                entry.xpath('./*[local-name() = "title"]//text()')).strip()
            result.author = ', '.join(
                entry.xpath('./*[local-name() = "author"]//*[local-name() = "name"]//text()')).strip()
            result.drm = SearchResult.DRM_UNLOCKED
            result.price = '$0.00'
            yield result
def open_search(url, query, max_results=10, timeout=60):
    # Generic OpenSearch/OPDS search: fetch the description document,
    # fill the best URL template, and translate each Atom entry into a
    # SearchResult.
    description = Description(url)
    template = description.get_best_template()
    if not template:
        return
    oquery = Query(template)
    # set up initial values
    oquery.searchTerms = query
    oquery.count = max_results
    feed_url = oquery.url()
    remaining = max_results
    br = browser()
    with closing(br.open(feed_url, timeout=timeout)) as response:
        feed = safe_xml_fromstring(response.read())
        for entry in feed.xpath('//*[local-name() = "entry"]'):
            if remaining <= 0:
                break
            remaining -= 1
            result = SearchResult()
            result.detail_item = ''.join(
                entry.xpath('./*[local-name() = "id"]/text()')).strip()
            for link in entry.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                mimetype = link.get('type')
                if not (rel and href and mimetype):
                    continue
                if 'http://opds-spec.org/thumbnail' in rel:
                    result.cover_url = href
                elif 'http://opds-spec.org/image/thumbnail' in rel:
                    result.cover_url = href
                elif 'http://opds-spec.org/acquisition/buy' in rel:
                    result.detail_item = href
                elif 'http://opds-spec.org/acquisition/sample' in rel:
                    pass
                elif 'http://opds-spec.org/acquisition' in rel:
                    # Derive the format from the MIME type, e.g. '.epub'.
                    ext = guess_extension(mimetype)
                    if ext:
                        result.downloads[ext[1:].upper().strip()] = href
            result.formats = ', '.join(result.downloads.keys()).strip()
            result.title = ' '.join(
                entry.xpath('./*[local-name() = "title"]//text()')).strip()
            result.author = ', '.join(
                entry.xpath('./*[local-name() = "author"]//*[local-name() = "name"]//text()')).strip()
            price_elem = entry.xpath('.//*[local-name() = "price"][1]')
            if price_elem:
                price_elem = price_elem[0]
                currency = price_elem.get('currencycode', '')
                amount = ''.join(price_elem.xpath('.//text()')).strip()
                result.price = (currency + ' ' + amount).strip()
            yield result
def search(self, query, max_results=10, timeout=60):
    '''Search an Amazon EU Kindle store.

    Amazon has served several different result-page layouts over time;
    the layout is detected from the classes on the #atfResults div and a
    matching set of XPaths is selected before scraping the items.
    '''
    # Percent-encode non-ASCII via backslashreplace, then rewrite the
    # escapes into URL escapes.
    url = self.search_url + query.encode('ascii', 'backslashreplace').replace(
        '%', '%25').replace('\\x', '%').replace(' ', '+')
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        allText = f.read()
        doc = html.fromstring(allText)  # .decode('latin-1', 'replace'))
        format_xpath2 = ''
        if doc.xpath('//div[@id = "atfResults" and contains(@class, "grid")]'):
            # print('grid form')
            data_xpath = '//div[contains(@class, "prod")]'
            format_xpath = (
                './/ul[contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
            )
            asin_xpath = '@name'
            cover_xpath = './/img[contains(@class, "productImage")]/@src'
            title_xpath = './/h3[@class="newaps"]/a//text()'
            author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
            price_xpath = (
                './/ul[contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
            )
        elif doc.xpath('//div[@id = "atfResults" and contains(@class, "ilresults")]'):
            # print('ilo form')
            data_xpath = '//li[(@class="ilo")]'
            format_xpath = (
                './/ul[contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
            )
            asin_xpath = '@name'
            cover_xpath = './div[@class = "ilf"]/a/img[contains(@class, "ilo")]/@src'
            title_xpath = './/h3[@class="newaps"]/a//text()'
            author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
            # Results can be in a grid (table) or a column
            price_xpath = (
                './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
            )
        elif doc.xpath('//div[@id = "atfResults" and contains(@class, "s-result-list-parent-container")]'):
            # print('new list form')
            data_xpath = '//li[contains(@class, "s-result-item")]'
            format_xpath = './/a[contains(@class, "a-size-small")]/text()'
            format_xpath2 = './/h3[contains(@class, "s-inline")]/text()'
            asin_xpath = '@data-asin'
            cover_xpath = './/img[contains(@class, "cfMarker")]/@src'
            title_xpath = './/h2[contains(@class, "s-access-title")]/text()'
            author_xpath = (
                './/div[contains(@class, "a-fixed-left-grid-col")]'
                '/div/div/span//text()')
            price_xpath = (
                './/div[contains(@class, "a-spacing-none")]/a/span[contains(@class, "s-price")]/text()'
            )
        elif doc.xpath('//div[@id = "atfResults" and contains(@class, "list")]'):
            # print('list form')
            data_xpath = '//li[@class="s-result-item"]'
            format_xpath = './/a[contains(@class, "a-size-small")]/text()'
            format_xpath2 = './/h3[contains(@class, "s-inline")]/text()'
            asin_xpath = '@data-asin'
            cover_xpath = './/img[contains(@class, "cfMarker")]/@src'
            title_xpath = './/h2[contains(@class, "s-access-title")]/text()'
            author_xpath = (
                './/div[contains(@class, "a-fixed-left-grid-col")]'
                '/div/div/span//text()')
            price_xpath = ('.//span[contains(@class, "s-price")]/text()')
        else:
            # Unknown layout -- give up rather than scrape garbage.
            print('unknown result table form for Amazon EU search')
            # with open("c:/amazon_search_results.html", "w") as out:
            #     out.write(allText)
            return
        for data in doc.xpath(data_xpath):
            if counter <= 0:
                break
            # Even though we are searching digital-text only Amazon will still
            # put in results for non Kindle books (authors pages). Se we need
            # to explicitly check if the item is a Kindle book and ignore it
            # if it isn't.
            format_ = ''.join(data.xpath(format_xpath))
            if 'kindle' not in format_.lower():
                if format_xpath2:
                    format_ = ''.join(data.xpath(format_xpath2))
                    if 'kindle' not in format_.lower():
                        # print(etree.tostring(data, pretty_print=True))
                        continue
            # We must have an asin otherwise we can't easily reference the
            # book later.
            asin = data.xpath(asin_xpath)
            if asin:
                asin = asin[0]
            else:
                continue
            cover_url = ''.join(data.xpath(cover_xpath))
            title = ''.join(data.xpath(title_xpath))
            authors = ''.join(data.xpath(author_xpath))
            # Strip the localized "by "/"von " article and normalise the
            # localized "and" to '&'.
            authors = re.sub('^' + self.author_article, '', authors)
            authors = re.sub(self.and_word, ' & ', authors)
            # Drop a trailing parenthesised suffix beginning with a digit.
            mo = re.match(r'(.*)(\(\d.*)$', authors)
            if mo:
                authors = mo.group(1).strip()
            # NOTE(review): data.xpath(price_xpath)[-1] raises IndexError when
            # no price node matches -- confirm whether that can happen here.
            price = ''.join(data.xpath(price_xpath)[-1])
            counter -= 1
            s = SearchResult()
            s.cover_url = cover_url.strip()
            s.title = title.strip()
            s.author = authors.strip()
            s.price = price.strip()
            s.detail_item = asin.strip()
            s.drm = SearchResult.DRM_UNKNOWN
            s.formats = 'Kindle'
            yield s
def search(self, query, max_results=10, timeout=60):
    # Scrape ebook.de search results; formats are detected from the
    # "bindername" badges attached to each article.
    url = ('http://www.ebook.de/de/pathSearch?nav=52122&searchString=' +
           urllib.parse.quote(query))
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as response:
        doc = html.fromstring(response.read())
        for article in doc.xpath('//div[contains(@class, "articlecontainer")]'):
            if remaining <= 0:
                break
            info = article.xpath('./div[contains(@class, "articleinfobox")]')
            if not info:
                continue
            info = info[0]
            id_ = ''.join(info.xpath('./a/@name')).strip()
            if not id_:
                continue
            title = ''.join(info.xpath('./h3[@class="title"]/a/text()')).strip()
            author = ''.join(info.xpath('.//div[@class="author"]/text()')).strip()
            # Drop the German "von " ("by ") prefix.
            if author.startswith('von'):
                author = author[4:]
            has_pdf = info.xpath(
                'boolean(.//span[@class="bindername" and contains(text(), "pdf")]/text())')
            has_epub = info.xpath(
                'boolean(.//span[@class="bindername" and contains(text(), "epub")]/text())')
            has_mobi = info.xpath(
                'boolean(.//span[@class="bindername" and contains(text(), "mobipocket")]/text())')
            cover = ''.join(article.xpath('.//div[@class="coverimg"]/a/img/@src'))
            price = ''.join(article.xpath('.//div[@class="preis"]/text()')).replace('*', '').strip()
            remaining -= 1
            result = SearchResult()
            result.cover_url = cover
            result.title = title.strip()
            result.author = author.strip()
            result.price = price
            result.drm = SearchResult.DRM_UNKNOWN
            result.detail_item = id_
            fmts = []
            if has_epub:
                fmts.append('ePub')
            if has_pdf:
                fmts.append('PDF')
            if has_mobi:
                fmts.append('MOBI')
            result.formats = ', '.join(fmts)
            yield result
def search(self, query, max_results=10, timeout=60):
    '''Search diesel-ebooks.com.

    The site may answer with either a result listing or, for a unique
    match, the book's detail page directly; both shapes are handled.
    '''
    url = 'http://www.diesel-ebooks.com/index.php?page=seek&id[m]=&id[c]=scope%253Dinventory&id[q]=' + urllib.quote_plus(query)
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        book_url = f.geturl()
        doc = html.fromstring(f.read())
        if doc.xpath('not(boolean(//select[contains(@id, "selection")]))'):
            # This is the page for an individual book
            id = ''.join(doc.xpath('//div[@class="price_fat"]//a/@href'))
            mo = re.search('(?<=id=).+?(?=&)', id)
            if not mo:
                # BUG FIX: the original did `yield None` here and then
                # crashed on mo.group(); bail out cleanly instead.
                return
            id = mo.group()
            cover_url = ''.join(doc.xpath('//div[@class="cover"]/a/@href'))
            title = ''.join(doc.xpath('//div[@class="desc_fat"]//h1/text()'))
            author = ''.join(doc.xpath('//div[@class="desc_fat"]//span[@itemprop="author"]/text()'))
            price = ''.join(doc.xpath('//div[@class="price_fat"]//h1/text()'))
            formats = ', '.join(doc.xpath('//div[@class="desc_fat"]//p[contains(text(), "Format")]/text()'))
            # Keep only the text after the "Format:" label.
            a, b, formats = formats.partition('Format:')
            drm = SearchResult.DRM_LOCKED
            if 'drm free' in formats.lower():
                drm = SearchResult.DRM_UNLOCKED
            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = book_url
            s.formats = formats
            s.drm = drm
            yield s
        else:
            for data in doc.xpath('//div[contains(@class, "item")]'):
                if counter <= 0:
                    break
                id = ''.join(data.xpath('div[@class="cover"]/a/@href'))
                if not id or '/item/' not in id:
                    continue
                cover_url = ''.join(data.xpath('div[@class="cover"]//img/@src'))
                title = ''.join(data.xpath('.//div[@class="content"]//h2/a/text()'))
                author = ''.join(data.xpath('.//div[@class="content"]/span//a/text()'))
                price = ''
                price_elem = data.xpath('.//div[@class="price_fat"]//h1/text()')
                if price_elem:
                    price = price_elem[0]
                formats = ', '.join(data.xpath('.//div[@class="book-info"]//text()')).strip()
                a, b, formats = formats.partition('Format:')
                drm = SearchResult.DRM_LOCKED
                if 'drm free' in formats.lower():
                    drm = SearchResult.DRM_UNLOCKED
                counter -= 1
                s = SearchResult()
                s.cover_url = cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price.strip()
                s.detail_item = id.strip()
                s.formats = formats
                s.drm = drm
                yield s
def search(self, query, max_results=10, timeout=60):
    '''
    XinXii's open search url is:
    http://www.xinxii.com/catalog-search/query/?keywords={searchTerms}&pw={startPage?}&doc_lang={docLang}&ff={docFormat},{docFormat},{docFormat}

    This url requires the docLang and docFormat. However, the search itself
    sent to XinXii does not require them. They can be ignored. We cannot
    push this into the stanard OpenSearchOPDSStore search because of the
    required attributes.

    XinXii doesn't return all info supported by OpenSearchOPDSStore
    search function so this one is modified to remove parts that are
    used.
    '''
    url = 'http://www.xinxii.com/catalog-search/query/?keywords=' + quote_plus(
        query)
    counter = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        # NOTE(review): other OPDS stores in this file parse the feed with
        # safe_xml_fromstring; plain etree.fromstring is used here -- confirm
        # whether the hardened parser should be used for consistency.
        doc = etree.fromstring(f.read())
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break
            counter -= 1
            s = SearchResult()
            s.detail_item = ''.join(
                data.xpath('./*[local-name() = "id"]/text()')).strip()
            for link in data.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                type = link.get('type')
                if rel and href and type:
                    if rel in ('http://opds-spec.org/thumbnail',
                               'http://opds-spec.org/image/thumbnail'):
                        s.cover_url = href
                    if rel == 'alternate':
                        # The human-readable page is the detail item.
                        s.detail_item = href
            # XinXii offers every title in these two formats.
            s.formats = 'EPUB, PDF'
            s.title = ' '.join(
                data.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(
                data.xpath(
                    './*[local-name() = "author"]//*[local-name() = "name"]//text()'
                )).strip()
            price_e = data.xpath('.//*[local-name() = "price"][1]')
            if price_e:
                price_e = price_e[0]
                currency_code = price_e.get('currencycode', '')
                price = ''.join(price_e.xpath('.//text()')).strip()
                s.price = currency_code + ' ' + price
                s.price = s.price.strip()
            yield s
def search_amazon(query, max_results=10, timeout=60, write_html_to=None,
                  base_url=SEARCH_BASE_URL, base_query=SEARCH_BASE_QUERY,
                  field_keywords='field-keywords'):
    '''
    Search Amazon's Kindle store and yield SearchResult objects.

    :param query: search terms placed into the ``field_keywords`` query arg
    :param max_results: cap on results yielded
    :param timeout: socket timeout in seconds
    :param write_html_to: optional path; the raw response is dumped there
        for debugging
    :param base_url: search endpoint (overridable for other Amazon sites)
    :param base_query: dict of fixed query parameters, copied before use
    :param field_keywords: name of the keywords query parameter
    '''
    uquery = base_query.copy()
    uquery[field_keywords] = query

    def asbytes(x):
        # urlencode is handed byte strings; encode any text values.
        if isinstance(x, type('')):
            x = x.encode('utf-8')
        return x

    # .items() instead of the Python-2-only .iteritems(); identical
    # behavior on py2 and keeps the function importable on py3.
    uquery = {asbytes(k): asbytes(v) for k, v in uquery.items()}
    # NOTE(review): bare `urllib.urlencode` is the py2 location — confirm
    # which urllib this module imports.
    url = base_url + '?' + urllib.urlencode(uquery).decode('ascii')

    br = browser(user_agent=get_user_agent())

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
    if write_html_to is not None:
        with open(write_html_to, 'wb') as f:
            f.write(raw)
    doc = html.fromstring(raw)
    try:
        results = doc.xpath('//div[@id="atfResults" and @class]')[0]
    except IndexError:
        # No results container at all: nothing to yield.
        return

    # Only the current result-list layout is supported; anything else
    # (old layouts, captcha pages) is silently skipped.
    if 's-result-list-parent-container' in results.get('class', ''):
        data_xpath = "descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-result-item ')]"
        format_xpath = './/a[@title="Kindle Edition"]/@title'
        asin_xpath = '@data-asin'
        cover_xpath = "descendant-or-self::img[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-image ')]/@src"
        title_xpath = "descendant-or-self::h2[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-title ')]//text()"
        author_xpath = './/span[starts-with(text(), "by ")]/following-sibling::span//text()'
        price_xpath = (
            'descendant::div[@class="a-row a-spacing-none" and'
            ' not(span[contains(@class, "kindle-unlimited")])]//span[contains(@class, "s-price")]//text()'
        )
    else:
        return

    # NOTE(review): iterates doc, not the `results` element — presumably
    # equivalent since there is one container; confirm against the markup.
    for data in doc.xpath(data_xpath):
        if counter <= 0:
            break

        # Even though we are searching digital-text only Amazon will still
        # put in results for non Kindle books (author pages). So we need
        # to explicitly check if the item is a Kindle book and ignore it
        # if it isn't.
        fmt = ''.join(data.xpath(format_xpath))
        if 'kindle' not in fmt.lower():
            continue

        # We must have an asin otherwise we can't easily reference the
        # book later.
        asin = data.xpath(asin_xpath)
        if asin:
            asin = asin[0]
        else:
            continue

        cover_url = ''.join(data.xpath(cover_xpath))

        title = ''.join(data.xpath(title_xpath))
        author = ''.join(data.xpath(author_xpath))
        try:
            # Strip the "by " prefix and any trailing " (…)" annotation.
            author = author.split('by ', 1)[1].split(" (")[0]
        except IndexError:
            # No "by " present; keep the raw author text.
            pass

        price = ''.join(data.xpath(price_xpath))

        counter -= 1

        s = SearchResult()
        s.cover_url = cover_url.strip()
        s.title = title.strip()
        s.author = author.strip()
        s.price = price.strip()
        s.detail_item = asin.strip()
        s.formats = 'Kindle'

        yield s
def search(self, query, max_results=10, timeout=60):
    '''
    Search the e-knigi.net store and yield SearchResult objects.

    Only runs for queries of two or more cyrillic/digit/space characters,
    since the store stocks Bulgarian titles. Handles both the store's
    result-list page and the detail page it redirects to when exactly one
    product matches.
    '''
    # Check for cyrillic symbols before performing the search.
    # Decode only when given bytes: the old type(u'')(query, 'utf-8')
    # call raised TypeError whenever query was already a text string
    # (always the case on Python 3).
    uquery = query.strip()
    if isinstance(uquery, bytes):
        uquery = uquery.decode('utf-8')
    reObj = re.search(u'^[а-яА-Я\\d\\s]{2,}$', uquery)
    if not reObj:
        return

    base_url = 'http://e-knigi.net'
    url = base_url + '/virtuemart?page=shop.browse&search_category=0&search_limiter=anywhere&keyword=' + quote(
        query)

    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())

        # If the store finds only one product, it opens the detail view
        # directly; yield that single product and stop.
        for data in doc.xpath('//div[@class="prod_details"]'):
            s = SearchResult()
            s.cover_url = ''.join(
                data.xpath(
                    './/div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@src'
                )).strip()
            s.title = ''.join(
                data.xpath(
                    './/div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@alt'
                )).strip()
            s.author = ''.join(
                data.xpath(
                    './/div[@class="td_bg clearfix"]/div[@class="gk_product_tab"]/div/table/tr[3]/td[2]/text()'
                )).strip()
            s.price = ''.join(
                data.xpath(
                    './/span[@class="productPrice"]/text()')).strip()
            s.detail_item = url
            s.drm = SearchResult.DRM_UNLOCKED
            yield s
            return

        # Otherwise walk the store's result list.
        for data in doc.xpath('//div[@class="browseProductContainer"]'):
            if counter <= 0:
                break
            id = ''.join(data.xpath('.//a[1]/@href')).strip()
            if not id:
                continue

            title = ''.join(
                data.xpath('.//a[@class="gk_vm_product_image"]/img/@title')
            ).strip()
            author = ''.join(
                data.xpath('.//div[@style="float:left;width:90%"]/b/text()'
                           )).strip().replace('Автор: ', '')

            # The store's search is fuzzy; keep only hits that actually
            # mention the query in the title or author.
            if title.lower().find(
                    query.lower()) == -1 and author.lower().find(
                        query.lower()) == -1:
                continue

            counter -= 1

            s = SearchResult()
            s.cover_url = ''.join(
                data.xpath('.//a[@class="gk_vm_product_image"]/img/@src')
            ).strip()
            s.title = title
            s.author = author
            s.price = ''.join(
                data.xpath(
                    './/span[@class="productPrice"]/text()')).strip()
            s.detail_item = base_url + id
            s.drm = SearchResult.DRM_UNLOCKED
            yield s
def search_kobo(query, max_results=10, timeout=60, write_html_to=None):
    '''
    Search kobobooks.com and yield SearchResult objects.

    :param query: search terms
    :param max_results: cap on results yielded
    :param timeout: socket timeout in seconds
    :param write_html_to: optional path to dump the raw response for
        debugging
    '''
    from css_selectors import Select
    url = 'https://www.kobobooks.com/search/search.html?q=' + quote_plus(query)
    br = browser()

    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
    if write_html_to is not None:
        with open(write_html_to, 'wb') as f:
            f.write(raw)
    doc = html.fromstring(raw)
    select = Select(doc)
    for i, item in enumerate(select('.result-items .item-wrapper.book')):
        if i == max_results:
            break

        # First cover image, normalized to an https URL.
        cover_url = None
        for img in select('.item-image img[src]', item):
            cover_url = img.get('src')
            if cover_url.startswith('//'):
                cover_url = 'https:' + cover_url
            break

        # Title text and the detail link inside the first title element.
        title = None
        for p in select('p.title', item):
            title = etree.tostring(p, method='text', encoding='unicode').strip()
            url = None
            for a in select('a[href]', p):
                url = a.get('href')
                break
            break

        names = [
            etree.tostring(a, method='text', encoding='unicode').strip()
            for a in select('p.contributor-list a.contributor-name', item)
        ]
        authors = authors_to_string(names)

        price = None
        for p in select('p.price', item):
            price = etree.tostring(p, method='text', encoding='unicode').strip()
            break

        # Skip entries missing any of the essentials.
        if title and authors and url:
            s = SearchResult()
            s.cover_url = cover_url
            s.title = title
            s.author = authors
            s.price = price
            s.detail_item = url
            s.formats = 'EPUB'
            s.drm = SearchResult.DRM_UNKNOWN
            yield s
def search(self, query, max_results=10, timeout=60):
    '''
    Search an Amazon Kindle store (older result-page layouts) and yield
    SearchResult objects.

    Three historical layouts are recognized via the #atfResults class:
    "grid", "ilresults" and "list"; each gets its own set of XPath
    selectors. Anything else yields nothing.
    '''
    # NOTE(review): bytes.replace with str arguments is Python-2-only —
    # confirm this module still targets py2.
    url = self.search_url + query.encode(
        'ascii', 'backslashreplace').replace('%', '%25').replace(
            '\\x', '%').replace(' ', '+')
    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())

        if doc.xpath(
                '//div[@id = "atfResults" and contains(@class, "grid")]'):
            data_xpath = '//div[contains(@class, "prod")]'
            format_xpath = (
                './/ul[contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
            )
            asin_xpath = '@name'
            cover_xpath = './/img[contains(@class, "productImage")]/@src'
            title_xpath = './/h3[@class="newaps"]/a//text()'
            author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
            price_xpath = (
                './/ul[contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
            )
        elif doc.xpath(
                '//div[@id = "atfResults" and contains(@class, "ilresults")]'
        ):
            data_xpath = '//li[(@class="ilo")]'
            format_xpath = (
                './/ul[contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
            )
            asin_xpath = '@name'
            cover_xpath = './div[@class = "ilf"]/a/img[contains(@class, "ilo")]/@src'
            title_xpath = './/h3[@class="newaps"]/a//text()'
            author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
            # Results can be in a grid (table) or a column
            price_xpath = (
                './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
            )
        elif doc.xpath(
                '//div[@id = "atfResults" and contains(@class, "list")]'):
            data_xpath = '//div[contains(@class, "prod")]'
            format_xpath = (
                './/ul[contains(@class, "rsltL")]'
                '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
            )
            asin_xpath = '@name'
            cover_xpath = './/img[contains(@class, "productImage")]/@src'
            title_xpath = './/h3[@class="newaps"]/a//text()'
            author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
            price_xpath = (
                './/ul[contains(@class, "rsltL")]'
                '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
            )
        else:
            # Unrecognized layout; nothing to yield.
            return

        for data in doc.xpath(data_xpath):
            if counter <= 0:
                break

            # Even though we are searching digital-text only Amazon will still
            # put in results for non Kindle books (author pages). So we need
            # to explicitly check if the item is a Kindle book and ignore it
            # if it isn't.
            fmt = ''.join(data.xpath(format_xpath))
            if 'kindle' not in fmt.lower():
                continue

            # We must have an asin otherwise we can't easily reference the
            # book later.
            asin = data.xpath(asin_xpath)
            if asin:
                asin = asin[0]
            else:
                continue

            cover_url = ''.join(data.xpath(cover_xpath))

            title = ''.join(data.xpath(title_xpath))
            author = ''.join(data.xpath(author_xpath))
            try:
                # Strip the "by " prefix and trailing " (…)" annotation.
                # Narrowed from a bare except: only the missing-prefix
                # IndexError is expected here.
                author = author.split('by ', 1)[1].split(" (")[0]
            except IndexError:
                pass

            price = ''.join(data.xpath(price_xpath))

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url.strip()
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = asin.strip()
            s.formats = 'Kindle'

            yield s
def search(self, query, max_results=10, timeout=60):
    '''
    Search chitanka.info and yield SearchResult objects.

    Two passes: first the site's direct title search, then — with the
    same result budget (`counter`) — the books listed on each matching
    author's page, keeping only titles/authors that contain the query.
    All books are yielded DRM-free with FB2/EPUB/TXT download links.
    '''
    # check for cyrillic symbols before performing search
    # NOTE(review): unicode()/urllib2 are Python-2-only names — confirm
    # this module still targets py2.
    uquery = unicode(query.strip(), 'utf-8')
    reObj = re.search(u'^[а-яА-Я\\d\\s]{3,}$', uquery)
    if not reObj:
        return

    base_url = 'http://chitanka.info'
    url = base_url + '/search?q=' + urllib2.quote(query)
    counter = max_results

    # search for book title
    br = browser()
    try:
        with closing(br.open(url, timeout=timeout)) as f:
            f = unicode(f.read(), 'utf-8')
            doc = html.fromstring(f)

            for data in doc.xpath('//ul[@class="superlist booklist"]/li'):
                if counter <= 0:
                    break

                id = ''.join(data.xpath('.//a[@class="booklink"]/@href')).strip()
                if not id:
                    continue

                counter -= 1

                s = SearchResult()
                s.cover_url = ''.join(data.xpath('.//a[@class="booklink"]/img/@src')).strip()
                s.title = ''.join(data.xpath('.//a[@class="booklink"]/i/text()')).strip()
                s.author = ''.join(data.xpath('.//span[@class="bookauthor"]/a/text()')).strip()
                s.detail_item = id
                s.drm = SearchResult.DRM_UNLOCKED
                # Direct downloads; drop the .zip suffix to link the bare file.
                s.downloads['FB2'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-fb2"]/@href')).strip().replace('.zip', '')
                s.downloads['EPUB'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-epub"]/@href')).strip().replace('.zip', '')
                s.downloads['TXT'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-txt"]/@href')).strip().replace('.zip', '')
                s.formats = 'FB2, EPUB, TXT, SFB'
                yield s
    except urllib2.HTTPError as e:
        # 404 simply means no results page; anything else is unexpected.
        if e.code == 404:
            return
        else:
            raise

    # search for author names
    for data in doc.xpath('//ul[@class="superlist"][1]/li/dl/dt'):
        author_url = ''.join(data.xpath('.//a[contains(@href,"/person/")]/@href'))
        if author_url == '':
            continue
        if counter <= 0:
            break
        # Fetch each author's page with a fresh browser instance.
        br2 = browser()
        with closing(br2.open(base_url + author_url, timeout=timeout)) as f:
            if counter <= 0:
                break
            f = unicode(f.read(), 'utf-8')
            doc2 = html.fromstring(f)

            # search for book title
            for data in doc2.xpath('//ul[@class="superlist booklist"]/li'):
                if counter <= 0:
                    break

                id = ''.join(data.xpath('.//a[@class="booklink"]/@href')).strip()
                if not id:
                    continue

                title = ''.join(data.xpath('.//a[@class="booklink"]/i/text()')).strip()
                author = ''.join(data.xpath('.//span[@class="bookauthor"]/a/text()')).strip()
                # Keep only books that actually mention the query.
                if title.lower().find(query.lower()) == -1 and author.lower().find(query.lower()) == -1:
                    continue

                counter -= 1

                s = SearchResult()
                s.cover_url = ''.join(data.xpath('.//a[@class="booklink"]/img/@src')).strip()
                s.title = title
                s.author = author
                s.detail_item = id
                s.drm = SearchResult.DRM_UNLOCKED
                s.downloads['FB2'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-fb2"]/@href')).strip().replace('.zip', '')
                s.downloads['EPUB'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-epub"]/@href')).strip().replace('.zip', '')
                s.downloads['TXT'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-txt"]/@href')).strip().replace('.zip', '')
                s.formats = 'FB2, EPUB, TXT, SFB'
                yield s
def search(self, query, max_results=10, timeout=60):
    '''
    Search the ebook.nl store and yield SearchResult objects.

    Parses the advanced-search result table; DRM status and available
    formats (ePub/PDF) are inferred from the Dutch product description
    text.
    '''
    url = (
        'http://www.ebook.nl/store/advanced_search_result.php?keywords=' +
        urllib.parse.quote(query))
    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for row in doc.xpath(
                '//table[contains(@class, "productListing")]/tr'):
            if counter <= 0:
                break

            anchors = row.xpath('./td/div[@class="prodImage"]/a')
            if not anchors:
                continue
            anchor = anchors[0]

            # Product id is the last path segment of the link, minus any
            # query string.
            id = ''.join(anchor.xpath('./@href')).strip()
            id = id.rpartition('/')[2]
            qpos = id.rfind('?')
            if qpos > 0:
                id = id[:qpos]
            if not id:
                continue

            cover_url = 'http://www.ebook.nl/store/' + ''.join(
                anchor.xpath('./img/@src'))
            title = ''.join(anchor.xpath('./img/@title')).strip()
            author = ''.join(
                row.xpath(
                    './td/div[@class="prodTitle"]/h3/a/text()')).strip()
            price = ''.join(
                row.xpath('./td/div[@class="prodTitle"]/b/text()'))
            pdf = row.xpath(
                'boolean(./td/div[@class="prodTitle"]/'
                'p[contains(text(), "Bestandsformaat: Pdf")])')
            epub = row.xpath(
                'boolean(./td/div[@class="prodTitle"]/'
                'p[contains(text(), "Bestandsformaat: ePub")])')
            # "zonder DRM" / "watermerk" mark DRM-free (watermarked) books.
            nodrm = row.xpath('boolean(./td/div[@class="prodTitle"]/'
                              'p[contains(text(), "zonder DRM") or'
                              ' contains(text(), "watermerk")])')

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.drm = SearchResult.DRM_UNLOCKED if nodrm else SearchResult.DRM_LOCKED
            s.detail_item = id
            s.formats = ','.join(
                name for flag, name in ((epub, 'ePub'), (pdf, 'PDF')) if flag)

            yield s