示例#1
0
    def search(self, query, max_results=10, timeout=60):
        """Search waterstones.com and yield one SearchResult per product found.

        The URL-quoted ``query`` is appended to the store's simple-search URL
        after the fixed ``ebook+`` prefix. Every ``results-pane`` div that has
        a product link is turned into a ``SearchResult``; panes without a link
        are skipped and do not count against ``max_results``. ``timeout`` is
        passed to the browser's ``open`` call.
        """
        search_url = (
            'http://www.waterstones.com/waterstonesweb/simpleSearch.do?simpleSearchString=ebook+'
            + urllib2.quote(query))

        remaining = max_results
        with closing(browser().open(search_url, timeout=timeout)) as response:
            page = html.fromstring(response.read())
            for pane in page.xpath('//div[contains(@class, "results-pane")]'):
                if remaining <= 0:
                    break

                link = ''.join(pane.xpath('./div/div/h2/a/@href')).strip()
                if not link:
                    continue
                remaining -= 1

                result = SearchResult()

                # Anything that does not start with "http" gets the store
                # host prepended.
                cover = ''.join(pane.xpath('.//div[@class="image"]/a/img/@src'))
                if not cover.startswith("http"):
                    cover = 'http://www.waterstones.com' + cover
                result.cover_url = cover

                result.title = ''.join(pane.xpath('./div/div/h2/a/text()')).strip()
                result.author = ', '.join(
                    pane.xpath('.//p[@class="byAuthor"]/a/text()')).strip()
                result.price = ''.join(pane.xpath(
                    './/p[@class="price"]/span[@class="priceRed2"]/text()'))
                result.detail_item = link

                # The productFormat cell is probed for DRM and format markers.
                locked = pane.xpath(
                    'boolean(.//td[@headers="productFormat" and contains(., "DRM")])')
                result.drm = (SearchResult.DRM_LOCKED if locked
                              else SearchResult.DRM_UNKNOWN)

                available = []
                if pane.xpath(
                        'boolean(.//td[@headers="productFormat" and contains(., "EPUB")])'):
                    available.append('ePub')
                if pane.xpath(
                        'boolean(.//td[@headers="productFormat" and contains(., "PDF")])'):
                    available.append('PDF')
                result.formats = ', '.join(available)

                yield result
示例#2
0
    def search(self, query, max_results=10, timeout=60):
        """Search the woblink.com e-book catalogue and yield SearchResult objects.

        ``query`` is encoded as UTF-8 and URL-quoted into the catalogue URL.
        A ``limit`` parameter of 20 or 30 is appended when more than 10 or
        more than 20 results are requested. Items without a product link are
        skipped; at most ``max_results`` results are yielded. ``timeout`` is
        passed to the browser's ``open`` call.
        """
        url = 'http://woblink.com/katalog-e-book?query=' + urllib.quote_plus(query.encode('utf-8'))
        if max_results > 20:
            url += '&limit=30'
        elif max_results > 10:
            url += '&limit=20'

        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//div[@class="book-item backgroundmix"]'):
                if counter <= 0:
                    break

                href = ''.join(data.xpath('.//td[@class="w10 va-t mYHaveItYes"]/a[1]/@href'))
                if not href:
                    continue

                cover_url = ''.join(data.xpath('.//td[@class="w10 va-t mYHaveItYes"]/a[1]/img/@src'))
                title = ''.join(data.xpath('.//h2[@class="title"]/a[1]/text()'))
                author = ', '.join(data.xpath('.//td[@class="va-t"]/h3/a/text()'))
                price = ''.join(data.xpath('.//div[@class="prices"]/span[1]/strong/span/text()'))
                # Show a decimal comma; a plain str.replace is enough for a
                # literal dot, no regex required.
                price = price.replace('.', ',')
                # Format names are cut out of the image src paths.
                formats = [form[8:-4].split('.')[0] for form in data.xpath('.//p[3]/img/@src')]

                s = SearchResult()
                s.cover_url = 'http://woblink.com' + cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price + ' zł'
                s.detail_item = href.strip()

                if 'epub_drm' in formats:
                    s.drm = SearchResult.DRM_LOCKED
                    s.formats = 'EPUB'
                elif 'pdf' in formats:
                    s.drm = SearchResult.DRM_LOCKED
                    s.formats = 'PDF'
                else:
                    s.drm = SearchResult.DRM_UNLOCKED
                    if 'MOBI_nieb' in formats:
                        formats.remove('MOBI_nieb')
                        formats.append('MOBI')
                    s.formats = ', '.join(formats).upper()

                # Every branch above produces a result, so count and yield once.
                counter -= 1
                yield s
示例#3
0
    def search(self, query, max_results=10, timeout=60):
        """Search the woblink.com e-book catalogue and yield SearchResult objects.

        ``query`` is encoded as UTF-8 and URL-quoted into the catalogue URL.
        A ``limit`` parameter of 20 or 30 is appended when more than 10 or
        more than 20 results are requested. Items without a product link are
        skipped; at most ``max_results`` results are yielded. ``timeout`` is
        passed to the browser's ``open`` call.
        """
        url = 'http://woblink.com/katalog-e-book?query=' + urllib.quote_plus(query.encode('utf-8'))
        if max_results > 20:
            url += '&limit=30'
        elif max_results > 10:
            url += '&limit=20'

        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//div[@class="book-item backgroundmix"]'):
                if counter <= 0:
                    break

                href = ''.join(data.xpath('.//td[@class="w10 va-t mYHaveItYes"]/a[1]/@href'))
                if not href:
                    continue

                cover_url = ''.join(data.xpath('.//td[@class="w10 va-t mYHaveItYes"]/a[1]/img/@src'))
                title = ''.join(data.xpath('.//h2[@class="title"]/a[1]/text()'))
                author = ', '.join(data.xpath('.//td[@class="va-t"]/h3/a/text()'))
                price = ''.join(data.xpath('.//div[@class="prices"]/span[1]/strong/span/text()'))
                # Show a decimal comma; a plain str.replace is enough for a
                # literal dot, no regex required.
                price = price.replace('.', ',')
                # Format names are cut out of the image src paths.
                formats = [form[8:-4].split('.')[0] for form in data.xpath('.//p[3]/img/@src')]

                s = SearchResult()
                s.cover_url = 'http://woblink.com' + cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price + ' zł'
                s.detail_item = href.strip()

                if 'epub_drm' in formats:
                    s.drm = SearchResult.DRM_LOCKED
                    s.formats = 'EPUB'
                elif 'pdf' in formats:
                    s.drm = SearchResult.DRM_LOCKED
                    s.formats = 'PDF'
                else:
                    s.drm = SearchResult.DRM_UNLOCKED
                    if 'MOBI_nieb' in formats:
                        formats.remove('MOBI_nieb')
                        formats.append('MOBI')
                    s.formats = ', '.join(formats).upper()

                # Every branch above produces a result, so count and yield once.
                counter -= 1
                yield s
示例#4
0
    def search(self, query, max_results=10, timeout=60):
        """Search the ebook.nl store and yield one SearchResult per product row.

        The URL-quoted ``query`` is sent as the ``keywords`` parameter. Rows of
        the ``productListing`` table without a product-image link, or whose
        link yields an empty identifier, are skipped and do not count against
        ``max_results``. ``timeout`` is passed to the browser's ``open`` call.
        """
        search_url = ('http://www.ebook.nl/store/advanced_search_result.php?keywords='
                      + urllib2.quote(query))

        remaining = max_results
        with closing(browser().open(search_url, timeout=timeout)) as response:
            page = html.fromstring(response.read())
            for row in page.xpath('//table[contains(@class, "productListing")]/tr'):
                if remaining <= 0:
                    break

                anchors = row.xpath('./td/div[@class="prodImage"]/a')
                if not anchors:
                    continue
                anchor = anchors[0]

                # Keep only the text after the last '/' and, when a '?' occurs
                # past the first character, the text before the last '?'.
                item = ''.join(anchor.xpath('./@href')).strip().rsplit('/', 1)[-1]
                cut = item.rfind('?')
                if cut > 0:
                    item = item[:cut]
                if not item:
                    continue
                remaining -= 1

                result = SearchResult()
                result.cover_url = ('http://www.ebook.nl/store/'
                                    + ''.join(anchor.xpath('./img/@src')))
                result.title = ''.join(anchor.xpath('./img/@title')).strip()
                result.author = ''.join(
                    row.xpath('./td/div[@class="prodTitle"]/h3/a/text()')).strip()
                result.price = ''.join(row.xpath('./td/div[@class="prodTitle"]/b/text()'))
                result.detail_item = item

                # Locked is the fallback unless the description mentions
                # "zonder DRM" or "watermerk".
                unprotected = row.xpath('boolean(./td/div[@class="prodTitle"]/'
                                        'p[contains(text(), "zonder DRM") or'
                                        '  contains(text(), "watermerk")])')
                result.drm = (SearchResult.DRM_UNLOCKED if unprotected
                              else SearchResult.DRM_LOCKED)

                available = []
                if row.xpath('boolean(./td/div[@class="prodTitle"]/'
                             'p[contains(text(), "Bestandsformaat: ePub")])'):
                    available.append('ePub')
                if row.xpath('boolean(./td/div[@class="prodTitle"]/'
                             'p[contains(text(), "Bestandsformaat: Pdf")])'):
                    available.append('PDF')
                result.formats = ','.join(available)

                yield result
示例#5
0
    def search(self, query, max_results=10, timeout=60):
        """Search the ebook.nl store and yield one SearchResult per product row.

        The URL-quoted ``query`` is sent as the ``keywords`` parameter. Rows of
        the ``productListing`` table without a product-image link, or whose
        link yields an empty identifier, are skipped and do not count against
        ``max_results``. ``timeout`` is passed to the browser's ``open`` call.
        """
        search_url = ('http://www.ebook.nl/store/advanced_search_result.php?keywords='
                      + urllib2.quote(query))

        remaining = max_results
        with closing(browser().open(search_url, timeout=timeout)) as response:
            page = html.fromstring(response.read())
            for row in page.xpath('//table[contains(@class, "productListing")]/tr'):
                if remaining <= 0:
                    break

                anchors = row.xpath('./td/div[@class="prodImage"]/a')
                if not anchors:
                    continue
                anchor = anchors[0]

                # Keep only the text after the last '/' and, when a '?' occurs
                # past the first character, the text before the last '?'.
                item = ''.join(anchor.xpath('./@href')).strip().rsplit('/', 1)[-1]
                cut = item.rfind('?')
                if cut > 0:
                    item = item[:cut]
                if not item:
                    continue
                remaining -= 1

                result = SearchResult()
                result.cover_url = ('http://www.ebook.nl/store/'
                                    + ''.join(anchor.xpath('./img/@src')))
                result.title = ''.join(anchor.xpath('./img/@title')).strip()
                result.author = ''.join(
                    row.xpath('./td/div[@class="prodTitle"]/h3/a/text()')).strip()
                result.price = ''.join(row.xpath('./td/div[@class="prodTitle"]/b/text()'))
                result.detail_item = item

                # Locked is the fallback unless the description mentions
                # "zonder DRM" or "watermerk".
                unprotected = row.xpath('boolean(./td/div[@class="prodTitle"]/'
                                        'p[contains(text(), "zonder DRM") or'
                                        '  contains(text(), "watermerk")])')
                result.drm = (SearchResult.DRM_UNLOCKED if unprotected
                              else SearchResult.DRM_LOCKED)

                available = []
                if row.xpath('boolean(./td/div[@class="prodTitle"]/'
                             'p[contains(text(), "Bestandsformaat: ePub")])'):
                    available.append('ePub')
                if row.xpath('boolean(./td/div[@class="prodTitle"]/'
                             'p[contains(text(), "Bestandsformaat: Pdf")])'):
                    available.append('PDF')
                result.formats = ','.join(available)

                yield result
示例#6
0
    def search(self, query, max_results=10, timeout=60):
        """Search e-knigi.net and yield SearchResult objects.

        The search only runs when the stripped query, decoded as UTF-8, is
        made up of at least two Cyrillic letters, digits or whitespace
        characters; otherwise the generator ends without yielding.

        If the response is a product detail page, that single product is
        yielded. Otherwise up to ``max_results`` entries from the result list
        are yielded, restricted to those whose title or author contains the
        query (compared case-insensitively on decoded text). ``timeout`` is
        passed to the browser's ``open`` call.
        """
        # check for cyrillic symbols before performing search
        uquery = type(u'')(query.strip(), 'utf-8')
        if not re.search(u'^[а-яА-Я\\d\\s]{2,}$', uquery):
            return
        # Filter with the decoded text rather than the raw bytes: lower-casing
        # a byte string leaves Cyrillic letters untouched, and bytes cannot be
        # searched for inside text on Python 3.
        needle = uquery.lower()

        base_url = 'http://e-knigi.net'
        url = base_url + '/virtuemart?page=shop.browse&search_category=0&search_limiter=anywhere&keyword=' + quote(query)

        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())

            # if the store finds only one product, it opens directly detail view
            for data in doc.xpath('//div[@class="prod_details"]'):
                s = SearchResult()
                s.cover_url = ''.join(data.xpath('.//div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@src')).strip()
                s.title = ''.join(data.xpath('.//div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@alt')).strip()
                s.author = ''.join(data.xpath('.//div[@class="td_bg clearfix"]/div[@class="gk_product_tab"]/div/table/tr[3]/td[2]/text()')).strip()
                s.price = ''.join(data.xpath('.//span[@class="productPrice"]/text()')).strip()
                s.detail_item = url
                s.drm = SearchResult.DRM_UNLOCKED

                yield s
                return

            # search in store results
            for data in doc.xpath('//div[@class="browseProductContainer"]'):
                if counter <= 0:
                    break
                href = ''.join(data.xpath('.//a[1]/@href')).strip()
                if not href:
                    continue

                title = ''.join(data.xpath('.//a[@class="gk_vm_product_image"]/img/@title')).strip()
                author = ''.join(data.xpath('.//div[@style="float:left;width:90%"]/b/text()')).strip().replace('Автор: ', '')

                # Drop entries that mention the query in neither title nor author.
                if needle not in title.lower() and needle not in author.lower():
                    continue

                counter -= 1

                s = SearchResult()
                s.cover_url = ''.join(data.xpath('.//a[@class="gk_vm_product_image"]/img/@src')).strip()
                s.title = title
                s.author = author
                s.price = ''.join(data.xpath('.//span[@class="productPrice"]/text()')).strip()
                s.detail_item = base_url + href
                s.drm = SearchResult.DRM_UNLOCKED

                yield s
示例#7
0
    def search(self, query, max_results=10, timeout=60):
        """Search e-knigi.net and yield SearchResult objects.

        The search only runs when the stripped query, decoded as UTF-8, is
        made up of at least two Cyrillic letters, digits or whitespace
        characters; otherwise the generator ends without yielding.

        If the response is a product detail page, that single product is
        yielded. Otherwise up to ``max_results`` entries from the result list
        are yielded, restricted to those whose title or author contains the
        query (compared case-insensitively on decoded text). ``timeout`` is
        passed to the browser's ``open`` call.
        """
        # check for cyrillic symbols before performing search
        uquery = unicode(query.strip(), 'utf-8')
        if not re.search(u'^[а-яА-Я\\d\\s]{2,}$', uquery):
            return
        # Filter with the decoded text rather than the raw bytes: lower-casing
        # a byte string leaves Cyrillic letters untouched, so a query with
        # capital letters could never match a lower-cased title.
        needle = uquery.lower()

        base_url = 'http://e-knigi.net'
        url = base_url + '/virtuemart?page=shop.browse&search_category=0&search_limiter=anywhere&keyword=' + urllib2.quote(query)

        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())

            # if the store finds only one product, it opens directly detail view
            for data in doc.xpath('//div[@class="prod_details"]'):
                s = SearchResult()
                s.cover_url = ''.join(data.xpath('.//div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@src')).strip()
                s.title = ''.join(data.xpath('.//div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@alt')).strip()
                s.author = ''.join(data.xpath('.//div[@class="td_bg clearfix"]/div[@class="gk_product_tab"]/div/table/tr[3]/td[2]/text()')).strip()
                s.price = ''.join(data.xpath('.//span[@class="productPrice"]/text()')).strip()
                s.detail_item = url
                s.drm = SearchResult.DRM_UNLOCKED

                yield s
                return

            # search in store results
            for data in doc.xpath('//div[@class="browseProductContainer"]'):
                if counter <= 0:
                    break
                href = ''.join(data.xpath('.//a[1]/@href')).strip()
                if not href:
                    continue

                title = ''.join(data.xpath('.//a[@class="gk_vm_product_image"]/img/@title')).strip()
                author = ''.join(data.xpath('.//div[@style="float:left;width:90%"]/b/text()')).strip().replace('Автор: ', '')

                # Drop entries that mention the query in neither title nor author.
                if needle not in title.lower() and needle not in author.lower():
                    continue

                counter -= 1

                s = SearchResult()
                s.cover_url = ''.join(data.xpath('.//a[@class="gk_vm_product_image"]/img/@src')).strip()
                s.title = title
                s.author = author
                s.price = ''.join(data.xpath('.//span[@class="productPrice"]/text()')).strip()
                s.detail_item = base_url + href
                s.drm = SearchResult.DRM_UNLOCKED

                yield s
示例#8
0
    def search(self, query, max_results=10, timeout=60):
        """Run ``query`` through ``self.libgen`` and yield SearchResult objects.

        At most ``max_results`` search hits are processed, and only the first
        mirror of each hit is turned into a result. The hit's md5 is stored
        as ``detail_item``; no download links are set here. ``timeout`` is
        accepted but not used by this method.
        """
        log = partial(module_debug_print, 'LibgenStore:search:')
        log('search:query = ', query)

        found = self.libgen.search(query)
        limit = min(max_results, len(found.results))

        for hit in found.results[:limit]:
            log('result.title = ', hit.title)

            # Calibre only shows 1 anyway
            for mirror in hit.mirrors[:1]:
                log('result.mirror.url = ', mirror.url)

                entry = SearchResult()
                entry.store_name = PLUGIN_NAME
                entry.plugin_author = PLUGIN_AUTHORS
                entry.cover_url = hit.image_url
                entry.author = hit.authors
                entry.detail_item = hit.md5
                entry.price = '0.00'
                entry.drm = SearchResult.DRM_UNLOCKED
                entry.formats = mirror.format
                # Language and file size are folded into the displayed title.
                entry.title = '{} ({}, {}{})'.format(
                    hit.title, hit.language, mirror.size, mirror.unit)

                log('s = ', entry)

                yield entry
示例#9
0
    def search(self, query, max_results=10, timeout=60):
        """Query the bubok.es calibre search endpoint and yield SearchResult objects.

        Each div whose class contains ``libro`` becomes one result, with its
        fields read from the text of the child divs (``url``, ``titulo``,
        ``autor``, ``precio``, ``formatos``, ``portada``). At most
        ``max_results`` results are yielded. ``timeout`` is passed to the
        browser's ``open`` call.
        """
        search_url = 'http://www.bubok.es/resellers/calibre_search/' + urllib.quote_plus(query)

        remaining = max_results
        with closing(browser().open(search_url, timeout=timeout)) as response:
            page = html.fromstring(response.read())
            for book in page.xpath('//div[contains(@class, "libro")]'):
                if remaining <= 0:
                    break
                remaining -= 1

                result = SearchResult()
                result.title = ''.join(book.xpath('.//div[@class="titulo"]/text()')).strip()
                result.author = ''.join(book.xpath('.//div[@class="autor"]/text()')).strip()
                result.detail_item = ''.join(book.xpath('.//div[@class="url"]/text()')).strip()
                result.price = ''.join(book.xpath('.//div[@class="precio"]/text()')).strip()
                result.drm = SearchResult.DRM_UNLOCKED
                result.formats = ''.join(book.xpath('.//div[@class="formatos"]/text()')).strip()
                result.cover_url = ''.join(book.xpath('.//div[@class="portada"]/text()')).strip()
                yield result
示例#10
0
    def search(self, query, max_results=10, timeout=60):
        """Search ebookshoppe.com and yield SearchResult objects that have an author.

        The request is sent with a ``Referer`` header pointing at the store.
        Each ``ProductList`` item with a product link is counted against
        ``max_results`` and passed to ``self.get_author_and_formats``; items
        that still have no author afterwards are not yielded.
        """
        search_url = 'http://www.ebookshoppe.com/search.php?search_query=' + urllib2.quote(query)
        client = browser()
        client.addheaders = [("Referer", "http://www.ebookshoppe.com/")]

        remaining = max_results
        with closing(client.open(search_url, timeout=timeout)) as response:
            page = html.fromstring(response.read())
            for entry in page.xpath('//ul[@class="ProductList"]/li'):
                if remaining <= 0:
                    break

                link = ''.join(
                    entry.xpath('./div[@class="ProductDetails"]/strong/a/@href')).strip()
                if not link:
                    continue
                # Counted here, before the author lookup below may drop the item.
                remaining -= 1

                result = SearchResult()
                result.cover_url = ''.join(entry.xpath('./div[@class="ProductImage"]/a/img/@src'))
                result.title = ''.join(
                    entry.xpath('./div[@class="ProductDetails"]/strong/a/text()')).strip()
                result.price = ''.join(entry.xpath('./div[@class="ProductPriceRating"]/em/text()'))
                result.drm = SearchResult.DRM_UNLOCKED
                result.detail_item = link

                self.get_author_and_formats(result, timeout)
                if result.author:
                    yield result
示例#11
0
    def search(self, query, max_results=10, timeout=60):
        """Search the barnesandnoble.com ebook store and yield SearchResult objects.

        The query appears twice in the URL: with spaces replaced by dashes in
        the path and URL-quoted as the ``keyword`` parameter. Result items
        without a product link are skipped; at most ``max_results`` results
        are yielded, each with the format set to ``Nook``.
        """
        slug = query.replace(' ', '-')
        search_url = 'http://www.barnesandnoble.com/s/%s?keyword=%s&store=ebook' % (
            slug, urllib.quote_plus(query))

        remaining = max_results
        with closing(browser().open(search_url, timeout=timeout)) as response:
            page = html.fromstring(response.read())
            items = page.xpath('//ul[contains(@class, "result-set")]/li[contains(@class, "result")]')
            for item in items:
                if remaining <= 0:
                    break

                link = ''.join(item.xpath('.//div[contains(@class, "image-bounding-box")]/a/@href'))
                if not link:
                    continue
                remaining -= 1

                result = SearchResult()
                result.cover_url = ''.join(item.xpath('.//img[contains(@class, "product-image")]/@src'))
                result.title = ''.join(item.xpath('.//a[@class="title"]//text()')).strip()
                result.author = ', '.join(item.xpath('.//a[@class="contributor"]//text()')).strip()
                result.price = ''.join(item.xpath(
                    './/div[@class="price-format"]//span[contains(@class, "price")]/text()')).strip()
                result.detail_item = link.strip()
                result.drm = SearchResult.DRM_UNKNOWN
                result.formats = 'Nook'

                yield result
示例#12
0
def search_kobo(query, max_results=10, timeout=60, write_html_to=None):
    """Search kobobooks.com and yield a SearchResult for each usable item.

    The raw response is optionally written to ``write_html_to``. Only the
    first ``max_results`` book items on the page are examined, and an item is
    yielded only when it has a title, at least one author and a detail link.
    Subtitles are appended to the title separated by ``' - '``.
    """
    from css_selectors import Select
    url = 'https://www.kobobooks.com/search/search.html?q=' + quote_plus(query)
    raw = read_url(url, timeout=timeout)
    if write_html_to is not None:
        with open(write_html_to, 'w') as f:
            f.write(raw)
    select = Select(html.fromstring(raw))

    def text_of(node):
        # Text content of the element, stripped of surrounding whitespace.
        return etree.tostring(node, method='text', encoding='unicode').strip()

    for index, item in enumerate(select('.result-items .item-wrapper.book')):
        if index == max_results:
            break

        # First cover image, if any; protocol-relative URLs get https.
        cover_url = None
        for img in select('.item-image img[src]', item):
            cover_url = img.get('src')
            if cover_url.startswith('//'):
                cover_url = 'https:' + cover_url
            break

        # The title and detail link both come from the first title heading.
        title = None
        detail_url = None
        for heading in select('h2.title', item):
            title = text_of(heading)
            for anchor in select('a[href]', heading):
                detail_url = anchor.get('href')
                break
            break
        if title:
            for sub in select('p.subtitle', item):
                title += ' - ' + text_of(sub)

        names = [text_of(a)
                 for a in select('.contributors a.contributor-name', item)]
        authors = authors_to_string(names)

        price = None
        for tag in select('p.price', item):
            price = text_of(tag)
            break

        if not (title and authors and detail_url):
            continue

        s = SearchResult()
        s.cover_url = cover_url
        s.title = title
        s.author = authors
        s.price = price
        s.detail_item = detail_url
        s.formats = 'EPUB'
        s.drm = SearchResult.DRM_UNKNOWN

        yield s
    def search(self, query, max_results=10, timeout=60):
        """Search bewrite.net and yield SearchResult objects.

        Every table row after the first inside the ``content`` div that
        contains a link becomes one result. The text of the second cell is
        split at the first ``'by '`` into title and author. Cover URL and
        price are left empty. At most ``max_results`` results are yielded.
        """
        search_url = ('http://www.bewrite.net/mm5/merchant.mvc?Search_Code=B&Screen=SRCH&Search='
                      + urllib2.quote(query))

        remaining = max_results
        with closing(browser().open(search_url, timeout=timeout)) as response:
            page = html.fromstring(response.read())
            rows = page.xpath('//div[@id="content"]//table/tr[position() > 1]')
            for row in rows:
                if remaining <= 0:
                    break

                link = ''.join(row.xpath('.//a/@href'))
                if not link:
                    continue
                remaining -= 1

                # Text before the first 'by ' is the title, text after it the author.
                name, _sep, writer = ''.join(row.xpath('./td[2]//text()')).partition('by ')

                result = SearchResult()
                result.cover_url = ''
                result.title = name.strip()
                result.author = writer.strip()
                result.price = ''
                result.detail_item = link.strip()
                result.drm = SearchResult.DRM_UNLOCKED

                yield result
示例#14
0
    def search(self, query, max_results=10, timeout=60):
        """Search ebooks.foyles.co.uk and yield SearchResult objects.

        Each ``doc-item`` div with a cover link becomes one result whose
        ``detail_item`` is that link prefixed with the store host. All
        results are marked as DRM locked. At most ``max_results`` results
        are yielded.
        """
        search_url = 'http://ebooks.foyles.co.uk/catalog/search/?query=' + urllib2.quote(query)

        remaining = max_results
        with closing(browser().open(search_url, timeout=timeout)) as response:
            page = html.fromstring(response.read())
            for item in page.xpath('//div[@class="doc-item"]'):
                if remaining <= 0:
                    break

                path = ''.join(item.xpath('.//p[@class="doc-cover"]/a/@href')).strip()
                if not path:
                    continue
                remaining -= 1

                result = SearchResult()
                result.cover_url = ''.join(item.xpath('.//p[@class="doc-cover"]/a/img/@src'))
                result.title = ''.join(item.xpath('.//span[@class="title"]/a/text()')).strip()
                result.author = ', '.join(
                    item.xpath('.//span[@class="author"]/span[@class="author"]/text()')).strip()
                result.price = ''.join(item.xpath('.//span[@itemprop="price"]/text()')).strip()
                result.detail_item = 'http://ebooks.foyles.co.uk' + path
                result.drm = SearchResult.DRM_LOCKED
                result.formats = ''.join(
                    item.xpath('.//p[@class="doc-meta-format"]/span[last()]/text()'))

                yield result
    def search(self, query, max_results=10, timeout=60):
        """Search the m.gutenberg.org mobile catalogue and yield SearchResult objects.

        Each ``booklink`` list item becomes one result. The link is cut at
        the first ``.mobile`` occurrence, the price is fixed at ``$0.00`` and
        the cover URL is left empty. At most ``max_results`` results are
        yielded.
        """
        search_url = ('http://m.gutenberg.org/ebooks/search.mobile/?default_prefix=all&sort_order=title&query='
                      + urllib.quote_plus(query))

        remaining = max_results
        with closing(browser().open(search_url, timeout=timeout)) as response:
            page = html.fromstring(response.read())
            for entry in page.xpath('//ol[@class="results"]/li[@class="booklink"]'):
                if remaining <= 0:
                    break
                remaining -= 1

                # Keep only the part of the link before '.mobile'.
                link = ''.join(entry.xpath('./a/@href')).split('.mobile')[0]

                result = SearchResult()
                result.cover_url = ''
                result.detail_item = link.strip()
                result.title = ''.join(entry.xpath('.//span[@class="title"]/text()')).strip()
                result.author = ''.join(entry.xpath('.//span[@class="subtitle"]/text()')).strip()
                result.price = '$0.00'
                result.drm = SearchResult.DRM_UNLOCKED

                yield result
    def search(self, query, max_results=10, timeout=60):
        """Search beam-shop.de and yield SearchResult objects.

        Each ``product--box`` div with a product link becomes one result.
        Author, title and price are taken from the first matching text node;
        when one of them is missing the field is left as an empty string
        instead of aborting the whole search. At most ``max_results`` results
        are yielded. ``timeout`` is passed to the browser's ``open`` call.
        """
        url = 'https://www.beam-shop.de/search?saltFieldLimitation=all&sSearch=' + urllib2.quote(query)
        br = browser()

        def first_text(node, path):
            # First match of ``path`` with whitespace stripped, or '' if none.
            found = node.xpath(path)
            return found[0].strip() if found else ''

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//div[contains(@class, "product--box")]'):
                if counter <= 0:
                    break

                id_ = ''.join(data.xpath('./div/div[contains(@class, "product--info")]/a/@href')).strip()
                if not id_:
                    continue
                cover_url = ''.join(data.xpath('./div/div[contains(@class, "product--info")]/a//img/@srcset'))
                if cover_url:
                    # Keep only the part of srcset before the first comma.
                    cover_url = cover_url.split(',')[0].strip()
                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url
                s.title = first_text(data, './/a[@class="product--title"]/text()')
                s.author = first_text(data, './/a[@class="product--author"]/text()')
                s.price = first_text(data, './/div[@class="product--price"]/span/text()')
                s.drm = SearchResult.DRM_UNLOCKED
                s.detail_item = id_
                yield s
示例#17
0
    def search(self, query, max_results=10, timeout=60):
        """Search zixo.pl and yield SearchResult objects.

        ``query`` is encoded as UTF-8 and URL-quoted into the search URL.
        Each ``productInline`` div with a thumbnail link becomes one result
        whose ``detail_item`` is that link prefixed with the store host.
        Dots in the price are replaced by commas and all results are marked
        as DRM locked. At most ``max_results`` results are yielded.
        """
        url = 'http://zixo.pl/wyszukiwarka/?search=' + urllib.quote(query.encode('utf-8')) + '&product_type=0'

        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//div[@class="productInline"]'):
                if counter <= 0:
                    break

                href = ''.join(data.xpath('.//a[@class="productThumb"]/@href'))
                if not href:
                    continue

                cover_url = ''.join(data.xpath('.//a[@class="productThumb"]/img/@src'))
                title = ''.join(data.xpath('.//a[@class="title"]/text()'))
                author = ','.join(data.xpath('.//div[@class="productDescription"]/span[1]/a/text()'))
                price = ''.join(data.xpath('.//div[@class="priceList"]/span/text()'))
                # Show a decimal comma; a plain str.replace is enough for a
                # literal dot, no regex required.
                price = price.replace('.', ',')

                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price
                s.detail_item = 'http://zixo.pl' + href.strip()
                s.drm = SearchResult.DRM_LOCKED

                yield s
    def search(self, query, max_results=25, timeout=60):
        """Search ebookpoint.pl and yield SearchResult objects.

        ``query`` is decoded from UTF-8 and re-encoded as ISO-8859-2 before
        being URL-quoted into the search URL. Each list item with a link
        becomes one result; dots in the price are replaced by commas and the
        format list is upper-cased. At most ``max_results`` results are
        yielded.
        """
        encoded = quote_plus(query.decode('utf-8').encode('iso-8859-2'))
        search_url = ('http://ebookpoint.pl/search?qa=&szukaj=' + encoded
                      + '&serwisyall=0&wprzyg=0&wsprzed=1&wyczerp=0&formaty=em-p')

        remaining = max_results
        with closing(browser().open(search_url, timeout=timeout)) as response:
            page = html.fromstring(response.read())
            for entry in page.xpath('//ul[@class="list"]/li'):
                if remaining <= 0:
                    break

                link = ''.join(entry.xpath('./a/@href'))
                if not link:
                    continue
                remaining -= 1

                # The price is read from either of two price paragraph variants.
                raw_price = ''.join(entry.xpath(
                    './/p[@class="price price-incart"]/a/ins/text()|.//p[@class="price price-add"]/a/text()'))
                kinds = ', '.join(entry.xpath(
                    './/ul[@class="book-type book-type-points"]//span[@class="popup"]/span/text()'))

                result = SearchResult()
                result.cover_url = ''.join(entry.xpath('.//p[@class="cover"]/img/@data-src'))
                result.title = ''.join(entry.xpath('.//div[@class="book-info"]/h3/a/text()')).strip()
                result.author = ''.join(entry.xpath('.//p[@class="author"]//text()')).strip()
                result.price = re.sub(r'\.', ',', raw_price)
                result.detail_item = link.strip()
                result.drm = SearchResult.DRM_UNLOCKED
                result.formats = kinds.upper()

                yield result
示例#19
0
    def search(self, query, max_results=10, timeout=60):
        """Search Google Books and yield SearchResult objects.

        Only list items whose last author-row link reads "preview" or "read"
        are yielded; that trailing link is dropped from the author list.
        At most ``max_results`` results are produced.
        """
        url = 'http://www.google.com/search?tbm=bks&q=' + urllib.quote_plus(query)

        opener = browser()

        remaining = max_results
        with closing(opener.open(url, timeout=timeout)) as response:
            tree = html.fromstring(response.read())
            for entry in tree.xpath('//ol[@id="rso"]/li'):
                if remaining <= 0:
                    break

                link = ''.join(entry.xpath('.//h3/a/@href'))
                if not link:
                    continue

                heading = ''.join(entry.xpath('.//h3/a//text()'))
                row_links = entry.xpath('.//span[@class="f"]//a//text()')
                # Skip entries with no links at all, or whose last link is
                # not the preview/read marker.
                if not row_links or row_links[-1].strip().lower() not in ('preview', 'read'):
                    continue
                author_names = ', '.join(row_links[:-1])

                remaining -= 1

                result = SearchResult()
                result.title = heading.strip()
                result.author = author_names.strip()
                result.detail_item = link.strip()
                result.drm = SearchResult.DRM_UNKNOWN

                yield result
示例#20
0
    def search(self, query, max_results=12, timeout=60):
        """Search virtualo.pl and yield one SearchResult per result box.

        At most ``max_results`` results are yielded. A result is marked
        DRM-unlocked when its preview prompt text matches "Znak wodny" or
        "Brak", and DRM-locked otherwise.
        """
        url = 'http://virtualo.pl/?q=' + urllib.quote(query) + '&f=format_id:4,6,3'

        opener = browser()
        watermark_or_none = re.compile(r'Znak wodny|Brak')

        def joined(node, path, sep=''):
            # Concatenate every string the XPath expression selects.
            return sep.join(node.xpath(path))

        remaining = max_results
        with closing(opener.open(url, timeout=timeout)) as response:
            tree = html.fromstring(response.read())
            boxes = tree.xpath('//div[@id="content"]//div[@class="list_box list_box_border"]')
            for box in boxes:
                if remaining <= 0:
                    break

                # Keep only the part of the link before any '?q=' suffix.
                link = joined(box, './/div[@class="list_middle_left"]//a/@href').split(r'?q=')[0]
                if not link:
                    continue

                price_text = joined(
                    box, './/span[@class="price"]/text() | .//span[@class="price abbr"]/text()')
                cover_src = joined(box, './/div[@class="list_middle_left"]//a//img/@src')
                title_text = joined(box, './/div[@class="list_title list_text_left"]/a/text()')
                author_text = joined(
                    box, './/div[@class="list_authors list_text_left"]/a/text()', ', ')
                icon_sources = box.xpath(
                    './/div[@style="width:55%;float:left;text-align:left;height:18px;"]//a/span/img/@src')
                # The format name is the last '_'-separated piece of the
                # icon file name, without its '.png' extension.
                format_names = []
                for src in icon_sources:
                    format_names.append(src.split('_')[-1].replace('.png', ''))
                drm_hint = joined(
                    box,
                    './/div[@style="width:45%;float:right;text-align:right;height:18px;"]//span[@class="prompt_preview"]/text()')
                drm_free = watermark_or_none.search(drm_hint)

                remaining -= 1

                result = SearchResult()
                # Cut everything after the first '.jpg' in the cover link.
                result.cover_url = cover_src.split('.jpg')[0] + '.jpg'
                result.title = title_text.strip()
                result.author = author_text.strip()
                result.price = price_text + ' zł'
                result.detail_item = 'http://virtualo.pl' + link.strip().split('http://')[0]
                result.formats = ', '.join(format_names).upper()
                if drm_free:
                    result.drm = SearchResult.DRM_UNLOCKED
                else:
                    result.drm = SearchResult.DRM_LOCKED

                yield result
示例#21
0
    def search(self, query, max_results=10, timeout=60):
        """Query the haodoo scraper view and yield one SearchResult per volume.

        The response body is parsed as JSON and iterated as a sequence of
        volume dicts. Each volume supplies ``title`` and ``url``; an optional
        non-empty ``type`` list supplies download links keyed by format.

        At most ``max_results`` results are yielded. ``timeout`` is forwarded
        to the browser's ``open`` call. An empty response body yields nothing.
        """
        print("search!")
        q = query.decode('utf-8')

        url = "https://views.scraperwiki.com/run/haodooscraperview/?" + urlencode(
            {"q": q})
        print(url)

        br = browser()
        with closing(br.open(url, timeout=timeout)) as f:
            json_doc = f.read()
            if not json_doc:
                print("scrape nothing.")
                return

            counter = max_results
            for volume in json.loads(json_doc):
                # Honor the caller's limit; previously it was ignored.
                if counter <= 0:
                    break
                counter -= 1

                s = SearchResult()
                s.title = volume['title']
                s.detail_item = volume['url']
                s.price = '$0.00'
                s.drm = SearchResult.DRM_UNLOCKED

                # dict.get() replaces has_key(), which does not exist on
                # Python 3 dicts; a missing or empty list is skipped.
                download_types = volume.get('type')
                if download_types:
                    for t in download_types:
                        s.downloads[t['type']] = t['link']
                    s.formats = ', '.join(s.downloads.keys())
                yield s
示例#22
0
    def search(self, query, max_results=10, timeout=60):
        """Search ebook.nl and yield one SearchResult per schema.org Book div.

        At most ``max_results`` results are yielded; ``timeout`` is forwarded
        to the browser's ``open`` call.
        """
        url = ('http://www.ebook.nl/store/advanced_search_result.php?keywords='
               + urllib2.quote(query))
        opener = browser()

        remaining = max_results
        with closing(opener.open(url, timeout=timeout)) as response:
            tree = html.fromstring(response.read())
            books = tree.xpath('//div[@id="books"]/div[@itemtype="http://schema.org/Book"]')
            for book in books:
                if remaining <= 0:
                    break

                link = ''.join(book.xpath('./meta[@itemprop="url"]/@content')).strip()
                if not link:
                    continue

                image_src = ''.join(book.xpath('.//img[@itemprop="image"]/@src'))
                name = ''.join(book.xpath('./span[@itemprop="name"]/a/text()')).strip()
                writer = ''.join(book.xpath('./span[@itemprop="author"]/a/text()')).strip()
                # A literal '&nbsp' author is treated as "no author".
                if writer == '&nbsp':
                    writer = ''
                price_text = ''.join(book.xpath('.//span[@itemprop="price"]//text()'))
                remaining -= 1

                result = SearchResult()
                result.cover_url = 'http://www.ebook.nl/store/' + image_src
                result.title = name.strip()
                result.author = writer.strip()
                result.price = price_text
                result.drm = SearchResult.DRM_UNKNOWN
                result.detail_item = link

                yield result
示例#23
0
    def search(self, query, max_results=10, timeout=60):
        """Query the haodoo scraper view and yield one SearchResult per volume.

        The response body is parsed as JSON and iterated as a sequence of
        volume dicts. Each volume supplies ``title`` and ``url``; an optional
        non-empty ``type`` list supplies download links keyed by format.

        At most ``max_results`` results are yielded. ``timeout`` is forwarded
        to the browser's ``open`` call. An empty response body yields nothing.
        """
        print( "search!")
        q = query.decode('utf-8')

        url = "https://views.scraperwiki.com/run/haodooscraperview/?" + urlencode(
                {
                    "q": q
                } )
        print( url )

        br = browser()
        with closing(br.open(url, timeout=timeout)) as f:
            json_doc = f.read()
            if not json_doc:
                print( "scrape nothing." )
                return

            counter = max_results
            for volume in json.loads( json_doc ):
                # Honor the caller's limit; previously it was ignored.
                if counter <= 0:
                    break
                counter -= 1

                s = SearchResult()
                s.title = volume['title']
                s.detail_item = volume['url']
                s.price = '$0.00'
                s.drm = SearchResult.DRM_UNLOCKED

                # dict.get() replaces has_key(), which does not exist on
                # Python 3 dicts; a missing or empty list is skipped.
                download_types = volume.get('type')
                if download_types:
                    for t in download_types:
                        s.downloads[ t['type'] ] = t['link']
                    s.formats = ', '.join(s.downloads.keys())
                yield s
示例#24
0
    def search(self, query, max_results=20, timeout=60):
        """Search escapemagazine.pl and yield one SearchResult per short item.

        At most ``max_results`` results are yielded. Every result is reported
        as a DRM-unlocked PDF, and the site root is prefixed to its link.
        """
        url = 'http://www.escapemagazine.pl/wyszukiwarka?query=' + urllib.quote_plus(query)

        opener = browser()

        def text_of(node, path):
            # Concatenate every string the XPath expression selects.
            return ''.join(node.xpath(path))

        remaining = max_results
        with closing(opener.open(url, timeout=timeout)) as response:
            tree = html.fromstring(response.read())
            for item in tree.xpath('//div[@class="item item_short"]'):
                if remaining <= 0:
                    break

                link = text_of(item, './/h2[@class="title"]/a[1]/@href')
                if not link:
                    continue

                remaining -= 1

                result = SearchResult()
                result.cover_url = text_of(item, './/img[@class="cover"]/@src')
                result.title = text_of(item, './/h2[@class="title"]/a[1]/text()').strip()
                result.author = text_of(item, './/div[@class="author"]/text()').strip()
                result.price = text_of(item, './/span[@class="price_now"]/strong/text()') + ' zł'
                result.detail_item = 'http://www.escapemagazine.pl' + link.strip()
                result.drm = SearchResult.DRM_UNLOCKED
                result.formats = 'PDF'

                yield result
示例#25
0
    def search(self, query, max_results=10, timeout=60):
        """Search bewrite.net and yield one SearchResult per result table row.

        The second cell of each row is split at the first "by " into title
        and author. Cover URL and price are always left empty. At most
        ``max_results`` results are yielded.
        """
        url = ('http://www.bewrite.net/mm5/merchant.mvc?Search_Code=B&Screen=SRCH&Search='
               + urllib2.quote(query))

        opener = browser()

        remaining = max_results
        with closing(opener.open(url, timeout=timeout)) as response:
            tree = html.fromstring(response.read())
            # position() > 1 skips the first row of each table.
            for row in tree.xpath('//div[@id="content"]//table/tr[position() > 1]'):
                if remaining <= 0:
                    break

                link = ''.join(row.xpath('.//a/@href'))
                if not link:
                    continue

                cell_text = ''.join(row.xpath('./td[2]//text()'))
                book_title, _separator, book_author = cell_text.partition('by ')

                remaining -= 1

                result = SearchResult()
                result.cover_url = ''
                result.title = book_title.strip()
                result.author = book_author.strip()
                result.price = ''
                result.detail_item = link.strip()
                result.drm = SearchResult.DRM_UNLOCKED

                yield result
示例#26
0
    def search(self, query, max_results=10, timeout=60):
        """Search beam-shop.de and yield one SearchResult per product box.

        At most ``max_results`` results are yielded; ``timeout`` is forwarded
        to the browser's ``open`` call. Product boxes without a detail link
        are skipped. A missing author, title or price yields an empty string
        for that field instead of raising.
        """
        url = 'https://www.beam-shop.de/search?saltFieldLimitation=all&sSearch=' + urllib2.quote(query)
        br = browser()

        def first_text(node, path):
            # First matched text node, stripped, or '' when nothing matches.
            # Indexing [0] unguarded raised IndexError and aborted the whole
            # search as soon as one product box lacked the element.
            matches = node.xpath(path)
            return matches[0].strip() if matches else ''

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//div[contains(@class, "product--box")]'):
                if counter <= 0:
                    break

                id_ = ''.join(data.xpath('./div/div[contains(@class, "product--info")]/a/@href')).strip()
                if not id_:
                    continue
                cover_url = ''.join(data.xpath('./div/div[contains(@class, "product--info")]/a//img/@srcset'))
                if cover_url:
                    # srcset is a comma-separated list; keep the first entry.
                    cover_url = cover_url.split(',')[0].strip()
                author = first_text(data, './/a[@class="product--author"]/text()')
                title = first_text(data, './/a[@class="product--title"]/text()')
                price = first_text(data, './/div[@class="product--price"]/span/text()')
                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url
                s.title = title
                s.author = author
                s.price = price
                s.drm = SearchResult.DRM_UNLOCKED
                s.detail_item = id_
                yield s
示例#27
0
    def search(self, query, max_results=10, timeout=60):
        """Search the store at ``self.search_url`` and yield Kindle results.

        The query is ASCII-encoded with backslash escapes, which are then
        rewritten into percent escapes, and appended to ``self.search_url``.
        Only items whose format text contains "kindle" and that carry a
        ``name`` attribute (used as the ASIN) are yielded, up to
        ``max_results`` of them.

        When ``self.author_article`` is set, the author text is cut down to
        the part after that article and before the first " (".
        """
        url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+')
        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())

            data_xpath = '//div[contains(@class, "prod")]'
            format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
            asin_xpath = '@name'
            cover_xpath = './/img[@class="productImage"]/@src'
            title_xpath = './/h3[@class="newaps"]/a//text()'
            author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]/text()'
            price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'

            for data in doc.xpath(data_xpath):
                if counter <= 0:
                    break

                # Even though we are searching digital-text only, Amazon will
                # still put in results for non-Kindle books (author pages).
                # So we need to explicitly check that the item is a Kindle
                # book and ignore it if it isn't.
                format_ = ''.join(data.xpath(format_xpath))
                if 'kindle' not in format_.lower():
                    continue

                # We must have an asin otherwise we can't easily reference the
                # book later.
                asin = data.xpath(asin_xpath)
                if asin:
                    asin = asin[0]
                else:
                    continue

                cover_url = ''.join(data.xpath(cover_xpath))

                title = ''.join(data.xpath(title_xpath))
                author = ''.join(data.xpath(author_xpath))
                try:
                    if self.author_article:
                        author = author.split(self.author_article, 1)[1].split(" (")[0]
                except (AttributeError, IndexError):
                    # Best effort: keep the raw author text when the plugin
                    # defines no author_article (AttributeError) or the
                    # article does not occur in the text (IndexError). A bare
                    # ``except:`` here also swallowed KeyboardInterrupt and
                    # SystemExit.
                    pass

                price = ''.join(data.xpath(price_xpath))

                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url.strip()
                s.title = title.strip()
                s.author = author.strip()
                s.price = price.strip()
                s.detail_item = asin.strip()
                s.drm = SearchResult.DRM_UNKNOWN
                s.formats = 'Kindle'

                yield s
示例#28
0
    def search(self, query, max_results=10, timeout=60):
        """Search millsandboon.co.uk and yield one SearchResult per article.

        At most ``max_results`` results are yielded. The title is taken from
        the cover image's ``alt`` text and every result is DRM-locked.
        """
        site = 'https://www.millsandboon.co.uk'
        # NOTE(review): the doubled '?' below looks unintentional - confirm
        # against the live site before changing it.
        url = site + '/search.aspx??format=ebook&searchText=' + urllib2.quote(query)
        opener = browser()

        def text_of(node, path):
            # Concatenate every string the XPath expression selects.
            return ''.join(node.xpath(path))

        remaining = max_results
        with closing(opener.open(url, timeout=timeout)) as response:
            tree = html.fromstring(response.read())
            for article in tree.xpath('//article[contains(@class, "group")]'):
                if remaining <= 0:
                    break
                link = text_of(article, './/div[@class="img-wrapper"]/a/@href').strip()
                if not link:
                    continue

                remaining -= 1

                result = SearchResult()
                result.cover_url = text_of(article, './/div[@class="img-wrapper"]/a/img/@src')
                result.title = text_of(article, './/div[@class="img-wrapper"]/a/img/@alt').strip().strip()
                result.author = text_of(article, './/a[@class="author"]/text()').strip()
                result.price = text_of(
                    article,
                    './/div[@class="type-wrapper"]/ul/li[child::span[text()="eBook"]]/a/text()')
                result.detail_item = link
                result.drm = SearchResult.DRM_LOCKED
                result.formats = text_of(
                    article, './/p[@class="doc-meta-format"]/span[last()]/text()')

                yield result
示例#29
0
    def search(self, query, max_results=20, timeout=60):
        """Search publio.pl page by page and yield SearchResult objects.

        Result pages are fetched one after another until ``max_results``
        results have been yielded or the current page has no "next" link.
        A zero or negative ``max_results`` performs no requests at all.
        ``timeout`` applies to each page request.
        """

        br = browser()

        counter = max_results
        page = 1
        # "> 0" rather than plain truthiness: a negative limit is truthy and
        # made the loop crawl every result page without yielding anything.
        while counter > 0:
            with closing(
                br.open(
                    "http://www.publio.pl/e-booki,strona" + str(page) + ".html?q=" + urllib.quote(query),
                    timeout=timeout,
                )
            ) as f:
                doc = html.fromstring(f.read())
                for data in doc.xpath('//div[@class="item"]'):
                    if counter <= 0:
                        break

                    id = "".join(data.xpath('.//div[@class="img"]/a/@href'))
                    if not id:
                        continue

                    cover_url = "".join(data.xpath('.//div[@class="img"]/a/img/@data-original'))
                    title = "".join(data.xpath('.//div[@class="img"]/a/@title'))
                    # Text from the description heading, when present, is
                    # appended to the title after ". ".
                    title2 = "".join(data.xpath('.//div[@class="desc"]/h5//text()'))
                    if title2:
                        title = title + ". " + title2
                    # When the last detail row is labelled "Seria:", append
                    # the series name to the title.
                    if (
                        "".join(
                            data.xpath('./div[@class="desc"]/div[@class="detailShortList"]/div[last()]/span/text()')
                        ).strip()
                        == "Seria:"
                    ):
                        series = "".join(
                            data.xpath('./div[@class="desc"]/div[@class="detailShortList"]/div[last()]/a/@title')
                        )
                        title = title + " (seria " + series + ")"
                    author = ", ".join(
                        data.xpath('./div[@class="desc"]/div[@class="detailShortList"]/div[@class="row"][1]/a/@title')
                    )
                    # Prefer the <ins> price; fall back to the box's own text.
                    price = "".join(data.xpath('.//div[@class="priceBox tk-museo-slab"]/ins/text()'))
                    if not price:
                        price = "".join(data.xpath('.//div[@class="priceBox tk-museo-slab"]/text()')).strip()
                    formats = ", ".join(data.xpath('.//div[@class="formats"]/a/img/@alt'))

                    counter -= 1

                    s = SearchResult()
                    s.cover_url = "http://www.publio.pl" + cover_url
                    s.title = title.strip()
                    s.author = author
                    s.price = price
                    s.detail_item = "http://www.publio.pl" + id.strip()
                    s.drm = SearchResult.DRM_LOCKED if "DRM" in formats else SearchResult.DRM_UNLOCKED
                    s.formats = formats.replace(" DRM", "").strip()

                    yield s
                # Stop when the page offers no link to a following page.
                if not doc.xpath('boolean(//a[@class="next"])'):
                    break
                page += 1
示例#30
0
    def search(self, query, max_results=10, timeout=60):
        """Search bookoteka.pl and yield one SearchResult per EBOOK list item.

        At most ``max_results`` results are yielded. Cover and detail links
        are prefixed with the site root, and dots in the price text are
        replaced with commas.
        """
        url = ('http://bookoteka.pl/list?search=' + urllib.quote_plus(query)
               + '&cat=1&hp=1&type=1')

        opener = browser()

        remaining = max_results
        with closing(opener.open(url, timeout=timeout)) as response:
            tree = html.fromstring(response.read())
            for item in tree.xpath('//li[@class="EBOOK"]'):
                if remaining <= 0:
                    break

                link = ''.join(item.xpath('.//a[@class="item_link"]/@href'))
                if not link:
                    continue

                cover_src = ''.join(item.xpath('.//a[@class="item_link"]/img/@src'))
                title_text = ''.join(item.xpath('.//div[@class="shelf_title"]/a/text()'))
                author_text = ''.join(item.xpath('.//div[@class="shelf_authors"][1]/text()'))
                price_text = ''.join(item.xpath('.//span[@class="EBOOK"]/text()')).replace('.', ',')
                format_text = ', '.join(item.xpath('.//a[@class="fancybox protected"]/text()'))

                remaining -= 1

                result = SearchResult()
                result.cover_url = 'http://bookoteka.pl' + cover_src
                result.title = title_text.strip()
                result.author = author_text.strip()
                result.price = price_text
                result.detail_item = 'http://bookoteka.pl' + link.strip()
                result.drm = SearchResult.DRM_UNLOCKED
                result.formats = format_text.strip()

                yield result
    def search(self, query, max_results=10, timeout=60):
        """Search the whsmith.co.uk ebook category over HTTPS.

        Yields at most ``max_results`` SearchResult objects, each reported
        as a DRM-locked ePub with an absolute detail link.
        """
        url = ('https://www.whsmith.co.uk/search?keywordCategoryId=wc_dept_ebooks&results=60'
               '&page=1&keywords=' + quote(query))

        opener = browser()

        remaining = max_results
        with closing(opener.open(url, timeout=timeout)) as response:
            tree = html.fromstring(response.read())
            for product in tree.xpath('//li[@class="product"]'):
                if remaining <= 0:
                    break
                relative_link = ''.join(product.xpath('./a[@class="product_image_wrap"]/@href'))
                if not relative_link:
                    continue
                remaining -= 1

                result = SearchResult()
                result.cover_url = ''.join(product.xpath('.//img[@class="product_image"]/@src'))
                result.title = ''.join(product.xpath('.//h4[@class="product_title"]/text()')).strip()
                result.author = ', '.join(
                    product.xpath('.//span[@class="product_second"]/text()')).strip()
                result.price = ''.join(product.xpath('.//span[@class="price"]/text()'))
                result.drm = SearchResult.DRM_LOCKED
                result.detail_item = 'https://www.whsmith.co.uk' + relative_link
                result.formats = 'ePub'

                yield result
示例#32
0
    def search(self, query, max_results=10, timeout=60):
        """Search the whsmith.co.uk ebook category over plain HTTP.

        Yields at most ``max_results`` SearchResult objects, each reported
        as a DRM-locked ePub with an absolute detail link.
        """
        url = ('http://www.whsmith.co.uk/search?keywordCategoryId=wc_dept_ebooks&results=60'
               '&page=1&keywords=' + urllib2.quote(query))

        opener = browser()

        def text_of(node, path, sep=''):
            # Join every string the XPath expression selects with ``sep``.
            return sep.join(node.xpath(path))

        remaining = max_results
        with closing(opener.open(url, timeout=timeout)) as response:
            tree = html.fromstring(response.read())
            for product in tree.xpath('//li[@class="product"]'):
                if remaining <= 0:
                    break
                relative_link = text_of(product, './a[@class="product_image_wrap"]/@href')
                if not relative_link:
                    continue
                detail_link = 'http://www.whsmith.co.uk' + relative_link
                image_src = text_of(product, './/img[@class="product_image"]/@src')
                heading = text_of(product, './/h4[@class="product_title"]/text()')
                writers = text_of(product, './/span[@class="product_second"]/text()', ', ')
                price_text = text_of(product, './/span[@class="price"]/text()')
                remaining -= 1

                result = SearchResult()
                result.cover_url = image_src
                result.title = heading.strip()
                result.author = writers.strip()
                result.price = price_text
                result.drm = SearchResult.DRM_LOCKED
                result.detail_item = detail_link
                result.formats = 'ePub'

                yield result
示例#33
0
    def search(self, query, max_results=10, timeout=60):
        """Search Google Books and yield SearchResult objects.

        Trailing "preview", "read" and "more editions" links are removed
        from each entry's author links; entries with no links left are
        skipped. At most ``max_results`` results are yielded.
        """
        url = 'http://www.google.com/search?tbm=bks&q=' + urllib.quote_plus(query)

        opener = browser()

        remaining = max_results
        with closing(opener.open(url, timeout=timeout)) as response:
            tree = html.fromstring(response.read())
            for entry in tree.xpath('//ol[@id="rso"]/li'):
                if remaining <= 0:
                    break

                link = ''.join(entry.xpath('.//h3/a/@href'))
                if not link:
                    continue

                heading = ''.join(entry.xpath('.//h3/a//text()'))
                names = list(entry.xpath('.//div[@class="f"]//a//text()'))
                # Drop non-author links from the end of the row.
                while names and names[-1].strip().lower() in ('preview', 'read', 'more editions'):
                    names.pop()
                if not names:
                    continue

                remaining -= 1

                result = SearchResult()
                result.title = heading.strip()
                result.author = ', '.join(names).strip()
                result.detail_item = link.strip()
                result.drm = SearchResult.DRM_UNKNOWN

                yield result
示例#34
0
    def search(self, query, max_results=25, timeout=60):
        """Search ebookpoint.pl and yield one SearchResult per listed book.

        The query is converted from UTF-8 to ISO-8859-2 before URL-quoting.
        At most ``max_results`` DRM-unlocked results are yielded.
        """
        latin2_query = query.decode('utf-8').encode('iso-8859-2')
        url = ''.join((
            'http://ebookpoint.pl/search?qa=&szukaj=',
            quote_plus(latin2_query),
            '&serwisyall=0&wprzyg=0&wsprzed=1&wyczerp=0&formaty=em-p',
        ))

        opener = browser()

        def text_of(node, path, sep=''):
            # Join every string the XPath expression selects with ``sep``.
            return sep.join(node.xpath(path))

        remaining = max_results
        with closing(opener.open(url, timeout=timeout)) as response:
            tree = html.fromstring(response.read())
            for item in tree.xpath('//ul[@class="list"]/li'):
                if remaining <= 0:
                    break

                link = text_of(item, './a/@href')
                if not link:
                    continue

                remaining -= 1

                result = SearchResult()
                result.cover_url = text_of(item, './/p[@class="cover"]/img/@data-src')
                result.title = text_of(item, './/div[@class="book-info"]/h3/a/text()').strip()
                result.author = text_of(item, './/p[@class="author"]//text()').strip()
                # Either the discounted <ins> price or the regular one is
                # selected; dots are replaced with commas.
                result.price = re.sub(r'\.', ',', text_of(
                    item,
                    './/p[@class="price price-incart"]/a/ins/text()|.//p[@class="price price-add"]/a/text()'))
                result.detail_item = link.strip()
                result.drm = SearchResult.DRM_UNLOCKED
                result.formats = text_of(
                    item,
                    './/ul[@class="book-type book-type-points"]//span[@class="popup"]/span/text()',
                    ', ').upper()

                yield result
示例#35
0
    def search(self, query, max_results=10, timeout=60):
        """Search bubok.pt's calibre reseller endpoint.

        Each ``libro`` div supplies its fields as the text of child divs
        (url, titulo, autor, precio, formatos, portada). At most
        ``max_results`` DRM-unlocked results are yielded.
        """
        url = 'http://www.bubok.pt/resellers/calibre_search/' + urllib.quote_plus(query)

        opener = browser()

        def field(node, css_class):
            # Stripped text of the child div carrying the given class.
            return ''.join(node.xpath('.//div[@class="%s"]/text()' % css_class)).strip()

        remaining = max_results
        with closing(opener.open(url, timeout=timeout)) as response:
            tree = html.fromstring(response.read())
            for book in tree.xpath('//div[contains(@class, "libro")]'):
                if remaining <= 0:
                    break

                remaining -= 1

                result = SearchResult()
                result.title = field(book, 'titulo')
                result.author = field(book, 'autor')
                result.detail_item = field(book, 'url')
                result.price = field(book, 'precio')
                result.drm = SearchResult.DRM_UNLOCKED
                result.formats = field(book, 'formatos')
                result.cover_url = field(book, 'portada')
                yield result
示例#36
0
    def search(self, query, max_results=10, timeout=60):
        """Search bookoteka.pl and yield one SearchResult per EBOOK list item.

        At most ``max_results`` DRM-unlocked results are yielded; ``timeout``
        is forwarded to the browser's ``open`` call.
        """
        site = 'http://bookoteka.pl'
        url = site + '/list?search=' + urllib.quote_plus(query) + '&cat=1&hp=1&type=1'

        opener = browser()

        def text_of(node, path, sep=''):
            # Join every string the XPath expression selects with ``sep``.
            return sep.join(node.xpath(path))

        remaining = max_results
        with closing(opener.open(url, timeout=timeout)) as response:
            tree = html.fromstring(response.read())
            for item in tree.xpath('//li[@class="EBOOK"]'):
                if remaining <= 0:
                    break

                link = text_of(item, './/a[@class="item_link"]/@href')
                if not link:
                    continue

                remaining -= 1

                result = SearchResult()
                # Cover and detail links get the site root prepended.
                result.cover_url = site + text_of(item, './/a[@class="item_link"]/img/@src')
                result.title = text_of(item, './/div[@class="shelf_title"]/a/text()').strip()
                result.author = text_of(item, './/div[@class="shelf_authors"][1]/text()').strip()
                # Dots in the price text are replaced with commas.
                result.price = text_of(item, './/span[@class="EBOOK"]/text()').replace('.', ',')
                result.detail_item = site + link.strip()
                result.drm = SearchResult.DRM_UNLOCKED
                result.formats = text_of(
                    item, './/a[@class="fancybox protected"]/text()', ', ').strip()

                yield result
示例#37
0
    def search(self, query, max_results=10, timeout=60):
        """Search kobobooks.com and yield one SearchResult per flowview item.

        At most ``max_results`` results are yielded. No author is set on the
        results; every result is reported as EPUB with unknown DRM status.
        """
        url = 'http://www.kobobooks.com/search/search.html?q=' + urllib.quote_plus(query)

        opener = browser()

        remaining = max_results
        with closing(opener.open(url, timeout=timeout)) as response:
            tree = html.fromstring(response.read())
            for item in tree.xpath('//ul[contains(@class, "flowview-items")]/li'):
                if remaining <= 0:
                    break

                href = ''.join(item.xpath('./a[contains(@class, "block-link")]/@href'))
                if not href:
                    continue
                # Drop the first character of the link before it is appended
                # to the store root below.
                store_path = href[1:]

                button_text = ''.join(
                    item.xpath('.//a[contains(@class, "primary-button")]//text()'))
                image_src = ''.join(item.xpath('.//img[1]/@src'))
                heading = ''.join(
                    item.xpath('.//p[contains(@class, "flowview-item-title")]//text()'))

                remaining -= 1

                result = SearchResult()
                result.cover_url = 'http:%s' % image_src
                result.title = heading.strip()
                result.price = button_text.strip()
                result.detail_item = 'http://store.kobobooks.com/' + store_path.strip()
                result.formats = 'EPUB'
                result.drm = SearchResult.DRM_UNKNOWN

                yield result
示例#38
0
    def search(self, query, max_results=10, timeout=60):
        """Search ebookshoppe.com and yield one SearchResult per product.

        The request carries a Referer header pointing at the site root.
        After the listing fields are filled in, ``get_author_and_formats``
        is called on the result; results that still have no author are not
        yielded, although they count towards ``max_results``.
        """
        url = 'http://www.ebookshoppe.com/search.php?search_query=' + quote(query)
        opener = browser()
        opener.addheaders = [("Referer", "http://www.ebookshoppe.com/")]

        remaining = max_results
        with closing(opener.open(url, timeout=timeout)) as response:
            tree = html.fromstring(response.read())
            for product in tree.xpath('//ul[@class="ProductList"]/li'):
                if remaining <= 0:
                    break

                link = ''.join(product.xpath(
                    './div[@class="ProductDetails"]/'
                    'strong/a/@href')).strip()
                if not link:
                    continue
                image_src = ''.join(product.xpath('./div[@class="ProductImage"]/a/img/@src'))
                heading = ''.join(product.xpath('./div[@class="ProductDetails"]/strong/a/text()'))
                price_text = ''.join(product.xpath('./div[@class="ProductPriceRating"]/em/text()'))
                remaining -= 1

                result = SearchResult()
                result.cover_url = image_src
                result.title = heading.strip()
                result.price = price_text
                result.drm = SearchResult.DRM_UNLOCKED
                result.detail_item = link

                self.get_author_and_formats(result, timeout)
                if result.author:
                    yield result
示例#39
0
    def search(self, query, max_results=10, timeout=60):
        """Search woblink.com through a forked JS browser job.

        A ``limit`` of 20 or 30 is appended to the URL when ``max_results``
        exceeds 10 or 20 respectively. The page is fetched by running
        ``get_results`` via ``fork_job``; a ``WorkerError`` is re-raised as a
        plain ``Exception`` carrying the worker's traceback. At most
        ``max_results`` results are yielded, marked DRM-locked when their
        format text contains "DRM".
        """
        url = 'http://woblink.com/katalog-ebooki?query=' + urllib.quote_plus(query.encode('utf-8'))
        if max_results > 20:
            url += '&limit=30'
        elif max_results > 10:
            url += '&limit=20'

        remaining = max_results

        try:
            job_output = fork_job(js_browser,'get_results', (url, timeout,), module_is_source_code=True)
        except WorkerError as e:
            raise Exception('Could not get results: %s'%e.orig_tb)
        tree = html.fromstring(strip_encoding_declarations(job_output['result']))
        for book in tree.xpath('//div[@class="nw_katalog_lista_ksiazka"]'):
            if remaining <= 0:
                break

            link = ''.join(book.xpath(
                './/div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/@href'))
            if not link:
                continue

            image_src = ''.join(book.xpath(
                './/div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/img/@src'))
            heading = ''.join(book.xpath(
                './/h2[@class="nw_katalog_lista_ksiazka_detale_tytul"]/a[1]/text()'))
            writers = ', '.join(book.xpath(
                './/p[@class="nw_katalog_lista_ksiazka_detale_autor"]/a/text()'))
            price_text = ''.join(book.xpath('.//div[@class="nw_opcjezakupu_cena"]/text()'))
            format_text = ', '.join(book.xpath(
                './/p[@class="nw_katalog_lista_ksiazka_detale_format"]/span/text()'))

            result = SearchResult()
            result.cover_url = 'http://woblink.com' + image_src
            result.title = heading.strip()
            result.author = writers.strip()
            result.price = price_text + ' zł'
            result.detail_item = link.strip()
            result.formats = format_text
            if 'DRM' in format_text:
                result.drm = SearchResult.DRM_LOCKED
            else:
                result.drm = SearchResult.DRM_UNLOCKED

            remaining -= 1
            yield result
    def search(self, query, max_results=10, timeout=60):
        """Search ebookstore.sony.com and yield one SearchResult per review item.

        Items lacking a title element, title text, author or detail link are
        skipped. The price is "<currency> <amount>" when both are present,
        otherwise the translated "Not Available". At most ``max_results``
        results are yielded.
        """
        url = 'http://ebookstore.sony.com/search?keyword=%s' % urllib.quote_plus(query)

        opener = browser()

        remaining = max_results
        with closing(opener.open(url, timeout=timeout)) as response:
            tree = html.fromstring(response.read())
            review_items = tree.xpath(
                '//div[contains(@class, "searchResult")]/'
                'descendant::li[contains(@class, "hreview")]')
            for item in review_items:
                if remaining <= 0:
                    break

                currency = ''.join(item.xpath(
                    'descendant::div[@class="pricing"]/descendant::*[@class="currency"]/@title'
                )).strip()
                amount = ''.join(item.xpath(
                    'descendant::div[@class="pricing"]/descendant::*[@class="amount"]/text()'
                )).strip()
                result = SearchResult()
                if currency and amount:
                    result.price = currency + ' ' + amount
                else:
                    result.price = _('Not Available')

                headings = item.xpath('descendant::h3[@class="item"]')
                if not headings:
                    continue
                # Flatten the heading element to its text content.
                heading_text = etree.tostring(headings[0], method='text', encoding=unicode)
                if not heading_text:
                    continue
                result.title = heading_text.strip()

                result.author = ''.join(item.xpath(
                    'descendant::li[contains(@class, "author")]/'
                    'a[@class="fn"]/text()')).strip()
                if not result.author:
                    continue

                detail_link = ''.join(item.xpath(
                    'descendant::h3[@class="item"]'
                    '/descendant::a[@class="fn" and @href]/@href'))
                if not detail_link:
                    continue
                result.detail_item = detail_link

                remaining -= 1

                cover_src = ''.join(item.xpath(
                    'descendant::li[@class="coverart"]/'
                    'descendant::img[@src]/@src'))
                if cover_src:
                    # Make protocol-relative and site-relative links absolute.
                    if cover_src.startswith('//'):
                        cover_src = 'http:' + cover_src
                    elif cover_src.startswith('/'):
                        cover_src = 'http://ebookstore.sony.com' + cover_src
                    result.cover_url = url_slash_cleaner(cover_src)

                result.drm = SearchResult.DRM_UNKNOWN
                result.formats = 'Sony'

                yield result
示例#41
0
def search(query, max_results=10, timeout=60):
    """Search the Smashwords catalogue and yield the matching books.

    Fetches the Smashwords search page for ``query`` and scrapes every
    ``library-book`` entry found inside ``#pageContent``.

    :param query: Search terms; URL-quoted before being appended to the URL.
    :param max_results: Maximum number of results to yield.
    :param timeout: Timeout passed to ``br.open`` for the search request.
    :return: Generator of ``SearchResult`` objects, all marked
        ``DRM_UNLOCKED``.
    """
    url = 'https://www.smashwords.com/books/search?query=' + urllib.parse.quote(
        query)

    br = browser()
    try:
        # Best effort: browsers without set_simple_cookie simply search
        # without the 'adultOff' cookie.
        br.set_simple_cookie('adultOff',
                             'erotica',
                             '.smashwords.com',
                             path='/')
    except AttributeError:
        pass  # old version of mechanize

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath(
                '//div[@id="pageContent"]//div[@class="library-book"]'):
            if counter <= 0:
                break
            # Re-parse the entry as a standalone fragment so the absolute
            # ('//') XPath queries below only ever see this one book.
            data = html.fromstring(html.tostring(data))

            book_id = None
            id_a = ''.join(
                data.xpath('//a[contains(@class, "library-title")]/@href'))
            if id_a:
                # The identifier is the last path component of the title link.
                book_id = id_a.split('/')[-1]
            if not book_id:
                continue

            cover_url = ''.join(
                data.xpath('//img[contains(@class, "book-list-image")]/@src'))

            title = ''.join(
                data.xpath('.//a[contains(@class, "library-title")]/text()'))
            author = ''.join(data.xpath('.//a[@itemprop="author"]//text()'))

            price = ''.join(data.xpath('.//div[@class="subnote"]//text()'))
            if 'Price:' in price:
                try:
                    # Keep only the first whitespace-delimited token that
                    # follows the 'Price:' label.
                    price = price.partition('Price:')[2]
                    price = re.sub(r'\s', ' ', price).strip()
                    price = price.split(' ')[0].strip()
                except Exception:
                    price = 'Unknown'
            if price == 'Free!':
                price = '$0.00'

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = '/books/view/' + book_id.strip()
            s.drm = SearchResult.DRM_UNLOCKED

            yield s
示例#42
0
    def search(self, query, max_results=10, timeout=60):
        """Search woblink.com and yield one result per book/format group.

        A book offered as MOBI is yielded first as a separate DRM-free
        result (its title gets a ' MOBI' suffix); any remaining formats are
        then yielded as a single DRM-locked result. The total number of
        yielded results never exceeds ``max_results``.

        :param query: Search terms (encoded to UTF-8 and URL-quoted).
        :param max_results: Maximum number of results to yield; values above
            10 / 20 also request a longer listing via the ``limit`` parameter.
        :param timeout: Timeout passed to ``br.open``.
        :return: Generator of ``SearchResult`` objects.
        """
        url = 'http://woblink.com/publication?query=' + urllib.quote_plus(query.encode('utf-8'))
        if max_results > 10:
            if max_results > 20:
                url += '&limit=30'
            else:
                url += '&limit=20'

        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//div[@class="book-item"]'):
                if counter <= 0:
                    break

                detail_url = ''.join(data.xpath('.//td[@class="w10 va-t"]/a[1]/@href'))
                if not detail_url:
                    continue

                cover_url = ''.join(data.xpath('.//td[@class="w10 va-t"]/a[1]/img/@src'))
                title = ''.join(data.xpath('.//h2[@class="title"]/a[1]/text()'))
                author = ', '.join(data.xpath('.//p[@class="author"]/a/text()'))
                price = ''.join(data.xpath('.//div[@class="prices"]/span[1]/span/text()'))
                # Prices are shown with a decimal comma.
                price = price.replace('.', ',')
                # Format names are cut out of the icon file names: drop the
                # first 8 and last 4 characters, keep the part before '_'.
                # NOTE(review): presumably strips a fixed directory prefix and
                # the file extension - confirm against the site markup.
                formats = [form[8:-4].split('_')[0] for form in data.xpath('.//p[3]/img/@src')]

                s = SearchResult()
                s.cover_url = 'http://woblink.com' + cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price + ' zł'
                s.detail_item = detail_url.strip()

                # MOBI should be sent first,
                if 'MOBI' in formats:
                    t = copy.copy(s)
                    t.title += ' MOBI'
                    t.drm = SearchResult.DRM_UNLOCKED
                    t.formats = 'MOBI'
                    formats.remove('MOBI')

                    counter -= 1
                    yield t

                # and the remaining formats (if any) next. The counter is
                # re-checked because the MOBI result above may already have
                # used up the last allowed slot.
                if formats and counter > 0:
                    if 'epub' in formats:
                        formats.remove('epub')
                        formats.append('WOBLINK')
                        if 'E Ink' in data.xpath('.//div[@class="prices"]/img/@title'):
                            formats.insert(0, 'EPUB')

                    s.drm = SearchResult.DRM_LOCKED
                    s.formats = ', '.join(formats).upper()

                    counter -= 1
                    yield s
示例#43
0
    def search(self, query, max_results=10, timeout=60):
        """Search the cdp.pl e-book catalogue, following result pages.

        For every listed product the detail page is fetched as well (with a
        quarter of ``timeout``) to read the author and the format.

        :param query: Search terms; URL-quoted before being sent.
        :param max_results: Maximum number of results to yield.
        :param timeout: Timeout passed to ``br.open`` for listing pages.
        :return: Generator of ``SearchResult`` objects marked
            ``DRM_UNLOCKED``.
        """

        br = browser()
        page = 1

        counter = max_results
        # '> 0' (rather than plain truthiness) also stops for negative values.
        while counter > 0:
            with closing(
                    br.open(u'https://cdp.pl/ksiazki/e-book.html?q=' +
                            urllib.quote_plus(query) + '&p=' + str(page),
                            timeout=timeout)) as f:
                doc = html.fromstring(f.read())
                for data in doc.xpath('//ul[@class="grid-of-products"]/li'):
                    if counter <= 0:
                        break

                    detail_url = ''.join(
                        data.xpath('.//a[@class="product-image"]/@href'))
                    if not detail_url:
                        continue
                    # Only links containing 'ksiazki' are kept.
                    if 'ksiazki' not in detail_url:
                        continue

                    cover_url = ''.join(
                        data.xpath(
                            './/a[@class="product-image"]/img/@data-src'))
                    title = ''.join(data.xpath('.//h3[1]/a/@title'))
                    # The price is split across the span text and a <sup>
                    # child; they are joined with a decimal comma.
                    price = ''.join(
                        data.xpath('.//span[@class="custom_price"]/text()')
                    ) + ',' + ''.join(
                        data.xpath(
                            './/span[@class="custom_price"]/sup/text()'))
                    author = ''
                    formats = ''
                    with closing(br.open(detail_url.strip(),
                                         timeout=timeout / 4)) as nf:
                        idata = html.fromstring(nf.read())
                        author = ', '.join(
                            idata.xpath(
                                './/ul[@class="film-data"]/li[1]/p/text()'))
                        format_texts = idata.xpath(
                            '//div[@class="product-attributes-container"][2]/ul/li/span/text()'
                        )
                        # A detail page without the attribute list must not
                        # abort the whole search with an IndexError.
                        if format_texts:
                            formats = format_texts[-1]

                    counter -= 1

                    s = SearchResult()
                    s.cover_url = cover_url
                    s.title = title.strip()
                    s.author = author
                    s.price = price + ' zł'
                    s.detail_item = detail_url.strip()
                    s.drm = SearchResult.DRM_UNLOCKED
                    s.formats = formats.upper().strip()

                    yield s
                # Stop when the listing offers no link to a further page.
                if not doc.xpath('//span[@class="next-page"]/a'):
                    break
            page += 1
示例#44
0
 def search(self, query, max_results=10, timeout=60):
     """Yield a single placeholder result announcing the store's closure.

     ``query``, ``max_results`` and ``timeout`` are accepted but not used;
     no request is made.
     """
     notice = SearchResult()
     placeholder_fields = (
         ('title', 'Amazon required that this<br>store be permanently closed.'),
         ('author', ''),
         ('price', ''),
         ('detail_item', ''),
         ('drm', SearchResult.DRM_UNKNOWN),
     )
     for attr_name, value in placeholder_fields:
         setattr(notice, attr_name, value)
     yield notice
示例#45
0
    def search(self, query, max_results=10, timeout=60):
        """Search the woblink.com e-book catalogue.

        Yields at most ``max_results`` ``SearchResult`` objects scraped from
        a single listing page. A result is marked ``DRM_LOCKED`` when its
        format list contains 'EPUB DRM', otherwise ``DRM_UNLOCKED``.
        """
        url = 'http://woblink.com/katalog-e-book?query=' + urllib.quote_plus(query.encode('utf-8'))
        # Ask for a longer listing when more than the default 10 are wanted.
        if max_results > 20:
            url += '&limit=30'
        elif max_results > 10:
            url += '&limit=20'

        br = browser()
        remaining = max_results

        with closing(br.open(url, timeout=timeout)) as response:
            doc = html.fromstring(response.read())
            for entry in doc.xpath('//div[@class="nw_katalog_lista_ksiazka"]'):
                if remaining <= 0:
                    break

                detail_href = ''.join(entry.xpath('.//div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/@href'))
                if not detail_href:
                    continue

                cover_path = ''.join(entry.xpath('.//div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/img/@src'))
                title = ''.join(entry.xpath('.//h2[@class="nw_katalog_lista_ksiazka_detale_tytul"]/a[1]/text()'))
                authors = ', '.join(entry.xpath('.//h3[@class="nw_katalog_lista_ksiazka_detale_autor"]/a/text()'))
                raw_price = ''.join(entry.xpath('.//div[@class="nw_katalog_lista_ksiazka_opcjezakupu_cena"]/span/text()'))
                # Show the price with a decimal comma.
                price = re.sub('\.', ',', raw_price)
                formats = ', '.join(entry.xpath('.//p[@class="nw_katalog_lista_ksiazka_detale_formaty"]/span/text()'))

                result = SearchResult()
                result.cover_url = 'http://woblink.com' + cover_path
                result.title = title.strip()
                result.author = authors.strip()
                result.price = price + ' zł'
                result.detail_item = detail_href.strip()
                result.formats = formats
                result.drm = (SearchResult.DRM_LOCKED
                              if 'EPUB DRM' in formats
                              else SearchResult.DRM_UNLOCKED)

                remaining -= 1
                yield result
示例#46
0
    def search(self, query, max_results=10, timeout=60):
        """Search gandalf.com.pl, following result pages.

        :param query: UTF-8 encoded search terms; re-encoded to ISO 8859-2
            and URL-quoted for the request.
        :param max_results: Maximum number of results to yield.
        :param timeout: Timeout passed to ``br.open``.
        :return: Generator of ``SearchResult`` objects; ``drm`` is
            ``DRM_LOCKED`` when the entry mentions 'Zabezpieczenie: DRM',
            otherwise ``DRM_UNLOCKED``.
        """
        counter = max_results
        page = 1
        url = 'http://www.gandalf.com.pl/we/' + urllib.quote_plus(query.decode('utf-8').encode('iso8859_2')) + '/bdb'

        br = browser()

        # '> 0' (rather than plain truthiness) also stops for negative values.
        while counter > 0:
            # The first page has no page suffix; later pages append page - 1.
            with closing(br.open((url + str(page-1) + '/#s') if (page-1) else (url + '/#s'), timeout=timeout)) as f:
                doc = html.fromstring(f.read())
                for data in doc.xpath('//div[@class="box"]'):
                    if counter <= 0:
                        break

                    detail_url = ''.join(data.xpath('.//div[@class="info"]/h3/a/@href'))
                    if not detail_url:
                        continue

                    # The cover image name is derived from the link's id
                    # attribute (see s.cover_url below).
                    cover_url = ''.join(data.xpath('.//div[@class="info"]/h3/a/@id'))
                    title = ''.join(data.xpath('.//div[@class="info"]/h3/a/@title'))
                    formats = ''.join(data.xpath('.//div[@class="info"]/p[1]/text()'))
                    # The format is the first parenthesised part of that
                    # line; an entry without one gets an empty format rather
                    # than aborting the whole search with an IndexError.
                    format_match = re.search(r'\((.*?)\)', formats)
                    formats = format_match.group(1) if format_match else ''
                    author = ''.join(data.xpath('.//div[@class="info"]/h4/text() | .//div[@class="info"]/h4/span/text()'))
                    price = ''.join(data.xpath('.//div[@class="options"]/h3/text()'))
                    price = price.replace('PLN', 'zł')
                    price = price.replace('.', ',')
                    drm = data.xpath('boolean(.//div[@class="info" and contains(., "Zabezpieczenie: DRM")])')

                    counter -= 1

                    s = SearchResult()
                    s.cover_url = 'http://imguser.gandalf.com.pl/' + re.sub('p', 'p_', cover_url) + '.jpg'
                    s.title = title.strip()
                    s.author = author.strip()
                    s.price = price
                    s.detail_item = detail_url.strip()
                    if drm:
                        s.drm = SearchResult.DRM_LOCKED
                    else:
                        s.drm = SearchResult.DRM_UNLOCKED
                    s.formats = formats.upper().strip()

                    yield s
                # Stop when the results header contains no further boxes.
                if not doc.xpath('boolean(//div[@class="wyszukiwanie_podstawowe_header"]//div[@class="box"])'):
                    break
                page += 1
    def search(self, query, max_results=10, timeout=60):
        """Search the amazon.fr Kindle store and yield the Kindle books found.

        :param query: Search terms; non-ASCII characters are turned into
            percent escapes and spaces into '+'.
        :param max_results: Maximum number of results to yield.
        :param timeout: Timeout passed to ``br.open``.
        :return: Generator of ``SearchResult`` objects whose ``detail_item``
            is the ASIN; ``drm`` is always ``DRM_UNKNOWN``.
        """
        search_url = 'http://www.amazon.fr/s/?url=search-alias%3Ddigital-text&field-keywords='
        url = search_url + query.encode('ascii', 'backslashreplace').replace(
            '%', '%25').replace('\\x', '%').replace(' ', '+')
        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            # doc = html.fromstring(f.read().decode('latin-1', 'replace'))
            # Apparently amazon Europe is responding in UTF-8 now
            doc = html.fromstring(f.read())

            data_xpath = '//div[contains(@class, "result") and contains(@class, "product")]'
            format_xpath = './/span[@class="format"]/text()'
            cover_xpath = './/img[@class="productImage"]/@src'

            for data in doc.xpath(data_xpath):
                if counter <= 0:
                    break

                # Even though we are searching digital-text only Amazon will still
                # put in results for non Kindle books (author pages). So we need
                # to explicitly check if the item is a Kindle book and ignore it
                # if it isn't.
                book_format = ''.join(data.xpath(format_xpath))
                if 'kindle' not in book_format.lower():
                    continue

                # We must have an asin otherwise we can't easily reference the
                # book later.
                asin = ''.join(data.xpath("@name")).strip()
                if not asin:
                    continue

                cover_url = ''.join(data.xpath(cover_xpath))

                title = ''.join(data.xpath('.//a[@class="title"]/text()'))
                price = ''.join(
                    data.xpath(
                        './/div[@class="newPrice"]/span[contains(@class, "price")]/text()'
                    ))
                author = unicode(''.join(
                    data.xpath(
                        './/h3[@class="title"]/span[@class="ptBrand"]/text()'))
                                 )
                # Drop the leading 'de ' from the author line.
                if author.startswith('de '):
                    author = author[3:]

                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url.strip()
                s.title = title.strip()
                s.author = author.strip()
                s.price = price.strip()
                s.detail_item = asin
                s.formats = 'Kindle'
                s.drm = SearchResult.DRM_UNKNOWN

                yield s
示例#48
0
    def search(self, query, max_results=100, timeout=180):
        """Search e-knjiga.si and yield its books with direct download links.

        For every result table the book's detail page is fetched as well to
        read the cover, the description and the download links.

        :param query: Search terms; URL-quoted before being sent.
        :param max_results: Maximum number of results to yield.
        :param timeout: Timeout for the listing request and for every detail
            page request.
        :return: Generator of ``SearchResult`` objects marked
            ``DRM_UNLOCKED``. ``detail_item`` holds the description text and
            ``downloads`` maps each file extension to its URL.
        """
        url = 'http://www.e-knjiga.si/rezultati_cover.php?query=' + urllib2.quote(query)

        print("will search for: " + urllib2.quote(query) + ":\n  " + url)

        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:

            # Named 'doc' so that no outer name called 'html' is shadowed.
            doc = etree.HTML(f.read())

            # get list of books
            for book in doc.xpath("//table[@class='zebra']"):
                if counter <= 0:
                    break
                print(etree.tostring(book, pretty_print=True, method="html"))

                # NOTE(review): the '/[n]' steps and the 0-based '[0]' index
                # in these paths are unusual; their meaning depends on the
                # lxml version in use - confirm they select the intended
                # cells. The expressions are left exactly as they were.
                author = book.find('.//tr/[0]/td/[1]').text
                title = book.find('.//tr/[0]/td/[2]/a').text
                details = 'http://www.e-knjiga.si/' + book.find('.//tr/[0]/td/[2]/a').get("href")

                # get details; closing() releases the response even when
                # parsing fails, and the timeout keeps a stalled detail page
                # from blocking the search indefinitely.
                with closing(urllib2.urlopen(details, timeout=timeout)) as fo:
                    det = etree.HTML(fo.read())

                table = det.find(".//div[@id='center_container']").find('./table')
                cover = 'http://www.e-knjiga.si/' + table.find('.//tr/[1]/td/[1]/div/img').get("src")
                description = table.find(".//tr/[6]/td[@class='knjige_spremna']").text

                files = table.find('.//tr/[7]/td/[1]')
                links = ["http://www.e-knjiga.si/" + anchor.get("href")
                         for anchor in files.iter('a')]

                counter -= 1

                s = SearchResult()
                s.title = title
                s.author = author
                s.price = "0.00eur"
                s.drm = SearchResult.DRM_UNLOCKED
                s.detail_item = description

                # The file extension of each link doubles as its format name.
                file_types = []
                for link in links:
                    ftype = link.split(".")[-1]
                    s.downloads[ftype] = link
                    file_types.append(ftype)
                s.formats = ', '.join(file_types)

                s.cover_url = cover

                yield s
示例#49
0
    def search(self, query, max_results=10, timeout=60):
        """Search nexto.pl, paging through the listing ten offsets at a time.

        Yields at most ``max_results`` ``SearchResult`` objects. A result is
        marked ``DRM_UNLOCKED`` when one of its format titles contains
        'znak', otherwise ``DRM_LOCKED``.
        """
        base_url = ('http://www.nexto.pl/szukaj.xml?search-clause=' +
                    urllib.parse.quote_plus(query) + '&scid=1015')

        br = browser()
        offset = 0
        remaining = max_results

        while remaining:
            page_url = base_url + '&_offset=' + str(offset)
            with closing(br.open(page_url, timeout=timeout)) as response:
                doc = html.fromstring(response.read())
                for entry in doc.xpath('//ul[@class="productslist"]/li'):
                    if remaining <= 0:
                        break

                    detail_href = ''.join(
                        entry.xpath('.//div[@class="col-2"]/a/@href'))
                    if not detail_href:
                        continue

                    price = ''.join(
                        entry.xpath('.//strong[@class="nprice"]/text()'))

                    cover_url = ''.join(
                        entry.xpath('.//img[@class="cover"]/@src'))
                    # Un-escape slashes and swap the size parameters.
                    cover_url = re.sub(r'%2F', '/', cover_url)
                    cover_url = re.sub(r'widthMax=120&heightMax=200',
                                       'widthMax=64&heightMax=64', cover_url)

                    title = ''.join(entry.xpath('.//a[@class="title"]/text()'))
                    title = re.sub(r' – ebook', '', title)
                    authors = ', '.join(
                        entry.xpath('.//div[@class="col-7"]//h4//a/text()'))
                    formats = ', '.join(
                        entry.xpath('.//ul[@class="formats"]/li//b/text()'))
                    format_titles = entry.xpath(
                        './/ul[@class="formats"]/li//b/@title')
                    drm_free = re.search(r'znak', str(format_titles))

                    remaining -= 1

                    result = SearchResult()
                    # Anything not starting with 'http' gets the site prefix.
                    if cover_url.startswith('http'):
                        result.cover_url = cover_url
                    else:
                        result.cover_url = 'http://www.nexto.pl' + cover_url
                    result.title = title.strip()
                    result.author = authors.strip()
                    result.price = price.strip()
                    result.detail_item = detail_href.strip()
                    if drm_free:
                        result.drm = SearchResult.DRM_UNLOCKED
                    else:
                        result.drm = SearchResult.DRM_LOCKED
                    result.formats = formats.upper().strip()

                    yield result

                next_links = doc.xpath(
                    '//div[@class="listnavigator"]//a[@class="next"]')
                if not next_links:
                    break
            offset += 10
示例#50
0
    def search(self, query, max_results=10, timeout=60):
        """Search waterstones.com for e-books matching ``query``.

        Yields at most ``max_results`` ``SearchResult`` objects. ``drm`` is
        ``DRM_LOCKED`` when the product format cell mentions 'DRM', otherwise
        ``DRM_UNKNOWN``; ``formats`` lists 'ePub' and/or 'PDF' as advertised.
        """
        url = 'http://www.waterstones.com/waterstonesweb/simpleSearch.do?simpleSearchString=ebook+' + urllib2.quote(query)

        br = browser()
        remaining = max_results

        with closing(br.open(url, timeout=timeout)) as response:
            doc = html.fromstring(response.read())
            for pane in doc.xpath('//div[contains(@class, "results-pane")]'):
                if remaining <= 0:
                    break

                detail_href = ''.join(pane.xpath('./div/div/h2/a/@href')).strip()
                if not detail_href:
                    continue

                cover_url = ''.join(pane.xpath('.//div[@class="image"]/a/img/@src'))
                # Anything not starting with 'http' gets the site prefix.
                if not cover_url.startswith("http"):
                    cover_url = 'http://www.waterstones.com' + cover_url
                title = ''.join(pane.xpath('./div/div/h2/a/text()'))
                authors = ', '.join(pane.xpath('.//p[@class="byAuthor"]/a/text()'))
                price = ''.join(pane.xpath('.//p[@class="price"]/span[@class="priceRed2"]/text()'))
                has_drm = pane.xpath('boolean(.//td[@headers="productFormat" and contains(., "DRM")])')
                has_pdf = pane.xpath('boolean(.//td[@headers="productFormat" and contains(., "PDF")])')
                has_epub = pane.xpath('boolean(.//td[@headers="productFormat" and contains(., "EPUB")])')

                remaining -= 1

                result = SearchResult()
                result.cover_url = cover_url
                result.title = title.strip()
                result.author = authors.strip()
                result.price = price
                result.drm = (SearchResult.DRM_LOCKED
                              if has_drm else SearchResult.DRM_UNKNOWN)
                result.detail_item = detail_href
                advertised = (('ePub', has_epub), ('PDF', has_pdf))
                result.formats = ', '.join(
                    [label for label, present in advertised if present])

                yield result
示例#51
0
    def search(self, query, max_results=10, timeout=60):
        """Search the Barnes & Noble e-book store.

        Cover URLs are not present in the result markup itself; they are
        looked up by image id in the raw page text (``srcUrl: '...'``).

        :param query: Search terms.
        :param max_results: Maximum number of results to yield.
        :param timeout: Timeout passed to ``br.open``.
        :return: Generator of ``SearchResult`` objects with ``formats`` set
            to 'Nook' and ``drm`` set to ``DRM_UNKNOWN``.
        """
        url = 'http://www.barnesandnoble.com/s/%s?keyword=%s&store=ebook&view=list' % (
            query.replace(' ', '-'), urllib.parse.quote_plus(query))

        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            raw = f.read()
            doc = html.fromstring(raw)
            # The cover regex below is a str pattern, so it needs text: a
            # bytes response would make re.search raise TypeError.
            if isinstance(raw, bytes):
                raw_text = raw.decode('utf-8', 'replace')
            else:
                raw_text = raw
            for data in doc.xpath(
                    '//ol[contains(@class, "result-set")]/li[contains(@class, "result")]'
            ):
                if counter <= 0:
                    break

                detail_url = ''.join(
                    data.xpath(
                        './/div[contains(@class, "image-block")]/a/@href'))
                if not detail_url:
                    continue

                cover_url = ''
                cover_id = ''.join(
                    data.xpath(
                        './/img[contains(@class, "product-image")]/@id'))
                # Without an id the pattern would match an arbitrary image;
                # the id is escaped so it is always matched literally.
                if cover_id:
                    m = re.search(
                        r"%s'.*?srcUrl: '(?P<iurl>.*?)'.*?}" %
                        re.escape(cover_id), raw_text)
                    if m:
                        cover_url = m.group('iurl')

                title = ''.join(
                    data.xpath(
                        'descendant::p[@class="title"]//span[@class="name"]//text()'
                    )).strip()
                if not title:
                    continue

                author = ', '.join(
                    data.xpath(
                        './/ul[contains(@class, "contributors")]//a[contains(@class, "subtle")]//text()'
                    )).strip()
                price = ''.join(
                    data.xpath('.//a[contains(@class, "bn-price")]//text()'))

                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price.strip()
                s.detail_item = detail_url.strip()
                s.drm = SearchResult.DRM_UNKNOWN
                s.formats = 'Nook'

                yield s
示例#52
0
    def search(self, query, max_results=10, timeout=60):
        """Search the Sony e-book store.

        Entries lacking a title, an author or a detail link are skipped.

        :param query: Search terms; URL-quoted before being sent.
        :param max_results: Maximum number of results to yield.
        :param timeout: Timeout passed to ``br.open``.
        :return: Generator of ``SearchResult`` objects with ``formats`` set
            to 'Sony' and ``drm`` set to ``DRM_UNKNOWN``.
        """
        url = "http://ebookstore.sony.com/search?keyword=%s" % urllib.quote_plus(query)

        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for item in doc.xpath(
                '//div[contains(@class, "searchResult")]/' 'descendant::li[contains(@class, "hreview")]'
            ):
                if counter <= 0:
                    break

                curr = "".join(
                    item.xpath('descendant::div[@class="pricing"]/descendant::*[@class="currency"]/@title')
                ).strip()
                amt = "".join(
                    item.xpath('descendant::div[@class="pricing"]/descendant::*[@class="amount"]/text()')
                ).strip()
                s = SearchResult()
                # A price is only shown when both currency and amount exist.
                s.price = (curr + " " + amt) if (curr and amt) else _("Not Available")
                title = item.xpath('descendant::h3[@class="item"]')
                if not title:
                    continue
                title = etree.tostring(title[0], method="text", encoding=unicode)
                if not title:
                    continue
                s.title = title.strip()
                s.author = "".join(
                    item.xpath('descendant::li[contains(@class, "author")]/' 'a[@class="fn"]/text()')
                ).strip()
                if not s.author:
                    continue
                detail_url = "".join(
                    item.xpath('descendant::h3[@class="item"]' '/descendant::a[@class="fn" and @href]/@href')
                )
                if not detail_url:
                    continue
                # Same treatment as the cover URL below: '//host/...' only
                # lacks the scheme, while '/path' lacks scheme and host.
                # Prefixing 'http:' to a '/path' link would give 'http:/path'.
                if detail_url.startswith("//"):
                    detail_url = "http:" + detail_url
                elif detail_url.startswith("/"):
                    detail_url = "http://ebookstore.sony.com" + detail_url
                s.detail_item = detail_url

                counter -= 1

                cover_url = "".join(item.xpath('descendant::li[@class="coverart"]/' "descendant::img[@src]/@src"))
                if cover_url:
                    if cover_url.startswith("//"):
                        cover_url = "http:" + cover_url
                    elif cover_url.startswith("/"):
                        cover_url = "http://ebookstore.sony.com" + cover_url
                    s.cover_url = url_slash_cleaner(cover_url)

                s.drm = SearchResult.DRM_UNKNOWN
                s.formats = "Sony"

                yield s
示例#53
0
    def search(self, query, max_results=10, timeout=60):
        """Search chitanka.info for books and yield their download links.

        Nothing is searched (and nothing yielded) unless the stripped query
        consists of at least three Cyrillic letters, digits or whitespace
        characters. An HTTP 404 from the site ends the search silently;
        other HTTP errors propagate.
        """
        # check for cyrillic symbols before performing search
        uquery = unicode(query.strip(), 'utf-8')
        if re.search(u'^[а-яА-Я\\d\\s]{3,}$', uquery) is None:
            return

        base_url = 'http://chitanka.info'
        url = base_url + '/search?q=' + urllib2.quote(query)
        remaining = max_results

        # Download key -> XPath of the matching download link.
        download_links = (
            ('FB2', './/a[@class="dl dl-fb2"]/@href'),
            ('EPUB', './/a[@class="dl dl-epub"]/@href'),
            ('TXT', './/a[@class="dl dl-txt"]/@href'),
        )

        # search for book title
        br = browser()
        try:
            with closing(br.open(url, timeout=timeout)) as response:
                page_text = unicode(response.read(), 'utf-8')
                doc = html.fromstring(page_text)

                for entry in doc.xpath('//ul[@class="superlist booklist"]/li'):
                    if remaining <= 0:
                        break

                    book_href = ''.join(
                        entry.xpath('.//a[@class="booklink"]/@href')).strip()
                    if not book_href:
                        continue

                    remaining -= 1

                    result = SearchResult()
                    result.cover_url = ''.join(
                        entry.xpath('.//a[@class="booklink"]/img/@src')).strip()
                    result.title = ''.join(
                        entry.xpath('.//a[@class="booklink"]/i/text()')).strip()
                    result.author = ''.join(
                        entry.xpath('.//span[@class="bookauthor"]/a/text()')).strip()
                    result.detail_item = book_href
                    result.drm = SearchResult.DRM_UNLOCKED
                    for fmt_key, link_xpath in download_links:
                        link = ''.join(entry.xpath(link_xpath)).strip()
                        # The '.zip' part of the link is dropped.
                        result.downloads[fmt_key] = base_url + link.replace('.zip', '')
                    result.formats = 'FB2, EPUB, TXT, SFB'
                    yield result
        except urllib2.HTTPError as e:
            if e.code != 404:
                raise
示例#54
0
 def parse_book_details(self, node):
     """Build a ``SearchResult`` from a book-details ``node``.

     Each field is filled with whatever the ``text`` helper returns for the
     corresponding query arguments.
     """
     result = SearchResult()
     # Attribute name -> arguments passed to text() after ``node``.
     # NOTE(review): ``drm`` receives the helper's return value as-is;
     # confirm that consumers of this result accept that.
     field_queries = (
         ('title', ('.//*[@itemprop="name"]',)),
         ('author', ('.//*', 'bookdetails__authorname')),
         ('price', ('.//*', 'bookdetails__price')),
         ('cover_url', ('.//img[@itemprop="image"]', '', '/@src')),
         ('formats', ('.//*', 'book_info__format', '/span[2]/text()')),
         ('drm', ('.//*', 'book_info__drm', '/span[2]/text()')),
     )
     for attr_name, text_args in field_queries:
         setattr(result, attr_name, text(node, *text_args))
     return result
示例#55
0
    def search(self, query, max_results=10, timeout=60):
        """Search baenebooks.com and yield its DRM-free books.

        Only rows whose first-cell link starts with 'p-' are used. Author,
        price and cover come from each product's own page, which is fetched
        with a quarter of ``timeout``.
        """
        url = ('http://www.baenebooks.com/searchadv.aspx?IsSubmit=true&SearchTerm=' +
               urllib2.quote(query))

        br = browser()
        remaining = max_results

        with closing(br.open(url, timeout=timeout)) as response:
            doc = html.fromstring(response.read())
            for row in doc.xpath('//table//table//table//table//tr'):
                if remaining <= 0:
                    break

                product_href = ''.join(row.xpath('./td[1]/a/@href'))
                if not (product_href and product_href.startswith('p-')):
                    continue

                title = ''.join(row.xpath('./td[1]/a/text()'))
                cover_url = ''

                product_url = 'http://www.baenebooks.com/' + product_href.strip()
                with closing(br.open(product_url, timeout=timeout / 4)) as detail:
                    idata = html.fromstring(detail.read())

                    byline = ''.join(
                        idata.xpath(
                            '//span[@class="ProductNameText"]/../b/text()'))
                    # Keep what follows the last 'by '.
                    author = byline.split('by ')[-1]

                    price_text = ''.join(
                        idata.xpath('//span[@class="variantprice"]/text()'))
                    # Drop everything before the first '$'.
                    _before, dollar, amount = price_text.partition('$')
                    price = dollar + amount

                    # The number in the link ('p-<num>-') names the cover
                    # image element on the product page.
                    number_match = re.search(r'p-(?P<num>\d+)-',
                                             product_href.strip())
                    product_number = number_match.group('num') if number_match else ''
                    if product_number:
                        cover_url = 'http://www.baenebooks.com/' + ''.join(
                            idata.xpath(
                                '//img[@id="ProductPic%s"]/@src' % product_number))

                remaining -= 1

                result = SearchResult()
                result.cover_url = cover_url
                result.title = title.strip()
                result.author = author.strip()
                result.price = price
                result.detail_item = product_href.strip()
                result.drm = SearchResult.DRM_UNLOCKED
                result.formats = 'RB, MOBI, EPUB, LIT, LRF, RTF, HTML'

                yield result