示例#1
0
    def run(self):
        """Background-thread entry point: refresh the cached MobileRead book list.

        Progress is reported via Qt signals (update_details, total_changed,
        update_progress).  Returns early, without error, when the cache is
        fresh, the download fails, or the thread is cancelled via self._run.
        On success the parsed list and a timestamp are stored in self.config.
        """
        url = 'http://www.mobileread.com/forums/ebooks.php?do=getlist&type=html'

        self.update_details.emit(_('Checking last download date.'))
        last_download = self.config.get('last_download', None)
        # Don't update the book list if our cache is less than one week old.
        if last_download and (time.time() - last_download) < 604800:  # 604800 s = 7 days
            return

        self.update_details.emit(_('Downloading book list from MobileRead.'))
        # Download the book list HTML file from MobileRead.
        br = browser()
        raw_data = None
        try:
            with closing(br.open(url, timeout=self.timeout)) as f:
                raw_data = f.read()
        except:
            # Best-effort: any download failure simply skips this refresh.
            return

        # Bail out if nothing was downloaded or the thread was cancelled.
        if not raw_data or not self._run:
            return

        self.update_details.emit(_('Processing books.'))
        # Turn books listed in the HTML file into SearchResults's.
        books = []
        try:
            data = html.fromstring(raw_data)
            raw_books = data.xpath('//ul/li')
            self.total_changed.emit(len(raw_books))

            for i, book_data in enumerate(raw_books):
                self.update_details.emit(
                        _('%(num)s of %(tot)s books processed.') % dict(
                            num=i, tot=len(raw_books)))
                book = SearchResult()
                book.detail_item = ''.join(book_data.xpath('.//a/@href'))
                book.formats = ''.join(book_data.xpath('.//i/text()'))
                book.formats = book.formats.strip()

                # Link text is "Author: Title"; split on the first colon.
                text = ''.join(book_data.xpath('.//a/text()'))
                if ':' in text:
                    book.author, q, text = text.partition(':')
                book.author = book.author.strip()
                book.title = text.strip()
                books.append(book)

                # Cancellation discards all partial results.
                if not self._run:
                    books = []
                    break
                else:
                    self.update_progress.emit(i)
        except:
            # Best-effort parsing: keep whatever was collected so far.
            pass

        # Save the book list and it's create time.
        # NOTE(review): 'seralize_books' looks like a typo for
        # 'serialize_books' -- presumably it matches the helper defined on
        # this class; confirm before renaming.
        if books:
            self.config['book_list'] = self.seralize_books(books)
            self.config['last_download'] = time.time()
示例#2
0
    def run(self):
        """Worker-thread entry point: refresh the cached MobileRead book list.

        Emits update_details / total_changed / update_progress as it works,
        and persists the parsed list plus a download timestamp into
        self.config on success.  Cancellation (self._run going false)
        discards partial results.
        """
        url = 'http://www.mobileread.com/forums/ebooks.php?do=getlist&type=html'

        self.update_details.emit(_('Checking last download date.'))
        last_download = self.config.get('last_download', None)
        # A cache younger than one week (604800 seconds) is considered fresh.
        if last_download and (time.time() - last_download) < 604800:
            return

        self.update_details.emit(_('Downloading book list from MobileRead.'))
        # Fetch the raw HTML listing; any network failure skips this refresh.
        br = browser()
        raw_data = None
        try:
            with closing(br.open(url, timeout=self.timeout)) as f:
                raw_data = f.read()
        except:
            return

        if not raw_data or not self._run:
            return

        self.update_details.emit(_('Processing books.'))
        # Convert each <li> entry of the listing into a SearchResult.
        books = []
        try:
            data = html.fromstring(raw_data)
            entries = data.xpath('//ul/li')
            total = len(entries)
            self.total_changed.emit(total)

            for idx, entry in enumerate(entries):
                self.update_details.emit(
                    _('%(num)s of %(tot)s books processed.') %
                    dict(num=idx, tot=total))
                book = SearchResult()
                book.detail_item = ''.join(entry.xpath('.//a/@href'))
                book.formats = ''.join(entry.xpath('.//i/text()')).strip()

                # The link text reads "Author: Title".
                text = ''.join(entry.xpath('.//a/text()'))
                if ':' in text:
                    book.author, _sep, text = text.partition(':')
                book.author = book.author.strip()
                book.title = text.strip()
                books.append(book)

                # A cancellation request throws away the partial list.
                if not self._run:
                    books = []
                    break
                self.update_progress.emit(idx)
        except:
            pass

        # Persist the list together with its creation time.
        if books:
            self.config['book_list'] = self.seralize_books(books)
            self.config['last_download'] = time.time()
示例#3
0
    def search(self, query, max_results=10, timeout=60):
        """Search woblink.com and yield up to *max_results* SearchResults.

        The store serves 10 results per page by default; the limit query
        parameter is raised to 20 or 30 when the caller wants more.
        """
        url = 'http://woblink.com/katalog-e-book?query=' + urllib.quote_plus(query.encode('utf-8'))
        if max_results > 10:
            url += '&limit=30' if max_results > 20 else '&limit=20'

        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//div[@class="book-item backgroundmix"]'):
                if counter <= 0:
                    break

                # Without a detail link there is nothing to show the user.
                detail_href = ''.join(data.xpath('.//td[@class="w10 va-t mYHaveItYes"]/a[1]/@href'))
                if not detail_href:
                    continue

                cover_url = ''.join(data.xpath('.//td[@class="w10 va-t mYHaveItYes"]/a[1]/img/@src'))
                title = ''.join(data.xpath('.//h2[@class="title"]/a[1]/text()'))
                author = ', '.join(data.xpath('.//td[@class="va-t"]/h3/a/text()'))
                price = ''.join(data.xpath('.//div[@class="prices"]/span[1]/strong/span/text()'))
                # Polish prices use a decimal comma; the pattern is a literal
                # dot, so str.replace beats re.sub('\.', ...) (which also
                # relied on a non-raw escape sequence).
                price = price.replace('.', ',')
                # Icon paths presumably carry a fixed prefix and '.png'
                # suffix around the format token -- TODO confirm against the
                # current site markup.
                formats = [form[8:-4].split('.')[0] for form in data.xpath('.//p[3]/img/@src')]

                s = SearchResult()
                s.cover_url = 'http://woblink.com' + cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price + ' zł'
                s.detail_item = detail_href.strip()

                if 'epub_drm' in formats:
                    s.drm = SearchResult.DRM_LOCKED
                    s.formats = 'EPUB'
                elif 'pdf' in formats:
                    s.drm = SearchResult.DRM_LOCKED
                    s.formats = 'PDF'
                else:
                    s.drm = SearchResult.DRM_UNLOCKED
                    if 'MOBI_nieb' in formats:
                        formats.remove('MOBI_nieb')
                        formats.append('MOBI')
                    s.formats = ', '.join(formats).upper()

                # Every branch above produced exactly one result.
                counter -= 1
                yield s
示例#4
0
    def search(self, query, max_results=10, timeout=60):
        """Yield SearchResult objects for *query* from woblink.com."""
        url = 'http://woblink.com/katalog-e-book?query=' + urllib.quote_plus(query.encode('utf-8'))
        if max_results > 10:
            if max_results > 20:
                url += '&limit=30'
            else:
                url += '&limit=20'

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as response:
            page = html.fromstring(response.read())
            for node in page.xpath('//div[@class="book-item backgroundmix"]'):
                if remaining <= 0:
                    break

                # Skip entries without a detail-page link.
                book_href = ''.join(node.xpath('.//td[@class="w10 va-t mYHaveItYes"]/a[1]/@href'))
                if not book_href:
                    continue

                thumb = ''.join(node.xpath('.//td[@class="w10 va-t mYHaveItYes"]/a[1]/img/@src'))
                book_title = ''.join(node.xpath('.//h2[@class="title"]/a[1]/text()'))
                book_author = ', '.join(node.xpath('.//td[@class="va-t"]/h3/a/text()'))
                # Swap the decimal dot for the Polish decimal comma.
                raw_price = ''.join(node.xpath('.//div[@class="prices"]/span[1]/strong/span/text()'))
                raw_price = raw_price.replace('.', ',')
                fmts = [icon[8:-4].split('.')[0] for icon in node.xpath('.//p[3]/img/@src')]

                s = SearchResult()
                s.cover_url = 'http://woblink.com' + thumb
                s.title = book_title.strip()
                s.author = book_author.strip()
                s.price = raw_price + ' zł'
                s.detail_item = book_href.strip()

                if 'epub_drm' in fmts:
                    s.drm = SearchResult.DRM_LOCKED
                    s.formats = 'EPUB'
                elif 'pdf' in fmts:
                    s.drm = SearchResult.DRM_LOCKED
                    s.formats = 'PDF'
                else:
                    s.drm = SearchResult.DRM_UNLOCKED
                    if 'MOBI_nieb' in fmts:
                        fmts.remove('MOBI_nieb')
                        fmts.append('MOBI')
                    s.formats = ', '.join(fmts).upper()

                remaining -= 1
                yield s
示例#5
0
def search_kobo(query, max_results=10, timeout=60, write_html_to=None):
    """Search kobobooks.com and yield SearchResult objects.

    :param query: search terms, quoted into the search URL.
    :param max_results: stop after this many result items.
    :param timeout: network timeout passed to read_url.
    :param write_html_to: optional path; when given, the raw search page
        HTML is saved there (useful for debugging the scraper).
    """
    from css_selectors import Select
    url = 'https://www.kobobooks.com/search/search.html?q=' + quote_plus(query)
    raw = read_url(url, timeout=timeout)
    if write_html_to is not None:
        with open(write_html_to, 'w') as f:
            f.write(raw)
    doc = html.fromstring(raw)
    select = Select(doc)
    for i, item in enumerate(select('.result-items .item-wrapper.book')):
        if i == max_results:
            break
        # for/else idiom: the body breaks on the first match; the else arm
        # supplies the "not found" default.
        for img in select('.item-image img[src]', item):
            cover_url = img.get('src')
            # Protocol-relative URL -- pin it to https.
            if cover_url.startswith('//'):
                cover_url = 'https:' + cover_url
            break
        else:
            cover_url = None

        # The first h2.title provides the title text; its first link is the
        # detail-page URL.
        for p in select('h2.title', item):
            title = etree.tostring(p, method='text',
                                   encoding='unicode').strip()
            for a in select('a[href]', p):
                url = a.get('href')
                break
            else:
                url = None
            break
        else:
            title = None
        if title:
            # Append the subtitle (if any) to the displayed title.
            for p in select('p.subtitle', item):
                title += ' - ' + etree.tostring(
                    p, method='text', encoding='unicode').strip()

        authors = []
        for a in select('.contributors a.contributor-name', item):
            authors.append(
                etree.tostring(a, method='text', encoding='unicode').strip())
        authors = authors_to_string(authors)

        for p in select('p.price', item):
            price = etree.tostring(p, method='text',
                                   encoding='unicode').strip()
            break
        else:
            price = None

        # Only items with all essential fields become results.
        if title and authors and url:
            s = SearchResult()
            s.cover_url = cover_url
            s.title = title
            s.author = authors
            s.price = price
            s.detail_item = url
            s.formats = 'EPUB'
            s.drm = SearchResult.DRM_UNKNOWN

            yield s
示例#6
0
    def search(self, query, max_results=12, timeout=60):
        """Search virtualo.pl (restricted to format ids 4,6,3) and yield
        SearchResult objects.

        :param query: raw query string, URL-quoted into the request.
        :param max_results: maximum number of results to yield.
        :param timeout: network timeout in seconds.
        """
        url = 'http://virtualo.pl/?q=' + urllib.quote(
            query) + '&f=format_id:4,6,3'

        br = browser()
        # Text matching either phrase marks a watermarked / DRM-free title.
        no_drm_pattern = re.compile(r'Znak wodny|Brak')

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath(
                    '//div[@id="content"]//div[@class="list_box list_box_border"]'
            ):
                if counter <= 0:
                    break

                # Detail link; everything after '?q=' is the echoed query,
                # so keep only the part before it.
                id = ''.join(
                    data.xpath('.//div[@class="list_middle_left"]//a/@href')
                ).split(r'?q=')[0]
                if not id:
                    continue

                price = ''.join(
                    data.xpath(
                        './/span[@class="price"]/text() | .//span[@class="price abbr"]/text()'
                    ))
                cover_url = ''.join(
                    data.xpath(
                        './/div[@class="list_middle_left"]//a//img/@src'))
                title = ''.join(
                    data.xpath(
                        './/div[@class="list_title list_text_left"]/a/text()'))
                author = ', '.join(
                    data.xpath(
                        './/div[@class="list_authors list_text_left"]/a/text()'
                    ))
                # Format icon filenames presumably end in '_<format>.png';
                # extract the bare format token.
                formats = [
                    form.split('_')[-1].replace('.png', '')
                    for form in data.xpath(
                        './/div[@style="width:55%;float:left;text-align:left;height:18px;"]//a/span/img/@src'
                    )
                ]
                nodrm = no_drm_pattern.search(''.join(
                    data.xpath(
                        './/div[@style="width:45%;float:right;text-align:right;height:18px;"]//span[@class="prompt_preview"]/text()'
                    )))

                counter -= 1

                s = SearchResult()
                # Drop any size suffix appearing after '.jpg'.
                s.cover_url = cover_url.split('.jpg')[0] + '.jpg'
                s.title = title.strip()
                s.author = author.strip()
                s.price = price + ' zł'
                # NOTE(review): split('http://')[0] keeps the part BEFORE an
                # absolute-URL scheme -- looks intended to strip a duplicated
                # absolute URL; confirm against the site markup.
                s.detail_item = 'http://virtualo.pl' + id.strip().split(
                    'http://')[0]
                s.formats = ', '.join(formats).upper()
                s.drm = SearchResult.DRM_UNLOCKED if nodrm else SearchResult.DRM_LOCKED

                yield s
示例#7
0
    def search(self, query, max_results=20, timeout=60):
        """Search publio.pl, paging through the result list, and yield
        SearchResult objects.

        Follows the "next" pagination link until *max_results* items have
        been produced or no further page exists.
        """
        br = browser()

        counter = max_results
        page = 1
        while counter:
            with closing(
                br.open(
                    "http://www.publio.pl/e-booki,strona" + str(page) + ".html?q=" + urllib.quote(query),
                    timeout=timeout,
                )
            ) as f:
                doc = html.fromstring(f.read())
                for data in doc.xpath('//div[@class="item"]'):
                    if counter <= 0:
                        break

                    # The cover link doubles as the detail-page URL.
                    id = "".join(data.xpath('.//div[@class="img"]/a/@href'))
                    if not id:
                        continue

                    cover_url = "".join(data.xpath('.//div[@class="img"]/a/img/@data-original'))
                    title = "".join(data.xpath('.//div[@class="img"]/a/@title'))
                    title2 = "".join(data.xpath('.//div[@class="desc"]/h5//text()'))
                    if title2:
                        title = title + ". " + title2
                    # When the last detail row is labelled "Seria:", append
                    # the series name to the displayed title.
                    if (
                        "".join(
                            data.xpath('./div[@class="desc"]/div[@class="detailShortList"]/div[last()]/span/text()')
                        ).strip()
                        == "Seria:"
                    ):
                        series = "".join(
                            data.xpath('./div[@class="desc"]/div[@class="detailShortList"]/div[last()]/a/@title')
                        )
                        title = title + " (seria " + series + ")"
                    author = ", ".join(
                        data.xpath('./div[@class="desc"]/div[@class="detailShortList"]/div[@class="row"][1]/a/@title')
                    )
                    # A discounted price lives in <ins>; otherwise fall back
                    # to the plain text of the price box.
                    price = "".join(data.xpath('.//div[@class="priceBox tk-museo-slab"]/ins/text()'))
                    if not price:
                        price = "".join(data.xpath('.//div[@class="priceBox tk-museo-slab"]/text()')).strip()
                    formats = ", ".join(data.xpath('.//div[@class="formats"]/a/img/@alt'))

                    counter -= 1

                    s = SearchResult()
                    s.cover_url = "http://www.publio.pl" + cover_url
                    s.title = title.strip()
                    s.author = author
                    s.price = price
                    s.detail_item = "http://www.publio.pl" + id.strip()
                    # Format alt-texts mention "DRM" for protected titles.
                    s.drm = SearchResult.DRM_LOCKED if "DRM" in formats else SearchResult.DRM_UNLOCKED
                    s.formats = formats.replace(" DRM", "").strip()

                    yield s
                # Stop when no "next page" link is present.
                if not doc.xpath('boolean(//a[@class="next"])'):
                    break
                page += 1
示例#8
0
    def search(self, query, max_results=10, timeout=60):
        """Yield up to *max_results* ebook SearchResults from whsmith.co.uk."""
        url = ('http://www.whsmith.co.uk/search?keywordCategoryId=wc_dept_ebooks&results=60'
               '&page=1&keywords=' + urllib2.quote(query))

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as response:
            doc = html.fromstring(response.read())
            for product in doc.xpath('//li[@class="product"]'):
                if remaining <= 0:
                    break
                # Entries without a product link are not real results.
                href = ''.join(product.xpath('./a[@class="product_image_wrap"]/@href'))
                if not href:
                    continue
                href = 'http://www.whsmith.co.uk' + href
                cover = ''.join(product.xpath('.//img[@class="product_image"]/@src'))
                name = ''.join(product.xpath('.//h4[@class="product_title"]/text()'))
                by = ', '.join(product.xpath('.//span[@class="product_second"]/text()'))
                cost = ''.join(product.xpath('.//span[@class="price"]/text()'))
                remaining -= 1

                result = SearchResult()
                result.cover_url = cover
                result.title = name.strip()
                result.author = by.strip()
                result.price = cost
                result.drm = SearchResult.DRM_LOCKED
                result.detail_item = href
                result.formats = 'ePub'

                yield result
示例#9
0
    def search(self, query, max_results=10, timeout=60):
        """Search Amazon's Kindle store and yield SearchResult objects.

        The query is %-encoded by hand (backslashreplace, then '\\x' -> '%')
        because the upstream search URL expects latin-1 style escapes.
        """
        url = self.search_url + query.encode(
            'ascii', 'backslashreplace').replace('%', '%25').replace(
                '\\x', '%').replace(' ', '+')
        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read().decode('latin-1', 'replace'))

            data_xpath = '//div[contains(@class, "prod")]'
            format_xpath = './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
            asin_xpath = '@name'
            cover_xpath = './/img[@class="productImage"]/@src'
            title_xpath = './/h3[@class="newaps"]/a//text()'
            author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
            price_xpath = './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'

            for data in doc.xpath(data_xpath):
                if counter <= 0:
                    break

                # Even though we are searching digital-text only, Amazon will
                # still put in results for non-Kindle books (author pages).
                # So we need to explicitly check that the item is a Kindle
                # book and ignore it if it isn't.  ('book_format' rather than
                # 'format' to avoid shadowing the builtin.)
                book_format = ''.join(data.xpath(format_xpath))
                if 'kindle' not in book_format.lower():
                    continue

                # We must have an ASIN, otherwise we can't easily reference
                # the book later.
                asin = data.xpath(asin_xpath)
                if asin:
                    asin = asin[0]
                else:
                    continue

                cover_url = ''.join(data.xpath(cover_xpath))

                title = ''.join(data.xpath(title_xpath))
                author = ''.join(data.xpath(author_xpath))
                try:
                    # Text looks like "by Author (extra)"; keep just the name.
                    author = author.split('by ', 1)[1].split(" (")[0]
                except IndexError:
                    # No "by " marker present -- keep the raw text.
                    pass

                price = ''.join(data.xpath(price_xpath))

                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url.strip()
                s.title = title.strip()
                s.author = author.strip()
                s.price = price.strip()
                s.detail_item = asin.strip()
                s.formats = 'Kindle'

                yield s
示例#10
0
    def search(self, query, max_results=10, timeout=60):
        """Yield SearchResults from bubok.es' calibre reseller endpoint."""
        url = 'http://www.bubok.es/resellers/calibre_search/' + urllib.quote_plus(query)

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as response:
            doc = html.fromstring(response.read())
            for entry in doc.xpath('//div[contains(@class, "libro")]'):
                if remaining <= 0:
                    break

                # Each field of the result lives in its own classed div.
                detail = ''.join(entry.xpath('.//div[@class="url"]/text()'))
                name = ''.join(entry.xpath('.//div[@class="titulo"]/text()'))
                writer = ''.join(entry.xpath('.//div[@class="autor"]/text()'))
                cost = ''.join(entry.xpath('.//div[@class="precio"]/text()'))
                fmts = ''.join(entry.xpath('.//div[@class="formatos"]/text()'))
                cover = ''.join(entry.xpath('.//div[@class="portada"]/text()'))

                remaining -= 1

                result = SearchResult()
                result.title = name.strip()
                result.author = writer.strip()
                result.detail_item = detail.strip()
                result.price = cost.strip()
                result.drm = SearchResult.DRM_UNLOCKED
                result.formats = fmts.strip()
                result.cover_url = cover.strip()
                yield result
示例#11
0
    def search(self, query, max_results=10, timeout=60):
        """Searches LibGen for Books. Since the mirror links are not direct
        downloads, it should not provide these as `s.downloads`.
        """
        debug_print = partial(module_debug_print, 'LibgenStore:search:')
        debug_print('search:query = ', query)

        libgen_results = self.libgen.search(query)

        # Slicing already clamps to the list length, so the previous
        # min(max_results, len(...)) bound was redundant.
        for result in libgen_results.results[:max_results]:
            debug_print('result.title = ', result.title)

            for mirror in result.mirrors[:1]:  # Calibre only shows 1 anyway
                debug_print('result.mirror.url = ', mirror.url)

                s = SearchResult()

                s.store_name = PLUGIN_NAME
                s.cover_url = result.image_url
                # Fold language and file size into the displayed title.
                s.title = '{} ({}, {}{})'.format(
                    result.title, result.language, mirror.size, mirror.unit)
                s.author = result.authors
                s.price = '0.00'
                s.detail_item = result.md5  # the md5 doubles as the detail key
                s.drm = SearchResult.DRM_UNLOCKED
                s.formats = mirror.format
                s.plugin_author = PLUGIN_AUTHORS

                debug_print('s = ', s)

                yield s
示例#12
0
    def search(self, query, max_results=10, timeout=60):
        """Yield Nook SearchResults from barnesandnoble.com."""
        url = 'http://www.barnesandnoble.com/s/%s?keyword=%s&store=ebook' % (query.replace(' ', '-'), urllib.quote_plus(query))

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as response:
            doc = html.fromstring(response.read())
            for entry in doc.xpath('//ul[contains(@class, "result-set")]/li[contains(@class, "result")]'):
                if remaining <= 0:
                    break

                # The cover image link carries the detail-page URL.
                detail = ''.join(entry.xpath('.//div[contains(@class, "image-bounding-box")]/a/@href'))
                if not detail:
                    continue

                cover = ''.join(entry.xpath('.//img[contains(@class, "product-image")]/@src'))
                name = ''.join(entry.xpath('.//a[@class="title"]//text()'))
                writer = ', '.join(entry.xpath('.//a[@class="contributor"]//text()'))
                cost = ''.join(entry.xpath('.//div[@class="price-format"]//span[contains(@class, "price")]/text()'))

                remaining -= 1

                result = SearchResult()
                result.cover_url = cover
                result.title = name.strip()
                result.author = writer.strip()
                result.price = cost.strip()
                result.detail_item = detail.strip()
                result.drm = SearchResult.DRM_UNKNOWN
                result.formats = 'Nook'

                yield result
示例#13
0
    def search(self, query, max_results=10, timeout=60):
        """Search Haodoo (via its ScraperWiki view) and yield SearchResults.

        The view returns a JSON list of volumes; each may carry a "type"
        list mapping download format names to links.  All titles are free.
        """
        print("search!")
        q = query.decode('utf-8')

        url = "https://views.scraperwiki.com/run/haodooscraperview/?" + urlencode(
            {"q": q})
        print(url)

        br = browser()
        with closing(br.open(url, timeout=timeout)) as f:
            json_doc = f.read()
            # Guard clause: an empty response means nothing was scraped.
            if not json_doc:
                print("scrape nothing.")
                return
            result = json.loads(json_doc)
            for volume in result:
                s = SearchResult()
                s.title = volume['title']
                s.detail_item = volume['url']
                s.price = '$0.00'
                s.drm = SearchResult.DRM_UNLOCKED

                # dict.has_key() is Python-2-only; .get() truthiness covers
                # both "key missing" and "empty list".
                if volume.get('type'):
                    for t in volume['type']:
                        s.downloads[t['type']] = t['link']
                    s.formats = ', '.join(s.downloads.keys())
                yield s
示例#14
0
    def search(self, query, max_results=10, timeout=60):
        """Search Haodoo's ScraperWiki view and yield free SearchResults.

        Each JSON volume may include a "type" list of format/link pairs,
        which populate s.downloads and s.formats.
        """
        print("search!")
        q = query.decode('utf-8')

        url = "https://views.scraperwiki.com/run/haodooscraperview/?" + urlencode(
            {"q": q})
        print(url)

        br = browser()
        with closing(br.open(url, timeout=timeout)) as f:
            json_doc = f.read()
            if len(json_doc) > 0:
                result = json.loads(json_doc)
                for volume in result:
                    s = SearchResult()
                    s.title = volume['title']
                    s.detail_item = volume['url']
                    s.price = '$0.00'
                    s.drm = SearchResult.DRM_UNLOCKED

                    # dict.has_key() was removed in Python 3; `in` is the
                    # portable membership test.
                    if 'type' in volume and len(volume["type"]):
                        for t in volume["type"]:
                            s.downloads[t['type']] = t['link']
                        s.formats = ', '.join(s.downloads.keys())
                    yield s
            else:
                print("scrape nothing.")
    def search(self, query, max_results=25, timeout=60):
        """Yield DRM-free SearchResults from ebookpoint.pl."""
        url = 'http://ebookpoint.pl/search?qa=&szukaj=' + quote_plus(
            query.decode('utf-8').encode('iso-8859-2')) + '&serwisyall=0&wprzyg=0&wsprzed=1&wyczerp=0&formaty=em-p'

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as response:
            doc = html.fromstring(response.read())
            for entry in doc.xpath('//ul[@class="list"]/li'):
                if remaining <= 0:
                    break

                detail = ''.join(entry.xpath('./a/@href'))
                if not detail:
                    continue

                fmts = ', '.join(entry.xpath('.//ul[@class="book-type book-type-points"]//span[@class="popup"]/span/text()'))
                cover = ''.join(entry.xpath('.//p[@class="cover"]/img/@data-src'))
                name = ''.join(entry.xpath('.//div[@class="book-info"]/h3/a/text()'))
                writer = ''.join(entry.xpath('.//p[@class="author"]//text()'))
                cost = ''.join(entry.xpath('.//p[@class="price price-incart"]/a/ins/text()|.//p[@class="price price-add"]/a/text()'))

                remaining -= 1

                result = SearchResult()
                result.cover_url = cover
                result.title = name.strip()
                result.author = writer.strip()
                # Polish price notation: decimal comma instead of a dot.
                result.price = cost.replace('.', ',')
                result.detail_item = detail.strip()
                result.drm = SearchResult.DRM_UNLOCKED
                result.formats = fmts.upper()

                yield result
示例#16
0
    def search(self, query, max_results=20, timeout=60):
        """Yield PDF SearchResults from escapemagazine.pl."""
        url = 'http://www.escapemagazine.pl/wyszukiwarka?query=' + urllib.quote_plus(query)

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as response:
            doc = html.fromstring(response.read())
            for entry in doc.xpath('//div[@class="item item_short"]'):
                if remaining <= 0:
                    break

                # The title link doubles as the detail-page URL.
                detail = ''.join(entry.xpath('.//h2[@class="title"]/a[1]/@href'))
                if not detail:
                    continue

                name = ''.join(entry.xpath('.//h2[@class="title"]/a[1]/text()'))
                writer = ''.join(entry.xpath('.//div[@class="author"]/text()'))
                cost = ''.join(entry.xpath('.//span[@class="price_now"]/strong/text()')) + ' zł'
                cover = ''.join(entry.xpath('.//img[@class="cover"]/@src'))

                remaining -= 1

                result = SearchResult()
                result.cover_url = cover
                result.title = name.strip()
                result.author = writer.strip()
                result.price = cost
                result.detail_item = 'http://www.escapemagazine.pl' + detail.strip()
                result.drm = SearchResult.DRM_UNLOCKED
                result.formats = 'PDF'

                yield result
示例#17
0
    def search(self, query, max_results=10, timeout=60):
        """Search millsandboon.co.uk for ebooks and yield SearchResults."""
        base_url = 'https://www.millsandboon.co.uk'
        # Fixed: the query string previously began with a doubled '??',
        # which malformed the request URL.
        url = base_url + '/search.aspx?format=ebook&searchText=' + urllib2.quote(query)
        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//article[contains(@class, "group")]'):
                if counter <= 0:
                    break
                # The cover-image wrapper link is the detail-page URL.
                id_ = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/@href')).strip()
                if not id_:
                    continue

                cover_url = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/img/@src'))
                title = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/img/@alt')).strip()
                author = ''.join(data.xpath('.//a[@class="author"]/text()'))
                price = ''.join(data.xpath('.//div[@class="type-wrapper"]/ul/li[child::span[text()="eBook"]]/a/text()'))
                format_ = ''.join(data.xpath('.//p[@class="doc-meta-format"]/span[last()]/text()'))

                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price
                s.detail_item = id_
                # The store sells DRM-protected titles only.
                s.drm = SearchResult.DRM_LOCKED
                s.formats = format_

                yield s
示例#18
0
def search_amazon(query, max_results=10, timeout=60,
                  write_html_to=None,
                  base_url=SEARCH_BASE_URL,
                  base_query=SEARCH_BASE_QUERY,
                  field_keywords='k'
                  ):
    '''Search Amazon for Kindle editions matching query.

    query: search terms, placed under the field_keywords key.
    write_html_to: optional path; when given the raw result page is saved
        there for debugging.
    Yields up to max_results SearchResult objects.
    '''
    uquery = base_query.copy()
    uquery[field_keywords] = query

    def asbytes(x):
        # urlencode needs encoded values on python2; pass non-strings through.
        if isinstance(x, type('')):
            x = x.encode('utf-8')
        return x
    uquery = {asbytes(k):asbytes(v) for k, v in uquery.items()}
    url = base_url + '?' + urlencode(uquery)
    br = browser(user_agent=get_user_agent())

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
        if write_html_to is not None:
            with open(write_html_to, 'wb') as f:
                f.write(raw)
        doc = html.fromstring(raw)
        for result in doc.xpath('//div[contains(@class, "s-result-list")]//div[@data-index and @data-asin]'):
            # FIX: counter was decremented but never checked, so max_results
            # was not honoured; stop once enough results have been yielded.
            if counter <= 0:
                break
            kformat = ''.join(result.xpath('.//a[contains(text(), "{}")]//text()'.format(KINDLE_EDITION)))
            # Even though we are searching digital-text only Amazon will still
            # put in results for non Kindle books (author pages). So we need
            # to explicitly check if the item is a Kindle book and ignore it
            # if it isn't.
            if 'kindle' not in kformat.lower():
                continue
            asin = result.get('data-asin')
            if not asin:
                continue

            cover_url = ''.join(result.xpath('.//img/@src'))
            title = etree.tostring(result.xpath('.//h2')[0], method='text', encoding='unicode')
            adiv = result.xpath('.//div[contains(@class, "a-color-secondary")]')[0]
            aparts = etree.tostring(adiv, method='text', encoding='unicode').split()
            # NOTE(review): raises ValueError if the localized BY word is
            # absent from the byline — presumably Amazon always emits it.
            idx = aparts.index(BY)
            author = ' '.join(aparts[idx+1:]).split('|')[0].strip()
            price = ''
            for span in result.xpath('.//span[contains(@class, "a-price")]/span[contains(@class, "a-offscreen")]'):
                q = ''.join(span.xpath('./text()'))
                if q:
                    price = q
                    break

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url.strip()
            s.title = title.strip()
            s.author = author.strip()
            s.detail_item = asin.strip()
            s.price = price.strip()
            s.formats = 'Kindle'

            yield s
def open_search(url, query, max_results=10, timeout=60):
    # Search an OpenSearch-capable OPDS catalog.
    #
    # url: location of the OpenSearch description document.
    # query: terms substituted into the catalog's search URL template.
    # Yields SearchResult objects built from the Atom entries of the feed.
    description = Description(url)
    url_template = description.get_best_template()
    if not url_template:
        # The catalog advertises no usable search template.
        return
    oquery = Query(url_template)

    # set up initial values
    oquery.searchTerms = query
    oquery.count = max_results
    url = oquery.url()

    counter = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        doc = etree.fromstring(f.read())
        # local-name() matching makes the xpath namespace-agnostic.
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break

            counter -= 1

            s = SearchResult()

            # Default to the entry id; a buy link below may overwrite it.
            s.detail_item = ''.join(data.xpath('./*[local-name() = "id"]/text()')).strip()

            for link in data.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                type = link.get('type')

                if rel and href and type:
                    if 'http://opds-spec.org/thumbnail' in rel:
                        s.cover_url = href
                    elif 'http://opds-spec.org/image/thumbnail' in rel:
                        s.cover_url = href
                    elif 'http://opds-spec.org/acquisition/buy' in rel:
                        s.detail_item = href
                    elif 'http://opds-spec.org/acquisition/sample' in rel:
                        pass
                    elif 'http://opds-spec.org/acquisition' in rel:
                        if type:
                            # Derive the format name from the MIME type,
                            # e.g. 'application/epub+zip' -> 'EPUB'.
                            ext = guess_extension(type)
                            if ext:
                                ext = ext[1:].upper().strip()
                                s.downloads[ext] = href
            s.formats = ', '.join(s.downloads.keys()).strip()

            s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(data.xpath('./*[local-name() = "author"]//*[local-name() = "name"]//text()')).strip()

            # Use the first price element, if any, prefixed with its currency code.
            price_e = data.xpath('.//*[local-name() = "price"][1]')
            if price_e:
                price_e = price_e[0]
                currency_code = price_e.get('currencycode', '')
                price = ''.join(price_e.xpath('.//text()')).strip()
                s.price = currency_code + ' ' + price
                s.price = s.price.strip()

            yield s
示例#20
0
    def search(self, query, max_results=10, timeout=60):
        '''Search an Amazon Kindle store.

        Yields up to max_results SearchResult objects scraped from the
        store's HTML result page; only Kindle-format items are yielded.
        '''
        # Percent-encode the query for the store URL; non-ascii characters
        # pass through backslashreplace and are then turned into %-escapes.
        url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+')
        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())#.decode('latin-1', 'replace'))

            data_xpath = '//div[contains(@class, "prod")]'
            format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
            asin_xpath = '@name'
            cover_xpath = './/img[@class="productImage"]/@src'
            title_xpath = './/h3[@class="newaps"]/a//text()'
            author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]/text()'
            price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'

            for data in doc.xpath(data_xpath):
                if counter <= 0:
                    break

                # Even though we are searching digital-text only Amazon will still
                # put in results for non Kindle books (author pages). So we need
                # to explicitly check if the item is a Kindle book and ignore it
                # if it isn't.
                format_ = ''.join(data.xpath(format_xpath))
                if 'kindle' not in format_.lower():
                    continue

                # We must have an asin otherwise we can't easily reference the
                # book later.
                asin = data.xpath(asin_xpath)
                if asin:
                    asin = asin[0]
                else:
                    continue

                cover_url = ''.join(data.xpath(cover_xpath))

                title = ''.join(data.xpath(title_xpath))
                author = ''.join(data.xpath(author_xpath))
                try:
                    # Strip the localized leading article ('by', 'von', ...)
                    # and any trailing parenthesised annotation.
                    if self.author_article:
                        author = author.split(self.author_article, 1)[1].split(" (")[0]
                except Exception:
                    # FIX: narrowed from a bare except. The split is
                    # best-effort only; keep the raw author string on failure.
                    pass

                price = ''.join(data.xpath(price_xpath))

                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url.strip()
                s.title = title.strip()
                s.author = author.strip()
                s.price = price.strip()
                s.detail_item = asin.strip()
                s.drm = SearchResult.DRM_UNKNOWN
                s.formats = 'Kindle'

                yield s
示例#21
0
    def search(self, query, max_results=10, timeout=60):
        '''Search bookoteka.pl's ebook catalogue for query.

        Yields up to max_results SearchResult objects; all results are
        DRM-free.
        '''
        url = 'http://bookoteka.pl/list?search=' + urllib.quote_plus(query) + '&cat=1&hp=1&type=1'

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as response:
            doc = html.fromstring(response.read())
            for node in doc.xpath('//li[@class="EBOOK"]'):
                if remaining <= 0:
                    break

                detail_href = ''.join(node.xpath('.//a[@class="item_link"]/@href'))
                if not detail_href:
                    continue

                remaining -= 1

                result = SearchResult()
                result.cover_url = 'http://bookoteka.pl' + ''.join(
                    node.xpath('.//a[@class="item_link"]/img/@src'))
                result.title = ''.join(
                    node.xpath('.//div[@class="shelf_title"]/a/text()')).strip()
                result.author = ''.join(
                    node.xpath('.//div[@class="shelf_authors"][1]/text()')).strip()
                # Normalise the decimal separator to a comma (Polish locale).
                result.price = ''.join(
                    node.xpath('.//span[@class="EBOOK"]/text()')).replace('.', ',')
                result.detail_item = 'http://bookoteka.pl' + detail_href.strip()
                result.drm = SearchResult.DRM_UNLOCKED
                result.formats = ', '.join(
                    node.xpath('.//a[@class="fancybox protected"]/text()')).strip()

                yield result
示例#22
0
    def search(self, query, max_results=10, timeout=60):
        '''Search the Bubok reseller feed for query.

        Yields up to max_results DRM-free SearchResult objects.
        '''
        url = 'http://www.bubok.pt/resellers/calibre_search/' + urllib.quote_plus(query)

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as response:
            doc = html.fromstring(response.read())

            def field(node, css_class):
                # Every metadata value is plain text inside a class-named div.
                return ''.join(node.xpath('.//div[@class="%s"]/text()' % css_class))

            for node in doc.xpath('//div[contains(@class, "libro")]'):
                if remaining <= 0:
                    break

                remaining -= 1

                s = SearchResult()
                s.title = field(node, 'titulo').strip()
                s.author = field(node, 'autor').strip()
                s.detail_item = field(node, 'url').strip()
                s.price = field(node, 'precio').strip()
                s.drm = SearchResult.DRM_UNLOCKED
                s.formats = field(node, 'formatos').strip()
                s.cover_url = field(node, 'portada').strip()
                yield s
示例#23
0
    def search(self, query, max_results=10, timeout=60):
        '''Search the bookoteka.pl store and yield matching ebooks.'''
        base = 'http://bookoteka.pl'
        url = base + '/list?search=' + urllib.quote_plus(query) + '&cat=1&hp=1&type=1'

        br = browser()

        left = max_results
        with closing(br.open(url, timeout=timeout)) as response:
            tree = html.fromstring(response.read())
            for entry in tree.xpath('//li[@class="EBOOK"]'):
                if left <= 0:
                    break

                href = ''.join(entry.xpath('.//a[@class="item_link"]/@href'))
                if not href:
                    continue

                left -= 1

                res = SearchResult()
                res.cover_url = base + ''.join(
                    entry.xpath('.//a[@class="item_link"]/img/@src'))
                res.title = ''.join(
                    entry.xpath('.//div[@class="shelf_title"]/a/text()')).strip()
                res.author = ''.join(
                    entry.xpath('.//div[@class="shelf_authors"][1]/text()')).strip()
                # Prices are shown with a dot; swap in the Polish comma.
                res.price = ''.join(
                    entry.xpath('.//span[@class="EBOOK"]/text()')).replace('.', ',')
                res.detail_item = base + href.strip()
                res.drm = SearchResult.DRM_UNLOCKED
                res.formats = ', '.join(
                    entry.xpath('.//a[@class="fancybox protected"]/text()')).strip()

                yield res
示例#24
0
    def search(self, query, max_results=10, timeout=60):
        '''Search the Kobo store for ebooks matching query.'''
        url = 'http://www.kobobooks.com/search/search.html?q=' + urllib.quote_plus(query)

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as response:
            doc = html.fromstring(response.read())
            for item in doc.xpath('//ul[contains(@class, "flowview-items")]/li'):
                if remaining <= 0:
                    break

                href = ''.join(item.xpath('./a[contains(@class, "block-link")]/@href'))
                if not href:
                    continue
                # Drop the leading '/' before joining with the store base URL.
                href = href[1:]

                remaining -= 1

                s = SearchResult()
                # Image URLs are protocol-relative; pin them to http.
                s.cover_url = 'http:%s' % ''.join(item.xpath('.//img[1]/@src'))
                s.title = ''.join(item.xpath(
                    './/p[contains(@class, "flowview-item-title")]//text()')).strip()
                s.price = ''.join(item.xpath(
                    './/a[contains(@class, "primary-button")]//text()')).strip()
                s.detail_item = 'http://store.kobobooks.com/' + href.strip()
                s.formats = 'EPUB'
                s.drm = SearchResult.DRM_UNKNOWN

                yield s
示例#25
0
    def search(self, query, max_results=10, timeout=60):
        '''Search the eHarlequin ebook store for query.'''
        base = 'http://ebooks.eharlequin.com/'
        url = base + 'BANGSearch.dll?Type=FullText&FullTextField=All&FullTextCriteria=' + urllib2.quote(query)

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as response:
            doc = html.fromstring(response.read())
            for row in doc.xpath('//table[not(.//@class="sidelink")]/tr[.//ul[@id="details"]]'):
                if remaining <= 0:
                    break

                href = ''.join(row.xpath('.//ul[@id="details"]/li[@id="title-results"]/a/@href'))
                if not href:
                    continue

                remaining -= 1

                s = SearchResult()
                # The cover image lives inside the anchor pointing at the
                # same detail href.
                s.cover_url = ''.join(row.xpath('.//a[@href="%s"]/img/@src' % href))
                s.title = ''.join(row.xpath('.//ul[@id="details"]/li[@id="title-results"]/a/text()')).strip()
                s.author = ''.join(row.xpath('.//ul[@id="details"]/li[@id="author"][1]//a/text()')).strip()
                s.price = ''.join(row.xpath('.//div[@class="ourprice"]/font/text()')).strip()
                s.detail_item = base + href.strip()
                s.formats = 'EPUB'

                yield s
示例#26
0
    def search(self, query, max_results=25, timeout=60):
        '''Search ebookpoint.pl for DRM-free ebooks matching query.'''
        # The site expects the query encoded as ISO-8859-2 (Polish).
        url = 'http://ebookpoint.pl/search?qa=&szukaj=' + quote_plus(
            query.decode('utf-8').encode('iso-8859-2')) + '&serwisyall=0&wprzyg=0&wsprzed=1&wyczerp=0&formaty=em-p'

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as response:
            doc = html.fromstring(response.read())
            for entry in doc.xpath('//ul[@class="list"]/li'):
                if remaining <= 0:
                    break

                href = ''.join(entry.xpath('./a/@href'))
                if not href:
                    continue

                remaining -= 1

                s = SearchResult()
                s.cover_url = ''.join(entry.xpath('.//p[@class="cover"]/img/@data-src'))
                s.title = ''.join(entry.xpath('.//div[@class="book-info"]/h3/a/text()')).strip()
                s.author = ''.join(entry.xpath('.//p[@class="author"]//text()')).strip()
                # The price comes from either the in-cart or the add-to-cart
                # element; normalise the decimal separator to a comma.
                price = ''.join(entry.xpath(
                    './/p[@class="price price-incart"]/a/ins/text()|.//p[@class="price price-add"]/a/text()'))
                s.price = price.replace('.', ',')
                s.detail_item = href.strip()
                s.drm = SearchResult.DRM_UNLOCKED
                s.formats = ', '.join(entry.xpath(
                    './/ul[@class="book-type book-type-points"]//span[@class="popup"]/span/text()')).upper()

                yield s
    def search(self, query, max_results=10, timeout=60):
        '''Search whsmith.co.uk's ebook department for query.'''
        url = ('https://www.whsmith.co.uk/search?keywordCategoryId=wc_dept_ebooks&results=60'
               '&page=1&keywords=' + quote(query))

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as response:
            doc = html.fromstring(response.read())
            for product in doc.xpath('//li[@class="product"]'):
                if remaining <= 0:
                    break
                href = ''.join(product.xpath('./a[@class="product_image_wrap"]/@href'))
                if not href:
                    continue
                remaining -= 1

                s = SearchResult()
                # Detail links are site-relative; prepend the store origin.
                s.detail_item = 'https://www.whsmith.co.uk' + href
                s.cover_url = ''.join(product.xpath('.//img[@class="product_image"]/@src'))
                s.title = ''.join(product.xpath('.//h4[@class="product_title"]/text()')).strip()
                s.author = ', '.join(product.xpath('.//span[@class="product_second"]/text()')).strip()
                s.price = ''.join(product.xpath('.//span[@class="price"]/text()'))
                s.drm = SearchResult.DRM_LOCKED
                s.formats = 'ePub'

                yield s
示例#28
0
    def search(self, query, max_results=10, timeout=60):
        '''Search ebooks.foyles.co.uk for query.'''
        url = 'http://ebooks.foyles.co.uk/catalog/search/?query=' + urllib2.quote(query)

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as response:
            doc = html.fromstring(response.read())
            for item in doc.xpath('//div[@class="doc-item"]'):
                if remaining <= 0:
                    break
                href = ''.join(item.xpath('.//p[@class="doc-cover"]/a/@href')).strip()
                if not href:
                    continue

                remaining -= 1

                s = SearchResult()
                # Detail links are site-relative; prepend the store origin.
                s.detail_item = 'http://ebooks.foyles.co.uk' + href
                s.cover_url = ''.join(item.xpath('.//p[@class="doc-cover"]/a/img/@src'))
                s.title = ''.join(item.xpath('.//span[@class="title"]/a/text()')).strip()
                s.author = ', '.join(item.xpath(
                    './/span[@class="author"]/span[@class="author"]/text()')).strip()
                s.price = ''.join(item.xpath('.//span[@itemprop="price"]/text()')).strip()
                s.drm = SearchResult.DRM_LOCKED
                # The last span of the meta-format paragraph names the format.
                s.formats = ''.join(item.xpath('.//p[@class="doc-meta-format"]/span[last()]/text()'))

                yield s
示例#29
0
    def search(self, query, max_results=10, timeout=60):
        # Search cdp.pl's ebook section, following result pages until
        # max_results books are found or no next-page link remains.
        # For each hit a second request fetches the detail page to read
        # the author and available formats.

        br = browser()
        page = 1

        counter = max_results
        while counter:
            with closing(
                    br.open(u'https://cdp.pl/ksiazki/e-book.html?q=' +
                            urllib.quote_plus(query) + '&p=' + str(page),
                            timeout=timeout)) as f:
                doc = html.fromstring(f.read())
                for data in doc.xpath('//ul[@class="grid-of-products"]/li'):
                    if counter <= 0:
                        break

                    id = ''.join(
                        data.xpath('.//a[@class="product-image"]/@href'))
                    if not id:
                        continue
                    # Skip non-book products mixed into the result grid.
                    if 'ksiazki' not in id:
                        continue

                    cover_url = ''.join(
                        data.xpath(
                            './/a[@class="product-image"]/img/@data-src'))
                    title = ''.join(data.xpath('.//h3[1]/a/@title'))
                    # Whole złoty and grosze are separate elements; join
                    # them with a comma to form the displayed price.
                    price = ''.join(
                        data.xpath('.//span[@class="custom_price"]/text()')
                    ) + ',' + ''.join(
                        data.xpath(
                            './/span[@class="custom_price"]/sup/text()'))
                    author = ''
                    formats = ''
                    # Author and formats only appear on the detail page;
                    # use a shorter timeout for this secondary request.
                    with closing(br.open(id.strip(),
                                         timeout=timeout / 4)) as nf:
                        idata = html.fromstring(nf.read())
                        author = ', '.join(
                            idata.xpath(
                                './/ul[@class="film-data"]/li[1]/p/text()'))
                        formats = idata.xpath(
                            '//div[@class="product-attributes-container"][2]/ul/li/span/text()'
                        )[-1]

                    counter -= 1

                    s = SearchResult()
                    s.cover_url = cover_url
                    s.title = title.strip()
                    s.author = author
                    s.price = price + ' zł'
                    s.detail_item = id.strip()
                    s.drm = SearchResult.DRM_UNLOCKED
                    s.formats = formats.upper().strip()

                    yield s
                # Stop when there is no next-page link on this page.
                if not doc.xpath('//span[@class="next-page"]/a'):
                    break
            page += 1
    def search(self, query, max_results=10, timeout=60):
        # Search the Sony Reader store for query, yielding up to
        # max_results SearchResult objects. Entries missing a title,
        # author or detail link are skipped without consuming the quota.
        url = 'http://ebookstore.sony.com/search?keyword=%s' % urllib.quote_plus(
            query)

        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for item in doc.xpath(
                    '//div[contains(@class, "searchResult")]/'
                    'descendant::li[contains(@class, "hreview")]'):
                if counter <= 0:
                    break

                # Currency and amount are separate elements; both must be
                # present for a usable price.
                curr = ''.join(
                    item.xpath(
                        'descendant::div[@class="pricing"]/descendant::*[@class="currency"]/@title'
                    )).strip()
                amt = ''.join(
                    item.xpath(
                        'descendant::div[@class="pricing"]/descendant::*[@class="amount"]/text()'
                    )).strip()
                s = SearchResult()
                s.price = (curr + ' ' +
                           amt) if (curr and amt) else _('Not Available')
                title = item.xpath('descendant::h3[@class="item"]')
                if not title: continue
                # Flatten the heading element into plain text.
                title = etree.tostring(title[0],
                                       method='text',
                                       encoding=unicode)
                if not title: continue
                s.title = title.strip()
                s.author = ''.join(
                    item.xpath('descendant::li[contains(@class, "author")]/'
                               'a[@class="fn"]/text()')).strip()
                if not s.author: continue
                detail_url = ''.join(
                    item.xpath('descendant::h3[@class="item"]'
                               '/descendant::a[@class="fn" and @href]/@href'))
                if not detail_url: continue
                s.detail_item = detail_url

                # Only now is the result known to be valid, so only now is
                # the quota consumed.
                counter -= 1

                cover_url = ''.join(
                    item.xpath('descendant::li[@class="coverart"]/'
                               'descendant::img[@src]/@src'))
                if cover_url:
                    # Normalise protocol-relative and site-relative URLs.
                    if cover_url.startswith('//'):
                        cover_url = 'http:' + cover_url
                    elif cover_url.startswith('/'):
                        cover_url = 'http://ebookstore.sony.com' + cover_url
                    s.cover_url = url_slash_cleaner(cover_url)

                s.drm = SearchResult.DRM_UNKNOWN
                s.formats = 'Sony'

                yield s
示例#31
0
    def search(self, query, max_results=10, timeout=60):
        # Search woblink.com for query. A single book may be yielded twice:
        # once as a DRM-free MOBI result and once for the remaining
        # (DRM-locked) formats, each consuming one slot of max_results.
        url = 'http://woblink.com/publication?query=' + urllib.quote_plus(query.encode('utf-8'))
        # The site only pages in steps of 10/20/30 results.
        if max_results > 10:
            if max_results > 20:
                url += '&limit=30'
            else:
                url += '&limit=20'

        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//div[@class="book-item"]'):
                if counter <= 0:
                    break

                id = ''.join(data.xpath('.//td[@class="w10 va-t"]/a[1]/@href'))
                if not id:
                    continue

                cover_url = ''.join(data.xpath('.//td[@class="w10 va-t"]/a[1]/img/@src'))
                title = ''.join(data.xpath('.//h2[@class="title"]/a[1]/text()'))
                author = ', '.join(data.xpath('.//p[@class="author"]/a/text()'))
                price = ''.join(data.xpath('.//div[@class="prices"]/span[1]/span/text()'))
                # Normalise the decimal separator to a comma (Polish locale).
                price = re.sub('\.', ',', price)
                # Format names are derived from the icon filenames,
                # e.g. '/img/..._xxx.png' -> format token before '_'.
                formats = [ form[8:-4].split('_')[0] for form in data.xpath('.//p[3]/img/@src')]

                s = SearchResult()
                s.cover_url = 'http://woblink.com' + cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price + ' zł'
                s.detail_item = id.strip()

                # MOBI should be sent first,
                if 'MOBI' in formats:
                    # Yield a copy so the MOBI entry is independent of the
                    # remaining-formats entry below.
                    t = copy.copy(s)
                    t.title += ' MOBI'
                    t.drm = SearchResult.DRM_UNLOCKED
                    t.formats = 'MOBI'
                    formats.remove('MOBI')

                    counter -= 1
                    yield t

                # and the remaining formats (if any) next
                if formats:
                    if 'epub' in formats:
                        formats.remove('epub')
                        # Woblink's own app format replaces plain epub;
                        # E Ink-capable items also get a real EPUB entry.
                        formats.append('WOBLINK')
                        if 'E Ink' in data.xpath('.//div[@class="prices"]/img/@title'):
                            formats.insert(0, 'EPUB')

                    s.drm = SearchResult.DRM_LOCKED
                    s.formats = ', '.join(formats).upper()

                    counter -= 1
                    yield s
示例#32
0
def search(query, max_results=10, timeout=60, write_raw_to=None):
    # Search Project Gutenberg's mobile OPDS catalog.
    #
    # query: search terms for the OPDS search feed.
    # write_raw_to: optional path; when given the raw feed is saved there
    #     for debugging.
    # Yields SearchResult objects; entries without a title, author or any
    # downloadable format are skipped (after consuming a result slot).
    url = 'http://m.gutenberg.org/ebooks/search.opds/?query=' + urllib.quote_plus(query)

    counter = max_results
    br = browser(user_agent='calibre/'+__version__)
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
        if write_raw_to is not None:
            with open(write_raw_to, 'wb') as f:
                f.write(raw)
        doc = etree.fromstring(raw)
        # local-name() matching makes the xpath namespace-agnostic.
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break

            counter -= 1

            s = SearchResult()

            # We could use the <link rel="alternate" type="text/html" ...> tag from the
            # detail odps page but this is easier.
            id = ''.join(data.xpath('./*[local-name() = "id"]/text()')).strip()
            # Build the website URL from the digits of the entry id.
            s.detail_item = fix_url(url_slash_cleaner('%s/ebooks/%s' % (web_url, re.sub('[^\d]', '', id))))
            if not s.detail_item:
                continue

            s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(data.xpath('./*[local-name() = "content"]//text()')).strip()
            if not s.title or not s.author:
                continue

            # Get the formats and direct download links.
            with closing(br.open(id, timeout=timeout/4)) as nf:
                ndoc = etree.fromstring(nf.read())
                for link in ndoc.xpath('//*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'):
                    type = link.get('type')
                    href = link.get('href')
                    if type:
                        # Map the MIME type to an extension to use as the
                        # format name, e.g. 'application/epub+zip' -> 'EPUB'.
                        ext = mimetypes.guess_extension(type)
                        if ext:
                            ext = ext[1:].upper().strip()
                            s.downloads[ext] = fix_url(href)

            s.formats = ', '.join(s.downloads.keys())
            if not s.formats:
                continue

            # Covers may be delivered inline as base64 data: URIs.
            for link in data.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                type = link.get('type')

                if rel and href and type:
                    href = fix_url(href)
                    if rel in ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail'):
                        if href.startswith('data:image/png;base64,'):
                            s.cover_data = base64.b64decode(href.replace('data:image/png;base64,', ''))

            yield s
示例#33
0
def search(query, max_results=10, timeout=60, write_raw_to=None):
    '''Search the Project Gutenberg mobile OPDS catalog.

    Yields SearchResult objects for up to max_results entries. If
    write_raw_to is given, the raw OPDS feed is also saved to that path
    for debugging.
    '''
    url = 'http://m.gutenberg.org/ebooks/search.opds/?query=' + quote_plus(query)

    counter = max_results
    br = browser(user_agent='calibre/'+__version__)
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
        if write_raw_to is not None:
            # Use a distinct name so we do not shadow the open HTTP
            # response object f.
            with open(write_raw_to, 'wb') as dump:
                dump.write(raw)
        doc = etree.fromstring(raw)
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break

            counter -= 1

            s = SearchResult()

            # We could use the <link rel="alternate" type="text/html" ...> tag from the
            # detail odps page but this is easier.
            entry_id = fix_url(''.join(data.xpath('./*[local-name() = "id"]/text()')).strip())
            if not entry_id:
                # Without an id we can neither build the detail URL nor
                # fetch the per-entry acquisition feed below.
                continue
            s.detail_item = url_slash_cleaner('%s/ebooks/%s' % (web_url, re.sub(r'[^\d]', '', entry_id)))
            s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(data.xpath('./*[local-name() = "content"]//text()')).strip()
            if not s.title or not s.author:
                continue

            # Get the formats and direct download links from the entry's
            # own acquisition feed.
            with closing(br.open(entry_id, timeout=timeout/4)) as nf:
                ndoc = etree.fromstring(nf.read())
                for link in ndoc.xpath('//*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'):
                    mt = link.get('type')  # renamed: avoid shadowing builtin type()
                    href = link.get('href')
                    if mt:
                        ext = mimetypes.guess_extension(mt)
                        if ext:
                            ext = ext[1:].upper().strip()
                            s.downloads[ext] = fix_url(href)

            s.formats = ', '.join(s.downloads.keys())
            if not s.formats:
                continue

            # Covers are embedded as base64-encoded PNG data URIs.
            for link in data.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                mt = link.get('type')

                if rel and href and mt:
                    href = fix_url(href)
                    if rel in ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail'):
                        if href.startswith('data:image/png;base64,'):
                            cdata = href.replace('data:image/png;base64,', '')
                            if not isinstance(cdata, bytes):
                                cdata = cdata.encode('ascii')
                            s.cover_data = base64.b64decode(cdata)

            yield s
示例#34
0
    def search(self, query, max_results=10, timeout=60):
        '''Search nexto.pl for ebooks matching query.

        Yields up to max_results SearchResult objects, following the
        store's pagination ten results at a time.
        '''
        url = ('http://www.nexto.pl/szukaj.xml?search-clause=' +
               urllib.parse.quote_plus(query) + '&scid=1015')

        br = browser()
        offset = 0

        counter = max_results

        while counter:
            with closing(br.open(url + '&_offset=' + str(offset),
                                 timeout=timeout)) as f:
                doc = html.fromstring(f.read())
                for data in doc.xpath('//ul[@class="productslist"]/li'):
                    if counter <= 0:
                        break

                    id = ''.join(data.xpath('.//div[@class="col-2"]/a/@href'))
                    if not id:
                        continue

                    price = ''.join(
                        data.xpath('.//strong[@class="nprice"]/text()'))

                    cover_url = ''.join(
                        data.xpath('.//img[@class="cover"]/@src'))
                    # The store escapes slashes and serves a large cover;
                    # rewrite the URL to request the small thumbnail.
                    cover_url = re.sub(r'%2F', '/', cover_url)
                    cover_url = re.sub(r'widthMax=120&heightMax=200',
                                       'widthMax=64&heightMax=64', cover_url)
                    title = ''.join(data.xpath('.//a[@class="title"]/text()'))
                    title = re.sub(r' – ebook', '', title)
                    author = ', '.join(
                        data.xpath('.//div[@class="col-7"]//h4//a/text()'))
                    formats = ', '.join(
                        data.xpath('.//ul[@class="formats"]/li//b/text()'))
                    # DRM-free titles mention a watermark ("znak wodny")
                    # in the format tooltip text.
                    DrmFree = re.search(
                        r'znak',
                        str(data.xpath(
                            './/ul[@class="formats"]/li//b/@title')))

                    counter -= 1

                    s = SearchResult()
                    # Relative cover paths need the store's base URL
                    # prepended.
                    if cover_url.startswith('http'):
                        s.cover_url = cover_url
                    else:
                        s.cover_url = 'http://www.nexto.pl' + cover_url
                    s.title = title.strip()
                    s.author = author.strip()
                    s.price = price.strip()
                    s.detail_item = id.strip()
                    s.drm = SearchResult.DRM_UNLOCKED if DrmFree else SearchResult.DRM_LOCKED
                    s.formats = formats.upper().strip()

                    yield s
                # Stop paging when there is no "next page" link.
                if not doc.xpath(
                        '//div[@class="listnavigator"]//a[@class="next"]'):
                    break
            offset += 10
    def search(self, query, max_results=10, timeout=60):
        '''Search the Amazon.fr Kindle store.

        Yields up to max_results SearchResult objects.
        NOTE(review): this is Python 2 code (uses unicode()); the query
        encoding below also relies on py2 bytes/str semantics.
        '''
        search_url = 'http://www.amazon.fr/s/?url=search-alias%3Ddigital-text&field-keywords='
        # Hand-rolled percent-encoding: backslashreplace turns non-ASCII
        # characters into \xNN escapes, which are then rewritten as %NN.
        url = search_url + query.encode('ascii', 'backslashreplace').replace(
            '%', '%25').replace('\\x', '%').replace(' ', '+')
        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            # doc = html.fromstring(f.read().decode('latin-1', 'replace'))
            # Apparently amazon Europe is responding in UTF-8 now
            doc = html.fromstring(f.read())

            data_xpath = '//div[contains(@class, "result") and contains(@class, "product")]'
            format_xpath = './/span[@class="format"]/text()'
            cover_xpath = './/img[@class="productImage"]/@src'

            for data in doc.xpath(data_xpath):
                if counter <= 0:
                    break

                # Even though we are searching digital-text only Amazon will still
                # put in results for non Kindle books (author pages). So we need
                # to explicitly check if the item is a Kindle book and ignore it
                # if it isn't.
                format = ''.join(data.xpath(format_xpath))
                if 'kindle' not in format.lower():
                    continue

                # We must have an asin otherwise we can't easily reference the
                # book later.
                asin = ''.join(data.xpath("@name"))

                cover_url = ''.join(data.xpath(cover_xpath))

                title = ''.join(data.xpath('.//a[@class="title"]/text()'))
                price = ''.join(
                    data.xpath(
                        './/div[@class="newPrice"]/span[contains(@class, "price")]/text()'
                    ))
                author = unicode(''.join(
                    data.xpath(
                        './/h3[@class="title"]/span[@class="ptBrand"]/text()'))
                                 )
                # Strip the French "de " (by) prefix from the byline.
                if author.startswith('de '):
                    author = author[3:]

                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url.strip()
                s.title = title.strip()
                s.author = author.strip()
                s.price = price.strip()
                s.detail_item = asin.strip()
                s.formats = 'Kindle'
                s.drm = SearchResult.DRM_UNKNOWN

                yield s
示例#36
0
    def search(self, query, max_results=10, timeout=60):
        '''Search chitanka.info (Bulgarian free library).

        Yields up to max_results SearchResult objects with direct,
        DRM-free download links.
        NOTE(review): Python 2 only — uses unicode(), urllib2 and the
        old "except E, e" syntax below.
        '''
        # Chitanka hosts Bulgarian texts, so bail out early unless the
        # query is Cyrillic (digits and whitespace allowed).
        uquery = unicode(query.strip(), 'utf-8')
        reObj = re.search(u'^[а-яА-Я\\d\\s]{3,}$', uquery)
        if not reObj:
            return

        base_url = 'http://chitanka.info'
        url = base_url + '/search?q=' + urllib2.quote(query)
        counter = max_results

        # search for book title
        br = browser()
        try:
            with closing(br.open(url, timeout=timeout)) as f:
                f = unicode(f.read(), 'utf-8')
                doc = html.fromstring(f)

                for data in doc.xpath('//ul[@class="superlist booklist"]/li'):
                    if counter <= 0:
                        break

                    # The booklink href doubles as the detail item.
                    id = ''.join(
                        data.xpath('.//a[@class="booklink"]/@href')).strip()
                    if not id:
                        continue

                    counter -= 1

                    s = SearchResult()
                    s.cover_url = ''.join(
                        data.xpath(
                            './/a[@class="booklink"]/img/@src')).strip()
                    s.title = ''.join(
                        data.xpath(
                            './/a[@class="booklink"]/i/text()')).strip()
                    s.author = ''.join(
                        data.xpath(
                            './/span[@class="bookauthor"]/a/text()')).strip()
                    s.detail_item = id
                    s.drm = SearchResult.DRM_UNLOCKED
                    # Download links point at zipped files; strip the
                    # .zip suffix to get the plain format URL.
                    s.downloads['FB2'] = base_url + ''.join(
                        data.xpath('.//a[@class="dl dl-fb2"]/@href')).strip(
                        ).replace('.zip', '')
                    s.downloads['EPUB'] = base_url + ''.join(
                        data.xpath('.//a[@class="dl dl-epub"]/@href')).strip(
                        ).replace('.zip', '')
                    s.downloads['TXT'] = base_url + ''.join(
                        data.xpath('.//a[@class="dl dl-txt"]/@href')).strip(
                        ).replace('.zip', '')
                    s.formats = 'FB2, EPUB, TXT, SFB'
                    yield s
        except urllib2.HTTPError, e:
            # A 404 just means no results; anything else is a real error.
            if e.code == 404:
                return
            else:
                raise
示例#37
0
    def search(self, query, max_results=10, timeout=60):
        '''Search the Barnes & Noble Nook store.

        Yields up to max_results SearchResult objects.
        '''
        url = 'http://www.barnesandnoble.com/s/%s?keyword=%s&store=ebook&view=list' % (
            query.replace(' ', '-'), urllib.parse.quote_plus(query))

        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            raw = f.read()
            # The cover URL is extracted from inline javascript with a
            # str regex below; decode the response once so re.search is
            # not handed bytes (TypeError on Python 3).
            raw_text = raw.decode('utf-8', 'replace') if isinstance(raw, bytes) else raw
            doc = html.fromstring(raw)
            for data in doc.xpath(
                    '//ol[contains(@class, "result-set")]/li[contains(@class, "result")]'
            ):
                if counter <= 0:
                    break

                id = ''.join(
                    data.xpath(
                        './/div[contains(@class, "image-block")]/a/@href'))
                if not id:
                    continue

                # The cover src is filled in lazily by javascript;
                # recover it from the page's script text via the image id.
                cover_url = ''
                cover_id = ''.join(
                    data.xpath(
                        './/img[contains(@class, "product-image")]/@id'))
                m = re.search(r"%s'.*?srcUrl: '(?P<iurl>.*?)'.*?}" % cover_id,
                              raw_text)
                if m:
                    cover_url = m.group('iurl')

                title = ''.join(
                    data.xpath(
                        'descendant::p[@class="title"]//span[@class="name"]//text()'
                    )).strip()
                if not title:
                    continue

                author = ', '.join(
                    data.xpath(
                        './/ul[contains(@class, "contributors")]//a[contains(@class, "subtle")]//text()'
                    )).strip()
                price = ''.join(
                    data.xpath('.//a[contains(@class, "bn-price")]//text()'))

                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price.strip()
                s.detail_item = id.strip()
                s.drm = SearchResult.DRM_UNKNOWN
                s.formats = 'Nook'

                yield s
示例#38
0
    def search(self, query, max_results=10, timeout=60):
        url = 'http://www.waterstones.com/waterstonesweb/simpleSearch.do?simpleSearchString=ebook+' + urllib2.quote(
            query)

        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//div[contains(@class, "results-pane")]'):
                if counter <= 0:
                    break

                id = ''.join(data.xpath('./div/div/h2/a/@href')).strip()
                if not id:
                    continue
                cover_url = ''.join(
                    data.xpath('.//div[@class="image"]/a/img/@src'))
                if not cover_url.startswith("http"):
                    cover_url = 'http://www.waterstones.com' + cover_url
                title = ''.join(data.xpath('./div/div/h2/a/text()'))
                author = ', '.join(
                    data.xpath('.//p[@class="byAuthor"]/a/text()'))
                price = ''.join(
                    data.xpath(
                        './/p[@class="price"]/span[@class="priceRed2"]/text()')
                )
                drm = data.xpath(
                    'boolean(.//td[@headers="productFormat" and contains(., "DRM")])'
                )
                pdf = data.xpath(
                    'boolean(.//td[@headers="productFormat" and contains(., "PDF")])'
                )
                epub = data.xpath(
                    'boolean(.//td[@headers="productFormat" and contains(., "EPUB")])'
                )

                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price
                if drm:
                    s.drm = SearchResult.DRM_LOCKED
                else:
                    s.drm = SearchResult.DRM_UNKNOWN
                s.detail_item = id
                formats = []
                if epub:
                    formats.append('ePub')
                if pdf:
                    formats.append('PDF')
                s.formats = ', '.join(formats)

                yield s
示例#39
0
def search_kobo(query, max_results=10, timeout=60, write_html_to=None):
    '''Search the Kobo store.

    Yields up to max_results SearchResult objects. If write_html_to is
    given, the raw HTML of the results page is saved there for
    debugging.
    '''
    from css_selectors import Select
    url = 'http://www.kobobooks.com/search/search.html?q=' + urllib.quote_plus(query)

    br = browser()

    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
        if write_html_to is not None:
            # Use a distinct name so we do not shadow the open HTTP
            # response object f.
            with open(write_html_to, 'wb') as out:
                out.write(raw)
        doc = html.fromstring(raw)
        select = Select(doc)
        for i, item in enumerate(select('.result-items .item-wrapper.book')):
            if i == max_results:
                break
            for img in select('.item-image img[src]', item):
                cover_url = img.get('src')
                if cover_url.startswith('//'):
                    cover_url = 'http:' + cover_url
                break
            else:
                cover_url = None

            for p in select('p.title', item):
                title = etree.tostring(p, method='text', encoding=unicode).strip()
                for a in select('a[href]', p):
                    url = 'http://store.kobobooks.com' + a.get('href')
                    break
                else:
                    url = None
                break
            else:
                title = None

            authors = []
            for a in select('p.author a.contributor', item):
                authors.append(etree.tostring(a, method='text', encoding=unicode).strip())
            authors = authors_to_string(authors)

            for p in select('p.price', item):
                price = etree.tostring(p, method='text', encoding=unicode).strip()
                break
            else:
                price = None

            # Only yield complete results. Previously the yield was
            # outside this if: the first incomplete item raised NameError
            # (s unbound) and later incomplete items re-yielded the stale
            # previous result.
            if title and authors and url:
                s = SearchResult()
                s.cover_url = cover_url
                s.title = title
                s.author = authors
                s.price = price
                s.detail_item = url
                s.formats = 'EPUB'
                s.drm = SearchResult.DRM_UNKNOWN
                yield s
示例#40
0
 def parse_search_result(self, node):
     # Build a SearchResult from a single listing node of the results
     # page; all field extraction goes through the shared text() helper.
     result = SearchResult()
     result.detail_item = text(node, './/*', 'Item__title', '/a/@href')
     result.title = text(node, './/*', 'Item__title', '/a/text()')
     result.author = text(node, './/*', 'Item__authors')
     result.price = text(node, './/*', 'pricing__price') + ' kr'
     result.formats = text(node, './/*', 'Item__format-as-link')
     result.cover_url = text(node, './/img', 'Item__image', '/@data-src')
     return result
示例#41
0
    def search(self, query, max_results=10, timeout=60):
        '''Search ebooki.allegro.pl for DRM-free ebooks.

        Yields up to max_results SearchResult objects, following the
        store's pagination.
        NOTE(review): Python 2 code — uses urllib.quote.
        '''
        br = browser()
        page = 1

        counter = max_results
        while counter:
            with closing(
                    br.open('http://ebooki.allegro.pl/szukaj?fraza=' +
                            urllib.quote(query) + '&strona=' + str(page),
                            timeout=timeout)) as f:
                doc = html.fromstring(f.read().decode('utf-8'))
                for data in doc.xpath(
                        '//div[@class="listing-list"]/div[@class="listing-list-item"]'
                ):
                    if counter <= 0:
                        break

                    # The cover link href doubles as the detail path.
                    id = ''.join(
                        data.xpath(
                            './/div[@class="listing-cover-wrapper"]/a/@href'))
                    if not id:
                        continue

                    cover_url = ''.join(
                        data.xpath(
                            './/div[@class="listing-cover-wrapper"]/a/img/@src'
                        ))
                    title = ''.join(
                        data.xpath(
                            './/div[@class="listing-info"]/div[1]/a/text()'))
                    author = ', '.join(
                        data.xpath(
                            './/div[@class="listing-info"]/div[2]/a/text()'))
                    price = ''.join(
                        data.xpath('.//div[@class="book-price"]/text()'))
                    formats = ', '.join(
                        data.xpath(
                            './/div[@class="listing-buy-formats"]//div[@class="devices-wrapper"]/span[@class="device-label"]/span/text()'
                        ))

                    counter -= 1

                    s = SearchResult()
                    s.cover_url = 'http://ebooki.allegro.pl/' + cover_url
                    s.title = title.strip()
                    s.author = author.strip()
                    s.price = price
                    # id starts with a '/', so drop it before joining.
                    s.detail_item = 'http://ebooki.allegro.pl/' + id[1:]
                    s.formats = formats.upper()
                    s.drm = SearchResult.DRM_UNLOCKED

                    yield s
                # Stop paging when there is no "next page" arrow.
                if not doc.xpath(
                        '//a[@class="paging-arrow right-paging-arrow"]'):
                    break
            page += 1
示例#42
0
    def search(self, query, max_results=10, timeout=60):
        '''Search the Baen ebooks store.

        Each hit requires a second request to its product page for the
        author, price and cover. Yields up to max_results SearchResult
        objects.
        '''
        url = ('http://www.baenebooks.com/searchadv.aspx?IsSubmit=true&SearchTerm=' +
               urllib2.quote(query))

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for row in doc.xpath('//table//table//table//table//tr'):
                if remaining <= 0:
                    break

                # Only rows whose first cell links to a product page
                # ("p-<num>-...") are real results.
                detail = ''.join(row.xpath('./td[1]/a/@href'))
                if not detail or not detail.startswith('p-'):
                    continue

                title = ''.join(row.xpath('./td[1]/a/text()'))

                author = ''
                cover_url = ''
                price = ''

                detail_url = 'http://www.baenebooks.com/' + detail.strip()
                with closing(br.open(detail_url, timeout=timeout / 4)) as nf:
                    idata = html.fromstring(nf.read())
                    author = ''.join(idata.xpath(
                        '//span[@class="ProductNameText"]/../b/text()'))
                    author = author.split('by ')[-1]
                    price = ''.join(idata.xpath(
                        '//span[@class="variantprice"]/text()'))
                    # Keep only the dollar amount part of the price text.
                    pre, dollar, price = price.partition('$')
                    price = dollar + price

                    pnum = ''
                    mo = re.search(r'p-(?P<num>\d+)-', detail.strip())
                    if mo:
                        pnum = mo.group('num')
                    if pnum:
                        cover_url = 'http://www.baenebooks.com/' + ''.join(
                            idata.xpath(
                                '//img[@id="ProductPic%s"]/@src' % pnum))

                remaining -= 1

                s = SearchResult()
                s.cover_url = cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price
                s.detail_item = detail.strip()
                s.drm = SearchResult.DRM_UNLOCKED
                s.formats = 'RB, MOBI, EPUB, LIT, LRF, RTF, HTML'

                yield s
示例#43
0
 def parse_book_details(self, node):
     # Extract a SearchResult from a book-detail page node via the
     # shared text() helper.
     result = SearchResult()
     result.title = text(node, './/*[@itemprop="name"]')
     result.author = text(node, './/*[@itemprop="author"]')
     result.price = text(node, './/*', 'price ')
     result.cover_url = text(node, './/img[@itemprop="image"]', '', '/@src')
     result.formats = text(node, '//th[contains(., "Mediatyp")]/following-sibling::td[1]')
     # On this site the format string doubles as the DRM indicator.
     result.drm = result.formats
     return result
示例#44
0
 def parse_book_details(self, node):
     # Populate a SearchResult from the fields of a book-detail page.
     details = SearchResult()
     details.title = text(node, './/*[@itemprop="name"]')
     details.author = text(node, './/*', 'bookdetails__authorname')
     details.price = text(node, './/*', 'bookdetails__price')
     details.cover_url = text(node, './/img[@itemprop="image"]', '', '/@src')
     details.formats = text(node, './/*', 'book_info__format', '/span[2]/text()')
     details.drm = text(node, './/*', 'book_info__drm', '/span[2]/text()')
     return details
示例#45
0
    def search(self, query, max_results=10, timeout=60):
        '''Search the Sony Reader store.

        Yields up to max_results SearchResult objects.
        NOTE(review): Python 2 code — uses urllib.quote_plus and
        unicode.
        '''
        url = "http://ebookstore.sony.com/search?keyword=%s" % urllib.quote_plus(query)

        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for item in doc.xpath(
                '//div[contains(@class, "searchResult")]/' 'descendant::li[contains(@class, "hreview")]'
            ):
                if counter <= 0:
                    break

                # Price is split into a currency symbol (title attr) and
                # an amount.
                curr = "".join(
                    item.xpath('descendant::div[@class="pricing"]/descendant::*[@class="currency"]/@title')
                ).strip()
                amt = "".join(
                    item.xpath('descendant::div[@class="pricing"]/descendant::*[@class="amount"]/text()')
                ).strip()
                s = SearchResult()
                s.price = (curr + " " + amt) if (curr and amt) else _("Not Available")
                title = item.xpath('descendant::h3[@class="item"]')
                if not title:
                    continue
                title = etree.tostring(title[0], method="text", encoding=unicode)
                if not title:
                    continue
                s.title = title.strip()
                s.author = "".join(
                    item.xpath('descendant::li[contains(@class, "author")]/' 'a[@class="fn"]/text()')
                ).strip()
                if not s.author:
                    continue
                detail_url = "".join(
                    item.xpath('descendant::h3[@class="item"]' '/descendant::a[@class="fn" and @href]/@href')
                )
                if not detail_url:
                    continue
                # Protocol-relative detail links need a scheme.
                if detail_url.startswith("/"):
                    detail_url = "http:" + detail_url
                s.detail_item = detail_url

                counter -= 1

                cover_url = "".join(item.xpath('descendant::li[@class="coverart"]/' "descendant::img[@src]/@src"))
                if cover_url:
                    if cover_url.startswith("//"):
                        cover_url = "http:" + cover_url
                    elif cover_url.startswith("/"):
                        cover_url = "http://ebookstore.sony.com" + cover_url
                    s.cover_url = url_slash_cleaner(cover_url)

                s.drm = SearchResult.DRM_UNKNOWN
                s.formats = "Sony"

                yield s
示例#46
0
    def search(self, query, max_results=10, timeout=60):
        '''
        Search XinXii via its OpenSearch/OPDS feed.

        XinXii's open search url is:
        http://www.xinxii.com/catalog-search/query/?keywords={searchTerms}&amp;pw={startPage?}&amp;doc_lang={docLang}&amp;ff={docFormat},{docFormat},{docFormat}

        The template marks docLang and docFormat as required, but the
        search itself works without them, so they are simply omitted
        here. That is also why this cannot go through the standard
        OpenSearchOPDSStore search: it would insist on those attributes.

        XinXii's feed lacks some of the info the generic
        OpenSearchOPDSStore search handles, so this trimmed-down copy
        only extracts the parts that are present.
        '''

        url = 'http://www.xinxii.com/catalog-search/query/?keywords=' + urllib.quote_plus(query)

        remaining = max_results
        br = browser()
        with closing(br.open(url, timeout=timeout)) as f:
            feed = etree.fromstring(f.read())
            for entry in feed.xpath('//*[local-name() = "entry"]'):
                if remaining <= 0:
                    break
                remaining -= 1

                s = SearchResult()

                s.detail_item = ''.join(entry.xpath('./*[local-name() = "id"]/text()')).strip()

                # Prefer the "alternate" link for details; thumbnail
                # links provide the cover.
                for link in entry.xpath('./*[local-name() = "link"]'):
                    rel = link.get('rel')
                    href = link.get('href')
                    mt = link.get('type')
                    if not (rel and href and mt):
                        continue
                    if rel in ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail'):
                        s.cover_url = href
                    if rel == 'alternate':
                        s.detail_item = href

                s.formats = 'EPUB, PDF'

                s.title = ' '.join(entry.xpath('./*[local-name() = "title"]//text()')).strip()
                s.author = ', '.join(entry.xpath('./*[local-name() = "author"]//*[local-name() = "name"]//text()')).strip()

                price_nodes = entry.xpath('.//*[local-name() = "price"][1]')
                if price_nodes:
                    price_node = price_nodes[0]
                    currency_code = price_node.get('currencycode', '')
                    amount = ''.join(price_node.xpath('.//text()')).strip()
                    s.price = (currency_code + ' ' + amount).strip()

                yield s
示例#47
0
 def deseralize_books(self, sbooks):
     # Rebuild SearchResult objects from their serialized dict form.
     # (The method name's spelling is kept for caller compatibility.)
     def build(rec):
         b = SearchResult()
         b.author = rec.get('author', '')
         b.title = rec.get('title', '')
         b.detail_item = rec.get('detail_item', '')
         b.formats = rec.get('formats', '')
         return b
     return [build(rec) for rec in sbooks]
示例#48
0
def search_amazon(query, max_results=10, timeout=60,
                  write_html_to=None,
                  base_url=SEARCH_BASE_URL,
                  base_query=SEARCH_BASE_QUERY,
                  field_keywords='k'
                  ):
    '''Search the Amazon Kindle store and yield SearchResult objects.

    :param query: search terms; inserted into the query dict under
        ``field_keywords``
    :param max_results: maximum number of results to yield
    :param timeout: network timeout in seconds for the search page fetch
    :param write_html_to: optional path; when given, the raw HTML of the
        result page is saved there (useful for debugging scrapers)
    :param base_url: search endpoint
    :param base_query: base query parameters, copied before modification
    :param field_keywords: name of the keyword query parameter
    '''
    uquery = base_query.copy()
    uquery[field_keywords] = query

    def asbytes(x):
        # urlencode here expects byte strings; encode native str as UTF-8.
        if isinstance(x, type('')):
            x = x.encode('utf-8')
        return x
    uquery = {asbytes(k):asbytes(v) for k, v in uquery.items()}
    url = base_url + '?' + urlencode(uquery)
    br = browser(user_agent=get_user_agent())

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
        if write_html_to is not None:
            # Use a distinct name so the open HTTP response `f` is not
            # shadowed (the original reused `f` here).
            with open(write_html_to, 'wb') as out:
                out.write(raw)
        doc = html.fromstring(raw)
        for result in doc.xpath('//div[contains(@class, "s-result-list")]//div[@data-index and @data-asin]'):
            # BUG FIX: the original decremented `counter` but never checked
            # it, so max_results was ignored. Stop once the quota is used up.
            if counter <= 0:
                break

            kformat = ''.join(result.xpath('.//a[contains(text(), "Kindle Edition")]//text()'))
            # Even though we are searching digital-text only Amazon will still
            # put in results for non Kindle books (author pages). So we need
            # to explicitly check if the item is a Kindle book and ignore it
            # if it isn't.
            if 'kindle' not in kformat.lower():
                continue
            # We must have an ASIN, otherwise the book cannot be referenced
            # later.
            asin = result.get('data-asin')
            if not asin:
                continue

            cover_url = ''.join(result.xpath('.//img/@src'))
            title = etree.tostring(result.xpath('.//h5')[0], method='text', encoding='unicode')
            adiv = result.xpath('.//div[contains(@class, "a-color-secondary")]')[0]
            aparts = etree.tostring(adiv, method='text', encoding='unicode').split()
            # ROBUSTNESS: the '|' separator after the author is not always
            # present; the original raised ValueError in that case. Fall back
            # to taking everything after the leading "by" token.
            idx = aparts.index('|') if '|' in aparts else len(aparts)
            author = ' '.join(aparts[1:idx])
            price = ''.join(result.xpath('.//span[contains(@class, "a-price")]/span[contains(@class, "a-offscreen")]/text()'))

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url.strip()
            s.title = title.strip()
            s.author = author.strip()
            s.detail_item = asin.strip()
            s.price = price.strip()
            s.formats = 'Kindle'

            yield s
示例#49
0
    def search(self, query, max_results=10, timeout=60):
        '''Search the ebook.nl store and yield up to max_results SearchResults.

        Scrapes the advanced-search result page; each product table row is
        turned into a SearchResult with id, cover, title, author, price,
        formats and a DRM flag.

        :param query: search terms, URL-quoted into the request
        :param max_results: maximum number of results to yield
        :param timeout: network timeout in seconds for the page fetch
        '''
        url = ('http://www.ebook.nl/store/advanced_search_result.php?keywords='
               + urllib2.quote(query))
        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//table[contains(@class, "productListing")]/tr'):
                if counter <= 0:
                    break

                # Rows without a product-image link are not real results.
                details = data.xpath('./td/div[@class="prodImage"]/a')
                if not details:
                    continue
                details = details[0]
                # The store id is the last path component of the detail link,
                # with any query string stripped. NOTE: `id` shadows the builtin.
                id = ''.join(details.xpath('./@href')).strip()
                id = id[id.rfind('/')+1:]
                i = id.rfind('?')
                if i > 0:
                    id = id[:i]
                if not id:
                    continue
                cover_url = 'http://www.ebook.nl/store/' + ''.join(details.xpath('./img/@src'))
                title = ''.join(details.xpath('./img/@title')).strip()
                author = ''.join(data.xpath('./td/div[@class="prodTitle"]/h3/a/text()')).strip()
                price = ''.join(data.xpath('./td/div[@class="prodTitle"]/b/text()'))
                # Format and DRM flags come from Dutch marker text in the
                # product description ("Bestandsformaat: ...", "zonder DRM",
                # "watermerk" = watermark, i.e. no hard DRM).
                pdf = data.xpath('boolean(./td/div[@class="prodTitle"]/'
                                   'p[contains(text(), "Bestandsformaat: Pdf")])')
                epub = data.xpath('boolean(./td/div[@class="prodTitle"]/'
                                   'p[contains(text(), "Bestandsformaat: ePub")])')
                nodrm = data.xpath('boolean(./td/div[@class="prodTitle"]/'
                                   'p[contains(text(), "zonder DRM") or'
                                   '  contains(text(), "watermerk")])')
                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price
                if nodrm:
                    s.drm = SearchResult.DRM_UNLOCKED
                else:
                    s.drm = SearchResult.DRM_LOCKED
                s.detail_item = id
                formats = []
                if epub:
                    formats.append('ePub')
                if pdf:
                    formats.append('PDF')
                s.formats = ','.join(formats)

                yield s
示例#50
0
    def search(self, query, max_results=10, timeout=60):
        '''Search the ebook.de store and yield up to max_results SearchResults.

        Scrapes the category search page; each article container is turned
        into a SearchResult. DRM status is not determinable from the listing,
        so it is reported as unknown.

        :param query: search terms, URL-quoted into the request
        :param max_results: maximum number of results to yield
        :param timeout: network timeout in seconds for the page fetch
        '''
        url = ('http://www.ebook.de/de/pathSearch?nav=52122&searchString='
               + urllib2.quote(query))
        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//div[contains(@class, "articlecontainer")]'):
                if counter <= 0:
                    break

                # Containers without an info box are not product entries.
                details = data.xpath('./div[contains(@class, "articleinfobox")]')
                if not details:
                    continue
                details = details[0]
                # The anchor name attribute carries the store's article id.
                id_ = ''.join(details.xpath('./a/@name')).strip()
                if not id_:
                    continue
                title = ''.join(details.xpath('./h3[@class="title"]/a/text()')).strip()

                # Author lines read "von <name>" (German for "by"); strip it.
                author = ''.join(details.xpath('.//div[@class="author"]/text()')).strip()
                if author.startswith('von'):
                    author = author[4:]

                # Available formats are listed as binder-name badges.
                pdf = details.xpath(
                        'boolean(.//span[@class="bindername" and contains(text(), "pdf")]/text())')
                epub = details.xpath(
                        'boolean(.//span[@class="bindername" and contains(text(), "epub")]/text())')
                mobi = details.xpath(
                        'boolean(.//span[@class="bindername" and contains(text(), "mobipocket")]/text())')

                cover_url = ''.join(data.xpath('.//div[@class="coverimg"]/a/img/@src'))
                # Prices carry a footnote asterisk; drop it.
                price = ''.join(data.xpath('.//div[@class="preis"]/text()')).replace('*', '').strip()

                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price
                s.drm = SearchResult.DRM_UNKNOWN
                s.detail_item = id_
                formats = []
                if epub:
                    formats.append('ePub')
                if pdf:
                    formats.append('PDF')
                if mobi:
                    formats.append('MOBI')
                s.formats = ', '.join(formats)

                yield s
示例#51
0
    def search(self, query, max_results=10, timeout=60):
        '''Search the Sony Reader Store (AU) and yield SearchResults.

        Scrapes the search-result list; entries missing a title, author or
        detail link are skipped without consuming the result quota.

        :param query: search terms, quote_plus-encoded into SEARCH_URL
        :param max_results: maximum number of results to yield
        :param timeout: network timeout in seconds for the page fetch
        '''
        url = self.SEARCH_URL % urllib.quote_plus(query)

        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for item in doc.xpath('//div[@id="searchresult-list"]/descendant::div[contains(@class, "doc-item")]'):
                if counter <= 0:
                    break

                s = SearchResult()
                # Default shown when no price span is present.
                s.price = _('Not Available')
                p = ''.join(item.xpath('descendant::p[@class="doc-price"]/descendant::span[@itemprop="price"]/text()')).strip()
                if p:
                    s.price = 'AUD ' + p.split('$')[-1]

                title = item.xpath('descendant::h3[@class="doc-title"]')
                if not title:
                    continue
                title = etree.tostring(title[0], method='text', encoding=unicode)
                if not title:
                    continue
                # Append the subtitle, when present, after a colon.
                st = item.xpath('descendant::p[@class="doc-subtitle"]')
                if st:
                    st = etree.tostring(st[0], method='text', encoding=unicode)
                    if st and st.strip():
                        title = title.strip() + ': ' + st
                s.title = title.strip()
                aut = item.xpath('descendant::p[@class="doc-author"]')
                if not aut:
                    continue
                s.author = etree.tostring(aut[0], method='text', encoding=unicode).strip()
                if not s.author:
                    continue
                # Site-relative detail link; prefixed with the store host.
                du = ''.join(item.xpath('descendant::h3[position() = 1 and @class="doc-title"]/descendant::a[position() = 1 and @href]/@href')).strip()
                if not du:
                    continue
                detail_url = 'https://au.readerstore.sony.com'+du
                s.detail_item = detail_url

                # Only count entries that passed all the checks above.
                counter -= 1

                cover_url = ''.join(item.xpath(
                    'descendant::p[@class="doc-cover" and position() = 1]/'
                    'descendant::img[position() = 1 and @src]/@src'))
                if cover_url:
                    s.cover_url = url_slash_cleaner(cover_url)

                s.drm = SearchResult.DRM_UNKNOWN
                s.formats = 'Sony'

                yield s
示例#52
0
    def search(self, query, max_results=10, timeout=60):
        '''Search the UK Nook store and yield up to max_results SearchResults.

        The query appears twice in the URL: as a hyphenated path segment and
        as the URL-quoted keyword parameter.

        :param query: search terms
        :param max_results: maximum number of results to yield
        :param timeout: network timeout in seconds for the page fetch
        '''
        url = u'http://uk.nook.com/s/%s?s%%5Bdref%%5D=1&s%%5Bkeyword%%5D=%s' % (
            query.replace(' ', '-'), urllib.quote(query))

        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            raw = f.read()
            doc = html.fromstring(raw)
            for data in doc.xpath('//ul[contains(@class, "product_list")]/li'):
                if counter <= 0:
                    break

                # Detail link: strip the '/gb' country prefix, then make the
                # URL absolute against the store host.
                id_ = ''.join(
                    data.xpath('.//span[contains(@class, "image")]/a/@href'))
                if not id_:
                    continue
                if id_.startswith('/gb'):
                    id_ = id_[3:]
                id_ = 'http://uk.nook.com' + id_.strip()

                # Covers are lazy-loaded, so the real URL is in data-src.
                cover_url = ''.join(
                    data.xpath(
                        './/span[contains(@class, "image")]//img/@data-src'))

                title = ''.join(
                    data.xpath(
                        './/div[contains(@class, "title")]//text()')).strip()
                if not title:
                    continue

                author = ', '.join(
                    data.xpath(
                        './/div[contains(@class, "contributor")]//a/text()')
                ).strip()
                # Keep only digits, separators and the pound sign from the
                # action-button text (e.g. a "Buy for £x.yz" label).
                price = ''.join(
                    data.xpath('.//div[contains(@class, "action")]//a//text()')
                ).strip()
                price = re.sub(r'[^\d.,£]', '', price)

                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price.strip()
                s.detail_item = id_
                s.drm = SearchResult.DRM_UNKNOWN
                s.formats = 'Nook'

                yield s
示例#53
0
    def search(self, query, max_results=10, timeout=60):
        '''Search the French Amazon Kindle store and yield SearchResults.

        The query is manually percent-encoded (ASCII with backslashreplace,
        then the escape sequences rewritten as %XX) instead of using
        urllib's quoting helpers.

        :param query: search terms
        :param max_results: maximum number of results to yield
        :param timeout: network timeout in seconds for the page fetch
        '''
        search_url = "http://www.amazon.fr/s/?url=search-alias%3Ddigital-text&field-keywords="
        url = search_url + query.encode("ascii", "backslashreplace").replace("%", "%25").replace("\\x", "%").replace(
            " ", "+"
        )
        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            # doc = html.fromstring(f.read().decode('latin-1', 'replace'))
            # Apparently amazon Europe is responding in UTF-8 now
            doc = html.fromstring(f.read())

            data_xpath = '//div[contains(@class, "result") and contains(@class, "product")]'
            format_xpath = './/span[@class="format"]/text()'
            cover_xpath = './/img[@class="productImage"]/@src'

            for data in doc.xpath(data_xpath):
                if counter <= 0:
                    break

                # Even though we are searching digital-text only Amazon will still
                # put in results for non Kindle books (author pages). So we need
                # to explicitly check if the item is a Kindle book and ignore it
                # if it isn't.
                # NOTE: `format` shadows the builtin of the same name.
                format = "".join(data.xpath(format_xpath))
                if "kindle" not in format.lower():
                    continue

                # We must have an asin otherwise we can't easily reference the
                # book later.
                asin = "".join(data.xpath("@name"))

                cover_url = "".join(data.xpath(cover_xpath))

                title = "".join(data.xpath('.//a[@class="title"]/text()'))
                price = "".join(data.xpath('.//div[@class="newPrice"]/span[contains(@class, "price")]/text()'))
                # Author text reads "de <name>" (French for "by"); strip it.
                author = unicode("".join(data.xpath('.//h3[@class="title"]/span[@class="ptBrand"]/text()')))
                if author.startswith("de "):
                    author = author[3:]

                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url.strip()
                s.title = title.strip()
                s.author = author.strip()
                s.price = price.strip()
                s.detail_item = asin.strip()
                s.formats = "Kindle"
                s.drm = SearchResult.DRM_UNKNOWN

                yield s
示例#54
0
    def search(self, query, max_results=10, timeout=60):
        '''Search beam-ebooks.de and yield up to max_results SearchResults.

        Each result table row provides the detail link, cover, an alt text of
        the shape "eBook<author>: <title>", the price and format badges.
        All titles on this store are DRM free.
        '''
        url = 'http://www.beam-ebooks.de/suchergebnis.php?Type=&limit={0}&sw={1}'.format(
                                                    max_results, urllib2.quote(query))
        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as response:
            doc = html.fromstring(response.read())
            for row in doc.xpath('//table[tr/td/div[@class="stil2"]]'):
                if remaining <= 0:
                    break

                detail = ''.join(row.xpath('./tr/td[1]/a/@href')).strip()
                if not detail:
                    continue
                # Drop the leading site-relative prefix from the link.
                detail = detail[7:]

                cover = ''.join(row.xpath('./tr/td[1]/a/img/@src'))
                if cover:
                    cover = 'http://www.beam-ebooks.de' + cover

                # Alt text format: 'eBook<author>: <title>'. Skip rows that
                # do not match (no prefix, or no colon separator).
                alt_text = ''.join(row.xpath('./tr/td[1]/a/img/@alt'))
                head, sep, tail = alt_text.partition(':')
                if not (alt_text.startswith('eBook') and sep):
                    continue
                author_part = head[5:]
                title_part = tail

                price_text = ''.join(row.xpath('./tr/td[3]/text()'))
                has_pdf = row.xpath(
                        'boolean(./tr/td[3]/a/img[contains(@alt, "PDF")]/@alt)')
                has_epub = row.xpath(
                        'boolean(./tr/td[3]/a/img[contains(@alt, "ePub")]/@alt)')
                has_mobi = row.xpath(
                        'boolean(./tr/td[3]/a/img[contains(@alt, "Mobipocket")]/@alt)')

                remaining -= 1

                result = SearchResult()
                result.cover_url = cover
                result.title = title_part.strip()
                result.author = author_part.strip()
                result.price = price_text
                result.drm = SearchResult.DRM_UNLOCKED
                result.detail_item = detail
                result.formats = ', '.join(
                    fmt for flag, fmt in ((has_epub, 'ePub'),
                                          (has_pdf, 'PDF'),
                                          (has_mobi, 'MOBI')) if flag)

                yield result
示例#55
0
    def search(self, query, max_results=10, timeout=60):
        '''Search nexto.pl and yield up to max_results SearchResults.

        Paginates through result pages via the `_offset` parameter until the
        quota is filled or no "next" link remains. For each hit an extra
        request to the book's detail page is made to obtain the author.

        :param query: search terms, quote_plus-encoded into the request
        :param max_results: maximum number of results to yield
        :param timeout: network timeout in seconds (detail pages use a
            quarter of it)
        '''
        url = 'http://www.nexto.pl/szukaj.xml?search-clause=' + urllib.quote_plus(query) + '&scid=1015'

        br = browser()
        offset=0

        counter = max_results

        while counter:
            with closing(br.open(url + '&_offset=' + str(offset), timeout=timeout)) as f:
                doc = html.fromstring(f.read())
                for data in doc.xpath('//ul[@class="productslist"]/li'):
                    if counter <= 0:
                        break

                    # NOTE: `id` shadows the builtin; it is the site-relative
                    # detail link for this book.
                    id = ''.join(data.xpath('.//div[@class="cover_container"]/a[1]/@href'))
                    if not id:
                        continue

                    price = ''.join(data.xpath('.//strong[@class="nprice"]/text()'))

                    # Request a smaller thumbnail than the listing's default.
                    cover_url = ''.join(data.xpath('.//img[@class="cover"]/@src'))
                    cover_url = re.sub(r'%2F', '/', cover_url)
                    cover_url = re.sub(r'widthMax=120&heightMax=200', 'widthMax=64&heightMax=64', cover_url)
                    title = ''.join(data.xpath('.//a[@class="title"]/text()'))
                    title = re.sub(r' - ebook$', '', title)
                    formats = ', '.join(data.xpath('.//ul[@class="formats_available"]/li//b/text()'))
                    # 'znak' (watermark) in the formats text implies no DRM.
                    DrmFree = re.search(r'znak', formats)
                    formats = re.sub(r'\ ?\(.+?\)', '', formats)

                    # The author only appears on the detail page, so fetch it.
                    author = ''
                    with closing(br.open('http://www.nexto.pl/' + id.strip(), timeout=timeout/4)) as nf:
                        idata = html.fromstring(nf.read())
                        author = ', '.join(idata.xpath('//div[@class="basic_data"]/p[1]/b/a/text()'))

                    counter -= 1

                    s = SearchResult()
                    s.cover_url = cover_url if cover_url[:4] == 'http' else 'http://www.nexto.pl' + cover_url
                    s.title = title.strip()
                    s.author = author.strip()
                    s.price = price
                    s.detail_item = id.strip()
                    s.drm = SearchResult.DRM_UNLOCKED if DrmFree else SearchResult.DRM_LOCKED
                    s.formats = formats.upper().strip()

                    yield s
                # Stop when the pager has no "next" link.
                if not doc.xpath('//div[@class="listnavigator"]//a[@class="next"]'):
                    break
            offset+=10