示例#1
0
    def run(self):
        """Refresh the cached MobileRead book list if the cache is stale.

        Skips the download when the cache is under a week old; otherwise
        fetches the HTML list, parses it into SearchResult objects and
        stores them (with a timestamp) in self.config. Aborts cleanly if
        self._run is cleared while processing.
        """
        url = 'http://www.mobileread.com/forums/ebooks.php?do=getlist&type=html'

        self.update_details.emit(_('Checking last download date.'))
        last_download = self.config.get('last_download', None)
        # Don't update the book list if our cache is less than one week old.
        if last_download and (time.time() - last_download) < 604800:
            return

        self.update_details.emit(_('Downloading book list from MobileRead.'))
        # Download the book list HTML file from MobileRead.
        br = browser()
        raw_data = None
        try:
            with closing(br.open(url, timeout=self.timeout)) as f:
                raw_data = f.read()
        except Exception:
            # Download is best-effort: on any failure keep the existing cache.
            # (Narrowed from a bare except so Ctrl-C/SystemExit still propagate.)
            return

        if not raw_data or not self._run:
            return

        self.update_details.emit(_('Processing books.'))
        # Turn books listed in the HTML file into SearchResults.
        books = []
        try:
            data = html.fromstring(raw_data)
            raw_books = data.xpath('//ul/li')
            self.total_changed.emit(len(raw_books))

            for i, book_data in enumerate(raw_books):
                self.update_details.emit(
                        _('%(num)s of %(tot)s books processed.') % dict(
                            num=i, tot=len(raw_books)))
                book = SearchResult()
                book.detail_item = ''.join(book_data.xpath('.//a/@href'))
                book.formats = ''.join(book_data.xpath('.//i/text()'))
                book.formats = book.formats.strip()

                # Entries are rendered as 'Author: Title'.
                text = ''.join(book_data.xpath('.//a/text()'))
                if ':' in text:
                    book.author, q, text = text.partition(':')
                book.author = book.author.strip()
                book.title = text.strip()
                books.append(book)

                # Discard partial results and stop if we were asked to quit.
                if not self._run:
                    books = []
                    break
                else:
                    self.update_progress.emit(i)
        except Exception:
            # Parse errors leave the cache untouched (narrowed from bare except).
            pass

        # Save the book list and its creation time.
        if books:
            # NOTE(review): seralize_books is the (misspelled) serializer
            # defined elsewhere on this class; the name must match it.
            self.config['book_list'] = self.seralize_books(books)
            self.config['last_download'] = time.time()
示例#2
0
    def search(self, query, max_results=10, timeout=60):
        """Search e-knigi.net; only Cyrillic/digit queries are accepted."""
        # check for cyrillic symbols before performing search
        uquery = unicode(query.strip(), 'utf-8')
        if not re.search(u'^[а-яА-Я\\d\\s]{2,}$', uquery):
            return

        base_url = 'http://e-knigi.net'
        url = base_url + '/virtuemart?page=shop.browse&search_category=0&search_limiter=anywhere&keyword=' + urllib2.quote(query)

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())

            # if the store finds only one product, it opens directly detail view
            for node in doc.xpath('//div[@class="prod_details"]'):
                result = SearchResult()
                result.cover_url = ''.join(node.xpath('.//div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@src')).strip()
                result.title = ''.join(node.xpath('.//div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@alt')).strip()
                result.author = ''.join(node.xpath('.//div[@class="td_bg clearfix"]/div[@class="gk_product_tab"]/div/table/tr[3]/td[2]/text()')).strip()
                result.price = ''.join(node.xpath('.//span[@class="productPrice"]/text()')).strip()
                result.detail_item = url
                result.drm = SearchResult.DRM_UNLOCKED

                yield result
                return

            # search in store results
            for node in doc.xpath('//div[@class="browseProductContainer"]'):
                if remaining <= 0:
                    break
                href = ''.join(node.xpath('.//a[1]/@href')).strip()
                if not href:
                    continue

                title = ''.join(node.xpath('.//a[@class="gk_vm_product_image"]/img/@title')).strip()
                author = ''.join(node.xpath('.//div[@style="float:left;width:90%"]/b/text()')).strip().replace('Автор: ', '')

                # Keep only rows whose title or author actually mentions the query.
                if query.lower() not in title.lower() and query.lower() not in author.lower():
                    continue

                remaining -= 1

                result = SearchResult()
                result.cover_url = ''.join(node.xpath('.//a[@class="gk_vm_product_image"]/img/@src')).strip()
                result.title = title
                result.author = author
                result.price = ''.join(node.xpath('.//span[@class="productPrice"]/text()')).strip()
                result.detail_item = base_url + href
                result.drm = SearchResult.DRM_UNLOCKED

                yield result
示例#3
0
    def search(self, query, max_results=15, timeout=60):
        """Search the OZON web service and yield SearchResult objects.

        When the query looks like a numeric OZON item ID, the item-detail
        endpoint is queried first so a direct ID lookup takes precedence.
        """
        search_url = self.shop_url + '/webservice/webservice.asmx/SearchWebService?'\
                    'searchText=%s&searchContext=ebook' % urllib2.quote(query)
        search_urls = [search_url]

        # add this as the first try if it looks like an ozon ID
        # (raw string: '\d' in a plain literal is a deprecated invalid escape)
        if re.match(r'^\d{6,9}$', query):
            ozon_detail = self.shop_url + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % query
            search_urls.insert(0, ozon_detail)

        # XPath extracting a child element's text regardless of XML namespace.
        xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'
        counter = max_results
        br = browser()

        for url in search_urls:
            with closing(br.open(url, timeout=timeout)) as f:
                raw = xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True)[0]
                doc = etree.fromstring(raw)
                for data in doc.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]'):
                    if counter <= 0:
                        break
                    counter -= 1

                    s = SearchResult()
                    s.detail_item = data.xpath(xp_template.format('ID'))
                    s.title = data.xpath(xp_template.format('Name'))
                    s.author = data.xpath(xp_template.format('Author'))
                    s.price = data.xpath(xp_template.format('Price'))
                    s.cover_url = data.xpath(xp_template.format('Picture'))
                    s.price = format_price_in_RUR(s.price)
                    yield s
示例#4
0
    def search(self, query, max_results=20, timeout=60):
        """Search publio.pl, following pagination until max_results is reached
        or there is no next page."""

        br = browser()

        counter = max_results
        page = 1
        while counter:
            with closing(
                br.open(
                    "http://www.publio.pl/e-booki,strona" + str(page) + ".html?q=" + urllib.quote(query),
                    timeout=timeout,
                )
            ) as f:
                doc = html.fromstring(f.read())
                for data in doc.xpath('//div[@class="item"]'):
                    if counter <= 0:
                        break

                    id = "".join(data.xpath('.//div[@class="img"]/a/@href'))
                    if not id:
                        continue

                    cover_url = "".join(data.xpath('.//div[@class="img"]/a/img/@data-original'))
                    title = "".join(data.xpath('.//div[@class="img"]/a/@title'))
                    # A subtitle in the description block is appended to the title.
                    title2 = "".join(data.xpath('.//div[@class="desc"]/h5//text()'))
                    if title2:
                        title = title + ". " + title2
                    # If the last detail row is labelled "Seria:" the book belongs
                    # to a series; append the series name to the title.
                    if (
                        "".join(
                            data.xpath('./div[@class="desc"]/div[@class="detailShortList"]/div[last()]/span/text()')
                        ).strip()
                        == "Seria:"
                    ):
                        series = "".join(
                            data.xpath('./div[@class="desc"]/div[@class="detailShortList"]/div[last()]/a/@title')
                        )
                        title = title + " (seria " + series + ")"
                    author = ", ".join(
                        data.xpath('./div[@class="desc"]/div[@class="detailShortList"]/div[@class="row"][1]/a/@title')
                    )
                    # Discounted price lives in <ins>; fall back to the plain text node.
                    price = "".join(data.xpath('.//div[@class="priceBox tk-museo-slab"]/ins/text()'))
                    if not price:
                        price = "".join(data.xpath('.//div[@class="priceBox tk-museo-slab"]/text()')).strip()
                    formats = ", ".join(data.xpath('.//div[@class="formats"]/a/img/@alt'))

                    counter -= 1

                    s = SearchResult()
                    s.cover_url = "http://www.publio.pl" + cover_url
                    s.title = title.strip()
                    s.author = author
                    s.price = price
                    s.detail_item = "http://www.publio.pl" + id.strip()
                    # A " DRM" suffix on a format name marks the title as protected.
                    s.drm = SearchResult.DRM_LOCKED if "DRM" in formats else SearchResult.DRM_UNLOCKED
                    s.formats = formats.replace(" DRM", "").strip()

                    yield s
                # Stop when the results page has no "next" link.
                if not doc.xpath('boolean(//a[@class="next"])'):
                    break
                page += 1
示例#5
0
    def search(self, query, max_results=10, timeout=60):
        """Search the mobile Project Gutenberg catalogue (all books are free)."""
        url = 'http://m.gutenberg.org/ebooks/search.mobile/?default_prefix=all&sort_order=title&query=' + urllib.quote_plus(query)

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for entry in doc.xpath('//ol[@class="results"]/li[@class="booklink"]'):
                if remaining <= 0:
                    break

                # Drop the ".mobile" suffix to get the canonical book link.
                book_id = ''.join(entry.xpath('./a/@href')).split('.mobile')[0]

                remaining -= 1

                result = SearchResult()
                result.cover_url = ''
                result.detail_item = book_id.strip()
                result.title = ''.join(entry.xpath('.//span[@class="title"]/text()')).strip()
                result.author = ''.join(entry.xpath('.//span[@class="subtitle"]/text()')).strip()
                result.price = '$0.00'
                result.drm = SearchResult.DRM_UNLOCKED

                yield result
示例#6
0
    def search(self, query, max_results=25, timeout=60):
        """Search ebookpoint.pl for DRM-free e-books matching *query*."""
        url = 'http://ebookpoint.pl/search?qa=&szukaj=' + quote_plus(
            query.decode('utf-8').encode('iso-8859-2')) + '&serwisyall=0&wprzyg=0&wsprzed=1&wyczerp=0&formaty=em-p'

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for entry in doc.xpath('//ul[@class="list"]/li'):
                if remaining <= 0:
                    break

                href = ''.join(entry.xpath('./a/@href'))
                if not href:
                    continue

                remaining -= 1

                price = ''.join(entry.xpath('.//p[@class="price price-incart"]/a/ins/text()|.//p[@class="price price-add"]/a/text()'))

                result = SearchResult()
                result.cover_url = ''.join(entry.xpath('.//p[@class="cover"]/img/@data-src'))
                result.title = ''.join(entry.xpath('.//div[@class="book-info"]/h3/a/text()')).strip()
                result.author = ''.join(entry.xpath('.//p[@class="author"]//text()')).strip()
                # Polish decimal separator: '.' -> ','.
                result.price = price.replace('.', ',')
                result.detail_item = href.strip()
                result.drm = SearchResult.DRM_UNLOCKED
                result.formats = ', '.join(entry.xpath('.//ul[@class="book-type book-type-points"]//span[@class="popup"]/span/text()')).upper()

                yield result
示例#7
0
def open_search(url, query, max_results=10, timeout=60):
    """Run an OpenSearch/OPDS query and yield SearchResult objects.

    *url* points at an OpenSearch description document; its best URL
    template is filled in with *query* and fetched, and each Atom entry
    in the response becomes one result.
    """
    description = Description(url)
    url_template = description.get_best_template()
    if not url_template:
        return
    oquery = Query(url_template)

    # set up initial values
    oquery.searchTerms = query
    oquery.count = max_results
    url = oquery.url()

    counter = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        doc = etree.fromstring(f.read())
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break

            counter -= 1

            s = SearchResult()

            s.detail_item = ''.join(data.xpath('./*[local-name() = "id"]/text()')).strip()

            # Map OPDS link relations onto the result. The branch order matters:
            # the specific buy/sample relations must be tested before the generic
            # 'acquisition' substring, which would otherwise match them too.
            for link in data.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                type = link.get('type')

                if rel and href and type:
                    if 'http://opds-spec.org/thumbnail' in rel:
                        s.cover_url = href
                    elif 'http://opds-spec.org/image/thumbnail' in rel:
                        s.cover_url = href
                    elif 'http://opds-spec.org/acquisition/buy' in rel:
                        s.detail_item = href
                    elif 'http://opds-spec.org/acquisition/sample' in rel:
                        pass
                    elif 'http://opds-spec.org/acquisition' in rel:
                        if type:
                            ext = guess_extension(type)
                            if ext:
                                # guess_extension gives e.g. '.epub'; store 'EPUB'.
                                ext = ext[1:].upper().strip()
                                s.downloads[ext] = href
            s.formats = ', '.join(s.downloads.keys()).strip()

            s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(data.xpath('./*[local-name() = "author"]//*[local-name() = "name"]//text()')).strip()

            # Use the first price element, prefixed with its currency code.
            price_e = data.xpath('.//*[local-name() = "price"][1]')
            if price_e:
                price_e = price_e[0]
                currency_code = price_e.get('currencycode', '')
                price = ''.join(price_e.xpath('.//text()')).strip()
                s.price = currency_code + ' ' + price
                s.price = s.price.strip()

            yield s
示例#8
0
    def search(self, query, max_results=10, timeout=60):
        """Search bubok.pt via its calibre reseller endpoint."""
        url = 'http://www.bubok.pt/resellers/calibre_search/' + urllib.quote_plus(query)

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for entry in doc.xpath('//div[contains(@class, "libro")]'):
                if remaining <= 0:
                    break

                remaining -= 1

                # Every field is plain text inside a dedicated div.
                result = SearchResult()
                result.title = ''.join(entry.xpath('.//div[@class="titulo"]/text()')).strip()
                result.author = ''.join(entry.xpath('.//div[@class="autor"]/text()')).strip()
                result.detail_item = ''.join(entry.xpath('.//div[@class="url"]/text()')).strip()
                result.price = ''.join(entry.xpath('.//div[@class="precio"]/text()')).strip()
                result.drm = SearchResult.DRM_UNLOCKED
                result.formats = ''.join(entry.xpath('.//div[@class="formatos"]/text()')).strip()
                result.cover_url = ''.join(entry.xpath('.//div[@class="portada"]/text()')).strip()
                yield result
示例#9
0
    def search(self, query, max_results=10, timeout=60):
        """Search ebook.nl's advanced-search page for matching books."""
        url = ('http://www.ebook.nl/store/advanced_search_result.php?keywords=' + urllib2.quote(query))
        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for entry in doc.xpath('//div[@id="books"]/div[@itemtype="http://schema.org/Book"]'):
                if remaining <= 0:
                    break

                detail_url = ''.join(entry.xpath('./meta[@itemprop="url"]/@content')).strip()
                if not detail_url:
                    continue
                author = ''.join(entry.xpath('./span[@itemprop="author"]/a/text()')).strip()
                # The site emits a literal '&nbsp' when there is no author.
                if author == '&nbsp':
                    author = ''
                remaining -= 1

                result = SearchResult()
                result.cover_url = 'http://www.ebook.nl/store/' + ''.join(entry.xpath('.//img[@itemprop="image"]/@src'))
                result.title = ''.join(entry.xpath('./span[@itemprop="name"]/a/text()')).strip()
                result.author = author.strip()
                result.price = ''.join(entry.xpath('.//span[@itemprop="price"]//text()'))
                result.drm = SearchResult.DRM_UNKNOWN
                result.detail_item = detail_url

                yield result
示例#10
0
    def search(self, query, max_results=10, timeout=60):
        """Full-text search of the eHarlequin ebook store (EPUB only)."""
        url = 'http://ebooks.eharlequin.com/BANGSearch.dll?Type=FullText&FullTextField=All&FullTextCriteria=' + urllib2.quote(query)

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for entry in doc.xpath('//table[not(.//@class="sidelink")]/tr[.//ul[@id="details"]]'):
                if remaining <= 0:
                    break

                href = ''.join(entry.xpath('.//ul[@id="details"]/li[@id="title-results"]/a/@href'))
                if not href:
                    continue

                remaining -= 1

                result = SearchResult()
                # The cover image is inside the anchor that links to the book.
                result.cover_url = ''.join(entry.xpath('.//a[@href="%s"]/img/@src' % href))
                result.title = ''.join(entry.xpath('.//ul[@id="details"]/li[@id="title-results"]/a/text()')).strip()
                result.author = ''.join(entry.xpath('.//ul[@id="details"]/li[@id="author"][1]//a/text()')).strip()
                result.price = ''.join(entry.xpath('.//div[@class="ourprice"]/font/text()')).strip()
                result.detail_item = 'http://ebooks.eharlequin.com/' + href.strip()
                result.formats = 'EPUB'

                yield result
示例#11
0
    def search(self, query, max_results=10, timeout=60):
        """Search zixo.pl and yield SearchResult objects.

        All items in this store are DRM-locked.
        """
        url = 'http://zixo.pl/wyszukiwarka/?search=' + urllib.quote(query.encode('utf-8')) + '&product_type=0'

        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//div[@class="productInline"]'):
                if counter <= 0:
                    break

                id = ''.join(data.xpath('.//a[@class="productThumb"]/@href'))
                if not id:
                    continue

                cover_url = ''.join(data.xpath('.//a[@class="productThumb"]/img/@src'))
                title = ''.join(data.xpath('.//a[@class="title"]/text()'))
                author = ','.join(data.xpath('.//div[@class="productDescription"]/span[1]/a/text()'))
                price = ''.join(data.xpath('.//div[@class="priceList"]/span/text()'))
                # Polish decimal separator; str.replace does exactly what the old
                # re.sub('\.', ',', ...) did, without the invalid string escape.
                price = price.replace('.', ',')

                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price
                s.detail_item = 'http://zixo.pl' + id.strip()
                s.drm = SearchResult.DRM_LOCKED

                yield s
示例#12
0
    def search(self, query, max_results=10, timeout=60):
        """Search the WHSmith ebook department (all titles are DRM-locked)."""
        url = ('http://www.whsmith.co.uk/search?keywordCategoryId=wc_dept_ebooks&results=60'
               '&page=1&keywords=' + urllib2.quote(query))

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for product in doc.xpath('//li[@class="product"]'):
                if remaining <= 0:
                    break
                href = ''.join(product.xpath('./a[@class="product_image_wrap"]/@href'))
                if not href:
                    continue
                remaining -= 1

                result = SearchResult()
                result.cover_url = ''.join(product.xpath('.//img[@class="product_image"]/@src'))
                result.title = ''.join(product.xpath('.//h4[@class="product_title"]/text()')).strip()
                result.author = ', '.join(product.xpath('.//span[@class="product_second"]/text()')).strip()
                result.price = ''.join(product.xpath('.//span[@class="price"]/text()'))
                result.drm = SearchResult.DRM_LOCKED
                result.detail_item = 'http://www.whsmith.co.uk' + href
                result.formats = 'ePub'

                yield result
示例#13
0
    def search(self, query, max_results=10, timeout=60):
        """Search Amazon's Kindle store and yield SearchResult objects.

        Rows that are not Kindle books (e.g. author pages) and rows without
        an ASIN are skipped, so fewer than max_results items may be yielded.
        """
        # Hand-rolled percent-encoding: non-ASCII chars become '\xNN' escapes,
        # existing '%' is escaped, then the '\x' prefixes are rewritten to '%'.
        url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+')
        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())#.decode('latin-1', 'replace'))

            # XPaths for the pieces of each result row.
            data_xpath = '//div[contains(@class, "prod")]'
            format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
            asin_xpath = '@name'
            cover_xpath = './/img[@class="productImage"]/@src'
            title_xpath = './/h3[@class="newaps"]/a//text()'
            author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]/text()'
            price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'

            for data in doc.xpath(data_xpath):
                if counter <= 0:
                    break

                # Even though we are searching digital-text only Amazon will still
                # put in results for non Kindle books (author pages). So we need
                # to explicitly check if the item is a Kindle book and ignore it
                # if it isn't.
                format_ = ''.join(data.xpath(format_xpath))
                if 'kindle' not in format_.lower():
                    continue

                # We must have an asin otherwise we can't easily reference the
                # book later.
                asin = data.xpath(asin_xpath)
                if asin:
                    asin = asin[0]
                else:
                    continue

                cover_url = ''.join(data.xpath(cover_xpath))

                title = ''.join(data.xpath(title_xpath))
                author = ''.join(data.xpath(author_xpath))
                try:
                    # Strip the localized "by"-style prefix and any trailing
                    # " (..." annotation from the author string.
                    # NOTE(review): bare except is deliberately broad here —
                    # presumably author_article may be missing on some
                    # subclasses; confirm before narrowing the exception.
                    if self.author_article:
                        author = author.split(self.author_article, 1)[1].split(" (")[0]
                except:
                    pass

                price = ''.join(data.xpath(price_xpath))

                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url.strip()
                s.title = title.strip()
                s.author = author.strip()
                s.price = price.strip()
                s.detail_item = asin.strip()
                s.drm = SearchResult.DRM_UNKNOWN
                s.formats = 'Kindle'

                yield s
示例#14
0
    def search(self, query, max_results=10, timeout=60):
        """Search bookoteka.pl's e-book category (DRM-free store)."""
        url = 'http://bookoteka.pl/list?search=' + urllib.quote_plus(query) + '&cat=1&hp=1&type=1'

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for entry in doc.xpath('//li[@class="EBOOK"]'):
                if remaining <= 0:
                    break

                href = ''.join(entry.xpath('.//a[@class="item_link"]/@href'))
                if not href:
                    continue

                remaining -= 1

                result = SearchResult()
                result.cover_url = 'http://bookoteka.pl' + ''.join(entry.xpath('.//a[@class="item_link"]/img/@src'))
                result.title = ''.join(entry.xpath('.//div[@class="shelf_title"]/a/text()')).strip()
                result.author = ''.join(entry.xpath('.//div[@class="shelf_authors"][1]/text()')).strip()
                # Polish decimal separator.
                result.price = ''.join(entry.xpath('.//span[@class="EBOOK"]/text()')).replace('.', ',')
                result.detail_item = 'http://bookoteka.pl' + href.strip()
                result.drm = SearchResult.DRM_UNLOCKED
                result.formats = ', '.join(entry.xpath('.//a[@class="fancybox protected"]/text()')).strip()

                yield result
示例#15
0
    def search(self, query, max_results=10, timeout=60):
        """Search legimi.com's Polish ebook listing."""
        url = "http://www.legimi.com/pl/ebooki/?szukaj=" + urllib.quote_plus(query)

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for entry in doc.xpath('//div[@id="listBooks"]/div'):
                if remaining <= 0:
                    break

                href = "".join(entry.xpath('.//a[@class="plainLink"]/@href'))
                if not href:
                    continue

                remaining -= 1

                result = SearchResult()
                result.cover_url = "http://www.legimi.com/" + "".join(entry.xpath(".//img[1]/@src"))
                result.title = "".join(entry.xpath('.//span[@class="bookListTitle ellipsis"]/text()')).strip()
                result.author = "".join(entry.xpath('.//span[@class="bookListAuthor ellipsis"]/text()')).strip()
                result.price = "".join(entry.xpath('.//div[@class="bookListPrice"]/span/text()'))
                result.detail_item = "http://www.legimi.com/" + href.strip()

                yield result
示例#16
0
    def search(self, query, max_results=10, timeout=60):
        """Search Google Books; entries without a usable author are skipped."""
        url = 'http://www.google.com/search?tbm=bks&q=' + urllib.quote_plus(query)

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for entry in doc.xpath('//ol[@id="rso"]/li'):
                if remaining <= 0:
                    break

                href = ''.join(entry.xpath('.//h3/a/@href'))
                if not href:
                    continue

                title = ''.join(entry.xpath('.//h3/a//text()'))
                # Google appends action links ("Preview", "Read", ...) after the
                # real author names; trim them from the end of the list.
                authors = entry.xpath('.//div[@class="f"]//a//text()')
                while authors and authors[-1].strip().lower() in ('preview', 'read', 'more editions'):
                    authors.pop()
                if not authors:
                    continue

                remaining -= 1

                result = SearchResult()
                result.title = title.strip()
                result.author = ', '.join(authors).strip()
                result.detail_item = href.strip()
                result.drm = SearchResult.DRM_UNKNOWN

                yield result
示例#17
0
    def search(self, query, max_results=10, timeout=60):
        """Search the Barnes & Noble NOOK ebook store."""
        url = 'http://www.barnesandnoble.com/s/%s?keyword=%s&store=ebook' % (query.replace(' ', '-'), urllib.quote_plus(query))

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for entry in doc.xpath('//ul[contains(@class, "result-set")]/li[contains(@class, "result")]'):
                if remaining <= 0:
                    break

                href = ''.join(entry.xpath('.//div[contains(@class, "image-bounding-box")]/a/@href'))
                if not href:
                    continue

                remaining -= 1

                result = SearchResult()
                result.cover_url = ''.join(entry.xpath('.//img[contains(@class, "product-image")]/@src'))
                result.title = ''.join(entry.xpath('.//a[@class="title"]//text()')).strip()
                result.author = ', '.join(entry.xpath('.//a[@class="contributor"]//text()')).strip()
                result.price = ''.join(entry.xpath('.//div[@class="price-format"]//span[contains(@class, "price")]/text()')).strip()
                result.detail_item = href.strip()
                result.drm = SearchResult.DRM_UNKNOWN
                result.formats = 'Nook'

                yield result
示例#18
0
    def search(self, query, max_results=15, timeout=60):
        """Query the OZON search web service and yield SearchResult objects."""
        search_urls = [
            self.shop_url + "/webservice/webservice.asmx/SearchWebService?"
            "searchText=%s&searchContext=ebook" % urllib2.quote(query)
        ]

        # XPath that extracts a child element's text regardless of namespace.
        xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'
        remaining = max_results
        br = browser()

        for url in search_urls:
            with closing(br.open(url, timeout=timeout)) as f:
                raw = xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True)[0]
                doc = etree.fromstring(raw)
                for item in doc.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]'):
                    if remaining <= 0:
                        break
                    remaining -= 1

                    result = SearchResult()
                    result.detail_item = item.xpath(xp_template.format("ID"))
                    result.title = item.xpath(xp_template.format("Name"))
                    result.author = item.xpath(xp_template.format("Author"))
                    result.price = format_price_in_RUR(item.xpath(xp_template.format("Price")))
                    result.cover_url = item.xpath(xp_template.format("Picture"))
                    yield result
示例#19
0
    def search(self, query, max_results=20, timeout=60):
        """Search escapemagazine.pl; all results are DRM-free PDFs."""
        url = 'http://www.escapemagazine.pl/wyszukiwarka?query=' + urllib.quote_plus(query)

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for entry in doc.xpath('//div[@class="item item_short"]'):
                if remaining <= 0:
                    break

                href = ''.join(entry.xpath('.//h2[@class="title"]/a[1]/@href'))
                if not href:
                    continue

                remaining -= 1

                result = SearchResult()
                result.cover_url = ''.join(entry.xpath('.//img[@class="cover"]/@src'))
                result.title = ''.join(entry.xpath('.//h2[@class="title"]/a[1]/text()')).strip()
                result.author = ''.join(entry.xpath('.//div[@class="author"]/text()')).strip()
                # The listing shows only the current price; append the currency.
                result.price = ''.join(entry.xpath('.//span[@class="price_now"]/strong/text()')) + ' zł'
                result.detail_item = 'http://www.escapemagazine.pl' + href.strip()
                result.drm = SearchResult.DRM_UNLOCKED
                result.formats = 'PDF'

                yield result
示例#20
0
    def search(self, query, max_results=10, timeout=60):
        """Search the Foyles ebook catalogue (all results are DRM-locked)."""
        url = 'http://ebooks.foyles.co.uk/catalog/search/?query=' + urllib2.quote(query)

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for entry in doc.xpath('//div[@class="doc-item"]'):
                if remaining <= 0:
                    break
                href = ''.join(entry.xpath('.//p[@class="doc-cover"]/a/@href')).strip()
                if not href:
                    continue

                remaining -= 1

                result = SearchResult()
                result.cover_url = ''.join(entry.xpath('.//p[@class="doc-cover"]/a/img/@src'))
                result.title = ''.join(entry.xpath('.//span[@class="title"]/a/text()')).strip()
                result.author = ', '.join(entry.xpath('.//span[@class="author"]/span[@class="author"]/text()')).strip()
                result.price = ''.join(entry.xpath('.//span[@itemprop="price"]/text()')).strip()
                result.detail_item = 'http://ebooks.foyles.co.uk' + href
                result.drm = SearchResult.DRM_LOCKED
                # The last format badge names the delivery format.
                result.formats = ''.join(entry.xpath('.//p[@class="doc-meta-format"]/span[last()]/text()'))

                yield result
示例#21
0
    def search(self, query, max_results=10, timeout=60):
        """Search bewrite.net and yield up to max_results DRM-free results.

        The store lists results as table rows; the first row is a header
        and is skipped by the XPath position filter.
        """
        url = 'http://www.bewrite.net/mm5/merchant.mvc?Search_Code=B&Screen=SRCH&Search=' + urllib2.quote(query)

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as response:
            doc = html.fromstring(response.read())
            for row in doc.xpath('//div[@id="content"]//table/tr[position() > 1]'):
                if remaining <= 0:
                    break

                detail_href = ''.join(row.xpath('.//a/@href'))
                if not detail_href:
                    continue

                # The second cell holds "<title> by <author>".
                heading = ''.join(row.xpath('./td[2]//text()'))
                title, _sep, author = heading.partition('by ')

                remaining -= 1

                result = SearchResult()
                # This store exposes neither covers nor prices.
                result.cover_url = ''
                result.title = title.strip()
                result.author = author.strip()
                result.price = ''
                result.detail_item = detail_href.strip()
                result.drm = SearchResult.DRM_UNLOCKED

                yield result
示例#22
0
    def search(self, query, max_results=10, timeout=60):
        """Search the Mills & Boon UK store and yield up to max_results items.

        All results are yielded as DRM-locked since the store sells
        protected e-books.
        """
        base_url = 'https://www.millsandboon.co.uk'
        # BUG FIX: the path used to read 'search.aspx??format=ebook...'; the
        # doubled '?' made the first parameter name '?format', which the
        # server ignores, silently dropping the ebook-only filter.
        url = base_url + '/search.aspx?format=ebook&searchText=' + urllib2.quote(query)
        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//article[contains(@class, "group")]'):
                if counter <= 0:
                    break
                id_ = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/@href')).strip()
                if not id_:
                    continue

                cover_url = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/img/@src'))
                # The cover image's alt text doubles as the book title.
                title = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/img/@alt')).strip()
                author = ''.join(data.xpath('.//a[@class="author"]/text()'))
                price = ''.join(data.xpath('.//div[@class="type-wrapper"]/ul/li[child::span[text()="eBook"]]/a/text()'))
                # NOTE(review): this selector appears copied from the Foyles
                # plugin ("doc-meta-format"); verify it matches this site's
                # markup — it may always yield ''.
                format_ = ''.join(data.xpath('.//p[@class="doc-meta-format"]/span[last()]/text()'))

                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price
                s.detail_item = id_
                s.drm = SearchResult.DRM_LOCKED
                s.formats = format_

                yield s
示例#23
0
    def search(self, query, max_results=10, timeout=60):
        """Search beam-shop.de and yield up to max_results DRM-free results.

        ROBUSTNESS FIX: the original indexed ``xpath(...)[0]`` blindly for
        author/title/price, raising IndexError and aborting the whole
        search whenever a product box was missing one of those fields.
        Missing fields now degrade to '' like the sibling store plugins.
        """
        url = 'https://www.beam-shop.de/search?saltFieldLimitation=all&sSearch=' + urllib2.quote(query)
        br = browser()

        def _first_text(node, xp):
            # First stripped text match for xp, or '' when there is none.
            matches = node.xpath(xp)
            return matches[0].strip() if matches else ''

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//div[contains(@class, "product--box")]'):
                if counter <= 0:
                    break

                id_ = ''.join(data.xpath('./div/div[contains(@class, "product--info")]/a/@href')).strip()
                if not id_:
                    continue
                cover_url = ''.join(data.xpath('./div/div[contains(@class, "product--info")]/a//img/@srcset'))
                if cover_url:
                    # srcset lists several candidate images; take the first URL.
                    cover_url = cover_url.split(',')[0].strip()
                author = _first_text(data, './/a[@class="product--author"]/text()')
                title = _first_text(data, './/a[@class="product--title"]/text()')
                price = _first_text(data, './/div[@class="product--price"]/span/text()')
                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url
                s.title = title
                s.author = author
                s.price = price
                s.drm = SearchResult.DRM_UNLOCKED
                s.detail_item = id_
                yield s
示例#24
0
    def search(self, query, max_results=10, timeout=60):
        """Search woblink.com and yield SearchResult objects.

        A book available as DRM-free MOBI is yielded as its own result
        first; one more result then covers the remaining (DRM-locked)
        formats, so a single book may produce two yields.

        FIX: ``re.sub('\\.', ',', price)`` used the invalid escape
        sequence ``\\.`` in a non-raw string (a warning/error on modern
        Python); the equivalent ``str.replace`` is used instead.
        """
        url = 'http://woblink.com/publication?query=' + urllib.quote_plus(query.encode('utf-8'))
        # The site pages in steps of 10; request a larger page when the
        # caller asked for more results.
        if max_results > 10:
            if max_results > 20:
                url += '&limit=30'
            else:
                url += '&limit=20'

        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//div[@class="book-item"]'):
                if counter <= 0:
                    break

                book_href = ''.join(data.xpath('.//td[@class="w10 va-t"]/a[1]/@href'))
                if not book_href:
                    continue

                cover_url = ''.join(data.xpath('.//td[@class="w10 va-t"]/a[1]/img/@src'))
                title = ''.join(data.xpath('.//h2[@class="title"]/a[1]/text()'))
                author = ', '.join(data.xpath('.//p[@class="author"]/a/text()'))
                price = ''.join(data.xpath('.//div[@class="prices"]/span[1]/span/text()'))
                # Polish decimal comma.
                price = price.replace('.', ',')
                # Format-icon paths; the slice strips a fixed path prefix and
                # the file extension — assumes names like '<8 chars><FMT>_*.png';
                # TODO confirm against live markup.
                formats = [form[8:-4].split('_')[0] for form in data.xpath('.//p[3]/img/@src')]

                s = SearchResult()
                s.cover_url = 'http://woblink.com' + cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price + ' zł'
                s.detail_item = book_href.strip()

                # MOBI should be sent first,
                if 'MOBI' in formats:
                    t = copy.copy(s)
                    t.title += ' MOBI'
                    t.drm = SearchResult.DRM_UNLOCKED
                    t.formats = 'MOBI'
                    formats.remove('MOBI')

                    counter -= 1
                    yield t

                # and the remaining formats (if any) next
                if formats:
                    if 'epub' in formats:
                        formats.remove('epub')
                        formats.append('WOBLINK')
                        # EPUB is only advertised when the E Ink badge is shown.
                        if 'E Ink' in data.xpath('.//div[@class="prices"]/img/@title'):
                            formats.insert(0, 'EPUB')

                    s.drm = SearchResult.DRM_LOCKED
                    s.formats = ', '.join(formats).upper()

                    counter -= 1
                    yield s
示例#25
0
def search(query, max_results=10, timeout=60, write_raw_to=None):
    """Search Project Gutenberg's mobile OPDS catalogue.

    Yields SearchResult objects with per-format direct download links in
    ``s.downloads`` and, when the feed inlines one, cover bytes in
    ``s.cover_data``.

    query -- search terms (passed through quote_plus).
    max_results -- maximum number of entries to yield.
    timeout -- socket timeout in seconds for the catalogue fetch; each
        per-book detail fetch uses a quarter of it.
    write_raw_to -- optional file path; when set, the raw feed is saved
        there for debugging.
    """
    url = 'http://m.gutenberg.org/ebooks/search.opds/?query=' + quote_plus(query)

    counter = max_results
    br = browser(user_agent='calibre/'+__version__)
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
        if write_raw_to is not None:
            with open(write_raw_to, 'wb') as f:
                f.write(raw)
        doc = etree.fromstring(raw)
        # OPDS is Atom XML; local-name() matches entries regardless of
        # namespace prefix.
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break

            counter -= 1

            s = SearchResult()

            # We could use the <link rel="alternate" type="text/html" ...> tag from the
            # detail odps page but this is easier.
            # The Atom <id> is itself a URL; its digits are the Gutenberg book id.
            id = fix_url(''.join(data.xpath('./*[local-name() = "id"]/text()')).strip())
            s.detail_item = url_slash_cleaner('%s/ebooks/%s' % (web_url, re.sub(r'[^\d]', '', id)))
            s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
            # NOTE(review): authors are read from the <content> element here —
            # confirm against a live feed; Atom normally uses <author>.
            s.author = ', '.join(data.xpath('./*[local-name() = "content"]//text()')).strip()
            if not s.title or not s.author:
                continue

            # Get the formats and direct download links.
            with closing(br.open(id, timeout=timeout/4)) as nf:
                ndoc = etree.fromstring(nf.read())
                for link in ndoc.xpath('//*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'):
                    type = link.get('type')
                    href = link.get('href')
                    if type:
                        # Map the MIME type to an upper-cased extension key
                        # (e.g. 'application/epub+zip' -> 'EPUB').
                        ext = mimetypes.guess_extension(type)
                        if ext:
                            ext = ext[1:].upper().strip()
                            s.downloads[ext] = fix_url(href)

            s.formats = ', '.join(s.downloads.keys())
            if not s.formats:
                # Nothing downloadable -> skip the entry entirely.
                continue

            # Pull the cover thumbnail when it is inlined as a base64 data: URI.
            for link in data.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                type = link.get('type')

                if rel and href and type:
                    href = fix_url(href)
                    if rel in ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail'):
                        if href.startswith('data:image/png;base64,'):
                            cdata = href.replace('data:image/png;base64,', '')
                            if not isinstance(cdata, bytes):
                                cdata = cdata.encode('ascii')
                            s.cover_data = base64.b64decode(cdata)

            yield s
示例#26
0
 def search(self, query, max_results=10, timeout=60):
     """Yield a single placeholder result announcing the store's closure.

     This store plugin is defunct; no network request is made and the
     query is ignored.
     """
     notice = SearchResult()
     notice.title = 'Amazon required that this<br>store be permanently closed.'
     notice.author = ''
     notice.price = ''
     notice.detail_item = ''
     notice.drm = SearchResult.DRM_UNKNOWN
     yield notice
示例#27
0
    def search(self, query, max_results=100, timeout=180):
        """Search www.e-knjiga.si (free Slovenian e-books).

        Yields SearchResult objects with direct download links collected
        in ``s.downloads``; everything in this catalogue is free and
        DRM-free, so the price is hard-coded.
        """
        url = 'http://www.e-knjiga.si/rezultati_cover.php?query=' + urllib2.quote(query)

        print("will search for: " + urllib2.quote(query) + ":\n  " + url)

        br = browser()

        # counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:

            html=etree.HTML(f.read())

            #get list of books
            # NOTE(review): the './/tr/[0]' style paths used below are not
            # standard ElementPath/XPath syntax — verify these find() calls
            # actually match anything under lxml before relying on them.
            for book in html.xpath("//table[@class='zebra']"):
                print(etree.tostring(book, pretty_print=True, method="html"))

                author = book.find('.//tr/[0]/td/[1]').text
                title = book.find('.//tr/[0]/td/[2]/a').text
                details = 'http://www.e-knjiga.si/' + book.find('.//tr/[0]/td/[2]/a').get("href")

                ## get details
                # Fetch the per-book detail page for cover, blurb and files.
                fo =  urllib2.urlopen(details)
                det=etree.HTML(fo.read())
                fo.close()

                table=det.find(".//div[@id='center_container']").find('./table')
                cover='http://www.e-knjiga.si/' + table.find('.//tr/[1]/td/[1]/div/img').get("src")
                description=table.find(".//tr/[6]/td[@class='knjige_spremna']").text

                links=[]
                files=table.find('.//tr/[7]/td/[1]')
                for file in files.iter('a'):
                    links.append("http://www.e-knjiga.si/"+file.get("href"))


                #print("Author:    " + author)
                #print("Title:     " + title)
                #print("Details:   " + details)
                #print("Description:  " + description)
                #print("Cover:        " + cover)
                #print("Files:       ")
                #print('\n              '.join(links))

                s = SearchResult()
                s.title = title
                s.author = author
                s.price = "0.00eur"
                s.drm = SearchResult.DRM_UNLOCKED
                # NOTE(review): detail_item is set to the description text
                # rather than the `details` URL — looks like a bug; confirm
                # what callers expect here.
                s.detail_item = description

                for f in links:
                    # Key each download by its file extension.
                    ftype = f.split(".")[-1]
                    s.downloads[ftype] = f
                    # NOTE(review): assumes SearchResult.formats starts as '' —
                    # extensions are concatenated with no separator.
                    s.formats += ftype

                s.cover_url = cover

                yield s
示例#28
0
    def search(self, query, max_results=10, timeout=60):
        """Search the Sony Reader store and yield up to max_results items."""
        url = "http://ebookstore.sony.com/search?keyword=%s" % urllib.quote_plus(query)

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as response:
            doc = html.fromstring(response.read())
            hreview_xpath = ('//div[contains(@class, "searchResult")]/'
                             'descendant::li[contains(@class, "hreview")]')
            for item in doc.xpath(hreview_xpath):
                if remaining <= 0:
                    break

                currency = "".join(item.xpath(
                    'descendant::div[@class="pricing"]/descendant::*[@class="currency"]/@title')).strip()
                amount = "".join(item.xpath(
                    'descendant::div[@class="pricing"]/descendant::*[@class="amount"]/text()')).strip()
                s = SearchResult()
                # Only show a price when both currency and amount are present.
                if currency and amount:
                    s.price = currency + " " + amount
                else:
                    s.price = _("Not Available")
                heading = item.xpath('descendant::h3[@class="item"]')
                if not heading:
                    continue
                title = etree.tostring(heading[0], method="text", encoding=unicode)
                if not title:
                    continue
                s.title = title.strip()
                s.author = "".join(item.xpath(
                    'descendant::li[contains(@class, "author")]/a[@class="fn"]/text()')).strip()
                if not s.author:
                    continue
                detail_url = "".join(item.xpath(
                    'descendant::h3[@class="item"]/descendant::a[@class="fn" and @href]/@href'))
                if not detail_url:
                    continue
                if detail_url.startswith("/"):
                    detail_url = "http:" + detail_url
                s.detail_item = detail_url

                remaining -= 1

                cover_url = "".join(item.xpath(
                    'descendant::li[@class="coverart"]/descendant::img[@src]/@src'))
                if cover_url:
                    # Normalise protocol-relative and site-relative image URLs.
                    if cover_url.startswith("//"):
                        cover_url = "http:" + cover_url
                    elif cover_url.startswith("/"):
                        cover_url = "http://ebookstore.sony.com" + cover_url
                    s.cover_url = url_slash_cleaner(cover_url)

                s.drm = SearchResult.DRM_UNKNOWN
                s.formats = "Sony"

                yield s
示例#29
0
def search_kobo(query, max_results=10, timeout=60, write_html_to=None):
    """Search kobobooks.com and yield complete SearchResult objects.

    BUG FIX: ``yield s`` was indented outside the
    ``if title and authors and url:`` guard, so an item missing any of
    those fields re-yielded the previous iteration's result — or raised
    NameError when the very first item was incomplete. The yield now
    happens only when a result was actually built (matching the newer
    version of this function elsewhere in the file).

    write_html_to -- optional path; when set, the raw page is dumped
        there for debugging.
    """
    from css_selectors import Select
    url = 'http://www.kobobooks.com/search/search.html?q=' + urllib.quote_plus(query)

    br = browser()

    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
        if write_html_to is not None:
            with open(write_html_to, 'wb') as f:
                f.write(raw)
        doc = html.fromstring(raw)
        select = Select(doc)
        for i, item in enumerate(select('.result-items .item-wrapper.book')):
            if i == max_results:
                break
            for img in select('.item-image img[src]', item):
                cover_url = img.get('src')
                # Protocol-relative image URLs -> absolute.
                if cover_url.startswith('//'):
                    cover_url = 'http:' + cover_url
                break
            else:
                cover_url = None

            for p in select('p.title', item):
                title = etree.tostring(p, method='text', encoding=unicode).strip()
                for a in select('a[href]', p):
                    url = 'http://store.kobobooks.com' + a.get('href')
                    break
                else:
                    url = None
                break
            else:
                title = None

            authors = []
            for a in select('p.author a.contributor', item):
                authors.append(etree.tostring(a, method='text', encoding=unicode).strip())
            authors = authors_to_string(authors)

            for p in select('p.price', item):
                price = etree.tostring(p, method='text', encoding=unicode).strip()
                break
            else:
                price = None

            # Only yield when all essential fields were found.
            if title and authors and url:
                s = SearchResult()
                s.cover_url = cover_url
                s.title = title
                s.author = authors
                s.price = price
                s.detail_item = url
                s.formats = 'EPUB'
                s.drm = SearchResult.DRM_UNKNOWN

                yield s
示例#30
0
    def search(self, query, max_results=10, timeout=60):
        """Search woblink.com's e-book catalogue.

        Yields up to max_results SearchResult objects; DRM status and the
        formats string are derived from the per-book format icons.

        FIX: ``re.sub('\\.', ',', price)`` used the invalid escape
        sequence ``\\.`` in a non-raw string (a warning/error on modern
        Python); the equivalent ``str.replace`` is used instead.
        """
        url = 'http://woblink.com/katalog-e-book?query=' + urllib.quote_plus(query.encode('utf-8'))
        # The site pages in steps of 10; request a larger page when more
        # results were asked for.
        if max_results > 10:
            if max_results > 20:
                url += '&limit=30'
            else:
                url += '&limit=20'

        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//div[@class="book-item backgroundmix"]'):
                if counter <= 0:
                    break

                book_href = ''.join(data.xpath('.//td[@class="w10 va-t mYHaveItYes"]/a[1]/@href'))
                if not book_href:
                    continue

                cover_url = ''.join(data.xpath('.//td[@class="w10 va-t mYHaveItYes"]/a[1]/img/@src'))
                title = ''.join(data.xpath('.//h2[@class="title"]/a[1]/text()'))
                author = ', '.join(data.xpath('.//td[@class="va-t"]/h3/a/text()'))
                price = ''.join(data.xpath('.//div[@class="prices"]/span[1]/strong/span/text()'))
                # Polish decimal comma.
                price = price.replace('.', ',')
                # Format-icon paths; the slice strips a fixed path prefix and
                # the extension — assumes names like '<8 chars><fmt>.png';
                # TODO confirm against live markup.
                formats = [form[8:-4].split('.')[0] for form in data.xpath('.//p[3]/img/@src')]

                s = SearchResult()
                s.cover_url = 'http://woblink.com' + cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price + ' zł'
                s.detail_item = book_href.strip()

                if 'epub_drm' in formats:
                    s.drm = SearchResult.DRM_LOCKED
                    s.formats = 'EPUB'

                    counter -= 1
                    yield s
                elif 'pdf' in formats:
                    s.drm = SearchResult.DRM_LOCKED
                    s.formats = 'PDF'

                    counter -= 1
                    yield s
                else:
                    s.drm = SearchResult.DRM_UNLOCKED
                    # 'MOBI_nieb' is the site's icon name for plain MOBI.
                    if 'MOBI_nieb' in formats:
                        formats.remove('MOBI_nieb')
                        formats.append('MOBI')
                    s.formats = ', '.join(formats).upper()

                    counter -= 1
                    yield s
示例#31
0
    def search(self, query, max_results=10, timeout=60):
        """Search libri.de and yield up to max_results SearchResult objects.

        Format availability (ePub/PDF/MOBI) is detected from the format
        badges in each result's description container.
        """
        url = ('http://www.libri.de/shop/action/quickSearch?facetNodeId=6'
               '&mainsearchSubmit=Los!&searchString=' + urllib2.quote(query))
        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as response:
            doc = html.fromstring(response.read())
            for item in doc.xpath('//div[contains(@class, "item")]'):
                if remaining <= 0:
                    break

                containers = item.xpath('./div[@class="beschreibungContainer"]')
                if not containers:
                    continue
                info = containers[0]
                book_id = ''.join(info.xpath('./div[@class="text"]/a/@name')).strip()
                if not book_id:
                    continue
                cover_url = ''.join(info.xpath('.//div[@class="coverImg"]/a/img/@src'))
                title = ''.join(info.xpath('./div[@class="text"]/span[@class="titel"]/a/text()')).strip()
                author = ''.join(info.xpath('./div[@class="text"]/span[@class="author"]/text()')).strip()
                has_pdf = info.xpath('boolean(.//span[@class="format" and contains(text(), "pdf")]/text())')
                has_epub = info.xpath('boolean(.//span[@class="format" and contains(text(), "epub")]/text())')
                has_mobi = info.xpath('boolean(.//span[@class="format" and contains(text(), "mobipocket")]/text())')
                # The price carries a '*' footnote marker; drop it.
                price = ''.join(item.xpath('.//span[@class="preis"]/text()')).replace('*', '').strip()

                remaining -= 1

                result = SearchResult()
                result.cover_url = cover_url
                result.title = title.strip()
                result.author = author.strip()
                result.price = price
                result.drm = SearchResult.DRM_UNKNOWN
                result.detail_item = book_id
                available = []
                if has_epub:
                    available.append('ePub')
                if has_pdf:
                    available.append('PDF')
                if has_mobi:
                    available.append('MOBI')
                result.formats = ', '.join(available)

                yield result
示例#32
0
def search_amazon(
    query,
    max_results=10,
    timeout=60,
    write_html_to=None,
    search_url='http://www.amazon.com/s/?url=search-alias%3Ddigital-text&field-keywords='
):
    """Search the Amazon Kindle store and yield SearchResult objects.

    Amazon serves several different result-page layouts; the per-field
    XPaths are selected below based on the results container's class.

    query -- search terms; byte-wise percent-encoded into the URL.
    max_results -- maximum number of results to yield.
    timeout -- socket timeout in seconds for the page fetch.
    write_html_to -- optional path; when set, the raw HTML is saved there
        (useful when debugging layout changes).
    search_url -- base URL, overridable e.g. for other Amazon domains.
    """
    # Manual percent-encoding: non-ASCII characters become '\\xNN' escapes
    # first, then '\\x' is rewritten to '%' to form %NN sequences.
    url = search_url + query.encode('ascii', 'backslashreplace').replace(
        '%', '%25').replace('\\x', '%').replace(' ', '+')
    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
        if write_html_to is not None:
            with open(write_html_to, 'wb') as f:
                f.write(raw)
        doc = html.fromstring(raw)
        try:
            results = doc.xpath('//div[@id="atfResults" and @class]')[0]
        except IndexError:
            # No "above the fold" results container -> nothing to yield.
            return

        # Choose field XPaths for whichever layout Amazon served.
        if 's-result-list-parent-container' in results.get('class', ''):
            # Newer "s-result-item" list layout.
            data_xpath = "descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-result-item ')]"
            format_xpath = './/a[contains(text(), "Kindle Edition")]//text()'
            asin_xpath = '@data-asin'
            cover_xpath = "descendant-or-self::img[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-image ')]/@src"
            title_xpath = "descendant-or-self::h2[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-title ')]//text()"
            author_xpath = './/span[starts-with(text(), "by ")]/following-sibling::span//text()'
            price_xpath = '(.//span[contains(@class, " s-price ")])[last()]//text()'
        elif 'grid' in results.get('class', ''):
            # Grid layout.
            data_xpath = '//div[contains(@class, "prod")]'
            format_xpath = (
                './/ul[contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
            )
            asin_xpath = '@name'
            cover_xpath = './/img[contains(@class, "productImage")]/@src'
            title_xpath = './/h3[@class="newaps"]/a//text()'
            author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
            price_xpath = (
                './/ul[contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
            )
        elif 'ilresults' in results.get('class', ''):
            # Image-list layout.
            data_xpath = '//li[(@class="ilo")]'
            format_xpath = (
                './/ul[contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
            )
            asin_xpath = '@name'
            cover_xpath = './div[@class = "ilf"]/a/img[contains(@class, "ilo")]/@src'
            title_xpath = './/h3[@class="newaps"]/a//text()'
            author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
            # Results can be in a grid (table) or a column
            price_xpath = (
                './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
            )
        elif 'list' in results.get('class', ''):
            # Plain list layout.
            data_xpath = '//div[contains(@class, "prod")]'
            format_xpath = (
                './/ul[contains(@class, "rsltL")]'
                '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
            )
            asin_xpath = '@name'
            cover_xpath = './/img[contains(@class, "productImage")]/@src'
            title_xpath = './/h3[@class="newaps"]/a//text()'
            author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
            price_xpath = (
                './/ul[contains(@class, "rsltL")]'
                '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
            )
        else:
            # Unknown layout -> give up rather than scrape garbage.
            return

        for data in doc.xpath(data_xpath):
            if counter <= 0:
                break

            # Even though we are searching digital-text only Amazon will still
            # put in results for non Kindle books (author pages). Se we need
            # to explicitly check if the item is a Kindle book and ignore it
            # if it isn't.
            format = ''.join(data.xpath(format_xpath))
            if 'kindle' not in format.lower():
                continue

            # We must have an asin otherwise we can't easily reference the
            # book later.
            asin = data.xpath(asin_xpath)
            if asin:
                asin = asin[0]
            else:
                continue

            cover_url = ''.join(data.xpath(cover_xpath))

            title = ''.join(data.xpath(title_xpath))
            author = ''.join(data.xpath(author_xpath))
            try:
                # Strip the "by " prefix and any trailing " (..." annotation.
                author = author.split('by ', 1)[1].split(" (")[0]
            except:
                pass

            price = ''.join(data.xpath(price_xpath))

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url.strip()
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = asin.strip()
            s.formats = 'Kindle'

            yield s
示例#33
0
    def search(self, query, max_results=10, timeout=60):
        """Search the Sony Reader store and yield up to max_results items."""
        url = 'http://ebookstore.sony.com/search?keyword=%s' % urllib.quote_plus(
            query)

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as response:
            doc = html.fromstring(response.read())
            hreview_xpath = ('//div[contains(@class, "searchResult")]/'
                             'descendant::li[contains(@class, "hreview")]')
            for item in doc.xpath(hreview_xpath):
                if remaining <= 0:
                    break

                currency = ''.join(item.xpath(
                    'descendant::div[@class="pricing"]/p[@class="price money"]/descendant::*[@class="currency"]/@title'
                )).strip()
                amount = ''.join(item.xpath(
                    'descendant::div[@class="pricing"]/p[@class="price money"]/descendant::*[@class="amount"]/text()'
                )).strip()
                s = SearchResult()
                # Only show a price when both currency and amount were found.
                if currency and amount:
                    s.price = currency + ' ' + amount
                else:
                    s.price = _('Not Available')
                heading = item.xpath('descendant::h3[@class="item"]')
                if not heading:
                    continue
                title = etree.tostring(heading[0], method='text', encoding=unicode)
                if not title:
                    continue
                s.title = title.strip()
                s.author = ''.join(item.xpath(
                    'descendant::li[contains(@class, "author")]/a[@class="fn"]/text()'
                )).strip()
                if not s.author:
                    continue
                detail_url = ''.join(item.xpath(
                    'descendant::h3[@class="item"]/descendant::a[@class="fn" and @href]/@href'
                ))
                if not detail_url:
                    continue
                if detail_url.startswith('/'):
                    detail_url = 'http:' + detail_url
                s.detail_item = detail_url

                remaining -= 1

                cover_url = ''.join(item.xpath(
                    'descendant::li[@class="coverart"]/descendant::img[@src]/@src'
                ))
                if cover_url:
                    # Normalise protocol-relative and site-relative image URLs.
                    if cover_url.startswith('//'):
                        cover_url = 'http:' + cover_url
                    elif cover_url.startswith('/'):
                        cover_url = 'http://ebookstore.sony.com' + cover_url
                    s.cover_url = url_slash_cleaner(cover_url)

                s.drm = SearchResult.DRM_UNKNOWN
                s.formats = 'Sony'

                yield s
示例#34
0
def search_kobo(query, max_results=10, timeout=60, write_html_to=None):
    """Search kobobooks.com and yield SearchResult objects (EPUB, DRM unknown).

    write_html_to -- optional path; when set, the raw results page is
        dumped there for debugging.
    """
    from css_selectors import Select
    url = 'https://www.kobobooks.com/search/search.html?q=' + quote_plus(query)

    br = browser()

    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
        if write_html_to is not None:
            with open(write_html_to, 'wb') as dump:
                dump.write(raw)
        doc = html.fromstring(raw)
        select = Select(doc)
        for idx, item in enumerate(select('.result-items .item-wrapper.book')):
            if idx == max_results:
                break

            cover_url = None
            for img in select('.item-image img[src]', item):
                cover_url = img.get('src')
                # Protocol-relative image URLs -> absolute.
                if cover_url.startswith('//'):
                    cover_url = 'https:' + cover_url
                break

            title = None
            url = None
            for p in select('p.title', item):
                title = etree.tostring(p, method='text',
                                       encoding='unicode').strip()
                for a in select('a[href]', p):
                    url = a.get('href')
                    break
                break

            authors = authors_to_string([
                etree.tostring(a, method='text', encoding='unicode').strip()
                for a in select('p.contributor-list a.contributor-name', item)])

            price = None
            for p in select('p.price', item):
                price = etree.tostring(p, method='text',
                                       encoding='unicode').strip()
                break

            # Only yield items that have all essential fields.
            if title and authors and url:
                s = SearchResult()
                s.cover_url = cover_url
                s.title = title
                s.author = authors
                s.price = price
                s.detail_item = url
                s.formats = 'EPUB'
                s.drm = SearchResult.DRM_UNKNOWN

                yield s
示例#35
0
def search_amazon(query,
                  max_results=10,
                  timeout=60,
                  write_html_to=None,
                  base_url=SEARCH_BASE_URL,
                  base_query=SEARCH_BASE_QUERY,
                  field_keywords='field-keywords'):
    '''
    Search the Amazon Kindle store and yield :class:`SearchResult` objects.

    :param query: free-text search terms
    :param max_results: maximum number of results to yield
    :param timeout: network timeout in seconds for the search request
    :param write_html_to: optional file path; when set, the raw results
        HTML is also written there (useful for debugging scraper breakage)
    :param base_url: search endpoint URL
    :param base_query: base query parameters, copied before modification
    :param field_keywords: name of the query parameter carrying the terms
    '''
    uquery = base_query.copy()
    uquery[field_keywords] = query

    def asbytes(x):
        # urlencode on this (Python 2 era) code path needs byte strings.
        if isinstance(x, type('')):
            x = x.encode('utf-8')
        return x

    uquery = {asbytes(k): asbytes(v) for k, v in uquery.items()}
    url = base_url + '?' + urllib.urlencode(uquery).decode('ascii')
    br = browser(user_agent=get_user_agent())

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
        if write_html_to is not None:
            with open(write_html_to, 'wb') as f:
                f.write(raw)
        doc = html.fromstring(raw)
        try:
            results = doc.xpath('//div[@id="atfResults" and @class]')[0]
        except IndexError:
            # No results container on the page; nothing to scrape.
            return

        if 's-result-list-parent-container' in results.get('class', ''):
            data_xpath = "descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-result-item ')]"
            format_xpath = './/a[@title="%s"]/@title' % KINDLE_EDITION
            asin_xpath = '@data-asin'
            cover_xpath = "descendant-or-self::img[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-image ')]/@src"
            title_xpath = "descendant-or-self::h2[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-title ')]//text()"
            author_xpath = './/span[starts-with(text(), "%s ")]/following-sibling::span//text()' % BY
            price_xpath = (
                'descendant::div[@class="a-row a-spacing-none" and'
                ' not(span[contains(@class, "kindle-unlimited")])]//span[contains(@class, "s-price")]//text()'
            )
        else:
            # Unrecognised results-page layout; bail out rather than guess.
            return

        for data in doc.xpath(data_xpath):
            if counter <= 0:
                break

            # Even though we are searching digital-text only Amazon will still
            # put in results for non Kindle books (author pages). So we need
            # to explicitly check if the item is a Kindle book and ignore it
            # if it isn't.
            format = ''.join(data.xpath(format_xpath))
            if 'kindle' not in format.lower():
                continue

            # We must have an asin otherwise we can't easily reference the
            # book later.
            asin = data.xpath(asin_xpath)
            if asin:
                asin = asin[0]
            else:
                continue

            cover_url = ''.join(data.xpath(cover_xpath))

            title = ''.join(data.xpath(title_xpath))
            author = ''.join(data.xpath(author_xpath))
            try:
                author = author.split('by ', 1)[1].split(" (")[0]
            except IndexError:
                # No 'by ' marker in the author text; keep it as scraped.
                pass

            price = ''.join(data.xpath(price_xpath))

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url.strip()
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = asin.strip()
            s.formats = 'Kindle'

            yield s
示例#36
0
    def search(self, query, max_results=10, timeout=60):
        '''
        Search the Sony Reader Store (AU) and yield SearchResult objects.

        Items missing any mandatory field (title, author, detail link) are
        skipped entirely; only items that pass all checks count towards
        max_results.

        :param query: search terms (URL-quoted before use)
        :param max_results: maximum number of results to yield
        :param timeout: network timeout in seconds
        '''
        url = self.SEARCH_URL % urllib.quote_plus(query)

        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for item in doc.xpath(
                    '//div[@id="searchresult-list"]/descendant::div[contains(@class, "doc-item")]'
            ):
                if counter <= 0:
                    break

                s = SearchResult()
                # Default shown when the item carries no price span.
                s.price = _('Not Available')
                p = ''.join(
                    item.xpath(
                        'descendant::p[@class="doc-price"]/descendant::span[@itemprop="price"]/text()'
                    )).strip()
                if p:
                    # Keep only the amount after the '$' and prefix the currency.
                    s.price = 'AUD ' + p.split('$')[-1]

                # Title is mandatory; skip items without one.
                title = item.xpath('descendant::h3[@class="doc-title"]')
                if not title:
                    continue
                title = etree.tostring(title[0],
                                       method='text',
                                       encoding=unicode)
                if not title:
                    continue
                # Append the subtitle, when present, to the title.
                st = item.xpath('descendant::p[@class="doc-subtitle"]')
                if st:
                    st = etree.tostring(st[0], method='text', encoding=unicode)
                    if st and st.strip():
                        title = title.strip() + ': ' + st
                s.title = title.strip()
                # Author is mandatory as well.
                aut = item.xpath('descendant::p[@class="doc-author"]')
                if not aut:
                    continue
                s.author = etree.tostring(aut[0],
                                          method='text',
                                          encoding=unicode).strip()
                if not s.author:
                    continue
                # Relative link to the book's detail page; mandatory.
                du = ''.join(
                    item.xpath(
                        'descendant::h3[position() = 1 and @class="doc-title"]/descendant::a[position() = 1 and @href]/@href'
                    )).strip()
                if not du:
                    continue
                detail_url = 'https://au.readerstore.sony.com' + du
                s.detail_item = detail_url

                # Count the item only after all mandatory checks passed.
                counter -= 1

                cover_url = ''.join(
                    item.xpath(
                        'descendant::p[@class="doc-cover" and position() = 1]/'
                        'descendant::img[position() = 1 and @src]/@src'))
                if cover_url:
                    s.cover_url = url_slash_cleaner(cover_url)

                s.drm = SearchResult.DRM_UNKNOWN
                s.formats = 'Sony'

                yield s
示例#37
0
    def search(self, query, max_results=10, timeout=60):
        '''
        Search woblink.com (rendered via a JS-capable browser in a forked
        worker, since the listing is built client-side) and yield
        SearchResult objects.

        :param query: search terms, UTF-8 encoded before quoting
        :param max_results: maximum number of results to yield
        :param timeout: timeout in seconds passed to the worker
        :raises Exception: when the worker fails to fetch results
        '''
        url = 'http://woblink.com/ebooki-kategorie?query=' + urllib.quote_plus(
            query.encode('utf-8'))
        # The site supports page sizes of 10, 20 and 30.
        if max_results > 10:
            if max_results > 20:
                url += '&limit=30'
            else:
                url += '&limit=20'

        counter = max_results

        try:
            results = fork_job(js_browser,
                               'get_results', (
                                   url,
                                   timeout,
                               ),
                               module_is_source_code=True)
        except WorkerError as e:
            raise Exception('Could not get results: %s' % e.orig_tb)
        doc = html.fromstring(strip_encoding_declarations(results['result']))
        for data in doc.xpath('//div[@class="nw_katalog_lista_ksiazka "]'):
            if counter <= 0:
                break

            id = ''.join(
                data.xpath(
                    './/div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/@href'
                ))
            if not id:
                continue

            cover_url = ''.join(
                data.xpath(
                    './/div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/img/@src'
                ))
            title = ''.join(
                data.xpath(
                    './/h3[@class="nw_katalog_lista_ksiazka_detale_tytul"]/a[1]/text()'
                ))
            author = ', '.join(
                data.xpath(
                    './/p[@class="nw_katalog_lista_ksiazka_detale_autor"]/a/text()'
                ))
            price = ''.join(
                data.xpath('.//div[@class="nw_opcjezakupu_cena"]/text()'))
            formats = ', '.join(
                data.xpath(
                    './/p[@class="nw_katalog_lista_ksiazka_detale_format"]/span/text()'
                ))

            s = SearchResult()
            s.cover_url = 'http://woblink.com' + cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price + ' zł'
            s.detail_item = id.strip()
            s.formats = formats

            # The two DRM branches previously duplicated the decrement and
            # the yield; only the flag differs, so set it and fall through.
            if 'DRM' in formats:
                s.drm = SearchResult.DRM_LOCKED
            else:
                s.drm = SearchResult.DRM_UNLOCKED

            counter -= 1
            yield s
示例#38
0
    def search(self, query, max_results=10, timeout=60):
        '''
        Search Amazon for Kindle editions and yield SearchResult objects.

        Amazon serves several different result-page layouts (grid, image
        list, plain list); the XPath expressions used to pick results
        apart are selected per layout below.

        :param query: search terms
        :param max_results: maximum number of results to yield
        :param timeout: network timeout in seconds
        '''
        # Encode non-ASCII characters as %XX escapes for the URL.
        url = self.search_url + query.encode(
            'ascii', 'backslashreplace').replace('%', '%25').replace(
                '\\x', '%').replace(' ', '+')
        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())

            # Grid layout.
            if doc.xpath(
                    '//div[@id = "atfResults" and contains(@class, "grid")]'):
                data_xpath = '//div[contains(@class, "prod")]'
                format_xpath = (
                    './/ul[contains(@class, "rsltGridList")]'
                    '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
                )
                asin_xpath = '@name'
                cover_xpath = './/img[contains(@class, "productImage")]/@src'
                title_xpath = './/h3[@class="newaps"]/a//text()'
                author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
                price_xpath = (
                    './/ul[contains(@class, "rsltGridList")]'
                    '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
                )
            # Image-list ("ilresults") layout.
            elif doc.xpath(
                    '//div[@id = "atfResults" and contains(@class, "ilresults")]'
            ):
                data_xpath = '//li[(@class="ilo")]'
                format_xpath = (
                    './/ul[contains(@class, "rsltGridList")]'
                    '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
                )
                asin_xpath = '@name'
                cover_xpath = './div[@class = "ilf"]/a/img[contains(@class, "ilo")]/@src'
                title_xpath = './/h3[@class="newaps"]/a//text()'
                author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
                # Results can be in a grid (table) or a column
                price_xpath = (
                    './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
                    '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
                )
            # Plain list layout.
            elif doc.xpath(
                    '//div[@id = "atfResults" and contains(@class, "list")]'):
                data_xpath = '//div[contains(@class, "prod")]'
                format_xpath = (
                    './/ul[contains(@class, "rsltL")]'
                    '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
                )
                asin_xpath = '@name'
                cover_xpath = './/img[contains(@class, "productImage")]/@src'
                title_xpath = './/h3[@class="newaps"]/a//text()'
                author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
                price_xpath = (
                    './/ul[contains(@class, "rsltL")]'
                    '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
                )
            else:
                # Unrecognised results-page layout; nothing we can scrape.
                return

            for data in doc.xpath(data_xpath):
                if counter <= 0:
                    break

                # Even though we are searching digital-text only Amazon will still
                # put in results for non Kindle books (author pages). So we need
                # to explicitly check if the item is a Kindle book and ignore it
                # if it isn't.
                format = ''.join(data.xpath(format_xpath))
                if 'kindle' not in format.lower():
                    continue

                # We must have an asin otherwise we can't easily reference the
                # book later.
                asin = data.xpath(asin_xpath)
                if asin:
                    asin = asin[0]
                else:
                    continue

                cover_url = ''.join(data.xpath(cover_xpath))

                title = ''.join(data.xpath(title_xpath))
                author = ''.join(data.xpath(author_xpath))
                try:
                    author = author.split('by ', 1)[1].split(" (")[0]
                except:
                    pass

                price = ''.join(data.xpath(price_xpath))

                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url.strip()
                s.title = title.strip()
                s.author = author.strip()
                s.price = price.strip()
                s.detail_item = asin.strip()
                s.formats = 'Kindle'

                yield s
示例#39
0
    def search(self, query, max_results=10, timeout=60):
        '''
        Search diesel-ebooks.com and yield SearchResult objects.

        For sufficiently specific queries the site redirects straight to a
        single book's page instead of a result listing; both page layouts
        are handled below.

        :param query: search terms (URL-quoted before use)
        :param max_results: maximum number of results to yield
        :param timeout: network timeout in seconds
        '''
        url = 'http://www.diesel-ebooks.com/index.php?page=seek&id[m]=&id[c]=scope%253Dinventory&id[q]=' + urllib.quote_plus(query)

        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            book_url = f.geturl()
            doc = html.fromstring(f.read())

            if doc.xpath('not(boolean(//select[contains(@id, "selection")]))'):
                # This is the page for an individual book
                id = ''.join(doc.xpath('//div[@class="price_fat"]//a/@href'))
                mo = re.search('(?<=id=).+?(?=&)', id)
                if not mo:
                    # No book id in the link: nothing usable on this page.
                    # (The previous code yielded None here and then crashed
                    # with AttributeError on mo.group() when resumed.)
                    return
                id = mo.group()

                cover_url = ''.join(doc.xpath('//div[@class="cover"]/a/@href'))

                title = ''.join(doc.xpath('//div[@class="desc_fat"]//h1/text()'))
                author = ''.join(doc.xpath('//div[@class="desc_fat"]//span[@itemprop="author"]/text()'))
                price = ''.join(doc.xpath('//div[@class="price_fat"]//h1/text()'))

                # Everything after the 'Format:' marker is the format list.
                formats = ', '.join(doc.xpath('//div[@class="desc_fat"]//p[contains(text(), "Format")]/text()'))
                a, b, formats = formats.partition('Format:')

                drm = SearchResult.DRM_LOCKED
                if 'drm free' in formats.lower():
                    drm = SearchResult.DRM_UNLOCKED

                s = SearchResult()
                s.cover_url = cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price.strip()
                s.detail_item = book_url
                s.formats = formats
                s.drm = drm

                yield s
            else:
                # Regular search-results listing.
                for data in doc.xpath('//div[contains(@class, "item")]'):
                    if counter <= 0:
                        break

                    id = ''.join(data.xpath('div[@class="cover"]/a/@href'))
                    if not id or '/item/' not in id:
                        continue

                    cover_url = ''.join(data.xpath('div[@class="cover"]//img/@src'))

                    title = ''.join(data.xpath('.//div[@class="content"]//h2/a/text()'))
                    author = ''.join(data.xpath('.//div[@class="content"]/span//a/text()'))
                    price = ''
                    price_elem = data.xpath('.//div[@class="price_fat"]//h1/text()')
                    if price_elem:
                        price = price_elem[0]

                    formats = ', '.join(data.xpath('.//div[@class="book-info"]//text()')).strip()
                    a, b, formats = formats.partition('Format:')
                    drm = SearchResult.DRM_LOCKED
                    if 'drm free' in formats.lower():
                        drm = SearchResult.DRM_UNLOCKED

                    counter -= 1

                    s = SearchResult()
                    s.cover_url = cover_url
                    s.title = title.strip()
                    s.author = author.strip()
                    s.price = price.strip()
                    s.detail_item = id.strip()
                    s.formats = formats
                    s.drm = drm

                    yield s
示例#40
0
    def search(self, query, max_results=10, timeout=60):
        '''
        Search an OPDS catalogue through its OpenSearch description and
        yield SearchResult objects built from the returned Atom entries.

        Requires the store subclass to define ``open_search_url``; yields
        nothing otherwise.

        :param query: search terms
        :param max_results: maximum number of results to yield
        :param timeout: network timeout in seconds
        '''
        if not hasattr(self, 'open_search_url'):
            return

        description = Description(self.open_search_url)
        url_template = description.get_best_template()
        if not url_template:
            return
        oquery = Query(url_template)

        # set up initial values
        oquery.searchTerms = query
        oquery.count = max_results
        url = oquery.url()

        counter = max_results
        br = browser()
        with closing(br.open(url, timeout=timeout)) as f:
            doc = etree.fromstring(f.read())
            for data in doc.xpath('//*[local-name() = "entry"]'):
                if counter <= 0:
                    break

                counter -= 1

                s = SearchResult()

                s.detail_item = ''.join(data.xpath('./*[local-name() = "id"]/text()')).strip()

                for link in data.xpath('./*[local-name() = "link"]'):
                    rel = link.get('rel')
                    href = link.get('href')
                    mimetype = link.get('type')  # renamed: don't shadow builtin 'type'

                    # All three attributes are needed to interpret a link.
                    if rel and href and mimetype:
                        if 'http://opds-spec.org/thumbnail' in rel:
                            s.cover_url = href
                        elif 'http://opds-spec.org/image/thumbnail' in rel:
                            s.cover_url = href
                        elif 'http://opds-spec.org/acquisition/buy' in rel:
                            s.detail_item = href
                        elif 'http://opds-spec.org/acquisition' in rel:
                            # Map MIME type to a format name, e.g.
                            # application/epub+zip -> EPUB.  (The previous
                            # inner 'if type:' was redundant: the outer
                            # guard already ensures it is truthy.)
                            ext = mimetypes.guess_extension(mimetype)
                            if ext:
                                ext = ext[1:].upper().strip()
                                s.downloads[ext] = href
                s.formats = ', '.join(s.downloads.keys()).strip()

                s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
                s.author = ', '.join(data.xpath('./*[local-name() = "author"]//*[local-name() = "name"]//text()')).strip()

                price_e = data.xpath('.//*[local-name() = "price"][1]')
                if price_e:
                    price_e = price_e[0]
                    currency_code = price_e.get('currencycode', '')
                    price = ''.join(price_e.xpath('.//text()')).strip()
                    s.price = (currency_code + ' ' + price).strip()

                yield s
示例#41
0
def search_amazon(self, query, max_results=10, timeout=60, write_html_to=None):
    '''
    Yield up to ``max_results`` SearchResult objects for Kindle editions
    matching ``query`` on the Amazon results page.

    :param query: free-text search terms
    :param max_results: maximum number of results to yield (previously
        counted but never enforced; now honoured)
    :param timeout: network timeout in seconds
    :param write_html_to: optional path; when set, the raw results HTML is
        also written there for debugging scraper breakage
    '''
    field_keywords = self.FIELD_KEYWORDS
    uquery = self.SEARCH_BASE_QUERY.copy()
    uquery[field_keywords] = query

    def asbytes(x):
        # urlencode on this code path needs byte strings.
        if isinstance(x, type('')):
            x = x.encode('utf-8')
        return x

    uquery = {asbytes(k): asbytes(v) for k, v in uquery.items()}
    url = self.SEARCH_BASE_URL + '?' + urlencode(uquery)

    counter = max_results
    raw = read_url(self.scraper_storage, url, timeout=timeout)
    if write_html_to is not None:
        with open(write_html_to, 'w') as f:
            f.write(raw)
    doc = html.fromstring(raw)
    for result in doc.xpath(
            '//div[contains(@class, "s-result-list")]//div[@data-index and @data-asin]'
    ):
        # Enforce max_results; counter was decremented but never checked
        # before, so every result on the page was yielded.
        if counter <= 0:
            break
        kformat = ''.join(
            result.xpath('.//a[contains(text(), "{}")]//text()'.format(
                self.KINDLE_EDITION)))
        # Even though we are searching digital-text only Amazon will still
        # put in results for non Kindle books (author pages). So we need
        # to explicitly check if the item is a Kindle book and ignore it
        # if it isn't.
        if 'kindle' not in kformat.lower():
            continue
        asin = result.get('data-asin')
        if not asin:
            continue

        cover_url = ''.join(result.xpath('.//img/@src'))
        title = etree.tostring(result.xpath('.//h2')[0],
                               method='text',
                               encoding='unicode')
        adiv = result.xpath('.//div[contains(@class, "a-color-secondary")]')[0]
        aparts = etree.tostring(adiv, method='text',
                                encoding='unicode').split()
        # The author follows the localized "by" marker. If Amazon changes
        # the markup and the marker is missing, fall back to an empty
        # author instead of killing the whole search with a ValueError.
        try:
            idx = aparts.index(self.BY)
        except ValueError:
            author = ''
        else:
            author = ' '.join(aparts[idx + 1:]).split('|')[0].strip()
        price = ''
        for span in result.xpath(
                './/span[contains(@class, "a-price")]/span[contains(@class, "a-offscreen")]'
        ):
            q = ''.join(span.xpath('./text()'))
            if q:
                price = q
                break

        counter -= 1

        s = SearchResult()
        s.cover_url = cover_url.strip()
        s.title = title.strip()
        s.author = author.strip()
        s.detail_item = asin.strip()
        s.price = price.strip()
        s.formats = 'Kindle'

        yield s
示例#42
0
    def search(self, query, max_results=10, timeout=60):
        '''
        Search nexto.pl (scid=1015 section) and yield SearchResult objects,
        following pagination via the '&_offset=' parameter, 10 at a time.

        Note: the author is only available on each book's detail page, so
        one extra HTTP request is made per result.

        :param query: search terms (URL-quoted before use)
        :param max_results: maximum number of results to yield
        :param timeout: network timeout in seconds
        '''
        url = 'http://www.nexto.pl/szukaj.xml?search-clause=' + urllib.quote_plus(
            query) + '&scid=1015'

        br = browser()
        offset = 0

        counter = max_results

        while counter:
            with closing(
                    br.open(url + '&_offset=' + str(offset),
                            timeout=timeout)) as f:
                doc = html.fromstring(f.read())
                for data in doc.xpath('//ul[@class="productslist"]/li'):
                    if counter <= 0:
                        break

                    id = ''.join(
                        data.xpath(
                            './/div[@class="cover_container"]/a[1]/@href'))
                    if not id:
                        continue

                    price = ''.join(
                        data.xpath('.//strong[@class="nprice"]/text()'))

                    cover_url = ''.join(
                        data.xpath('.//img[@class="cover"]/@src'))
                    cover_url = re.sub(r'%2F', '/', cover_url)
                    # Request a smaller thumbnail of the cover.
                    cover_url = re.sub(r'widthMax=120&heightMax=200',
                                       'widthMax=64&heightMax=64', cover_url)
                    title = ''.join(data.xpath('.//a[@class="title"]/text()'))
                    # Strip the ' - ebook' suffix the site appends to titles.
                    title = re.sub(r' - ebook$', '', title)
                    formats = ', '.join(
                        data.xpath(
                            './/ul[@class="formats_available"]/li//b/text()'))
                    # 'znak' in the formats list is treated as DRM-free
                    # (presumably 'znak wodny' = watermark — see drm below).
                    DrmFree = re.search(r'znak', formats)
                    # Drop parenthesised annotations from the format names.
                    formats = re.sub(r'\ ?\(.+?\)', '', formats)

                    author = ''
                    # Fetch the detail page (with a shorter timeout) just to
                    # extract the author names.
                    with closing(
                            br.open('http://www.nexto.pl/' + id.strip(),
                                    timeout=timeout / 4)) as nf:
                        idata = html.fromstring(nf.read())
                        author = ', '.join(
                            idata.xpath(
                                '//div[@class="basic_data"]/p[1]/b/a/text()'))

                    counter -= 1

                    s = SearchResult()
                    s.cover_url = cover_url if cover_url[:
                                                         4] == 'http' else 'http://www.nexto.pl' + cover_url
                    s.title = title.strip()
                    s.author = author.strip()
                    s.price = price
                    s.detail_item = id.strip()
                    s.drm = SearchResult.DRM_UNLOCKED if DrmFree else SearchResult.DRM_LOCKED
                    s.formats = formats.upper().strip()

                    yield s
                # Stop when the paginator has no 'next' link.
                if not doc.xpath(
                        '//div[@class="listnavigator"]//a[@class="next"]'):
                    break
            offset += 10
示例#43
0
def search(query, max_results=10, timeout=60):
    '''
    Query the woblink.com ajax publication endpoint and yield
    SearchResult objects for the returned ebook entries.

    :param query: search terms (quoted into the URL)
    :param max_results: maximum number of results to yield
    :param timeout: unused here; kept for interface compatibility
    '''
    url = 'https://woblink.com/publication/ajax?mode=none&query=' + quote_plus(
        query)
    # The endpoint supports page sizes of 10, 20 and 30.
    if max_results > 10:
        url += '&limit=30' if max_results > 20 else '&limit=20'
    br = browser(user_agent='CalibreCrawler/1.0')
    br.set_handle_gzip(True)
    request = Request(url,
                      headers={
                          'Content-Type':
                          'application/x-www-form-urlencoded; charset=UTF-8',
                          'X-Requested-With': 'XMLHttpRequest',
                          'Referrer': 'https://woblink.com/ebooki-kategorie',
                          'Cache-Control': 'max-age=0',
                      },
                      data=urlencode({
                          'nw_filtry_filtr_zakrescen_formularz[min]':
                          '0',
                          'nw_filtry_filtr_zakrescen_formularz[max]':
                          '350',
                      }))
    response = br.open(request)
    markup = response.read().decode('utf-8')
    # The response is an HTML fragment; wrap it so lxml gets a document.
    doc = html.fromstring('<html><body>' + markup + '</body></html>')
    remaining = max_results

    for node in doc.xpath(
            '//div[@class="nw_katalog_lista_ksiazka ebook " or @class="nw_katalog_lista_ksiazka ebook promocja"]'
    ):
        if remaining <= 0:
            break

        detail = ''.join(
            node.xpath(
                './/div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/@href'
            ))
        if not detail:
            continue

        cover = ''.join(
            node.xpath(
                './/div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/img/@src'
            ))
        book_title = ''.join(
            node.xpath(
                './/h3[@class="nw_katalog_lista_ksiazka_detale_tytul"]/a[1]/text()'
            ))
        book_author = ', '.join(
            node.xpath(
                './/p[@class="nw_katalog_lista_ksiazka_detale_autor"]/a/text()'
            ))
        book_price = ''.join(
            node.xpath('.//div[@class="nw_opcjezakupu_cena"]/span[2]/text()'))
        book_formats = ', '.join(
            node.xpath(
                './/p[@class="nw_katalog_lista_ksiazka_detale_format"]/span/text()'
            ))

        result = SearchResult()
        result.cover_url = cover
        result.title = book_title.strip()
        result.author = book_author.strip()
        result.price = book_price + ' zł'
        result.detail_item = detail.strip()
        result.formats = book_formats

        remaining -= 1
        if 'DRM' in book_formats:
            result.drm = SearchResult.DRM_LOCKED
        else:
            result.drm = SearchResult.DRM_UNLOCKED
        yield result
示例#44
0
    def search(self, query, max_results=20, timeout=60):
        '''
        Search publio.pl (EBOOK, MINIBOOK and EMAGAZINE sections) and yield
        SearchResult objects, following the paginated result pages until no
        'next' link remains.

        :param query: search terms (URL-quoted before use)
        :param max_results: maximum number of results to yield
        :param timeout: network timeout in seconds
        '''

        br = browser()

        counter = max_results
        page = 1
        while counter:
            with closing(
                    br.open(
                        'http://www.publio.pl/szukaj,strona' + str(page) +
                        '.html?q=' + urllib.quote(query) +
                        '&sections=EMAGAZINE&sections=MINIBOOK&sections=EBOOK',
                        timeout=timeout)) as f:
                doc = html.fromstring(f.read())
                for data in doc.xpath('//div[@class="item"]'):
                    if counter <= 0:
                        break

                    id = ''.join(data.xpath('.//div[@class="img"]/a/@href'))
                    if not id:
                        continue

                    cover_url = ''.join(
                        data.xpath(
                            './/div[@class="img"]/a/img/@data-original'))
                    title = ''.join(
                        data.xpath('.//div[@class="img"]/a/@title'))
                    # Append the subtitle (h5) to the title when present.
                    title2 = ''.join(
                        data.xpath('.//div[@class="desc"]/h5//text()'))
                    if title2:
                        title = title + '. ' + title2
                    # When the last detail row is labelled 'Seria:' (series),
                    # append the series name to the title too.
                    if (''.join(
                            data.xpath(
                                './div[@class="desc"]/div[@class="detailShortList"]/div[last()]/span/text()'
                            )).strip() == "Seria:"):
                        series = ''.join(
                            data.xpath(
                                './div[@class="desc"]/div[@class="detailShortList"]/div[last()]/a/@title'
                            ))
                        title = title + ' (seria ' + series + ')'
                    author = ', '.join(
                        data.xpath(
                            './div[@class="desc"]/div[@class="detailShortList"]/div[@class="row"][1]/a/@title'
                        ))
                    # Discounted price lives in an <ins> element; fall back
                    # to the bare text of the price box otherwise.
                    price = ''.join(
                        data.xpath(
                            './/div[@class="priceBox tk-museo-slab"]/ins/text()'
                        ))
                    if not price:
                        price = ''.join(
                            data.xpath(
                                './/div[@class="priceBox tk-museo-slab"]/text()'
                            )).strip()
                    formats = ', '.join([
                        x.strip() for x in data.xpath(
                            './/div[@class="formats"]/a/text()')
                    ])

                    counter -= 1

                    s = SearchResult()
                    s.cover_url = 'http://www.publio.pl' + cover_url
                    s.title = title.strip()
                    s.author = author
                    s.price = price
                    s.detail_item = 'http://www.publio.pl' + id.strip()
                    s.drm = SearchResult.DRM_LOCKED if 'DRM' in formats else SearchResult.DRM_UNLOCKED
                    s.formats = formats.replace(' DRM', '').strip()

                    yield s
                # Stop when there is no 'next' page link.
                if not doc.xpath('boolean(//a[@class="next"])'):
                    break
                page += 1
示例#45
0
    def search(self, query, max_results=10, timeout=60):
        '''
        Search the woblink.com ebook catalogue and yield SearchResult
        objects.

        :param query: search terms, UTF-8 encoded before quoting
        :param max_results: maximum number of results to yield
        :param timeout: network timeout in seconds
        '''
        url = 'http://woblink.com/katalog-e-book?query=' + urllib.quote_plus(
            query.encode('utf-8'))
        # The site supports page sizes of 10, 20 and 30.
        if max_results > 10:
            if max_results > 20:
                url += '&limit=30'
            else:
                url += '&limit=20'

        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//div[@class="nw_katalog_lista_ksiazka"]'):
                if counter <= 0:
                    break

                id = ''.join(
                    data.xpath(
                        './/div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/@href'
                    ))
                if not id:
                    continue

                cover_url = ''.join(
                    data.xpath(
                        './/div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/img/@src'
                    ))
                title = ''.join(
                    data.xpath(
                        './/h2[@class="nw_katalog_lista_ksiazka_detale_tytul"]/a[1]/text()'
                    ))
                author = ', '.join(
                    data.xpath(
                        './/h3[@class="nw_katalog_lista_ksiazka_detale_autor"]/a/text()'
                    ))
                price = ''.join(
                    data.xpath(
                        './/div[@class="nw_katalog_lista_ksiazka_opcjezakupu_cena"]/span/text()'
                    ))
                # Polish decimal separator is a comma (raw string avoids the
                # invalid '\.' escape the old pattern relied on).
                price = re.sub(r'\.', ',', price)
                formats = ', '.join(
                    data.xpath(
                        './/p[@class="nw_katalog_lista_ksiazka_detale_formaty"]/span/text()'
                    ))

                s = SearchResult()
                s.cover_url = 'http://woblink.com' + cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price + ' zł'
                s.detail_item = id.strip()
                s.formats = formats

                # The two DRM branches previously duplicated the decrement
                # and the yield; only the flag differs.
                if 'EPUB DRM' in formats:
                    s.drm = SearchResult.DRM_LOCKED
                else:
                    s.drm = SearchResult.DRM_UNLOCKED

                counter -= 1
                yield s
示例#46
0
    def search(self, query, max_results=10, timeout=60):
        '''
        Search gandalf.com.pl ebooks ('/we/' section) and yield
        SearchResult objects, paging until no more result boxes appear.

        :param query: search terms; the site expects ISO-8859-2 encoding
        :param max_results: maximum number of results to yield
        :param timeout: network timeout in seconds
        '''
        counter = max_results
        page = 1
        url = 'http://www.gandalf.com.pl/we/' + urllib.quote_plus(
            query.decode('utf-8').encode('iso8859_2')) + '/bdb'

        br = browser()

        while counter:
            with closing(
                    br.open((url + str(page - 1) + '/#s') if (page - 1) else
                            (url + '/#s'),
                            timeout=timeout)) as f:
                doc = html.fromstring(f.read())
                for data in doc.xpath('//div[@class="box"]'):
                    if counter <= 0:
                        break

                    id = ''.join(
                        data.xpath('.//div[@class="info"]/h3/a/@href'))
                    if not id:
                        continue

                    cover_url = ''.join(
                        data.xpath('.//div[@class="info"]/h3/a/@id'))
                    title = ''.join(
                        data.xpath('.//div[@class="info"]/h3/a/@title'))
                    formats = ''.join(
                        data.xpath('.//div[@class="info"]/p[1]/text()'))
                    # Formats are listed in parentheses; guard against items
                    # without them instead of raising IndexError (which
                    # previously killed the whole generator).
                    found = re.findall(r'\((.*?)\)', formats)
                    formats = found[0] if found else ''
                    author = ''.join(
                        data.xpath(
                            './/div[@class="info"]/h4/text() | .//div[@class="info"]/h4/span/text()'
                        ))
                    price = ''.join(
                        data.xpath('.//div[@class="options"]/h3/text()'))
                    price = re.sub('PLN', 'zł', price)
                    # Polish decimal separator is a comma (raw string avoids
                    # the invalid '\.' escape the old pattern relied on).
                    price = re.sub(r'\.', ',', price)
                    drm = data.xpath(
                        'boolean(.//div[@class="info" and contains(., "Zabezpieczenie: DRM")])'
                    )

                    counter -= 1

                    s = SearchResult()
                    s.cover_url = 'http://imguser.gandalf.com.pl/' + re.sub(
                        'p', 'p_', cover_url) + '.jpg'
                    s.title = title.strip()
                    s.author = author.strip()
                    s.price = price
                    s.detail_item = id.strip()
                    if drm:
                        s.drm = SearchResult.DRM_LOCKED
                    else:
                        s.drm = SearchResult.DRM_UNLOCKED
                    s.formats = formats.upper().strip()

                    yield s
                if not doc.xpath(
                        'boolean(//div[@class="wyszukiwanie_podstawowe_header"]//div[@class="box"])'
                ):
                    break
                page += 1
def search_flibusta(url, query, web_url, max_results=10, timeout=60):
    '''Search the Flibusta OPDS catalogue and yield SearchResult objects.

    ``url`` points at the OpenSearch description document, ``query`` is the
    user's search string and ``web_url`` is prefixed onto every relative
    link found in the feed.  At most ``max_results`` entries are yielded.
    '''
    # Resolve the OpenSearch description into a concrete search URL.
    template = Description(url).get_best_template()
    if not template:
        return
    oquery = Query(template)
    oquery.searchTerms = query
    oquery.count = max_results
    search_url = oquery.url()

    # Known open-access MIME types mapped to calibre format names.  These
    # are substring matches, so parameterised media types still hit.
    known_types = (
        ('application/fb2+zip', 'FB2'),
        ('application/txt+zip', 'TXT'),
        ('application/html+zip', 'HTML'),
        ('application/x-mobipocket-ebook', 'MOBI'),
    )

    remaining = max_results
    br = browser()
    with closing(br.open(search_url, timeout=timeout)) as response:
        feed = etree.fromstring(response.read())
        for entry in feed.xpath('//*[local-name() = "entry"]'):
            if remaining <= 0:
                break
            remaining -= 1

            result = SearchResult()
            result.detail_item = ''.join(
                entry.xpath('./*[local-name() = "id"]/text()')).strip()

            for link in entry.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                mime = link.get('type')
                if not (rel and href and mime):
                    continue

                if ('http://opds-spec.org/thumbnail' in rel or
                        'http://opds-spec.org/image/thumbnail' in rel):
                    result.cover_url = web_url + href
                elif 'http://opds-spec.org/acquisition/buy' in rel:
                    result.detail_item = web_url + href
                elif 'http://opds-spec.org/acquisition/sample' in rel:
                    pass  # samples are deliberately ignored
                elif 'http://opds-spec.org/acquisition/open-access' in rel:
                    for pattern, fmt in known_types:
                        if pattern in mime:
                            result.downloads[fmt] = web_url + href
                            break
                    else:
                        # Unknown media type: let mimetypes guess, trying the
                        # raw type first and then with the '+zip' suffix
                        # stripped off.
                        ext = guess_extension(mime)
                        if not ext:
                            ext = guess_extension(mime.replace("+zip", ""))
                        if ext:
                            result.downloads[ext[1:].upper().strip()] = web_url + href

            result.formats = ', '.join(result.downloads.keys()).strip()
            result.title = ' '.join(
                entry.xpath('./*[local-name() = "title"]//text()')).strip()
            result.author = ', '.join(
                entry.xpath(
                    './*[local-name() = "author"]//*[local-name() = "name"]//text()'
                )).strip()

            # Everything on Flibusta is free and DRM-free.
            result.price = '$0.00'
            result.drm = SearchResult.DRM_UNLOCKED

            yield result
示例#48
0
    def search(self, query, max_results=10, timeout=60):
        '''Search empik.com for e-books matching *query*.

        Yields up to ``max_results`` SearchResult objects.  For each hit a
        second request is made to the product detail page to discover
        additional file formats, so the network cost is one extra request
        per result on top of the initial search.
        '''
        # Query is restricted to the e-book category and epub/mobi/pdf
        # formats; resultsPP caps how many results the server returns.
        url = 'http://www.empik.com/szukaj/produkt?c=ebooki-ebooki&q=' + urllib.quote(
            query
        ) + '&qtype=basicForm&start=1&catalogType=pl&searchCategory=3501&format=epub&format=mobi&format=pdf&resultsPP=' + str(
            max_results)

        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//div[@class="productsSet"]/div'):
                if counter <= 0:
                    break

                # The relative URL of the product page doubles as the
                # identifier; skip entries without one.
                id = ''.join(
                    data.xpath('.//a[@class="productBox-450Title"]/@href'))
                if not id:
                    continue

                cover_url = ''.join(
                    data.xpath(
                        './/div[@class="productBox-450Pic"]/a/img/@data-original'
                    ))
                title = ''.join(
                    data.xpath('.//a[@class="productBox-450Title"]/text()'))
                # Drop the ' (ebook)' suffix empik appends to titles.
                title = re.sub(r' \(ebook\)', '', title)
                author = ', '.join(
                    data.xpath(
                        './/div[@class="productBox-450Author"]/a/text()'))
                price = ''.join(
                    data.xpath('.//span[@class="currentPrice"]/text()'))
                # The visible type string starts with 'Ebook' and may carry a
                # parenthesised remark; strip both to leave just the formats.
                formats = ''.join(
                    data.xpath('.//div[@class="productBox-450Type"]/text()'))
                formats = re.sub(r'Ebook *,? *', '', formats)
                formats = re.sub(r'\(.*\)', '', formats)
                # Fetch the detail page to pick up formats not shown in the
                # search listing (shorter timeout: it is a secondary request).
                with closing(
                        br.open('http://empik.com' + id.strip(),
                                timeout=timeout / 4)) as nf:
                    idata = html.fromstring(nf.read())
                    crawled = idata.xpath(
                        './/td[(@class="connectedInfo") or (@class="connectedInfo connectedBordered")]/a/text()'
                    )
                    formats_more = ','.join([
                        re.sub('ebook, ', '', x) for x in crawled
                        if 'ebook' in x
                    ])
                    if formats_more:
                        formats += ', ' + formats_more
                # An 'ADE' marker in the type line is treated as meaning the
                # book is DRM protected (Adobe Digital Editions).
                drm = data.xpath(
                    'boolean(.//div[@class="productBox-450Type" and contains(text(), "ADE")])'
                )

                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price
                s.detail_item = 'http://empik.com' + id.strip()
                s.formats = formats.upper().strip()
                s.drm = SearchResult.DRM_LOCKED if drm else SearchResult.DRM_UNLOCKED

                yield s
示例#49
0
def open_search(url, query, max_results=10, timeout=60):
    '''Run an OpenSearch query against *url* and yield SearchResult objects.

    Only open-access acquisition links are collected: the recommended
    compatible EPUB and the Mobipocket (AZW3) download.  Yields at most
    ``max_results`` entries.
    '''
    template = Description(url).get_best_template()
    if not template:
        return

    # Fill in the OpenSearch template with the query and result count.
    oquery = Query(template)
    oquery.searchTerms = query
    oquery.count = max_results
    search_url = oquery.url()

    remaining = max_results
    br = browser()
    with closing(br.open(search_url, timeout=timeout)) as response:
        feed = safe_xml_fromstring(response.read())
        for entry in feed.xpath('//*[local-name() = "entry"]'):
            if remaining <= 0:
                break
            remaining -= 1

            result = SearchResult()

            result.detail_item = ''.join(
                entry.xpath('./*[local-name() = "id"]/text()')).strip()

            for link in entry.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = BASE_URL + link.get('href')
                mime = link.get('type')
                link_title = link.get('title')

                if not (rel and href and mime):
                    continue

                if rel == 'http://opds-spec.org/image/thumbnail':
                    result.cover_url = href
                elif rel == 'http://opds-spec.org/acquisition/open-access':
                    fmt = None
                    if (mime == 'application/epub+zip' and
                            link_title == 'Recommended compatible epub'):
                        fmt = 'EPUB'
                    elif mime == 'application/x-mobipocket-ebook':
                        fmt = 'AZW3'
                    if fmt:
                        result.downloads[fmt] = href

            result.formats = ', '.join(result.downloads.keys()).strip()

            result.title = ' '.join(
                entry.xpath('./*[local-name() = "title"]//text()')).strip()
            result.author = ', '.join(
                entry.xpath(
                    './*[local-name() = "author"]//*[local-name() = "name"]//text()'
                )).strip()

            # This catalogue only lists free, DRM-free books.
            result.drm = SearchResult.DRM_UNLOCKED
            result.price = '$0.00'

            yield result
示例#50
0
    def search(self, query, max_results=10, timeout=60):
        '''Search ebook.de and yield up to ``max_results`` SearchResult objects.'''
        url = ('http://www.ebook.de/de/pathSearch?nav=52122&searchString=' +
               quote(query))
        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for container in doc.xpath(
                    '//div[contains(@class, "articlecontainer")]'):
                if remaining <= 0:
                    break

                # All metadata lives inside the article info box; skip
                # containers without one.
                infoboxes = container.xpath(
                    './div[contains(@class, "articleinfobox")]')
                if not infoboxes:
                    continue
                info = infoboxes[0]

                book_id = ''.join(info.xpath('./a/@name')).strip()
                if not book_id:
                    continue

                title = ''.join(
                    info.xpath('./h3[@class="title"]/a/text()')).strip()

                author = ''.join(
                    info.xpath('.//div[@class="author"]/text()')).strip()
                if author.startswith('von'):
                    # Drop the leading 'von ' ("by" in German).
                    author = author[4:]

                # Which binder formats does the listing advertise?
                pdf = info.xpath(
                    'boolean(.//span[@class="bindername" and contains(text(), "pdf")]/text())'
                )
                epub = info.xpath(
                    'boolean(.//span[@class="bindername" and contains(text(), "epub")]/text())'
                )
                mobi = info.xpath(
                    'boolean(.//span[@class="bindername" and contains(text(), "mobipocket")]/text())'
                )

                cover_url = ''.join(
                    container.xpath('.//div[@class="coverimg"]/a/img/@src'))
                price = ''.join(
                    container.xpath('.//div[@class="preis"]/text()')).replace(
                        '*', '').strip()

                remaining -= 1

                result = SearchResult()
                result.cover_url = cover_url
                result.title = title.strip()
                result.author = author.strip()
                result.price = price
                result.drm = SearchResult.DRM_UNKNOWN
                result.detail_item = book_id
                available = []
                if epub:
                    available.append('ePub')
                if pdf:
                    available.append('PDF')
                if mobi:
                    available.append('MOBI')
                result.formats = ', '.join(available)

                yield result
示例#51
0
    def search(self, query, max_results=10, timeout=60):
        '''Search ebook.nl and yield up to ``max_results`` SearchResult objects.'''
        url = (
            'http://www.ebook.nl/store/advanced_search_result.php?keywords=' +
            urllib.parse.quote(query))
        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for row in doc.xpath(
                    '//table[contains(@class, "productListing")]/tr'):
                if remaining <= 0:
                    break

                anchors = row.xpath('./td/div[@class="prodImage"]/a')
                if not anchors:
                    continue
                anchor = anchors[0]

                # The product id is the last path component of the link,
                # with any query string stripped off.
                book_id = ''.join(anchor.xpath('./@href')).strip()
                book_id = book_id[book_id.rfind('/') + 1:]
                qpos = book_id.rfind('?')
                if qpos > 0:
                    book_id = book_id[:qpos]
                if not book_id:
                    continue

                cover_url = 'http://www.ebook.nl/store/' + ''.join(
                    anchor.xpath('./img/@src'))
                title = ''.join(anchor.xpath('./img/@title')).strip()
                author = ''.join(
                    row.xpath(
                        './td/div[@class="prodTitle"]/h3/a/text()')).strip()
                price = ''.join(
                    row.xpath('./td/div[@class="prodTitle"]/b/text()'))
                # Format and DRM info is given as Dutch prose in <p> tags.
                pdf = row.xpath(
                    'boolean(./td/div[@class="prodTitle"]/'
                    'p[contains(text(), "Bestandsformaat: Pdf")])')
                epub = row.xpath(
                    'boolean(./td/div[@class="prodTitle"]/'
                    'p[contains(text(), "Bestandsformaat: ePub")])')
                nodrm = row.xpath('boolean(./td/div[@class="prodTitle"]/'
                                  'p[contains(text(), "zonder DRM") or'
                                  '  contains(text(), "watermerk")])')
                remaining -= 1

                result = SearchResult()
                result.cover_url = cover_url
                result.title = title.strip()
                result.author = author.strip()
                result.price = price
                result.drm = (SearchResult.DRM_UNLOCKED
                              if nodrm else SearchResult.DRM_LOCKED)
                result.detail_item = book_id
                available = []
                if epub:
                    available.append('ePub')
                if pdf:
                    available.append('PDF')
                result.formats = ','.join(available)

                yield result
    def search(self, query, max_results=10, timeout=60):
        '''
        Yield up to ``max_results`` SearchResult objects from manybooks.net.

        Manybooks uses a very strange OPDS feed: the main feed is
        structured like a Stanza feed, the search result entries give very
        little information and require a follow-up request to a detail
        link, and the detail link has the wrong type specified
        (text/html instead of application/atom+xml).
        '''
        # The plugin must have an open_search_url attribute; bail out
        # quietly if it is missing.
        if not hasattr(self, 'open_search_url'):
            return

        description = Description(self.open_search_url)
        url_template = description.get_best_template()
        if not url_template:
            return
        oquery = Query(url_template)

        # set up initial values
        oquery.searchTerms = query
        oquery.count = max_results
        url = oquery.url()

        counter = max_results
        br = browser()
        with closing(br.open(url, timeout=timeout)) as f:
            raw_data = f.read()
            raw_data = raw_data.decode('utf-8', 'replace')
            doc = etree.fromstring(raw_data)
            for data in doc.xpath('//*[local-name() = "entry"]'):
                if counter <= 0:
                    break

                counter -= 1

                s = SearchResult()

                # Despite the wrong @type (see docstring), this link points
                # at the entry's detail document.
                detail_links = data.xpath(
                    './*[local-name() = "link" and @type = "text/html"]')
                if not detail_links:
                    continue
                detail_link = detail_links[0]
                detail_href = detail_link.get('href')
                if not detail_href:
                    continue

                s.detail_item = 'http://manybooks.net/titles/' + detail_href.split(
                    'tid=')[-1] + '.html'
                # These can have HTML inside of them. We are going to get them again later
                # just in case.
                s.title = ''.join(
                    data.xpath('./*[local-name() = "title"]//text()')).strip()
                s.author = ', '.join(
                    data.xpath(
                        './*[local-name() = "author"]//text()')).strip()

                # Follow the detail link to get the rest of the info.
                # (Shorter timeout: this is a secondary request.)
                with closing(br.open(detail_href, timeout=timeout / 4)) as df:
                    ddoc = etree.fromstring(df.read())
                    ddata = ddoc.xpath('//*[local-name() = "entry"][1]')
                    if ddata:
                        ddata = ddata[0]

                        # This is the real title and author info we want. We got
                        # it previously just in case it's not specified here for some reason.
                        s.title = ''.join(
                            ddata.xpath('./*[local-name() = "title"]//text()')
                        ).strip()
                        s.author = ', '.join(
                            ddata.xpath('./*[local-name() = "author"]//text()')
                        ).strip()
                        # Trim stray leading/trailing commas, presumably left
                        # by empty author elements.
                        if s.author.startswith(','):
                            s.author = s.author[1:]
                        if s.author.endswith(','):
                            s.author = s.author[:-1]

                        s.cover_url = ''.join(
                            ddata.xpath(
                                './*[local-name() = "link" and @rel = "http://opds-spec.org/thumbnail"][1]/@href'
                            )).strip()

                        # Collect direct download links keyed by the file
                        # extension guessed from the MIME type.
                        for link in ddata.xpath(
                                './*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'
                        ):
                            type = link.get('type')
                            href = link.get('href')
                            if type:
                                ext = mimetypes.guess_extension(type)
                                if ext:
                                    ext = ext[1:].upper().strip()
                                    s.downloads[ext] = href

                s.price = '$0.00'
                s.drm = SearchResult.DRM_UNLOCKED
                # Hard-coded format list used for every result.
                s.formats = 'EPUB, PDB (eReader, PalmDoc, zTXT, Plucker, iSilo), FB2, ZIP, AZW, MOBI, PRC, LIT, PKG, PDF, TXT, RB, RTF, LRF, TCR, JAR'

                yield s
示例#53
0
    def search(self, query, max_results=10, timeout=60):
        '''Search chitanka.info and yield up to ``max_results`` SearchResult objects.

        Runs a title search first; if the result quota is not yet filled,
        follows author links from the same result page and scans each
        author's book list as well.  NOTE: this is Python 2 code
        (``unicode()``, ``urllib2``).
        '''
        # check for cyrillic symbols before performing search
        # (the site is Bulgarian; at least 3 cyrillic/digit/space chars required)
        uquery = unicode(query.strip(), 'utf-8')
        reObj = re.search(u'^[а-яА-Я\\d\\s]{3,}$', uquery)
        if not reObj:
            return

        base_url = 'http://chitanka.info'
        url = base_url + '/search?q=' +  urllib2.quote(query)
        counter = max_results

        # search for book title
        br = browser()
        try:
            with closing(br.open(url, timeout=timeout)) as f:
                f = unicode(f.read(), 'utf-8')
                doc = html.fromstring(f)

                for data in doc.xpath('//ul[@class="superlist booklist"]/li'):
                    if counter <= 0:
                        break

                    id = ''.join(data.xpath('.//a[@class="booklink"]/@href')).strip()
                    if not id:
                        continue

                    counter -= 1

                    s = SearchResult()
                    s.cover_url = ''.join(data.xpath('.//a[@class="booklink"]/img/@src')).strip()
                    s.title = ''.join(data.xpath('.//a[@class="booklink"]/i/text()')).strip()
                    s.author = ''.join(data.xpath('.//span[@class="bookauthor"]/a/text()')).strip()
                    s.detail_item = id
                    s.drm = SearchResult.DRM_UNLOCKED
                    # Download links point at .zip archives; dropping the
                    # extension yields the bare file URL.
                    s.downloads['FB2'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-fb2"]/@href')).strip().replace('.zip', '')
                    s.downloads['EPUB'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-epub"]/@href')).strip().replace('.zip', '')
                    s.downloads['TXT'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-txt"]/@href')).strip().replace('.zip', '')
                    s.formats = 'FB2, EPUB, TXT, SFB'
                    yield s
        except urllib2.HTTPError as e:
            if e.code == 404:
                # No results page -- treat as an empty result set.
                return
            else:
                raise

        # search for author names
        # NOTE: relies on `doc` escaping the `with`/`try` scope above; if the
        # first request 404'd we have already returned.
        for data in doc.xpath('//ul[@class="superlist"][1]/li/dl/dt'):
            author_url = ''.join(data.xpath('.//a[contains(@href,"/person/")]/@href'))
            if author_url == '':
                continue
            if counter <= 0:
                break

            br2 = browser()
            with closing(br2.open(base_url + author_url, timeout=timeout)) as f:
                if counter <= 0:
                    break
                f = unicode(f.read(), 'utf-8')
                doc2 = html.fromstring(f)

                # search for book title
                for data in doc2.xpath('//ul[@class="superlist booklist"]/li'):
                    if counter <= 0:
                        break

                    id = ''.join(data.xpath('.//a[@class="booklink"]/@href')).strip()
                    if not id:
                        continue

                    # Only keep the author's books that actually match the
                    # query in title or author name.
                    title = ''.join(data.xpath('.//a[@class="booklink"]/i/text()')).strip()
                    author = ''.join(data.xpath('.//span[@class="bookauthor"]/a/text()')).strip()
                    if title.lower().find(query.lower()) == -1 and author.lower().find(query.lower()) == -1:
                        continue

                    counter -= 1

                    s = SearchResult()
                    s.cover_url = ''.join(data.xpath('.//a[@class="booklink"]/img/@src')).strip()
                    s.title = title
                    s.author = author
                    s.detail_item = id
                    s.drm = SearchResult.DRM_UNLOCKED
                    s.downloads['FB2'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-fb2"]/@href')).strip().replace('.zip', '')
                    s.downloads['EPUB'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-epub"]/@href')).strip().replace('.zip', '')
                    s.downloads['TXT'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-txt"]/@href')).strip().replace('.zip', '')
                    s.formats = 'FB2, EPUB, TXT, SFB'
                    yield s
示例#54
0
def search_amazon(query,
                  max_results=10,
                  timeout=60,
                  write_html_to=None,
                  base_url=SEARCH_BASE_URL,
                  base_query=SEARCH_BASE_QUERY,
                  field_keywords='k'):
    '''Search the Amazon Kindle store and yield SearchResult objects.

    ``query`` is placed into the ``field_keywords`` query parameter on top
    of ``base_query``.  At most ``max_results`` Kindle books are yielded;
    non-Kindle hits (e.g. author pages) are skipped.  If ``write_html_to``
    is given, the raw search page is dumped there for offline debugging.
    '''
    uquery = base_query.copy()
    uquery[field_keywords] = query

    def asbytes(x):
        # urlencode wants byte strings on older Pythons.
        if isinstance(x, type('')):
            x = x.encode('utf-8')
        return x

    uquery = {asbytes(k): asbytes(v) for k, v in uquery.items()}
    url = base_url + '?' + urlencode(uquery).decode('ascii')
    br = browser(user_agent=get_user_agent())

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
        if write_html_to is not None:
            # Use a distinct name so the response handle `f` (still needed
            # by the enclosing closing() context) is not shadowed.
            with open(write_html_to, 'wb') as dump:
                dump.write(raw)
        doc = html.fromstring(raw)
        for result in doc.xpath(
                '//div[contains(@class, "s-result-list")]//div[@data-index and @data-asin]'
        ):
            # BUG FIX: the counter was decremented but never checked, so
            # max_results was not enforced.
            if counter <= 0:
                break
            kformat = ''.join(
                result.xpath(
                    './/a[contains(text(), "Kindle Edition")]//text()'))
            # Even though we are searching digital-text only Amazon will still
            # put in results for non Kindle books (author pages). So we need
            # to explicitly check if the item is a Kindle book and ignore it
            # if it isn't.
            if 'kindle' not in kformat.lower():
                continue
            asin = result.get('data-asin')
            if not asin:
                continue

            cover_url = ''.join(result.xpath('.//img/@src'))
            title = etree.tostring(result.xpath('.//h5')[0],
                                   method='text',
                                   encoding='unicode')
            adiv = result.xpath(
                './/div[contains(@class, "a-color-secondary")]')[0]
            aparts = etree.tostring(adiv, method='text',
                                    encoding='unicode').split()
            # The secondary line looks like 'by <names> | <date> ...'; the
            # author is everything between the leading word and the '|'.
            try:
                idx = aparts.index('|')
            except ValueError:
                # ROBUSTNESS FIX: a missing '|' separator used to raise
                # ValueError and abort the whole search; fall back to
                # taking the full tail of the line.
                idx = len(aparts)
            author = ' '.join(aparts[1:idx])
            price = ''.join(
                result.xpath(
                    './/span[contains(@class, "a-price")]/span[contains(@class, "a-offscreen")]/text()'
                ))

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url.strip()
            s.title = title.strip()
            s.author = author.strip()
            s.detail_item = asin.strip()
            s.price = price.strip()
            s.formats = 'Kindle'

            yield s
示例#55
0
    def search(self, query, max_results=10, timeout=60):
        '''Search an Amazon EU storefront and yield up to ``max_results``
        Kindle SearchResult objects.

        Amazon renders results in one of three layouts (grid, "ilo", or
        list); the appropriate set of XPath selectors is chosen by probing
        the page structure before scraping.
        '''
        # The query is percent-encoded by hand: backslashreplace turns
        # non-ascii chars into \xNN escapes, which are then rewritten to
        # %NN; literal '%' must be escaped first.
        url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+')
        #print(url)
        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            allText = f.read()
            doc = html.fromstring(allText)#.decode('latin-1', 'replace'))

            # Pick the selector set matching the page layout.
            if doc.xpath('//div[@id = "atfResults" and contains(@class, "grid")]'):
                #print('grid form')
                data_xpath = '//div[contains(@class, "prod")]'
                format_xpath = (
                        './/ul[contains(@class, "rsltGridList")]'
                        '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()')
                asin_xpath = '@name'
                cover_xpath = './/img[@class="productImage"]/@src'
                title_xpath = './/h3[@class="newaps"]/a//text()'
                author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
                price_xpath = (
                        './/ul[contains(@class, "rsltGridList")]'
                        '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()')
            elif doc.xpath('//div[@id = "atfResults" and contains(@class, "ilresults")]'):
                #print('ilo form')
                data_xpath = '//li[(@class="ilo")]'
                format_xpath = (
                        './/ul[contains(@class, "rsltGridList")]'
                        '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()')
                asin_xpath = '@name'
                cover_xpath = './div[@class = "ilf"]/a/img[contains(@class, "ilo")]/@src'
                title_xpath = './/h3[@class="newaps"]/a//text()'
                author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
                # Results can be in a grid (table) or a column
                price_xpath = (
                        './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
                        '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()')
            elif doc.xpath('//div[@id = "atfResults" and contains(@class, "list")]'):
                #print('list form')
                data_xpath = '//div[contains(@class, "prod")]'
                format_xpath = (
                        './/ul[contains(@class, "rsltL")]'
                        '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()')
                asin_xpath = '@name'
                cover_xpath = './/img[@class="productImage"]/@src'
                title_xpath = './/h3[@class="newaps"]/a//text()'
                author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
                price_xpath = (
                        './/ul[contains(@class, "rsltL")]'
                        '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()')
            else:
                # URK -- whats this?  Unknown layout; give up rather than
                # scrape garbage.
                print('unknown result table form for Amazon EU search')
                #with open("c:/amazon_search_results.html", "w") as out:
                #    out.write(allText)
                return


            for data in doc.xpath(data_xpath):
                if counter <= 0:
                    break

                # Even though we are searching digital-text only Amazon will still
                # put in results for non Kindle books (authors pages). So we need
                # to explicitly check if the item is a Kindle book and ignore it
                # if it isn't.
                format_ = ''.join(data.xpath(format_xpath))
                if 'kindle' not in format_.lower():
                    continue

                # We must have an asin otherwise we can't easily reference the
                # book later.
                asin = data.xpath(asin_xpath)
                if asin:
                    asin = asin[0]
                else:
                    continue

                cover_url = ''.join(data.xpath(cover_xpath))

                title = ''.join(data.xpath(title_xpath))

                # Strip the locale-specific leading article and normalise
                # the conjunction; drop any trailing '(N ...)' remark.
                authors = ''.join(data.xpath(author_xpath))
                authors = re.sub('^' + self.author_article, '', authors)
                authors = re.sub(self.and_word, ' & ', authors)
                mo = re.match(r'(.*)(\(\d.*)$', authors)
                if mo:
                    authors = mo.group(1).strip()

                price = ''.join(data.xpath(price_xpath))

                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url.strip()
                s.title = title.strip()
                s.author = authors.strip()
                s.price = price.strip()
                s.detail_item = asin.strip()
                s.drm = SearchResult.DRM_UNKNOWN
                s.formats = 'Kindle'

                yield s
示例#56
0
def search(query, max_results=10, timeout=60, write_raw_to=None):
    '''
    Search Project Gutenberg's mobile OPDS feed and yield SearchResult
    objects, at most ``max_results`` of them.

    :param query: search terms (will be URL-quoted).
    :param max_results: upper bound on the number of results yielded.
    :param timeout: network timeout in seconds for the search request;
        per-book detail requests use ``timeout / 4``.
    :param write_raw_to: optional path; when given, the raw feed bytes are
        also written to this file (useful for debugging).
    '''
    url = 'http://m.gutenberg.org/ebooks/search.opds/?query=' + quote_plus(
        query)

    counter = max_results
    br = browser(user_agent='calibre/' + __version__)
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
        if write_raw_to is not None:
            # Use a distinct name for the debug dump file so we do not
            # shadow the network response handle ``f`` above.
            with open(write_raw_to, 'wb') as raw_file:
                raw_file.write(raw)
        doc = etree.fromstring(raw)
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break

            counter -= 1

            s = SearchResult()

            # We could use the <link rel="alternate" type="text/html" ...> tag from the
            # detail odps page but this is easier: the <id> element holds the
            # OPDS detail URL; reduce it to the numeric book id to build the
            # web detail page URL.
            entry_id = fix_url(''.join(
                data.xpath('./*[local-name() = "id"]/text()')).strip())
            s.detail_item = url_slash_cleaner(
                '%s/ebooks/%s' % (web_url, re.sub(r'[^\d]', '', entry_id)))
            s.title = ' '.join(
                data.xpath('./*[local-name() = "title"]//text()')).strip()
            # Gutenberg's feed puts the author in <content>, not <author>.
            s.author = ', '.join(
                data.xpath('./*[local-name() = "content"]//text()')).strip()
            if not s.title or not s.author:
                continue

            # Get the formats and direct download links from the entry's own
            # OPDS page (acquisition links are not in the search result).
            with closing(br.open(entry_id, timeout=timeout / 4)) as nf:
                ndoc = etree.fromstring(nf.read())
                for link in ndoc.xpath(
                        '//*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'
                ):
                    mime_type = link.get('type')
                    href = link.get('href')
                    if mime_type:
                        ext = mimetypes.guess_extension(mime_type)
                        if ext:
                            ext = ext[1:].upper().strip()
                            s.downloads[ext] = fix_url(href)

            s.formats = ', '.join(s.downloads.keys())
            if not s.formats:
                continue

            # Covers are inline base64-encoded PNG thumbnails, not URLs.
            for link in data.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                link_type = link.get('type')

                if rel and href and link_type:
                    href = fix_url(href)
                    if rel in ('http://opds-spec.org/thumbnail',
                               'http://opds-spec.org/image/thumbnail'):
                        if href.startswith('data:image/png;base64,'):
                            cdata = href.replace('data:image/png;base64,', '')
                            if not isinstance(cdata, bytes):
                                cdata = cdata.encode('ascii')
                            s.cover_data = base64.b64decode(cdata)

            yield s
示例#57
0
    def search(self, query, max_results=10, timeout=60):
        '''
        Search e-knigi.net and yield SearchResult objects, at most
        ``max_results`` of them.

        The store only carries Cyrillic titles, so queries without Cyrillic
        characters are rejected up front without a network request.
        '''
        # check for cyrillic symbols before performing search
        # (type(u'')(...) keeps this working on both Python 2 and 3)
        uquery = type(u'')(query.strip(), 'utf-8')
        if not re.search(u'^[а-яА-Я\\d\\s]{2,}$', uquery):
            return

        base_url = 'http://e-knigi.net'
        url = base_url + '/virtuemart?page=shop.browse&search_category=0&search_limiter=anywhere&keyword=' + quote(
            query)

        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())

            # if the store finds only one product, it opens directly detail view
            for data in doc.xpath('//div[@class="prod_details"]'):
                s = SearchResult()
                s.cover_url = ''.join(
                    data.xpath(
                        './/div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@src'
                    )).strip()
                s.title = ''.join(
                    data.xpath(
                        './/div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@alt'
                    )).strip()
                s.author = ''.join(
                    data.xpath(
                        './/div[@class="td_bg clearfix"]/div[@class="gk_product_tab"]/div/table/tr[3]/td[2]/text()'
                    )).strip()
                s.price = ''.join(
                    data.xpath(
                        './/span[@class="productPrice"]/text()')).strip()
                s.detail_item = url
                s.drm = SearchResult.DRM_UNLOCKED

                yield s
                return

            # search in store results
            lquery = query.lower()
            for data in doc.xpath('//div[@class="browseProductContainer"]'):
                if counter <= 0:
                    break
                # renamed from ``id`` to avoid shadowing the builtin
                detail_href = ''.join(data.xpath('.//a[1]/@href')).strip()
                if not detail_href:
                    continue

                title = ''.join(
                    data.xpath('.//a[@class="gk_vm_product_image"]/img/@title')
                ).strip()
                author = ''.join(
                    data.xpath('.//div[@style="float:left;width:90%"]/b/text()'
                               )).strip().replace('Автор: ', '')

                # the store's search matches loosely; keep only results that
                # actually contain the query in the title or the author
                if lquery not in title.lower() and lquery not in author.lower():
                    continue

                counter -= 1

                s = SearchResult()
                s.cover_url = ''.join(
                    data.xpath('.//a[@class="gk_vm_product_image"]/img/@src')
                ).strip()
                s.title = title
                s.author = author
                s.price = ''.join(
                    data.xpath(
                        './/span[@class="productPrice"]/text()')).strip()
                s.detail_item = base_url + detail_href
                s.drm = SearchResult.DRM_UNLOCKED

                yield s
示例#58
0
    def search(self, query, max_results=10, timeout=60):
        '''
        Search woblink.com and yield SearchResult objects, at most
        ``max_results`` of them.

        Format icons on the result page encode both the format and the DRM
        status: an ``epub_drm``/``pdf`` icon means a DRM-locked book.
        '''
        url = 'http://woblink.com/katalog-e-book?query=' + urllib.quote_plus(
            query.encode('utf-8'))
        # The site pages at 10 results by default; ask for a larger page
        # when the caller wants more.
        if max_results > 10:
            if max_results > 20:
                url += '&limit=30'
            else:
                url += '&limit=20'

        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//div[@class="book-item backgroundmix"]'):
                if counter <= 0:
                    break

                # renamed from ``id`` to avoid shadowing the builtin
                detail_url = ''.join(
                    data.xpath(
                        './/td[@class="w10 va-t mYHaveItYes"]/a[1]/@href'))
                if not detail_url:
                    continue

                cover_url = ''.join(
                    data.xpath(
                        './/td[@class="w10 va-t mYHaveItYes"]/a[1]/img/@src'))
                title = ''.join(
                    data.xpath('.//h2[@class="title"]/a[1]/text()'))
                author = ', '.join(
                    data.xpath('.//td[@class="va-t"]/h3/a/text()'))
                price = ''.join(
                    data.xpath(
                        './/div[@class="prices"]/span[1]/strong/span/text()'))
                # Polish locale uses a decimal comma; a plain literal
                # replacement suffices (no regex needed).
                price = price.replace('.', ',')
                # Format names are embedded in the icon image paths.
                formats = [
                    form[8:-4].split('.')[0]
                    for form in data.xpath('.//p[3]/img/@src')
                ]

                s = SearchResult()
                s.cover_url = 'http://woblink.com' + cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price + ' zł'
                s.detail_item = detail_url.strip()

                if 'epub_drm' in formats:
                    s.drm = SearchResult.DRM_LOCKED
                    s.formats = 'EPUB'
                elif 'pdf' in formats:
                    s.drm = SearchResult.DRM_LOCKED
                    s.formats = 'PDF'
                else:
                    s.drm = SearchResult.DRM_UNLOCKED
                    # 'MOBI_nieb' is the icon name for a plain MOBI file.
                    if 'MOBI_nieb' in formats:
                        formats.remove('MOBI_nieb')
                        formats.append('MOBI')
                    s.formats = ', '.join(formats).upper()

                counter -= 1
                yield s
示例#59
0
    def search(self, query, max_results=10, timeout=60):
        '''
        XinXii's open search url is:
        http://www.xinxii.com/catalog-search/query/?keywords={searchTerms}&amp;pw={startPage?}&amp;doc_lang={docLang}&amp;ff={docFormat},{docFormat},{docFormat}

        This url requires the docLang and docFormat. However, the search itself
        sent to XinXii does not require them. They can be ignored. We cannot
        push this into the standard OpenSearchOPDSStore search because of the
        required attributes.

        XinXii doesn't return all info supported by OpenSearchOPDSStore search
        function so this one is modified to remove the parts that are not
        returned.

        Yields at most ``max_results`` SearchResult objects.
        '''

        url = 'http://www.xinxii.com/catalog-search/query/?keywords=' + quote_plus(
            query)

        counter = max_results
        br = browser()
        with closing(br.open(url, timeout=timeout)) as f:
            doc = etree.fromstring(f.read())
            for data in doc.xpath('//*[local-name() = "entry"]'):
                if counter <= 0:
                    break

                counter -= 1

                s = SearchResult()

                # Fall back to the <id> URL; overridden below by the
                # rel="alternate" link when one is present.
                s.detail_item = ''.join(
                    data.xpath('./*[local-name() = "id"]/text()')).strip()

                for link in data.xpath('./*[local-name() = "link"]'):
                    rel = link.get('rel')
                    href = link.get('href')
                    # renamed from ``type`` to avoid shadowing the builtin
                    link_type = link.get('type')

                    if rel and href and link_type:
                        if rel in ('http://opds-spec.org/thumbnail',
                                   'http://opds-spec.org/image/thumbnail'):
                            s.cover_url = href
                        if rel == 'alternate':
                            s.detail_item = href

                # XinXii offers every book in both formats.
                s.formats = 'EPUB, PDF'

                s.title = ' '.join(
                    data.xpath('./*[local-name() = "title"]//text()')).strip()
                s.author = ', '.join(
                    data.xpath(
                        './*[local-name() = "author"]//*[local-name() = "name"]//text()'
                    )).strip()

                price_e = data.xpath('.//*[local-name() = "price"][1]')
                if price_e:
                    price_e = price_e[0]
                    currency_code = price_e.get('currencycode', '')
                    price = ''.join(price_e.xpath('.//text()')).strip()
                    s.price = currency_code + ' ' + price
                    s.price = s.price.strip()

                yield s
示例#60
0
    def search(self, query, max_results=10, timeout=60):
        '''
        Gutenberg's ODPS feed is poorly implmented and has a number of issues
        which require very special handling to fix the results.

        Issues:
          * "Sort Alphabetically" and "Sort by Release Date" are returned
            as book entries.
          * The author is put into a "content" tag and not the author tag.
          * The link to the book itself goes to an odps page which we need
            to turn into a link to a web page.
          * acquisition links are not part of the search result so we have
            to go to the odps item itself. Detail item pages have a nasty
            note saying:
              DON'T USE THIS PAGE FOR SCRAPING.
              Seriously. You'll only get your IP blocked.
            We're using the ODPS feed because people are getting blocked with
            the previous implementation so due to this using ODPS probably
            won't solve this issue.
          * Images are not links but base64 encoded strings. They are also not
            real cover images but a little blue book thumbnail.

        Yields at most ``max_results`` SearchResult objects.
        '''

        url = 'http://m.gutenberg.org/ebooks/search.opds/?query=' + urllib.quote_plus(query)

        counter = max_results
        br = browser(user_agent='calibre/'+__version__)
        with closing(br.open(url, timeout=timeout)) as f:
            doc = etree.fromstring(f.read())
            for data in doc.xpath('//*[local-name() = "entry"]'):
                if counter <= 0:
                    break

                counter -= 1

                s = SearchResult()

                # We could use the <link rel="alternate" type="text/html" ...> tag from the
                # detail odps page but this is easier.  Renamed from ``id``
                # to avoid shadowing the builtin.
                entry_id = ''.join(data.xpath('./*[local-name() = "id"]/text()')).strip()
                # Raw string so '\d' is not an invalid escape sequence.
                s.detail_item = url_slash_cleaner('%s/ebooks/%s' % (self.web_url, re.sub(r'[^\d]', '', entry_id)))
                if not s.detail_item:
                    continue

                s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
                s.author = ', '.join(data.xpath('./*[local-name() = "content"]//text()')).strip()
                if not s.title or not s.author:
                    continue

                # Get the formats and direct download links.
                with closing(br.open(entry_id, timeout=timeout/4)) as nf:
                    ndoc = etree.fromstring(nf.read())
                    for link in ndoc.xpath('//*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'):
                        mime_type = link.get('type')
                        href = link.get('href')
                        if mime_type:
                            ext = mimetypes.guess_extension(mime_type)
                            if ext:
                                ext = ext[1:].upper().strip()
                                s.downloads[ext] = href

                s.formats = ', '.join(s.downloads.keys())
                if not s.formats:
                    continue

                # Covers are inline base64-encoded PNG thumbnails, not URLs.
                for link in data.xpath('./*[local-name() = "link"]'):
                    rel = link.get('rel')
                    href = link.get('href')
                    link_type = link.get('type')

                    if rel and href and link_type:
                        if rel in ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail'):
                            if href.startswith('data:image/png;base64,'):
                                s.cover_data = base64.b64decode(href.replace('data:image/png;base64,', ''))

                yield s