def create_search_result(self, data):
        """Build a SearchResult from a single ``fb2-book`` element."""
        attr_xp = 'normalize-space(@{0})'

        result = SearchResult()
        result.drm = SearchResult.DRM_UNLOCKED
        result.detail_item = data.xpath(attr_xp.format('hub_id'))
        result.title = data.xpath(
            'string(.//title-info/book-title/text()|.//publish-info/book-name/text())'
        )
        # Author name parts (first/middle/last) are joined with single spaces.
        name_parts = data.xpath('.//title-info/author/first-name/text()|'
                                './/title-info/author/middle-name/text()|'
                                './/title-info/author/last-name/text()')
        result.author = u' '.join(map(type(u''), name_parts))
        # cover vs cover_preview
        result.cover_url = data.xpath(attr_xp.format('cover_preview'))
        result.price = format_price_in_RUR(data.xpath(attr_xp.format('price')))

        file_types = data.xpath('//fb2-book//files/file/@type')
        result.formats = ', '.join(_parse_ebook_formats(' '.join(file_types)))
        return result
示例#2
0
    def search(self, query, max_results=10, timeout=60):
        """Look up *query* on LibGen by title and by author and yield results."""
        try:
            hits = lg.search(query, 'title') + lg.search(query, 'author')
            results = lg.lookup(hits)
            print('Reached LibGen Mirrors.')
        except Exception as e:
            # Best effort: mirrors are frequently unreachable; report and bail.
            print(e)
            print(
                'pylibgen crashed. In most cases this is caused by unreachable LibGen Mirrors, try again in a few minutes.'
            )
            return

        self.num_results = len(results)

        for entry in results:
            res = SearchResult()
            res.title = entry['title']
            res.author = entry['author']
            res.price = '$0.00'
            res.drm = SearchResult.DRM_UNLOCKED
            res.formats = entry['extension']
            res.detail_item = entry['md5']
            yield res
def search_google(query, max_results=10, timeout=60, write_html_to=None):
    """Scrape Google Books search results.

    :param query: search phrase, URL-quoted before use
    :param max_results: maximum number of results to yield
    :param timeout: network timeout in seconds
    :param write_html_to: optional path; the fetched page is dumped there
        for debugging
    """
    url = 'https://www.google.com/search?tbm=bks&q=' + quote_plus(query)

    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
        doc = parse_html(raw)
        if write_html_to is not None:
            praw = html.tostring(doc, encoding='utf-8')
            # Fix: close the debug-dump handle deterministically instead of
            # leaking it from a bare open().write().
            with open(write_html_to, 'wb') as dump:
                dump.write(praw)
        for data in doc.xpath('//div[@id="rso"]//div[@class="g"]'):
            if counter <= 0:
                break

            result_id = ''.join(data.xpath('.//h3/a/@href'))
            if not result_id:
                continue

            title = ''.join(data.xpath('.//h3/a//text()'))
            authors = data.xpath('descendant::div[@class="s"]//a[@class="fl" and @href]//text()')
            # Trailing anchor texts such as "Preview" are not author names.
            while authors and authors[-1].strip().lower() in ('preview', 'read', 'more editions'):
                authors = authors[:-1]
            if not authors:
                continue
            author = ' & '.join(authors)

            counter -= 1

            s = SearchResult()
            s.title = title.strip()
            s.author = author.strip()
            s.detail_item = result_id.strip()
            s.drm = SearchResult.DRM_UNKNOWN

            yield s
示例#4
0
    def search(self, query, max_results=10, timeout=60):
        """Search bubok.pt via its calibre reseller endpoint."""
        url = 'http://www.bubok.pt/resellers/calibre_search/' + urllib.quote_plus(
            query)

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for node in doc.xpath('//div[contains(@class, "libro")]'):
                if remaining <= 0:
                    break

                def field(cls):
                    # Concatenated text of the child div carrying this class.
                    return ''.join(node.xpath('.//div[@class="%s"]/text()' % cls))

                remaining -= 1

                res = SearchResult()
                res.title = field('titulo').strip()
                res.author = field('autor').strip()
                res.detail_item = field('url').strip()
                res.price = field('precio').strip()
                res.drm = SearchResult.DRM_UNLOCKED
                res.formats = field('formatos').strip()
                res.cover_url = field('portada').strip()
                yield res
示例#5
0
    def search(self, query, max_results=10, timeout=60):
        """Search legimi.com's Polish ebook catalogue."""
        url = 'http://www.legimi.com/pl/ebooki/?szukaj=' + urllib.quote_plus(
            query)

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for entry in doc.xpath('//div[@id="listBooks"]/div'):
                if remaining <= 0:
                    break

                detail_href = ''.join(entry.xpath('.//a[1]/@href'))
                if not detail_href:
                    continue

                remaining -= 1

                res = SearchResult()
                # Image src is protocol-relative; prefix the scheme.
                res.cover_url = 'http:' + ''.join(entry.xpath('.//img[1]/@src'))
                res.title = ''.join(entry.xpath(
                    './/span[@class="bookListTitle ellipsis"]/text()')).strip()
                res.author = ''.join(entry.xpath(
                    './/span[@class="bookListAuthor ellipsis"]/text()')).strip()
                res.price = ''.join(entry.xpath(
                    './/div[@class="bookListPrice"]/span/text()'))
                res.detail_item = 'http://www.legimi.com/' + detail_href.strip()

                yield res
示例#6
0
    def search(self, query, max_results=10, timeout=60):
        """Search swiatebookow.pl, following pagination until exhausted."""
        br = browser()
        page = 1

        remaining = max_results
        while remaining:
            url = ('https://www.swiatebookow.pl/ebooki/?q=' + quote(query) +
                   '&page=' + str(page))
            with closing(br.open(url, timeout=timeout)) as f:
                doc = html.fromstring(f.read().decode('utf-8'))
                for entry in doc.xpath('//div[@class="category-item-container"]//div[@class="book-large"]'):
                    if remaining <= 0:
                        break

                    detail_href = ''.join(entry.xpath('./a/@href'))
                    if not detail_href:
                        continue

                    remaining -= 1

                    res = SearchResult()
                    res.cover_url = 'https://www.swiatebookow.pl' + ''.join(
                        entry.xpath('.//div[@class="cover-xs"]/img/@src'))
                    res.title = ''.join(entry.xpath('.//h3/text()')).strip()
                    res.author = ', '.join(
                        entry.xpath('.//div[@class="details"]/p/a/text()')).strip()
                    # Whole price plus the fractional part shown in a sub span.
                    res.price = ''.join(
                        entry.xpath('.//span[@class="item-price"]/text()') +
                        entry.xpath('.//span[@class="sub-price"]/text()'))
                    res.detail_item = 'https://www.swiatebookow.pl' + detail_href
                    res.drm = SearchResult.DRM_UNLOCKED

                    yield res
                # Stop once there is no "next page" link.
                if not doc.xpath('//div[@class="paging_bootstrap pagination"]//a[@class="next"]'):
                    break
            page += 1
    def search(self, query, max_results=10, timeout=60):
        '''
        Searches LibGen for Books. Since the mirror links are not direct
        downloads, it should not provide these as `s.downloads`.
        '''

        debug_print('Libgen Fiction::__init__.py:LibgenStore:search:query =',
                    query)

        libgen_results = self.libgen.search(query)

        for result in libgen_results.results[:min(max_results, len(libgen_results.results))]:
            debug_print('Libgen Fiction::__init__.py:LibgenStore:search:'
                        'result.title =',
                        result.title)

            for mirror in result.mirrors[0:1]:  # Calibre only shows 1 anyway
                debug_print('Libgen Fiction::__init__.py:LibgenStore:search:'
                            'result.mirror.url =', mirror.url)

                s = SearchResult()

                s.store_name = PLUGIN_NAME
                s.cover_url = result.image_url
                s.title = '{} ({}, {}{})'.format(
                    result.title, result.language, mirror.size, mirror.unit)
                s.author = result.authors
                s.price = '0.00'
                s.detail_item = result.md5
                s.drm = SearchResult.DRM_UNLOCKED
                s.formats = mirror.format
                s.plugin_author = PLUGIN_AUTHORS

                debug_print('Libgen Fiction::__init__.py:LibgenStore:search:s =',
                            s)

                yield s
示例#8
0
    def search(self, query, max_results=10, timeout=60):
        """Search wolnelektury.pl (free public-domain books, DRM-free)."""
        base = 'http://wolnelektury.pl'
        url = 'http://wolnelektury.pl/szukaj?q=' + urllib.quote_plus(query.encode('utf-8'))

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for entry in doc.xpath('//li[@class="Book-item"]'):
                if remaining <= 0:
                    break

                detail_href = ''.join(entry.xpath('.//div[@class="title"]/a/@href'))
                if not detail_href:
                    continue

                remaining -= 1

                res = SearchResult()
                # Every listed format is a direct download link.
                for link in entry.xpath('.//div[@class="book-box-formats"]/span/a'):
                    ext = ''.join(link.xpath('./text()'))
                    res.downloads[ext] = base + link.get('href')
                res.cover_url = base + ''.join(
                    entry.xpath('.//div[@class="cover-area"]//img/@src')).strip()
                res.title = ''.join(
                    entry.xpath('.//div[@class="title"]/a[1]/text()')).strip()
                res.author = ', '.join(entry.xpath('.//div[@class="author"]/a/text()'))
                res.price = '0,00 zł'
                res.detail_item = base + detail_href
                res.formats = ', '.join(res.downloads.keys())
                res.drm = SearchResult.DRM_UNLOCKED

                yield res
示例#9
0
    def search(self, query, max_results=20, timeout=60):
        """Search publio.pl, walking result pages until the last one."""
        br = browser()

        remaining = max_results
        page = 1
        while remaining:
            url = ('http://www.publio.pl/e-booki,strona' + str(page) +
                   '.html?q=' + quote(query))
            with closing(br.open(url, timeout=timeout)) as f:
                doc = html.fromstring(f.read())
                for entry in doc.xpath('//div[@class="products-list"]//div[@class="product-tile"]'):
                    if remaining <= 0:
                        break

                    detail_href = ''.join(entry.xpath('.//a[@class="product-tile-cover"]/@href'))
                    if not detail_href:
                        continue

                    remaining -= 1

                    res = SearchResult()
                    res.cover_url = 'http://www.publio.pl' + ''.join(
                        entry.xpath('.//img[@class="product-tile-cover-photo"]/@src'))
                    res.title = ''.join(
                        entry.xpath('.//span[@class="product-tile-title-long"]/text()')).strip()
                    res.author = ', '.join(
                        entry.xpath('.//span[@class="product-tile-author"]/a/text()'))
                    res.price = ''.join(
                        entry.xpath('.//div[@class="product-tile-price-wrapper "]/a/ins/text()'))
                    # Format name is embedded in the cover alt text after " - ebook ".
                    res.formats = ''.join(
                        entry.xpath('.//a[@class="product-tile-cover"]/img/@alt')
                    ).split(' - ebook ')[1].upper().strip()
                    res.detail_item = 'http://www.publio.pl' + detail_href.strip()

                    yield res
                if not doc.xpath('boolean(//a[@class="next"])'):
                    break
                page += 1
示例#10
0
    def search(self, query, max_results=10, timeout=60):
        """Search empik.com ebooks; fetches each result's detail page to
        discover the available file formats."""
        url = 'http://www.empik.com/ebooki/ebooki,3501,s?resultsPP=' + str(max_results) + '&q=' + urllib.quote(query)

        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//div[@class="search-list-item"]'):
                if counter <= 0:
                    break

                id = ''.join(data.xpath('.//div[@class="name"]/a/@href'))
                if not id:
                    continue

                # Covers are lazy-loaded; the real URL is in @lazy-img.
                cover_url = ''.join(data.xpath('.//a/img[@class="lazy"]/@lazy-img'))
                author = ', '.join(data.xpath('.//div[@class="smartAuthorWrapper"]/a/text()'))
                title = ''.join(data.xpath('.//div[@class="name"]/a/@title'))
                price = ''.join(data.xpath('.//div[@class="price"]/text()'))

                # Secondary request per result: the format list only appears
                # on the detail page; a shorter timeout bounds total time.
                with closing(br.open('http://empik.com' + id.strip(), timeout=timeout/4)) as nf:
                    idata = html.fromstring(nf.read())
                    crawled = idata.xpath('.//a[(@class="chosen hrefstyle") or (@class="connectionsLink hrefstyle")]/text()')
                    # Keep only entries mentioning 'ebook' and drop that prefix.
                    formats = ','.join([re.sub('ebook, ','', x.strip()) for x in crawled if 'ebook' in x])

                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url
                # @title often carries a suffix after a double-space dash.
                s.title = title.split('  - ')[0]
                s.author = author.strip()
                s.price = price.strip()
                s.detail_item = 'http://empik.com' + id.strip()
                s.formats = formats.upper().strip()

                yield s
示例#11
0
    def search(self, query, max_results=10, timeout=60):
        """Search legimi.pl by relevance; results carry no price."""
        url = 'https://www.legimi.pl/ebooki/?sort=score&searchphrase=' + quote_plus(
            query)

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for entry in doc.xpath('//div[@class="book-search row auto-clear"]/div'):
                if remaining <= 0:
                    break

                detail_href = ''.join(entry.xpath('.//div[@class="panel-body"]/a/@href'))
                if not detail_href:
                    continue

                remaining -= 1

                res = SearchResult()
                # Covers are lazy-loaded, so the URL lives in @data-src.
                res.cover_url = ''.join(
                    entry.xpath('.//div[@class="img-content"]/img/@data-src'))
                res.title = ''.join(entry.xpath(
                    './/a[@class="book-title clampBookTitle"]/text()')).strip()
                res.author = ' '.join(entry.xpath(
                    './/div[@class="authors-container clampBookAuthors"]/a/text()'
                )).strip()
                res.detail_item = 'https://www.legimi.pl' + detail_href.strip()
                res.drm = SearchResult.DRM_UNLOCKED

                yield res
示例#12
0
    def search(self, query, max_results=10, timeout=60):
        """Search whsmith.co.uk's ebook department (DRM-locked ePubs)."""
        url = (
            'https://www.whsmith.co.uk/search?keywordCategoryId=wc_dept_ebooks&results=60'
            '&page=1&keywords=' + quote(query))

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for entry in doc.xpath('//li[@class="product"]'):
                if remaining <= 0:
                    break
                detail_href = ''.join(
                    entry.xpath('./a[@class="product_image_wrap"]/@href'))
                if not detail_href:
                    continue

                remaining -= 1

                res = SearchResult()
                res.cover_url = ''.join(
                    entry.xpath('.//img[@class="product_image"]/@src'))
                res.title = ''.join(
                    entry.xpath('.//h4[@class="product_title"]/text()')).strip()
                res.author = ', '.join(
                    entry.xpath('.//span[@class="product_second"]/text()')).strip()
                res.price = ''.join(entry.xpath('.//span[@class="price"]/text()'))
                res.drm = SearchResult.DRM_LOCKED
                res.detail_item = 'https://www.whsmith.co.uk' + detail_href
                res.formats = 'ePub'

                yield res
    def search(self, query, max_results=20, timeout=60):
        """Search escapemagazine.pl; all titles are DRM-free PDFs."""
        url = 'http://www.escapemagazine.pl/wyszukiwarka?query=' + urllib.quote_plus(
            query)

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for entry in doc.xpath('//div[@class="item item_short"]'):
                if remaining <= 0:
                    break

                detail_href = ''.join(entry.xpath('.//h2[@class="title"]/a[1]/@href'))
                if not detail_href:
                    continue

                remaining -= 1

                res = SearchResult()
                res.cover_url = ''.join(entry.xpath('.//img[@class="cover"]/@src'))
                res.title = ''.join(
                    entry.xpath('.//h2[@class="title"]/a[1]/text()')).strip()
                res.author = ''.join(
                    entry.xpath('.//div[@class="author"]/text()')).strip()
                res.price = ''.join(
                    entry.xpath('.//span[@class="price_now"]/strong/text()')) + ' zł'
                res.detail_item = 'http://www.escapemagazine.pl' + detail_href.strip()
                res.drm = SearchResult.DRM_UNLOCKED
                res.formats = 'PDF'

                yield res
示例#14
0
    def search(self, query, max_results=12, timeout=60):
        """Search virtualo.pl.

        DRM status is inferred from the protection column: "Watermark" or
        "Brak" (none) means unlocked.
        """
        url = 'http://virtualo.pl/?q=' + urllib.quote(query)

        br = browser()
        no_drm_pattern = re.compile(r'Watermark|Brak')

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//div[@class="products-list-wrapper"]//li[@class="product "]'):
                if counter <= 0:
                    break

                # Strip any trailing "?q=..." echo from the detail link.
                detail_href = ''.join(data.xpath('.//div[@class="cover-wrapper"]//a/@href')).split('?q=')[0]
                if not detail_href:
                    continue

                price = ''.join(data.xpath('.//div[@class="information"]//div[@class="price"]/text()'))
                cover_url = ''.join(data.xpath('.//img[@class="cover"]/@src'))
                title = ''.join(data.xpath('.//div[@class="title"]/a//text()'))
                author = ', '.join(data.xpath('.//div[@class="information"]//div[@class="authors"]/a//text()'))
                formats = [form.strip() for form in data.xpath('.//div[@class="information"]//div[@class="format"]/a//text()')]
                nodrm = no_drm_pattern.search(''.join(data.xpath('.//div[@class="protection"]/text()')))

                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url
                s.title = title.strip()
                s.author = author.strip()
                # Fix: the previous re.sub('\.', ...) used an invalid escape
                # sequence in a non-raw string; a plain replace does the same
                # job (decimal point -> comma) without regex.
                s.price = price.strip().replace('.', ',')
                s.detail_item = 'http://virtualo.pl' + detail_href
                s.formats = ', '.join(formats).upper()
                s.drm = SearchResult.DRM_UNLOCKED if nodrm else SearchResult.DRM_LOCKED

                yield s
示例#15
0
    def search(self, query, max_results=10, timeout=60):
        """Scrape Google Books results (title/author/detail link only)."""
        url = 'http://www.google.com/search?tbm=bks&q=' + urllib.quote_plus(
            query)

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for entry in doc.xpath('//ol/li'):
                if remaining <= 0:
                    break

                detail_href = ''.join(entry.xpath('.//h3/a/@href'))
                if not detail_href:
                    continue

                title = ''.join(entry.xpath('.//h3/a//text()'))
                authors = entry.xpath('.//span[contains(@class, "f")]//a//text()')
                # Trailing links such as "Preview" are not author names.
                while authors and authors[-1].strip().lower() in (
                        'preview', 'read', 'more editions'):
                    authors = authors[:-1]
                if not authors:
                    continue

                remaining -= 1

                res = SearchResult()
                res.title = title.strip()
                res.author = ', '.join(authors).strip()
                res.detail_item = detail_href.strip()
                res.drm = SearchResult.DRM_UNKNOWN

                yield res
示例#16
0
    def search(self, query, max_results=25, timeout=60):
        """Search ebookpoint.pl (query is re-encoded to ISO-8859-2)."""
        url = 'http://ebookpoint.pl/search.scgi?szukaj=' + urllib.quote_plus(query.decode('utf-8').encode('iso-8859-2')) + '&serwisyall=0&x=0&y=0'

        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for entry in doc.xpath('//div[@class="book-list"]/ul[2]/li'):
                if remaining <= 0:
                    break

                detail_href = ''.join(entry.xpath('.//a[@class="cover"]/@href'))
                if not detail_href:
                    continue

                remaining -= 1

                res = SearchResult()
                # Request the slightly smaller cover rendition.
                res.cover_url = 'http://ebookpoint.pl' + re.sub(
                    '72x9', '65x8',
                    ''.join(entry.xpath('.//a[@class="cover"]/img/@src')))
                res.title = re.sub(
                    'eBook.', '',
                    ''.join(entry.xpath('.//h3/a/@title'))).strip()
                res.author = ''.join(
                    entry.xpath('.//p[@class="author"]/text()')).strip()
                res.price = re.sub(
                    r'\.', ',',
                    ''.join(entry.xpath('.//p[@class="price"]/ins/text()')))
                res.detail_item = detail_href.strip()
                res.drm = SearchResult.DRM_UNLOCKED
                res.formats = ', '.join(
                    entry.xpath('.//div[@class="ikony"]/span/text()')).upper()

                yield res
示例#17
0
    def search(self, query, max_results=12, timeout=60):
        """Search virtualo.pl restricted to ebook formats (format_id 4,6,3);
        DRM status is inferred from the protection label."""
        url = 'http://virtualo.pl/?q=' + urllib.quote(query) + '&f=format_id:4,6,3'

        br = browser()
        # "Znak wodny" (watermark) or "Brak" (none) => no DRM.
        no_drm_pattern = re.compile(r'Znak wodny|Brak')

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//div[@id="content"]//div[@class="list_box list_box_border"]'):
                if counter <= 0:
                    break

                id = ''.join(data.xpath('.//div[@class="list_middle_left"]//a/@href'))
                if not id:
                    continue

                price = ''.join(data.xpath('.//span[@class="price"]/text() | .//span[@class="price abbr"]/text()'))
                cover_url = ''.join(data.xpath('.//div[@class="list_middle_left"]//a//img/@src'))
                title = ''.join(data.xpath('.//div[@class="list_title list_text_left"]/a/text()'))
                author = ', '.join(data.xpath('.//div[@class="list_authors list_text_left"]/a/text()'))
                # Format names are encoded in the icon file names, e.g. "..._epub.png".
                formats = [ form.split('_')[-1].replace('.png', '') for form in data.xpath('.//div[@style="width:55%;float:left;text-align:left;height:18px;"]//a/span/img/@src')]
                nodrm = no_drm_pattern.search(''.join(data.xpath('.//div[@style="width:45%;float:right;text-align:right;height:18px;"]//span[@class="prompt_preview"]/text()')))

                counter -= 1

                s = SearchResult()
                # Drop any resize suffix that follows ".jpg" in the cover URL.
                s.cover_url = cover_url.split('.jpg')[0] + '.jpg'
                s.title = title.strip()
                s.author = author.strip()
                s.price = price + ' zł'
                # NOTE(review): split('http://')[0] keeps only the part of the
                # href before any embedded absolute URL — presumably guarding
                # against a doubled host in the markup; verify against the
                # live page before changing.
                s.detail_item = 'http://virtualo.pl' + id.strip().split('http://')[0]
                s.formats = ', '.join(formats).upper()
                s.drm = SearchResult.DRM_UNLOCKED if nodrm else SearchResult.DRM_LOCKED

                yield s
示例#18
0
    def search(self, query, max_results=10, timeout=60):
        """Search millsandboon.co.uk ebooks (the store sells DRM-locked books).

        Yields SearchResult objects scraped from the search results page.
        """
        base_url = 'http://www.millsandboon.co.uk'
        url = base_url + '/search?format=ebook&q=' + urllib2.quote(query)
        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//article[contains(@class, "group")]'):
                if counter <= 0:
                    break
                id_ = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/@href')).strip()
                # Fix: test for a missing href BEFORE prefixing base_url —
                # previously the prefix was added first, so the guard could
                # never trigger and empty hrefs produced bogus results.
                if not id_:
                    continue
                id_ = base_url + id_

                cover_url = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/img/@src'))
                title = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/img/@alt')).strip()
                author = ''.join(data.xpath('.//a[@class="author"]/text()'))
                price = ''.join(data.xpath('.//li[@class="productAttribute" and child::span[text()="eBook"]]/input/@value'))
                format_ = ''.join(data.xpath('.//p[@class="doc-meta-format"]/span[last()]/text()'))

                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price
                s.detail_item = id_
                s.drm = SearchResult.DRM_LOCKED
                s.formats = format_

                yield s
示例#19
0
def open_search(url, query, max_results=10, timeout=60):
    """Run an OpenSearch query against an OPDS catalogue and yield results.

    Only open-access acquisition links are exposed as downloads: EPUB for
    the 'Recommended compatible epub' entry and AZW3 for mobipocket.
    """
    description = Description(url)
    url_template = description.get_best_template()
    if not url_template:
        return

    oquery = Query(url_template)
    # Fill in the template's search parameters.
    oquery.searchTerms = query
    oquery.count = max_results
    search_url = oquery.url()

    remaining = max_results
    br = browser()
    with closing(br.open(search_url, timeout=timeout)) as f:
        doc = safe_xml_fromstring(f.read())
        for entry in doc.xpath('//*[local-name() = "entry"]'):
            if remaining <= 0:
                break
            remaining -= 1

            res = SearchResult()
            res.detail_item = ''.join(
                entry.xpath('./*[local-name() = "id"]/text()')).strip()

            for link in entry.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = BASE_URL + link.get('href')
                media_type = link.get('type')
                link_title = link.get('title')

                if not (rel and href and media_type):
                    continue
                if rel == 'http://opds-spec.org/image/thumbnail':
                    res.cover_url = href
                elif rel == 'http://opds-spec.org/acquisition/open-access':
                    ext = None
                    if (media_type == 'application/epub+zip' and
                            link_title == 'Recommended compatible epub'):
                        ext = 'EPUB'
                    elif media_type == 'application/x-mobipocket-ebook':
                        ext = 'AZW3'
                    if ext:
                        res.downloads[ext] = href

            res.formats = ', '.join(res.downloads.keys()).strip()
            res.title = ' '.join(
                entry.xpath('./*[local-name() = "title"]//text()')).strip()
            res.author = ', '.join(
                entry.xpath(
                    './*[local-name() = "author"]//*[local-name() = "name"]//text()'
                )).strip()

            res.drm = SearchResult.DRM_UNLOCKED
            res.price = '$0.00'

            yield res
示例#20
0
def open_search(url, query, max_results=10, timeout=60):
    """Generic OPDS open-search: query the catalog at *url* and yield up
    to *max_results* SearchResult objects.

    Cover, buy link, acquisition formats and price are pulled from the
    standard OPDS link relations and the first <price> element.
    """
    description = Description(url)
    template = description.get_best_template()
    if not template:
        return

    # Fill the OpenSearch URL template with our query terms.
    oquery = Query(template)
    oquery.searchTerms = query
    oquery.count = max_results
    search_url = oquery.url()

    remaining = max_results
    br = browser()
    with closing(br.open(search_url, timeout=timeout)) as response:
        feed = safe_xml_fromstring(response.read())
        for entry in feed.xpath('//*[local-name() = "entry"]'):
            if remaining <= 0:
                break
            remaining -= 1

            result = SearchResult()
            result.detail_item = ''.join(
                entry.xpath('./*[local-name() = "id"]/text()')).strip()

            for link in entry.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                mt = link.get('type')
                if not (rel and href and mt):
                    continue
                if ('http://opds-spec.org/thumbnail' in rel or
                        'http://opds-spec.org/image/thumbnail' in rel):
                    result.cover_url = href
                elif 'http://opds-spec.org/acquisition/buy' in rel:
                    result.detail_item = href
                elif 'http://opds-spec.org/acquisition/sample' in rel:
                    pass  # samples are ignored
                elif 'http://opds-spec.org/acquisition' in rel:
                    # Derive the format key from the advertised MIME type.
                    ext = guess_extension(mt)
                    if ext:
                        result.downloads[ext[1:].upper().strip()] = href

            result.formats = ', '.join(result.downloads.keys()).strip()
            result.title = ' '.join(
                entry.xpath('./*[local-name() = "title"]//text()')).strip()
            result.author = ', '.join(
                entry.xpath('./*[local-name() = "author"]'
                            '//*[local-name() = "name"]//text()')).strip()

            price_tags = entry.xpath('.//*[local-name() = "price"][1]')
            if price_tags:
                tag = price_tags[0]
                currency = tag.get('currencycode', '')
                amount = ''.join(tag.xpath('.//text()')).strip()
                result.price = (currency + ' ' + amount).strip()

            yield result
示例#21
0
    def search(self, query, max_results=10, timeout=60):
        '''Search an Amazon Kindle store and yield SearchResult objects.

        Amazon serves several different result-page layouts (grid, "ilo",
        new list, list); the matching set of XPath expressions is chosen
        by probing the class of the #atfResults container.  Non-Kindle
        hits and entries without an ASIN are skipped.
        '''
        # NOTE(review): the encode()/replace() chain below is Python-2
        # style string handling; under Python 3 replace() on the bytes
        # result would need bytes arguments -- confirm the target runtime.
        url = self.search_url + query.encode(
            'ascii', 'backslashreplace').replace('%', '%25').replace(
                '\\x', '%').replace(' ', '+')
        br = browser()

        counter = max_results  # number of results still to yield
        with closing(br.open(url, timeout=timeout)) as f:
            allText = f.read()
            doc = html.fromstring(allText)  #.decode('latin-1', 'replace'))

            # Probe which result-page layout we received and pick the
            # XPath expressions for that layout.  format_xpath2 is a
            # fallback Kindle-detection XPath used only by the two newer
            # list layouts.
            format_xpath2 = ''
            if doc.xpath(
                    '//div[@id = "atfResults" and contains(@class, "grid")]'):
                #print('grid form')
                data_xpath = '//div[contains(@class, "prod")]'
                format_xpath = (
                    './/ul[contains(@class, "rsltGridList")]'
                    '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
                )
                asin_xpath = '@name'
                cover_xpath = './/img[contains(@class, "productImage")]/@src'
                title_xpath = './/h3[@class="newaps"]/a//text()'
                author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
                price_xpath = (
                    './/ul[contains(@class, "rsltGridList")]'
                    '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
                )
            elif doc.xpath(
                    '//div[@id = "atfResults" and contains(@class, "ilresults")]'
            ):
                #print('ilo form')
                data_xpath = '//li[(@class="ilo")]'
                format_xpath = (
                    './/ul[contains(@class, "rsltGridList")]'
                    '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
                )
                asin_xpath = '@name'
                cover_xpath = './div[@class = "ilf"]/a/img[contains(@class, "ilo")]/@src'
                title_xpath = './/h3[@class="newaps"]/a//text()'
                author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
                # Results can be in a grid (table) or a column
                price_xpath = (
                    './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
                    '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
                )
            elif doc.xpath(
                    '//div[@id = "atfResults" and contains(@class, "s-result-list-parent-container")]'
            ):
                #print('new list form')
                data_xpath = '//li[contains(@class, "s-result-item")]'
                format_xpath = './/a[contains(@class, "a-size-small")]/text()'
                format_xpath2 = './/h3[contains(@class, "s-inline")]/text()'
                asin_xpath = '@data-asin'
                cover_xpath = './/img[contains(@class, "cfMarker")]/@src'
                title_xpath = './/h2[contains(@class, "s-access-title")]/text()'
                author_xpath = (
                    './/div[contains(@class, "a-fixed-left-grid-col")]'
                    '/div/div/span//text()')
                price_xpath = (
                    './/div[contains(@class, "a-spacing-none")]/a/span[contains(@class, "s-price")]/text()'
                )
            elif doc.xpath(
                    '//div[@id = "atfResults" and contains(@class, "list")]'):
                #print('list form')
                data_xpath = '//li[@class="s-result-item"]'
                format_xpath = './/a[contains(@class, "a-size-small")]/text()'
                format_xpath2 = './/h3[contains(@class, "s-inline")]/text()'
                asin_xpath = '@data-asin'
                cover_xpath = './/img[contains(@class, "cfMarker")]/@src'
                title_xpath = './/h2[contains(@class, "s-access-title")]/text()'
                author_xpath = (
                    './/div[contains(@class, "a-fixed-left-grid-col")]'
                    '/div/div/span//text()')
                price_xpath = ('.//span[contains(@class, "s-price")]/text()')
            else:
                # URK -- whats this?
                print('unknown result table form for Amazon EU search')
                #with open("c:/amazon_search_results.html", "w") as out:
                #    out.write(allText)
                return

            for data in doc.xpath(data_xpath):
                if counter <= 0:
                    break

                # Even though we are searching digital-text only Amazon will still
                # put in results for non Kindle books (authors pages). Se we need
                # to explicitly check if the item is a Kindle book and ignore it
                # if it isn't.
                format_ = ''.join(data.xpath(format_xpath))
                if 'kindle' not in format_.lower():
                    if format_xpath2:
                        format_ = ''.join(data.xpath(format_xpath2))
                        if 'kindle' not in format_.lower():
                            # print(etree.tostring(data, pretty_print=True))
                            continue

                # We must have an asin otherwise we can't easily reference the
                # book later.
                asin = data.xpath(asin_xpath)
                if asin:
                    asin = asin[0]
                else:
                    continue

                cover_url = ''.join(data.xpath(cover_xpath))

                title = ''.join(data.xpath(title_xpath))

                authors = ''.join(data.xpath(author_xpath))
                # Strip the localised "by"/article prefix and the "and"
                # conjunction, then drop any trailing parenthesised count.
                authors = re.sub('^' + self.author_article, '', authors)
                authors = re.sub(self.and_word, ' & ', authors)
                mo = re.match(r'(.*)(\(\d.*)$', authors)
                if mo:
                    authors = mo.group(1).strip()

                price = ''.join(data.xpath(price_xpath)[-1])

                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url.strip()
                s.title = title.strip()
                s.author = authors.strip()
                s.price = price.strip()
                s.detail_item = asin.strip()
                s.drm = SearchResult.DRM_UNKNOWN
                s.formats = 'Kindle'

                yield s
示例#22
0
    def search(self, query, max_results=10, timeout=60):
        '''Search ebook.de and yield up to max_results SearchResult objects.'''
        url = ('http://www.ebook.de/de/pathSearch?nav=52122&searchString=' +
               urllib.parse.quote(query))
        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for container in doc.xpath(
                    '//div[contains(@class, "articlecontainer")]'):
                if remaining <= 0:
                    break

                infobox = container.xpath(
                    './div[contains(@class, "articleinfobox")]')
                if not infobox:
                    continue
                infobox = infobox[0]

                book_id = ''.join(infobox.xpath('./a/@name')).strip()
                if not book_id:
                    continue

                title = ''.join(
                    infobox.xpath('./h3[@class="title"]/a/text()')).strip()

                author = ''.join(
                    infobox.xpath('.//div[@class="author"]/text()')).strip()
                if author.startswith('von'):
                    # Drop the German 'von ' ("by") prefix.
                    author = author[4:]

                def has_format(word):
                    # True when a binder-name badge mentioning *word* exists.
                    return infobox.xpath(
                        'boolean(.//span[@class="bindername" '
                        'and contains(text(), "{}")]/text())'.format(word))

                has_pdf = has_format('pdf')
                has_epub = has_format('epub')
                has_mobi = has_format('mobipocket')

                cover_url = ''.join(
                    container.xpath('.//div[@class="coverimg"]/a/img/@src'))
                price = ''.join(
                    container.xpath('.//div[@class="preis"]/text()')).replace(
                        '*', '').strip()

                remaining -= 1

                s = SearchResult()
                s.cover_url = cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price
                s.drm = SearchResult.DRM_UNKNOWN
                s.detail_item = book_id
                fmts = []
                if has_epub:
                    fmts.append('ePub')
                if has_pdf:
                    fmts.append('PDF')
                if has_mobi:
                    fmts.append('MOBI')
                s.formats = ', '.join(fmts)

                yield s
示例#23
0
    def search(self, query, max_results=10, timeout=60):
        '''Search diesel-ebooks.com and yield SearchResult objects.

        A single-hit search is redirected by the store straight to the
        book's detail page; that case is recognised by the absence of the
        results "selection" widget and parsed separately from the normal
        result-list page.
        '''
        # NOTE(review): urllib.quote_plus is the Python 2 spelling; a
        # Python 3 runtime needs urllib.parse.quote_plus -- confirm the
        # plugin's target runtime before changing it.
        url = 'http://www.diesel-ebooks.com/index.php?page=seek&id[m]=&id[c]=scope%253Dinventory&id[q]=' + urllib.quote_plus(query)

        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            book_url = f.geturl()
            doc = html.fromstring(f.read())

            if doc.xpath('not(boolean(//select[contains(@id, "selection")]))'):
                # This is the page for an individual book
                id = ''.join(doc.xpath('//div[@class="price_fat"]//a/@href'))
                mo = re.search('(?<=id=).+?(?=&)', id)
                if not mo:
                    # Bug fix: execution previously fell through to
                    # mo.group() and raised AttributeError when the id
                    # could not be extracted; stop after signalling the
                    # failure with the None sentinel.
                    yield None
                    return
                id = mo.group()

                cover_url = ''.join(doc.xpath('//div[@class="cover"]/a/@href'))

                title = ''.join(doc.xpath('//div[@class="desc_fat"]//h1/text()'))
                author = ''.join(doc.xpath('//div[@class="desc_fat"]//span[@itemprop="author"]/text()'))
                price = ''.join(doc.xpath('//div[@class="price_fat"]//h1/text()'))

                # The format list trails a 'Format:' label in the page text.
                formats = ', '.join(doc.xpath('//div[@class="desc_fat"]//p[contains(text(), "Format")]/text()'))
                a, b, formats = formats.partition('Format:')

                drm = SearchResult.DRM_LOCKED
                if 'drm free' in formats.lower():
                    drm = SearchResult.DRM_UNLOCKED

                s = SearchResult()
                s.cover_url = cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price.strip()
                s.detail_item = book_url
                s.formats = formats
                s.drm = drm

                yield s
            else:
                # Normal search-result listing.
                for data in doc.xpath('//div[contains(@class, "item")]'):
                    if counter <= 0:
                        break

                    id = ''.join(data.xpath('div[@class="cover"]/a/@href'))
                    if not id or '/item/' not in id:
                        continue

                    cover_url = ''.join(data.xpath('div[@class="cover"]//img/@src'))

                    title = ''.join(data.xpath('.//div[@class="content"]//h2/a/text()'))
                    author = ''.join(data.xpath('.//div[@class="content"]/span//a/text()'))
                    price = ''
                    price_elem = data.xpath('.//div[@class="price_fat"]//h1/text()')
                    if price_elem:
                        price = price_elem[0]

                    formats = ', '.join(data.xpath('.//div[@class="book-info"]//text()')).strip()
                    a, b, formats = formats.partition('Format:')
                    drm = SearchResult.DRM_LOCKED
                    if 'drm free' in formats.lower():
                        drm = SearchResult.DRM_UNLOCKED

                    counter -= 1

                    s = SearchResult()
                    s.cover_url = cover_url
                    s.title = title.strip()
                    s.author = author.strip()
                    s.price = price.strip()
                    s.detail_item = id.strip()
                    s.formats = formats
                    s.drm = drm

                    yield s
示例#24
0
    def search(self, query, max_results=10, timeout=60):
        '''
        Search XinXii's OPDS catalog and yield SearchResult objects.

        XinXii's full open-search template
        (http://www.xinxii.com/catalog-search/query/?keywords={searchTerms}...)
        requires docLang/docFormat parameters, but a plain keyword query
        works without them, so the generic OpenSearchOPDSStore.search
        cannot be used; this trimmed variant queries the catalog directly
        and skips the response fields XinXii does not provide.
        '''
        url = ('http://www.xinxii.com/catalog-search/query/?keywords=' +
               quote_plus(query))

        remaining = max_results
        br = browser()
        with closing(br.open(url, timeout=timeout)) as response:
            feed = etree.fromstring(response.read())
            for entry in feed.xpath('//*[local-name() = "entry"]'):
                if remaining <= 0:
                    break
                remaining -= 1

                s = SearchResult()
                s.detail_item = ''.join(
                    entry.xpath('./*[local-name() = "id"]/text()')).strip()

                for link in entry.xpath('./*[local-name() = "link"]'):
                    rel = link.get('rel')
                    href = link.get('href')
                    mt = link.get('type')
                    if not (rel and href and mt):
                        continue
                    if rel in ('http://opds-spec.org/thumbnail',
                               'http://opds-spec.org/image/thumbnail'):
                        s.cover_url = href
                    if rel == 'alternate':
                        # The alternate link is the book's detail page.
                        s.detail_item = href

                # XinXii offers every title in both formats.
                s.formats = 'EPUB, PDF'

                s.title = ' '.join(
                    entry.xpath('./*[local-name() = "title"]//text()')).strip()
                s.author = ', '.join(
                    entry.xpath('./*[local-name() = "author"]'
                                '//*[local-name() = "name"]//text()')).strip()

                price_tags = entry.xpath('.//*[local-name() = "price"][1]')
                if price_tags:
                    tag = price_tags[0]
                    currency = tag.get('currencycode', '')
                    amount = ''.join(tag.xpath('.//text()')).strip()
                    s.price = (currency + ' ' + amount).strip()

                yield s
示例#25
0
def search_amazon(query,
                  max_results=10,
                  timeout=60,
                  write_html_to=None,
                  base_url=SEARCH_BASE_URL,
                  base_query=SEARCH_BASE_QUERY,
                  field_keywords='field-keywords'):
    '''Scrape an Amazon Kindle-store results page and yield SearchResult
    objects for up to *max_results* Kindle books.

    *base_url*/*base_query* select the Amazon site and its fixed query
    parameters; *field_keywords* names the query parameter carrying the
    search terms.  When *write_html_to* is given, the raw result page is
    also written to that path (useful for debugging layout changes).
    Only the "s-result-list-parent-container" layout is recognised; any
    other layout yields nothing.
    '''
    uquery = base_query.copy()
    uquery[field_keywords] = query

    def asbytes(x):
        # Normalise str -> utf-8 bytes for urlencode.
        if isinstance(x, type('')):
            x = x.encode('utf-8')
        return x

    # NOTE(review): iteritems() and urllib.urlencode are Python 2 only;
    # a Python 3 runtime would need .items() and urllib.parse.urlencode
    # -- confirm the target runtime.
    uquery = {asbytes(k): asbytes(v) for k, v in uquery.iteritems()}
    url = base_url + '?' + urllib.urlencode(uquery).decode('ascii')
    br = browser(user_agent=get_user_agent())

    counter = max_results  # number of results still to yield
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
        if write_html_to is not None:
            with open(write_html_to, 'wb') as f:
                f.write(raw)
        doc = html.fromstring(raw)
        try:
            results = doc.xpath('//div[@id="atfResults" and @class]')[0]
        except IndexError:
            return

        if 's-result-list-parent-container' in results.get('class', ''):
            data_xpath = "descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-result-item ')]"
            format_xpath = './/a[@title="Kindle Edition"]/@title'
            asin_xpath = '@data-asin'
            cover_xpath = "descendant-or-self::img[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-image ')]/@src"
            title_xpath = "descendant-or-self::h2[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-title ')]//text()"
            author_xpath = './/span[starts-with(text(), "by ")]/following-sibling::span//text()'
            price_xpath = (
                'descendant::div[@class="a-row a-spacing-none" and'
                ' not(span[contains(@class, "kindle-unlimited")])]//span[contains(@class, "s-price")]//text()'
            )
        else:
            return

        for data in doc.xpath(data_xpath):
            if counter <= 0:
                break

            # Even though we are searching digital-text only Amazon will still
            # put in results for non Kindle books (author pages). Se we need
            # to explicitly check if the item is a Kindle book and ignore it
            # if it isn't.
            format = ''.join(data.xpath(format_xpath))
            if 'kindle' not in format.lower():
                continue

            # We must have an asin otherwise we can't easily reference the
            # book later.
            asin = data.xpath(asin_xpath)
            if asin:
                asin = asin[0]
            else:
                continue

            cover_url = ''.join(data.xpath(cover_xpath))

            title = ''.join(data.xpath(title_xpath))
            author = ''.join(data.xpath(author_xpath))
            try:
                # Strip the leading 'by ' and any trailing ' (...' suffix.
                author = author.split('by ', 1)[1].split(" (")[0]
            except:
                pass

            price = ''.join(data.xpath(price_xpath))

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url.strip()
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = asin.strip()
            s.formats = 'Kindle'

            yield s
示例#26
0
    def search(self, query, max_results=10, timeout=60):
        '''Search e-knigi.net and yield SearchResult objects.

        Queries must be Cyrillic (with digits/whitespace), at least two
        characters; anything else returns immediately.  The store may
        answer with either a single-product detail page or a result
        listing; both layouts are handled.
        '''
        # check for cyrillic symbols before performing search
        # NOTE(review): type(u'')(bytes, 'utf-8') decodes a byte string;
        # this raises TypeError if query is already text under Python 3
        # -- confirm what the framework passes in.
        uquery = type(u'')(query.strip(), 'utf-8')
        reObj = re.search(u'^[а-яА-Я\\d\\s]{2,}$', uquery)
        if not reObj:
            return

        base_url = 'http://e-knigi.net'
        url = base_url + '/virtuemart?page=shop.browse&search_category=0&search_limiter=anywhere&keyword=' + quote(
            query)

        br = browser()

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())

            # if the store finds only one product, it opens directly detail view
            # (the detail_item is then the search URL itself, which is
            # where the browser was redirected to).
            for data in doc.xpath('//div[@class="prod_details"]'):
                s = SearchResult()
                s.cover_url = ''.join(
                    data.xpath(
                        './/div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@src'
                    )).strip()
                s.title = ''.join(
                    data.xpath(
                        './/div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@alt'
                    )).strip()
                s.author = ''.join(
                    data.xpath(
                        './/div[@class="td_bg clearfix"]/div[@class="gk_product_tab"]/div/table/tr[3]/td[2]/text()'
                    )).strip()
                s.price = ''.join(
                    data.xpath(
                        './/span[@class="productPrice"]/text()')).strip()
                s.detail_item = url
                s.drm = SearchResult.DRM_UNLOCKED

                yield s
                return

            # search in store results
            for data in doc.xpath('//div[@class="browseProductContainer"]'):
                if counter <= 0:
                    break
                id = ''.join(data.xpath('.//a[1]/@href')).strip()
                if not id:
                    continue

                title = ''.join(
                    data.xpath('.//a[@class="gk_vm_product_image"]/img/@title')
                ).strip()
                author = ''.join(
                    data.xpath('.//div[@style="float:left;width:90%"]/b/text()'
                               )).strip().replace('Автор: ', '')

                # Keep only hits whose title or author actually contains
                # the query (the store's matching is broader than ours).
                if title.lower().find(
                        query.lower()) == -1 and author.lower().find(
                            query.lower()) == -1:
                    continue

                counter -= 1

                s = SearchResult()
                s.cover_url = ''.join(
                    data.xpath('.//a[@class="gk_vm_product_image"]/img/@src')
                ).strip()
                s.title = title
                s.author = author
                s.price = ''.join(
                    data.xpath(
                        './/span[@class="productPrice"]/text()')).strip()
                s.detail_item = base_url + id
                s.drm = SearchResult.DRM_UNLOCKED

                yield s
示例#27
0
def search_kobo(query, max_results=10, timeout=60, write_html_to=None):
    """Scrape kobobooks.com search results, yielding SearchResult objects.

    When *write_html_to* is given, the raw result page is also saved to
    that path for debugging.  Items missing a title, author or detail
    link are skipped.
    """
    from css_selectors import Select
    url = 'https://www.kobobooks.com/search/search.html?q=' + quote_plus(query)

    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
        if write_html_to is not None:
            with open(write_html_to, 'wb') as out:
                out.write(raw)
        doc = html.fromstring(raw)
        select = Select(doc)

        for i, item in enumerate(select('.result-items .item-wrapper.book')):
            if i == max_results:
                break

            # Cover image; protocol-relative URLs are made absolute.
            cover_url = None
            for img in select('.item-image img[src]', item):
                cover_url = img.get('src')
                if cover_url.startswith('//'):
                    cover_url = 'https:' + cover_url
                break

            # Title text and the detail-page link inside it.
            title = detail_url = None
            for p in select('p.title', item):
                title = etree.tostring(p, method='text',
                                       encoding='unicode').strip()
                for a in select('a[href]', p):
                    detail_url = a.get('href')
                    break
                break

            authors = authors_to_string([
                etree.tostring(a, method='text', encoding='unicode').strip()
                for a in select('p.contributor-list a.contributor-name', item)
            ])

            price = None
            for p in select('p.price', item):
                price = etree.tostring(p, method='text',
                                       encoding='unicode').strip()
                break

            if title and authors and detail_url:
                s = SearchResult()
                s.cover_url = cover_url
                s.title = title
                s.author = authors
                s.price = price
                s.detail_item = detail_url
                s.formats = 'EPUB'
                s.drm = SearchResult.DRM_UNKNOWN

                yield s
示例#28
0
    def search(self, query, max_results=10, timeout=60):
        '''Search an Amazon Kindle store and yield SearchResult objects.

        Three result-page layouts (grid, "ilo", list) are recognised by
        probing the class of the #atfResults container; any other layout
        yields nothing.  Non-Kindle hits and entries without an ASIN are
        skipped.
        '''
        # NOTE(review): the encode()/replace() chain below is Python-2
        # style string handling; under Python 3 replace() on the bytes
        # result would need bytes arguments -- confirm the target runtime.
        url = self.search_url + query.encode(
            'ascii', 'backslashreplace').replace('%', '%25').replace(
                '\\x', '%').replace(' ', '+')
        br = browser()

        counter = max_results  # number of results still to yield
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())

            # Pick the XPath expressions matching the layout we received.
            if doc.xpath(
                    '//div[@id = "atfResults" and contains(@class, "grid")]'):
                data_xpath = '//div[contains(@class, "prod")]'
                format_xpath = (
                    './/ul[contains(@class, "rsltGridList")]'
                    '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
                )
                asin_xpath = '@name'
                cover_xpath = './/img[contains(@class, "productImage")]/@src'
                title_xpath = './/h3[@class="newaps"]/a//text()'
                author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
                price_xpath = (
                    './/ul[contains(@class, "rsltGridList")]'
                    '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
                )
            elif doc.xpath(
                    '//div[@id = "atfResults" and contains(@class, "ilresults")]'
            ):
                data_xpath = '//li[(@class="ilo")]'
                format_xpath = (
                    './/ul[contains(@class, "rsltGridList")]'
                    '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
                )
                asin_xpath = '@name'
                cover_xpath = './div[@class = "ilf"]/a/img[contains(@class, "ilo")]/@src'
                title_xpath = './/h3[@class="newaps"]/a//text()'
                author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
                # Results can be in a grid (table) or a column
                price_xpath = (
                    './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
                    '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
                )
            elif doc.xpath(
                    '//div[@id = "atfResults" and contains(@class, "list")]'):
                data_xpath = '//div[contains(@class, "prod")]'
                format_xpath = (
                    './/ul[contains(@class, "rsltL")]'
                    '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
                )
                asin_xpath = '@name'
                cover_xpath = './/img[contains(@class, "productImage")]/@src'
                title_xpath = './/h3[@class="newaps"]/a//text()'
                author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
                price_xpath = (
                    './/ul[contains(@class, "rsltL")]'
                    '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
                )
            else:
                return

            for data in doc.xpath(data_xpath):
                if counter <= 0:
                    break

                # Even though we are searching digital-text only Amazon will still
                # put in results for non Kindle books (author pages). Se we need
                # to explicitly check if the item is a Kindle book and ignore it
                # if it isn't.
                format = ''.join(data.xpath(format_xpath))
                if 'kindle' not in format.lower():
                    continue

                # We must have an asin otherwise we can't easily reference the
                # book later.
                asin = data.xpath(asin_xpath)
                if asin:
                    asin = asin[0]
                else:
                    continue

                cover_url = ''.join(data.xpath(cover_xpath))

                title = ''.join(data.xpath(title_xpath))
                author = ''.join(data.xpath(author_xpath))
                try:
                    # Strip the leading 'by ' and any trailing ' (...' suffix.
                    author = author.split('by ', 1)[1].split(" (")[0]
                except:
                    pass

                price = ''.join(data.xpath(price_xpath))

                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url.strip()
                s.title = title.strip()
                s.author = author.strip()
                s.price = price.strip()
                s.detail_item = asin.strip()
                s.formats = 'Kindle'

                yield s
示例#29
0
    def search(self, query, max_results=10, timeout=60):
        """Search chitanka.info for *query* and yield up to *max_results*
        SearchResult objects, first from the title-search results page and
        then from the book lists of any matching authors.

        chitanka.info hosts Cyrillic-language books only, so queries that are
        not made up of Cyrillic letters, digits and whitespace are skipped.
        """
        uquery = unicode(query.strip(), 'utf-8')
        if not re.search(u'^[а-яА-Я\\d\\s]{3,}$', uquery):
            return

        base_url = 'http://chitanka.info'
        url = base_url + '/search?q=' + urllib2.quote(query)
        counter = max_results
        br = browser()

        def result_from_item(item):
            # Build a SearchResult from one <li> of a "superlist booklist".
            # Returns None when the item carries no book link.
            detail = ''.join(item.xpath('.//a[@class="booklink"]/@href')).strip()
            if not detail:
                return None
            s = SearchResult()
            s.cover_url = ''.join(item.xpath('.//a[@class="booklink"]/img/@src')).strip()
            s.title = ''.join(item.xpath('.//a[@class="booklink"]/i/text()')).strip()
            s.author = ''.join(item.xpath('.//span[@class="bookauthor"]/a/text()')).strip()
            s.detail_item = detail
            s.drm = SearchResult.DRM_UNLOCKED
            # Download links point at zipped files; dropping '.zip' yields the
            # raw-format URL.
            for fmt, cls in (('FB2', 'dl dl-fb2'),
                             ('EPUB', 'dl dl-epub'),
                             ('TXT', 'dl dl-txt')):
                s.downloads[fmt] = base_url + ''.join(
                    item.xpath('.//a[@class="%s"]/@href' % cls)).strip().replace('.zip', '')
            s.formats = 'FB2, EPUB, TXT, SFB'
            return s

        # Search by book title.
        try:
            with closing(br.open(url, timeout=timeout)) as f:
                doc = html.fromstring(unicode(f.read(), 'utf-8'))
                for data in doc.xpath('//ul[@class="superlist booklist"]/li'):
                    if counter <= 0:
                        break
                    s = result_from_item(data)
                    if s is None:
                        continue
                    counter -= 1
                    yield s
        except urllib2.HTTPError as e:
            if e.code == 404:
                # No results page at all: nothing to yield.
                return
            raise

        # Search by author name: follow each author link on the results page
        # and scan that author's book list for entries matching the query.
        for data in doc.xpath('//ul[@class="superlist"][1]/li/dl/dt'):
            author_url = ''.join(data.xpath('.//a[contains(@href,"/person/")]/@href'))
            if not author_url:
                continue
            if counter <= 0:
                break

            with closing(br.open(base_url + author_url, timeout=timeout)) as f:
                doc2 = html.fromstring(unicode(f.read(), 'utf-8'))
                for item in doc2.xpath('//ul[@class="superlist booklist"]/li'):
                    if counter <= 0:
                        break
                    s = result_from_item(item)
                    if s is None:
                        continue
                    # Keep only books whose title or author matches the query.
                    if (s.title.lower().find(query.lower()) == -1 and
                            s.author.lower().find(query.lower()) == -1):
                        continue
                    counter -= 1
                    yield s
# Example #30
    def search(self, query, max_results=10, timeout=60):
        """Search the www.ebook.nl store and yield up to *max_results*
        SearchResult objects parsed from the product-listing table."""
        url = (
            'http://www.ebook.nl/store/advanced_search_result.php?keywords=' +
            urllib.parse.quote(query))
        br = browser()

        remaining = max_results
        with closing(br.open(url, timeout=timeout)) as response:
            doc = html.fromstring(response.read())
            rows = doc.xpath('//table[contains(@class, "productListing")]/tr')
            for row in rows:
                if remaining <= 0:
                    break

                anchors = row.xpath('./td/div[@class="prodImage"]/a')
                if not anchors:
                    continue
                anchor = anchors[0]

                # The product id is the last path component of the detail
                # link, with any trailing query string stripped off.
                book_id = ''.join(anchor.xpath('./@href')).strip()
                book_id = book_id[book_id.rfind('/') + 1:]
                qpos = book_id.rfind('?')
                if qpos > 0:
                    book_id = book_id[:qpos]
                if not book_id:
                    continue

                cover_url = 'http://www.ebook.nl/store/' + ''.join(
                    anchor.xpath('./img/@src'))
                title = ''.join(anchor.xpath('./img/@title')).strip()
                author = ''.join(
                    row.xpath(
                        './td/div[@class="prodTitle"]/h3/a/text()')).strip()
                price = ''.join(
                    row.xpath('./td/div[@class="prodTitle"]/b/text()'))
                # Format and DRM flags come from Dutch-language product notes.
                has_pdf = row.xpath(
                    'boolean(./td/div[@class="prodTitle"]/'
                    'p[contains(text(), "Bestandsformaat: Pdf")])')
                has_epub = row.xpath(
                    'boolean(./td/div[@class="prodTitle"]/'
                    'p[contains(text(), "Bestandsformaat: ePub")])')
                drm_free = row.xpath('boolean(./td/div[@class="prodTitle"]/'
                                     'p[contains(text(), "zonder DRM") or'
                                     '  contains(text(), "watermerk")])')
                remaining -= 1

                result = SearchResult()
                result.cover_url = cover_url
                result.title = title.strip()
                result.author = author.strip()
                result.price = price
                result.drm = (SearchResult.DRM_UNLOCKED if drm_free
                              else SearchResult.DRM_LOCKED)
                result.detail_item = book_id
                fmts = []
                if has_epub:
                    fmts.append('ePub')
                if has_pdf:
                    fmts.append('PDF')
                result.formats = ','.join(fmts)

                yield result