Пример #1
0
    def parse(self, url):
        """Fetch a book product page and scrape its details into a dict.

        The selectors and the Chinese row labels suggest an Amazon China
        product page -- NOTE(review): confirm against the owning class.

        :param url: address of the page; it is fetched with ``urlopen``.
        :returns: dict that always holds ``url``, ``cover_url``, ``title``,
            ``author`` and ``price``, plus one entry per row of the list
            that contains ``#SalesRank``: the binding row becomes
            ``binding`` / ``num of pages``, every other row is stored under
            its label translated through ``key_mapping`` and lower-cased.

        Every scraped value is also echoed to stdout.
        """
        info = {'url': url}
        html = pq(urlopen(url).content)

        # cover image url
        cover_url = html('#original-main-image').attr('src')
        info['cover_url'] = cover_url
        print(cover_url)

        # title and author
        title = html('.parseasinTitle')
        info['title'] = title.text()
        print(info['title'])

        # The '.buying' ancestor's text holds the title followed by the
        # author; removing the title leaves the author part.
        title_and_author = title.parents('.buying').text()
        author = title_and_author.replace(info['title'], '').strip()
        info['author'] = author
        print(author)

        # price: prefer the list price, fall back to the actual price
        list_price = html('#listPriceValue').text()
        actual_price = html('#actualPriceValue').text()
        info['price'] = list_price or actual_price
        print(info['price'])

        key_mapping = {
            u'出版社': 'publisher',
            u'语种': 'language',
            u'条形码': 'isbn-13',
            u'商品尺寸': 'product dimensions',
            u'商品重量': 'shipping weight',
            u'外文书名': 'original title',
        }
        # Row labels that carry the binding type; their value is the page
        # count followed by a unit suffix.
        binding_labels = {
            u'平装': 'paperback',
            u'精装': 'hardcover',
        }
        # Rows that are deliberately not copied into the result.
        skipped_labels = (u'用户评分', u'亚马逊热销商品排名')

        # basic info, such as publisher, language, num of pages, ISBN, etc.
        for li in html('#SalesRank').parents('ul').children('li'):
            k, v = parse_pair(pq(li).text())
            if k in skipped_labels:
                continue
            if k in binding_labels:
                info['binding'] = binding_labels[k]
                try:
                    # The suffix must be a unicode literal: a non-ASCII byte
                    # string here makes Python 2 raise UnicodeDecodeError (a
                    # ValueError subclass), which the handler below would
                    # swallow, so the count would never become an int.
                    info['num of pages'] = int(v.replace(u'页', u''))
                except ValueError:
                    # Keep the raw text when it is not a plain number.
                    info['num of pages'] = v
            else:
                k = key_mapping.get(k, k).lower()
                info[k] = v
            print(u'{} => {}'.format(k, v))

        return info
Пример #2
0
    def parse(self, url):
        """Fetch a book product page and scrape its details into a dict.

        The selectors and the Chinese row labels suggest an Amazon China
        product page -- NOTE(review): confirm against the owning class.

        :param url: address of the page; it is fetched with ``urlopen``.
        :returns: dict that always holds ``url``, ``cover_url``, ``title``,
            ``author`` and ``price``, plus one entry per row of the list
            that contains ``#SalesRank``: the binding row becomes
            ``binding`` / ``num of pages``, every other row is stored under
            its label translated through ``key_mapping`` and lower-cased.

        Every scraped value is also echoed to stdout.
        """
        info = {'url': url}
        html = pq(urlopen(url).content)

        # cover image url
        cover_url = html('#original-main-image').attr('src')
        info['cover_url'] = cover_url
        print(cover_url)

        # title and author
        title = html('.parseasinTitle')
        info['title'] = title.text()
        print(info['title'])

        # The '.buying' ancestor's text holds the title followed by the
        # author; removing the title leaves the author part.
        title_and_author = title.parents('.buying').text()
        author = title_and_author.replace(info['title'], '').strip()
        info['author'] = author
        print(author)

        # price: prefer the list price, fall back to the actual price
        list_price = html('#listPriceValue').text()
        actual_price = html('#actualPriceValue').text()
        info['price'] = list_price or actual_price
        print(info['price'])

        key_mapping = {
            u'出版社': 'publisher',
            u'语种': 'language',
            u'条形码': 'isbn-13',
            u'商品尺寸': 'product dimensions',
            u'商品重量': 'shipping weight',
            u'外文书名': 'original title',
        }
        # Row labels that carry the binding type; their value is the page
        # count followed by a unit suffix.
        binding_labels = {
            u'平装': 'paperback',
            u'精装': 'hardcover',
        }
        # Rows that are deliberately not copied into the result.
        skipped_labels = (u'用户评分', u'亚马逊热销商品排名')

        # basic info, such as publisher, language, num of pages, ISBN, etc.
        for li in html('#SalesRank').parents('ul').children('li'):
            k, v = parse_pair(pq(li).text())
            if k in skipped_labels:
                continue
            if k in binding_labels:
                info['binding'] = binding_labels[k]
                try:
                    # The suffix must be a unicode literal: a non-ASCII byte
                    # string here makes Python 2 raise UnicodeDecodeError (a
                    # ValueError subclass), which the handler below would
                    # swallow, so the count would never become an int.
                    info['num of pages'] = int(v.replace(u'页', u''))
                except ValueError:
                    # Keep the raw text when it is not a plain number.
                    info['num of pages'] = v
            else:
                k = key_mapping.get(k, k).lower()
                info[k] = v
            print(u'{} => {}'.format(k, v))

        return info
Пример #3
0
    def parse(self, url):
        """Fetch a book page and scrape cover, title, details and tags.

        The selectors (``#mainpic``, ``#info``, ``#db-tags-section``) and the
        Chinese labels suggest a Douban book page -- NOTE(review): confirm
        against the owning class.

        :param url: address of the page; it is fetched with ``urlopen``.
        :returns: dict that always holds ``url``, ``cover_url``, ``title``
            and ``tags`` (a list of tag texts), plus one entry per
            ``<br>``-separated line of the ``#info`` element, stored under
            its label translated through ``key_mapping`` and lower-cased.
            The page-count line is stored as an int when it parses as one.

        Every scraped value is also echoed to stdout.
        """
        info = {'url': url}
        html = pq(urlopen(url).content)

        # cover image url
        mainpic = html('#mainpic')
        cover_url = mainpic('a').attr('href')
        info['cover_url'] = cover_url
        print(cover_url)

        # title (taken from the cover image's alt text)
        title = mainpic('img').attr('alt')
        info['title'] = title
        print(title)

        key_mapping = {
            u'作者': 'author',
            u'译者': 'translator',
            u'出版社': 'publisher',
            u'原作名': 'original title',
            u'出版年': 'published year',
            # never looked up: the page-count label is special-cased below
            u'页数': 'num of pages',
            u'定价': 'price',
            # NOTE(review): this label appears to name the binding type, yet
            # the value is stored under 'paperback' -- confirm whether
            # 'binding' was intended before changing the key.
            u'装帧': 'paperback',
            u'丛书': 'series',
            u'副标题': 'subhead',
        }
        # author, publisher, pub year, price, isbn etc.
        # .html() gives None when '#info' matches nothing; fall back to an
        # empty string so a page without that section yields no detail rows
        # instead of making re.split raise TypeError.
        desc = html('#info').html() or ''
        for each in re.split(r'<br/?>', desc):
            each = each.strip()
            if not each:
                continue
            k, v = parse_pair(pq(each).text())
            if k == u'页数':
                try:
                    info['num of pages'] = int(v)
                except ValueError:
                    # Keep the raw text when it is not a plain number.
                    info['num of pages'] = v
            else:
                k = key_mapping.get(k, k).lower()
                info[k] = v
            print(u'{} => {}'.format(k, v))

        # tags
        tags = [pq(a).text() for a in html('#db-tags-section')('a')]
        info['tags'] = tags
        print(' '.join(tags))

        return info
Пример #4
0
    def parse(self, url):
        """Fetch a book page and scrape cover, title, details and tags.

        The selectors (``#mainpic``, ``#info``, ``#db-tags-section``) and the
        Chinese labels suggest a Douban book page -- NOTE(review): confirm
        against the owning class.

        :param url: address of the page; it is fetched with ``urlopen``.
        :returns: dict that always holds ``url``, ``cover_url``, ``title``
            and ``tags`` (a list of tag texts), plus one entry per
            ``<br>``-separated line of the ``#info`` element, stored under
            its label translated through ``key_mapping`` and lower-cased.
            The page-count line is stored as an int when it parses as one.

        Every scraped value is also echoed to stdout.
        """
        info = {'url': url}
        html = pq(urlopen(url).content)

        # cover image url
        mainpic = html('#mainpic')
        cover_url = mainpic('a').attr('href')
        info['cover_url'] = cover_url
        print(cover_url)

        # title (taken from the cover image's alt text)
        title = mainpic('img').attr('alt')
        info['title'] = title
        print(title)

        key_mapping = {
            u'作者': 'author',
            u'译者': 'translator',
            u'出版社': 'publisher',
            u'原作名': 'original title',
            u'出版年': 'published year',
            # never looked up: the page-count label is special-cased below
            u'页数': 'num of pages',
            u'定价': 'price',
            # NOTE(review): this label appears to name the binding type, yet
            # the value is stored under 'paperback' -- confirm whether
            # 'binding' was intended before changing the key.
            u'装帧': 'paperback',
            u'丛书': 'series',
            u'副标题': 'subhead',
        }
        # author, publisher, pub year, price, isbn etc.
        # .html() gives None when '#info' matches nothing; fall back to an
        # empty string so a page without that section yields no detail rows
        # instead of making re.split raise TypeError.
        desc = html('#info').html() or ''
        for each in re.split(r'<br/?>', desc):
            each = each.strip()
            if not each:
                continue
            k, v = parse_pair(pq(each).text())
            if k == u'页数':
                try:
                    info['num of pages'] = int(v)
                except ValueError:
                    # Keep the raw text when it is not a plain number.
                    info['num of pages'] = v
            else:
                k = key_mapping.get(k, k).lower()
                info[k] = v
            print(u'{} => {}'.format(k, v))

        # tags
        tags = [pq(a).text() for a in html('#db-tags-section')('a')]
        info['tags'] = tags
        print(' '.join(tags))

        return info
Пример #5
0
    def parse(self, url):
        """Fetch a book product page and scrape its details into a dict.

        The selectors and the English row labels suggest an Amazon (English
        storefront) product page -- NOTE(review): confirm against the owning
        class.

        :param url: address of the page; it is fetched with ``urlopen``.
        :returns: dict that always holds ``url``, ``cover_url``, ``title``
            and ``author``; ``price`` is present only when a "buy new" price
            label is found. Each row of the list containing ``#SalesRank``
            adds an entry under its lower-cased label, except the skipped
            rows and the binding row (stored as ``binding`` /
            ``num of pages``).

        Every scraped value is also echoed to stdout.
        """
        info = {'url': url}
        page = pq(urlopen(url).content)

        # cover image url
        info['cover_url'] = page('#main-image').attr('src')
        print(info['cover_url'])

        # title and author
        title_node = page('.parseasinTitle')
        info['title'] = title_node.text()
        print(info['title'])

        info['author'] = title_node.siblings('span').text()
        print(info['author'])

        # price: take the first label reading "buy new" and stop looking
        for node in page('.rentalPriceLabel'):
            label = pq(node)
            if label.text().strip().lower() != 'buy new':
                continue
            info['price'] = label.siblings('.rentPrice').text()
            print(info['price'])
            break

        # basic info, such as publisher, language, num of pages, ISBN, etc.
        ignored_rows = (
            'shipping weight',
            'average customer review',
            'amazon best sellers rank',
        )
        binding_rows = ('paperback', 'hardcover')
        for item in page('#SalesRank').parents('ul').children('li'):
            key, value = parse_pair(pq(item).text())
            key = key.lower()
            if key in ignored_rows:
                continue
            if key not in binding_rows:
                info[key] = value
            else:
                # The binding row's value is the page count plus a unit.
                info['binding'] = key
                try:
                    page_count = int(value.replace('pages', ''))
                except ValueError:
                    # Keep the raw text when it is not a plain number.
                    page_count = value
                info['num of pages'] = page_count
            print(u'{} => {}'.format(key, value))

        return info
Пример #6
0
    def parse(self, url):
        """Fetch a book product page and scrape its details into a dict.

        The selectors and the English row labels suggest an Amazon (English
        storefront) product page -- NOTE(review): confirm against the owning
        class.

        :param url: address of the page; it is fetched with ``urlopen``.
        :returns: dict that always holds ``url``, ``cover_url``, ``title``
            and ``author``; ``price`` is present only when a "buy new" price
            label is found. Each row of the list containing ``#SalesRank``
            adds an entry under its lower-cased label, except the skipped
            rows and the binding row (stored as ``binding`` /
            ``num of pages``).

        Every scraped value is also echoed to stdout.
        """
        info = {'url': url}
        page = pq(urlopen(url).content)

        # cover image url
        info['cover_url'] = page('#main-image').attr('src')
        print(info['cover_url'])

        # title and author
        title_node = page('.parseasinTitle')
        info['title'] = title_node.text()
        print(info['title'])

        info['author'] = title_node.siblings('span').text()
        print(info['author'])

        # price: take the first label reading "buy new" and stop looking
        for node in page('.rentalPriceLabel'):
            label = pq(node)
            if label.text().strip().lower() != 'buy new':
                continue
            info['price'] = label.siblings('.rentPrice').text()
            print(info['price'])
            break

        # basic info, such as publisher, language, num of pages, ISBN, etc.
        ignored_rows = (
            'shipping weight',
            'average customer review',
            'amazon best sellers rank',
        )
        binding_rows = ('paperback', 'hardcover')
        for item in page('#SalesRank').parents('ul').children('li'):
            key, value = parse_pair(pq(item).text())
            key = key.lower()
            if key in ignored_rows:
                continue
            if key not in binding_rows:
                info[key] = value
            else:
                # The binding row's value is the page count plus a unit.
                info['binding'] = key
                try:
                    page_count = int(value.replace('pages', ''))
                except ValueError:
                    # Keep the raw text when it is not a plain number.
                    page_count = value
                info['num of pages'] = page_count
            print(u'{} => {}'.format(key, value))

        return info
Пример #7
0
def get_search_result(url):
    """Parse the first search hit of every site that has a parser.

    The page at ``url`` is fetched, every link found under the ``.r``
    elements is logged, and each link is then offered to the entries of the
    module-level ``parsers`` mapping (site substring -> parser object).

    :param url: address of the search-results page, fetched with ``urlopen``.
    :returns: dict mapping a site key to whatever that site's
        ``parser.parse(href)`` returned for the first link whose host/path
        contains the key; sites with no matching link are absent.
    """
    response = urlopen(url)
    page = pq(response.content)

    # Gather and log every result link before any of them is parsed.
    hrefs = []
    for anchor in page('.r')('a'):
        target = anchor.attrib['href']
        hrefs.append(target)
        logger.info('result %s: %s', anchor.text, target)

    info = {}
    for href in hrefs:
        hostpath = get_url_hostpath(href)
        for site, parser in parsers.iteritems():
            # Only the first matching link per site is parsed.
            if site in info or site not in hostpath:
                continue
            logger.debug('parsing %s', href)
            info[site] = parser.parse(href)

    return info
Пример #8
0
def get_search_result(url):
    """Parse the first search hit of every site that has a parser.

    The page at ``url`` is fetched, every link found under the ``.r``
    elements is logged, and each link is then offered to the entries of the
    module-level ``parsers`` mapping (site substring -> parser object).

    :param url: address of the search-results page, fetched with ``urlopen``.
    :returns: dict mapping a site key to whatever that site's
        ``parser.parse(href)`` returned for the first link whose host/path
        contains the key; sites with no matching link are absent.
    """
    response = urlopen(url)
    page = pq(response.content)

    # Gather and log every result link before any of them is parsed.
    hrefs = []
    for anchor in page('.r')('a'):
        target = anchor.attrib['href']
        hrefs.append(target)
        logger.info('result %s: %s', anchor.text, target)

    info = {}
    for href in hrefs:
        hostpath = get_url_hostpath(href)
        for site, parser in parsers.iteritems():
            # Only the first matching link per site is parsed.
            if site in info or site not in hostpath:
                continue
            logger.debug('parsing %s', href)
            info[site] = parser.parse(href)

    return info
Пример #9
0
    def parse(self, url):
        """Fetch a download page and scrape its title, link and file size.

        :param url: address of the page; it is fetched with ``urlopen``.
        :returns: dict with ``url``, ``title`` (text of ``h1.f14``),
            ``download_url`` (the link inside ``.download_btn_box`` joined
            onto the host of ``url``) and ``file_size`` (text of the span
            inside the same box).

        The download URL and the file size are also echoed to stdout.
        """
        info = {'url': url}
        page = pq(urlopen(url).content)

        # title
        info['title'] = page('h1.f14').text()

        # download url: the button's href is joined onto this page's host
        button_box = page('.download_btn_box')
        host = get_url_host(url)
        href = button_box('a').attr('href')
        info['download_url'] = join_url_hostpath(host, href)
        print(info['download_url'])

        # file size
        info['file_size'] = button_box('span').text()
        print(info['file_size'])

        return info