Example #1
import re

import ujson

# `spider` (HTTP fetcher) and `redis_pool` (Redis client) are helpers
# defined elsewhere in the project.


def get_token(refresh=False):
    """
    Get the xueqiu.com access token, caching it in Redis.

    :param refresh: force a refresh, re-fetching the token
    :return: the token string
    """
    token = redis_pool.get('xueqiu_token')

    # Redis returns bytes; decode, and unwrap a JSON-encoded value if present
    if token:
        token = token.decode()
        try:
            token = ujson.loads(token)
        except ValueError:
            pass

    if refresh or not token:
        # Scrape the token out of the page's inline JavaScript
        url = 'https://xueqiu.com/hq'
        result = spider(url)
        text = result.get('text')
        reg = re.escape('SNB.data.access_token =  $.cookie("xq_a_token") || "') + r'(?P<m>\w+)";'
        token = re.search(reg, text).group('m')
        redis_pool.set('xueqiu_token', token)

    return token
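A minimal usage sketch, assuming `redis_pool` and `spider` are configured as in the project; the printed labels are illustrative:

token = get_token()                # served from the Redis cache when present
print('cached token:', token)
token = get_token(refresh=True)    # forces a re-scrape of https://xueqiu.com/hq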
Example #2
    def get_max_page(self):
        """
        Get the maximum page number of the product search.

        :return: the page count, as a string
        """
        url = 'http://search.jd.com/Search?keyword=' + self.keyword
        max_page = query(spider(url).get('text'))('#J_topPage > span > i').text()
        print('Max page: ' + max_page)
        return max_page
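A hedged driver sketch: `get_max_page` returns the count as a string, so the caller converts it. `JDSpider` is an assumed name for the enclosing class, which the snippet does not show:

jd = JDSpider()    # hypothetical enclosing class
for page in range(1, int(jd.get_max_page()) + 1):
    goods = jd.get_page_goods(page)    # see Example #3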
Example #3
    def get_page_goods(self, page):
        """
        Fetch all goods on one search results page.

        JD loads each visible search page with two requests, so the
        internal page numbers run 2n - 1 (upper half) and 2n (lower half);
        see the mapping sketch after this method.
        """
        internal_page = page * 2 - 1
        url = 'http://search.jd.com/s_new.php'

        # Upper half of the results
        params = {
            "keyword": self.keyword,
            "page": internal_page,
            "click": "0",
            "enc": "utf-8",
            "qrst": "1",
            "rt": "1",
            "s": "110",
            "stop": "1",
            "vt": "2"
        }
        html = query(spider(url, params=params).get('text'))
        goods = self.process_goods(html)

        # Lower half; JD expects the ids of the goods already shown above
        params = {
            "keyword": self.keyword,
            "show_items": ','.join([g.get('id') for g in goods]),
            "enc": "utf-8",
            "page": internal_page + 1,
            "log_id": "1510505434.63851",
            "qrst": "1",
            "rt": "1",
            "s": "28",
            "scrolling": "y",
            "stop": "1",
            "tpl": "2_M",
            "vt": "2"
        }
        html = query(spider(url, params=params).get('text'))
        goods.extend(self.process_goods(html))

        print('Goods on page {page}: {count}'.format(page=page, count=len(goods)))
        return goods
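For reference, a sketch of the page-number mapping this method relies on (inferred from the `page * 2 - 1` line, not confirmed elsewhere in the snippet):

for visible in (1, 2, 3):
    upper = visible * 2 - 1    # first request
    lower = upper + 1          # second request, with scrolling=y
    print(visible, '->', upper, lower)
# 1 -> 1 2
# 2 -> 3 4
# 3 -> 5 6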
Example #4
from time import time

# `spider`, `query`, and `unpack_dict` are helpers defined elsewhere
# in the project.


def get_cards():
    cards = []
    page = 1

    start_time = time()

    while True:
        params = {
            'resourceType': 1,  # 0 = app ranking, 1 = game ranking
            'page': page
        }
        ret = spider('http://www.wandoujia.com/api/top/more',
                     params=params,
                     fields='json')
        json_data = ret.get('json')
        content = unpack_dict(json_data, 'data.content')

        # Past the last page, content is empty
        if not content:
            end_time = time()
            print('Crawl finished. Pages: {page}, games: {count}, elapsed: {times}s'.format(
                page=page - 1, count=len(cards), times=end_time - start_time))
            return cards

        document = query(content)
        cards_dom = document('.card')
        for card_dom in cards_dom:
            # Game name, download count, icon, download URL
            # (the download URL requires the Wandoujia client)
            card_dom = query(card_dom)
            download_btn = card_dom('a.i-source.install-btn')

            name = download_btn.attr('data-name')
            downloads = download_btn.attr('data-install')
            icon = download_btn.attr('data-app-icon')
            url = download_btn.attr('href')

            cards.append({
                'name': name,
                'downloads': downloads,
                'icon': icon,
                'url': url
            })

        print('Finished page {page}; running total: {count}'.format(page=page,
                                                                    count=len(cards)))
        page += 1
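A minimal usage sketch; the output filename is illustrative:

import json

cards = get_cards()
with open('wandoujia_games.json', 'w') as f:
    json.dump(cards, f, ensure_ascii=False, indent=2)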
Example #5
    def get_recommend_goods(self, page, keyword):
        """
        Get the (left column) recommended goods.

        :param page: results page number
        :param keyword: search keyword
        :return: list of goods dicts
        """
        goods = []

        url = 'http://x.jd.com/Search'
        params = {
            "page": page,
            "keyword": keyword,
            "_": time.time() * 1000,  # cache-buster; assumes `import time`
            "adType": "7",
            "ad_ids": "291:20",  # the '291' slot carries the recommended goods
            "area": "1",
            "callback": "jQuery5618052",
            "enc": "utf-8",
            "xtest": "new_search"
        }

        ret = spider(url, fields='json', params=params, debug=True)
        if ret.get('code') == 200:
            # Parse and normalise the goods data
            json_data = ret.get('json')
            json_goods = json_data.get('291', [])
            for json_good in json_goods:
                good = {
                    'id': json_good.get('sku_id'),  # used to fetch the good's comments
                    'title': query(json_good.get('ad_title')).text(),
                    'icon': 'http://img1.360buyimg.com/n1/' + json_good.get('image_url'),
                    'price': json_good.get('pc_price'),
                    'url': json_good.get('click_url')
                }
                # Attach the good's comments (see get_comments in Example #8)
                good.setdefault('comments', self.get_comments(good.get('id')))
                goods.append(good)
                print('Got good: ' + good.get('title'))
            print('Page {page} finished'.format(page=page))
        else:
            print('Error fetching goods: ' + str(ret.get('err')))

        return goods
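A hedged usage sketch; `JDSpider` is again the assumed enclosing class, and the keyword is illustrative:

jd = JDSpider()
goods = jd.get_recommend_goods(1, 'python')
for good in goods:
    print(good['title'], good['price'], len(good['comments'] or []))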
Example #6
from time import time


def get_page_content(page):
    """
    Fetch one page of the stock list.

    Fields per stock:
        symbol            stock code
        name              stock name
        current           current price
        change            price change
        percent           percent change
        low - high        day price range
        low52w - high52w  52-week price range
        marketcapital     market capitalisation
        pettm             P/E ratio (TTM)
        volume            trading volume
        amount            turnover
    """
    url = 'https://xueqiu.com/stock/cata/stocklist.json'
    params = {
        'page': page,
        'size': '90',
        'order': 'desc',
        'orderby': 'percent',
        'type': '11,12',
        '_': int(time() * 1000)
    }
    cookies = {
        'xq_a_token': get_token()  # see Example #1
    }
    result = spider(url, fields=['json', 'text'], params=params, cookies=cookies)
    json_data = result.get('json')
    items = json_data.get('stocks')
    print(result.get('text'))  # debug: dump the raw response

    if items:
        # Attach each stock's detail-page URL
        for obj in items:
            obj.setdefault('url', 'https://xueqiu.com/S/' + obj.get('symbol'))

    return items
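A hedged paging sketch, assuming the endpoint signals the last page by returning an empty `stocks` list (the snippet does not show the termination condition):

page = 1
stocks = []
while True:
    items = get_page_content(page)
    if not items:
        break
    stocks.extend(items)
    page += 1
print('total stocks:', len(stocks))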
Example #7
    def get_comments(self, good_id):
        """
        Get the comments for a good.

        :param good_id: JD SKU id
        :return: list of comment strings, or None on error
        """
        url = 'https://sclub.jd.com/comment/productPageComments.action'
        params = {
            "productId": good_id,
            "page": "0",
            "pageSize": "10",
            "sortType": "3",
            "isShadowSku": "0",
            "score": "0",
            "callback": "fetchJSON_comment98vv14677",
        }
        rp = spider(url, params=params, fields='json')
        if rp.get('code') == 200:
            comments = [c.get('content') for c in unpack_dict(rp.get('json'), 'comments')]
            print('Comments for good {good_id} fetched'.format(good_id=good_id))
            return comments
        else:
            print('Error fetching comments: ' + str(rp.get('err')))
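A minimal usage sketch; the SKU id is a made-up placeholder, and `JDSpider` is the assumed enclosing class:

jd = JDSpider()
comments = jd.get_comments('1234567') or []   # returns None on error, hence `or []`
for comment in comments:
    print(comment)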
Example #8
def get_page_content(page):
    """
    Scrape one index page of qiushibaike.com: collect the detail-page
    links, then fetch each detail page's content, image, and comments.

    Page 1 additionally returns the max page count (see the final lines).
    """
    items = []

    domain = 'https://www.qiushibaike.com'

    # Collect the detail-page links from the index page
    url = 'https://www.qiushibaike.com/8hr/page/{0}/'.format(page)
    ret = spider(url)
    text = ret.get('text')
    index_document = query(text)

    articles = index_document('.article')
    for article in articles:
        # 'contentHerf' is the site's own (misspelled) class name
        href = query(article)('a.contentHerf').attr('href')
        items.append({'url': domain + href})
    print('Links found on page {page}: {count}'.format(page=page, count=len(items)))
    if len(items) == 0:
        print(ret)  # debug aid: dump the raw response when no links were found

    for index, item in enumerate(items):
        for _ in range(0, 2):  # one retry on failure
            # Visit the detail page for the content and comments
            text = spider(item['url']).get('text')
            document = query(text)
            # Content
            content = document('#single-next-link > div').text()
            if not content:
                print('Fetch failed, retrying. Progress: {index}/{maxlength}'.format(
                    index=index + 1, maxlength=len(items)))
                continue

            # Embedded image, if any
            img_href = document('#single-next-link > div.thumb > img').attr(
                'src') or ''
            if img_href:
                img_href = 'https:' + img_href

            # Comments
            comments = []
            comments_dom = document('.comment-block > div.replay > span.body')
            for span in comments_dom:
                comments.append(query(span).text())

            item.update({
                'content': content,
                'img_href': img_href,
                'comments': comments
            })
            print('Fetched page {page}. Progress: {index}/{maxlength}'.format(
                page=page, index=index + 1, maxlength=len(items)))
            break

    print('Page {page} finished'.format(page=page))

    # Page 1 carries the pagination, so return the max page with the items
    if page == 1:
        max_page = int(
            index_document(
                '#content-left > ul > li:nth-child(7) > a > span').text())
        print('Max page: ' + str(max_page))
        return max_page, items

    return items
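A hedged driver sketch that handles the dual return shape (page 1 returns a `(max_page, items)` tuple, later pages return just the list):

max_page, all_items = get_page_content(1)
for page in range(2, max_page + 1):
    all_items.extend(get_page_content(page))
print('total items:', len(all_items))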