def get_token(refresh=False):
    """
    Get the xueqiu access token, cached in Redis.

    :param refresh: force a refresh instead of using the cached token
    :return: the access token string
    """
    token = redis_pool.get('xueqiu_token')
    # the cached value may be a JSON-encoded object; decode it if so
    if token:
        token = token.decode()
        try:
            token = ujson.loads(token)
        except ValueError:
            pass
    if refresh or not token:
        # scrape a fresh token from the inline script on the quote page
        url = 'https://xueqiu.com/hq'
        result = spider(url)
        text = result.get('text')
        reg = re.escape('SNB.data.access_token = $.cookie("xq_a_token") || "') + r'(?P<m>\w+)";'
        token = re.search(reg, text).group('m')
        redis_pool.set('xueqiu_token', token)
    return token
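# The snippets below rely on a few helpers (spider, query, unpack_dict,
# redis_pool) whose implementations are not shown here. What follows is a
# minimal sketch under assumed behavior, for illustration only; the names,
# signatures, and defaults are guesses, not the author's actual code.
import re
import redis
import requests
import ujson
from pyquery import PyQuery

# assumed Redis connection used as a simple cache
redis_pool = redis.StrictRedis(host='localhost', port=6379)

def spider(url, params=None, cookies=None, fields='text', debug=False):
    """Hypothetical fetch wrapper returning only the requested fields."""
    resp = requests.get(url, params=params, cookies=cookies)
    result = {'code': resp.status_code}
    wanted = [fields] if isinstance(fields, str) else fields
    if 'text' in wanted:
        result['text'] = resp.text
    if 'json' in wanted:
        try:
            result['json'] = resp.json()
        except ValueError:
            result['err'] = 'response is not valid JSON'
    return result

def query(html):
    """Hypothetical shorthand for building a PyQuery document."""
    return PyQuery(html)

def unpack_dict(data, path):
    """Hypothetical dotted-path getter: unpack_dict({'a': {'b': 1}}, 'a.b') -> 1."""
    for key in path.split('.'):
        data = data.get(key, {}) if isinstance(data, dict) else {}
    return data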
def get_max_page(self):
    """
    Get the last page number of the product search results.

    :return: the max page as a string
    """
    url = 'http://search.jd.com/Search?keyword=' + self.keyword
    # the pager widget on the search page exposes the total page count
    max_page = query(spider(url).get('text'))('#J_topPage > span > i').text()
    print('Max page: ' + max_page)
    return max_page
def get_page_goods(self, page):
    page = page * 2 - 1  # JD loads each search page in two requests
    url = 'http://search.jd.com/s_new.php'
    # upper half of the goods
    params = {
        "keyword": self.keyword,
        "page": page,
        "click": "0",
        "enc": "utf-8",
        "qrst": "1",
        "rt": "1",
        "s": "110",
        "stop": "1",
        "vt": "2"
    }
    html = query(spider(url, params=params).get('text'))
    goods = self.process_goods(html)
    # lower half of the goods, requested with the IDs of the upper half
    params = {
        "keyword": self.keyword,
        "show_items": ','.join([g.get('id') for g in goods]),
        "enc": "utf-8",
        "page": "2",
        "log_id": "1510505434.63851",
        "qrst": "1",
        "rt": "1",
        "s": "28",
        "scrolling": "y",
        "stop": "1",
        "tpl": "2_M",
        "vt": "2"
    }
    html = query(spider(url, params=params).get('text'))
    goods.extend(self.process_goods(html))
    print('Page {page} goods count: {count}'.format(page=page, count=len(goods)))
    return goods
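# A hypothetical driver for the two methods above; 'JdSearch' is an assumed
# class name wrapping them, not something defined in the source.
searcher = JdSearch(keyword='phone')
max_page = int(searcher.get_max_page())
all_goods = []
for logical_page in range(1, max_page + 1):
    # get_page_goods maps logical page N to physical page 2*N-1 internally
    all_goods.extend(searcher.get_page_goods(logical_page))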
def get_cards():
    """
    Crawl the Wandoujia game ranking page by page
    until an empty page signals the end.
    """
    cards = []
    page = 1
    start_time = time()
    while True:
        params = {
            'resourceType': 1,  # 0 = software ranking, 1 = game ranking
            'page': page
        }
        ret = spider('http://www.wandoujia.com/api/top/more', params=params, fields='json')
        json_data = ret.get('json')
        content = unpack_dict(json_data, 'data.content')
        # content is empty once we pass the last page
        if not content:
            end_time = time()
            print('Crawl finished. Pages: {page}, games: {count}, elapsed: {times}s'.format(
                page=page - 1, count=len(cards), times=end_time - start_time))
            return cards
        document = query(content)
        cards_dom = document('.card')
        for card_dom in cards_dom:
            # game name, download count, icon, download URL
            # the download URL requires the Wandoujia client to be installed
            card_dom = query(card_dom)
            download_btn = card_dom('a.i-source.install-btn')
            name = download_btn.attr('data-name')
            downloads = download_btn.attr('data-install')
            icon = download_btn.attr('data-app-icon')
            url = download_btn.attr('href')
            cards.append({
                'name': name,
                'downloads': downloads,
                'icon': icon,
                'url': url
            })
        print('Page {page} done, running total: {count}'.format(page=page, count=len(cards)))
        page += 1
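# A minimal sketch of persisting the crawl result, assuming the ujson import
# from the sketch above; the output filename is arbitrary.
cards = get_cards()
with open('wandoujia_games.json', 'w') as f:
    f.write(ujson.dumps(cards))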
def get_recommend_goods(self, page, keyword):
    """
    Get the (left-hand) recommended goods.

    :param page: page number
    :param keyword: search keyword
    :return: list of goods
    """
    goods = []
    url = 'http://x.jd.com/Search'
    params = {
        "page": page,
        "keyword": keyword,
        "_": time.time() * 1000,
        "adType": "7",
        "ad_ids": "291:20",
        "area": "1",
        "callback": "jQuery5618052",
        "enc": "utf-8",
        "xtest": "new_search"
    }
    ret = spider(url, fields='json', params=params, debug=True)
    if ret.get('code') == 200:
        # parse the goods data and normalize it for storage
        json_data = ret.get('json')
        json_goods = json_data.get('291', [])
        for json_good in json_goods:
            good = {
                'id': json_good.get('sku_id'),  # used to fetch the item's comments
                'title': query(json_good.get('ad_title')).text(),
                'icon': 'http://img1.360buyimg.com/n1/' + json_good.get('image_url'),
                'price': json_good.get('pc_price'),
                'url': json_good.get('click_url')
            }
            # fetch the item's comments
            good.setdefault('comments', self.get_comments(good.get('id')))
            goods.append(good)
            print('Fetched goods: ' + good.get('title'))
        print('Page {page} done'.format(page=page))
    else:
        print('Failed to fetch goods: ' + str(ret.get('err')))
    return goods
def get_page_content(page):
    """
    Fetch one page of the stock list.

    Fields per stock:
        symbol            stock code
        name              stock name
        current           current price
        change            price change
        percent           price change, percent
        low - high        day's price range
        low52w - high52w  52-week price range
        marketcapital     market cap
        pettm             P/E ratio (TTM)
        volume            trade volume
        amount            trade amount
    """
    url = 'https://xueqiu.com/stock/cata/stocklist.json'
    params = {
        'page': page,
        'size': '90',
        'order': 'desc',
        'orderby': 'percent',
        'type': '11,12',
        '_': int(time() * 1000)
    }
    cookies = {
        'xq_a_token': get_token()
    }
    result = spider(url, fields=['json', 'text'], params=params, cookies=cookies)
    items = result.get('json').get('stocks')
    if items:
        # attach the detail-page URL to each stock
        for obj in items:
            obj.setdefault('url', 'https://xueqiu.com/S/' + obj.get('symbol'))
    return items
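# A hypothetical pagination loop: stocklist.json serves 90 items per page
# (the 'size' param above), so an empty page signals the end.
page = 1
stocks = []
while True:
    items = get_page_content(page)
    if not items:
        break
    stocks.extend(items)
    page += 1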
def get_comments(self, good_id):
    """
    Get the comments of a product.

    :param good_id: the product ID
    :return: list of comment strings
    """
    url = 'https://sclub.jd.com/comment/productPageComments.action'
    params = {
        "productId": good_id,
        "page": "0",
        "pageSize": "10",
        "sortType": "3",
        "isShadowSku": "0",
        "score": "0",
        "callback": "fetchJSON_comment98vv14677",
    }
    rp = spider(url, params=params, fields='json')
    if rp.get('code') == 200:
        comments = [c.get('content') for c in unpack_dict(rp.get('json'), 'comments')]
        print('Comments fetched for product {good_id}'.format(good_id=good_id))
        return comments
    else:
        print('Failed to fetch comments: ' + str(rp.get('err')))
def get_page_content(page):
    items = []
    domain = 'https://www.qiushibaike.com'
    # collect the detail-page links from the index page
    url = 'https://www.qiushibaike.com/8hr/page/{0}/'.format(page)
    ret = spider(url)
    text = ret.get('text')
    index_document = query(text)
    articles = index_document('.article')
    for article in articles:
        href = query(article)('a.contentHerf').attr('href')
        items.append({'url': domain + href})
    print('Page {page}: {count} links'.format(page=page, count=len(items)))
    if len(items) == 0:
        print(ret)
    for index, item in enumerate(items):
        for _ in range(2):  # retry once on failure
            # visit the detail page for the content and comments
            text = spider(item['url']).get('text')
            document = query(text)
            # content
            content = document('#single-next-link > div').text()
            if not content:
                print('Fetch failed, retrying. Progress: {index}/{maxlength}'.format(
                    index=index + 1, maxlength=len(items)))
                continue
            # content image
            img_href = document('#single-next-link > div.thumb > img').attr(
                'src') or ''
            if img_href:
                img_href = 'https:' + img_href
            # comments
            comments = []
            comments_dom = document('.comment-block > div.replay > span.body')
            for span in comments_dom:
                comments.append(query(span).text())
            item.update({
                'content': content,
                'img_href': img_href,
                'comments': comments
            })
            print('Fetching page {page}, progress: {index}/{maxlength}'.format(
                page=page, index=index + 1, maxlength=len(items)))
            break
    print('Page {page} done'.format(page=page))
    if page == 1:
        # the pager on page 1 exposes the max page number
        max_page = int(
            index_document(
                '#content-left > ul > li:nth-child(7) > a > span').text())
        print('Max page: ' + str(max_page))
        return max_page, items
    return items
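# A hypothetical driver illustrating the asymmetric return shape: page 1
# returns (max_page, items), while later pages return only the items.
max_page, all_items = get_page_content(1)
for page in range(2, max_page + 1):
    all_items.extend(get_page_content(page))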