def getCommentDetail_OtherPage(self):
    """Worker: drain queue_skuPageUrl_commentDetail (pages 2..N of each SKU's
    comments), fetch each JSONP comment page through a rotating HTTP proxy,
    and push one parsed row per comment onto queue_commentDetail_result.

    On fetch failure the (sku, url) pair is requeued and the proxy rotated;
    on decode failure the page is dropped, matching the original behavior.
    """
    proxy_port = myProxy.is_proxy_exists()
    proxy = random.sample(proxy_port, 1)[0]
    proxy = proxy[0] + ':' + proxy[1]
    proxyHandler = urllib2.ProxyHandler({'http': r'http://%s' % proxy})
    cookies = urllib2.HTTPCookieProcessor
    opener = urllib2.build_opener(cookies, proxyHandler)
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0')]
    while queue_skuPageUrl_commentDetail.qsize() > 0:
        if queue_skuPageUrl_commentDetail.qsize() % 1000 == 0:
            print('=' * 15 + u'还剩下%s个待抓评论页面!' % queue_skuPageUrl_commentDetail.qsize())
            print('+' * 15 + u'已抓取评论数量总计%s条!' % queue_commentDetail_result.qsize())
            # TODO (original note): persist collected comments here and free memory.
        sku, url = queue_skuPageUrl_commentDetail.get()
        try:
            res_temp = opener.open(url)
            src = res_temp.read()
            res_temp.close()
            # Response is JSONP: strip the "callback(" prefix and the ");" suffix.
            jsonFile = src.split('(', 1)[1][:-2]
        except Exception:
            # Fetch failed: requeue this page and switch to another proxy.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
            queue_skuPageUrl_commentDetail.put((sku, url))
            proxy = random.sample(proxy_port, 1)[0]
            proxy = proxy[0] + ':' + proxy[1]
            proxyHandler = urllib2.ProxyHandler({'http': r'http://%s' % proxy})
            opener = urllib2.build_opener(cookies, proxyHandler)
            continue
        try:
            # JD serves these pages GBK-encoded.
            jsonFile = jsonFile.decode('GBK', 'ignore')
            jsonFile = json.loads(jsonFile)
        except Exception:
            # Malformed payload: drop the page (same as original).
            continue
        commentList = jsonFile['comments']
        for item in commentList:
            userId = item['id']
            userGuid = item['guid']
            content = item['content']
            createTime = item['creationTime']
            referenceId = item['referenceId']
            referenceTime = item['referenceTime']
            replyCount = item['replyCount']
            score = item['score']
            userLevelId = item['userLevelId']
            userProvince = item['userProvince']
            # 'productColor' is not present on every comment record.
            productColor = item.get('productColor', '-')
            userLevelName = item['userLevelName']
            userClientShow = item['userClientShow']
            # Strip the surrounding HTML tag, e.g. "<a ...>text</a>" -> "text".
            userClientShow = userClientShow.split('>', 1)[1].split('<', 1)[0] if userClientShow else '-'
            isMobile = item['isMobile']
            resultTemp = [sku, userId, userGuid, content, createTime,
                          referenceId, referenceTime, replyCount, score,
                          userLevelId, userProvince, productColor,
                          userLevelName, userClientShow, isMobile, url]
            queue_commentDetail_result.put(resultTemp)
def getProductDetail(self):
    """Worker: drain queue_for_url_target of (topic, listing_url) pairs,
    fetch each JD listing page through a rotating HTTP proxy, and push one
    row per product tile onto queue_for_result.

    Bug fixed: on fetch failure the original requeued the bare ``url``
    instead of the ``(topic, url)`` tuple, so the next ``topic, url = temp``
    unpack failed and the item was printed and silently dropped.
    """
    proxy_port = myProxy.is_proxy_exists()
    proxy = random.sample(proxy_port, 1)[0]
    proxy = proxy[0] + ':' + proxy[1]
    print('=' * 5 + u'开始使用的代理:' + 'http://%s' % proxy)
    proxyHandler = urllib2.ProxyHandler({'http': r'http://%s' % proxy})
    cookies = urllib2.HTTPCookieProcessor
    opener = urllib2.build_opener(cookies, proxyHandler)
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0')]
    while queue_for_url_target.qsize() > 0:
        print('=' * 15 + u'还剩下%s个待抓网址!' % queue_for_url_target.qsize())
        temp = ''
        try:
            temp = queue_for_url_target.get()
            topic, url = temp
        except Exception:
            # Queue item was not a (topic, url) pair: log it and move on.
            print(temp)
            continue
        try:
            res_temp = opener.open(url)
            src = res_temp.read()
        except Exception:
            # Fetch failed: requeue the FULL pair (bug fix) and rotate proxy.
            queue_for_url_target.put((topic, url))
            proxy = random.sample(proxy_port, 1)[0]
            print(u'更换代理:%s:%s' % (proxy[0], proxy[1]))
            proxy = proxy[0] + ':' + proxy[1]
            proxyHandler = urllib2.ProxyHandler({'http': r'http://%s' % proxy})
            opener = urllib2.build_opener(cookies, proxyHandler)
            continue
        res_temp.close()
        d = pq(src)
        # One frame per product tile on the listing page.
        frames = d.find('.gl-i-wrap.j-sku-item')
        for item in frames:
            d = pq(item)
            productName = d.find('.p-name>a>em').text()
            price = d.find('.J_price>i').text()
            commentCount = d.find('.p-commit>strong>a').text()
            sku = d.find('.gl-item').attr('data-sku')
            productHref = d.find('.p-img>a').attr('href')
            row = [productName, sku, productHref, price, commentCount, topic, url]
            queue_for_result.put(row)
            print(row)
def getCommentDetail_FirstPage(self):
    """Worker: for each SKU in queue_sku_commentDetail, fetch comment page 0
    (which also carries the total comment count), enqueue URLs for pages
    1..N onto queue_skuPageUrl_commentDetail, and push the first page's
    parsed comments onto queue_commentDetail_result.

    Strategy (original note): comments are appended dynamically server-side,
    so each product is treated as a single crawl unit to limit duplicates.
    """
    proxy_port = myProxy.is_proxy_exists()
    proxy = random.sample(proxy_port, 1)[0]
    proxy = proxy[0] + ':' + proxy[1]
    proxyHandler = urllib2.ProxyHandler({'http': r'http://%s' % proxy})
    cookies = urllib2.HTTPCookieProcessor
    opener = urllib2.build_opener(cookies, proxyHandler)
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0')]
    while queue_sku_commentDetail.qsize() > 0:
        print('=' * 15 + u'还剩下%s个待抓SKU!' % queue_sku_commentDetail.qsize())
        sku = queue_sku_commentDetail.get()
        # First comment page: the p-0 parameter selects page 0.
        url = 'http://s.club.jd.com/productpage/p-' + str(sku) + '-s-0-t-0-p-0.html?callback=fetchJSON_comment'
        try:
            res_temp = opener.open(url)
            src = res_temp.read()
            res_temp.close()
            # Response is JSONP: strip the "callback(" prefix and ");" suffix.
            jsonFile = src.split('(', 1)[1][:-2]
        except Exception:
            # Fetch failed: requeue the SKU and switch to another proxy.
            queue_sku_commentDetail.put(sku)
            proxy = random.sample(proxy_port, 1)[0]
            print(u'更换代理:%s:%s' % (proxy[0], proxy[1]))
            proxy = proxy[0] + ':' + proxy[1]
            proxyHandler = urllib2.ProxyHandler({'http': r'http://%s' % proxy})
            opener = urllib2.build_opener(cookies, proxyHandler)
            continue
        try:
            jsonFile = jsonFile.decode('GBK', 'ignore')
            jsonFile = json.loads(jsonFile)
        except Exception:
            continue
        # Derive the page count (10 comments per page, ceiling division) and
        # queue the remaining page URLs. `//` keeps this integral on Py3 too;
        # the original `/` relied on Py2 integer division.
        commentCount = jsonFile['productCommentSummary']['commentCount']
        pageCount = commentCount // 10 if commentCount % 10 == 0 else 1 + commentCount // 10
        if pageCount > 1:
            for i in range(1, pageCount):
                url = 'http://s.club.jd.com/productpage/p-' + str(sku) + '-s-0-t-0-p-' + str(
                    i) + '.html?callback=fetchJSON_comment'
                queue_skuPageUrl_commentDetail.put((sku, url))
        # Extract the first page's comments.
        commentList = jsonFile['comments']
        for item in commentList:
            userId = item['id']
            userGuid = item['guid']
            content = item['content']
            createTime = item['creationTime']
            referenceId = item['referenceId']
            referenceTime = item['referenceTime']
            replyCount = item['replyCount']
            score = item['score']
            userLevelId = item['userLevelId']
            userProvince = item['userProvince']
            # 'productColor' is not present on every comment record.
            productColor = item.get('productColor', '-')
            userLevelName = item['userLevelName']
            userClientShow = item['userClientShow']
            # Strip the surrounding HTML tag, e.g. "<a ...>text</a>" -> "text".
            userClientShow = userClientShow.split('>', 1)[1].split('<', 1)[0] if userClientShow else '-'
            isMobile = item['isMobile']
            resultTemp = [sku, userId, userGuid, content, createTime,
                          referenceId, referenceTime, replyCount, score,
                          userLevelId, userProvince, productColor,
                          userLevelName, userClientShow, isMobile, url]
            queue_commentDetail_result.put(resultTemp)
def getDetail(self):
    """Worker: drain queue_for_InnerPageProdctDetail of product-page URLs,
    fetch each detail page through a rotating HTTP proxy, scrape seller name,
    shop scores and the '#parameter2' spec list, and push one row onto
    queue_for_InnerPageProdctDetail_result.

    Bug fixed: on fetch failure the original requeued the url onto
    queue_for_url_target (a different queue whose consumer expects
    (topic, url) tuples), so failed detail pages were never retried here and
    corrupted the other queue. It now goes back onto this worker's own queue.
    """
    proxy_port = myProxy.is_proxy_exists()
    proxy = random.sample(proxy_port, 1)[0]
    proxy = proxy[0] + ':' + proxy[1]
    print('=' * 5 + u'开始使用的代理:' + 'http://%s' % proxy)
    proxyHandler = urllib2.ProxyHandler({'http': r'http://%s' % proxy})
    cookies = urllib2.HTTPCookieProcessor
    opener = urllib2.build_opener(cookies, proxyHandler)
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0')]
    while queue_for_InnerPageProdctDetail.qsize() > 0:
        print('=' * 15 + u'还剩下%s个待抓网址!' % queue_for_InnerPageProdctDetail.qsize())
        url = queue_for_InnerPageProdctDetail.get()
        try:
            res_temp = opener.open(url)
            src = res_temp.read()
            res_temp.close()
            d = pq(src)
        except Exception:
            # Fetch failed: requeue onto THIS queue (bug fix) and rotate proxy.
            queue_for_InnerPageProdctDetail.put(url)
            proxy = random.sample(proxy_port, 1)[0]
            print(u'更换代理:%s:%s' % (proxy[0], proxy[1]))
            proxy = proxy[0] + ':' + proxy[1]
            proxyHandler = urllib2.ProxyHandler({'http': r'http://%s' % proxy})
            opener = urllib2.build_opener(cookies, proxyHandler)
            continue
        # Third-party sellers: company name (empty for JD self-operated).
        companyName = d.find('.text.J-shop-name').text()
        companyName = companyName if companyName else '-'
        # Third-party sellers: shop score block; all fields default to '-'.
        scoreSum = '-'
        scoreProduct = '-'
        scoreProductAvg = '-'
        scoreService = '-'
        scoreServiceAvg = '-'
        scoreExpress = '-'
        scoreExpressAvg = '-'
        # NOTE(review): my_text() is not a standard pyquery method — presumably a
        # project-level extension returning an indexable sequence of strings; verify.
        scoreFrame = d.find('.score-infor>div').my_text()
        if scoreFrame:
            # The up/down arrow class tells whether the score beats the average.
            upDownFrame = d.find('.score-infor>div span i')
            upDownFrame = [pq(item).attr('class') for item in upDownFrame]
            scoreSum = scoreFrame[0]
            scoreProduct = scoreFrame[3]
            scoreService = scoreFrame[6]
            scoreExpress = scoreFrame[9]
            # Negate the delta when the arrow points down.
            scoreProductAvg = scoreFrame[4] if upDownFrame[0] == 'up' else '-' + scoreFrame[4]
            scoreServiceAvg = scoreFrame[7] if upDownFrame[1] == 'up' else '-' + scoreFrame[7]
            scoreExpressAvg = scoreFrame[10] if upDownFrame[2] == 'up' else '-' + scoreFrame[10]
        # Product spec list: "label:value" items; unknown labels are ignored.
        frames = d.find('#parameter2>li')
        commondityName = '-'
        commondityCode = '-'
        shelvesTime = '-'
        goodsWeight = '-'
        shopName = '-'
        function = '-'
        productType = '-'  # renamed from `type` to avoid shadowing the builtin
        originOfGoods = '-'
        usage = '-'
        system = '-'
        productNo = '-'
        compatibility = '-'
        applicableCrowd = '-'
        brand = '-'
        theoreticalEndurance = '-'
        rateOfWork = '-'
        for item in frames:
            d = pq(item)
            text = d.text()
            text = text.split(':')
            textTest = text[0]
            textTarget = text[1]
            if textTest == u'商品名称':
                commondityName = textTarget
            elif textTest == u'商品编号':
                commondityCode = textTarget
            elif textTest == u'上架时间':
                shelvesTime = textTarget
            elif textTest == u'商品毛重':
                goodsWeight = textTarget
            elif textTest == u'店铺':
                shopName = textTarget
            elif textTest == u'功能':
                function = textTarget
            elif textTest == u'类型':
                productType = textTarget
            elif textTest == u'商品产地':
                originOfGoods = textTarget
            elif textTest == u'使用方式':
                usage = textTarget
            elif textTest == u'系统':
                system = textTarget
            elif textTest == u'货号':
                productNo = textTarget
            elif textTest == u'兼容性':
                compatibility = textTarget
            elif textTest == u'适用人群' or textTest == u'适用对象':
                applicableCrowd = textTarget
            elif textTest == u'品牌':
                brand = textTarget
            elif textTest == u'理论续航':
                theoreticalEndurance = textTarget
            elif textTest == u'功率':
                rateOfWork = textTarget
            else:
                pass
        res_temp = [url, commondityName, commondityCode, shelvesTime, goodsWeight,
                    shopName, function, productType, originOfGoods, usage, system,
                    productNo, compatibility, applicableCrowd, brand,
                    theoreticalEndurance, rateOfWork, scoreSum, scoreProduct,
                    scoreProductAvg, scoreService, scoreServiceAvg, scoreExpress,
                    scoreExpressAvg, companyName]
        queue_for_InnerPageProdctDetail_result.put(res_temp)