# ==== Example #1 ====
    def getCommentDetail_OtherPage(self):
        """Worker: crawl all non-first comment pages queued by the first-page pass.

        Pops (sku, url) pairs from queue_skuPageUrl_commentDetail, fetches each
        JSONP comment page through a random HTTP proxy, and pushes one result
        row per comment onto queue_commentDetail_result.  A failed fetch puts
        the (sku, url) pair back on the queue and rotates to a new proxy.
        """
        proxy_port = myProxy.is_proxy_exists()
        proxy = random.sample(proxy_port, 1)[0]
        proxy = proxy[0] + ':' + proxy[1]
        proxyHandler = urllib2.ProxyHandler({'http': r'http://%s' % proxy})
        # Handler classes may be passed to build_opener; it instantiates them.
        cookies = urllib2.HTTPCookieProcessor
        opener = urllib2.build_opener(cookies, proxyHandler)
        opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0')]

        while queue_skuPageUrl_commentDetail.qsize() > 0:
            if queue_skuPageUrl_commentDetail.qsize() % 1000 == 0:
                # Progress report: pages still pending / comments collected so far.
                print('=' * 15 + u'还剩下%s个待抓评论页面!' % queue_skuPageUrl_commentDetail.qsize())
                print('+' * 15 + u'已抓取评论数量总计%s条!' % queue_commentDetail_result.qsize())

            sku, url = queue_skuPageUrl_commentDetail.get()
            res_temp = None
            try:
                res_temp = opener.open(url)
                src = res_temp.read()
                # Strip the JSONP wrapper "callback( ... );" down to raw JSON.
                jsonFile = src.split('(', 1)[1][:-2]
            except Exception:  # was a bare except: don't trap SystemExit/KeyboardInterrupt
                queue_skuPageUrl_commentDetail.put((sku, url))
                proxy = random.sample(proxy_port, 1)[0]
                proxy = proxy[0] + ':' + proxy[1]
                proxyHandler = urllib2.ProxyHandler({'http': r'http://%s' % proxy})
                opener = urllib2.build_opener(cookies, proxyHandler)
                continue
            finally:
                # Close the response even when read()/split() raised (was leaked).
                if res_temp is not None:
                    res_temp.close()
            try:
                jsonFile = jsonFile.decode('GBK', 'ignore')
                jsonFile = json.loads(jsonFile)
            except Exception:
                # Truncated or malformed payload: drop this page rather than retry forever.
                continue
            for item in jsonFile['comments']:
                # Not every comment carries a productColor field.
                productColor = item.get('productColor', '-')
                userClientShow = item['userClientShow']
                # userClientShow is an HTML fragment; keep only the tag's inner text.
                userClientShow = userClientShow.split('>', 1)[1].split('<', 1)[0] if userClientShow else '-'
                resultTemp = [sku, item['id'], item['guid'], item['content'], item['creationTime'],
                              item['referenceId'], item['referenceTime'], item['replyCount'], item['score'],
                              item['userLevelId'], item['userProvince'], productColor,
                              item['userLevelName'], userClientShow, item['isMobile'], url]
                queue_commentDetail_result.put(resultTemp)
# ==== Example #2 ====
    def getProductDetail(self):
        """Worker: crawl listing pages and extract one summary row per product.

        Pops (topic, url) pairs from queue_for_url_target, fetches each listing
        page through a random HTTP proxy, and pushes
        [productName, sku, productHref, price, commentCount, topic, url]
        onto queue_for_result for every product tile found on the page.
        """
        proxy_port = myProxy.is_proxy_exists()
        proxy = random.sample(proxy_port, 1)[0]
        proxy = proxy[0] + ':' + proxy[1]
        print('=' * 5 + u'开始使用的代理:' + 'http://%s' % proxy)
        proxyHandler = urllib2.ProxyHandler({'http': r'http://%s' % proxy})
        # Handler classes may be passed to build_opener; it instantiates them.
        cookies = urllib2.HTTPCookieProcessor
        opener = urllib2.build_opener(cookies, proxyHandler)
        opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0')]

        while queue_for_url_target.qsize() > 0:
            print('=' * 15 + u'还剩下%s个待抓网址!' % queue_for_url_target.qsize())
            temp = ''
            try:
                temp = queue_for_url_target.get()
                topic, url = temp
            except Exception:
                # Malformed queue entry: log it and move on.
                print(temp)
                continue
            res_temp = None
            try:
                res_temp = opener.open(url)
                src = res_temp.read()
            except Exception:
                # BUG FIX: re-queue the full (topic, url) pair.  The original
                # put back the bare url, which the unpack above would later
                # reject, silently dropping the page instead of retrying it.
                queue_for_url_target.put((topic, url))
                proxy = random.sample(proxy_port, 1)[0]
                print(u'更换代理:%s:%s' % (proxy[0], proxy[1]))
                proxy = proxy[0] + ':' + proxy[1]
                proxyHandler = urllib2.ProxyHandler({'http': r'http://%s' % proxy})
                opener = urllib2.build_opener(cookies, proxyHandler)
                continue
            finally:
                # Close the response even when read() raised (was leaked).
                if res_temp is not None:
                    res_temp.close()

            page = pq(src)
            for item in page.find('.gl-i-wrap.j-sku-item'):
                tile = pq(item)
                productName = tile.find('.p-name>a>em').text()
                price = tile.find('.J_price>i').text()
                commentCount = tile.find('.p-commit>strong>a').text()
                sku = tile.find('.gl-item').attr('data-sku')
                productHref = tile.find('.p-img>a').attr('href')
                row = [productName, sku, productHref, price, commentCount, topic, url]
                queue_for_result.put(row)
                print(row)
# ==== Example #3 ====
    def getCommentDetail_FirstPage(self):
        """Worker: crawl page 0 of each SKU's comments and fan out the rest.

        For every sku in queue_sku_commentDetail, fetches the first JSONP
        comment page, uses the reported total comment count to enqueue the
        remaining page URLs on queue_skuPageUrl_commentDetail, and pushes the
        first page's comment rows onto queue_commentDetail_result.
        """
        # Crawl strategy: comments are appended upstream while we crawl, so each
        # product is treated as a single crawl unit to limit duplicate fetches.
        proxy_port = myProxy.is_proxy_exists()
        proxy = random.sample(proxy_port, 1)[0]
        proxy = proxy[0] + ':' + proxy[1]
        proxyHandler = urllib2.ProxyHandler({'http': r'http://%s' % proxy})
        # Handler classes may be passed to build_opener; it instantiates them.
        cookies = urllib2.HTTPCookieProcessor
        opener = urllib2.build_opener(cookies, proxyHandler)
        opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0')]

        while queue_sku_commentDetail.qsize() > 0:
            print('=' * 15 + u'还剩下%s个待抓SKU!' % queue_sku_commentDetail.qsize())
            sku = queue_sku_commentDetail.get()
            # First comment page for this sku (parameter p-0).
            url = 'http://s.club.jd.com/productpage/p-' + str(sku) + '-s-0-t-0-p-0.html?callback=fetchJSON_comment'
            res_temp = None
            try:
                res_temp = opener.open(url)
                src = res_temp.read()
                # Strip the JSONP wrapper "callback( ... );" down to raw JSON.
                jsonFile = src.split('(', 1)[1][:-2]
            except Exception:  # was a bare except: don't trap SystemExit/KeyboardInterrupt
                queue_sku_commentDetail.put(sku)
                proxy = random.sample(proxy_port, 1)[0]
                print(u'更换代理:%s:%s' % (proxy[0], proxy[1]))
                proxy = proxy[0] + ':' + proxy[1]
                proxyHandler = urllib2.ProxyHandler({'http': r'http://%s' % proxy})
                opener = urllib2.build_opener(cookies, proxyHandler)
                continue
            finally:
                # Close the response even when read()/split() raised (was leaked).
                if res_temp is not None:
                    res_temp.close()
            try:
                jsonFile = jsonFile.decode('GBK', 'ignore')
                jsonFile = json.loads(jsonFile)
            except Exception:
                continue
            # Total comment count -> number of 10-comment pages to enqueue.
            commentCount = jsonFile['productCommentSummary']['commentCount']
            # Ceiling division; '//' stays correct under Python 3 as well
            # (the old 'commentCount / 10' would yield a float there and break range()).
            pageCount = (commentCount + 9) // 10
            for i in range(1, pageCount):
                # BUG FIX: build the page URL in its own variable; the original
                # reassigned 'url', so the first-page comment rows below were
                # tagged with the LAST generated page URL instead of the p-0 page.
                page_url = ('http://s.club.jd.com/productpage/p-' + str(sku) + '-s-0-t-0-p-'
                            + str(i) + '.html?callback=fetchJSON_comment')
                queue_skuPageUrl_commentDetail.put((sku, page_url))

            # Extract the first page's comments.
            for item in jsonFile['comments']:
                # Not every comment carries a productColor field.
                productColor = item.get('productColor', '-')
                userClientShow = item['userClientShow']
                # userClientShow is an HTML fragment; keep only the tag's inner text.
                userClientShow = userClientShow.split('>', 1)[1].split('<', 1)[0] if userClientShow else '-'
                resultTemp = [sku, item['id'], item['guid'], item['content'], item['creationTime'],
                              item['referenceId'], item['referenceTime'], item['replyCount'], item['score'],
                              item['userLevelId'], item['userProvince'], productColor,
                              item['userLevelName'], userClientShow, item['isMobile'], url]
                queue_commentDetail_result.put(resultTemp)
# ==== Example #4 ====
    def getDetail(self):
        """Worker: crawl product detail pages, extract specs and shop scores.

        Pops urls from queue_for_InnerPageProdctDetail, fetches each page
        through a random HTTP proxy, extracts the third-party shop name, the
        score block, and the '#parameter2' spec list, and pushes one flat row
        per page onto queue_for_InnerPageProdctDetail_result.
        """
        proxy_port = myProxy.is_proxy_exists()
        proxy = random.sample(proxy_port, 1)[0]
        proxy = proxy[0] + ':' + proxy[1]
        print('=' * 5 + u'开始使用的代理:' + 'http://%s' % proxy)
        proxyHandler = urllib2.ProxyHandler({'http': r'http://%s' % proxy})
        # Handler classes may be passed to build_opener; it instantiates them.
        cookies = urllib2.HTTPCookieProcessor
        opener = urllib2.build_opener(cookies, proxyHandler)
        opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0')]

        # Spec-list label -> result field; replaces a 17-branch elif chain.
        # ('type' here is a dict key, so the builtin is no longer shadowed.)
        labelToField = {
            u'商品名称': 'commondityName', u'商品编号': 'commondityCode',
            u'上架时间': 'shelvesTime', u'商品毛重': 'goodsWeight',
            u'店铺': 'shopName', u'功能': 'function', u'类型': 'type',
            u'商品产地': 'originOfGoods', u'使用方式': 'usage',
            u'系统': 'system', u'货号': 'productNo', u'兼容性': 'compatibility',
            u'适用人群': 'applicableCrowd', u'适用对象': 'applicableCrowd',
            u'品牌': 'brand', u'理论续航': 'theoreticalEndurance', u'功率': 'rateOfWork',
        }
        # Output column order for the spec fields (same order as the original row).
        fieldOrder = ['commondityName', 'commondityCode', 'shelvesTime', 'goodsWeight',
                      'shopName', 'function', 'type', 'originOfGoods', 'usage', 'system',
                      'productNo', 'compatibility', 'applicableCrowd', 'brand',
                      'theoreticalEndurance', 'rateOfWork']

        while queue_for_InnerPageProdctDetail.qsize() > 0:
            print('=' * 15 + u'还剩下%s个待抓网址!' % queue_for_InnerPageProdctDetail.qsize())
            url = queue_for_InnerPageProdctDetail.get()
            res_temp = None
            try:
                res_temp = opener.open(url)
                src = res_temp.read()
                d = pq(src)
            except Exception:  # was a bare except: don't trap SystemExit/KeyboardInterrupt
                # BUG FIX: re-queue on the queue this worker consumes.  The
                # original pushed failed urls onto queue_for_url_target, so they
                # were never retried here (and broke that queue's (topic, url)
                # entry format).
                queue_for_InnerPageProdctDetail.put(url)
                proxy = random.sample(proxy_port, 1)[0]
                print(u'更换代理:%s:%s' % (proxy[0], proxy[1]))
                proxy = proxy[0] + ':' + proxy[1]
                proxyHandler = urllib2.ProxyHandler({'http': r'http://%s' % proxy})
                opener = urllib2.build_opener(cookies, proxyHandler)
                continue
            finally:
                # Close the response even when read()/pq() raised (was leaked).
                if res_temp is not None:
                    res_temp.close()

            # Third-party sellers expose a shop name; '-' otherwise.
            companyName = d.find('.text.J-shop-name').text() or '-'

            # Third-party seller score block; all '-' when absent.
            scoreSum = scoreProduct = scoreProductAvg = '-'
            scoreService = scoreServiceAvg = '-'
            scoreExpress = scoreExpressAvg = '-'
            # NOTE(review): my_text() is not a stock pyquery method -- it looks
            # like a project extension returning an indexable token sequence;
            # confirm against the helper's definition.
            scoreFrame = d.find('.score-infor>div').my_text()
            if scoreFrame:
                upDownFrame = [pq(icon).attr('class') for icon in d.find('.score-infor>div span i')]
                scoreSum = scoreFrame[0]
                scoreProduct = scoreFrame[3]
                scoreService = scoreFrame[6]
                scoreExpress = scoreFrame[9]
                # The 'up'/'down' icon class gives the sign of the vs-average delta.
                scoreProductAvg = scoreFrame[4] if upDownFrame[0] == 'up' else '-' + scoreFrame[4]
                scoreServiceAvg = scoreFrame[7] if upDownFrame[1] == 'up' else '-' + scoreFrame[7]
                scoreExpressAvg = scoreFrame[10] if upDownFrame[2] == 'up' else '-' + scoreFrame[10]

            # Product spec list: "label:value" items (full-width colon).
            info = dict((field, '-') for field in fieldOrder)
            for item in d.find('#parameter2>li'):
                parts = pq(item).text().split(u':')
                if len(parts) < 2:
                    # Robustness: skip malformed entries without a colon
                    # (the original raised IndexError here and killed the worker).
                    continue
                field = labelToField.get(parts[0])
                if field:
                    info[field] = parts[1]

            row = ([url] + [info[field] for field in fieldOrder]
                   + [scoreSum, scoreProduct, scoreProductAvg, scoreService, scoreServiceAvg,
                      scoreExpress, scoreExpressAvg, companyName])
            queue_for_InnerPageProdctDetail_result.put(row)