Example #1
    def parse(self, response):
        items = response.css('.gl-warp.clearfix > li.gl-item')
        for item in items:
            dic = JdItem()
            name = item.css('.p-name a i::text').extract_first()
            image = item.css(
                '.p-img a img::attr(source-data-lazy-img)').extract_first()
            price = item.css('.p-price i::text').extract_first()
            deal = item.css('.p-commit strong a::text').extract_first()
            shop = item.css('.p-shop a::text').extract_first()
            dic['name'] = name
            dic['image'] = image
            dic['price'] = price
            dic['deal'] = deal
            dic['shop'] = shop
            yield dic

        # for page in range(2,101):
        #     url = self.start_urls[0] + '&page=%s' % page
        #
        #     yield scrapy.Request(url=url, callback=self.parse)
        if self.page <= 100:
            self.page += 1
            yield scrapy.Request(self.url + str(self.page),
                                 callback=self.parse,
                                 dont_filter=True)
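
These snippets all assume a `JdItem` declared in the project's `items.py`. A minimal sketch covering only the fields Example #1 fills (the real definition likely declares more):

    import scrapy

    class JdItem(scrapy.Item):
        # Hypothetical declaration: just the fields used in Example #1
        name = scrapy.Field()
        image = scrapy.Field()
        price = scrapy.Field()
        deal = scrapy.Field()
        shop = scrapy.Field()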
Example #2
    def parse_item(self, response):
        # \d+? would match a single digit lazily; \d+ captures the whole page number
        reg_num = re.compile(r'page=(\d+)')
        num = re.search(reg_num, response.url).group(1)
        print("Paging: now on page %s" % num)

        gl_items = response.xpath('//li[@class="gl-item"]')

        base_price_url = 'https://p.3.cn/prices/mgets?callback=jQuery%s&skuIds=J_%s&pduid=%s'

        for gl_item in gl_items:
            item = JdItem()
            # Shop name
            item['jd_shop_name'] = gl_item.xpath('./div/div[@class="p-shop"]/@data-shop_name').extract_first()

            # Product id
            item['product_id'] = gl_item.xpath('.//div/@data-sku').extract_first()

            # Detail page URL
            item['jd_page_url'] = 'http://' + gl_item.xpath('.//div[@class="p-img"]/a/@href').extract_first()

            # Price URL for each product
            price_url = base_price_url % (item['product_id'], item['product_id'], random.randint(0, 1000000))

            # Product image URL
            jd_img_url = gl_item.xpath('.//img[@height="220"]/@src | .//img[@height="220"]/@data-lazy-img').extract_first()
            item['jd_img_url'] = 'https:' + jd_img_url

            yield Request(url=price_url, callback=self.parse_price, meta={'item': item})
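
The `parse_price` callback is not shown. A minimal sketch, assuming `p.3.cn/prices/mgets` answers with JSONP such as `jQuery123([{"id":"J_12345","p":"99.00","m":"199.00"}])` and treating `jd_price` as a hypothetical field on `JdItem`:

    def parse_price(self, response):
        # needs: import json, re (module level)
        # Strip the JSONP wrapper down to the JSON array it encloses
        body = re.search(r'\((\[.*\])\)', response.text, re.S).group(1)
        item = response.meta['item']
        item['jd_price'] = json.loads(body)[0].get('p')  # hypothetical field
        yield item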
Example #3
    def parse(self, response):
        selector = Selector(response)
        # the original predicates were missing the @ attribute marker
        lis = selector.xpath('//ul[@class="gl-warp clearfix"]/li')
        base_price_url = 'https://p.3.cn/prices/mgets?callback=jQuery%s&skuIds=J_%s'
        for li in lis:
            item = JdItem()

            item['name'] = li.xpath(
                'div/div[@class="p-img"]/a/@title').extract()[0]
            item['comment_num'] = li.xpath(
                'div/div[@class="p-commit"]/strong/a/text()').extract()[0]
            item['url'] = li.xpath(
                'div/div[@class="p-img"]/a/@href').extract()[0]
            item['info'] = li.xpath(
                'div/div[@class="p-name p-name-type-2"]/a/em/text()').extract(
                )[0]
            # data-sku identifies the product; without it the price URL below
            # would raise a KeyError
            item['product_id'] = li.xpath('div/@data-sku').extract_first()

            # this yields the price link for each SKU
            price_url = base_price_url % (
                item['product_id'], item['product_id'])

            # note: this probably should target a dedicated price callback
            # rather than recursing into parse
            yield Request(url=price_url,
                          callback=self.parse,
                          meta={
                              'item': item,
                              'dont_redirect': True,
                              'handle_httpstatus_list': [302]
                          })
Example #4
    def parse(self, response):

        products = response.xpath(
            '//div[@id="J_searchWrap"]//div[@id="J_goodsList"]//li')

        for product in products:

            item = JdItem()

            # note: this grabs the product link (a/@href), not the image source
            item['image'] = ''.join(
                product.xpath(
                    './/div[@class="p-img"]//a/@href').extract()).strip()
            item['price'] = ''.join(
                product.xpath('.//div[@class="p-price"]//i/text()').extract()
            ).strip()
            item['shop'] = ''.join(
                product.xpath('.//div[contains(@class, "shop")]//a/text()').
                extract()).strip()
            item['title'] = ''.join(
                product.xpath('.//div[contains(@class,"p-name")]//em//text()').
                extract()).strip()
            item['deal'] = ''.join(
                product.xpath('.//div[@class="p-commit"]//text()').extract()
            ).strip()

            yield item
Example #5
    def parse(self, response):
        # Grab each book's name and price from one page
        for sel in response.css('ul.gl-warp.clearfix > li.gl-item'):
            item = JdItem()
            name = sel.css('div.p-name').xpath('string(.//em)').extract_first()
            price = sel.css('div.p-price i::text').extract_first()
            try:
                remark = sel.xpath('.//div[(@class="p-commit" or @class="p-comm")]').xpath('string(.)').extract_first()
                if remark:
                    remark = remark.strip()
            except Exception:
                remark = None
            try:
                price = float(price)
            except (TypeError, ValueError):
                pass  # keep the raw value if it is not a number

            # Self-operated (JD direct)
            # shop = sel.css('div.p-shopnum span::text').extract_first()

            # Publisher
            publish = sel.css('div.p-shopnum a::text').extract_first()
            if publish is None:
                publish = sel.css('div.p-shop a::text').extract_first()
            # if shop is None:
            #     shop = sel.css('div.p-shopnum a::text').extract_first()
            #     publish = None

            item['name'] = name
            item['price'] = price
            item['remark'] = remark
            item['publish'] = publish
            # item['shop'] = shop
            yield item
Example #6
 def parse(self, response):
     # Instantiate the item
     item = JdItem()
     # Note: imgurls is a list, i.e. it can hold several images
     imgurls = response.css(".post img::attr(src)").extract()
     item['imgurl'] = imgurls
     yield item
Example #7
 def parse(self, response):
     for el in response.css('.gl-item'):
         yield JdItem(
             url=el.css('.p-name > a::attr("href")').extract_first(),
             name=el.css('.p-name > a::attr("title")').extract_first(),
             price=float(el.css('.p-price i::text').extract_first()),
         )
Example #8
 def parse(self, response):
     print("Crawling the page at %s" % response.url)
     self.page += 1
     # Get the <ul> that contains the products
     ul = response.xpath('//ul[contains(@class, "gl-warp")]/li')
     for li in ul:
         items = JdItem()
         items['shop_name'] = li.xpath(
             './div/div[@class="p-shop"]/@data-shop_name').extract_first()
         items['product_name'] = li.xpath(
             './div[1]/div[4]/a/em/text()').extract_first().strip()
         items['product_id'] = li.xpath(
             './div[@class="gl-i-wrap j-sku-item"]/@data-sku'
         ).extract_first()
         items['product_url'] = li.xpath(
             './div//div[@class="p-img"]/a/@href').extract_first()
         # Fetch the price from the price endpoint
         url = 'https://p.3.cn/prices/mgets?callback=jQuery1493916&skuIds=J_' + items[
             'product_id'] + '&pduid=%s' % (time.time())
         yield scrapy.Request(url=url,
                              callback=self.parse_price,
                              meta={"item": items})
     # Get the next page (only crawl 10 pages)
     if self.page < 10:
         try:
             next_url = response.xpath(
                 '//a[@class="pn-next"]/@href').extract_first()
             yield scrapy.Request(url='https://list.jd.com' + next_url,
                                  callback=self.parse)
         except Exception as e:
             print("Failed to get the next-page link >> %s" % e)
     else:
         print("Finished after crawling %s pages!" % self.page)
Example #9
    def parse(self, response):
        ids = []
        # with open('html_code.html', 'w', encoding='utf-8') as f:
        #     f.write(response.text)
        # Presale items carry class="gl-item gl-item-presell"
        data_list = response.xpath('//li[contains(@class,"gl-item")]')
        # Debug: check whether each page really holds 30 entries
        # print(len(data_list), self.page, response.url)

        for data in data_list:
            item = JdItem()
            # Multi-variant products expose data-pid, single ones data-sku
            item_id = data.xpath('./@data-pid').extract()
            if not item_id:
                item_id = data.xpath('./@data-sku').extract()

            item_name = data.xpath(
                './/div[contains(@class,"p-name")]//em/text()[1]').extract()
            item_price = data.xpath(
                './/div[@class="p-price"]//i/text()').extract()
            if item_id:
                item['item_id'] = item_id[0]
                # Collect the ids so the follow-up request can fetch the
                # hidden half of the page (ids was never filled in the original)
                ids.append(item_id[0])
                item_url = 'https://item.jd.com/' + item_id[0] + '.html'
                item['item_url'] = item_url
            if item_name:
                item['item_name'] = item_name[0]
            if item_price:
                item['item_price'] = item_price[0]
            # item['page'] = self.page
            yield item
        headers = {'referer': response.url}
        self.page += 1
        self.s += 30
        url = self.next_url % (self.keyword, self.page, self.s, ','.join(ids))
        yield scrapy.Request(url, callback=self.next_parse, headers=headers)
Example #10
    def next_parse(self, response):
        data_list = response.xpath('//li[contains(@class,"gl-item")]')

        # print(len(data_list), self.page, response.url)

        for data in data_list:
            item = JdItem()
            item_id = data.xpath('./@data-pid').extract()
            if not item_id:
                item_id = data.xpath('./@data-sku').extract()

            item_name = data.xpath(
                './/div[contains(@class,"p-name")]//em/text()[1]').extract()
            item_price = data.xpath(
                './/div[@class="p-price"]//i/text()').extract()
            if item_id:
                item['item_id'] = item_id[0]
                item_url = 'https://item.jd.com/' + item_id[0] + '.html'
                item['item_url'] = item_url
            if item_name:
                item['item_name'] = item_name[0]
            if item_price:
                item['item_price'] = item_price[0]
            # item['page'] = self.page
            yield item

        if self.page < 200:
            self.page += 1
            self.s += 30
            url = self.url % (self.keyword, self.page, self.s)
            yield scrapy.Request(url, callback=self.parse)


# scrapy.cmdline.execute(['scrapy', 'crawl', 'jd_scrapy_redis', '-o', 'wiki.csv', '-t', 'csv'])
Example #11
    def parse(self, response):

        contents = response.xpath(
            '//div[@id="J_goodsList"]//li[@class="gl-item"]')
        #contents = response.xpath('//i[@class="goods-icons4 J-picon-tips"]/text()')
        for content in contents:
            item = JdItem()
            #r = content.extract()
            eachs = content.xpath(
                './/i[@class="goods-icons4 J-picon-tips"]/text()')
            for each in eachs:
                result = each.extract()
                if result == self.flag:
                    item['price'] = content.xpath(
                        './/div[@class="p-price"]//i/text()').extract()[0]
                    name_pre = content.xpath(
                        './/div[@class="p-name"]//font/text()').extract()
                    if len(name_pre) > 0:
                        name_pre = name_pre[0]
                        item['name'] = name_pre + content.xpath(
                            './/div[@class="p-name p-name-type-2"]//em/text()'
                        ).extract()[0]
                    else:
                        item['name'] = content.xpath(
                            './/div[@class="p-name p-name-type-2"]//em/text()'
                        ).extract()[0]
                    yield item

        if self.page < 200:
            self.page += 1
            yield scrapy.Request(self.url + str(self.page),
                                 callback=self.parse)
Example #12
    def parse(self, response):
        products = response.xpath(
            './/div[@id="plist"]/ul[contains(@class, "gl-warp")]/li[@class="gl-item"]'
        )
        # print('products\t\t',products)
        for product in products:
            item = JdItem()
            item['name'] = product.xpath(
                './/div[@class="p-name"]/a/em/text()').extract_first().strip()
            item['url'] = 'https:' + product.xpath(
                './/div[@class="p-name"]/a/@href').extract_first()
            item['price'] = product.xpath(
                './/div[@class="p-price"]/strong[@class="J_price"][1]/i/text()'
            ).extract_first()
            item['comment'] = product.xpath(
                './/a[@class="comment"]/text()').extract_first()
            item['shop'] = product.xpath(
                './/div[@class="p-shop"]/span/a/text()').extract_first()
            # print(item)
            yield item

        next_url = response.xpath(
            './/a[@class="pn-next"]/@href').extract_first()
        if next_url is not None:
            url = response.urljoin(next_url)
            yield scrapy.Request(url=url, callback=self.parse)
Example #13
    def parse(self, response):
        sel = scrapy.Selector(response)
        # url = "https://item.jd.hk/18739277759.html"    # JD Worldwide URLs differ from regular ones such as "https://item.jd.com/4251335.html"
        goods_info = sel.xpath(".//div[@id='plist']/ul/li")
        for goods in goods_info:
            # extract()[0] would raise IndexError on a missing attribute,
            # so use extract_first() and test for None
            ProductID = goods.xpath(
                ".//div[@class='gl-i-wrap j-sku-item']/@data-sku"
            ).extract_first()  # product id
            if ProductID:
                goods_web = "https://item.jd.com/" + str(
                    ProductID) + ".html"  # product link; the page holds model, shop name, category, brand, etc.
                item = JdItem(ProductID=ProductID)
                request = scrapy.Request(url=goods_web,
                                         callback=self.goods,
                                         meta={'item': item},
                                         dont_filter=True)
                yield request
            else:
                print("ProductID empty in parse, nothing extracted")
        # # For testing:
        # ProductID = '1552845'
        # item = JdtestItem(ProductID=ProductID)
        # # url = "https://item.jd.hk/1971910764.html"
        # url = "https://item.jd.com/1552845.html"
        # request = scrapy.Request(url=url, callback=self.goods, meta={'item': item}, dont_filter=True)
        # yield request

        # Pagination
        next_page = sel.xpath(
            ".//div[@class='p-wrap']/span[@class='p-num']/a[@class='pn-next']/@href"
        ).extract()
        if next_page:
            next_url = "https://list.jd.com/" + next_page[0]
            yield scrapy.Request(next_url, callback=self.parse)
Example #14
 def parse(self, response):
     # Decode as GBK; the default utf-8 would garble the text
     result = json.loads(response.body.decode('GBK'))
     item = JdItem()
     if result['data']:
         # All the categories live under "data"
         datas = result["data"]
         # Top-level categories first
         for data in datas:
             b_cates = data['s'][0]
             b_cate = b_cates["n"]
             # print("Top-level category: {}".format(b_cate))
             # A helper splits a category entry into its URL and name
             item["b_cate_url"], item["b_cate_name"] = self.get_info_cate(
                 b_cate)
             # Mid-level categories sit one level down, next to "n"
             m_catess = b_cates['s']
             # A mid level holds many categories, so iterate over them
             for m_cates in m_catess:
                 m_cate = m_cates["n"]
                 # print("Mid-level category: {}".format(m_cate))
                 # Reuse the helper
                 item["m_cate_url"], item[
                     "m_cate_name"] = self.get_info_cate(m_cate)
                 # Sub-categories sit under the mid level, next to "n"
                 s_catess = m_cates['s']
                 # Iterate over the sub-categories
                 for s_cates in s_catess:
                     s_cate = s_cates["n"]
                     # print("Sub-category: {}".format(s_cate))
                     item["s_cate_url"], item[
                         "s_cate_name"] = self.get_info_cate(s_cate)
                     # print(item)
                     yield item
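
The `get_info_cate` helper is not shown in this example. A plausible sketch, assuming each category entry is a string of the form `url|name|...` and that the id-to-URL rules mirror common JD category crawlers (both are assumptions, not confirmed by the source):

 def get_info_cate(self, cate):
     # needs: import re (module level)
     # Assumed entry format: "url|name|..."; split on "|"
     url, name = cate.split('|')[0], cate.split('|')[1]
     if url.startswith('//'):               # protocol-relative link
         url = 'https:' + url
     elif re.match(r'^\d+-\d+$', url):      # assumed: two-part id -> channel page
         url = 'https://channel.jd.com/{}.html'.format(url)
     elif re.match(r'^\d+-\d+-\d+$', url):  # assumed: three-part id -> list page
         url = 'https://list.jd.com/list.html?cat={}'.format(url.replace('-', ','))
     return url, name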
Example #15
    def parse(self, response):
        # Product list
        products = response.xpath('//ul[@class="gl-warp clearfix"]/li')
        # Iterate over the list
        for product in products:
            item = JdItem()
            try:
                name = ''.join(
                    product.xpath(
                        './/div[@class="p-name p-name-type-2"]/a/em/text()').
                    extract()).strip().replace(' ', '')
            except Exception:
                name = ''
            try:
                price = product.xpath(
                    './/div[@class="p-price"]//i/text()').extract()[0]
            except Exception:
                price = ''

            try:
                store = product.xpath(
                    './/div[@class="p-shop"]//a/@title').extract()[0]
            except Exception:
                store = ''
            try:
                evaluate_num = product.xpath(
                    './/div[@class="p-commit"]/strong/a/text()').extract()[0]
            except Exception:
                evaluate_num = ''
            try:
                detail_url = product.xpath(
                    './/div[@class="p-name p-name-type-2"]/a/@href').extract(
                    )[0]
            except Exception:
                detail_url = ''
            try:
                # '自营' is the on-page badge for JD self-operated listings
                if product.xpath('.//div[@class="p-icons"]/i/text()').extract(
                )[0] == '自营':
                    support = '自营'
                else:
                    support = '非自营'
            except Exception:
                support = '非自营'

            item['name'] = name
            item['price'] = price
            item['store'] = store
            item['evaluate_num'] = evaluate_num
            item['detail_url'] = detail_url
            item['support'] = support
            yield item
            print(item)
        if self.page < 100:
            self.page += 1
            print(self.page)
            yield scrapy.Request(url=self.base_url,
                                 callback=self.parse,
                                 meta={'page': self.page},
                                 dont_filter=True)
Example #16
 def parse_shop(self, response):
     item = JdItem()
     sel = Selector(response)
     shop_title = sel.xpath('//div[@class="J-hove-wrap EDropdown fr"]/div/div/a/text()').extract_first()
     shop_url = 'https:' + sel.xpath('//div[@class="J-hove-wrap EDropdown fr"]/div/div/a/@href').extract_first()
     item['title'] = shop_title
     item['url'] = shop_url
     yield item
Example #17
    def parse_page(self, response):
        item_list = response.css('#plist > ul > li > div')
        data = []
        tmp_count = 0
        check_chat = 'https://club.jd.com/comment/productCommentSummaries.action?my=pinglun&referenceIds='
        for one in item_list:
            tmp_count += 1
            my_items = JdItem()
            my_items['SkuId'] = one.css('::attr(data-sku)').extract_first()
            my_items['img'] = one.css(
                'div.p-img > a::attr(href)').extract_first()
            my_items['name'] = one.css(
                'div.p-name > a > em::text').extract_first()
            my_items['url'] = one.css(
                'div.p-name > a::attr(href)').extract_first()
            # extract_first() returns None instead of raising, so the original
            # try/except always set is_jd = True; test the result instead
            tmp = one.css('div.p-icons.J-pro-icons > img::attr(data-tips)'
                          ).extract_first()
            my_items['is_jd'] = tmp is not None
            data.append(my_items)
            if tmp_count >= 30:
                check_chat = ','.join([
                    check_chat, my_items['SkuId'] +
                    '&callback=jQuery270940&_=1492343539522'
                ])
                yield scrapy.Request(url=check_chat,
                                     callback=self.parse_one,
                                     meta={
                                         'cookies': False,
                                         'data': data
                                     })
                check_chat = 'https://club.jd.com/comment/productCommentSummaries.action?my=pinglun&referenceIds='
                tmp_count = 0
                data = []
            else:
                if tmp_count == 1:
                    check_chat = ''.join([check_chat, my_items['SkuId']])
                else:
                    check_chat = ','.join([check_chat, my_items['SkuId']])

        next_page = response.css(
            '#J_bottomPage > span.p-num > a.pn-next::attr(href)'
        ).extract_first()
        if next_page:
            self.logger.info('next {}'.format(next_page))
            url = ''.join(['https://list.jd.com', next_page])
            yield scrapy.Request(url=url,
                                 callback=self.parse_page,
                                 meta={
                                     'authority': 'list.jd.com',
                                     'method': 'GET',
                                     'path': next_page,
                                     'scheme': 'https',
                                     'cookies': True
                                 })
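
The `parse_one` callback is not shown. A minimal sketch, assuming `productCommentSummaries.action` answers with JSONP of the form `jQuery270940({"CommentsCount": [{"SkuId": ..., "CommentCount": ...}, ...]})` and treating `comment_count` as a hypothetical item field:

    def parse_one(self, response):
        # needs: import json, re (module level)
        # Strip the JSONP wrapper down to the JSON object it encloses
        body = re.search(r'\((\{.*\})\)', response.text, re.S).group(1)
        summaries = json.loads(body).get('CommentsCount', [])
        counts = {str(c['SkuId']): c['CommentCount'] for c in summaries}
        for my_items in response.meta['data']:
            my_items['comment_count'] = counts.get(my_items['SkuId'])  # hypothetical field
            yield my_items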
Example #18
    def parse(self, response):
        comments_json = response.text[len('fetchJSON_comment98vv' + self.commentVersion + '('):-2]

        productCommentSummary = json.loads(comments_json).get('productCommentSummary')
        comments = json.loads(comments_json).get('comments')
        item = JdItem()

        item['goodRateShow'] = productCommentSummary.get('goodRateShow')
        item['generalRateShow'] = productCommentSummary.get('generalRateShow')
        item['poorRateShow'] = productCommentSummary.get('poorRateShow')
        item['commentCount'] = productCommentSummary.get('commentCount')
        item['productId'] = productCommentSummary.get('productId')
        item['referenceName'] = []
        item['referenceId'] = []
        item['content'] = []
        item['creationTime'] = []
        item['nickname'] = []
        item['userLevelName'] = []
        item['userClientShow'] = []
        item['id'] = []
        item['score'] = []
        item['guid'] = []

        for comment in comments:
            # Product name
            item['referenceName'].append(comment.get('referenceName'))
            # Product ID
            item['referenceId'].append(comment.get('referenceId'))
            # Comment text
            item['content'].append(comment.get('content'))

            # Comment time
            item['creationTime'].append(comment.get('creationTime'))
            # Commenter nickname
            item['nickname'].append(comment.get('nickname'))
            # Customer membership level
            item['userLevelName'].append(comment.get('userLevelName'))
            # Platform the purchase was made on
            item['userClientShow'].append(comment.get('userClientShow'))
            # User ID
            item['id'].append(comment.get('id'))
            # Score
            item['score'].append(comment.get('score'))
            # guid, e.g. "13cd40e8-93b0-4078-96e9-d33748566516"
            item['guid'].append(comment.get('guid'))
        yield item

        maxPage = json.loads(comments_json).get('productCommentSummary').get('commentCount')

        # Work out the number of comment pages (10 comments per page);
        # use integer division so page stays an int
        if maxPage % 10 == 0:
            page = maxPage // 10
        else:
            page = maxPage // 10 + 1
        # Request at most 50 pages of comments (the original computed page
        # but then hard-coded 50)
        for k in range(1, min(page, 50)):
            yield Request(self.generate_product_comment_url(self.commentVersion, self.productID, k), callback=self.parse)
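
The `generate_product_comment_url` helper is not shown. A sketch under the assumption that it builds one page of the `productPageComments.action` endpoint (the query shape is borrowed from Example #22):

    def generate_product_comment_url(self, comment_version, product_id, page):
        # One page of the comment endpoint, 10 comments per page
        return ('https://sclub.jd.com/comment/productPageComments.action'
                '?callback=fetchJSON_comment98vv{0}&productId={1}'
                '&score=0&sortType=5&page={2}&pageSize=10'.format(
                    comment_version, product_id, page))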
Example #19
    def parse(self, response):
        results = response.css('#plist ul.gl-warp li.gl-item')
        for result in results:
            item = JdItem()

            item['title'] = result.css('div.p-name em::text').extract_first()
            item['price'] = result.css('div.p-price i::text').extract_first()
            item['commit'] = result.css('div.p-name i::text').extract_first()
            item['img'] = result.css('div.p-img img::attr(src)').extract_first()
            item['shop'] = result.css('div.p-shop a::attr(title)').extract_first()
            yield item
Example #20
 def parse_detail(self, response):
     json_data = response.body.decode("GBK")
     print(json_data)
     item = JdItem()
     meta = response.meta
     item["title"] = meta["title"]
     item["merchant_name"] = meta["merchant_name"]
     item["price"] = meta["price"]
     item["commodity_id"] = meta["commodity_id"]
     item["comment_count"] = json_data
     yield item
Example #21
 def parse_comment(self, response):
     item = JdItem()
     meta = response.meta
     response_json = demjson.decode(txt=response.text, encoding='utf-8')
     if meta['page'] < int(response_json['maxPage']) and meta['page'] < 100:
         meta['page'] += 1
         url = u'='.join(response.url.split(u'=')[:-1]) + u'=' + str(meta['page'])
         yield Request(url, meta=meta, callback=self.parse_comment)
     for c in response_json['comments']:
         content = ''.join(c['content']).strip().replace(u'\n', u'').replace(u'\r', u'').encode('utf-8')
         item['info'] = '{} {}'.format(meta['score'], content)
         yield item
Example #22
 def parse_item(self, response):
     try:
         i = JdItem()
         thisurl = response.url
         pat = r'item\.jd\.com.*?(\d+)\.html'
         x = re.search(pat, thisurl)
         if x:
             thisid = re.compile(pat).findall(thisurl)[0]
             print(thisid)
             title = response.xpath('//html/head/title/text()').extract()
             shop = response.xpath(
                 '//div[@class="name"]/a/text()').extract()
             shoplink = response.xpath(
                 '//div[@class="name"]/a/@href').extract()
             priceurl = 'https://p.3.cn/prices/mgets?callback=jQuery8766554&type=1&area=1_72_4137_0&pdtk=&pduid=15048784180911725795195&pdpin=&pin=null&pdbp=0&skuIds=J_' + str(
                 thisid)
             commenturl = 'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv10&productId=' + str(
                 thisid
             ) + '&score=0&sortType=5&page=3&pageSize=10&isShadowSku=100001918171&rid=0&fold=1'
             #print(priceurl)
             #print(commenturl)
             # note: urllib blocks Scrapy's event loop here; acceptable for small runs
             pricedata = urllib.request.urlopen(priceurl).read().decode(
                 'utf-8', 'ignore')
             commentdata = urllib.request.urlopen(commenturl).read().decode(
                 'utf-8', 'ignore')
             pricepat = r'"p":"(.*?)"'
             commentpat = r'"goodRateShow":(.*?),'
             price = re.compile(pricepat).findall(pricedata)
             comment = re.compile(commentpat).findall(commentdata)
             if title and shop and shoplink and price and comment:
                 i['goodsid'] = thisid
                 i['title'] = title[0]
                 i['shop'] = shop[0]
                 i['goodslink'] = thisurl
                 i['shoplink'] = 'https:' + shoplink[0]
                 i['price'] = price[0]
                 i['goodrate'] = comment[0]
                 print(title[0])
                 print(thisurl)
                 print(shop[0])
                 print('https:' + shoplink[0])
                 print(price[0])
                 print(comment[0])
                 print('-----------')
         yield i
     except Exception as e:
         print(e)
Example #23
    def parse(self, response):
        """
        Crawl the first thirty products on a page; their data is rendered
        directly in the original page.
        :param response:
        :return:
        """
        ids = []
        x = 0
        for li in response.xpath('//*[@id="J_goodsList"]/ul/li'):
            x = x + 1
            item = JdItem()
            price = li.xpath('div/div/strong/i/text()').extract()  # price
            shop = li.xpath(
                'div/div[@class="p-shop"]/span/a/text()').extract()  # shop
            tags = li.xpath(
                'div/div[@class="p-icons"]/i/text()').extract()  # tags
            title = li.xpath('div/div/a/em/text()').extract()  # title

            id = li.xpath('@data-pid').extract()  # id

            ids.append(''.join(id))

            url = li.xpath('div/div[@class="p-name p-name-type-2"]/a/@href'
                           ).extract()  # link to follow

            item['title'] = ''.join(title)
            item['keyword'] = ''.join(self.keyword)
            item['shop'] = ''.join(shop)
            item['price'] = ''.join(price)
            item['tags'] = ''.join(tags)
            item['url'] = ''.join(url)

            if item['url'].startswith('//'):
                item['url'] = 'https:' + item['url']
                yield item

        print('JD crawl: ' + self.keyword + ', visible half collected ' +
              str(x) + ' items, Page = ' + str(self.page))

        if x < 1:
            self.crawler.engine.close_spider(self, 'All items crawled!')
        else:

            headers = {'referer': response.url}
            # Requests for the last thirty items check the referer, which must
            # be this page's real URL; a wrong referer redirects to
            # https://www.jd.com/?se=deny
            self.page += 1
            yield scrapy.Request(
                self.next_url %
                (self.keyword, self.keyword, self.page, ','.join(ids)),
                callback=self.next_parse,
                headers=headers)
Example #24
    def parse(self, response):
        products = response.xpath(
            '//div[@id="J_goodsList"]//li[@class="gl-item"]/div[contains(@class, "gl-i-wrap")]')
        
        print('-' * 20)
        print(len(products))

        for product in products:
            item = JdItem()
            item['price'] = ''.join(product.xpath('.//div[contains(@class, "p-price")]//i[1]/text()').extract()).strip()
            item['title'] = ''.join(product.xpath('.//div[contains(@class, "p-name")]//text()').extract()).strip()
            # item['shop'] = ''.join(product.xpath('.//div[contains(@class, "shop")]//text()').extract()).strip()
            # item['image'] = ''.join(product.xpath('.//div[@class="pic"]//img[contains(@class, "img")]/@data-src').extract()).strip()
            # item['deal'] = product.xpath('.//div[contains(@class, "deal-cnt")]//text()').extract_first()
            # item['location'] = product.xpath('.//div[contains(@class, "location")]//text()').extract_first()
            yield item
Example #25
 def parse(self, response):
     goods = response.xpath('//li[@class="gl-item"]')
     for good in goods:
         item1 = JdItem()
         gid = good.xpath('./@data-sku').extract_first()
         link = f'https://item.jd.com/{gid}.html'
         item1['good_ID'] = gid
         item1['title'] = good.xpath(
             'string(.//div[@class="p-name p-name-type-2"]//em)'
         ).extract_first()
         item1['link'] = link
         item1['price'] = good.xpath(
             './/div[@class="p-price"]//i//text()').extract_first()
         yield scrapy.Request(url=link,
                              meta={'item': item1},
                              callback=self.parse_detail)
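
The `parse_detail` callback is not included. A minimal sketch, assuming the detail page exposes the shop name under `div.name` (the selector used in Examples #22 and #28) and that `shop` is a field on `JdItem`:

 def parse_detail(self, response):
     item1 = response.meta['item']
     # assumed selector and field; adjust to the real JdItem definition
     item1['shop'] = response.xpath(
         '//div[@class="name"]/a/text()').extract_first()
     yield item1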
Example #26
    def parse_item(self, response):
        res = response
        c = response.meta
        item = JdItem()
        bs4_response = BeautifulSoup(
            res.text.encode('gbk', 'ignore').decode('gbk',
                                                    errors='ignore').replace(
                                                        '&nbsp;', ''), 'lxml')
        jd = bs4_response.find("div", {
            "class": "job_part"
        }).find("div", {
            "class": "job_detail"
        }).get_text()
        # The fourth positional argument of re.sub is count, not flags,
        # so re.S must not be passed there
        jd = re.sub(r'\n+', '\n', jd).strip("\n").strip()
        try:
            r = c['cate']
            mkdir(self.FolderName + '/' + r)
            item['jd'] = re.sub(r'\s+', '', jd)
            item['job_title'] = bs4_response.find("div", {
                "class": "new_job_name"
            }).get_text().strip()
            # print(item['jd'])
            global cn
            cn += 1
            print(cn)

            path = str(r) + '/' + str(cn) + '.txt'
            with open(self.FolderName + '/' + str(r) + '/' + str(cn) + '.txt',
                      'a',
                      encoding="utf-8") as fw:
                fw.write(item['jd'])
            print(item['jd'])
            JT = [item['job_title'], path]
            csv_writer = csv.writer(self.out, dialect='excel')
            csv_writer.writerow(JT)
            '''
            with open(str(cn) + '.txt', 'a', encoding='utf-8') as fwe:
                fwe.write(item['jd'])
            '''
            if '@' in item['jd']:
                with open(str(cn) + '.txt', 'a', encoding='utf-8') as fwe:
                    fwe.write(item['jd'])
            yield item
        except Exception:
            print('Job posting crawl failed..')
            print('Failed at page:', self.PAGES)
Example #27
 def parse(self, response):
     # the class attribute contains a space, not a dot (CSS-selector leak)
     goodslist = response.xpath('//ul[@class="gl-warp clearfix"]//li')
     # the original tested the builtin "list", which is always truthy
     if goodslist:
         for onegoods in goodslist:
             item = JdItem()
             item['price'] = ''.join(
                 onegoods.xpath('.//div[@class="p-price"]/strong/text()').
                 extract()).strip()
             item['title'] = ''.join(
                 onegoods.xpath('.//div[@class="p-name"]//em/text()').
                 extract()).strip()
             item['comment'] = ''.join(
                 onegoods.xpath('.//div[@class="p-commit"]/strong/text()').
                 extract()).strip()
             item['boss'] = ''.join(
                 onegoods.xpath('.//div[@class="p-shopnum"]/a/text()').
                 extract()).strip()
             yield item
Example #28
    def parse_item(self, response):
        try:
            item = JdItem()
            thisurl = response.url
            pat = r"item\.jd\.com/(.*?)\.html"
            x = re.search(pat, thisurl)
            if x:
                # Get the product ID
                thisid = re.compile(pat).findall(thisurl)[0]

                # Get the product title
                item['title'] = response.xpath(
                    "//div[@class='sku-name']/text()").extract()
                # Get the shop name
                item['shop'] = response.xpath(
                    "//div[@class='name']/a/text()").extract()
                # Shop link
                item['shoplink'] = response.xpath(
                    "//div[@class='name']/a/@href").extract()

                # Price API URL
                priceurl = "https://p.3.cn/prices/get?type=1&area=1_72_2799&pdtk=&pduid=1888909243&pdpin=&pdbp=0&skuid=J_" + str(
                    thisid) + "&callback=cnp"

                # Fetch the price
                pricedata = urllib.request.urlopen(priceurl).read().decode(
                    "utf-8", "ignore")
                pricepat = r'"p":"(.*?)"'
                item['price'] = re.compile(pricepat).findall(pricedata)

                # Comment API URL
                commenturl = "https://sclub.jd.com/comment/productPageComments.action?productId=" + str(
                    thisid
                ) + "&score=0&sortType=3&page=0&pageSize=10&callback=fetchJSON_comment98vv4956"

                # Fetch the comments
                commentdata = urllib.request.urlopen(commenturl).read().decode(
                    "utf-8", "ignore")
                commentpat = r'goodRateShow":(.*?),'
                item['comment'] = re.compile(commentpat).findall(commentdata)
                yield item

        except Exception as e:
            print(e)
Example #29
 def parse_detail(self, response):
     sel = response
     pageindex = response.meta['pageindex']
     print(
         "-------------------------------response-----------------------------------"
     )
     price = sel.xpath(
         '*//div[@id="J_goodsList"]//div[@class="p-price"]//i/text()'
     ).extract()
     name = sel.xpath(
         '*//div[@id="J_goodsList"]//div[@class="p-name p-name-type-3"]//em/text()'
     ).extract()
     shopid = sel.xpath(
         '*//div[@id="J_goodsList"]//div[@class="p-shop"]').extract()
     picture = sel.xpath(
         '*//div[@id="J_goodsList"]//div[@class="p-img"]//img/@data-lazy-img'
     ).extract()
     productId = sel.xpath(
         '*//div[@id="J_goodsList"]//li[@class="gl-item"]/@data-sku'
     ).extract()
     for i in range(0, len(price)):
         item = JdItem()
         item["pageindex"] = pageindex
         item['index'] = str(i + 1)
         item['ok'] = 'False'
         item['价格'] = price[i]
         item['品牌'] = name[i]
         shop = re.compile(r'title="(.+?)">').findall(shopid[i])
         if not shop:
             shop = ""
         else:
             shop = shop[0]
         item['店铺'] = shop
         item['图片'] = picture[i]
         item['ID'] = productId[i]
         item['page_url'] = response.url
         item['url'] = "https://item.jd.com/" + item['ID'] + ".html"
         json_url = "https://club.jd.com/comment/productCommentSummaries.action?referenceIds=" + item[
             "ID"]
         yield Request(url=json_url,
                       callback=self.parse_json,
                       meta={'item': item},
                       headers=self.default_headers)
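
The `parse_json` callback is not shown. A minimal sketch, assuming the summary endpoint (queried without a `callback` parameter) returns plain JSON with a `CommentsCount` array, and treating `CommentCount` as a hypothetical item field:

 def parse_json(self, response):
     # needs: import json (module level)
     item = response.meta['item']
     summary = json.loads(response.text).get('CommentsCount', [{}])[0]
     item['ok'] = 'True'
     item['CommentCount'] = summary.get('CommentCount')  # hypothetical field
     yield item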
Example #30
    def next_parse(self, response):
        """
        Crawl the last thirty products of a page; their data lives at a
        special URL built from the ids of the first thirty products.
        :param response:
        :return:
        """
        y = 0
        for li in response.xpath('//li[@class="gl-item"]'):
            y = y + 1
            item = JdItem()
            price = li.xpath('div/div/strong/i/text()').extract()  # price
            shop = li.xpath(
                'div/div[@class="p-shop"]/span/a/text()').extract()  # shop
            tags = li.xpath(
                'div/div[@class="p-icons"]/i/text()').extract()  # tags
            title = li.xpath('div/div/a/em/text()').extract()  # title
            url = li.xpath('div/div[@class="p-name p-name-type-2"]/a/@href'
                           ).extract()  # link to follow

            item['title'] = ''.join(title)
            item['keyword'] = ''.join(self.keyword)
            item['shop'] = ''.join(shop)
            item['price'] = ''.join(price)
            item['tags'] = ''.join(tags)
            item['url'] = ''.join(url)

            if item['url'].startswith('//'):
                item['url'] = 'https:' + item['url']
                yield item
            # print(item)
        print('JD crawl: ' + self.keyword + ', hidden half collected ' +
              str(y) + ' items, Page = ' + str(self.page))
        if y < 1:
            self.crawler.engine.close_spider(self, 'All items crawled!')
        else:
            # if self.page < 200:
            self.page += 1
            yield scrapy.Request(self.url %
                                 (self.keyword, self.keyword, self.page),
                                 callback=self.parse)