Exemplo n.º 1
0
    def parse(self, response):
        """Yield one item per product card on a JD search-result page.

        Reads the search keyword from ``response.meta`` and extracts id,
        price, title, promo words, link, comment count, shop and image
        URL for every ``li.gl-item`` card.
        """
        goods = response.css('ul li.gl-item')
        # self.logger.debug(goods.extract())
        for good in goods:
            # BUG FIX: the original created one item outside the loop and
            # mutated/yielded it repeatedly, so every yielded reference
            # pointed at the same object (ending up with the last
            # product's data). Create a fresh item per product.
            item = JingdongItem()
            item['keyword'] = response.meta['keyword']
            item['id'] = good.xpath('@data-sku').extract_first()
            item['price'] = good.xpath(
                'div[@class = "gl-i-wrap"]/div[@class = "p-price"]/strong/i/text()'
            ).extract_first()
            item['title'] = good.xpath(
                'div[@class = "gl-i-wrap"]/div[@class = "p-name p-name-type-2"]/a/em/text()'
            ).extract_first()
            item['promo_words'] = good.xpath(
                'div[@class = "gl-i-wrap"]/div[@class = "p-name p-name-type-2"]/a/i/text()'
            ).extract_first()
            item['href'] = good.xpath(
                'div[@class = "gl-i-wrap"]/div[@class = "p-img"]/a/@href'
            ).extract_first()
            item['comment_num'] = good.xpath(
                'div[@class = "gl-i-wrap"]/div[@class = "p-commit"]/strong/a/text()'
            ).extract_first()
            item['shop'] = good.xpath(
                'div[@class = "gl-i-wrap"]/div[@class = "p-shop"]/span/a/text()'
            ).extract_first()
            item['img'] = good.xpath(
                'div[@class = "gl-i-wrap"]/div[@class = "p-img"]/a/img/@source-data-lazy-img'
            ).extract_first()
            yield item
Exemplo n.º 2
0
    def parse(self, response):
        """Parse a JD product detail page.

        Builds a filesystem-safe name from the product title, extracts
        the introduction parameters, then either follows each style
        variant (``#choose-attr-1``) into ``detailpage`` or yields an
        item directly for single-style products. Finally requests the
        image-comment JSON for this product.
        """
        print(response.text)
        good_id = re.findall(r'/(\d+).html', response.url)[0]
        print('进入详情页的id:', good_id)
        # Product name; for some pages the first text node is empty, so
        # take the last one.
        filename = response.xpath(
            '//div[contains(@class,"itemInfo-wrap")]/div[contains(@class,"sku-name")]/text()'
        ).extract()[-1].strip()
        # Sanitize characters that are illegal/awkward in file names.
        for bad, repl in (('/', '-'), (':', ''), ('?', ''), ('|', '_')):
            filename = filename.replace(bad, repl)
        intruduce = response.xpath(
            '//div[@id="detail"]//div[contains(@class,"p-parameter")]/ul[contains(@class,"parameter2")]/li/text()'
        ).extract()
        intruduce = self.deal_intruduce(intruduce)
        pic_ids = response.xpath(
            '//div[@id="choose-attr-1"]/div[contains(@class,"dd")]/div/@data-sku'
        ).extract()
        pic_names = response.xpath(
            '//div[@id="choose-attr-1"]/div[contains(@class,"dd")]/div/a/img/@alt'
        ).extract()
        if pic_ids:
            # Multi-style product: crawl each style's own detail page.
            for pic_id, pic_name in zip(pic_ids, pic_names):
                url = "https://item.jd.com/" + pic_id + ".html"
                print(pic_id, pic_name)
                yield scrapy.Request(url,
                                     callback=self.detailpage,
                                     meta={
                                         'filename': filename,
                                         'intruduce': intruduce,
                                         'name': pic_name,
                                         'good_id': pic_id
                                     },
                                     dont_filter=True)
        else:
            item = JingdongItem()
            item['filename'] = filename
            # BUG FIX: ``intruduce`` was already passed through
            # deal_intruduce above; the original applied it a second
            # time here.
            item['intruduce'] = intruduce
            img_urls1 = response.xpath(
                '//div[@id="spec-list"]/ul/li/img/@src').extract()
            item['img_urls'] = self.deal_img(img_urls1)  # all images of this style
            item['good_id'] = good_id
            item['img_name'] = item['filename'][-10:]
            yield item

        # Image-comment listing for this product.
        url = "https://club.jd.com/discussion/getProductPageImageCommentList.action?productId=" + good_id + "&isShadowSku=0&page=1&pageSize=10&_=" + str(
            time.time() * 1000)[:-4]
        yield scrapy.Request(url,
                             callback=self.pinglun,
                             meta={
                                 'filename': filename,
                                 'good_id': good_id,
                             },
                             dont_filter=True)
Exemplo n.º 3
0
 def parse(self, response):
     """Extract product name and shop-name lists from a JD list page.

     Returns a single item whose fields hold *lists* of values for the
     whole page (one entry per product card).
     """
     item = JingdongItem()
     # data-sku ids of every product card; kept for future price/comment
     # endpoint lookups (the original had that logic commented out).
     sku_ids = response.xpath(
         '//div[@class="gl-i-wrap j-sku-item"]/@data-sku').extract()
     item['name'] = response.xpath(
         '//a[@target="_blank"][@title=""]/em/text()').extract()
     item['shopname'] = response.xpath(
         '//div[@class="p-shop"]/@data-shop_name').extract()
     print(item)
     return item
Exemplo n.º 4
0
    def next(self, response):
        """Scrape title/shop/price/comments from a JD product page.

        NOTE(review): Python 2 code (print statements, urllib2, str/unicode
        encode calls). Fetches the price and comment JSON endpoints
        synchronously with urllib2 instead of scheduling scrapy Requests.
        """
        item = JingdongItem()
        #html = urllib2.urlopen(response.url).read()
        #tree = lxml.html.fromstring(html)
        #item["title"] = tree.cssselect("[@class='tb-main-title']/@data-title")

        item["title"] = response.xpath('//div[@class="sku-name"]/text()').extract()[0].encode('utf-8').strip()
        item["link"] = response.url
        item['shop'] = response.xpath('//div[@class="name"]/a/text()').extract()[0].encode('utf-8').strip()
        item['shopLink'] = 'https:' + response.xpath('//div[@class="name"]/a/@href').extract()[0]
        item['compositeScore'] = response.xpath('//em[@class="evaluate-grade"]/span/a/text()').extract()[0]

        # assumes a URL like https://item.jd.com/<sku>.html — the fourth
        # path segment minus ".html" is the sku id; TODO confirm.
        tdata = response.url.split('/')
        skuids = tdata[3][:-5]
        purl = 'https://p.3.cn/prices/mgets?skuIds=J_' + skuids
        pricedata = urllib2.urlopen(purl).read()
        jdata = json.loads(pricedata)
        item["price"] = jdata[0]["p"]
        commenturl = "http://club.jd.com/comment/productPageComments.action?productId={}&score=0&sortType=5&page=0&pageSize=10".format(skuids)
        commentdata = urllib2.urlopen(commenturl).read().decode("GBK", "ignore")
        # Pull each review's raw "content" fragment out of the JSON text.
        tempdata = re.findall('("content":".*?),"isTop"', commentdata)
        item['commentdata'] = ""
        for data in tempdata:
            item['commentdata'] += data.encode('utf-8')

        print item["title"]
        print item["link"]
        print item["shop"]
        print item["shopLink"]
        print item["compositeScore"]
        print item["price"]
        print item["commentdata"]

        yield item
Exemplo n.º 5
0
    def next_parse(self, response):
        """Yield one item per <li> product on an ajax-loaded result page.

        A product missing any mandatory field raises inside the try
        block and is silently skipped (promoted goods on the page carry
        no price/comment data).
        """
        for product in response.xpath('/html/body/li'):
            item = JingdongItem()
            try:
                # string(.) flattens the nested markup of the title <em>.
                title_node = product.xpath('div/div/a/em')
                item['title'] = title_node.xpath('string(.)').extract()[0]
                item['comment_count'] = product.xpath(
                    'div/div[@class="p-commit"]/strong/a/text()').extract()[0]
                item['goods_url'] = 'http:' + product.xpath(
                    'div/div[4]/a/@href').extract()[0]
                item['shops_id'] = product.xpath(
                    'div/div[@class="p-shop"]/@data-shopid').extract()[0]
                item['shop_url'] = self.shop_url.format(
                    shop_id=item['shops_id'])
                goods_id = product.xpath(
                    'div/div[2]/div/ul/li[1]/a/img/@data-sku').extract()[0]
                if goods_id:
                    item['goods_id'] = goods_id
                # Promo items have no inline price; leave the field unset.
                price = product.xpath('div/div[3]/strong/i/text()').extract()
                if price:
                    item['price'] = price[0]
                yield item
            except Exception:
                pass
Exemplo n.º 6
0
    def parse(self, response):
        """Walk the first half of a search-result page, queue a comment
        request per product, then request the ajax-loaded second half."""
        pids = []
        for card in response.xpath('//*[@id="J_goodsList"]/ul/li/div'):
            item = JingdongItem()
            pid = ''.join(card.xpath('@data-pid').extract())
            pids.append(pid)

            item['pid'] = pid
            item['price'] = ''.join(
                card.xpath('div[3]/strong/i/text()').extract())
            item['store'] = ''.join(
                card.xpath('div[7]/span/a/text()').extract())
            item['url'] = ''.join(card.xpath(
                'div[@class="p-name p-name-type-2"]/a/@href').extract())

            if item['url'].startswith('//'):
                item['url'] = 'https:' + item['url']
            elif not item['url'].startswith('https:'):
                # Ad/redirect links: emit the bare item, skip details.
                item['info'] = None
                yield item
                continue

            yield scrapy.Request(self.comment_url % pid,
                                 callback=self.comment_parse,
                                 meta={"item": item})

        # The bottom half of the page loads via ajax; referer required.
        yield scrapy.Request(self.next_url %
                             (self.key_word, self.page, ','.join(pids)),
                             callback=self.next_parse,
                             headers={'referer': response.url})
Exemplo n.º 7
0
    def getitemInfo(self, response):
        """Extract title/id/parameter list from a product page, then
        chain a request to the price JSON endpoint.

        Any extraction error is reported and swallowed (best effort).
        """
        item = JingdongItem()

        init_priceUrl = 'https://p.3.cn/prices/mgets?skuIds=J_{}'
        # Raw string: the original '(\d+)' relied on an unrecognized
        # escape sequence (SyntaxWarning on modern Python).
        itemid = re.findall(r'(\d+)', response.url)[0]

        try:
            item['title'] = "".join(
                response.xpath(
                    '//div[@class="sku-name"]/text()').extract()).replace(
                        ' ', '').replace('\r', '').replace('\n', '')
            item['goods_url'] = response.url
            item['goods_id'] = itemid
            item['itemDetail'] = ";".join(
                response.xpath(
                    '//div[@class="p-parameter"]/ul[@class="parameter2 p-parameter-list"]/li/text()'
                ).extract())

            yield scrapy.Request(url=init_priceUrl.format(itemid),
                                 meta={'item': item},
                                 callback=self.getitemPrice,
                                 dont_filter=True)
        except Exception:
            print('没有基础数据')
Exemplo n.º 8
0
    def next_parse(self, response):
        """Parse the ajax half of a result page, then queue page+2."""
        for entry in response.xpath('//li[@class="gl-item"]'):
            item = JingdongItem()
            joined_pid = ''.join(entry.xpath('@data-pid').extract())

            item['pid'] = joined_pid
            item['price'] = ''.join(
                entry.xpath('div/div/strong/i/text()').extract())
            item['store'] = ''.join(
                entry.xpath('div/div/span/a/text()').extract())
            item['url'] = ''.join(entry.xpath(
                'div/div[@class="p-name p-name-type-2"]/a/@href').extract())

            if item['url'].startswith('//'):
                item['url'] = 'https:' + item['url']
            elif not item['url'].startswith('https:'):
                # Non-https links are ads/redirects: yield as-is.
                item['info'] = None
                yield item
                continue

            yield scrapy.Request(self.comment_url % joined_pid,
                                 callback=self.comment_parse,
                                 meta={"item": item})

        # Pagination: odd page numbers only, capped at 200.
        if self.page < 200:
            self.page += 2
            yield scrapy.Request(self.url %
                                 (self.key_word, self.key_word, self.page),
                                 callback=self.parse)
Exemplo n.º 9
0
    def parse(self, response):
        """Yield one item per product card in #J_goodsList.

        A card missing any mandatory field raises inside the try block
        and is skipped silently.
        """
        for card in response.xpath('//div[@id="J_goodsList"]/ul/li'):
            item = JingdongItem()
            try:
                # string(.) flattens the nested <em> markup into a title.
                item['title'] = card.xpath('div/div/a/em').xpath(
                    'string(.)').extract()[0]
                item['comment_count'] = card.xpath(
                    'div/div[@class="p-commit"]/strong/a/text()').extract()[0]
                item['goods_url'] = 'http:' + card.xpath(
                    'div/div[4]/a/@href').extract()[0]
                item['shop_url'] = 'http:' + card.xpath(
                    'div/div[7]/span/a/@href').extract()[0]
                item['shops_id'] = self.find_shop_id(item['shop_url'])
                goods_id = card.xpath(
                    'div/div[2]/div/ul/li[1]/a/img/@data-sku').extract()[0]
                if goods_id:
                    item['goods_id'] = goods_id
                # Promoted items carry no inline price; they keep the
                # field unset.
                price = card.xpath('div/div[3]/strong/i/text()').extract()
                if price:
                    item['price'] = float(price[0])
                yield item
            except Exception:
                pass
Exemplo n.º 10
0
    def parse(self, response):
        """Parse a JD book list page.

        Extracts the channel breadcrumb plus one item per seller row;
        price and comment counts are fetched synchronously from the JSON
        endpoints via urllib.
        """
        listdata = response.body.decode('utf-8', 'ignore')
        # Channel breadcrumb (two levels); pad with a placeholder when
        # fewer than two <span class="curr"> nodes exist.
        pd = response.xpath('//span[@class="curr"]/text()').extract()
        if len(pd) == 0:
            pd = ['缺省', '缺省']
        if len(pd) == 1:
            pd = [pd[0], '缺省']
        pd1 = pd[0]
        pd2 = pd[1]
        # Book titles
        bookname = response.xpath('//div[@class="p-name"]/a/em/text()').extract()
        # All data-sku ids straight from the raw HTML.
        allsku = re.compile(r'<a data-sku="(.*?)"').findall(listdata)
        # Author / publisher / seller columns.
        author = response.xpath('//span[@class="author_type_1"]/a/@title').extract()
        pub = response.xpath('//span[@class="p-bi-store"]/a/@title').extract()
        seller = response.xpath('//span[@class="curr-shop"]/text()').extract()
        # Hoisted out of the loop: compile once, not per book.
        pricepat = re.compile(r'"p":"(.*?)"')
        commentpat = re.compile(r'"CommentCount":(.*?),')
        for n in range(0, len(seller)):
            # BUG FIX: a fresh item per row — the original reused one
            # mutable item across yields.
            item = JingdongItem()
            # assumes the first three title nodes are page chrome — TODO confirm
            name = bookname[n + 3]
            thissku = allsku[n]
            priceurl = 'https://p.3.cn/prices/mgets?callback=jQuery7839616&type=1&skuIds=J_' + str(thissku)
            pricedata = urllib.request.urlopen(priceurl).read().decode('utf-8', 'ignore')
            price = pricepat.findall(pricedata)[0]

            commenturl = 'https://club.jd.com/comment/productCommentSummaries.action?my=pinglun&referenceIds=' + str(thissku) + '&callback=jQuery8841347'
            commentdata = urllib.request.urlopen(commenturl).read().decode('utf-8', 'ignore')
            commentnum = commentpat.findall(commentdata)[0]
            thisauthor = author[n]
            thispub = pub[n]
            thisseller = seller[n]
            print(pd1)
            print(pd2)
            print(name)
            print(price)
            print(commentnum)
            print(thisauthor)
            print(thispub)
            print(thisseller)
            item['channel1'] = pd1
            item['channel2'] = pd2
            # BUG FIX: the original stored the whole ``bookname`` list
            # here instead of this row's computed ``name``.
            item['name'] = name
            item['comment_num'] = commentnum
            item['author'] = thisauthor
            item['pub'] = thispub
            item['seller'] = thisseller
            # NOTE(review): ``price`` is fetched and printed but never
            # stored on the item — confirm whether JingdongItem has a
            # price field before adding it.
            yield item
Exemplo n.º 11
0
 def coupon_detail(self, response):
     """Yield one item per coupon-title list on the coupon page."""
     # NOTE(review): collected but never followed — dead pagination link?
     next_page = response.xpath(
         '/html/body/div[5]/div/div[2]/div[2]/div/div[2]/div/a[9]//@href'
     ).extract()
     for sel in response.xpath('//*[@id="coupons-list"]'):
         coupon = JingdongItem()
         # Absolute // path: matches titles document-wide, not only
         # under ``sel`` — presumably intentional; verify against page.
         coupon['name'] = sel.xpath('//div[1]/div[2]/div[1]/p//@title').extract()
         yield coupon
Exemplo n.º 12
0
 def parse(self, response):
     """Extract name/price/shop for each product under div#plist."""
     for product in response.css('div#plist ul li'):
         entry = JingdongItem()
         entry['name'] = product.css('div.p-name a em::text').extract_first()
         entry['price'] = product.css(
             'div.p-price strong i::text').extract_first()
         entry['shop'] = product.css('div.p-shop a::text').extract_first()
         yield entry
Exemplo n.º 13
0
    def parse(self, response):
        """Parse the first 30 products of a search page, then request the
        ajax-loaded remainder once 30 pids have been collected.

        NOTE(review): Python 2 code (print statement at the shop_link line).
        """
        # Collects every pid on the page; needed to build the ajax URL.
        pid_list = []
        i = 1
        phone_list = response.xpath('//li[@data-sku]')
        # Last page number, forwarded in meta for downstream pagination.
        end_page = response.xpath(
            "//span[@class='p-skip']//b/text()").extract()

        for item in phone_list:
            jd = JingdongItem()
            # Extract the item fields below.

            # PID
            jd['pid'] = item.xpath('./@data-pid').extract_first()
            # Product image link
            jd['image_link'] = self._get_phone_image(item)
            # Price
            jd['price'] = item.xpath(
                ".//div[@class='p-price']//i/text()").extract_first()
            # Title
            jd['title'] = item.xpath(
                ".//div[@class='p-name p-name-type-2']//em/text()").extract()
            # Comment count
            jd['comment_num'] = item.xpath(
                ".//div[@class='p-commit']/strong/a/text()").extract_first()
            # Shop name (None for ad placements)
            jd['shop_name'] = item.xpath(
                ".//div[@class='p-shop']/span/a/text()").extract_first()
            # Shop link
            jd['shop_link'] = item.xpath(
                ".//div[@class='p-shop']/span/a/@href").extract_first()
            print '*' * 30, jd['shop_link']
            # Second-hand link (None when absent)
            jd['second_link'] = item.xpath(
                ".//div[@class='p-commit']/a/@href").extract_first()
            # Ad flag
            jd['ad'] = item.xpath(
                './/span[@class="p-promo-flag"]/text()').extract_first()

            # Join the pids collected so far for the ajax request.
            # NOTE(review): the [:-1] slice chops the last character of the
            # final pid, not a trailing comma (','.join adds none) — looks
            # like a bug; confirm against the ajax endpoint's format.
            pid_list.append(jd['pid'])
            self.show_items = ','.join(pid_list)[:-1]
            logger.info(i)
            yield jd
            i += 1

            # Once 30 pids are collected, request the page's remaining
            # 30 lazy-loaded entries.
            if len(pid_list) == 30:
                self.search_page = self.page + 1
                yield scrapy.Request(self.next_url.format(
                    self.search_page, self.show_items),
                                     callback=self.parse_other_info,
                                     meta={'end_page': end_page})
Exemplo n.º 14
0
 def parse(self, response):
     """Yield img_src/price for each product card in #J_goodsList.

     BUG FIX: ``extract_first()`` returns None for a missing node, and
     the original passed that straight to ``''.join()`` (TypeError).
     Missing values now fall back to ''.
     """
     li_list = response.selector.xpath('//div[@id="J_goodsList"]/ul//li')
     for l_list in li_list:
         goods = JingdongItem()
         goods['img_src'] = l_list.xpath(
             './/div[@class="p-img"]/a/img/@src').extract_first() or ''
         goods['price'] = l_list.xpath(
             './/div[@class="p-price"]//i/text()').extract_first() or ''
         yield goods
Exemplo n.º 15
0
 def parsefirstPage(self, response):
     """Queue a detail request per product link, then queue the next page.

     BUG FIX: the pagination block was indented inside the per-product
     loop, so ``self.page`` was bumped by 2 for *every* product on the
     page, skipping most pages. It now runs once per page.
     """
     infos = response.xpath('//li[@class="gl-item"]/div/div[@class="p-img"]/a')
     for info in infos:
         item = JingdongItem()
         url = info.xpath('@href').extract()
         goods_link = response.urljoin(url[0])
         item['link'] = goods_link  # product link
         for link in url:
             detail_url = response.urljoin(link)
             yield Request(detail_url, meta={'meta': item}, callback=self.parsegoods)
     if self.page < 200:
         self.page += 2  # next page (odd page numbers only)
         yield scrapy.Request(self.url % (self.page), callback=self.parsefirstPage)
Exemplo n.º 16
0
    def parse_item(self, response):
        """Print title/shop/price for a product page (debug-style output).

        NOTE(review): the returned item ``i`` is never populated — only
        print statements run. Either the fields should be assigned before
        returning, or the return is a placeholder; confirm intent.
        """
        #item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
        #item['name'] = response.xpath('//div[@id="name"]').get()
        #item['description'] = response.xpath('//div[@id="description"]').get()
        try:
            i = JingdongItem()
            this_url = response.url  # URL of the page being crawled
            form = "item.jd.com/(.*?).html"  # product-id regex
            x = re.search(form, this_url)

            # Proceed only when the URL is a product detail page.
            if (x):
                goods_id = re.compile(form).findall(this_url)[0]
                title = response.xpath(
                    "//div[@class='sku-name']/text()").extract()
                shop = response.xpath(
                    "//div[@class='usertopleft']/h2/text()").extract()
                shop_link = response.xpath(
                    "//div[@class='usertopleft']/a[@target='_blank']/@href"
                ).extract()

                # Price endpoint found by packet capture (price is JS-rendered).
                price_url = "https://p.3.cn/prices/mgets?callback=jQuery1888176&type=1&area=19_1601_3633_0&pdtk=&pduid=323942232&pdpin=&pin=null&pdbp=0&skuIds=J_" + str(
                    goods_id) + "&ext=11100000&source=item-pc"
                # Fetch the endpoint body synchronously.
                price_data = urllib.request.urlopen(price_url).read().decode(
                    "utf-8")
                # Price regex
                price_form = '"p":"(.*?)"'
                # Extracted prices
                price = re.compile(price_form).findall(price_data)

                if (len(title) and len(shop) and len(shop_link)
                        and len(price)):
                    print(title[0])
                    print(shop[0])
                    print(shop_link[0])
                    print(price[0])
                    print("………………")

                else:
                    pass

            else:
                pass

            return i

        except Exception as e:
            print(e)
Exemplo n.º 17
0
 def parse_item(self, response):
     """Grab the product name, then chain a request for the JS-rendered
     price (approach per http://blog.csdn.net/lanshanlei/article/details/42741179)."""
     sel = Selector(response)
     filename = response.url.split("/")[-1]
     item = JingdongItem()
     item["url"] = [response.url]
     item["name"] = sel.xpath('//*[@id="name"]/h1/text()').extract()
     # Product id = the URL basename with its extension stripped.
     productid = os.path.splitext(filename)[-2]
     # NOTE(review): the trailing 'J_' looks like a typo in the query
     # string; preserved as-is to keep behavior identical.
     priceUrl = 'http://p.3.cn/prices/mgets?skuIds=J_' + productid + 'J_'
     price_request = Request(priceUrl, callback=self.parsePrice)
     price_request.meta['item'] = item
     yield price_request
Exemplo n.º 18
0
 def parse(self, response):
     """Walk big categories (dt) and their sub-categories (dd/em),
     queueing a book-list request per sub-category link."""
     for dt in response.xpath("//div[@class='mc']/dl/dt"):
         item = JingdongItem()
         item["b_cate"] = dt.xpath("./a/text()").extract_first()
         for dd in dt.xpath("./following-sibling::dd[1]/em"):
             item["s_cate"] = dd.xpath("./a/text()").extract_first()
             item["s_href"] = dd.xpath("./a/@href").extract_first()
             if item["s_href"] is None:
                 continue
             item["s_href"] = "https:" + item["s_href"]
             # deepcopy so each queued request keeps its own snapshot of
             # the shared item.
             yield scrapy.Request(item["s_href"],
                                  callback=self.book_detail_list,
                                  meta={"item": deepcopy(item)})
Exemplo n.º 19
0
    def parsebody(self, response):
        """Parse a JSONP comment payload and yield one item per comment.

        Comments missing any expected key are skipped.
        """
        # Strip the JSONP wrapper fetchJSON_comment98vvNNN(...) so the
        # remainder parses as plain JSON.
        t = re.findall(r'^fetchJSON_comment98vv\d*\((.*)\);', response.text)
        json_data = json.loads(t[0])

        for comment in json_data['comments']:  # list of dicts
            item = JingdongItem()
            try:
                item['content'] = comment['content']
                item['creationTime'] = comment['creationTime']
                item['productColor'] = comment['productColor']
                item['productSize'] = comment['productSize']
                item['userClientShow'] = comment['userClientShow']
                item['userLevelName'] = comment['userLevelName']
            except KeyError:
                # Partially-populated comment: skip it.
                continue
            # BUG FIX: the yield was commented out, so this callback
            # produced no items at all.
            yield item
Exemplo n.º 20
0
 def parse(self, response):
     """Extract name/url/price/comment-count for each product card."""
     print("数据解析")
     cards = response.selector.xpath(".//div[@id='J_goodsList']/ul[@class='gl-warp clearfix']/"
                                     "li[@class='gl-item']/div[@class='gl-i-wrap']")
     for index, card in enumerate(cards, start=1):
         item = JingdongItem()
         raw_name = card.xpath(".//div[@class='p-name']/a/em/text()").extract_first()
         item["name"] = raw_name.replace('\n', '').replace('\t', '')
         item['url'] = "https:" + card.xpath(".//div[@class='p-name']/a/@href").extract_first()
         item['price'] = card.xpath(".//div[@class='p-price']/strong/i/text()").extract_first()
         item['comments'] = card.xpath(".//div[@class='p-commit']/strong/a/text()").extract_first()
         print(index)
         yield item
Exemplo n.º 21
0
 def parse_item(self, response):
     """Scrape title/shop/price/good-rate for a product page.

     Price and comment data come from the JSON endpoints, fetched
     synchronously with urllib. The item is returned fully populated
     only when every field was found; otherwise it is returned empty.
     """
     try:
         item = JingdongItem()
         thisUrl = response.url
         # Raw strings: the originals relied on unrecognized escape
         # sequences. The pattern is also matched once instead of being
         # searched and then re-compiled for findall.
         pat = r'item.jd.com/(.*?).html'
         match = re.search(pat, thisUrl)
         if match:
             thisid = match.group(1)
             title = response.xpath('//html/head/title/text()').extract()
             shop = response.xpath(
                 "//*[@id='popbox']/div/div[1]/h3/a/text()").extract()
             shoplink = response.xpath(
                 "//*[@id='popbox']/div/div[1]/h3/a/@href").extract()
             priceUrl = "https://p.3.cn/prices/mgets?callback=jQuery6964855&type=1&area=1&pdtk=&pduid=50528027&pdpin=&pin=null&pdbp=0&ext=11000000&source=item-pc&skuIds=J_" + thisid
             commentUrl = "https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv1463&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1&productId=" + thisid
             priceData = urllib.request.urlopen(priceUrl).read().decode(
                 'utf-8', 'ignore')
             commentData = urllib.request.urlopen(commentUrl).read().decode(
                 'utf-8', 'ignore')
             price = re.findall(r'"p":"(.*?)"', priceData)
             comment = re.findall(r'"goodRateShow":(.*?),', commentData)
             if len(title) and len(shop) and len(shoplink) and len(
                     price) and len(comment):
                 item['title'] = title[0]
                 item['shop'] = shop[0]
                 item['shoplink'] = shoplink[0]
                 item['price'] = price[0]
                 item['comment'] = comment[0]
         return item
     except Exception as err:
         print(err)
Exemplo n.º 22
0
    def crawl_info(self, response):
        '''
        Scrape the comment pages of one product (name, score, user info).

        NOTE(review): the product id is hard-coded to "12398725" with the
        real extraction left in a comment — looks like debugging leftover;
        confirm before re-enabling the re.findall on response.url.
        NOTE(review): one item instance is overwritten on every comment
        and returned once, so only the *last* comment survives — the loop
        probably should yield a fresh item per comment.
        :param response:
        :return:
        '''
        item = JingdongItem()
        id = "12398725"  # re.findall('\d+',response.url)[0]
        header = {'Referer': response.url}
        info_url = 'https://club.jd.com/comment/skuProductPageComments.action?callback=fetchJSON_comment98vv426&productId=' + str(
            id) + '&score=0&sortType=6&page=0&pageSize=10&isShadowSku=0&fold=1'
        resp = get(info_url, headers=header).content.decode('GBK')
        print(resp)
        # Strip the JSONP wrapper so the payload parses as plain JSON.
        data = re.sub('^fetchJSON_comment98vv\d+\(|\);', '', resp)
        pat = re.compile('\);').sub('', data)
        # data2 = data.sub('', pat).sub()
        elements = json.loads(pat)
        count_comment = int(elements['productCommentSummary']['commentCount'])
        print(count_comment)

        # if count_comment % 10 == 0:
        #     page = count_comment // 10
        # if count_comment % 10 != 10:
        #     page = (count_comment // 10) + 1

        # Walk the first 20 comment pages (10 comments each).
        for i in range(0, 20):
            info_url = 'https://club.jd.com/comment/skuProductPageComments.action?callback=fetchJSON_comment98vv426&productId=' + str(
                id) + '&score=0&sortType=6&page=' + str(
                    i) + '&pageSize=10&isShadowSku=0&fold=1'
            resp = get(info_url, headers=header).content.decode('GBK')
            data = re.sub('^fetchJSON_comment98vv\d+\(|\);', '', resp)
            pat = re.compile('\);').sub('', data)
            # data2 = data.sub('', pat).sub()
            elements = json.loads(pat)

            for element in elements['comments']:
                item['user_name'] = element['nickname']
                item['user_id'] = element['id']
                item['id'] = element['referenceId']
                item['good_name'] = element['referenceName']
                item['score'] = element['score']
                item['userLevelId'] = element['userLevelId']
                item['userClientShow'] = element['referenceName']
                item['isMobile'] = element['isMobile']
                # print(item['user_name'])
        return item
Exemplo n.º 23
0
 def detailpage(self, response):
     """Build an item for one style variant of a product, carrying over
     metadata from the listing request and collecting its image urls."""
     item = JingdongItem()
     item['filename'] = response.meta['filename']
     item['intruduce'] = response.meta['intruduce']
     item['good_id'] = response.meta['good_id']  # id of this style
     raw_srcs = response.xpath(
         '//div[@id="spec-list"]/ul/li/img/@src').extract()
     item['img_urls'] = self.deal_img(raw_srcs)  # all images of this style
     # Sanitize the style name for use as a file name.
     style_name = response.meta['name']
     for bad, repl in (("/", "_"), (':', ''), ('|', '_')):
         style_name = style_name.replace(bad, repl)
     item['img_name'] = style_name
     yield item
Exemplo n.º 24
0
    def parse(self, response):
        """Walk big categories (dt) and sub-categories (dd/em), queueing a
        book-list request per sub-category.

        BUG FIX: the original reused one mutable item across every yielded
        request's meta, so by the time a request was processed its item
        reflected the *last* sub-category. A fresh item is now created per
        request (the sibling spider achieves the same with deepcopy).
        """
        node_list = response.xpath('//div[@class="mc"]/dl/dt')
        for node in node_list:
            b_type = node.xpath('./a/text()').extract_first()
            s_tpye_list = node.xpath('./following-sibling::dd[1]/em')
            for s_tpye in s_tpye_list:
                item = JingdongItem()
                item["b_type"] = b_type
                item["s_type"] = s_tpye.xpath('./a/text()').extract_first()
                s_tpye_url = "http:" + s_tpye.xpath(
                    './a/@href').extract_first()

                yield scrapy.Request(s_tpye_url,
                                     callback=self.book_list,
                                     meta={"item": item})
Exemplo n.º 25
0
    def parse(self, response):
        """Yield one item per product card under #plist.

        BUG FIX: the original created a single item outside the loop and
        yielded it repeatedly, so all yielded references shared the last
        product's data; a fresh item is created per card. The image xpath
        is also evaluated once instead of twice.
        """
        lis = response.xpath('//*[@id="plist"]/ul/li/div')
        for li in lis:
            item = JingdongItem()
            name = li.xpath('string(.//a/em)').extract_first().strip()
            price = li.xpath('string(./div/strong/i)').extract_first()
            discuss_num = li.xpath('string(./div[5]/strong/a)').extract_first()
            # Prefer the eager src; fall back to the lazy-load attribute.
            img = li.xpath('string(./div[1]/a/img/@src)').extract_first()
            if not img:
                img = li.xpath(
                    'string(./div[1]/a/img/@data-lazy-img)').extract_first()
            shop_name = li.xpath('string(./div[7])').extract_first()
            item['name'] = name
            item['price'] = price
            item['discuss_num'] = discuss_num
            item['img'] = img
            item['shop_name'] = shop_name

            yield item
Exemplo n.º 26
0
 def next2(self, response):
     """Scrape title/link/price/comment-count for a product page, pulling
     the JS-rendered price and the comment total from JSON endpoints."""
     item = JingdongItem()
     # Page <title> minus the JD boilerplate suffixes.
     raw_title = response.xpath('//head/title/text()').extract()[0]
     item['title'] = raw_title.replace(
         '【图片 价格 品牌 报价】-京东', '').replace('【行情 报价 价格 评测】-京东', '')
     item['link'] = response.url
     # Price endpoint (the on-page price is rendered by JS).
     ture_id = re.findall(r'https://item.jd.com/(.*?).html',
                          item['link'])[0]
     price_txt = urllib.request.urlopen(
         "https://p.3.cn/prices/mgets?skuIds=J_" + str(ture_id)).read().decode(
             'utf-8', 'ignore')
     item['price'] = re.findall(r'"p":"(.*?)"', price_txt)[0]
     # Comment-count endpoint.
     comment_txt = urllib.request.urlopen(
         "https://club.jd.com/comment/productCommentSummaries.action?referenceIds="
         + str(ture_id)).read().decode('utf-8', 'ignore')
     item['comment'] = re.findall(r'"CommentCount":(.*?),"', comment_txt)[0]
     return item
Exemplo n.º 27
0
 def get_page(self, response):  # parse one book detail page
     """Build a JingdongItem from a JD book page.

     Name comes from the <title> (text before the first '('), the SKU id
     from the response URL, and price / comment count from the JSON
     helpers ``get_book_price`` / ``get_commentcount``.
     """
     item = JingdongItem()
     # Book name: <title> text up to the first '(' bracket.
     item["bookname"] = response.xpath("//title/text()").get().split(r"(")[0]
     item["author"] = response.xpath("//div[@class='p-author']/a/text()").get()
     # SKU id extracted from the response's URL representation.
     sku = "".join(re.findall("https://item.jd.com/(.*?).html", str(response)))
     # Price and comment count are fetched from JD's JSON endpoints.
     item["price"] = self.get_book_price(sku)
     item["commentcount"] = self.get_commentcount(sku)
     # Publisher: first titled <li> of the parameter block.
     item["putlish"] = response.xpath(
         "//div[@class='p-parameter']//li/@title").get()
     item["bookurl"] = response.meta["bookurl"]
     yield item
Exemplo n.º 28
0
    def parse_comment(self, response):
        """Combine JD's comment-summary JSON with the product metadata that
        earlier callbacks stashed in ``response.meta``, and yield one item.
        """
        summary = json.loads(response.text).get("CommentsCount")[0]
        meta = response.meta

        item = JingdongItem()
        # Fields carried over unchanged from the upstream request chain.
        for key in ("pro_id", "head_img", "pro_url", "pro_name", "shop_id",
                    "shop_url", "category_1", "category_2", "category_3",
                    "shop_name", "shop_score"):
            item[key] = meta.get(key)
        item['pro_price'] = meta.get("price")
        # Heuristic used by this spider: shop ids longer than 7 chars are
        # tagged 自营 (self-operated), the rest 非自营.
        item['is_ziying'] = u"自营" if len(item['shop_id']) > 7 else u"非自营"

        # Comment statistics from the summary payload.
        item['comments_str'] = summary.get("CommentCountStr")
        item['good_comments_str'] = summary.get("GoodCountStr")
        item['good_comments_rate'] = summary.get("GoodRate")
        item['poor_comments_str'] = summary.get("PoorCountStr")
        item['poor_comments_rate'] = summary.get("PoorRate")
        item['average_score'] = summary.get("AverageScore")

        yield item
Exemplo n.º 29
0
 def parse_item(self, response):
     """Scrape titles, urls and SKU ids from a JD list page, then batch-fetch
     the raw price and comment JSON for every SKU in two blocking requests
     (Python 2 ``urllib2``; both endpoints accept comma-separated id lists).
     """
     item = JingdongItem()
     item["title"] = response.xpath(
         "//div[@class='p-name']/a/em/text()").extract()
     item["url"] = response.xpath(
         "//div[@class='p-name']/a/@href").extract()
     item["p_id"] = response.xpath(
         "//div[@class='p-operate']/a[@class='p-o-btn contrast J_contrast "
         "contrast-hide']/@data-sku").extract()
     # One bulk query per endpoint covers the whole page:
     # prices want "J_<sku>%2C..." while comments want plain "<sku>,...".
     price_query = self.pricebaseurl + "".join(
         "J_" + sku + "%2C" for sku in item["p_id"])
     commit_query = self.commitbaseurl + "".join(
         sku + "," for sku in item["p_id"])
     item["price"] = urllib2.urlopen(price_query).read().decode(
         'utf-8', 'ignore')
     item['commit'] = urllib2.urlopen(commit_query).read().decode(
         'utf-8', 'ignore')
     yield item
Exemplo n.º 30
0
    def get_content(self,response):
        '''Parse a JD product detail page into a fully populated item:
        shop/brand/price/comment stats plus food-specific parameters.

        BUG FIX: the parameter loop called ``.get().split(':')`` — ``.get()``
        returns None for <li> nodes with no direct text (AttributeError), and
        ``detail[1]`` raised IndexError when the text had no ':'. Both cases
        are now skipped, and the split uses maxsplit=1 so values containing
        ':' are not truncated.
        '''
        item = JingdongItem()
        # Crawl timestamp.
        item['date'] = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(time.time()))
        item['url'] = response.request.url
        item['shop_name'] = self.judje_res(response.xpath('//div[@class="w"]//div[@class="item"]/div[@class="name"]/a/text()').extract())

        title = ''.join(response.xpath('//div[@class="w"]//div[@class="itemInfo-wrap"]/div[@class="sku-name"]/text()').extract())
        item['goods'] = title.strip()

        item['brand'] = self.judje_res(response.xpath('//ul[@id="parameter-brand"]/li/a/text()').extract())
        # Price and comment stats come from helper JSON fetches keyed on the URL.
        item['price'] = self.get_price(item['url'])

        comment = self.get_comment(item['url'])
        item['comment_count'] = comment[0]
        item['good_rate'] = comment[1]
        item['poor_rate'] = comment[2]

        item['select_shop'] = self.join_list(response.xpath('//div[@class="summary p-choose-wrap"]//div[@id="choose-attr-1"]/div[@class="dd"]/div/a/i/text()').extract())
        item['image'] = self.join_list(response.xpath('//div[@id="spec-list"]/ul/li/img/@src').extract())
        # "key:value" parameter list — only present on food products.
        pars = response.xpath('//div[@class="p-parameter"]/ul[2]/li')
        p = {}
        for par in pars:
            text = par.xpath('./text()').get()
            if not text or ':' not in text:
                continue  # no text node or no separator: nothing to record
            key, value = text.split(':', 1)  # maxsplit=1: keep ':' inside values
            p[key] = value

        item['weight'] = p.get('商品毛重','不存在')
        item['category'] = p.get('类别','不存在')
        item['sugar'] = p.get('是否含糖','不存在')
        item['fat'] = p.get('脂肪含量','不存在')
        item['addr'] = p.get('商品产地','不存在')

        print(item['url'],item['shop_name'],item['goods'])
        yield item