Example #1
    def parse(self, response):
        review_list = response.meta['review_list']  ## accumulated review list
        goods_id = response.meta['goods_id']  ## goods ID
        page = response.meta['page']  ## current page number, incremented per response

        goods_reviews = response.body.decode('utf-8')  ## convert bytes to str
        goods_reviews = json.loads(goods_reviews)

        if 'data' not in goods_reviews.keys() or len(
                goods_reviews['data']) == 0:
            self.ssdb.hdel(self.hash_name, goods_id)
            if review_list:
                item = GoodsSalesItem()
                item['goods_list'] = review_list
                item['mall_id'] = goods_id
                yield item
        else:
            review_list = review_list + goods_reviews['data']  ## merge review lists
            page += 1
            url = self.build_url(goods_id, page)
            meta = {
                'page': page,
                'goods_id': goods_id,
                'review_list': review_list
            }
            yield scrapy.Request(url, meta=meta, callback=self.parse)
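
Every example in this listing yields a GoodsSalesItem carrying a goods_list and (usually) a mall_id field. The item class itself is never shown; a minimal sketch consistent with those usages, assuming nothing beyond the two fields, would be:

import scrapy

class GoodsSalesItem(scrapy.Item):
    ## Fields inferred from the parse() examples; the real project may define more.
    goods_list = scrapy.Field()  ## merged list (or dict) of goods/review data
    mall_id = scrapy.Field()     ## shop or goods identifier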
Example #2
    def parse(self, response):
        goods_data = response.body.decode('utf-8')  ## convert bytes to str
        goods_data = json.loads(goods_data)
        if 'goods_id' in goods_data.keys():
            item = GoodsSalesItem()
            item['goods_list'] = goods_data
            yield item
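
Decoding response.body and calling json.loads works, but Scrapy's TextResponse already exposes the decoded body as response.text, and Scrapy 2.2+ offers response.json() to parse it directly. An equivalent of Example #2 using that API:

def parse(self, response):
    goods_data = response.json()  ## Scrapy 2.2+; same as json.loads(response.text)
    if 'goods_id' in goods_data:
        item = GoodsSalesItem()
        item['goods_list'] = goods_data
        yield item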
Example #3
    def parse(self, response):
        goods_list = response.meta['goods_list']  ## accumulated goods list
        mall_id = response.meta['mall_id']  ## shop ID
        page = response.meta['page']  ## current page number, incremented per response

        mall_goods = response.body.decode('utf-8')  ## convert bytes to str
        mall_goods = json.loads(mall_goods)
        mall_goods = mall_goods['goods']
        if 'goods_list' not in mall_goods.keys():
            return

        goods_len = len(mall_goods['goods_list'])

        if goods_len > 0:
            goods_list = goods_list + mall_goods['goods_list']  ## merge goods lists

        if goods_len > self.pageSize - 100:
            page += 1
            ## continue crawling the next page
            ##url = 'http://apiv4.yangkeduo.com/api/turing/mall/query_cat_goods?category_id=0&type=0&sort_type=_sales&mall_id='+str(mall_id)+'&page_no='+str(page)+'&page_size=500'
            #url  ='http://apiv3.yangkeduo.com/api/turing/mall/query_cat_goods?category_id=0&type=0&mall_id='+str(mall_id)+'&page_no='+str(page)+'&page_size=500&sort_type=_sales&anti_content='+self.anti_content+'&pdduid='+str(self.log_user_id)
            url = self.build_url(mall_id, page, 500)
            meta = {'page': page, 'mall_id': mall_id, 'goods_list': goods_list}
            headers = self.make_headers()
            yield scrapy.Request(url, meta=meta, callback=self.parse, headers=headers, dont_filter=True, errback=self.errback_httpbin)
        else:
            if goods_list:
                item = GoodsSalesItem()
                item['goods_list'] = goods_list
                item['mall_id'] = mall_id
                yield item
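
build_url is not included in these excerpts, but the commented-out URLs above show the endpoint it targets. A hypothetical reconstruction matching the call sites build_url(mall_id, page, page_size) in the mall spiders (the real method likely also appends anti_content/pdduid parameters):

def build_url(self, mall_id, page, page_size=500):
    ## Hypothetical sketch based on the commented-out URL in Example #3.
    return ('http://apiv4.yangkeduo.com/api/turing/mall/query_cat_goods'
            '?category_id=0&type=0&sort_type=_sales'
            '&mall_id=' + str(mall_id) +
            '&page_no=' + str(page) +
            '&page_size=' + str(page_size))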
Example #4
    def parse(self, response):
        goods_list = response.meta['goods_list']  ## accumulated goods list
        mall_id = response.meta['mall_id']  ## shop ID
        page = response.meta['page']  ## current page number, incremented per response

        mall_goods = response.body.decode('utf-8')  ## convert bytes to str
        mall_goods = json.loads(mall_goods)

        goods_len = len(mall_goods['goods_list'])

        if goods_len > 0:
            goods_list = goods_list + mall_goods['goods_list']  ## merge goods lists

        if goods_len > self.pageSize - 100:
            page += 1
            ## continue crawling the next page
            url = 'http://apiv4.yangkeduo.com/api/turing/mall/query_cat_goods?category_id=0&type=0&sort_type=_sales&mall_id=' + str(
                mall_id) + '&page_no=' + str(page) + '&page_size=500'
            meta = {'page': page, 'mall_id': mall_id, 'goods_list': goods_list}
            headers = self.make_headers()
            yield scrapy.Request(url,
                                 meta=meta,
                                 callback=self.parse,
                                 headers=headers)
        else:
            if goods_list:
                item = GoodsSalesItem()
                item['goods_list'] = goods_list
                item['mall_id'] = mall_id
                yield item
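
make_headers is also left out of the excerpts. For a JSON API like this it presumably supplies browser-like request headers; a minimal sketch in which every value is an assumption:

def make_headers(self):
    ## Placeholder values; the real spider likely rotates User-Agents
    ## and may attach cookies required by the API.
    return {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'Accept': 'application/json',
        'Referer': 'http://yangkeduo.com/',
    }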
Example #5
    def parse(self, response):
        mall_id = response.meta['mall_id']  ## shop ID
        page = response.meta['page']  ## current page number, incremented per response
        goods_list = response.meta['goods_list']  ## accumulated goods list
        proxy = response.meta['proxy']  ## reuse the original proxy

        mall_goods = response.body.decode('utf-8')  ## convert bytes to str
        self.save_mall_log(mall_id, mall_goods)
        self.success_count += 1
        self.save_mall_success_log(
            mall_id,
            json.dumps({
                'success_count': self.success_count,
                'request_count': self.request_count,
                'error_count': self.error_count
            }))

        mall_goods = json.loads(mall_goods)
        mall_goods = mall_goods['goods']

        if 'goods_list' not in mall_goods.keys():
            #self.ssdb_client.hset(self.fail_hash, mall_id, mall_id)
            return

        goods_len = len(mall_goods['goods_list'])

        if goods_len > 0:
            goods_list = goods_list + mall_goods['goods_list']  ## merge goods lists
        else:
            return

        if goods_len > self.pageSize - 100:
            page += 1
            ## continue crawling the next page
            url = self.build_url(mall_id, page, 500)
            meta = {
                'page': page,
                'mall_id': mall_id,
                'goods_list': goods_list,
                'proxy': proxy
            }
            headers = self.make_headers()
            yield scrapy.Request(url,
                                 meta=meta,
                                 callback=self.parse,
                                 headers=headers,
                                 dont_filter=True,
                                 errback=self.errback_httpbin)
        else:
            if goods_list:
                item = GoodsSalesItem()
                item['goods_list'] = goods_list
                item['mall_id'] = mall_id
                yield item
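
The errback=self.errback_httpbin hook is named after the errback example in the Scrapy documentation. A sketch along those lines, which would also keep the error_count used by save_mall_success_log up to date (the bookkeeping is an assumption):

from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, TimeoutError

def errback_httpbin(self, failure):
    ## Log the failure and count it; modeled on the Scrapy docs pattern.
    self.error_count += 1
    if failure.check(HttpError):
        self.logger.error('HttpError on %s', failure.value.response.url)
    elif failure.check(DNSLookupError):
        self.logger.error('DNSLookupError on %s', failure.request.url)
    elif failure.check(TimeoutError):
        self.logger.error('TimeoutError on %s', failure.request.url)
    else:
        self.logger.error('Request failed: %s', repr(failure))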
Example #6
    def parse(self, response):
        meta = response.meta
        html = response.body.decode('utf-8')  ## convert bytes to str
        regex_content = re.search(r'window\.rawData= (.*);\s*</script>', html)
        if not regex_content:
            self.err_after(meta)
            return
        rawData = json.loads(regex_content.group(1))
        if 'initDataObj' not in rawData.keys():
            self.err_after(meta)
            return
        initDataObj = rawData['initDataObj']
        if 'needLogin' in initDataObj.keys():
            self.err_after(meta, True)
            return
        if 'goods' not in initDataObj.keys():
            self.err_after(meta)
            return
        goods_data = initDataObj['goods']
        if 'skus' not in goods_data.keys():
            self.err_after(meta)
            return
        time = response.meta['time']  ## timestamp
        goods_id = response.meta['goods_id']  ## goods ID
        skus = []
        for sku in goods_data['skus']:
            spec = ''
            for specX in sku['specs']:
                spec += str(specX['spec_value'])
            skuDetail = {
                'sku_id': sku['skuID'],
                'thumb_url': sku['thumbUrl'],
                'quantity': sku['quantity'],
                'is_onsale': sku['isOnSale'],
                'spec': spec,
                'normal_price': sku['normalPrice'],
                'group_price': sku['groupPrice'],
                'specs': sku['specs'],
                'weight': 0,
            }
            skus.append(skuDetail)
        item = GoodsSalesItem()
        goods_data['sku'] = skus
        goods_data['time'] = time
        item['goods_list'] = goods_data
        item['mall_id'] = goods_id
        yield item
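
Example #6 scrapes the goods page HTML instead of a JSON API, pulling the embedded window.rawData state object out of a script tag. The extraction step can be tried in isolation (the sample HTML below is made up):

import json
import re

html = '<script>window.rawData= {"initDataObj": {"goods": {"skus": []}}};</script>'
match = re.search(r'window\.rawData= (.*);\s*</script>', html)
if match:
    rawData = json.loads(match.group(1))
    print(rawData['initDataObj']['goods'])  ## {'skus': []}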
Example #7
    def parse(self, response):
        time = response.meta['time']  ## timestamp
        goods_id = response.meta['goods_id']  ## goods ID

        goods_data = response.body.decode('utf-8')  ## convert bytes to str
        goods_data = json.loads(goods_data)

        if 'goods_id' in goods_data.keys():
            item = GoodsSalesItem()
            goods_data['time'] = time
            item['goods_list'] = goods_data
            item['mall_id'] = goods_id
            yield item
Example #8
    def parse(self, response):
        mall_id = response.meta['mall_id']  ## shop ID
        page = response.meta['page']  ## current page number, incremented per response
        proxy = response.meta['proxy']  ## reuse the original proxy

        mall_goods = response.body.decode('utf-8')  ## convert bytes to str
        #self.save_mall_log(mall_id, mall_goods)

        mall_goods = json.loads(mall_goods)

        if 'goods_list' not in mall_goods.keys():
            #self.ssdb_client.hset(self.fail_hash, mall_id, mall_id)
            return

        goods_len = len(mall_goods['goods_list'])

        if goods_len > 0:
            goods_list = mall_goods['goods_list']  ## goods list for this page
        else:
            return

        if goods_list:
            item = GoodsSalesItem()
            item['goods_list'] = goods_list
            item['mall_id'] = mall_id
            yield item

        if goods_len >= 30 and page < self.max_page:
            page += 1
            ## continue crawling the next page
            url = self.build_url(mall_id, page, 50)
            meta = {'page': page, 'mall_id': mall_id, 'proxy': proxy}
            headers = self.make_headers()
            yield scrapy.Request(url,
                                 meta=meta,
                                 callback=self.parse,
                                 headers=headers,
                                 dont_filter=True,
                                 errback=self.errback_httpbin)
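
Examples #8 and #9 assume the first request already carries page, mall_id, and proxy in its meta (Scrapy's HttpProxyMiddleware reads meta['proxy'] to route the request). A hypothetical start_requests that seeds those keys; the mall_ids attribute and the proxy value are assumptions:

def start_requests(self):
    for mall_id in self.mall_ids:  ## assumed: iterable of shop IDs to crawl
        meta = {
            'page': 1,
            'mall_id': mall_id,
            'proxy': 'http://127.0.0.1:8888',  ## placeholder proxy
        }
        yield scrapy.Request(self.build_url(mall_id, 1, 50),
                             meta=meta,
                             callback=self.parse,
                             headers=self.make_headers(),
                             dont_filter=True,
                             errback=self.errback_httpbin)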
Example #9
    def parse(self, response):
        mall_id = response.meta['mall_id']  ## shop ID
        page = response.meta['page']  ## current page number, incremented per response
        proxy = response.meta['proxy']  ## reuse the original proxy

        mall_goods = response.body.decode('utf-8')  ## convert bytes to str
        #self.save_mall_log(mall_id, mall_goods)

        mall_goods = json.loads(mall_goods)
        if 'goods_list' not in mall_goods.keys():
            # self.ssdb_client.qpush_back('', '')
            return

        mall_goods = mall_goods['goods_list']
        goods_len = len(mall_goods)

        item = GoodsSalesItem()
        item['goods_list'] = mall_goods
        item['mall_id'] = mall_id
        self.save_mall_log(page, json.dumps(mall_goods))
        yield item

        if goods_len < 50:
            return

        page += 1
        ## continue crawling the next page
        url = self.build_url(mall_id, page, 50)
        meta = {'page': page, 'mall_id': mall_id, 'proxy': proxy}
        headers = self.make_headers()
        yield scrapy.Request(url,
                             meta=meta,
                             callback=self.parse,
                             headers=headers,
                             dont_filter=True,
                             errback=self.errback_httpbin)
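
save_mall_log is called with two different keys (mall_id in Example #5, page in Example #9), so it appears to be a generic key-to-payload logger. A minimal file-based sketch; the real spider may well write to SSDB instead (cf. self.ssdb_client), and the path is assumed:

import os

def save_mall_log(self, key, content):
    ## Append one record per call under an assumed logs/ directory.
    os.makedirs('logs', exist_ok=True)
    with open('logs/mall_%s.log' % key, 'a', encoding='utf-8') as f:
        f.write(content + '\n')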