Exemplo n.º 1
0
 def start_requests(self):
     """Yield one product-detail request per id in ``self.productIds``.

     When ``self.fromDB`` is truthy, the ids of all rows returned by
     ``Product.select()`` are appended to ``self.productIds`` first.
     """
     log.info('获取商品详情')
     if self.fromDB:
         # extend() instead of a throwaway list comprehension used for side effects
         self.productIds.extend(p.id for p in Product.select())
     for pid in self.productIds:
         log.info(f'获取商品详情request {pid}')
         url = get_product_info_url(pid)
         log.info(f'商品详情request url:{url}')
         # Build the headers once so the logged value is exactly what is sent
         # (calling headers() twice may yield different values — TODO confirm).
         request_headers = headers()
         log.info(f"headers ---> {request_headers}")
         yield Request(url, headers=request_headers)
Exemplo n.º 2
0
	def parse_brandInfo(self, response):
		"""Schedule one ``parse_productId`` request per result page of a brand.

		Reads ``data.total`` from the JSON body, derives the number of pages,
		and yields a request for every page from 1 to that number. The brand id
		is taken from ``response.meta['unionId']``.
		"""
		# response.text replaces the deprecated body_as_unicode()
		data = json.loads(response.text)['data']
		unionId = response.meta.get('unionId')
		name = response.meta.get('name')

		num = data['total']
		# presumably the API returns 20 products per page — TODO confirm
		page_count = math.ceil(num / 20)
		log.success(f'品牌:{name} 编号:{unionId} 商品总数:{num} 页面数:{page_count}')

		# Distinct loop variable so the page count is not shadowed
		for page_no in range(1, page_count + 1):
			yield Request(page_url(unionId, page_no), callback=self.parse_productId, meta={
				'unionId': unionId,
				'name': self.brandIds[unionId]
			}, headers=headers())
Exemplo n.º 3
0
    def start_requests(self):
        """Yield a request for every item returned by ``self.get_items()``.

        Each request targets the item's ``url`` and carries the item's id,
        title, brand, serie and article number in ``meta``.
        """
        log.info(f'选取商品销量高于 {self.soldNum_min} 开始追踪')

        for product in self.get_items():
            target_url = product.url
            tracking_meta = {
                'productId': product.id,
                'title': product.title,
                'brand': product.brand,
                'serie': product.serie,
                'articleNumber': product.articleNumber,
            }
            yield scrapy.Request(target_url, meta=tracking_meta, headers=headers())
Exemplo n.º 4
0
	def parse_brandList(self, response):
		"""Record the brands in the response, then request the selected ones.

		Every entry of ``data.list`` is stored in ``self.brandIds`` as
		``goodsBrandId -> brandName``. The ids to crawl come from an interactive
		prompt when ``self.auto`` is falsy, otherwise from ``self.Ids``. One
		``parse_brandInfo`` request is yielded per selected id; nothing is
		yielded when no id was selected.

		Raises:
			ValueError: if a selected id cannot be converted to ``int``.
			KeyError: if a selected id is not present in ``self.brandIds``.
		"""
		# response.text replaces the deprecated body_as_unicode()
		brandList = json.loads(response.text)['data']['list']
		for brand in brandList:
			info = brand['brand']
			unionId = info['goodsBrandId']
			name = info['brandName']
			self.brandIds[unionId] = name
			log.success(f'品牌:{name} 编号:{unionId}')

		if not self.auto:
			# split() without an argument tolerates repeated/leading/trailing
			# whitespace and returns [] for blank input (no '' tokens for int()).
			ids = prompt('输入需要爬取的品牌编号', default='').split()
		else:
			ids = self.Ids
		if not ids:
			# A generator callback must not return a value; a bare return
			# simply ends the callback without scheduling anything.
			return

		for unionId in ids:
			log.info(f'unionId: {unionId}')
			unionId = int(unionId)
			yield Request(page_url(unionId), callback=self.parse_brandInfo, meta={
				'unionId': unionId,
				'name': self.brandIds[unionId]
			}, headers=headers())
Exemplo n.º 5
0
	def parse_serieList(self, response):
		"""Record the series in the response, then request the selected ones.

		Every item of each entry's ``seriesList`` in ``data.list`` is stored in
		``self.serieIds`` as ``productSeriesId -> name``. The ids to crawl come
		from an interactive prompt when ``self.auto`` is falsy, otherwise from
		``self.Ids``. One ``parse_serieInfo`` request is yielded per selected
		id; nothing is yielded when no id was selected.

		Raises:
			ValueError: if a selected id cannot be converted to ``int``.
			KeyError: if a selected id is not present in ``self.serieIds``.
		"""
		# response.text replaces the deprecated body_as_unicode()
		serieList = json.loads(response.text)['data']['list']
		for data in serieList:
			for serie in data['seriesList']:
				unionId = serie['productSeriesId']
				name = serie['name']
				self.serieIds[unionId] = name
				log.success(f'系列:{name} 编号:{unionId}')

		if not self.auto:
			# split() without an argument tolerates repeated/leading/trailing
			# whitespace and returns [] for blank input (no '' tokens for int()).
			ids = prompt('输入需要爬取的系列编号', default='').split()
		else:
			ids = self.Ids
		if not ids:
			# A generator callback must not return a value; a bare return
			# simply ends the callback without scheduling anything.
			return

		log.info(f'获取 {ids} 系列包含商品')
		for unionId in ids:
			unionId = int(unionId)
			log.info(f'unionId: {unionId}')
			yield Request(page_url(unionId), callback=self.parse_serieInfo, meta={
				'unionId': unionId,
				'name': self.serieIds[unionId]
			}, headers=headers())
Exemplo n.º 6
0
	def start_requests(self):
		"""Yield an unfiltered, non-retried request per start URL.

		Responses are routed to ``parse_brandList``.
		"""
		log.info('获取品牌列表')
		for start_url in self.start_urls:
			yield Request(
				start_url,
				dont_filter=True,
				callback=self.parse_brandList,
				meta={'dont_retry': True},
				headers=headers(),
			)
Exemplo n.º 7
0
	def start_requests(self):
		"""Yield an unfiltered request per start URL, routed to ``parse_serieList``."""
		log.info('获取系列列表')
		for start_url in self.start_urls:
			request = Request(
				start_url,
				dont_filter=True,
				callback=self.parse_serieList,
				headers=headers(),
			)
			yield request