def process_response(self, request, response, spider):
    selferrorlog = SelfLog('error')
    selfinfolog = SelfLog(spider.name)
    # When a cookie gets blocked, move it from the usable pool into the unusable pool.
    cookie_redis_key_hash = request.meta['cookies_redis']
    cookie_redis_key = request.meta['cookies_redis'] + "_list"
    unuse_cookie_redis_key = request.meta['useless_cookies']
    if response.status == 302:
        print(response.text)
        selferrorlog.logger.error(
            "{spidername}-被封,302重定向到登录界面{cookie}:".format(
                spidername=spider.name, cookie=request.cookies))
        request = self.dealcookie(request, response, spider)
        return request
    elif "c=com&m=limitPage" in response.text:
        selferrorlog.logger.error(
            "{spidername}-重定向到限制界面, cookie值:{cookie}".format(
                spidername=spider.name, cookie=request.cookies))
        request = self.dealcookie(request, response, spider)
        return request
    elif "请重新登录" in response.text:
        selferrorlog.logger.error(
            "{spidername}-cookie:{cookies}过期,或者IP不一致,到登录界面".format(
                spidername=spider.name, cookies=request.cookies))
        request = self.dealcookie(request, response, spider)
        return request
    selfinfolog.logger.info("请求url:{url}使用的cookie:{cookie}".format(
        url=response.url, cookie=request.cookies))
    return response
def get_cookies(self, request, spider):
    selflog = SelfLog('error')
    cookie_redis_key_list = request.meta['cookies_redis'] + "_list"
    cookie_redis_key_hash = request.meta['cookies_redis']
    unuse_cookie_redis_key = request.meta['useless_cookies']
    try:
        # cookies_dict = random.choice(self.cookies_deal_r.zscan(cookie_redis_key)[1])
        # Pop a cookie from the head of the list and push it back to the tail (round-robin).
        # When the list is empty, lpop() returns None and the rpush(None) that follows
        # raises, which drops us into the except branch below.
        popcookie = self.cookies_deal_r.lpop(cookie_redis_key_list)
        self.cookies_deal_r.rpush(cookie_redis_key_list, popcookie)
    except Exception as e:
        selflog.logger.error(
            "spidername:{spidername} 的cookie 耗尽请补充, 错误信息:{e}".format(
                spidername=spider.name, e=e))
        # Send an email alert; ideally also shut the spider down.
        sendEmail(content="{cookname}cookie耗尽,请尽快处理".format(
            cookname=cookie_redis_key_list))
        spider.crawler.engine.close_spider(
            spider, "{cookname}cookie耗尽,关闭爬虫".format(
                cookname=cookie_redis_key_list))
    else:
        dicts = json.loads(popcookie)
        phonenum = self.cookies_deal_r.hget(cookie_redis_key_hash, popcookie)
        print("{cookie_redis}--手机号:{phonenum}--cookie:{cookie}".format(
            cookie_redis=cookie_redis_key_hash, phonenum=phonenum, cookie=dicts))
        return dicts
def dealcookie(self, request, response, spider):
    selflog = SelfLog('error')
    cookie_redis_key_hash = request.meta['cookies_redis']
    cookie_redis_key_list = request.meta['cookies_redis'] + "_list"
    unuse_cookie_redis_key = request.meta['useless_cookies']
    redis_member = json.dumps(request.cookies)
    # Look up the phone number for this cookie in the usable hash.
    zset_phone = self.cookies_deal_r.hget(cookie_redis_key_hash, redis_member)
    # Remove the cookie from the rotation list and the usable hash,
    # then record it in the unusable hash.
    self.cookies_deal_r.lrem(cookie_redis_key_list, 0, redis_member)
    self.cookies_deal_r.hdel(cookie_redis_key_hash, redis_member)
    self.cookies_deal_r.hset(unuse_cookie_redis_key, redis_member, zset_phone)
    # Pop another cookie from Redis and rebuild the request with it
    # (same exhaustion handling as get_cookies()).
    try:
        popcookie = self.cookies_deal_r.lpop(cookie_redis_key_list)
        self.cookies_deal_r.rpush(cookie_redis_key_list, popcookie)
    except Exception as e:
        selflog.logger.error("{spidername}--cookie 耗尽请补充, 错误信息:{e}".format(
            spidername=spider.name, e=e))
        # Send an email alert; ideally also shut the spider down.
        sendEmail(content="{cookname}cookie耗尽,请尽快处理".format(
            cookname=cookie_redis_key_list))
        spider.crawler.engine.close_spider(
            spider, "{cookname}cookie耗尽,关闭爬虫".format(
                cookname=cookie_redis_key_list))
    else:
        request.cookies = json.loads(popcookie)
        return request
def process_request(self, request, spider):
    selflog = SelfLog(spider.name)
    # Set up the proxy and cookie per spider, keyed by spider name.
    if spider.name in ("nnqc", "chezhen"):
        request.meta['http_proxy'] = self.get_proxy()
        print("{}使用的代理为{},请求url:{}".format(
            spider.name, request.meta['http_proxy'], request.url))
    elif spider.name == "car168":
        pass
    # Attach a cookie from the pool.
    request.cookies = self.get_cookies(request, spider)
    # cookie_dict = self.get_cookies(request, spider)
    # cookie = ""
    # for k, v in cookie_dict.items():
    #     cookie = cookie + k + "=" + v + ";"
    # request.headers["User-Agent"] = UserAgent().random
    # request.headers["Content-Type"] = "text/html; charset=utf-8"
    # request.headers["Cookie"] = cookie
    # url = request.url
    # redis_key = request.meta['url_redis']
    # if self.add_url_r.sismember(redis_key, url):
    #     spider.logger.info("该url已经爬取,舍弃:%s" % url)
    #     raise IgnoreRequest
    return None
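# --- Hedged sketch (not part of the project): how the cookie pool consumed by
# get_cookies()/dealcookie() above could be seeded. The Redis layout is inferred from
# the calls above: "<base_key>_list" is a rotation list of JSON-encoded cookie dicts,
# "<base_key>" is a hash mapping each JSON cookie to the phone number of its account,
# and the useless-cookie key collects banned cookies. The key name "cookies_chezhen",
# the phone number, and the sample cookie below are illustrative only.
import json

import redis

r = redis.StrictRedis(host="localhost", port=6379, db=0)


def seed_cookie_pool(base_key, phone, cookie_dict):
    member = json.dumps(cookie_dict)
    # Rotation queue read with lpop()/rpush() by get_cookies().
    r.rpush(base_key + "_list", member)
    # Usable-cookie hash read with hget() by get_cookies()/dealcookie().
    r.hset(base_key, member, phone)


seed_cookie_pool("cookies_chezhen", "13800000000", {"CzcyAutoLogin": "******"})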
def process_item(self, item, spider):
    selflog = SelfLog(spider.name)
    selflog_error = SelfLog("error")
    keys = [
        "brand", "type", "year", "style", "guide_price", "displacement",
        "configuration", "version", "status"
    ]
    values = [item[i] for i in keys]
    # Before inserting, check whether the car style already exists in the database.
    # If it does, update it when the update conditions are met; otherwise insert it.
    sql_search_style = ("select id, brand, type from car_style "
                        "where brand=%s and type=%s and year=%s and style=%s and guide_price=%s")
    search_db_result = Mysqlpython().readall(sql_search_style, [
        item["brand"], item['type'], item['year'], item['style'], item['guide_price']
    ])
    if search_db_result:
        commit_id = search_db_result[0][0]
        # selflog.logger.info("出现的重复车型:%s", search_db_result[0])
        update_set = "update car_style set status=null "
        update_where = " where id=%s "
        sqlparam = [commit_id]
        # Only overwrite columns that are still empty in the database and for which
        # the item carries a real value.
        if item['guide_price'] and item['guide_price'] != "None":
            update_set += " , guide_price=%s"
            update_where += " and (guide_price is null or guide_price='None')"
            sqlparam.append(item['guide_price'])
        if item['displacement'] and item['displacement'] != "None":
            update_set += " , displacement=%s"
            update_where += " and (displacement is null or displacement='None')"
            sqlparam.append(item['displacement'])
        if item['version'] and item['version'] != "None":
            update_set += " , version=%s"
            update_where += " and (version is null or version='None')"
            sqlparam.append(item['version'])
        # If update_set is unchanged, no column qualified, so skip the update.
        if update_set == "update car_style set status=null ":
            pass
        else:
            sql_update = update_set + update_where
            # selflog.logger.info(
            #     "执行更新sql:{sql_update}, values:{values}, id:{commit_id}".format(
            #         sql_update=sql_update, values=sqlparam, commit_id=commit_id))
            try:
                self.cur.execute(sql_update, sqlparam)
                self.conn.commit()
            except Exception as e:
                selflog_error.logger.error(
                    "更新sql出错:{sql_update}, values:{values}, id:{commit_id},e:{e}".format(
                        sql_update=sql_update, values=sqlparam, commit_id=commit_id, e=e))
    # Not found: insert the car style, then the price, update time and other details.
    else:
        # Insert into the car_style table.
        sql = "insert into `{}` ({}) values ({})".format(
            item.table_name,
            ','.join(keys),
            # Placeholders keep the insert compatible with non-string column types.
            ','.join(['%s'] * len(values)),
        )
        try:
            self.cur.execute(sql, values)
            self.conn.commit()
        except Exception as e:
            selflog_error.logger.error(
                "{spidername} 插入车型表出错e:{e}sql:{sql}, --values:{values}".format(
                    spidername=spider.name, e=e, sql=sql, values=values))
            commit_id = None
        else:
            commit_id = self.cur.lastrowid
    # Skip the detail insert when there is neither a price nor a sales volume.
    if commit_id and (item['price'] or item['volume']):
        # Check the Redis hash (detail_url -> "updatetime##saved_id") for this detail page:
        # if present and the update time differs, insert new data and refresh the Redis value;
        # if absent, write to both Redis and the database.
        requesturl = item['detail_url']
        rediskey = item['rediskey']
        # Pull the update time and the saved id out of the hash value.
        hash_value = self.save_url_r.hget(rediskey, requesturl)
        if hash_value:
            hash_value = hash_value.decode()
            updatetime = hash_value.split('##')[0]
            save_id = hash_value.split('##')[1]
            if updatetime == item['updatetime']:
                pass
            # The update time changed: insert a new detail record.
            else:
                self.insert_detaildata(spider, commit_id, item, selflog_error, rediskey)
                # sql_update_detail = "update car_detail set updatetime=%s, price=%s, volume=%s where id=%s".format(
                #     updatetime=item['updatetime'], price=item['price'], volume=item['volume'], save_id=save_id)
                # sql_insert_detail = "insert "
                # try:
                #     self.cur.execute(sql_insert_detail, [item['updatetime'], item['price'], item['volume'], save_id])
                #     # 更新redis的值
                #     self.save_url_r.hset(rediskey, requesturl, item['updatetime'] + "##" + str(self.cur.lastrowid))
                #     print("{spidername}--更新爬取价格数据:{updata}".format(spidername=spider.name, updata=item['updatetime'] + "##" + save_id))
                #     # selflog.logger.info("更新爬取价格数据:{updata}".format(updata=item['updatetime'] + "##" + save_id))
                # except Exception as e:
                #     selflog_error.logger.error("{spidername}更新爬取车价详情出错e:{e}__sql:{sql}".format(spidername=spider.name, e=e, sql=sql_update_detail))
        # Not in Redis yet: write straight to the database and add
        # "detail_url: updatetime##save_id" to Redis.
        else:
            self.insert_detaildata(spider, commit_id, item, selflog_error, rediskey)
            # selflog.logger.info("写入爬取价格数据:{updata}".format(updata=item['updatetime'] + "##" + str(self.cur.lastrowid)))
            # self.save_url_r.sadd(item['rediskey'], requesturl)
    else:
        selflog.logger.info(
            "没有交易价格和交易量数据,car_detial表不进行插入key:%s, --values:%s" % (keys, values))
    return item
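# --- Hedged sketch (not part of the project): enabling the downloader middleware and
# the item pipeline shown above in settings.py. DOWNLOADER_MIDDLEWARES and
# ITEM_PIPELINES are standard Scrapy settings; the module paths and class names
# (carspider.middlewares.CookieProxyMiddleware, carspider.pipelines.CarStylePipeline)
# are assumptions, since the class definitions are not shown in this section.
DOWNLOADER_MIDDLEWARES = {
    "carspider.middlewares.CookieProxyMiddleware": 543,
}
ITEM_PIPELINES = {
    "carspider.pipelines.CarStylePipeline": 300,
}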
class ChezhenSpider(scrapy.Spider):
    name = 'chezhen'
    selflog = SelfLog(name)
    cookies = {
        "CzcyAutoLogin": "******",
    }
    # API endpoint for querying a car's import types.
    type_url = "http://www.chezhency.com/Home/ncar/gettype?cat_id=%s"
    # URL template for the model detail listing page.
    detail_requesturl = "http://www.chezhency.com/Home/ncar/index/cat_id/{catid}/type_id/{versionkey}/timer/2.html"
    allowed_domains = ['www.chezhency.com']
    start_urls = [
        'http://www.chezhency.com/',
        "http://www.chezhency.com/home/index/indexbei.html"
    ]

    # Build request headers with a random User-Agent.
    def get_headers(self):
        sql = "select ua from useragent order by rand() limit 1;"
        headers = {
            # "User-Agent": dbhelper.readall(sql)[0][0],
            "User-Agent": UserAgent().random,
        }
        return headers

    # # 构造cookie
    # def get_cookies(self, lastcookie):
    #     # cookie = json.loads(cookie_r.srandmember("cookies_168"))
    #     while cookie == lastcookie:
    #         cookie = json.loads(cookie_r.srandmember("cookies_168"))
    #     return cookie

    # Build the first request.
    def start_requests(self):
        url = self.start_urls[1]
        request = Request(url=url,
                          callback=self.parse_brand,
                          headers=self.get_headers())  # , cookies=self.cookies
        # Redis keys: crawled urls / usable cookies / unusable cookies.
        request.meta['url_redis'] = url_redis
        request.meta['cookies_redis'] = cookies_chezhen
        request.meta['useless_cookies'] = unuseless_cookies_chezhen
        yield request

    def parse_brand(self, response):
        brand_base = response.xpath("//dl/dd/a")
        for brand in brand_base:
            brandname = brand.xpath('./text()').extract_first()
            url = brand.xpath('./@href').extract_first()
            request = response.follow(url=url,
                                      callback=self.parse_type,
                                      headers=self.get_headers())  # , cookies=self.cookies
            request.meta['brand'] = brandname
            # Redis keys: crawled urls / usable cookies / unusable cookies.
            request.meta['url_redis'] = url_redis
            request.meta['cookies_redis'] = cookies_chezhen
            request.meta['useless_cookies'] = unuseless_cookies_chezhen
            yield request

    def parse_type(self, response):
        car_type_dd = response.xpath('//dl[@class="listdl"]/dd')
        for dd in car_type_dd:
            car_type = dd.xpath('./text()').extract_first()
            car_type_id = dd.xpath('./@data-id').extract_first()
            request_url = self.type_url % car_type_id
            result = requests.get(request_url, headers=self.get_headers())
            # NOTE: json.loads(result.text) would be safer than eval here.
            version_data = eval(result.text).get('data')
            if version_data:
                for version in version_data:
                    key = version.get("key")
                    value = version.get('value')
                    detail_url = self.detail_requesturl.format(
                        catid=car_type_id, versionkey=key)
                    request = Request(url=detail_url,
                                      callback=self.parse_detail,
                                      headers=self.get_headers())  # , cookies=self.cookies
                    request.meta['brand'] = response.meta['brand']
                    request.meta['type'] = car_type
                    request.meta['cartypeid'] = car_type_id
                    request.meta['versionkey'] = key
                    # Redis keys: crawled urls / usable cookies / unusable cookies.
                    request.meta['url_redis'] = url_redis
                    request.meta['cookies_redis'] = cookies_chezhen
                    request.meta['useless_cookies'] = unuseless_cookies_chezhen
                    yield request

    # Model detail listing page.
    def parse_detail(self, response):
        brand = response.meta['brand']
        cartype = response.meta['type']
        # Remember to carry these along when requesting the next page.
        cartypeid = response.meta['cartypeid']
        versionkey = response.meta['versionkey']
        # Parse the data in each <li>.
        base_xpath = response.xpath('//ul[@class="carlist"]/li')
        requesturl = response.url
        for li in base_xpath:
            carstyle = li.xpath(
                './a/div[@class="c_title"]/p[@class="lt"]/text()').extract_first()
            price = li.xpath(
                './a/div[@class="c_title"]/p[@class="rt"]/text()').extract_first()
            version = li.xpath(
                './a/div[@class="detail"]/p[@class="lt"]/text()').extract_first()
            guide_price = li.xpath(
                './a/div[@class="detail"]/p[@class="rt"]/text()').extract_first()
            config = li.xpath('./a/div[@class="intro"][2]/text()').extract_first()
            carstyle = deal_style(carstyle)
            year = deal_year(carstyle, self)
            # Displacement.
            displacement = deal_displacement(carstyle, self)
            # Guide price.
            guide_price = deal_guideprice(guide_price, carstyle, self)
            # Decide the import version from the <p> tag value on the detail page.
            if China in version:
                if "进口" in carstyle:
                    version_num = 1
                else:
                    version_num = 0
            elif USA in version:
                version_num = 2
            elif Canada in version:
                version_num = 3
            elif Mexico in version:
                version_num = 4
            elif European in version:
                version_num = 5
            else:
                version_num = "平行None"
            updatetime = li.xpath(
                './a/div[@class="attach"]/p[@class="rt"]/text()').extract_first()
            updatetime = deal_updatetime(updatetime)
            detail_url = li.xpath('./a/@href').extract_first()
            if detail_url:
                detail_url = self.start_urls[0] + detail_url
            else:
                detail_url = ""
            get_item = sav_item(brand=brand, cartype=cartype, year=year,
                                carstyle=carstyle, guide_price=guide_price,
                                displacement=displacement, config=config,
                                version_num=version_num, price=price, volume=None,
                                platform=3, requesturl=detail_url,
                                rediskey=url_redis, updatetime=updatetime)
            yield get_item
        # Lazy loading: request the next batch using the last item's data-id.
        lastid = response.xpath(
            '//ul[@class="carlist"]/li[last()]/@data-id').extract_first()
        if lastid:
            request_url = self.detail_requesturl.format(
                catid=cartypeid,
                versionkey=versionkey) + "?id={lastid}".format(lastid=lastid)
            request = response.follow(url=request_url,
                                      callback=self.parse_detail,
                                      headers=self.get_headers())  # , cookies=self.cookies
            # Add request meta info; carry these along when requesting the next page.
            request.meta['brand'] = brand
            request.meta['type'] = cartype
            request.meta['cartypeid'] = cartypeid
            request.meta['versionkey'] = versionkey
            # Redis keys: crawled urls / usable cookies / unusable cookies.
            request.meta['url_redis'] = url_redis
            request.meta['cookies_redis'] = cookies_chezhen
            request.meta['useless_cookies'] = unuseless_cookies_chezhen
            # self.selflog.logger.info("继续抓取下一页信息:{}".format(request_url))
            yield request
        set_url_r.sadd(url_redis_chezhen, response.url)
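# --- Hedged sketch (not part of the project): what the sav_item() helper used by
# parse_detail() above presumably does, inferred only from its call site and from the
# keys the pipeline reads. The real helper lives elsewhere in the project;
# CarStyleItem is assumed to be the project's item class (a field sketch follows
# Car168Spider below), and the exact field mapping is an assumption.
def sav_item(brand, cartype, year, carstyle, guide_price, displacement, config,
             version_num, price, volume, platform, requesturl, rediskey, updatetime):
    item = CarStyleItem()
    item['brand'] = brand
    item['type'] = cartype
    item['year'] = year
    item['style'] = carstyle
    item['guide_price'] = guide_price
    item['displacement'] = displacement
    item['configuration'] = config
    item['version'] = version_num
    item['price'] = price
    item['volume'] = volume
    item['status'] = None
    item['platform'] = platform
    item['detail_url'] = requesturl
    item['rediskey'] = rediskey
    item['updatetime'] = updatetime
    return item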
class Car168Spider(scrapy.Spider):
    name = 'car168'
    allowed_domains = ['www.chehang168.com']
    start_urls = ['http://www.chehang168.com/',
                  'http://www.chehang168.com/index.php?c=index&m=allBrands',
                  "http://www.chehang168.com/index.php?c=index&m=Cardata"]
    selflog = SelfLog(name)
    i = 0

    def get_headers(self):
        headers = {
            "User-Agent": UserAgent().random,
        }
        return headers

    def start_requests(self):
        url = "http://www.chehang168.com/index.php?c=index&m=series&psid=%s"
        for key in type_r.keys():
            key = key.decode()
            request = Request(url=url % key,
                              callback=self.parse_select_cartype,
                              headers=self.get_headers())  # , cookies=self.get_cookies("")
            value = type_r.get(key)
            if value:
                request.meta['type'] = value.decode()
            else:
                request.meta['type'] = None
            # Redis keys: crawled urls / usable cookies / unusable cookies.
            request.meta['url_redis'] = url_redis
            request.meta['cookies_redis'] = cookies_car168
            request.meta['useless_cookies'] = unuseless_cookies_car168
            yield request

    # Listing detail page info.
    def parse_select_cartype(self, response):
        lastcookie = response.request.cookies
        # Two cases: on the first page, build requests for page 2 through the
        # last page; on any other page, skip this step.
        isfirstpage = response.xpath(
            '//p[@class="pagenum"]/span[@class="current"]/text()').extract_first()
        if isfirstpage == "1":
            lastpage = response.xpath(
                '//p[@class="pagenum"]//a[last()-2]/text()').extract_first()
            pageurl = response.url + "&pricetype=0&page=%s"
            for pagenum in range(2, int(lastpage) + 1):
                nextpage_url = pageurl % pagenum
                # self.selflog.logger.info("继续请求下一页信息:%s" % nextpage_url)
                request_nextpage = response.follow(
                    url=nextpage_url,
                    callback=self.parse_select_cartype,
                    headers=self.get_headers())  # , cookies=self.get_cookies(lastcookie)
                request_nextpage.meta['type'] = response.meta['type']
                # Redis keys: crawled urls / usable cookies / unusable cookies.
                request_nextpage.meta['url_redis'] = url_redis
                request_nextpage.meta['cookies_redis'] = cookies_car168
                request_nextpage.meta['useless_cookies'] = unuseless_cookies_car168
                yield request_nextpage
        # type_label = response.meta['type_lable']  # 车型
        # *** new approach: brand, type_lable, type
        self.i += 1
        type_lable = response.xpath(
            '//div[@class="sx_left o_cf"]/a[1]/span/text()').extract_first()
        brand = response.xpath(
            '//div[@class="ch_crumb o_w mar8"]/a[3]/text()').extract_first()
        base_car_style = response.xpath('//div[@class="cheyuan_list"]/ul[2]/li')
        for li in base_car_style:
            car_style = li.xpath('./div/h3/a/text()').extract_first()
            # Strip whitespace from the style and separate tokens with a single space.
            car_style = deal_style(car_style)
            # Configuration; may be empty.
            config = li.xpath('./p[@class="c2"]/text()').extract_first()
            # Displacement.
            displacement = deal_displacement(car_style, self)
            # Model year.
            year = deal_year(car_style, self)
            # Import type.
            version = li.xpath('./p[@class="c1"]/text()').extract_first()
            version = str(version)
            if China in version or "国产" in version:
                version_num = 0
            elif ChinaImport in version:
                version_num = 1
            elif USA in version:
                version_num = 2
            elif Canada in version:
                version_num = 3
            elif Mexico in version:
                version_num = 4
            elif European in version:
                version_num = 5
            else:
                self.selflog.logger.info(
                    "车型{car_style}的进口版本不在规定内".format(car_style=car_style))
                version_num = None
            # Guide price, e.g. "指导价:25.27万下20点".
            guide_price = li.xpath('./div/h3/b/text()').extract_first()
            guide_price = deal_guideprice(guide_price, car_style, self)
            # Price currency marker ¥.
            price_flag = li.xpath('./div/span/text()').extract_first()
            price = li.xpath('./div/span/b/text()').extract_first()
            # Sales volume, e.g. "车源成交量:3单".
            volume = li.xpath('./p/cite[4]/text()').extract_first()
            if volume:
                search_volume = re.search(r"(\d+)单", volume)
                if search_volume:
                    volume = search_volume.group(1)
            # Update time.
            updatetime = li.xpath('./p[@class="c3"]/cite[1]/text()').extract_first()
            updatetime = deal_updatetime(updatetime)
            # Detail page info.
            detail_url = li.xpath('./div/h3/a/@href').extract_first()
            if detail_url:
                detail_url = self.start_urls[0] + str(detail_url)
            else:
                detail_url = ""
            carstyleitem = CarStyleItem()
            carstyleitem['brand'] = brand
            carstyleitem["type"] = response.meta['type']
            carstyleitem['year'] = year
            carstyleitem['style'] = car_style
            carstyleitem['configuration'] = config
            carstyleitem['displacement'] = displacement
            carstyleitem['volume'] = volume
            carstyleitem['status'] = None
            carstyleitem['version'] = version_num
            carstyleitem['guide_price'] = guide_price
            carstyleitem['price'] = price
            carstyleitem['platform'] = 1
            carstyleitem['rediskey'] = url_redis
            # Detail page url and update time.
            carstyleitem['detail_url'] = detail_url
            carstyleitem['updatetime'] = updatetime
            yield carstyleitem
        # Add this page's url to the Redis set. If it was crawled before the add is a
        # no-op and the url is effectively filtered; how to tell when everything has
        # been added is still an open question...
        set_url_r.sadd(url_redis_car168, response.url)
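# --- Hedged sketch (not part of the project): the CarStyleItem assumed by the spiders
# and the pipeline above. The class itself is not shown in this section; the field
# list is inferred from the keys assigned in the parse callbacks and read in
# process_item(), and the table_name attribute is an assumption based on the
# car_style table used by the pipeline.
import scrapy


class CarStyleItem(scrapy.Item):
    table_name = "car_style"  # assumption: process_item() reads item.table_name
    brand = scrapy.Field()
    type = scrapy.Field()
    year = scrapy.Field()
    style = scrapy.Field()
    configuration = scrapy.Field()
    displacement = scrapy.Field()
    version = scrapy.Field()
    guide_price = scrapy.Field()
    price = scrapy.Field()
    volume = scrapy.Field()
    status = scrapy.Field()
    platform = scrapy.Field()
    rediskey = scrapy.Field()
    detail_url = scrapy.Field()
    updatetime = scrapy.Field()
    requesturl = scrapy.Field()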
class NnqcSpider(scrapy.Spider):
    name = 'nnqc'
    allowed_domains = ['www.niuniuqiche.com']
    start_urls = [
        'http://www.niuniuqiche.com/',
        'http://www.niuniuqiche.com/v2/brands'
    ]
    i = 0
    selflog = SelfLog(name)

    def get_headers(self):
        # sql = "select ua from useragent order by rand() limit 1;"
        headers = {
            # "User-Agent": dbhelper.readall(sql)[0][0],
            "User-Agent": UserAgent().random,
            "Content-Type": "text/html; charset=utf-8",
            "Host": "www.niuniuqiche.com",
        }
        return headers

    # def get_cookie(self, newcookie):
    #     cookies_dict = random.choice(r_zet_cookie.zscan(cookies_nnqc)[1])
    #     dicts = json.loads(cookies_dict[0])
    #     phonenum = cookies_dict[1]
    #     print("牛牛汽车每次请求时候的手机号:{phonenum}, cookie:{cookie}".format(phonenum=phonenum, cookie=dicts))
    #     return dicts

    def start_requests(self):
        self.i += 1
        print("请求总数", self.i)
        request = Request(
            url=self.start_urls[1],
            callback=self.parse,
        )  # dont_filter=True, headers=self.get_headers(), cookies=self.get_cookie(""), meta={'proxy': self.get_proxy()}
        # Redis keys: crawled urls / usable cookies / unusable cookies.
        request.meta['url_redis'] = url_redis
        request.meta['cookies_redis'] = cookies_nnqc
        request.meta['useless_cookies'] = unuseless_cookies_nnqc
        request.meta['handle_httpstatus_list'] = [302]
        yield request

    # Listing page of all car brands.
    def parse(self, response):
        self.i += 1
        print("请求总数", self.i)
        cookie = response.headers.getlist('Set-Cookie')
        base_div = response.xpath(
            '//div[@class="listing-brands"]/div[@class="item"]/div[@class="brands"]//div[@class="brand"]'
        )
        # NOTE: get_cookie() is commented out above, so this call fails unless it is restored.
        cookie_dicts = self.get_cookie(cookie)
        for base_xpath in base_div:
            url = base_xpath.xpath('./a/@href').extract_first()
            brand = base_xpath.xpath('./a/text()').extract_first()
            request = response.follow(
                url=self.start_urls[0][:-1] + url,
                cookies=cookie_dicts,
                callback=self.config_parse,
            )  # dont_filter=True, headers=self.get_headers()
            request.meta['brand'] = brand
            # Redis keys: crawled urls / usable cookies / unusable cookies.
            request.meta['url_redis'] = url_redis
            request.meta['cookies_redis'] = cookies_nnqc
            request.meta['useless_cookies'] = unuseless_cookies_nnqc
            request.meta['handle_httpstatus_list'] = [302]
            yield request

    # Detail page with model selection.
    def config_parse(self, response):
        self.i += 1
        print("请求总数", self.i)
        cookie = response.headers.getlist('Set-Cookie')
        type_lable_div = response.xpath('//ul[@class="tab-content"]/li')
        for div in type_lable_div:
            table_lable = div.xpath('./div[@class="col-sm-3"]/text()').extract_first()
            type_div = div.xpath('./div[@class="col-sm-9"]')
            for li in type_div:
                car_type = li.xpath('.//div/a/text()').extract_first()
                car_type_url = li.xpath('.//div/a/@href').extract_first()
                # This is the url of the page to crawl; it needs de-duplication.
                request = response.follow(
                    url=car_type_url,
                    callback=self.parse_select_cartype,
                )  # headers=self.get_headers(), cookies=self.get_cookie(cookie)
                request.meta['table_lable'] = table_lable
                request.meta['car_type'] = car_type
                request.meta['brand'] = response.meta['brand']
                # Redis keys: crawled urls / usable cookies / unusable cookies.
                request.meta['url_redis'] = url_redis
                request.meta['cookies_redis'] = cookies_nnqc
                request.meta['useless_cookies'] = unuseless_cookies_nnqc
                request.meta['handle_httpstatus_list'] = [302]
                yield request

    # All second-level model pages.
    def parse_select_cartype(self, response):
        self.i += 1
        print("请求总数", self.i)
        cookie = response.headers.getlist('Set-Cookie')
        # If the <a> link carries next-page info, request the next page.
        # Two cases: on the first page, build requests for page 2 through the
        # last page; on any other page, skip this step.
        isfirstpage = response.xpath(
            '//div[@class="section-pagination"]//span[@class="page current"]/text()'
        ).extract_first()
        lastpage = response.xpath(
            '//div[@class="section-pagination"]//span[@class="last"]/a/@href'
        ).extract_first()
        print("详情页的页码信息", isfirstpage, lastpage)
        if isfirstpage and lastpage:
            isfirstpage = isfirstpage.strip()
            lastpage_url = lastpage.split('page=')[0]
            lastpage_pagenum = lastpage.split('page=')[1]
            if isfirstpage == "1":
                pageurl = response.url + "&page=%s"
                for pagenum in range(2, int(lastpage_pagenum) + 1):
                    nextpage_url = self.start_urls[0] + lastpage_url + "&page=%s" % pagenum
                    # self.selflog.logger.info("继续请求下一页信息:%s" % nextpage_url)
                    request_nextpage = response.follow(
                        url=nextpage_url,
                        callback=self.parse_select_cartype,
                    )  # headers=self.get_headers()
                    # Redis keys: crawled urls / usable cookies / unusable cookies.
                    request_nextpage.meta['url_redis'] = url_redis
                    request_nextpage.meta['cookies_redis'] = cookies_nnqc
                    request_nextpage.meta['useless_cookies'] = unuseless_cookies_nnqc
                    request_nextpage.meta['handle_httpstatus_list'] = [302]
                    request_nextpage.meta['table_lable'] = response.meta['table_lable']
                    request_nextpage.meta['brand'] = response.meta['brand']
                    request_nextpage.meta['car_type'] = response.meta['car_type']
                    yield request_nextpage
        type_label = response.meta['table_lable']
        # Car styles on this page.
        base_car_style = response.xpath('//div[@class="item"]')
        requesturl = response.url
        for div in base_car_style:
            car_style = div.xpath('./div[@class="car-title"]/a/text()').extract_first()
            # Strip whitespace from the style and separate tokens with a single space.
            car_style = deal_style(car_style)
            # Configuration; not available here, so left empty.
            # config = li.xpath('./p[@class="c2"]/text()').extract_first()
            config = None
            # Displacement.
            displacement = deal_displacement(car_style, self)
            # Model year.
            year = deal_year(car_style, self)
            # Import type.
            version = div.xpath(
                './div[@class="car-subtitle clearfix"]/span/text()').extract_first()
            # Parallel-import handling.
            if "平行进口" in type_label:
                if USA in version or "美规" in version:
                    version_num = 2
                elif Canada in version:
                    version_num = 3
                elif Mexico in version or "墨西哥版" in version:
                    version_num = 4
                elif European in version:
                    version_num = 5
                else:
                    version_num = "平行None"
            # Import handling.
            elif "进口" in type_label:
                version_num = 1
            # China-spec handling.
            else:
                if China in version or "国产" in version:
                    version_num = 0
                elif ChinaImport in version:
                    version_num = 1
                else:
                    self.selflog.logger.info(
                        "车型{car_style}的进口版本不在规定内".format(car_style=car_style))
                    version_num = None
            # Guide price, e.g. "指导价:25.27万下20点".
            guide_price = div.xpath(
                './div/div[@class="car-guide-price"]/text()').extract_first()
            guide_price = deal_guideprice(guide_price, car_style, self)
            price = div.xpath('./div/div[@class="car-price"]/text()').extract_first()
            # Sales volume, e.g. "车源成交量:3单".
            volume = div.xpath(
                './div[@class="user-info clearfix"]/span[3]/text()').extract_first()
            if volume:
                search_volume = re.search(r"\d+", volume)
                if search_volume:
                    volume = search_volume.group()
            # Update time and detail page url.
            detail_url = div.xpath('./div[@class="car-title"]/a/@href').extract_first()
            if detail_url:
                detail_url = self.start_urls[0] + detail_url
            else:
                detail_url = ""
            updatetime = div.xpath(
                './div[@class="user-info clearfix"]/div[@class="car-publish-time"]/text()'
            ).extract_first()
            if updatetime:
                # Normalize to a uniform format, e.g. 05-24.
                if "/" in updatetime:
                    updatetime_change = "-".join(updatetime.split('/'))
                    updatetime = str(datetime.datetime.now().year
                                     ) + "-" + updatetime_change + " 00:00"
                elif ":" in updatetime:
                    updatetime = datetime.datetime.now().strftime(
                        '%Y-%m-%d') + " " + updatetime
            else:
                updatetime = ""
            carstyleitem = CarStyleItem()
            carstyleitem['brand'] = response.meta['brand']
            carstyleitem["type"] = response.meta['car_type']
            carstyleitem['year'] = year
            carstyleitem['style'] = car_style
            carstyleitem["requesturl"] = requesturl
            carstyleitem['configuration'] = str(config)
            carstyleitem['displacement'] = displacement
            carstyleitem['version'] = str(version_num)
            carstyleitem['guide_price'] = guide_price
            carstyleitem['price'] = price
            carstyleitem['volume'] = volume
            carstyleitem['status'] = "None"
            carstyleitem['platform'] = 2
            carstyleitem['rediskey'] = url_redis
            carstyleitem['detail_url'] = detail_url
            carstyleitem['updatetime'] = updatetime
            yield carstyleitem
        set_url_r.sadd(url_redis_nnqc, response.url)
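# --- Hedged sketch (not part of the project): running the three spiders defined above
# from a single script. CrawlerProcess and get_project_settings are standard Scrapy
# APIs; whether the project is actually launched this way (rather than with
# "scrapy crawl <name>") is an assumption.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == "__main__":
    process = CrawlerProcess(get_project_settings())
    process.crawl(ChezhenSpider)
    process.crawl(Car168Spider)
    process.crawl(NnqcSpider)
    process.start()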