Example #1
 def process_response(self, request, response, spider):
     selferrorlog = SelfLog('error')
     selfinfolog = SelfLog(spider.name)
     # Move the cookie from the usable pool into the unusable pool
     cookie_redis_key_hash = request.meta['cookies_redis']
     cookie_redis_key = request.meta['cookies_redis'] + "_list"
     unuse_cookie_redis_key = request.meta['useless_cookies']
     if response.status == 302:
         print(response.text)
         selferrorlog.logger.error(
             "{spidername}-被封,302重定向到登录界面{cookie}:".format(
                 spidername=spider.name, cookie=request.cookies))
         request = self.dealcookie(request, response, spider)
         return request
     elif "c=com&m=limitPage" in response.text:
         selferrorlog.logger.error(
             "{spidername}-重定向到限制界面, cookie值:{cookie}".format(
                 spidername=spider.name, cookie=request.cookies))
         request = self.dealcookie(request, response, spider)
         return request
     elif "请重新登录" in response.text:
         selferrorlog.logger.error(
             "{spidername}-cookie:{cookies}过期,或者IP不一致,到登录界面".format(
                 spidername=spider.name, cookies=request.cookies))
         request = self.dealcookie(request, response, spider)
         return request
     selfinfolog.logger.info("请求url:{url}使用的cookie:{cookie}".format(
         url=response.url, cookie=request.cookies))
     return response
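
Example #1 is the process_response hook of a Scrapy downloader middleware: when the page signals a ban (a 302 redirect to the login page or a known marker string in the body), it swaps the cookie via dealcookie and returns the rebuilt Request so Scrapy re-schedules it; otherwise it passes the Response through. A stripped-down sketch of that retry pattern (the class name, marker tuple and swap_cookie helper are illustrative, not the project's actual code):

class CookieRetryMiddleware:
    # Marker substrings that indicate the account/cookie has been blocked (assumed examples).
    BAN_MARKERS = ("c=com&m=limitPage", "请重新登录")

    def process_response(self, request, response, spider):
        banned = response.status == 302 or any(m in response.text for m in self.BAN_MARKERS)
        if banned:
            request.cookies = self.swap_cookie(request, spider)  # rotate to a fresh cookie
            request.dont_filter = True   # so the retry is not dropped by the duplicate filter
            return request               # returning a Request makes Scrapy re-schedule it
        return response

    def swap_cookie(self, request, spider):
        # Placeholder; see dealcookie in Example #3 for the real Redis bookkeeping.
        return request.cookies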
Example #2
 def get_cookies(self, request, spider):
     selflog = SelfLog('error')
     cookie_redis_key_list = request.meta['cookies_redis'] + "_list"
     cookie_redis_key_hash = request.meta['cookies_redis']
     unuse_cookie_redis_key = request.meta['useless_cookies']
     try:
         # cookies_dict = random.choice(self.cookies_deal_r.zscan(cookie_redis_key)[1])
         # Pop the cookie and push it back to the end of the queue
         popcookie = self.cookies_deal_r.lpop(cookie_redis_key_list)
         self.cookies_deal_r.rpush(cookie_redis_key_list, popcookie)
     except Exception as e:
         selflog.logger.error(
             "spidername:{spidername} 的cookie 耗尽请补充, 错误信息:{e}".format(
                 spidername=spider.name, e=e))
         # Send an email alert; ideally the handler should also be able to close the spider
         sendEmail(content="{cookname}cookie耗尽,请尽快处理".format(
             cookname=cookie_redis_key_list))
         spider.crawler.engine.close_spider(
             spider, "{cookname}cookie耗尽,关闭爬虫".format(
                 cookname=cookie_redis_key_list))
     else:
         dicts = json.loads(popcookie)
         phonenum = self.cookies_deal_r.hget(cookie_redis_key_hash,
                                             popcookie)
         print("{cookie_redis}--手机号:{phonenum}--cookie:{cookie}".format(
             cookie_redis=cookie_redis_key_hash,
             phonenum=phonenum,
             cookie=dicts))
         return dicts
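
Example #2 treats the Redis list as a circular queue: LPOP takes the cookie at the head and RPUSH immediately puts it back at the tail, so successive requests cycle through every stored account while the hash maps each cookie back to its phone number. A minimal redis-py sketch of that rotation (key names and connection settings are assumptions):

import json
import redis

r = redis.Redis(host="localhost", port=6379, db=0)
COOKIE_LIST_KEY = "cookies_chezhen_list"   # hypothetical rotation queue of JSON-encoded cookies

def next_cookie():
    """Round-robin: pop the head of the list, push it back to the tail, return it as a dict."""
    raw = r.lpop(COOKIE_LIST_KEY)
    if raw is None:
        return None                        # queue exhausted; caller should alert and stop
    r.rpush(COOKIE_LIST_KEY, raw)          # keep the cookie in rotation
    return json.loads(raw)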
Example #3
    def dealcookie(self, request, response, spider):
        selflog = SelfLog('error')
        cookie_redis_key_hash = request.meta['cookies_redis']
        cookie_redis_key_list = request.meta['cookies_redis'] + "_list"
        unuse_cookie_redis_key = request.meta['useless_cookies']

        redis_member = json.dumps(request.cookies)

        # Look up the phone number in the hash
        zset_phone = self.cookies_deal_r.hget(cookie_redis_key_hash,
                                              redis_member)
        # Remove the entry from the list and the usable hash, and add it to the unusable hash
        self.cookies_deal_r.lrem(cookie_redis_key_list, 0, redis_member)
        self.cookies_deal_r.hdel(cookie_redis_key_hash, redis_member)
        self.cookies_deal_r.hset(unuse_cookie_redis_key, redis_member,
                                 zset_phone)
        # Pop another cookie from redis and build the request with it
        try:
            popcookie = self.cookies_deal_r.lpop(cookie_redis_key_list)
            self.cookies_deal_r.rpush(cookie_redis_key_list, popcookie)

        except Exception as e:
            selflog.logger.error("{spidername}--cookie 耗尽请补充, 错误信息:{e}".format(
                spidername=spider.name, e=e))
            # Send an email alert; ideally the handler should also be able to close the spider
            sendEmail(content="{cookname}cookie耗尽,请尽快处理".format(
                cookname=cookie_redis_key_list))
            spider.crawler.engine.close_spider(
                spider, "{cookname}cookie耗尽,关闭爬虫".format(
                    cookname=cookie_redis_key_list))

        else:
            request.cookies = json.loads(popcookie)
            return request
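
Example #3 retires a banned cookie by removing it from both active structures (the rotation list and the cookie-to-phone hash), recording it in the "useless" hash, then popping the next cookie and reusing the same Request with it. A compact sketch of that quarantine step, assuming redis-py and the same three keys carried in request.meta:

import json
import redis

r = redis.Redis()

def quarantine_cookie(list_key, hash_key, useless_key, cookies_dict):
    """Move a dead cookie out of the active list/hash and park it in the useless hash."""
    member = json.dumps(cookies_dict)
    phone = r.hget(hash_key, member)            # which account the cookie belonged to
    r.lrem(list_key, 0, member)                 # drop every copy from the rotation list
    r.hdel(hash_key, member)                    # drop the cookie -> phone mapping
    r.hset(useless_key, member, phone or b"")   # keep it around for re-login / inspection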
Example #4
    def process_request(self, request, spider):
        selflog = SelfLog(spider.name)
        # Pick proxy and cookies according to the spider name
        if spider.name == "nnqc":
            request.meta['http_proxy'] = self.get_proxy()
            print("{}使用的代理为{},请求url:{}".format(spider.name,
                                               request.meta['http_proxy'],
                                               request.url))
        elif spider.name == "chezhen":
            request.meta['http_proxy'] = self.get_proxy()
            print("{}使用的代理为{},请求url:{}".format(spider.name,
                                               request.meta['http_proxy'],
                                               request.url))
        elif spider.name == "car168":
            pass

        # Add cookies
        request.cookies = self.get_cookies(request, spider)
        # cookie_dict = self.get_cookies(request, spider)
        # cookie = ""
        # for k, v in cookie_dict.items():
        #     cookie = cookie + k + "=" + v +";"
        # request.headers["User-Agent"] = UserAgent().random
        # request.headers["Content-Type"] = "text/html; charset=utf-8"
        # request.headers["Cookie"] = cookie

        # url = request.url
        # redis_key = request.meta['url_redis']
        # if self.add_url_r.sismember(redis_key, url):
        #     spider.logger.info("该url已经爬取,舍弃:%s"%url)
        #     raise IgnoreRequest
        return None
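
Examples #1 through #4 are methods of a downloader middleware, so they only run once the class is registered in the Scrapy project settings. A hypothetical wiring (module path and class name are placeholders):

# settings.py
DOWNLOADER_MIDDLEWARES = {
    # lower numbers run earlier in process_request and later in process_response
    "myproject.middlewares.CookieProxyMiddleware": 543,
}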
Example #5
    def process_item(self, item, spider):

        selflog = SelfLog(spider.name)
        selflog_error = SelfLog("error")
        keys = [
            "brand", "type", "year", "style", "guide_price", "displacement",
            "configuration", "version", "status"
        ]
        values = [item[i] for i in keys]
        # Before inserting car-style data, check whether it already exists in the database; if it does, update it when the update conditions are met, otherwise save it
        sql_search_style = "select id, brand, type from car_style where brand=%s and type=%s and year=%s and style=%s and guide_price=%s"
        search_db_result = Mysqlpython().readall(sql_search_style, [
            item["brand"], item['type'], item['year'], item['style'],
            item['guide_price']
        ])

        if search_db_result:
            commit_id = search_db_result[0][0]
            # selflog.logger.info("出现的重复车型:%s", search_db_result[0])
            update_set = "update car_style set status=null "
            update_where = " where id=%s "
            sqlparam = [commit_id]

            if item['guide_price'] and item['guide_price'] != "None":
                update_set += " , guide_price=%s"
                update_where += " and (guide_price is null or guide_price='None')"
                sqlparam.append(item['guide_price'])
            if item['displacement'] and item['displacement'] != "None":
                update_set += " , displacement=%s"
                update_where += " and (displacement is null or displacement='None')"
                sqlparam.append(item['displacement'])
            if item['version'] and item['version'] != "None":
                update_set += " , version=%s"
                update_where += " and (version is null or version='None')"
                sqlparam.append(item['version'])
            # If update_set is still the original value, skip the update
            if update_set == "update car_style set status=null ":
                pass
            else:
                sql_update = update_set + update_where
                # selflog.logger.info(
                #     "执行更新sql:{sql_update}, values:{values}, id:{commit_id}".format(sql_update=sql_update,
                #                                                                    values=sqlparam,
                #                                                                    commit_id=commit_id))
                try:
                    self.cur.execute(sql_update, sqlparam)
                    self.conn.commit()
                except Exception as e:
                    selflog_error.logger.error(
                        "更新sql出错:{sql_update}, values:{values}, id:{commit_id},e:{e}"
                        .format(sql_update=sql_update,
                                values=sqlparam,
                                commit_id=commit_id,
                                e=e))
        # If no match is found, insert the car style along with its price, update time and other details
        else:
            # Insert into the car style table
            sql = "insert into `{}` ({}) values ({})".format(
                item.table_name,
                ','.join(keys),
                # Placeholder-style insertion keeps compatibility with data other than strings
                ','.join(['%s'] * len(values)),
            )
            try:
                self.cur.execute(sql, values)
                self.conn.commit()
            except Exception as e:
                selflog_error.logger.info(
                    "{spidername} 插入车型表出错e:{e}sql:{sql}, --values:{values}".
                    format(spidername=spider.name, e=e, sql=sql,
                           values=values))
                commit_id = None
            else:
                commit_id = self.cur.lastrowid

        # Skip the insert if there is neither a price nor a sales volume
        if commit_id and (item['price'] or item['volume']):

            # Check the redis hash for this detail page (detail-url -> updatetime##saved-db-id):
            # if it exists and the update time differs, insert the data and then refresh the redis value;
            # if it does not exist, store it in both redis and the database
            requesturl = item['detail_url']
            rediskey = item['rediskey']
            # Pull the update time and saved id out of the hash value
            hash_value = self.save_url_r.hget(rediskey, requesturl)
            if hash_value:
                hash_value = hash_value.decode()
                updatetime = hash_value.split('##')[0]
                save_id = hash_value.split('##')[1]

                if updatetime == item['updatetime']:
                    pass
                # otherwise perform the insert
                else:
                    self.insert_detaildata(spider, commit_id, item,
                                           selflog_error, rediskey)
                #     sql_update_detail = "update car_detail set updatetime=%s, price=%s, volume=%s where id=%s".format(
                #         updatetime=item['updatetime'], price=item['price'],volume=item['volume'], save_id=save_id)
                #     sql_insert_detail = "insert "
                #     try:
                #         self.cur.execute(sql_insert_detail, [item['updatetime'], item['price'],item['volume'], save_id])
                #         # 更新redis的值
                #         self.save_url_r.hset(rediskey, requesturl, item['updatetime'] + "##" + str(self.cur.lastrowid))
                #         print("{spidername}--更新爬取价格数据:{updata}".format(spidername=spider.name, updata=item['updatetime'] + "##" + save_id))
                #
                #         # selflog.logger.info("更新爬取价格数据:{updata}".format(updata=item['updatetime'] + "##" + save_id))
                #     except Exception as e:
                #         selflog_error.logger.error("{spidername}更新爬取车价详情出错e:{e}__sql:{sql}".format(spidername=spider.name, e=e, sql=sql_update_detail))
            # Not found in redis: write straight to the database and add detail_url -> updatetime##save_id to redis
            else:
                self.insert_detaildata(spider, commit_id, item, selflog_error,
                                       rediskey)
                # selflog.logger.info("写入爬取价格数据:{updata}".format(updata=item['updatetime'] + "##" + str(self.cur.lastrowid)))

                # self.save_url_r.sadd(item['rediskey'], requesturl)

        else:
            selflog.logger.info(
                "没有交易价格和交易量数据,car_detial表不进行插入key:%s, --values:%s" %
                (keys, values))
        return item
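
Example #5 deduplicates detail pages through a Redis hash that maps each detail_url to the string "updatetime##saved_row_id": a new detail row is only written when the url is unseen or its update time has changed. A rough sketch of that check (the helper name is illustrative, not from the project):

import redis

r = redis.Redis()

def should_insert_detail(redis_key, detail_url, new_updatetime):
    """Return (insert?, saved_row_id) based on the stored 'updatetime##id' value."""
    raw = r.hget(redis_key, detail_url)
    if raw is None:
        return True, None                            # never seen: insert and record it afterwards
    old_updatetime, row_id = raw.decode().split("##", 1)
    return old_updatetime != new_updatetime, row_id  # re-insert only when the page has changed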
Example #6
class ChezhenSpider(scrapy.Spider):
    name = 'chezhen'
    selflog = SelfLog(name)

    cookies = {
        "CzcyAutoLogin":
        "******",
    }

    # API for querying the car import type
    type_url = "http://www.chezhency.com/Home/ncar/gettype?cat_id=%s"
    # URL of the car style detail listing page
    detail_requesturl = "http://www.chezhency.com/Home/ncar/index/cat_id/{catid}/type_id/{versionkey}/timer/2.html"
    allowed_domains = ['www.chezhency.com']
    start_urls = [
        'http://www.chezhency.com/',
        "http://www.chezhency.com/home/index/indexbei.html"
    ]

    # Build request headers
    def get_headers(self):
        sql = "select ua from useragent order by rand() limit 1;"
        headers = {
            # "User-Agent": dbhelper.readall(sql)[0][0],
            "User-Agent": UserAgent().random,
        }
        return headers

    # # Build cookies
    # def get_cookies(self, lastcookie):
    #
    #     cookie = json.loads(cookie_r.srandmember("cookies_168"))
    #     while cookie == lastcookie:
    #         cookie = json.loads(cookie_r.srandmember("cookies_168"))
    #     return cookie

    # Build the first request
    def start_requests(self):
        url = self.start_urls[1]
        request = Request(url=url,
                          callback=self.parse_brand,
                          headers=self.get_headers())  #, cookies=self.cookies)
        # redis key for storing urls
        request.meta['url_redis'] = url_redis
        # redis key for the usable cookies
        request.meta['cookies_redis'] = cookies_chezhen
        # redis key for the unusable cookies
        request.meta['useless_cookies'] = unuseless_cookies_chezhen
        yield request

    def parse_brand(self, response):
        brand_base = response.xpath("//dl/dd/a")
        for brand in brand_base:
            brandname = brand.xpath('./text()').extract_first()
            url = brand.xpath('./@href').extract_first()
            request = response.follow(
                url=url, callback=self.parse_type,
                headers=self.get_headers())  #, cookies=self.cookies)
            request.meta['brand'] = brandname
            # redis key for storing urls
            request.meta['url_redis'] = url_redis
            # redis key for the usable cookies
            request.meta['cookies_redis'] = cookies_chezhen
            # redis key for the unusable cookies
            request.meta['useless_cookies'] = unuseless_cookies_chezhen
            yield request

    def parse_type(self, response):
        car_type_dd = response.xpath('//dl[@class="listdl"]/dd')
        for dd in car_type_dd:
            car_type = dd.xpath('./text()').extract_first()
            car_type_id = dd.xpath('./@data-id').extract_first()
            request_url = self.type_url % car_type_id
            result = requests.get(request_url, headers=self.get_headers())
            version_data = eval(result.text).get('data')
            if len(version_data):
                for version in version_data:
                    key = version.get("key")
                    value = version.get('value')
                    detail_url = self.detail_requesturl.format(
                        catid=car_type_id, versionkey=key)
                    request = Request(
                        url=detail_url,
                        callback=self.parse_detail,
                        headers=self.get_headers())  # , cookies=self.cookies)
                    request.meta['brand'] = response.meta['brand']
                    request.meta['type'] = car_type
                    request.meta['cartypeid'] = car_type_id
                    request.meta['versionkey'] = key

                    # redis key for storing urls
                    request.meta['url_redis'] = url_redis
                    # redis key for the usable cookies
                    request.meta['cookies_redis'] = cookies_chezhen
                    # redis key for the unusable cookies
                    request.meta['useless_cookies'] = unuseless_cookies_chezhen
                    yield request

    # Car style detail listing page
    def parse_detail(self, response):
        brand = response.meta['brand']
        cartype = response.meta['type']
        # Remember to include this value when requesting the next page
        cartypeid = response.meta['cartypeid']
        versionkey = response.meta['versionkey']

        # Parse the li data
        base_xpath = response.xpath('//ul[@class="carlist"]/li')
        requesturl = response.url
        for li in base_xpath:
            carstyle = li.xpath(
                './a/div[@class="c_title"]/p[@class="lt"]/text()'
            ).extract_first()
            price = li.xpath('./a/div[@class="c_title"]/p[@class="rt"]/text()'
                             ).extract_first()
            version = li.xpath('./a/div[@class="detail"]/p[@class="lt"]/text()'
                               ).extract_first()
            guide_price = li.xpath(
                './a/div[@class="detail"]/p[@class="rt"]/text()'
            ).extract_first()
            config = li.xpath(
                './a/div[@class="intro"][2]/text()').extract_first()

            carstyle = deal_style(carstyle)
            year = deal_year(carstyle, self)

            # Displacement
            displacement = deal_displacement(carstyle, self)
            # Guide price
            guide_price = deal_guideprice(guide_price, carstyle, self)

            # Determine the import version from the detail page <p> tag text
            if China in version:
                if "进口" in carstyle:
                    version_num = 1
                else:
                    version_num = 0
            elif USA in version:
                version_num = 2
            elif Canada in version:
                version_num = 3
            elif Mexico in version:
                version_num = 4
            elif European in version:
                version_num = 5
            else:
                version_num = "平行None"

            updatetime = li.xpath(
                './a/div[@class="attach"]/p[@class="rt"]/text()'
            ).extract_first()
            updatetime = deal_updatetime(updatetime)

            detail_url = li.xpath('./a/@href').extract_first()
            if detail_url:
                detail_url = self.start_urls[0] + detail_url
            else:
                detail_url = ""

            get_item = sav_item(brand=brand,
                                cartype=cartype,
                                year=year,
                                carstyle=carstyle,
                                guide_price=guide_price,
                                displacement=displacement,
                                config=config,
                                version_num=version_num,
                                price=price,
                                volume=None,
                                platform=3,
                                requesturl=detail_url,
                                rediskey=url_redis,
                                updatetime=updatetime)
            yield get_item
        # Lazy loading: request the next batch using the last item's data-id
        lastid = response.xpath(
            '//ul[@class="carlist"]/li[last()]/@data-id').extract_first()
        if lastid:
            request_url = self.detail_requesturl.format(
                catid=cartypeid,
                versionkey=versionkey) + "?id={lastid}".format(lastid=lastid)
            request = response.follow(
                url=request_url,
                callback=self.parse_detail,
                headers=self.get_headers())  # , cookies=self.cookies)
            # Add request meta information
            # Remember to include these values when requesting the next page
            request.meta['brand'] = brand
            request.meta['type'] = cartype
            request.meta['cartypeid'] = cartypeid
            request.meta['versionkey'] = versionkey
            # redis key for storing urls
            request.meta['url_redis'] = url_redis
            # redis key for the usable cookies
            request.meta['cookies_redis'] = cookies_chezhen
            # redis key for the unusable cookies
            request.meta['useless_cookies'] = unuseless_cookies_chezhen
            # self.selflog.logger.info("继续抓取下一页信息:{}".format(request_url))
            yield request

        set_url_r.sadd(url_redis_chezhen, response.url)
Example #7
class Car168Spider(scrapy.Spider):
    name = 'car168'
    allowed_domains = ['www.chehang168.com']
    start_urls = [
        'http://www.chehang168.com/',
        'http://www.chehang168.com/index.php?c=index&m=allBrands',
        "http://www.chehang168.com/index.php?c=index&m=Cardata"
    ]
    selflog = SelfLog(name)
    
    def get_headers(self):
        headers = {
            "User-Agent": UserAgent().random,
        }
        return headers

    i = 0

    def start_requests(self):
        url = "http://www.chehang168.com/index.php?c=index&m=series&psid=%s"

        for key in type_r.keys():
            key = key.decode()
            request = Request(url=url%key, callback=self.parse_select_cartype, headers=self.get_headers())#, cookies=self.get_cookies(""))
            value = type_r.get(key)
            if value:
                request.meta['type'] = value.decode()
            else:
                request.meta['type'] = None

            # redis key for storing urls
            request.meta['url_redis'] = url_redis
            # redis key for the usable cookies
            request.meta['cookies_redis'] = cookies_car168
            # redis key for the unusable cookies
            request.meta['useless_cookies'] = unuseless_cookies_car168
            yield request

    # Listing detail page information
    def parse_select_cartype(self, response):
        lastcookie = response.request.cookies
        # Two cases: if this is the first page, build requests for pages two through the last; otherwise skip
        isfirstpage = response.xpath('//p[@class="pagenum"]/span[@class="current"]/text()').extract_first()
        if isfirstpage == "1":
            lastpage = response.xpath('//p[@class="pagenum"]//a[last()-2]/text()').extract_first()
            pageurl = response.url + "&pricetype=0&page=%s"
            for pagenum in range(2, int(lastpage)+1):
                nextpage_url = pageurl%pagenum
                # self.selflog.logger.info("继续请求下一页信息:%s" % nextpage_url)
                request_nextpage = response.follow(url=nextpage_url, callback=self.parse_select_cartype,
                                                   headers=self.get_headers())#, cookies=self.get_cookies(lastcookie))
                request_nextpage.meta['type'] = response.meta['type']
                # redis key for storing urls
                request_nextpage.meta['url_redis'] = url_redis
                # redis key for the usable cookies
                request_nextpage.meta['cookies_redis'] = cookies_car168
                # redis key for the unusable cookies
                request_nextpage.meta['useless_cookies'] = unuseless_cookies_car168
                yield request_nextpage

        # type_label = response.meta['type_lable']
        # Car type
        # *** brand, type_lable and type for the new approach
        self.i += 1
        type_lable = response.xpath('//div[@class="sx_left o_cf"]/a[1]/span/text()').extract_first()
        brand = response.xpath('//div[@class="ch_crumb o_w mar8"]/a[3]/text()').extract_first()
        base_car_style = response.xpath('//div[@class="cheyuan_list"]/ul[2]/li')

        for li in base_car_style:
            car_style = li.xpath('./div/h3/a/text()').extract_first()
            # Strip whitespace from the style, then separate its parts with a single space
            car_style = deal_style(car_style)
            # Configuration; may be empty
            config = li.xpath('./p[@class="c2"]/text()').extract_first()
            # Displacement
            displacement = deal_displacement(car_style, self)
            # Model year
            year = deal_year(car_style, self)

            # Import type
            version = li.xpath('./p[@class="c1"]/text()').extract_first()
            version = str(version)
            if China in version or "国产" in version:
                version_num = 0
            elif ChinaImport in version:
                version_num = 1
            elif USA in version:
                version_num = 2
            elif Canada in version:
                version_num = 3
            elif Mexico in version:
                version_num = 4
            elif European in version:
                version_num = 5
            else:
                self.selflog.logger.info("车型{car_style}的进口版本不在规定内".format(car_style=car_style))
                version_num = None

            # Guide price, e.g. "指导价:25.27万下20点"
            guide_price = li.xpath('./div/h3/b/text()').extract_first()
            guide_price = deal_guideprice(guide_price, car_style, self)
            # Price marker ¥
            price_flag = li.xpath('./div/span/text()').extract_first()
            price = li.xpath('./div/span/b/text()').extract_first()
            # Sales volume, e.g. "车源成交量:3单"
            volume = li.xpath('./p/cite[4]/text()').extract_first()
            if volume:
                search_volume = re.search("(\d+)单", volume)
                if search_volume:
                    volume = search_volume.group(1)

            # Update time
            updatetime = li.xpath('./p[@class="c3"]/cite[1]/text()').extract_first()
            updatetime = deal_updatetime(updatetime)
            # Detail page info
            detail_url = li.xpath('./div/h3/a/@href').extract_first()
            if detail_url:
                detail_url = self.start_urls[0] + str(detail_url)
            else:
                detail_url = ""

            carstyleitem = CarStyleItem()
            carstyleitem['brand'] = brand
            carstyleitem["type"] = response.meta['type']
            carstyleitem['year'] = year
            carstyleitem['style'] = car_style
            carstyleitem['configuration'] = config
            carstyleitem['displacement'] = displacement
            carstyleitem['volume'] = volume
            carstyleitem['status'] = None
            carstyleitem['version'] = version_num
            carstyleitem['guide_price'] = guide_price
            carstyleitem['price'] = price
            carstyleitem['platform'] = 1
            carstyleitem['rediskey'] = url_redis
            # Detail page url and update time
            carstyleitem['detail_url'] = detail_url
            carstyleitem['updatetime'] = updatetime

            yield carstyleitem

        # Add this page's url to the redis set.
        # If the url has already been crawled, the add condition is not triggered and it gets filtered out;
        # how to tell when everything has been added is still unresolved...
        set_url_r.sadd(url_redis_car168, response.url)
Example #8
class NnqcSpider(scrapy.Spider):
    name = 'nnqc'
    allowed_domains = ['www.niuniuqiche.com']
    start_urls = [
        'http://www.niuniuqiche.com/', 'http://www.niuniuqiche.com/v2/brands'
    ]
    i = 0
    selflog = SelfLog(name)

    def get_headers(self):
        # sql = "select ua from useragent order by rand() limit 1;"
        headers = {
            # "User-Agent": dbhelper.readall(sql)[0][0],
            "User-Agent": UserAgent().random,
            "Content-Type": "text/html; charset=utf-8",
            "Host": "www.niuniuqiche.com",
        }

        return headers

    # def get_cookie(self, newcookie):
    #
    #     cookies_dict = random.choice(r_zet_cookie.zscan(cookies_nnqc)[1])
    #     dicts = json.loads(cookies_dict[0])
    #     phonenum = cookies_dict[1]
    #     print("牛牛汽车每次请求时候的手机号:{phonenum}, cookie:{cookie}".format(phonenum=phonenum, cookie=dicts))
    #     return dicts

    def start_requests(self):
        self.i += 1
        print("请求总数", self.i)
        request = Request(
            url=self.start_urls[1],
            callback=self.parse,
        )  # dont_filter=True)#headers=self.get_headers(),)# cookies=self.get_cookie(""))#,meta={'proxy':self.get_proxy()})
        # redis key for storing urls
        request.meta['url_redis'] = url_redis
        # redis key for the usable cookies
        request.meta['cookies_redis'] = cookies_nnqc
        # redis key for the unusable cookies
        request.meta['useless_cookies'] = unuseless_cookies_nnqc
        request.meta['handle_httpstatus_list'] = [302]
        yield request

    # Listing page of all car brands
    def parse(self, response):
        self.i += 1
        print("请求总数", self.i)
        cookie = response.headers.getlist('Set-Cookie')
        base_div = response.xpath(
            '//div[@class="listing-brands"]/div[@class="item"]/div[@class="brands"]//div[@class="brand"]'
        )
        cookie_dicts = self.get_cookie(cookie)
        for base_xpath in base_div:
            url = base_xpath.xpath('./a/@href').extract_first()
            brand = base_xpath.xpath('./a/text()').extract_first()
            request = response.follow(
                url=self.start_urls[0][:-1] + url,
                cookies=cookie_dicts,
                callback=self.config_parse,
            )  # dont_filter=True)#headers=self.get_headers())
            request.meta['brand'] = brand

            # redis key for storing urls
            request.meta['url_redis'] = url_redis
            # redis key for the usable cookies
            request.meta['cookies_redis'] = cookies_nnqc
            # redis key for the unusable cookies
            request.meta['useless_cookies'] = unuseless_cookies_nnqc
            request.meta['handle_httpstatus_list'] = [302]
            yield request

    # Detail page with car type selection
    def config_parse(self, response):
        self.i += 1
        print("请求总数", self.i)
        cookie = response.headers.getlist('Set-Cookie')
        type_lable_div = response.xpath('//ul[@class="tab-content"]/li')
        for div in type_lable_div:
            table_lable = div.xpath(
                './div[@class="col-sm-3"]/text()').extract_first()
            type_div = div.xpath('./div[@class="col-sm-9"]')
            for li in type_div:
                car_type = li.xpath('.//div/a/text()').extract_first()
                car_type_url = li.xpath('.//div/a/@href').extract_first()
                # This is the url of the page to crawl; it needs de-duplication
                request = response.follow(
                    url=car_type_url,
                    callback=self.parse_select_cartype,
                )  #headers=self.get_headers())#, cookies=self.get_cookie(cookie))
                request.meta['table_lable'] = table_lable
                request.meta['car_type'] = car_type
                request.meta['brand'] = response.meta['brand']

                # redis key for storing urls
                request.meta['url_redis'] = url_redis
                # redis key for the usable cookies
                request.meta['cookies_redis'] = cookies_nnqc
                # redis key for the unusable cookies
                request.meta['useless_cookies'] = unuseless_cookies_nnqc
                request.meta['handle_httpstatus_list'] = [302]
                yield request

    # All second-level car type pages
    def parse_select_cartype(self, response):
        self.i += 1
        print("请求总数", self.i)
        cookie = response.headers.getlist('Set-Cookie')

        # If the <a> links contain next-page info, request the next page.
        # Two cases: if this is the first page, build requests for pages two through the last; otherwise skip
        isfirstpage = response.xpath(
            '//div[@class="section-pagination"]//span[@class="page current"]/text()'
        ).extract_first()
        lastpage = response.xpath(
            '//div[@class="section-pagination"]//span[@class="last"]/a/@href'
        ).extract_first()

        print("详情页的页码信息", isfirstpage, lastpage)
        if isfirstpage and lastpage:
            isfirstpage = isfirstpage.strip()
            lastpage_url = lastpage.split('page=')[0]
            lastpage_pagenum = lastpage.split('page=')[1]
            if isfirstpage == "1":
                pageurl = response.url + "&page=%s"
                for pagenum in range(2, int(lastpage_pagenum) + 1):
                    nextpage_url = self.start_urls[
                        0] + lastpage_url + "&page=%s" % pagenum
                    # self.selflog.logger.info("继续请求下一页信息:%s" % nextpage_url)
                    request_nextpage = response.follow(
                        url=nextpage_url,
                        callback=self.parse_select_cartype,
                    )  #headers=self.get_headers())
                    # redis key for storing urls
                    request_nextpage.meta['url_redis'] = url_redis
                    # redis key for the usable cookies
                    request_nextpage.meta['cookies_redis'] = cookies_nnqc
                    # redis key for the unusable cookies
                    request_nextpage.meta[
                        'useless_cookies'] = unuseless_cookies_nnqc
                    request_nextpage.meta['handle_httpstatus_list'] = [302]
                    request_nextpage.meta['table_lable'] = response.meta[
                        'table_lable']
                    request_nextpage.meta['brand'] = response.meta['brand']
                    request_nextpage.meta['car_type'] = response.meta[
                        'car_type']
                    yield request_nextpage

        type_label = response.meta['table_lable']
        # Car type
        base_car_style = response.xpath('//div[@class="item"]')
        requesturl = response.url

        for div in base_car_style:

            car_style = div.xpath(
                './div[@class="car-title"]/a/text()').extract_first()
            # Strip whitespace from the style, then separate its parts with a single space
            car_style = deal_style(car_style)
            # Configuration; may be empty
            # config = li.xpath('./p[@class="c2"]/text()').extract_first()
            config = None

            # Displacement
            displacement = deal_displacement(car_style, self)
            # Model year
            year = deal_year(car_style, self)

            # Import type
            version = div.xpath(
                './div[@class="car-subtitle clearfix"]/span/text()'
            ).extract_first()
            # Parallel-import handling
            if "平行进口" in type_label:
                if USA in version or "美规" in version:
                    version_num = 2
                elif Canada in version:
                    version_num = 3
                elif Mexico in version or "墨西哥版" in version:
                    version_num = 4
                elif European in version:
                    version_num = 5
                else:
                    version_num = "平行None"
            # Import handling
            elif "进口" in type_label:
                version_num = 1
            # Chinese-spec handling
            else:
                if China in version or "国产" in version:
                    version_num = 0
                elif ChinaImport in version:
                    version_num = 1
                else:
                    self.selflog.logger.info(
                        "车型{car_style}的进口版本不在规定内".format(car_style=car_style))
                    version_num = None

            # Guide price, e.g. "指导价:25.27万下20点"
            guide_price = div.xpath(
                './div/div[@class="car-guide-price"]/text()').extract_first()
            guide_price = deal_guideprice(guide_price, car_style, self)
            price = div.xpath(
                './div/div[@class="car-price"]/text()').extract_first()
            # Sales volume, e.g. "车源成交量:3单"
            volume = div.xpath(
                './div[@class="user-info clearfix"]/span[3]/text()'
            ).extract_first()
            if volume:
                search_volume = re.search("\d+", volume)
                if search_volume:
                    volume = search_volume.group()

            # Update time and detail page url
            detail_url = div.xpath(
                './div[@class="car-title"]/a/@href').extract_first()
            if detail_url:
                detail_url = self.start_urls[0] + detail_url
            else:
                detail_url = ""

            updatetime = div.xpath(
                './div[@class="user-info clearfix"]/div[@class="car-publish-time"]/text()'
            ).extract_first()
            if updatetime:
                # Normalize to a uniform format, e.g. 05-24
                if "/" in updatetime:
                    updatetime_change = "-".join(updatetime.split('/'))
                    updatetime = str(datetime.datetime.now().year
                                     ) + "-" + updatetime_change + " 00:00"
                elif ":" in updatetime:
                    updatetime = datetime.datetime.now().strftime(
                        '%Y-%m-%d') + " " + updatetime
            else:
                updatetime = ""

            carstyleitem = CarStyleItem()
            carstyleitem['brand'] = response.meta['brand']
            carstyleitem["type"] = response.meta['car_type']
            carstyleitem['year'] = year
            carstyleitem['style'] = car_style
            carstyleitem["requesturl"] = requesturl
            carstyleitem['configuration'] = str(config)
            carstyleitem['displacement'] = displacement
            carstyleitem['version'] = str(version_num)
            carstyleitem['guide_price'] = guide_price
            carstyleitem['price'] = price
            carstyleitem['volume'] = volume
            carstyleitem['status'] = "None"
            carstyleitem['platform'] = 2
            carstyleitem['rediskey'] = url_redis
            carstyleitem['detail_url'] = detail_url
            carstyleitem['updatetime'] = updatetime
            yield carstyleitem

        set_url_r.sadd(url_redis_nnqc, response.url)