Example #1
    def start_requests(self):
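        # Shards modellist across self.parts workers and builds che300
        # valuation URLs for every (city, model, regDate, mileage) combo;
        # dfcheck presumably returns True for URLs already present in
        # self.df, so previously crawled requests are skipped.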
        #this month
        thismonth = datetime.date.today().month
        #modellist
        with open('blm/' + self.dbname + '/modellist.csv', 'rb') as csvfile:
            reader = csv.DictReader(csvfile)
            modellist = [row for row in reader]
        #citylist
        with open('blm/' + self.dbname + '/citylist.csv', 'rb') as csvfile:
            reader = csv.DictReader(csvfile)
            citylist = [row for row in reader]
        step = len(modellist) / self.parts + 1
        starti = self.part * step
        if self.part == self.parts - 1:
            step = len(modellist) - starti
        #urllist
        for city in citylist[2:3]:
            for model in modellist[starti:(starti + step)]:
                for year in range(int(model['min_reg_year']),
                                  int(model['max_reg_year']) + 1):
                    for month in [1, 7, 12]:
                        if year == 2018:
                            month = min(thismonth, month)
                        date = str(year) + '-' + str(month)
                        usedyear = float(
                            (datetime.date(2017, 4, 15) -
                             datetime.date(year, month, 1)).days) / 365
                        mile = round(usedyear * 5 / 3, 2)
                        if mile >= 45 or mile <= 0.1:
                            mileagelist = [0.1, 45]
                        else:
                            mileagelist = [0.1, mile, 45]
                        for mile in mileagelist:
                            url ="https://dingjia.che300.com/app/EvalResult/getPreSaleRate?callback=jQuery18303745581165454668_1491989508797" \
                                 "&prov=" + str(city['provid']) +"&city=" + str(city['cityid']) + \
                                 "&brand=" + str(model['brandid']) + "&series=" + str(model['familyid']) + \
                                 "&model=" + str(model['salesdescid']) + "&regDate=" + date + "&mile=" + str(mile)
                            url1='https://dingjia.che300.com/app/EvalResult/allProvPrices?callback=jQuery18307234983962413968_1492479620941' + \
                                 "&brand=" + str(model['brandid']) + "&series=" + str(model['familyid']) + \
                                 "&model=" + str(model['salesdescid']) + "&regDate=" + date + "&mile=" + str(mile)
                            if not (dfcheck(self.df, url, self.tag)):
                                meta = dict()
                                meta['provid'] = city['provid']
                                meta['cityid'] = city['cityid']
                                meta['salesdescid'] = model['salesdescid']
                                meta['regDate'] = date
                                meta['mile'] = str(mile)
                                yield scrapy.Request(url=url,
                                                     meta={"datainfo": meta},
                                                     callback=self.parse)

                            if not (dfcheck(self.df, url1, self.tag)):
                                meta = dict()
                                meta['salesdescid'] = model['salesdescid']
                                meta['regDate'] = date
                                meta['mile'] = str(mile)
                                yield scrapy.Request(
                                    url=url1,
                                    meta={"datainfo": meta},
                                    callback=self.parse_allprov)
Example #2
 def start_requests(self):
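     # Same sharded pattern as above, but against the getResidualAnalysis
     # endpoint with a fixed 0.2 mileage.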
     #this month
     thismonth = datetime.date.today().month
     #modellist
     with open('blm/' + self.dbname + '/modellist.csv', 'rb') as csvfile:
         reader = csv.DictReader(csvfile)
         modellist = [row for row in reader]
     #citylist
     with open('blm/' + self.dbname + '/citylist.csv', 'rb') as csvfile:
         reader = csv.DictReader(csvfile)
         citylist = [row for row in reader]
     step = len(modellist) / self.parts + 1
     starti = self.part * step
     if self.part == self.parts - 1:
         step = len(modellist) - starti
     #urllist
     for city in citylist[2:3]:
         for model in modellist[starti:(starti + step)]:
             for year in range(int(model['min_reg_year']),
                               int(model['max_reg_year']) + 1):
                 if year == 2018:
                     monthlist = range(1, datetime.datetime.now().month + 1)
                 else:
                     monthlist = range(1, 13)
                 for month in monthlist:
                     date = str(year) + '-' + str(month)
                     mile = 0.2
                     url = "https://dingjia.che300.com/app/EvalResult/getResidualAnalysis?prov=" \
                           + str(city['provid']) + "&city=" + str(city['cityid']) + "&series=" + str(model['familyid']) + "&model=" + \
                           str(model['salesdescid']) + "&regDate=" + date + "&mile=" + str(mile)
                     if not (dfcheck(self.df, url, self.tag)):
                         meta = dict()
                         meta['provid'] = city['provid']
                         meta['cityid'] = city['cityid']
                         meta['salesdescid'] = model['salesdescid']
                         meta['regDate'] = date
                         meta['mile'] = str(mile)
                         yield scrapy.Request(url=url, meta={"datainfo": meta}, callback=self.parse)
Example #3
 def parse(self, response):
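     # iautos.cn list page: item hrefs are protocol-relative ("//..."),
     # so the scheme is prepended manually rather than via urljoin.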
     # car_item
     x = response.xpath(
         '//ul[@class="car-pic-form-box car-box-list clear"]/li')
     for temp in x:
         urlbase = temp.xpath("a/@href").extract_first()
         # hrefs are protocol-relative ("//..."), so prepend the scheme
         urltemp = str(urlbase.strip())
         url = "http://" + urltemp[2:]
         datasave1 = temp.extract()
         if not (dfcheck(self.df, url, self.tag)):
             yield scrapy.Request(url,
                                  meta={"datasave1": datasave1},
                                  callback=self.parse_car)
     # next page
     next_page = response.xpath(
         u'//div[@class="pages-box"]/a[contains(text(),"下一页")]/@href')
     if next_page:
         urlbase = str(next_page.extract_first())
         urlbase2 = "http://so.iautos.cn"
         url = urlbase2 + urlbase
         yield scrapy.Request(url, self.parse)
Example #4
 def select4_parse(self, response):
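     # Threshold pattern: up to 6000 results are crawled page by page;
     # larger result sets are first split by transmission (select5_parse).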
     counts = response.xpath(
         '//h1[@class="historyRecord_title"]/text()').re('\d+')
     if counts:
         counts = int(counts[0])
         if counts <= 6000:
             for href in response.xpath('//div[@class="carsItem carItem"]'):
                 urlbase = href.xpath('a/@href').extract_first()
                 datasave1 = href.extract()
                 url = response.urljoin(urlbase)
                 if not (dfcheck(self.df, url, self.tag)):
                     yield scrapy.Request(url,
                                          meta={'datasave1': datasave1},
                                          callback=self.parse_car)
             # next page
             next_page = response.xpath(
                 '//a[@class="next"]/@href').extract_first()
             if next_page:
                 url = response.urljoin(next_page)
                 yield scrapy.Request(url, self.select4_parse)
         else:
             for href in response.xpath(
                     u'//dd[@click_type="sale-transmissions"]/div/ul/li/a[not(contains(text(),"不限"))]/@href'
             ):
                 url = response.urljoin(href.extract())
                 yield scrapy.Request(url, self.select5_parse)
Example #5
 def select2_parse(self, response):
     # result count
     counts = response.xpath(
         '//h1[@class="historyRecord_title"]/text()').re('\d+')
     print(counts)
     if counts:
         counts = int(counts[0])
         if counts <= 6000:
             for href in response.xpath('//div[@class="carsItem carItem"]'):
                 urlbase = href.xpath('a/@href').extract_first()
                 datasave1 = href.extract()
                 url = response.urljoin(urlbase)
                 if not (dfcheck(self.df, url, self.tag)):
                     yield scrapy.Request(url,
                                          meta={'datasave1': datasave1},
                                          callback=self.parse_car)
             # next page
             next_page = response.xpath(
                 '//a[@class="next"]/@href').extract_first()
             if next_page:
                 url = response.urljoin(next_page)
                 yield scrapy.Request(url, self.select2_parse)
         else:
             for href in response.xpath(
                     '//dl[@class="clearfix"]/dd/a/@href'):
                 url = response.urljoin(href.extract())
                 yield scrapy.Request(url, self.select4_parse)
Example #6
 def select4_parse(self, response):
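     # The count text carries a trailing non-numeric character (likely a
     # unit such as 辆), hence the [0:-1] slice before float().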
     counts = response.xpath(
         '//strong[@class="fc-org"]/text()').extract_first()
     if counts:
         counts = float(counts[0:-1])
         if counts <= 3000:
             for href in response.xpath(
                     '//dl[contains(@class,"list-pic clearfix cursor_pointer")]'
             ):
                 urlbase = href.xpath(
                     'dt/div[2]/div/a/@href').extract_first()
                 datasave1 = href.extract()
                 url = response.urljoin(urlbase)
                 if not (dfcheck(self.df, url, self.tag)):
                     yield scrapy.Request(url,
                                          meta={'datasave1': datasave1},
                                          callback=self.parse_car)
             # next page
             next_page = response.xpath('//a[@class="next"]/@href')
             if next_page:
                 url = response.urljoin(next_page.extract_first())
                 yield scrapy.Request(url, self.select4_parse)
         else:
             for href in response.xpath(
                     '//dd[@class="ddmyprice"]/a[not(@class="cur")]/@href'):
                 url = response.urljoin(href.extract())
                 yield scrapy.Request(url, self.select5_parse)
Example #7
 def select3_parse(self, response):
     counts = response.xpath('//a[@name="view_v"]/h4/text()').re('\d+')
     counts = counts[0] if counts else "0"
     if counts:
         counts = float(counts)
         if counts <= 4000:
             temp = response.xpath('//li[@class="con caritem conHeight"]')
             for x in temp:
                 # note the relative './/' prefix: a bare '//a[...]' would
                 # search the whole document and always return the first link
                 urlbase = x.xpath('.//a[@class="aimg"]/@href').extract_first()
                 url = response.urljoin(urlbase)
                 datasave1 = x.extract()
                 if not (dfcheck(self.df, url, self.tag)):
                     yield scrapy.Request(url, meta={"datasave1": datasave1}, callback=self.parse_car)
             next_page = response.xpath(u'//a[contains(text(),"下一页")]')
             if next_page:
                 urlbase = next_page.xpath('@href').extract_first()
                 url = response.urljoin(urlbase)
                 yield scrapy.Request(url, self.select3_parse)
             else:
                 for href in response.xpath('//div[@class="select-menu"]/div[@class="menu menu6"]/dd/a'):
                     urlbase = href.xpath('@href').extract_first()
                     url = response.urljoin(urlbase)
                     yield scrapy.Request(url, self.select4_parse)
Example #8
 def parse(self, response):
     # alternatively: for i in range(self.start, self.start + self.num)
     for i in range(150000, 250000):
         url = "http://www.akd.cn/car/" + str(i) + "/"
         if not (dfcheck(self.df, str(i), self.tag)):
             yield scrapy.Request(url, callback=self.parse_car)
Example #9
 def select3_parse(self, response):
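     # If the result count exceeds 2400, refine the query through the next
     # filter facet (select4_parse) instead of paging the list.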
     print("select3")
     counts = response.xpath('//div[@class="totalCarsNum"]/span/text()').re('\d+')[0]
     listok = True
     if counts:
         counts = float(counts)
         if counts > 2400:
             listok = False
     if listok:
         x = response.xpath('//div[@class="carShow"]/div')
         for temp in x:
             urlbase = temp.xpath('a[@class="car-link"]/@href').extract_first()
             url = response.urljoin(urlbase)
             datasave1 = temp.extract()
             if not (dfcheck(self.df, url, self.tag)):
                 yield scrapy.Request(url, meta={"datasave1": datasave1}, callback=self.parse_car)
         next_page = response.xpath('//span[contains(text(),">")]/../@href').extract_first()
         if next_page:
             url = response.urljoin(next_page)
             yield scrapy.Request(url, self.select3_parse)
     else:
         x = response.xpath('//ul[@class="sc-option-list"]/li')
         for temp in x:
             urlbase = temp.xpath('a/@href').extract_first()
             url = response.urljoin(urlbase)
             yield scrapy.Request(url, self.select4_parse)
Example #10
 def parse(self, response):
     counts = response.xpath('//div[@class="totalCarsNum"]/span/text()').re('\d+')
     listok = True
     if counts:
         counts = float(counts[0])
         if counts > 2400:
             listok = False
     if listok:
         x = response.xpath('//div[@class="carShow"]/div')
         for temp in x:
             urlbase = temp.xpath('a[@class="car-link"]/@href').extract_first()
             url = response.urljoin(urlbase)
             datasave1 = temp.extract()
             if not (dfcheck(self.df, url, self.tag)):
                 yield scrapy.Request(url, meta={"datasave1": datasave1}, callback=self.parse_car)
         next_page = response.xpath('//span[contains(text(),">")]/../@href').extract_first()
         if next_page:
             url = response.urljoin(next_page)
             yield scrapy.Request(url, self.parse)
     else:
         price = response.xpath('//div[@class="price-sc clearfix lab_block"]/a')
         for temp in price[1:9]:
             urlbase = temp.xpath('@href').extract_first()
             url = response.urljoin(urlbase)
             yield scrapy.Request(url, self.select2_parse)
Example #11
 def select2_parse(self, response):
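     # Pages through listings unless the count exceeds 3500, in which case
     # the query is split further by vehicle type (select3_parse).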
     # logging.log(msg="do this step2",level=logging.INFO)
     # print(response.body)
     counts = response.xpath(
         '//div[@class="info_funcs_right"]/span/i/text()')
     listok = True
     if counts:
         counts = float(counts.extract_first())
         if counts > 3500:
             listok = False
     if listok:
         for href in response.xpath(
                 '//ul[@class="car_list ac_container"]/li/div[@class="col col2"]'
         ):
             url = str(href.xpath('a/@href').extract_first())
             datasave1 = href.extract()
             if not (dfcheck(self.df, url, self.tag)):
                 yield scrapy.Request(url,
                                      meta={"datasave1": datasave1},
                                      callback=self.parse_car)
         # next page
         next_page = response.xpath(
             '//div[@class="pager"]/a[@class="next"]/@href')
         if next_page:
             url = response.urljoin(next_page.extract_first())
             yield scrapy.Request(url, self.select2_parse)
     else:
         for href in response.xpath(
                 u'//dt[contains(text(),"类型:")]/../dd/a/@href')[1:14]:
             url = str(href.extract())
             yield scrapy.Request(url, self.select3_parse)
Example #12
 def list_parse(self, response):
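     # List crawl with a manual pager fallback: if the next-page link is
     # missing, read the current page number from the pagination bar and
     # build ".../p{n}" by hand; on failure, re-request the same URL
     # (dont_filter=True) and try again.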
     print(response)
     for href in response.xpath('//li[@class="span6 list-item car-item"]'):
         datasave1 = href.extract()
         urlbase = href.xpath('a/@href').extract_first()
         url = response.urljoin(urlbase)
         if not (dfcheck(self.df, url, self.tag)):
             yield scrapy.Request(url,
                                  cookies=self.cookies_for_request,
                                  headers=self.headers,
                                  meta={'datasave1': datasave1},
                                  callback=self.parse_car)
     next_page = response.xpath(
         '//a[@rrc-event-name="switchright"]/@href').extract_first()
     if not (next_page):
         time.sleep(0.5)
         try:
             page = int(
                 response.xpath(
                     '//li[@class="active"]/a[@href="javascript:void(0);"]/text()'
                 ).extract_first()) + 1
         except Exception as e:
             print(e)
             yield scrapy.Request(url=response.url, dont_filter=True)
             return
         location = response.url.find("ershouche") + len("ershouche")
         newpage = response.url[0:location] + "/p" + str(page)
         print(newpage)
         url = response.urljoin(newpage)
     else:
         url = response.urljoin(next_page)
     yield scrapy.Request(url,
                          self.list_parse,
                          cookies=self.cookies_for_request,
                          headers=self.headers)
Example #13
 def start_requests(self):
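     # Builds the full request list up front and returns it; with no
     # explicit callback, Scrapy routes responses to self.parse.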
     cars = []
     for i in range(40000, self.carnum):
         urlbase = 'http://www.kx.cn/chejia/' + str(i)
         if not (dfcheck(self.df, urlbase, self.tag)):
             car = scrapy.Request(urlbase, meta={'datasave1': 'zero'})
             cars.append(car)
     return cars
Example #14
    def start_requests(self):
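        # One-off backfill: pins a single model (index 29682) and sweeps
        # every month of 1999-2018 in half-unit mileage steps; update_code
        # is presumably a class attribute versioning the dedupe key.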
        #this month
        thismonth = datetime.date.today().month
        #modellist
        with open('blm/' + self.dbname + '/modellist.csv', 'rb') as csvfile:
            reader = csv.DictReader(csvfile)
            modellist = [row for row in reader]
        #citylist
        with open('blm/' + self.dbname + '/citylist.csv', 'rb') as csvfile:
            reader = csv.DictReader(csvfile)
            citylist = [row for row in reader]
        step = len(modellist) / self.parts + 1
        starti = self.part * step
        if self.part == self.parts - 1:
            step = len(modellist) - starti
        #urllist
        # for model in modellist:
        #     if model["salesdescid"] == "1127558":
        #         index = modellist.index(model)

        for city in citylist[2:3]:
            for model in modellist[29682:29683]:
                # if model["salesdescid"] == "1127558":
                for year in range(1999, 2019):
                    monthlist = range(1, 13)
                    for month in monthlist:
                        date = str(year) + '-' + str(month)
                        for mile in range(1, 121) + [0.2]:
                            if mile % 2 == 0:
                                mile = mile / 2
                            else:
                                mile = float(mile) / 2.0
                            # url ="https://dingjia.che300.com/app/EvalResult/getPreSaleRate?callback=jQuery183006719160584858153_1534397702297" \
                            #      "&prov=" + str(city['provid']) +"&city=" + str(city['cityid']) + \
                            #      "&brand=" + str(model['brandid']) + "&series=" + str(model['familyid']) + \
                            #      "&model=" + str(model['salesdescid']) + "&regDate=" + date + "&mile=" + str(mile)
                            url1='https://dingjia.che300.com/app/EvalResult/allProvPrices?callback=jQuery183006719160584858153_1534397702298' + \
                                 "&brand=" + str(model['brandid']) + "&series=" + str(model['familyid']) + \
                                 "&model=" + str(model['salesdescid']) + "&regDate=" + date + "&mile=" + str(mile)
                            # if not (dfcheck(self.df, url + "-" + update_code, self.tag)):
                            #     meta =dict()
                            #     meta['provid']= city['provid']
                            #     meta['cityid']= city['cityid']
                            #     meta['salesdescid']= model['salesdescid']
                            #     meta['regDate']= date
                            #     meta['mile']= str(mile)
                            #     yield scrapy.Request(url=url, meta={"datainfo":meta},callback=self.parse)

                            if not (dfcheck(self.df, url1 + "-" + update_code,
                                            self.tag)):
                                meta = dict()
                                meta['salesdescid'] = model['salesdescid']
                                meta['regDate'] = date
                                meta['mile'] = str(mile)
                                yield scrapy.Request(
                                    url=url1,
                                    meta={"datainfo": meta},
                                    callback=self.parse_allprov)
Example #15
 def parse(self, response):
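     # Enumerates chemao.com detail pages by numeric ID, from self.counts
     # to self.counts + self.size, deduping on the ID via dfcheck.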
     # self.start = 0 if self.df == 'none' else len(self.df)-1
     # self.counts = self.start
     #global carnum
     # self.size = min(self.carnum-self.counts, self.size)
     for i in range(self.counts, self.counts + self.size + 1):
         url = "http://www.chemao.com/show" + str(i) + ".html"
         if not (dfcheck(self.df, str(i), self.tag)):
             yield scrapy.Request(url, callback=self.parse_car)
Example #16
 def parse(self, response):
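     # Normal list crawl with pagination; when no next-page link is found,
     # fall back to enumerating detail-page IDs directly.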
     for href in response.xpath('//ul[@class="carList"]/li'):
         urlbase = href.xpath("a/@href").extract_first()
         url = response.urljoin(urlbase)
         datasave1 = href.extract()
         if not (dfcheck(self.df, url, self.tag)):
             yield scrapy.Request(url, meta={"datasave1": datasave1}, callback=self.parse_car)
     next_page = response.xpath(u'//a[contains(text(),"下一页")]/@href')
     if next_page:
         url_next = response.urljoin(next_page.extract_first())
         yield scrapy.Request(url_next, self.parse)
     else:
         # as list
         for carid in range(1, 290000):
             urlbase = 'http://www.carking001.com/ershouche/detail/' + str(carid) + '.html'
             url = response.urljoin(urlbase)
             if not (dfcheck(self.df, url, self.tag)):
                 yield scrapy.Request(url, meta={"datasave1": "zero"}, callback=self.parse_car)
Example #17
 def start_requests(self):
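     # Builds che168 UsedCarAssess API requests for one city, sampling a
     # few registration months per year; each row of che168valuelist.csv
     # presumably selects one valuation type (trade-in, dealer, private).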
     # this month
     thismonth = datetime.date.today().month
     # modellist
     with open('blm/' + self.dbname + '/che168modellist.csv', 'rb') as csvfile:
         reader = csv.DictReader(csvfile)
         modellist = [row for row in reader]
     # citylist
     with open('blm/' + self.dbname + '/che168citylist.csv', 'rb') as csvfile:
         reader = csv.DictReader(csvfile)
         citylist = [row for row in reader]
     # valuelist
     with open('blm/' + self.dbname + '/che168valuelist.csv', 'rb') as csvfile:
         reader = csv.DictReader(csvfile)
         valuelist = [row for row in reader]
     step = len(modellist) / self.parts + 1
     starti = self.part * step
     if self.part == self.parts - 1:
         step = len(modellist) - starti
     # urllist
     for city in citylist[8:9]:
         for model in modellist[starti:(starti + step)]:
             for year in range(int(model['min_reg_year']), int(model['max_reg_year']) + 1) + [1995, 2017]:
                 if year == 2017 and year != int(model['max_reg_year']):
                     monthlist = [12]
                 elif year == 2017 and year == int(model['max_reg_year']):
                     monthlist = [1, thismonth, 12]
                 elif year == 1995:
                     monthlist = [1]
                 else:
                     monthlist = [1, 5, 6, 12]
                 for month in monthlist:
                     date = str(year) + '/' + str(month) + '/01'
                     mile = 1
                     # mileagelist = [1]
                     # for mile in mileagelist:
                     #     # 4S dealer trade-in: GetPinGuData('2.03v', 'PingGuCallBack2', 'uahm10033');
                     #     # sell to an individual: GetPinGuData('2.09v', 'PingGuCallBack3', 'uahm10034');
                     #     # sell to a dealer: GetPinGuData('2.04v', 'PingGuCallBack1', 'uahm10035');
                     #     # certified car: GetPinGuData('2.07v', 'PingGuCallBack1', 'uahm10036');
                     #     # dealer listing: GetPinGuData('2.08v', 'PingGuCallBack2', 'uahm10037');
                     #     # private listing: GetPinGuData('2.09v', 'PingGuCallBack3', 'uahm10038');
                     for value in valuelist:
                         url = 'https://cacheapi.che168.com/Assess/UsedCarAssess.ashx?_appid=m.m&_sign=&_encoding=gb2312&pid=' \
                               + str(city['provid']) + "&cid=" + str(city['cityid']) + "&mileage=" + str(mile) + \
                               "&firstregtime=" + date + "&specid=" + str(model['autohomeid']) + "&_appversion=" + \
                               value['_appversion'] + "&mark=" + value['mark'] + "&_callback=" + value['_callback']
                         if not (dfcheck(self.df, url, self.tag)):
                             meta = dict()
                             meta['provid'] = city['provid']
                             meta['cityid'] = city['cityid']
                             meta['autohomeid'] = model['autohomeid']
                             meta['regDate'] = date
                             meta['milage'] = str(mile)
                             meta['type'] = value['type']
                             yield scrapy.Request(url=url, meta={"datainfo": meta}, callback=self.parse)
Example #18
 def start_requests(self):
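     # One configuration-detail request per model in modellist.csv, keyed
     # by salesdescid.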
     with open('blm/'+self.dbname+'/modellist.csv', 'rb') as csvfile:
         reader = csv.DictReader(csvfile)
         modellist = [row for row in reader]
     for model in modellist:
         i = model['salesdescid']
         url = 'https://dingjia.che300.com/app/CarDetail/getModelConfigure/' + str(i)
         if not (dfcheck(self.df, url, self.tag)):
             yield scrapy.Request(url=url, meta={"datainfo": {"salesdescid":i}}, callback=self.parse)
Example #19
 def parse(self, response):
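     # Sanity check: only parse pages whose serialized HTML exceeds 20,000
     # characters; shorter responses yield nothing.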
     datacheck = len(response.xpath("//html").extract_first())
     if datacheck > 20000:
         # as list
         for href in response.xpath('//li[@class="item"]'):
             urlbase = href.xpath("a/@href").extract_first()
             datasave1 = href.extract()
             url = response.urljoin(urlbase)
             if not (dfcheck(self.df, url, self.tag)):
                 yield scrapy.Request(url,
                                      meta={'datasave1': datasave1},
                                      callback=self.parse_car)
Example #20
 def parse(self, response):
     # car_item
     for href in response.xpath('//ul[@class="carlist-content"]/li'):
         datasave1 = href.extract()
         urlbase = href.xpath('a/@href').extract_first()
         url = response.urljoin(urlbase)
         if not (dfcheck(self.df, url, self.tag)):
             yield scrapy.Request(url, meta={"datasave1": datasave1}, callback=self.parse_car)
     # next page
     next_page = response.xpath(u'//a[contains(text(),"下一页")]/@href').extract_first()
     if next_page:
         url = response.urljoin(next_page)
         yield scrapy.Request(url, self.parse)
Example #21
 def parse(self, response):
     # print(response.body.decode('gbk'))
     for href in response.xpath('//div[@class="search_car_lb"]/dl'):
         urlbase = href.xpath('dt/a/@href').extract_first()
         datasave1 = href.extract()
         url = response.urljoin(urlbase)
         if not (dfcheck(self.df, url, self.tag)):
             yield scrapy.Request(url, meta={"datasave1": datasave1}, callback=self.parse_car)
     next_page = response.xpath(u'//a[contains(text()," >> ")]')
     if next_page:
         nexturl = next_page.xpath('@href').extract_first()
         url = response.urljoin(nexturl)
         # don't re-send the last item's datasave1 with the pager request;
         # it would be unbound if the page had no listings
         yield scrapy.Request(url, callback=self.parse)
Example #22
 def parse_list(self, response):
     # car_item
     for href in response.xpath('//div[@class="Datu_cars"]/div'):
         urlbase = href.xpath('div/a/@href').extract_first()
         datasave1 = href.extract()
         url = response.urljoin(urlbase)
         if not (dfcheck(self.df, url, self.tag)):
             yield scrapy.Request(url, meta={"datasave1": datasave1}, callback=self.parse_car)
     # next page (outside the item loop, so it is yielded once per page)
     next_page = response.xpath('//a[@class="num"]/@href').extract_first()
     if next_page:
         url = response.urljoin(next_page)
         yield scrapy.Request(url, self.parse_list)
Example #23
 def select5_parse(self, response):
     print("select5")
     x = response.xpath('//div[@class="carShow"]/div')
     for temp in x:
         urlbase = temp.xpath('a[@class="car-link"]/@href').extract_first()
         url = response.urljoin(urlbase)
         datasave1 = temp.extract()
         if not (dfcheck(self.df, url, self.tag)):
             yield scrapy.Request(url, meta={"datasave1": datasave1}, callback=self.parse_car)
     next_page = response.xpath('//span[contains(text(),">")]/../@href').extract_first()
     if next_page:
         url = response.urljoin(next_page)
         yield scrapy.Request(url, self.select5_parse)
Example #24
 def select4_parse(self, response):
     x = response.xpath('//ul[@class="list-ad-items"]/li')
     for temp in x:
         datasave1 = temp.extract()
         url = str(temp.xpath('a/@href').extract_first())
         if not (dfcheck(self.df, url, self.tag)):
             yield scrapy.Request(url,
                                  meta={"datasave1": datasave1},
                                  callback=self.parse_car)
     next_page = response.xpath(u'//a[contains(text(),"下一页")]/@href')
     if next_page:
         url = response.urljoin(next_page.extract_first())
         yield scrapy.Request(url, self.select4_parse)
Example #25
    def parse_list(self, response):
        for href in response.xpath('//div[@class="car_filter_list"]/ul/li'):
            urlbase = href.xpath('@data-jslink').extract_first()
            datasave1 = href.extract()
            url = response.urljoin(urlbase)

            if not (dfcheck(self.df, url, self.tag)):
                yield scrapy.Request(url, meta={"datasave1": datasave1}, callback=self.parse_car)
        # next page
        next_page = response.xpath('//a[@data-eqselog="/list@etype=click@page=next"]/@href')
        if next_page:
            url = response.urljoin(next_page.extract_first())
            yield scrapy.Request(url, self.parse_list)
Example #26
 def select8_parse(self, response):
     for href in response.xpath('//div[@class="carsItem carItem"]'):
         urlbase = href.xpath('a/@href').extract_first()
         datasave1 = href.extract()
         url = response.urljoin(urlbase)
         if not (dfcheck(self.df, url, self.tag)):
             yield scrapy.Request(url,
                                  meta={'datasave1': datasave1},
                                  callback=self.parse_car)
     # next page
     next_page = response.xpath('//a[@class="next"]/@href').extract_first()
     if next_page:
         url = response.urljoin(next_page)
         yield scrapy.Request(url, self.select8_parse)
Example #27
 def parse_list(self, response):
     # car_item
     for href in response.xpath('//div[contains(@class,"cheyuan")]/ul/li'):
         urlbase = href.xpath("span/a/@href").extract_first()
         datasave1 = href.extract()
         url = response.urljoin(urlbase)
         if not (dfcheck(self.df, url, self.tag)):
             yield scrapy.Request(url,
                                  meta={"datasave1": datasave1},
                                  callback=self.parse_car)
     # next page (outside the item loop, so it is yielded once per page)
     next_page = response.xpath(u'//a[contains(text(),"下一页")]/@href')
     if next_page:
         url = response.urljoin(next_page.extract_first())
         yield scrapy.Request(url, self.parse_list)
Example #28
 def select9_parse(self, response):
     for href in response.xpath(
             '//dl[contains(@class,"list-pic clearfix cursor_pointer")]'):
         urlbase = href.xpath('dt/div[2]/div/a/@href').extract_first()
         datasave1 = href.extract()
         url = response.urljoin(urlbase)
         if not (dfcheck(self.df, url, self.tag)):
             yield scrapy.Request(url,
                                  meta={'datasave1': datasave1},
                                  callback=self.parse_car)
     # next page
     next_page = response.xpath('//a[@class="next"]/@href')
     if next_page:
         url = response.urljoin(next_page.extract_first())
         yield scrapy.Request(url, self.select9_parse)
Example #29
    def parse_list(self, response):
        # if len(response.xpath('//html').extract_first())>=70000:
        for href in response.xpath(
                '//div[contains(@class,"list-items pt15 list-duibi")]'):
            urlbase = href.xpath('div[1]/span/a/@href').extract_first()
            datasave1 = href.extract()
            url = response.urljoin(urlbase)
            if not (dfcheck(self.df, url, self.tag)):
                yield scrapy.Request(url,
                                     meta={"datasave1": datasave1},
                                     callback=self.parse_car)

        nextpage = response.xpath('//a[@class="arrow-rh fl"]/@href')
        if nextpage:
            url = response.urljoin(nextpage.extract_first())
            yield scrapy.Request(url, callback=self.parse_list)
Example #30
 def parse(self, response):
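     # dont_filter=True on the pager request bypasses Scrapy's built-in
     # duplicate filter for revisited page URLs.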
     # car_item
     for href in response.xpath('//div[@id="container_base"]/div'):
         url = href.xpath('./div[@class="item_main clearfix"]/h2/a/@href'
                          ).extract_first()
         datasave1 = href.extract()
         if not (dfcheck(self.df, url, self.tag)):
             yield scrapy.Request(url,
                                  meta={"datasave1": datasave1},
                                  callback=self.parse_car)
     # next page
     next_page = response.xpath('//a[@class="pages-next"]/@href')
     if next_page:
         url = response.urljoin(next_page.extract_first())
         yield scrapy.Request(url, callback=self.parse, dont_filter=True)