def parse_list(self, response):
    """Parse one car-listing page and yield a Chehang168Item per car entry.

    Expects ``response.meta`` to carry ``brandname``/``brandcode``/
    ``familyname``/``familycode`` set by the upstream callback.
    Yields nothing when the page has no ``ch_carlistv3`` entries.
    """
    cars = response.xpath("//*[@class='ch_carlistv3']/li")
    if not cars:
        return
    # Progress log: record which family actually produced listings.
    # NOTE(review): hard-coded developer-machine path — consider a setting.
    with open("/Users/cagey/PycharmProjects/zt_scrapy/projects/koubei_project/koubei/familyname_log_full.txt", "a") as f:
        f.write(response.meta["familyname"] + "\n")
    for car in cars:
        item = Chehang168Item()
        item['url'] = response.url
        item['grabtime'] = time.strftime('%Y-%m-%d %X', time.localtime())
        item['brandname'] = response.meta["brandname"]
        item['brandcode'] = response.meta["brandcode"]
        item['familyname'] = response.meta["familyname"]
        item['familycode'] = response.meta["familycode"]
        item['title'] = car.xpath("div/h3/a/text()").extract_first()
        item['guideprice'] = car.xpath("div/h3/b/text()").extract_first()
        # extract_first() may return None; guard before .replace() to avoid
        # AttributeError on listings without a price node.
        price = car.xpath("div/span/b/text()").extract_first()
        item['price'] = price.replace("万", "") if price is not None else price
        item['store'] = car.xpath("p[@class='c3']/a/text()").extract_first()
        item['desc1'] = car.xpath("p[@class='c1']/text()[1]").extract_first()
        item['desc2'] = car.xpath("p[@class='c2']/text()").extract_first()
        item['time'] = car.xpath("p[@class='c3']/cite[1]/text()").extract_first()
        item['desc3_2'] = car.xpath("p[@class='c3']/cite[2]/text()").extract_first()
        item['desc3_3'] = car.xpath("p[@class='c3']/cite[3]/text()").extract_first()
        # Any component may be None when the node is missing; fall back to ""
        # so the join never raises TypeError (output unchanged when all present).
        item['status'] = "-".join(
            part or "" for part in (item["title"], item["desc1"], item["store"])
        )
        print(item)
        # Hand the item to the Scrapy pipeline — previously it was only
        # printed and never collected.
        yield item
def parse(self, response):
    """Walk every brand anchor on the index page and schedule its listing page.

    Each anchor yields one ``scrapy.Request`` toward ``detail_url`` carrying a
    Chehang168Item (``brandcode`` = href, ``brandname`` = link text) in meta.
    """
    anchors = response.xpath("//*/div/ul[@class='cyxx_wrap_ull pt_1']/li/a")
    for anchor in anchors:
        # Fresh item per anchor so every request owns its own copy.
        brand_item = Chehang168Item()
        brand_item["brandcode"] = anchor.xpath("./@href").get()
        brand_item["brandname"] = anchor.xpath("./text()").get()
        yield scrapy.Request(
            url="http://www.chehang168.com" + brand_item["brandcode"],
            callback=self.detail_url,
            meta={"item": brand_item},
            cookies=self.cookies,
            headers=self.headers,
            dont_filter=True,
        )