def parse_newhouse(self, response):
    province, city = response.meta.get("info")
    lis = response.xpath("//div[contains(@class, 'nl_con')]/ul/li")
    for li in lis:
        name = "".join(li.xpath(".//div[@class='nlcd_name']//text()").getall())
        name = re.sub(r'\s', '', name)
        house_type_list = "".join(li.xpath(".//div[contains(@class,'house_type')]//text()").getall())
        house_type_list = re.sub(r'\s', "", house_type_list)
        if "居" not in house_type_list:
            continue
        rooms = house_type_list.split("-")[0]
        area = house_type_list.split("-")[1]
        address = li.xpath(".//div[@class='address']/a/@title").get()
        district = "".join(li.xpath(".//div[@class='address']/a//text()").getall())
        district = re.search(r'\[(.+)\].*', district).group(1)
        sale = li.xpath(".//span[@class='inSale']/text()").get()
        origin_url = "https:" + li.xpath(".//div[@class='nlcd_name']/a/@href").get()
        price = "".join(li.xpath(".//div[@class='nhouse_price']//text()").getall()).strip()
        price = re.sub(r'\s|广告', "", price)
        item = NewHouseItem(name=name, rooms=rooms, area=area, address=address,
                            district=district, sale=sale, origin_url=origin_url,
                            price=price, province=province, city=city)
        yield item
    next_url = response.xpath(".//li[@class='fr']/a[@class='next']/@href").get()
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_newhouse,
                             meta={"info": (province, city)})
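# --- Shared context (a sketch, not the canonical project code) ---
# Every variant in this section assumes "import re", "import scrapy" and a
# NewHouseItem defined in the project's items.py. The field names below match
# the keyword arguments used by the most common variant; several variants add
# or rename fields (telephone, label, prise, provice, url, ...), so treat this
# definition as an assumption rather than any author's actual items.py.
import re

import scrapy


class NewHouseItem(scrapy.Item):
    province = scrapy.Field()
    city = scrapy.Field()
    name = scrapy.Field()
    price = scrapy.Field()
    rooms = scrapy.Field()
    area = scrapy.Field()
    address = scrapy.Field()
    district = scrapy.Field()
    sale = scrapy.Field()
    origin_url = scrapy.Field()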
def parse_newhouse(self, response):
    province, city = response.meta['info']
    ul = response.xpath('//div[@class="nl_con clearfix"]/ul/li')
    for li in ul:
        name1 = li.xpath('.//div[@class="nlcd_name"]/a/text()').get()
        if not name1:
            continue
        name = name1.strip()
        rooms_text = li.xpath('.//div[@class="house_type clearfix"]/a/text()').getall()
        rooms = ''.join(list(filter(lambda x: x.endswith('居'), rooms_text)))
        area1 = ''.join(li.xpath('.//div[@class="house_type clearfix"]/text()').getall())
        area = re.sub(r'\s|/|-', '', area1)
        district1 = ''.join(li.xpath('.//div[@class="address"]/a//text()').getall())
        district = re.search(r'.*\[(.+)\].*', district1).group(1)
        address = li.xpath('.//div[@class="address"]/a/@title').get().strip()
        sale = li.xpath('.//div[contains(@class,"fangyuan")]/span/text()').get()
        # sale = li.xpath('.//div[@class="fangyuan"]/span/text()').get()
        price1 = ''.join(li.xpath('.//div[@class="nhouse_price"]//text()').getall()).strip()
        price = re.sub(r'广告', '', price1)
        origin_url = li.xpath('.//div[@class="nlcd_name"]/a/@href').get()
        item = NewHouseItem(name=name, rooms=rooms, area=area, district=district,
                            address=address, sale=sale, price=price,
                            origin_url=origin_url, province=province, city=city)
        print(item)
        yield item
    next_page = response.xpath('//a[@class="next"]/@href').get()
    if next_page:
        next_url = response.urljoin(next_page)
        yield scrapy.Request(next_url, callback=self.parse_newhouse,
                             meta={'info': (province, city)})
def parse_newhouse(self, response):
    province, city = response.meta.get('info')
    # print(province + '==' + city)
    lis = response.xpath('//div[contains(@class,"nl_con")]/ul/li')
    for li in lis:
        name = li.xpath('.//div[@class="nlcd_name"]/a/text()').get()
        if not name:
            continue
        name = re.sub(r'\s', '', name)
        house_type_list = li.xpath('.//div[contains(@class,"house_type")]/a/text()').getall()
        house_type_list = list(map(lambda x: re.sub(r'\s', '', x), house_type_list))
        rooms = list(filter(lambda x: x.endswith('居'), house_type_list))
        area = ''.join(li.xpath('.//div[contains(@class, "house_type")]/text()').getall())
        area = re.sub(r'[\s/-]', '', area)
        district = li.xpath('.//div[@class="address"]/a//text()').getall()
        # district = list(map(lambda x: re.sub(r'[\s\[\]]', '', x), district))
        district = ''.join(district)
        district = re.search(r'\[(.+)\].*', district).group(1)
        address = li.xpath('.//div[@class="address"]/a/@title').get()
        sale = li.xpath('.//div[contains(@class, "fangyuan")]/span/text()').get()
        price = li.xpath('.//div[@class="nhouse_price"]//text()').getall()
        price = re.sub(r'\s|广告', '', ''.join(price))
        origin_url = li.xpath('.//div[@class="nlcd_name"]/a/@href').get()
        item = NewHouseItem(city=city, name=name, price=price, rooms=rooms,
                            area=area, address=address, district=district,
                            sale=sale, origin_url=origin_url)
        item['id'] = 1
        yield item
    next_url = response.xpath('//a[@class="next"]/@href').get()
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_newhouse,
                             meta={"info": (province, city)})
def parse_newhouse(self, response):
    province, city = response.meta.get('info')  # tuple unpacking
    lis = response.xpath('//div[@class="nl_con clearfix"]/ul/li')
    for li in lis:
        name = li.xpath('.//div[@class="nlcd_name"]/a/text()').get()
        if name is None:
            pass
        else:
            name = re.sub(r'\s', '', name)
        # contains() matches divs whose class attribute includes house_type
        house_type_list = li.xpath(
            './/div[contains(@class,"house_type")]/a/text()').getall()
        # map() strips the whitespace from every entry
        house_type_list = list(
            map(lambda x: re.sub(r'\s', '', x), house_type_list))
        # filter() keeps entries ending in '居'; without a match this becomes an empty list
        rooms = list(filter(lambda x: x.endswith('居'), house_type_list))
        # getall() returns a list; "".join() turns it into a string
        area = "".join(
            li.xpath('.//div[contains(@class,"house_type")]/text()').getall())
        area = re.sub(r'\s|-|/', '', area)
        address = li.xpath('.//div[@class="address"]/a/@title').get()
        district_text = "".join(
            li.xpath('.//div[@class="address"]/a//text()').getall())
        district_text = re.sub(r'\s', '', district_text)
        district = re.search(r"\[(.+)\]", district_text)
        if district is None:
            pass
        else:
            district = district.group(1)
        sale = li.xpath('.//div[contains(@class,"fangyuan")]/span/text()').get()
        price = "".join(li.xpath('.//div[@class="nhouse_price"]//text()').getall())
        price = re.sub(r'\s|广告', '', price)
        detail_url = li.xpath('.//div[@class="nlcd_name"]/a/@href').get()
        origin_url = response.urljoin(detail_url)
        # origin_url (https://lefuqiangyuerongwan.fang.com)
        # detail page (https://lefuqiangyuerongwan.fang.com/house/2110175680/housedetail.htm)
        # print("TAG============================", origin_url)
        yield scrapy.Request(url=origin_url,
                             callback=self.get_new_code,
                             meta={'info': (name, origin_url)})
        # newcode = self.get_new_code(origin_url)
        # detail_url = origin_url + "/house/" + newcode + "/housedetail.htm"
        # detail_intro = self.get_house_inttro(detail_url)
        item = NewHouseItem()
        for field in item.fields.keys():  # copy each declared field from the local variable of the same name
            item[field] = eval(field)
        yield item
    next_url = response.xpath('//div[@class="page"]//a[@class="next"]/@href').get()
    if next_url:
        next_page = response.urljoin(next_url)  # build the absolute URL of the next page
        print(next_page)
        yield scrapy.Request(url=next_page,
                             callback=self.parse_newhouse,
                             meta={'info': (province, city)})
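# A note on the eval(field) pattern above (a sketch, not part of the original
# spider): it only works when a local variable exists for every declared field
# and it evaluates arbitrary strings. A safer equivalent copies values out of a
# dict such as locals() and skips fields that were never set. populate_item()
# is a hypothetical helper name.
def populate_item(item_cls, values):
    """Build an item from `values`, copying only the fields the item declares."""
    item = item_cls()
    for field in item.fields:
        if field in values:
            item[field] = values[field]
    return item
# Usage at the end of the loop body: yield populate_item(NewHouseItem, locals())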
def parse_newhouse(self, response):
    province, city = response.meta.get('info')
    lis = response.xpath("//div[contains(@class,'nl_con')]/ul/li")
    for li in lis:
        name = li.xpath(".//div[@class='nlcd_name']/a/text()").get()
        house_type_list = li.xpath(".//div[contains(@class,'house_type')]/a/text()").getall()
        house_type_list = list(map(lambda x: re.sub(r'\s', '', x), house_type_list))
        rooms = list(filter(lambda x: x.endswith('居'), house_type_list))
        area = ''.join(li.xpath(".//div[contains(@class,'house_type')]/text()").getall())
        area = re.sub(r'\s|-|/', '', area)
        # the leading "." keeps these lookups relative to the current <li>
        address = li.xpath(".//div[@class='address']/a/@title").get()
        # district = li.xpath(".//span[@class='sngrey']").get()
        district = ''.join(li.xpath(".//div[@class='address']/a//text()").getall())
        # district = re.search(r'.*\[(.*?)\].*', district_text).group(1)
        sale = li.xpath(".//div[contains(@class,'fangyuan')]/span/text()").get()
        price = ''.join(li.xpath(".//div[@class='nhouse_price']//text()").getall())
        price = re.sub(r'\s|广告', '', price)
        origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
        item = NewHouseItem(name=name, rooms=rooms, area=area, address=address,
                            district=district, sale=sale, price=price,
                            origin_url=origin_url, province=province, city=city)
        yield item
    next_url = response.xpath("//div[@class='page']//a[@class='next']/@href").get()
    if next_url:
        # join against the current newhouse_url
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_newhouse,
                             meta={'info': (province, city)})
def parse_newhouse(self, response):
    province, city = response.meta.get("info")
    lis = response.xpath("//div[contains(@class, 'nl_con')]/ul/li[not(@style)]")
    for li in lis:
        # project name
        name = li.xpath(".//div[@class='nlcd_name']/a/text()").get().strip()
        # number of rooms
        rooms = li.xpath(".//div[contains(@class, 'house_type')]/a//text()").getall()
        # floor area
        area = li.xpath(".//div[contains(@class, 'house_type')]/text()").getall()
        area = "".join(area).strip()
        area = re.sub(r"/|-|\s", "", area)
        # street address
        address = li.xpath(".//div[@class = 'address']/a/@title").get()
        # administrative district
        district = li.xpath(".//div[@class = 'address']/a//text()").getall()
        district = "".join(district)
        district = re.search(r".*\[(.+)\].*", district).group(1)
        # sale status
        sale = li.xpath(".//div[contains(@class, 'fangyuan')]/span/text()").get()
        # price
        price = li.xpath(".//div[@class = 'nhouse_price']//text()").getall()
        price = "".join(price).strip()
        # detail page url
        origin_url = li.xpath(".//div[@class = 'nlcd_name']/a/@href").get()
        # build and yield the item
        item = NewHouseItem(province=province, city=city, name=name, rooms=rooms,
                            area=area, address=address, district=district,
                            sale=sale, price=price, origin_url=origin_url)
        yield item
    # follow the next page
    next_url = response.xpath("//div[@class = 'page']//a[@class = 'next']/@href").get()
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_newhouse,
                             meta={"info": (province, city)})
def parse_newhouse(self, response):
    province, city = response.meta.get('info')
    lis = response.xpath("//div[contains(@class,'nl_con')]/ul/li")
    for li in lis:
        name = li.xpath(".//div[@class='nlcd_name']/a/text()").get()
        if name is not None:
            name = name.strip()
        house_type_list = li.xpath(".//div[contains(@class,'house_type')]/a/text()").getall()
        house_type_list = list(map(lambda x: re.sub(r"\s", "", x), house_type_list))
        rooms = list(filter(lambda x: x.endswith("居"), house_type_list))
        area = "".join(li.xpath(".//div[contains(@class,'house_type')]/text()").getall())
        area = re.sub(r"\s|-|/", "", area)
        address = li.xpath(".//div[@class='address']/a/@title").get()
        district_text = "".join(li.xpath(".//div[@class='address']/a//text()").getall())
        district = re.search(r".*\[(.+)\].*", district_text).group(1)
        sale = li.xpath(".//div[contains(@class,'fangyuan')]/span/text()").get()
        price = "".join(li.xpath(".//div[@class='nhouse_price']//text()").getall())
        price = re.sub(r"\s|广告", "", price)
        origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
        item = NewHouseItem(name=name, rooms=rooms, area=area, address=address,
                            district=district, sale=sale, price=price,
                            origin_url=origin_url, province=province, city=city)
        yield item
    next_url = response.xpath("//div[@class='page']//a[@class='next']/@href").get()
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_newhouse,
                             meta={"info": (province, city)})
def parse_newhouse(self, response):
    province, city, cityabbr = response.meta.get("info")
    print(province, city)
    lis = response.xpath("//div[contains(@class,'nl_con')]/ul/li")
    for li in lis:
        name = li.xpath(".//div[@class='nlcd_name']/a/text()").get()
        if name is None:
            continue
        name = name.strip()
        house_type_list = li.xpath(".//div[contains(@class,'house_type')]/a/text()").getall()
        house_type_list = list(map(lambda x: re.sub(r"\s", "", x), house_type_list))
        rooms = list(filter(lambda x: x.endswith("居") | x.endswith("上"), house_type_list))
        # print(rooms)
        area = "".join(li.xpath(".//div[contains(@class,'house_type')]/text()").getall())
        area = re.sub(r'\s|-|/', "", area)
        # print(area)
        address = li.xpath(".//div[@class='address']/a/@title").get()
        district_text = "".join(li.xpath(".//div[@class='address']/a//text()").getall())
        district = re.search(r".*\[(.+)\].*", district_text).group(1)
        # print(district)
        sale = li.xpath(".//div[contains(@class,'fangyuan')]/span/text()").get()
        price = "".join(li.xpath(".//div[@class='nhouse_price']//text()").getall())
        price = re.sub(r"\s", "", price)
        origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
        origin_url = urljoin("https:", origin_url)
        # print(origin_url)
        # print(sale, price)
        item = NewHouseItem(province=province, city=city, name=name, rooms=rooms,
                            area=area, address=address, district=district,
                            sale=sale, price=price, origin_url=origin_url)
        yield item
    next_url = response.xpath("//div[@class='page']//a[@class='next']/@href").get()
    if next_url:
        temp_url = "https://" + cityabbr + ".newhouse.fang.com"
        next_url = urljoin(temp_url, next_url)
        # print("next page: %s" % next_url)
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_newhouse,
                             meta={"info": (province, city, cityabbr)})
def parse_newhouse(self, response):
    province, city = response.meta.get('info')
    lis = response.xpath("//div[contains(@class,'nl_con')]/ul/li[not(@style)]")
    for li in lis:
        name = li.xpath(".//div[@class='nlcd_name']/a/text()").get()
        name = re.sub(r"\s", '', name)
        house_type_text = li.xpath(".//div[contains(@class,'house_type')]/a/text()").getall()
        rooms = list(map(lambda x: re.sub(r'\s', '', x), house_type_text))
        area = "".join(li.xpath(".//div[contains(@class,'house_type')]/text()").getall())
        area = re.sub(r"\s|-|/", "", area)
        # street address
        address = li.xpath(".//div[@class='address']/a/@title").get()
        # administrative district
        district_text = "".join(li.xpath(".//div[@class='address']/a/span/text()").getall())
        district = re.search(r".*\[(.+)\].*", district_text).group(1)
        sale = li.xpath(".//div[@class='fangyuan']/span/text()").get()
        price = "".join(li.xpath(".//div[@class='nhouse_price']//text()").getall())
        price = re.sub(r"\s", "", price)
        origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
        origin_url = "https:" + origin_url[1:-1]
        item = NewHouseItem(name=name, rooms=rooms, price=price, area=area,
                            address=address, sale=sale, district=district,
                            origin_url=origin_url, province=province, city=city)
        yield item
    next_url = response.xpath("//div[@class='page']//a[@class='next']/@href").get()
    # if urljoin does not work, build the URL by hand:
    # next_url = response.url + next_url
    # if a next page exists, schedule this parser again to keep crawling
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_newhouse,
                             meta={"info": (province, city)})
def parse_newhouse(self, response):
    province, city = response.meta.get('info')
    # match divs whose class attribute contains nl_con
    lis = response.xpath("//div[contains(@class,'nl_con')]/ul/li")
    for li in lis:
        NewItem = NewHouseItem()
        name = li.xpath(".//div[@class='nlcd_name']/a/text()").extract()
        # replace whitespace with empty strings: the project name
        name = list(map(lambda x: re.sub(r'\s', '', x), name))
        house_type_list = li.xpath(".//div[contains(@class,'house_type ')]//a/text()").extract()
        # filter() keeps entries ending in '居': the number of rooms
        rooms = list(filter(lambda x: x.endswith("居"), house_type_list))
        # floor area
        area = ''.join(li.xpath(".//div[contains(@class,'house_type')]/text()").getall())
        area = re.sub(r"\s|/|-", '', area)
        # street address
        addrees = li.xpath(".//div[@class='address']/a/@title").extract_first()
        # administrative district
        district_text = li.xpath(".//div[@class='address']/a/span/text()").extract()
        district = list(map(lambda x: re.sub(r'\s', '', x), district_text))
        # price: join the list into a string
        price = ''.join(li.xpath(".//div[@class='nhouse_price']//text()").extract())
        price = re.sub(r'\s|广告', '', price)
        # sale status
        sale = li.xpath(".//div[contains(@class,'fangyuan')]/span/text()").extract_first()
        # listing link
        origin_url = str(li.xpath(".//div[@class='nlcd_name']/a/@href").extract_first())
        NewItem["province"] = province
        NewItem["city"] = city
        NewItem["name"] = name
        NewItem["price"] = price
        NewItem["rooms"] = rooms
        NewItem["area"] = area
        NewItem["addrees"] = addrees
        NewItem["district"] = district
        NewItem["sale"] = sale
        NewItem["origin_url"] = origin_url
        yield NewItem
    next_url = response.xpath("//div[@class='page']//a[@class='next']/@href").extract_first()
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_newhouse,
                             meta={"info": (province, city)})
def parse_newhouse(self, response):
    province = response.meta['info'][0]
    city = response.meta['info'][1]
    newhouse_lis = response.xpath('.//div[@id="newhouse_loupai_list"]/ul/li')
    for newhouse_li in newhouse_lis:
        name = newhouse_li.xpath('.//div[@class="nlcd_name"]/a/text()').get()
        if name is None:
            continue
        name = name.strip()
        rooms = newhouse_li.xpath(".//div[contains(@class,'house_type')]/a/text()").getall()
        if rooms == []:
            continue
        area = "".join(newhouse_li.xpath(".//div[contains(@class,'house_type')]/text()").getall())
        area = re.sub(r"\s|/|-", "", area)
        address = newhouse_li.xpath(".//div[@class='address']/a/@title").get()
        district = "".join(newhouse_li.xpath(".//div[@class='address']/a//text()").getall())
        district = re.search(r".*\[(.*?)\].*", district).group(1)
        sale = newhouse_li.xpath('.//div[contains(@class,"fangyuan")]/span/text()').get()
        prise = newhouse_li.xpath(".//div[@class='nhouse_price']//text()").getall()
        prise = "".join(prise)
        prise = re.sub(r"\s|广告", "", prise)
        origin_url = newhouse_li.xpath(".//div[@class='nlcd_name']/a/@href").get()
        origin_url = "http:" + origin_url
        item = NewHouseItem(name=name, rooms=rooms, area=area, address=address,
                            district=district, sale=sale, prise=prise,
                            origin_url=origin_url, province=province, city=city,
                            where="NH")
        yield item
    next_url = response.xpath('.//a[@class="next"]/@href').get()
    if next_url:
        next_url = response.urljoin(next_url)
        print(next_url)
        yield scrapy.Request(next_url, callback=self.parse_newhouse,
                             meta={'info': (province, city)})
def parse_newhouse(self, response):
    province, city = response.meta.get('info')
    divs = response.xpath('//div[@class="nhouse_list"]//ul/li//div[@class="nlc_details"]')
    for div in divs:
        name = div.xpath('.//div[@class="nlcd_name"]/a/text()').get().strip()
        rooms = div.xpath('.//div[contains(@class,"house_type")]/a/text()').getall()
        rooms = list(filter(lambda x: x.endswith('居'), rooms))
        if not rooms:
            rooms = '未知'
        area = "".join(div.xpath('.//div[contains(@class,"house_type")]/text()').getall())
        area = re.sub(r"\s|-|\/", "", area)
        if area == '':
            area = '未知'
        address = re.sub(r'\[.*\]', "",
                         "".join(div.xpath('.//div[@class="address"]/a/@title').getall()))
        district = "".join(div.xpath('.//div[@class="address"]/a//text()').getall())
        district = re.findall(r".*\[(.+)\].*", district)
        if not district:
            district = '未知'
        else:
            district = district[0]
        # keep this lookup relative to the current listing div
        sale = div.xpath('.//div[contains(@class,"fangyuan")]/span/text()').get()
        price = re.sub(r"\s|广告", "",
                       "".join(div.xpath('.//div[@class="nhouse_price"]//text()').getall()))
        origin_url = div.xpath('.//div[@class="nlcd_name"]/a/@href').get()
        if not origin_url.startswith('https:'):
            origin_url = 'https:' + origin_url
        nitem = NewHouseItem(province=province, city=city, name=name, rooms=rooms,
                             area=area, address=address, district=district,
                             sale=sale, price=price, origin_url=origin_url)
        yield nitem
    next_url = response.xpath('//div[@class="page"]//a[@class="next"]/@href').get()
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_newhouse,
                             meta={'info': (province, city)})
def parse_newhouse(self, response):
    province, city = response.meta.get('info')
    lis = response.xpath(
        "//div[contains(@class, 'nl_con')]/ul/li[not(@class)]/div[@class='clearfix']/div[@class='nlc_details']"
    )
    for li in lis:
        item = NewHouseItem(province=province, city=city)
        # skip entries whose price block has a second <em>
        ad = li.xpath(".//div[@class='nhouse_price']/em[2]/text()").get()
        if ad is not None:
            continue
        item['name'] = li.xpath(".//div[@class='nlcd_name']/a/text()").get().strip()
        item['rooms'] = "/".join(li.xpath(".//div[contains(@class, 'house_type')]/a/text()").getall())
        item['area'] = re.sub(r"\s|/|-", "",
                              "".join(li.xpath(".//div[contains(@class, 'house_type')]/text()").getall()))
        district_text = "".join(li.xpath(".//div[@class='address']/a//text()").getall())
        item['district'] = re.search(r".*\[(.+)\].*", district_text).group(1)
        item['address'] = li.xpath(".//div[@class='address']/a/@title").get()
        item['origin_url'] = "http:{}".format(li.xpath(".//div[@class='nlcd_name']/a/@href").get())
        price_num = "".join(li.xpath(".//div[@class='nhouse_price']//text()").getall())
        item['price'] = re.sub(r"\s", "", price_num)
        item['telephone'] = "".join(li.xpath(".//div[@class='tel']/p//text()").getall())
        item['sale'] = li.xpath(".//div[contains(@class, 'fangyuan')]/span/text()").get()
        item['label'] = "/".join(li.xpath(".//div[contains(@class, 'fangyuan')]/a/text()").getall())
        yield item
    # pagination: pick the "next page" link depending on which arrow is disabled
    span = response.xpath("//div[@class='otherpage']/span[1]/@class").get()
    if span == 'disable':
        next_page_url = response.xpath("//div[@class='otherpage']/a[1]/@href").get()
    else:
        next_page_url = response.xpath("//div[@class='otherpage']/a[2]/@href").get()
    if next_page_url is not None:
        next_page = response.urljoin(next_page_url)
        yield scrapy.Request(url=next_page,
                             callback=self.parse_newhouse,
                             meta={"info": (province, city)})
def parse_newhouse(self, response):
    provinces, city = response.meta.get("info")
    # all listing entries on this page
    lis = response.xpath('//div[contains(@class,"nl_con")]/ul/li')
    for li in lis:
        # skip ad <li> elements (they have no nlcd_name div)
        if not li.xpath('.//div[@class="nlcd_name"]'):
            continue
        # instantiate one item per listing
        item = NewHouseItem()
        # project name
        item["name"] = li.xpath('.//div[@class="nlcd_name"]/a/text()').get().strip()
        house_type_text = li.xpath(".//div[contains(@class,'house_type')]/a//text()").getall()
        # number of rooms (keep "X居" and "X居以上" entries)
        item["rooms"] = list(
            filter(lambda x: x.endswith('居') or x.endswith('以上'), house_type_text))
        area = "".join(li.xpath('.//div[contains(@class,"house_type")]/text()').getall())
        # floor area
        item["area"] = re.sub(r"\s|/|-", "", area)
        # street address
        item["address"] = li.xpath('.//div[@class="address"]/a/@title').get()
        # administrative district
        district = "".join(li.xpath('.//div[@class="address"]/a//text()').getall())
        # some listings carry no district at all
        if "[" not in district:
            item["district"] = None
        else:
            item["district"] = re.search(r".*\[(.+)\].*", district).group(1)
        # sale status
        item["sale"] = li.xpath('.//div[contains(@class,"fangyuan")]/span/text()').get()
        # price
        price = "".join(li.xpath(".//div[@class='nhouse_price']//text()").getall())
        item["price"] = re.sub(r'\s|广告', "", price)
        # origin_url
        item["origin_url"] = response.urljoin(li.xpath('.//div[@class="nlcd_name"]/a/@href').get())
        item["provinces"] = provinces
        item["city"] = city
        yield item
    next_url = response.xpath("//div[@class='page']//a[@class='next']/@href").get()
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_newhouse,
                             meta={"info": (provinces, city)})
def parse_newhouse(self, response):
    province, city = response.meta.get('info')
    lis = response.xpath('//div[contains(@class,"nl_con")]/ul/li')
    for li in lis:
        # project name
        name = li.xpath('.//div[@class="nlcd_name"]/a/text()').get()
        if name:
            name = name.strip()
            # layouts: how many rooms
            house_type_list = li.xpath('.//div[contains(@class,"house_type")]//a/text()').getall()
            house_type_list = list(map(lambda x: x.replace(" ", ""), house_type_list))
            room = list(filter(lambda x: x.endswith('居'), house_type_list))
            # floor area
            area = ''.join(li.xpath('.//div[contains(@class,"house_type")]/text()').getall())
            area = re.sub(r'\s|-|/', '', area)
            # street address
            address = li.xpath('.//div[@class="address"]/a/@title').get()
            # administrative district
            district_text = ''.join(li.xpath('.//div[@class="address"]/a//text()').getall())
            district = re.search(r'.*\[(.+)\].*', district_text).group(1)
            # sale status
            sale = li.xpath('.//div[contains(@class,"fangyuan")]/span/text()').get()
            # price
            price = ''.join(li.xpath('.//div[@class="nhouse_price"]//text()').getall())
            price = re.sub(r'\s|广告', '', price)
            # detail page url
            origin_url = li.xpath('.//div[@class="nlcd_name"]/a/@href').get()
            origin_url = 'http:' + origin_url
            item = NewHouseItem(province=province, city=city, name=name, rooms=room,
                                area=area, address=address, district=district,
                                sale=sale, price=price, origin_url=origin_url)
            yield item
    next_url = response.xpath('//a[@class="next"]/@href').get()
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_newhouse,
                             meta={'info': (province, city)})
def parse_newhourse(self, response):
    province, city = response.meta.get("info")
    print("newurl response:", response.url)
    list_div = response.xpath("//div[contains(@class,'nl_con')]/ul/li")
    for li in list_div:
        name = li.xpath(".//div[@class='nlcd_name']/a/text()").get()
        if name is not None:
            name = name.strip()
        else:
            continue
        house_type_list = li.xpath(".//div[contains(@class,'house_type')]/a/text()").getall()
        house_type_list = list(map(lambda x: re.sub(r"\s", "", x), house_type_list))
        rooms_list = list(filter(lambda x: x.endswith("居"), house_type_list))
        rooms = ""
        for room in rooms_list:
            rooms = rooms + room
        # "".join() concatenates with an empty separator
        area = "".join(li.xpath(".//div[contains(@class,'house_type')]/text()").getall())
        area = re.sub(r"\s|-|/", "", area)
        address = li.xpath(".//div[@class='address']/a/@title").get()
        district_text = "".join(li.xpath(".//div[@class='address']/a//text()").getall())
        if '[' in district_text:
            district = re.search(r".*\[(.+)\].*", district_text).group(1)
        else:
            district = city
        sale = li.xpath(".//div[contains(@class,'fangyuan')]/span/text()").get()
        price = "".join(li.xpath(".//div[@class='nhouse_price']//text()").getall())
        price = re.sub(r"\s|广告", "", price)
        origin_url = "http:" + li.xpath(".//div[@class='nlcd_name']/a/@href").get()
        item = NewHouseItem(name=name, rooms=rooms, area=area, address=address,
                            district=district, sale=sale, price=price,
                            origin_url=origin_url, province=province, city=city)
        yield item
    # next page, e.g.
    # https://newhouse.fang.com/house/s/b92/
    # https://newhouse.fang.com/house/s/b91/
    next_url = response.xpath("//a[text()='下一页']/@href").get()
    if next_url:
        next_url = response.urljoin(next_url)
        yield scrapy.Request(
            url=next_url,
            callback=self.parse_newhourse,
            meta={"info": (province, city)}
        )
def parse_newhouse(self, response):
    # new houses
    provice, city = response.meta.get('info')
    lis = response.xpath("//div[contains(@class,'nl_con')]/ul/li")
    for li in lis:
        name = li.xpath(
            ".//div[contains(@class,'house_value')]//div[@class='nlcd_name']/a/text()"
        ).get()
        if name:
            name = re.sub(r"\s", "", name)
            # rooms
            house_type_list = li.xpath(".//div[contains(@class,'house_type')]/a/text()").getall()
            house_type_list = list(map(lambda x: re.sub(r"\s", "", x), house_type_list))
            rooms = list(filter(lambda x: x.endswith("居"), house_type_list))
            # floor area
            area = "".join(li.xpath(".//div[contains(@class,'house_type')]/text()").getall())
            area = re.sub(r"\s|-|/", "", area)
            # street address
            address = li.xpath(".//div[@class='address']/a/@title").get()
            address = re.sub(r"[请选择]", "", address)
            sale = li.xpath(".//div[contains(@class,'fangyuan')]/span/text()").get()
            price = "".join(li.xpath(".//div[@class='nhouse_price']//text()").getall())
            price = re.sub(r"\s|广告", "", price)
            # detail page url
            origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
            item = NewHouseItem(name=name, rooms=rooms, area=area, address=address,
                                sale=sale, price=price, origin_url=origin_url,
                                provice=provice, city=city)
            yield item
    # next page
    next_url = response.xpath("//div[@class='page']//a[@class='next']/@href").get()
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_newhouse,
                             meta={'info': (provice, city)})
def parse_newhouse(self, response):
    province, city = response.meta.get("info")
    print(province + " " + city)
    lis = response.xpath("//div[@class='nl_con clearfix']/ul/li")
    for li in lis:
        name = li.xpath(".//div[@class='nlcd_name']/a/text()").get()
        if name is None:
            continue
        name = name.strip()
        house_type = li.xpath(".//div[@class='house_type clearfix']//text()").getall()
        house_type = "".join(list(map(lambda x: re.sub(r"\s", "", x), house_type)))
        area = None
        if house_type.find("-") >= 0:
            area = house_type.split("-")[1]
            house_type = house_type.split("-")[0].split("/")
        address = li.xpath(".//div[@class='address']/a/@title").get()
        district = "".join(li.xpath(".//div[@class='address']/a//text()").getall())
        district = re.search(r".*\[(.+)\].*", district)
        if district is not None:
            district = district.group(1)
        sale = li.xpath(".//div[@class='fangyuan pr']/span/text()").get()
        price = "".join(li.xpath(".//div[@class='nhouse_price']//text()").getall()).strip()
        price = re.sub(r"\s|广告", "", price)
        origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
        item = NewHouseItem(name=name, rooms=house_type, price=price, address=address,
                            district=district, sale=sale, origin_url=origin_url,
                            area=area, province=province, city=city)
        yield item
    next_url = response.xpath("//div[@class='page']//a[@class='next'][last()]/@href").get()
    print(next_url)
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_newhouse,
                             meta={"info": (province, city)})
def parse_newhouse(self, response):
    province, city = response.meta.get('info')
    # all listings on this page
    lis = response.xpath("//div[contains(@class,'nl_con')]/ul/li")
    for li in lis:
        # project name
        name = li.xpath(".//div[@class='nlcd_name']/a/text()").get()
        if name:
            # number of rooms
            house_type_list = li.xpath(".//div[contains(@class,'house_type clearfix')]/a/text()").getall()
            house_type_list = list(map(lambda x: re.sub(r"\s", "", x), house_type_list))
            rooms = list(filter(lambda x: x.endswith("居"), house_type_list))
            # floor area, joined into a single string
            area = "".join(li.xpath(".//div[contains(@class,'house_type clearfix')]/text()").getall())
            area = re.sub(r"\s|-|/", "", area)
            # street address
            address = li.xpath(".//div[@class='address']/a/@title").get()
            # administrative district
            district_text = "".join(li.xpath(".//div[@class='address']/a//text()").getall())
            district = re.search(r".*\[(.+)\].*", district_text).group(1)
            # sale status (relative to the current <li>)
            sale = li.xpath(".//div[contains(@class,'fangyuan')]/span/text()").get()
            # price
            price = "".join(li.xpath(".//div[@class='nhouse_price']//text()").getall())
            price = re.sub(r"\s|广告", "", price)
            # fang.com detail url
            origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
            item = NewHouseItem(
                name=name,
                rooms=rooms,
                area=area,
                address=address,
                sale=sale,
                price=price,
                origin_url=origin_url,
                province=province,
                city=city
            )
            yield item
    # next page
    next_url = response.xpath("//div[@class='page']/a[@class='next']/@href").get()
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_newhouse,
                             meta={'info': (province, city)})
def parse_newhouse(self, response):
    province, city = response.meta.get('info')
    lis = response.xpath('//div[@id="newhouse_loupai_list"]/ul/li[not(@style)]')
    for li in lis:
        name = li.xpath('.//div[@class="nlcd_name"]/a/text()').get().strip()
        price = "".join(li.xpath('.//div[@class="nhouse_price"]//text()').getall())
        price = re.sub(r"\s|广告", "", price)
        house_type_list = li.xpath('.//div[contains(@class, "house_type")]//text()').getall()
        house_type_list = re.sub(r"\s", "", "".join(house_type_list)).split('-')
        rooms = house_type_list[0]
        area = house_type_list[-1]
        address = li.xpath('.//div[@class="address"]/a/@title').get()
        district_text = "".join(li.xpath('.//div[@class="address"]/a//text()').getall())
        district = re.search(r".*\[(.+)\].*", district_text).group(1)
        # query these relative to the current <li>, not the whole response
        sale = li.xpath(".//div[@class='fangyuan pr']/span/text()").get()
        origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
        origin_url = response.urljoin(origin_url)
        item = NewHouseItem(province=province, city=city, name=name, price=price,
                            rooms=rooms, area=area, address=address,
                            district=district, sale=sale, origin_url=origin_url)
        # print(name, price, rooms, area, address, district, sale, origin_url)
        yield item
    next_url = response.xpath("//div[@class='page']//a[@class='next']/@href").get()
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_newhouse,
                             meta={"info": (province, city)})
def parse_newhouse(self, response):
    province, city = response.meta.get('info')
    lis = response.xpath('//div[contains(@class,"nl_con")]/ul/li')
    for li in lis:
        name = li.xpath('.//div[@class="nlcd_name"]/a/text()').get()
        if name:
            name = name.strip()
        rooms = li.xpath('.//div[contains(@class,"house_type")]//a/text()').getall()
        area = ''.join(li.xpath('.//div[contains(@class,"house_type")]/text()').getall())
        area = re.sub(r'\s|-|/', '', area)
        address = li.xpath('.//div[@class="address"]/a/@title').get()
        district_text = ''.join(li.xpath('.//div[@class="address"]//text()').getall())
        district = None
        if district_text:
            district = re.search(r'\[(.+)\]', district_text).group(1)
        sale = li.xpath('.//div[contains(@class,"fangyuan")]/span[1]/text()').get()
        price = ''.join(li.xpath('.//div[@class="nhouse_price"]//text()').getall())
        price = re.sub(r'\s|广告', '', price)
        origin_url = li.xpath('.//div[@class="nlcd_name"]/a/@href').get()
        item = NewHouseItem(province=province, city=city, name=name, rooms=rooms,
                            area=area, address=address, district=district,
                            sale=sale, price=price, origin_url=origin_url)
        yield item
    next_url = response.xpath('//div[@class="page"]//a[@class="next"]/@href').get()
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_newhouse,
                             meta={'info': (province, city)})
def parse_new_fang(self, response):
    province, city = response.meta.get('city')
    lis = response.xpath('//div[contains(@class, "nl_con")]/ul/li')
    for li in lis:
        name = li.xpath(".//div[@class='nlcd_name']/a/text()").get().strip()
        house_type_list = li.xpath(".//div[contains(@class,'house_type')]/a/text()").getall()
        house_type_list = list(map(lambda x: re.sub(r'\s', '', x), house_type_list))
        rooms = list(filter(lambda x: x.endswith('居'), house_type_list))
        area = ''.join(li.xpath(".//div[contains(@class,'house_type')]/text()").getall()).strip()
        area = re.sub(r'\s|-|/|平米', '', area)
        address = li.xpath(".//div[@class='address']/a/@title").get()
        district_text = ''.join(li.xpath(".//div[@class='address']/a//text()").getall())
        district = re.search(r'.*?\[(.*)\].*', district_text).group(1)
        sale = li.xpath(".//div[contains(@class, 'fangyuan')]/span/text()").get()
        price = ''.join(li.xpath(".//div[@class='nhouse_price']//text()").getall())
        price = re.sub(r'\s|广告', '', price)
        url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
        item = NewHouseItem(province=province, city=city, name=name, rooms=rooms,
                            area=area, address=address, district=district,
                            sale=sale, price=price, url=url)
        yield item
    next_url = response.xpath("//div[@class='page']//a[@class='next']/@href").get()
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_new_fang,
                             meta={'city': (province, city)})
def parse_newhouse(self, response):
    # parse the concrete new-house fields
    # meta can carry extra arguments on the Request; the callback reads them back from response.meta
    province, city = response.meta.get('info')
    lis = response.xpath('//div[contains(@class,"nl_con")]/ul/li')
    for li in lis:
        name = li.xpath(
            ".//div[contains(@class,'house_value')]//div[@class='nlcd_name']/a/text()"
        ).get()
        if name:
            name = re.sub(r"\s", "", name)
            house_type_list = li.xpath('.//div[contains(@class,"house_type")]/a/text()').getall()
            # house_type_list = list(map(lambda x: x.replace(' ', ''), house_type_list))
            house_type_list = list(map(lambda x: re.sub(r'\s', '', x), house_type_list))
            rooms = list(filter(lambda x: x.endswith('居'), house_type_list))
            area = ''.join(li.xpath('.//div[contains(@class,"house_type")]/text()').getall())
            area = re.sub(r'\s|-|/', '', area)
            address = li.xpath('.//div[@class="address"]/a/@title').get()
            # district_text = ''.join(li.xpath('.//div[@class="address"]/a//text()').getall())
            # district = re.search(r'.*\[(.+)\].*', district_text).group(1)
            sale = li.xpath(".//div[contains(@class,'fangyuan')]/span/text()").get()
            price = "".join(li.xpath(".//div[@class='nhouse_price']//text()").getall())
            price = re.sub(r"\s|广告", "", price)
            # detail page url
            origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
            item = NewHouseItem(name=name, rooms=rooms, area=area, address=address,
                                sale=sale, price=price, origin_url=origin_url,
                                province=province, city=city)
            yield item
def parse_newhouse(self, response):
    province, city, new_city_url = response.meta.get("info")
    lis = response.xpath("//div[contains(@class,'nl_con')]/ul//li")
    # lis = lis[0::]
    for li in lis:
        name = li.xpath(".//div[@class='nlcd_name']/a/text()").get()
        if name:
            name = name.strip()
        elif not name:
            continue
        house_type_list = li.xpath(".//div[contains(@class,'house_type')]/a/text()").getall()
        area = "".join(li.xpath(".//div[contains(@class,'house_type')]/text()").getall()).strip()
        area = re.sub(r'\s|/|-', '', area)
        address = li.xpath(".//div[@class='address']/a/@title").get()
        try:
            district = re.search(r'.*(\[.*?\]).*', address).group(1)
        except Exception as e:
            print("failed to extract the district", e)
            district = ''
        sale = li.xpath(".//div[@class='fangyuan']/span/text()").get()
        price_number = li.xpath(".//div[@class='nhouse_price']/span/text()").get()
        price_info = li.xpath(".//div[@class='nhouse_price']/em/text()").get()
        print("price info:", price_number, price_info)
        try:
            price = price_number + price_info
        except Exception as e:
            print("incomplete price", e)
            price = "价格待定"
        origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
        origin_url = "https:" + origin_url
        item = NewHouseItem(province=province, city=city, name=name, price=price,
                            rooms=house_type_list, area=area, address=address,
                            district=district, sale=sale, origin_url=origin_url)
        print("name:", name, "layouts:", house_type_list, "area:", area, "address:", address,
              "district:", district, "sale:", sale, "price:", price)
        yield item
    next_url = response.xpath("//div[@class='page']//a[@class='next']/@href").get()
    if next_url:
        next_url = new_city_url + next_url
        yield scrapy.Request(url=next_url, callback=self.parse_newhouse,
                             meta={'info': (province, city, new_city_url)})
def parse_newhouse(self, response):
    province, city = response.meta.get('info')
    lis = response.xpath("//div[contains(@class,'nl_con')]/ul/li")
    for li in lis:
        name = li.xpath(".//div[@class='nlcd_name']/a/text()").get()
        if name is not None:
            name = name.strip()
        # print(name)
        house_type_list = li.xpath(".//div[contains(@class,'house_type')]/a/text()").getall()
        if house_type_list is not None:
            house_type = list(map(lambda x: re.sub(r"\s", "", x), house_type_list))
            rooms = list(filter(lambda x: x.endswith("居"), house_type))
        # print(rooms)
        area = "".join(li.xpath(".//div[contains(@class,'house_type')]//text()").getall())
        area = re.sub(r"\s|-|/|\d+[居]|.*?[\u4E00-\u9FA5]+起|.*?[\u4E00-\u9FA5]+SOHO", "", area)
        # print(area)
        address_text = li.xpath(".//div[@class='address']/a/@title").get()
        if address_text is not None:
            address = address_text.strip()
        district_text = "".join(li.xpath(".//div[@class='address']/a//text()").getall())
        district_x = re.search(r".*\[(.+)\].*", district_text)
        if district_x is not None:
            district = district_x.group(1)
        sale = li.xpath(".//div[contains(@class,'fangyuan')]/span/text()").get()
        price = "".join(li.xpath(".//div[@class='nhouse_price']//text()").getall())
        price = re.sub(r"\s|广告", "", price)
        origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
        if origin_url is not None:
            origin_url = 'https:' + origin_url
        item = NewHouseItem(province=province, city=city, name=name, rooms=rooms,
                            area=area, address=address, district=district,
                            sale=sale, price=price, origin_url=origin_url)
        yield item
        # print(item)
    next_url = response.xpath(".//div[@class='page']//a[@class='next']/@href").get()
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_newhouse,
                             meta={"info": (province, city)})
def parse_new_house(self, response):
    province, city = response.meta.get('info')
    lis = response.xpath('//div[@id="newhouse_loupai_list"]/ul/li[not(@class)]')
    for li in lis:
        name = li.xpath('.//div[@class="nlcd_name"]/a/text()').get("").strip()
        number = li.xpath('.//div[@class="nhouse_price"]/span/text()').get("")
        per = li.xpath('.//div[@class="nhouse_price"]/em/text()').get("")
        price = number + per
        rooms = ",".join(li.xpath('.//div[@class="house_type clearfix"]/a/text()').getall())
        area = ",".join(li.xpath('.//div[@class="house_type clearfix"]/text()').getall())
        try:
            area = re.search(r"\d+.*米", area).group()
        except Exception:
            area = ""
        address = li.xpath('.//div[@class="address"]/a/@title').get()
        district = ",".join(li.xpath('.//div[contains(@class,"fangyuan")]/a//text()').getall())
        sale = li.xpath('.//div[contains(@class,"fangyuan")]/span/text()').get()
        origin_url = response.url
        item = NewHouseItem(province=province, city=city, name=name, price=price,
                            rooms=rooms, area=area, address=address,
                            district=district, sale=sale, origin_url=origin_url)
        yield item
    next_page = response.xpath('//a[@class="next"]/@href').get()
    if next_page:
        yield scrapy.Request(url=response.urljoin(next_page),
                             callback=self.parse_new_house,
                             meta={"info": (province, city)})
def parse_newhouse(self, response):
    # parse the concrete new-house fields
    # meta can carry extra arguments on the Request; the callback reads them back from response.meta
    province, city_name = response.meta.get('info')
    lis = response.xpath('//div[@class="nl_con clearfix"]/ul/li')
    for li in lis:
        # ads share the same two outer classes as real listings; the only
        # difference is that ads carry an <h3>, so skip those entries
        ad = li.xpath('./div[@class="clearfix"]/h3/text()').extract_first()
        if ad:
            continue
        house_name = li.xpath(
            './/div[@class="house_value clearfix"]//div[@class="nlcd_name"]/a/text()'
        ).extract_first()
        if house_name:
            house_name = re.sub(r"\s", "", house_name)
        # number of rooms, e.g. '3居/4居'
        rooms = '/'.join(li.xpath('.//div[@class="house_type clearfix"]/a/text()').extract())
        # sales phone number
        phone_num = ''.join(li.xpath('.//div[@class="tel"]/p//text()').extract())
        # floor area
        area = ''.join(li.xpath('.//div[@class="house_type clearfix"]/text()').extract())
        area = re.sub(r'\s|-|/', '', area)
        address = li.xpath('.//div[@class="address"]/a/@title').extract_first()
        # sale status (on sale / upcoming)
        sale = li.xpath(".//div[@class='fangyuan']/span/text()").extract_first()
        # selling points
        tags_list = li.xpath('//div[@id="sjina_C26_07"]//text()').extract()
        tags = list(filter(None, map(lambda x: x.strip(), tags_list)))[1:]
        tags = '/'.join(tags)
        # price per square metre; a few listings show a whole-unit price instead
        price = li.xpath(".//div[@class='nhouse_price']/span/text()").extract_first()
        price_unit = li.xpath(".//div[@class='nhouse_price']/em/text()").extract_first()
        nearby = li.xpath('//div[@class="nhouse_price"]/label[2]/text()').extract_first()
        if nearby:
            price = li.xpath('//div[@class="nhouse_price"]/i/text()').extract_first()
            # if not price_unit:
            price = price
        else:
            price = price + price_unit  # '40500元/㎡'
        # detail page url
        origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").extract_first()
        # the detail url may be missing; guard against TypeError: must be str, not NoneType
        if origin_url:
            origin_url = 'https:' + origin_url
        item = NewHouseItem()
        item['province'] = province
        item['city'] = city_name
        item['house_name'] = house_name
        item['sale'] = sale
        item['phone_num'] = phone_num if phone_num else '暂无电话'
        item['price'] = price
        item['tags'] = tags
        item['rooms'] = rooms
        item['area'] = area
        item['address'] = address
        item['origin_url'] = origin_url
        yield item
    # grab the "last page" link, e.g. '/house/s/b924/'
    last_url = response.xpath(
        '//ul[@class="clearfix"]/li[@class="fr"]/a[@class="last"]/@href'
    ).extract_first()
    # a small city with only one page of results has no last_url, and
    # .split('/') would raise, so guard it
    if last_url:
        last_page = last_url.split('/')[-2].replace('b1saledate-b9', '')
        for i in range(1, int(last_page) + 1):
            next_url = urljoin(response.url, '/house/s/b1saledate-b9{page}/'.format(page=str(i)))
            if next_url:
                yield scrapy.Request(url=next_url,
                                     callback=self.parse_newhouse,
                                     meta={
                                         'info': (province, city_name),
                                         'url': next_url
                                     },
                                     errback=self.handle_newhouse_err)
def parse_new(self, response):
    """Parse the new-house listing pages."""
    province, city = response.meta.get('info')
    html = etree.HTML(response.text)  # requires: from lxml import etree
    li_list = html.xpath('//div[@id="newhouse_loupai_list"]//li')
    # print(li_list)
    for li in li_list:
        detail = li.xpath('.//div[@class="nlc_details"]')
        if detail:
            # project name
            name = detail[0].xpath(".//div[@class='nlcd_name']/a/text()")[0].strip()
            url = detail[0].xpath(".//div[@class='nlcd_name']/a/@href")[0]
            # rooms
            rooms = detail[0].xpath('.//div[@class="house_type clearfix"]//text()')
            rooms = "".join(rooms)
            rooms = "".join(rooms.split())
            # only split into room/area when both "居" and "平米" are present
            if rooms.find("居") != -1 and rooms.find("平米") != -1:
                room = rooms.split("-")[0]
                area = rooms.split("-")[1]
            else:
                room = rooms
                area = rooms
            # price
            price = detail[0].xpath('.//div[@class="nhouse_price"]//text()')
            price = "".join(price)
            price = "".join(price.split())
            # full address
            address = detail[0].xpath(".//div[@class='address']/a/@title")[0]
            # print(type(rooms))
            # district
            district = detail[0].xpath(".//div[@class='address']/a//text()")
            district = "".join("".join(district).split()).split("]")[0] + "]"
            # status
            status = detail[0].xpath(".//div[@class='fangyuan']//text()")
            status = "".join(status).split()
            type = "/".join(status[1:-1])
            status = status[0]
            newhouse_item = {}
            newhouse_item['new'] = NewHouseItem(province=province, city=city, name=name,
                                                price=price, rooms=room, area=area,
                                                address=address, district=district,
                                                status=status, type=type, url=url)
            yield newhouse_item
    next_url = html.xpath('//div[@class="page"]//a[@class="next"]/@href')
    if next_url:
        print("current url:", response.url)
        print("next url:", next_url[0])
        if next_url[0].find("http") == -1:
            base_url = response.url.split("/house")[0]
            next_url = base_url + next_url[0]
        else:
            print("link is already absolute:", next_url)
            next_url = next_url[0]
        print(next_url)
        yield scrapy.Request(url=next_url,
                             callback=self.parse_new,
                             meta={'info': (province, city)})
def parse_newhouse(self, response):
    province, city = response.meta.get('info')
    # contains(): match divs whose class attribute includes nl_con
    lis = response.xpath("//div[contains(@class,'nl_con')]//ul//li")
    for li in lis:
        name = li.xpath(".//div[@class='nlcd_name']/a/text()").get()
        if name:
            name = name.strip()
        # print(name)
        price1 = li.xpath(".//div[@class='nhouse_price']/span/text()").get()
        price2 = li.xpath(".//div[@class='nhouse_price']/em/text()").get()
        price = str(price1) + str(price2)
        if price:
            price = price.strip()
        # print(price)
        rooms = li.xpath(".//div[contains(@class,'house_type')]/a/text()").getall()
        if rooms:
            rooms = "".join(rooms).strip()
        # print(rooms)
        area = li.xpath(".//div[contains(@class,'house_type')]/text()").getall()
        if area:
            area = "".join(area).strip()
            area = re.sub(r'\s|/|-', '', area)
        # print(area)
        address = li.xpath(".//div[contains(@class,'address')]/a/text()").getall()
        if address:
            address = "".join(address).strip()
            address = re.sub(r'\s', '', address)
        # print(address)
        district = li.xpath(".//div[contains(@class,'address')]/a/span/text()").get()
        if district:
            district = re.sub(r'\s|\[|\]', '', district)
        # print(district)
        sale = li.xpath(".//div[contains(@class,'fangyuan')]/span/text()").get()
        if sale:
            sale = sale.strip()
        # print(sale)
        origin_url = li.xpath(".//div[contains(@class,'nlcd_name')]/a/@href").get()
        if origin_url:
            origin_url = origin_url.strip()
        # print(origin_url)
        if name:
            yield NewHouseItem(province=province, city=city, name=name, price=price,
                               rooms=rooms, area=area, address=address,
                               district=district, sale=sale, origin_url=origin_url)
        else:
            continue
    next_url = response.xpath("//div[@class='page']//a[contains(@class,'next')]/@href").get()
    if next_url:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_newhouse,
                             meta={"info": (province, city)})
def parse_newhouse(self, response):
    province, city_name = response.meta.get('info')
    lis = response.xpath('//div[@class="nl_con clearfix"]/ul/li')
    for li in lis:
        ad = li.xpath('./div[@class="clearfix"]/h3/text()').extract_first()
        if ad:
            continue
        house_name = li.xpath(
            './/div[@class="house_value clearfix"]//div[@class="nlcd_name"]/a/text()'
        ).extract_first()
        if house_name:
            house_name = re.sub(r"\s", "", house_name)
        rooms = '/'.join(li.xpath('.//div[@class="house_type clearfix"]/a/text()').extract())
        phone_num = ''.join(li.xpath('.//div[@class="tel"]/p//text()').extract())
        area = ''.join(li.xpath('.//div[@class="house_type clearfix"]/text()').extract())
        area = re.sub(r'\s|-|/', '', area)
        address = li.xpath('.//div[@class="address"]/a/@title').extract_first()
        sale = li.xpath(".//div[@class='fangyuan']/span/text()").extract_first()
        tags_list = li.xpath('//div[@id="sjina_C26_07"]//text()').extract()
        tags = list(filter(None, map(lambda x: x.strip(), tags_list)))[1:]
        tags = '/'.join(tags)
        price = li.xpath(".//div[@class='nhouse_price']/span/text()").extract_first()
        price_unit = li.xpath(".//div[@class='nhouse_price']/em/text()").extract_first()
        nearby = li.xpath('//div[@class="nhouse_price"]/label[2]/text()').extract_first()
        if nearby:
            price = li.xpath('//div[@class="nhouse_price"]/i/text()').extract_first()
        if not price_unit:
            price = price
        else:
            price = price + price_unit  # '40500元/㎡'
        origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").extract_first()
        # the detail url may be missing; guard against TypeError: must be str, not NoneType
        if origin_url:
            origin_url = 'https:' + origin_url
        item = NewHouseItem()
        item['province'] = province
        item['city'] = city_name
        item['house_name'] = house_name
        item['sale'] = sale
        item['phone_num'] = phone_num if phone_num else '暂无电话'
        item['price'] = price
        item['tags'] = tags
        item['rooms'] = rooms
        item['area'] = area
        item['address'] = address
        item['origin_url'] = origin_url
        yield item
    last_url = response.xpath(
        '//ul[@class="clearfix"]/li[@class="fr"]/a[@class="last"]/@href'
    ).extract_first()  # e.g. '/house/s/b924/'
    if last_url:
        last_page = last_url.split('/')[-2].replace('b9', '')
        for i in range(1, int(last_page) + 1):
            next_url = urljoin(response.url, '/house/s/b9{page}/'.format(page=i))
            if next_url:
                yield scrapy.Request(
                    url=next_url,
                    callback=self.parse_newhouse,
                    meta={'info': (province, city_name)})
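# --- Upstream callback (a sketch under assumptions, not any author's code) ---
# Every parse_newhouse variant above expects response.meta["info"] to carry a
# (province, city) pair (a few also pass a city abbreviation or base url). A
# minimal upstream parse() could look like the following; extract_cities() is a
# hypothetical helper that yields (province, city, city_url) tuples from the
# city index page, and the ".newhouse.fang.com" rewrite follows the common
# tutorial pattern, so it may need adjusting to the current site layout.
def parse(self, response):
    for province, city, city_url in self.extract_cities(response):  # hypothetical helper
        scheme, netloc = city_url.split("//")
        prefix = netloc.split(".")[0]
        # e.g. https://bj.fang.com -> https://bj.newhouse.fang.com/house/s/
        newhouse_url = "{}//{}.newhouse.fang.com/house/s/".format(scheme, prefix)
        yield scrapy.Request(
            url=newhouse_url,
            callback=self.parse_newhouse,
            meta={"info": (province, city)},
        )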