def parse_json_url(self, response):
    urldetailejson = json.loads(response.body)
    # print("urldetailejson: ", urldetailejson)
    urllinks = urldetailejson.get("tieredResults")[0].get("results")
    for results in urllinks:
        rentUrl = results.get("prettyUrl")
        # print("rentUrl2", rentUrl)
        url = "https://www.realestate.com.au/" + rentUrl
        # self.testdetail(url)
        item = get_item(ScrapymoduleRentItem)
        item['supplier_name'] = response.meta.get("supplier_name")
        item['city'] = response.meta.get("city")
        self.test(url, item)
        yield item
def parse_json(self, response):
    urljson = json.loads(response.body)
    # print("urljson: ", urljson)
    totalcount = urljson.get("totalResultsCount")
    print("totalcount: ", totalcount, "--", response.url)
    # Supplier name and city are passed along in the request meta
    supplier_name = response.meta.get("supplier_name")
    city = response.meta.get("city")
    # First yield the listing links from the first page of results
    if totalcount > 0:
        urllinks = urljson.get("tieredResults")[0].get("results")
        for results in urllinks:
            rentUrl = results.get("prettyUrl")
            # print("rentUrl", rentUrl)
            url = "https://www.realestate.com.au/" + rentUrl
            # self.testdetail(url)
            item = get_item(ScrapymoduleRentItem)
            item['supplier_name'] = supplier_name
            item['city'] = city
            self.test(url, item)
            yield item
        # Build the trailing API parameters for the remaining pages
        last = response.meta.get("last")
        page = totalcount / 12
        if page.is_integer():
            page = int(page)
            print("page: ", page)
        else:
            page = math.ceil(page)
            print("tmppage: ", page)
        for p in range(1, page + 1):
            url = ("https://services.realestate.com.au/services/listings/search?"
                   "query={%22channel%22:%22rent%22,%22page%22:" + str(p) +
                   ",%22pageSize%22:12,%22filters%22:{%22agencyIds%22:[%22" + last + "%22]}}")
            # print(url)
            yield scrapy.Request(url,
                                 callback=self.parse_json_url,
                                 meta={
                                     "supplier_name": supplier_name,
                                     "city": city
                                 })
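# A minimal sketch of how the hand-encoded query string above could instead be
# built from json.dumps + urllib.parse.quote. The helper name build_search_url
# and its parameters are illustrative, not part of the original spider.
# import json
# from urllib.parse import quote
#
# def build_search_url(agency_id, page, page_size=12):
#     query = {
#         "channel": "rent",
#         "page": page,
#         "pageSize": page_size,
#         "filters": {"agencyIds": [agency_id]},
#     }
#     # separators=(',', ':') drops whitespace so the encoded query matches the
#     # compact form used in parse_json; safe='{}[]:,' keeps those characters
#     # literal while the double quotes become %22.
#     return ("https://services.realestate.com.au/services/listings/search?query="
#             + quote(json.dumps(query, separators=(',', ':')), safe='{}[]:,'))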
def parse_data(self, response):
    item = get_item(ScrapymoduleRentItem)
    print("---------------------------", response.url)
    item['country'] = 'England'
    item['url'] = response.url
    item['city'] = response.meta.get('city')
    print("item['city']: ", item['city'])
    try:
        '''Fields on the detail page:
        listing name:       2 Bed Flat, Rupert Street, W1D
        location:           Location
        bedrooms:           Bedrooms
        bathrooms:          Bathrooms
        maximum occupants:  Maximum Tenants
        bills included:     Bills Included
        description:        Description
        price:              Price
        monthly rent:       Rent PCM
        facilities:         Features
        student friendly:   Student Friendly
        pet friendly:       Pets Allowed
        minimum tenancy:    Minimum Tenancy
        available from:     Available From'''
        house_name = response.xpath(
            "//h1[@class='property-title']//text()").extract()
        clear_space(house_name)
        item['house_name'] = ''.join(house_name).strip()
        print("item['house_name']: ", item['house_name'])

        detaile_address = response.xpath(
            "//html//div[@class='card manage-card mb-0']//tr[1]/td[contains(text(),'Location:')]/following-sibling::td[1]//text()"
        ).extract()
        clear_space(detaile_address)
        item['detaile_address'] = ''.join(detaile_address).strip()
        print("item['detaile_address']: ", item['detaile_address'])

        housing_introduce = response.xpath(
            "//div[@class='description']//text()").extract()
        item['housing_introduce'] = clear_lianxu_space(housing_introduce)
        # print("item['housing_introduce']: ", item['housing_introduce'])

        price = response.xpath(
            "//strong[contains(text(),'Rent PCM')]/../following-sibling::td[1]//text()"
        ).extract()
        clear_space(price)
        item['price'] = ''.join(price).strip()
        print("item['price']: ", item['price'])

        supporting_facilities = response.xpath(
            "//h2[contains(text(),'Features')]/..//text()").extract()
        # clear_space(supporting_facilities)
        item['supporting_facilities'] = clear_lianxu_space(
            supporting_facilities)
        # print("item['supporting_facilities']: ", item['supporting_facilities'])

        lease = response.xpath(
            "//strong[contains(text(),'Minimum Tenancy')]/../following-sibling::td[1]//text()"
        ).extract()
        item['lease'] = clear_lianxu_space(lease)
        print("item['lease']: ", item['lease'])

        available_timeDict = {
            "January": "01", "February": "02", "March": "03", "April": "04",
            "May": "05", "June": "06", "July": "07", "August": "08",
            "September": "09", "October": "10", "November": "11", "December": "12",
        }
        available_time = response.xpath(
            "//strong[contains(text(),'Available From')]/../following-sibling::td[1]//text()"
        ).extract()
        available_time = clear_lianxu_space(available_time)
        if available_time == "Today":
            item['available_time'] = "now"
        elif "Months" in available_time:
            item['available_time'] = available_time
        else:
            # e.g. "12 August, 2018" -> "2018-08-12"
            a_time = available_time.split(" ")
            # print("a_time: ", a_time)
            item['available_time'] = a_time[-1] + "-" + available_timeDict.get(
                a_time[1].replace(',', '').strip()) + "-" + a_time[0]
        print("item['available_time']: ", item['available_time'])

        picture_list = response.xpath(
            "//div//div[@class='property-thumb']//a[@class='photos thumbnail mfp-image']/@href"
        ).extract()
        # print(picture_list)
        for picture in picture_list:
            item['picture'] += picture + "; "
        print("item['picture']: ", item['picture'])

        yield item
    except Exception as e:
        print("Exception:", str(e))
        print("Failing url:", response.url)
        with open('./error/' + item['city'] + '.txt', 'a+') as f:
            f.write(str(e) + "\n=====================" + item['url'] + "\n")
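# A minimal sketch of the "Available From" normalisation used above, factored
# into a standalone helper. It assumes the same inputs parse_data handles:
# "Today", strings containing "Months", or "12 August, 2018"-style dates.
# The names MONTHS and normalize_available_from are illustrative, not part of
# the spider.
# MONTHS = {
#     "January": "01", "February": "02", "March": "03", "April": "04",
#     "May": "05", "June": "06", "July": "07", "August": "08",
#     "September": "09", "October": "10", "November": "11", "December": "12",
# }
#
# def normalize_available_from(text):
#     text = text.strip()
#     if text == "Today":
#         return "now"
#     if "Months" in text:
#         return text
#     day, month, year = text.replace(",", "").split(" ")
#     return "{}-{}-{}".format(year, MONTHS.get(month, "01"), day)
#
# normalize_available_from("12 August, 2018")  ->  "2018-08-12"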
def parse_data(self, response):
    item = get_item(ScrapymoduleRentItem)
    print("-------------detail page url--------------", response.url)
    item['country'] = 'England'
    item['url'] = response.url
    item['city'] = response.meta.get('city')
    print("item['city']: ", item['city'])
    try:
        '''Fields on the detail page:
        listing name:  1 bedroom flat to rent
        location:      Park Street, London
        rent:          £3,250 pw
        description:   Full description'''
        house_name = response.xpath(
            "//h1[@class='fs-22']//text()").extract()
        clear_space(house_name)
        item['house_name'] = ''.join(house_name).strip()
        print("item['house_name']: ", item['house_name'])

        detaile_address = response.xpath(
            "//address[@class='pad-0 fs-16 grid-25']/text()").extract()
        clear_space(detaile_address)
        item['detaile_address'] = ''.join(detaile_address).strip()
        print("item['detaile_address']: ", item['detaile_address'])

        housing_introduce = response.xpath(
            "//h3[contains(text(),'Full description')]/..//text()"
        ).extract()
        item['housing_introduce'] = clear_lianxu_space(housing_introduce)
        print("item['housing_introduce']: ", item['housing_introduce'])

        price = response.xpath(
            "//p[@id='propertyHeaderPrice']//strong/text()").extract()
        clear_space(price)
        item['price'] = ''.join(price).strip()
        print("item['price']: ", item['price'])

        supporting_facilities = response.xpath(
            "//h3[contains(text(),'Key features')]/..//text()").extract()
        # clear_space(supporting_facilities)
        item['supporting_facilities'] = clear_lianxu_space(
            supporting_facilities)
        # print("item['supporting_facilities']: ", item['supporting_facilities'])

        # One capturing group around the URL: with the original ((jpg)|(JPG))
        # groups, re.findall returned tuples instead of the image URLs.
        picture_re = re.findall(
            r'<meta\sitemprop="contentUrl"\scontent="(.+?\.(?:jpg|JPG))',
            response.text)
        # print("picture_re: ", picture_re)
        picture = ''
        for p in picture_re:
            picture += p.strip() + '; '
        item['picture'] = picture
        print("item['picture']: ", item['picture'])

        yield item
    except Exception as e:
        print("Exception:", str(e))
        print("Failing url:", response.url)
        with open('./error/' + item['city'] + '.txt', 'a+') as f:
            f.write(str(e) + "\n=====================" + item['url'] + "\n")
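# A short illustration of the re.findall behaviour behind the regex fix above
# (standard library semantics, shown on a made-up snippet of HTML):
# import re
# html = '<meta itemprop="contentUrl" content="https://example.com/a.jpg">'
# # Multiple capturing groups -> findall returns tuples of the groups:
# re.findall(r'content=".+\.((jpg)|(JPG))', html)    # [('jpg', 'jpg', '')]
# # One group around the URL, non-capturing (?:...) for the extension ->
# # findall returns the URLs directly:
# re.findall(r'content="(.+?\.(?:jpg|JPG))', html)   # ['https://example.com/a.jpg']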
def testdetail(self, response):
    item = get_item(ScrapymoduleRentItem)
    item['country'] = 'Australia'
    item['city'] = 'Perth'
    item['url'] = response.url
    print("===========================")
    print(response.url)
    try:
        # housing_type
        housing_type = response.xpath(
            "//div[@id='listing_info']/ul[@class='info']/li[@class='property_info']/span[@class='propertyType']//text()"
        ).extract()
        clear_space(housing_type)
        item['housing_type'] = ''.join(housing_type)
        print("item['housing_type']: ", item['housing_type'])

        # available_time
        available_time = response.xpath(
            "//div[@id='listing_info_secondary']/div[@class='available_date']/span//text()"
        ).extract()
        clear_space(available_time)
        # print("available_time: ", available_time)
        available_timeDict = {
            "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04",
            "May": "05", "Jun": "06", "Jul": "07", "Aug": "08",
            "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12",
        }
        if available_time[0] == "Available Now":
            item['available_time'] = 'now'
        else:
            # e.g. a trailing "12-Aug-18" becomes "2018-08-12"
            available_timetmp = available_time[0].split(" ")[-1]
            # print(available_timetmp)
            available_timetmp1 = available_timetmp.split("-")
            # print("available_timetmp1: ====", available_timetmp1)
            available_timeResult = "20" + available_timetmp1[-1] + "-" + \
                available_timeDict[available_timetmp1[1]] + "-" + available_timetmp1[0]
            item['available_time'] = available_timeResult
        # print("item['available_time']: ", item['available_time'])

        # house_name
        house_name = response.xpath(
            "//div[@id='description']/p[@class='title']//text()").extract()
        clear_space(house_name)
        item['house_name'] = ''.join(house_name)
        # print("item['house_name']: ", item['house_name'])

        # room_type
        room_typeCarspaces = response.xpath(
            "//div[@id='features']/div/div[@class='featureList']/ul[1]/li//text()"
        ).extract()
        clear_space(room_typeCarspaces)
        # print("room_typeCarspaces: ", room_typeCarspaces)
        if item['housing_type'] == "Studio":
            item['room_type'] = 'Studio'
        else:
            room_type = ''
            if "Bedrooms:" in room_typeCarspaces:
                room_typeIndex1 = room_typeCarspaces.index("Bedrooms:")
                room_type1 = room_typeCarspaces[room_typeIndex1 + 1]
                room_type = room_type1
            if "Bathrooms:" in room_typeCarspaces:
                room_typeIndex2 = room_typeCarspaces.index("Bathrooms:")
                room_type2 = room_typeCarspaces[room_typeIndex2 + 1]
                room_type = room_type + "-" + room_type2
            item['room_type'] = room_type
        # print("item['room_type']: ", item['room_type'])

        if "Garage Spaces:" in room_typeCarspaces:
            carIndex = room_typeCarspaces.index("Garage Spaces:")
            item['car_spaces'] = room_typeCarspaces[carIndex + 1]
        elif "Open Car Spaces:" in room_typeCarspaces:
            carIndex = room_typeCarspaces.index("Open Car Spaces:")
            item['car_spaces'] = room_typeCarspaces[carIndex + 1]
        # print("item['car_spaces']: ", item['car_spaces'])

        # lease
        # address
        address = response.xpath(
            "//div[@id='listing_address']/h1/span[@class='detail-address']//text()"
        ).extract()
        clear_space(address)
        item['address'] = ','.join(address)
        # print("item['address']: ", item['address'])

        # detaile_address //div[@id='description']/h3[@class='address']
        detaile_address = response.xpath(
            "//div[@id='description']/h3[@class='address']//text()"
        ).extract()
        clear_space(detaile_address)
        item['detaile_address'] = ''.join(detaile_address)
        # print("item['detaile_address']: ", item['detaile_address'])

        opentime = response.xpath(
            "//a[@itemprop='events']//text()").extract()
        opentime = ' '.join(opentime)
        if len(opentime) != 0:
            opentimePrefix = response.xpath(
                "//div[@id='inspectionTimes']/h3//text()").extract()
            clear_space(opentimePrefix)
            opentime = ''.join(opentimePrefix) + opentime

        # supporting_facilities
        housing_introduce = response.xpath(
            "//div[@id='description']/p[@class='body']//text()").extract()
        clear_space(housing_introduce)
        features = response.xpath(
            "//div[@id='features']//text()").extract()
        clear_space(features)
        floorplans = response.xpath(
            "//div[@id='floorplans']//text()").extract()
        clear_space(floorplans)
        housing_introduce = opentime + ' '.join(
            housing_introduce) + ''.join(features) + ''.join(floorplans)
        item['housing_introduce'] = housing_introduce
        # print("item['housing_introduce']: ", item['housing_introduce'])

        # price
        price = response.xpath(
            "//div[@id='listing_info']/ul[@class='info']/li[@class='price']/p[@class='priceText']//text()"
        ).extract()
        clear_space(price)
        item['price'] = ''.join(price)
        # print("item['price']: ", item['price'])

        # isRent
        # postal_code
        # picture: image paths live in an inline script block
        pictureJs = response.xpath("//script").extract()
        # print("pictureJs: ", pictureJs)
        pictureJsStr = ''.join(pictureJs)
        pictureSrc = re.findall(r'{src:\"[\w\/\.]*jpg\"', pictureJsStr)
        # print("pictureSrc:========== ", pictureSrc)
        # print(len(pictureSrc))
        for index in range(len(pictureSrc)):
            pictureSrc[index] = pictureSrc[index].strip('{src:').strip('"')
            pictureSrc[index] = (
                "https://i3.au.reastatic.net/800x600-resize,extend,r=33,g=40,b=46"
                + pictureSrc[index])
        # print("pictureSrc:==========11 ", pictureSrc)
        item['picture'] = ';'.join(pictureSrc)
        # print("item['picture']: ", item['picture'])

        # housing_introduce
        # supplier_type
        # supplier_name
        supplier_name = response.xpath(
            "//div[@id='agentInfoExpanded']/div/a/img[@class='logo']/@alt|//div[@id='agentInfoExpanded']/div[1]/text()"
        ).extract()
        clear_space(supplier_name)
        item['supplier_name'] = ''.join(supplier_name)
        # print("item['supplier_name']: ", item['supplier_name'])

        # supplier_logo //div[@class='branding-banner-content']/a/img[@class='logo']/@src
        supplier_logo = response.xpath(
            "//div[@id='agentInfoExpanded']/div/a/img[@class='logo']/@src"
        ).extract()
        clear_space(supplier_logo)
        item['supplier_logo'] = ''.join(supplier_logo)
        # print("item['supplier_logo']: ", item['supplier_logo'])

        # contact_name
        contact_name = response.xpath(
            "//div[@class='agentContactInfo'][1]/p//text()").extract()
        clear_space(contact_name)
        if len(contact_name) != 0:
            item['contact_name'] = contact_name[0]
            print("item['contact_name']: ", item['contact_name'])

        # contact_phone
        contact_phone = response.xpath(
            "//div[@class='agentContactInfo']/ul/li/text()").extract()
        clear_space(contact_phone)
        if len(contact_phone) != 0:
            item['contact_phone'] = contact_phone[0]
            print("item['contact_phone']: ", item['contact_phone'])

        # contact_email
        # print(item)
        yield item
    except Exception as e:
        with open("./error/rentSpider.txt", 'w', encoding="utf-8") as f:
            f.write(str(e) + "\n" + response.url + "\n========================")
        print("Exception:", str(e))
        print("Failing url:", response.url)
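# A minimal alternative to the manual month mapping in testdetail, assuming the
# availability string ends in a "12-Aug-18"-style date (inferred from the
# "20" + year / abbreviated-month handling above, not verified against the page).
# The helper name parse_available is illustrative.
# from datetime import datetime
#
# def parse_available(text):
#     if text == "Available Now":
#         return "now"
#     # Last whitespace-separated token is the date, e.g. "12-Aug-18"
#     return datetime.strptime(text.split(" ")[-1], "%d-%b-%y").strftime("%Y-%m-%d")
#
# parse_available("Available 12-Aug-18")  ->  "2018-08-12"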
def parse_data(self, response):
    item = get_item(ScrapymoduleRentItem)
    print("==========================================", response.url)
    item['country'] = 'England'
    item['city'] = response.meta.get('city')
    print("item['city']: ", item['city'])
    item['url'] = response.url
    try:
        '''Fields on the detail page:
        listing name:    CRESCENT PLACE
        rent:            FROM: £ 134.00 WEEKLY
        facilities:      FEATURES
        description:     Description
        location:        63 St Mary's Road
        room type name:  8 BEDROOM SHARED HOUSE ROOM
        available from:  AVAILABLE FROM 01/08/2018
        tenancy length:  TENANCY LENGTH 45 and 51 week'''
        house_name = response.xpath(
            "//div[@class='col-sm-8']/h1//text()").extract()
        clear_space(house_name)
        item['house_name'] = ''.join(house_name).strip()
        print("item['house_name']: ", item['house_name'])

        supporting_facilities = response.xpath(
            "//div[@class='property-features']//text()").extract()
        # clear_space(supporting_facilities)
        item['supporting_facilities'] = clear_lianxu_space(
            supporting_facilities)
        # print("item['supporting_facilities']: ", item['supporting_facilities'])

        housing_introduce = response.xpath(
            "//div[@id='property-description']//text()").extract()
        clear_space(housing_introduce)
        item['housing_introduce'] = '\n'.join(housing_introduce).strip()
        # print("item['housing_introduce']: ", item['housing_introduce'])

        detaile_address = response.xpath(
            "//i[@class='fa fa-map-marker']/../text()").extract()
        clear_space(detaile_address)
        item['detaile_address'] = ''.join(detaile_address).strip()
        print("item['detaile_address']: ", item['detaile_address'])

        picture_list = response.xpath(
            "//div/div[1]/div[1]/img[1]/@src").extract()
        # print("picture_list: ", picture_list)
        for p in picture_list:
            item['picture'] += p + "; "
        print("item['picture']: ", item['picture'])

        # Rooms are split into long-term, short-term and student rooms; the
        # fields that differ are price, available-from date, tenancy length,
        # housing type and room type.
        # Long-term rentals
        long_term = response.xpath(
            '//div[@id="property-rooms"]//text()').extract()
        # Short-term rentals
        short_term = response.xpath(
            "//div[@id='short-term-rooms']/div[@class='room-wrapper']"
        ).extract()
        # print("short_term: ", short_term)
        # Student rooms
        student_room = response.xpath(
            '//div[@id="property-student-rooms"]/div[@class="room-wrapper"]'
        ).extract()

        if len(long_term) != 0:
            print("==============long-term rentals")
            price = response.xpath(
                "//div[@id='property-rooms']//div[@class='room-wrapper']//div[@class='row']//div[@class='medium-gray-background clearfix']//div[@class='col-sm-5']//text()"
            ).extract()
            clear_space(price)
            item['price'] = ' '.join(price).replace('From:', "").strip()
            print("item['price']: ", item['price'])

            available_time = response.xpath(
                '//div[@id="property-rooms"]//div[@class="room-wrapper"]//div[@class="row"]//div[@class="medium-gray-background clearfix"]//div[@class="col-sm-4"]//ul[@class="room-features-list list-unstyled"]//li[@class="room-feature"]/h5[contains(text(),"Available From")]/../following-sibling::li[1]//text()'
            ).extract()
            clear_space(available_time)
            # e.g. "01/08/2018" -> "2018-08-01"
            available_time = ''.join(available_time).split('/')
            # print(available_time)
            item['available_time'] = available_time[
                -1] + "-" + available_time[1] + "-" + available_time[0]
            print("item['available_time']: ", item['available_time'])

            # lease = response.xpath("//i[@class='fa fa-map-marker']/../text()").extract()
            # clear_space(lease)
            item['lease'] = 'Long Term'
            print("item['lease']: ", item['lease'])

            housing_type = response.xpath(
                "//div[@id='property-rooms']//div[@class='room-wrapper']//div[@class='row']//div[@class='medium-gray-background clearfix']//div[@class='col-sm-3']/h5//text()"
            ).extract()
            clear_space(housing_type)
            item['housing_type'] = ''.join(housing_type).strip()
            print("item['housing_type']: ", item['housing_type'])

            yield item

        if len(short_term) != 0:
            print("==============short-term rentals")
            for div_n in range(1, len(short_term) + 1):
                price = response.xpath(
                    "//div[@id='short-term-rooms']//div[@class='room-wrapper'][" + str(div_n) +
                    "]//div[@class='row']//div[@class='medium-gray-background clearfix']//div[@class='col-sm-5']//text()"
                ).extract()
                clear_space(price)
                item['price'] = ' '.join(price).replace('From:', "").strip()
                print("item['price']: ", item['price'])

                available_time = response.xpath(
                    "//div[@id='short-term-rooms']//div[@class='room-wrapper'][" + str(div_n) +
                    "]//div[@class='row']//div[@class='medium-gray-background clearfix']//div[@class='col-sm-4']/ul/li[2]//text()"
                ).extract()
                clear_space(available_time)
                available_time = ''.join(available_time).split('/')
                # print("available_time ", available_time)
                item['available_time'] = available_time[
                    -1] + "-" + available_time[1] + "-" + available_time[0]
                print("item['available_time']: ", item['available_time'])

                # lease = response.xpath("//i[@class='fa fa-map-marker']/../text()").extract()
                # clear_space(lease)
                item['lease'] = 'Short Term'
                # print("item['lease']: ", item['lease'])

                housing_type = response.xpath(
                    "//div[@id='short-term-rooms']//div[@class='room-wrapper'][" + str(div_n) +
                    "]//div[@class='row']//div[@class='medium-gray-background clearfix']//div[@class='col-sm-3']/h6//text()"
                ).extract()
                clear_space(housing_type)
                item['housing_type'] = ''.join(housing_type).strip()
                print("item['housing_type']: ", item['housing_type'])

                room_type = response.xpath(
                    "//div[@id='short-term-rooms']//div[@class='room-wrapper'][" + str(div_n) +
                    "]//div[@class='row']//div[@class='medium-gray-background clearfix']//div[@class='col-sm-3']/h5//text()"
                ).extract()
                clear_space(room_type)
                item['room_type'] = ''.join(room_type).strip()
                print("item['room_type']: ", item['room_type'])

                yield item

        if len(student_room) != 0:
            print("==============student rooms")
            for div_n in range(1, len(student_room) + 1):
                print("***************room " + str(div_n) + "***************")
                price = response.xpath(
                    "//div[@id='property-student-rooms']//div[@class='room-wrapper'][" + str(div_n) +
                    "]//div[@class='row']//div[@class='medium-gray-background clearfix']//div[@class='col-sm-5']//text()"
                ).extract()
                clear_space(price)
                item['price'] = ' '.join(price).replace('From:', "").strip()
                print("item['price']: ", item['price'])

                available_time = response.xpath(
                    "//div[@id='property-student-rooms']//div[@class='room-wrapper'][" + str(div_n) +
                    "]//div[@class='row']//div[@class='medium-gray-background clearfix']//div[@class='col-sm-4']/ul/li[2]//text()"
                ).extract()
                clear_space(available_time)
                available_time = ''.join(available_time).split('/')
                # print("available_time ", available_time)
                item['available_time'] = available_time[
                    -1] + "-" + available_time[1] + "-" + available_time[0]
                print("item['available_time']: ", item['available_time'])

                lease = response.xpath(
                    "//div[@id='property-student-rooms']//div[@class='room-wrapper'][" + str(div_n) +
                    "]//div[@class='row']//div[@class='medium-gray-background clearfix']//div[@class='col-sm-4']/ul/li[6]/text()"
                ).extract()
                clear_space(lease)
                item['lease'] = ''.join(lease).strip()
                print("item['lease']: ", item['lease'])

                housing_type = response.xpath(
                    "//div[@id='property-student-rooms']//div[@class='room-wrapper'][" + str(div_n) +
                    "]//div[@class='row']//div[@class='medium-gray-background clearfix']//div[@class='col-sm-3']/h6//text()"
                ).extract()
                clear_space(housing_type)
                item['housing_type'] = ''.join(housing_type).strip()
                print("item['housing_type']: ", item['housing_type'])

                room_type = response.xpath(
                    "//div[@id='property-student-rooms']//div[@class='room-wrapper'][" + str(div_n) +
                    "]//div[@class='row']//div[@class='medium-gray-background clearfix']//div[@class='col-sm-3']/h5//text()"
                ).extract()
                clear_space(room_type)
                item['room_type'] = ''.join(room_type).strip()
                print("item['room_type']: ", item['room_type'])

                yield item
    except Exception as e:
        print("Exception:", str(e))
        print("Failing url:", response.url)
def parse_data(self, response):
    item = get_item(ScrapymoduleRentItem)
    print("==========================================", response.url)
    item['country'] = 'England'
    item['url'] = response.url
    item['city'] = response.meta.get('city')
    print("item['city']: ", item['city'])
    try:
        '''Fields on the detail page:
        listing name:          2 bed flat to rent
        address:               Princes Court, Brompton Road SW3
        weekly price:          £3,250
        monthly price:         £3,250
        description:           Property description
        pictures:              photos
        facilities:            Property features
        floorplan:             Floorplan
        available immediately: Available immediately
        furnished:             Furnished'''
        house_name = response.xpath(
            "//h2[@class='listing-details-h1']//text()").extract()
        clear_space(house_name)
        item['house_name'] = ''.join(house_name).strip()
        print("item['house_name']: ", item['house_name'])

        detaile_address = response.xpath(
            "//div[@class='listing-details-address']/h2//text()").extract()
        clear_space(detaile_address)
        item['detaile_address'] = ''.join(detaile_address).strip()
        print("item['detaile_address']: ", item['detaile_address'])

        price = response.xpath(
            "//div[@class='listing-details-price text-price']//strong/span//text()"
        ).extract()
        clear_space(price)
        item['price'] = ''.join(price).replace('(', "").replace(')', '').strip()
        print("item['price']: ", item['price'])

        housing_introduce = response.xpath(
            "//h3[contains(text(),'Property description')]/..//text()"
        ).extract()
        # clear_space(housing_introduce)
        item['housing_introduce'] = clear_lianxu_space(housing_introduce)
        # print("item['housing_introduce']: ", item['housing_introduce'])

        # Non-greedy .+? so each og:image match stops at the first ".jpg"
        picture_re = re.findall(
            r'<meta\sproperty="og:image"\scontent=".+?\.jpg', response.text)
        # print("picture_re: ", picture_re)
        picture = ''
        for p in picture_re:
            picture += p.replace('<meta property="og:image" content="',
                                 '').strip() + '; '
        item['picture'] = picture
        print("item['picture']: ", item['picture'])

        supporting_facilities = response.xpath(
            "//h3[contains(text(),'Property features')]/..//text()"
        ).extract()
        # clear_space(supporting_facilities)
        item['supporting_facilities'] = clear_lianxu_space(
            supporting_facilities)
        # print("item['supporting_facilities']: ", item['supporting_facilities'])

        yield item
    except Exception as e:
        print("Exception:", str(e))
        print("Failing url:", response.url)
        with open('./error/' + item['city'] + '.txt', 'a+') as f:
            f.write(str(e) + "\n=====================" + item['url'] + "\n")
def parse_data(self, response):
    item = get_item(ScrapymoduleRentItem)
    item['country'] = 'Australia'
    item['url'] = response.url
    print("===========================")
    print(response.url)
    try:
        # housing_type
        housing_type = response.xpath(
            "//div[@id='listing_info']/ul[@class='info']/li[@class='property_info']/span[@class='propertyType']//text()"
        ).extract()
        clear_space(housing_type)
        item['housing_type'] = ''.join(housing_type)
        # print("item['housing_type']: ", item['housing_type'])

        # available_time
        available_time = response.xpath(
            "//div[@id='listing_info_secondary']/div[@class='available_date']/span//text()"
        ).extract()
        clear_space(available_time)
        item['available_time'] = ''.join(available_time)
        # print("item['available_time']: ", item['available_time'])

        # house_name
        house_name = response.xpath(
            "//div[@id='description']/p[@class='title']//text()").extract()
        clear_space(house_name)
        item['house_name'] = ''.join(house_name)
        # print("item['house_name']: ", item['house_name'])

        # room_type
        # lease
        # address
        address = response.xpath(
            "//div[@id='listing_address']/h1/span[@class='detail-address']//text()"
        ).extract()
        clear_space(address)
        item['address'] = ','.join(address)
        # print("item['address']: ", item['address'])

        # detaile_address //div[@id='description']/h3[@class='address']
        detaile_address = response.xpath(
            "//div[@id='description']/h3[@class='address']//text()"
        ).extract()
        clear_space(detaile_address)
        item['detaile_address'] = ''.join(detaile_address)
        # print("item['detaile_address']: ", item['detaile_address'])

        # supporting_facilities
        supporting_facilities = response.xpath(
            "//div[@id='description']/p[@class='body']//text()").extract()
        clear_space(supporting_facilities)
        item['supporting_facilities'] = ''.join(supporting_facilities)
        # print("item['supporting_facilities']: ", item['supporting_facilities'])

        # price
        price = response.xpath(
            "//div[@id='listing_info']/ul[@class='info']/li[@class='price']/p[@class='priceText']//text()"
        ).extract()
        clear_space(price)
        item['price'] = ''.join(price)
        # print("item['price']: ", item['price'])

        # isRent
        # postal_code
        # picture
        picture = response.xpath(
            "//div[@id='mainPhoto']/div[@class='hero-image__image-wrapper']/a[@class='hero-image__link']/img[@class='hero-image__image']/@src"
        ).extract()
        clear_space(picture)
        item['picture'] = ''.join(picture)
        # print("item['picture']: ", item['picture'])

        # housing_introduce
        # supplier_type
        # supplier_name
        supplier_name = response.xpath(
            "//div[@class='branding-banner-content']/a/img[@class='logo']/@alt"
        ).extract()
        clear_space(supplier_name)
        item['supplier_name'] = ''.join(supplier_name)
        print("item['supplier_name']: ", item['supplier_name'])

        # supplier_logo //div[@class='branding-banner-content']/a/img[@class='logo']/@src
        supplier_logo = response.xpath(
            "//div[@class='branding-banner-content']/a/img[@class='logo']/@src"
        ).extract()
        clear_space(supplier_logo)
        item['supplier_logo'] = ''.join(supplier_logo)
        print("item['supplier_logo']: ", item['supplier_logo'])

        # contact_name
        contact_name = response.xpath(
            "//div[@class='agentContactInfo']/p//text()").extract()
        clear_space(contact_name)
        item['contact_name'] = ','.join(contact_name)
        print("item['contact_name']: ", item['contact_name'])

        # contact_phone
        contact_phone = response.xpath(
            "//div[@class='agentContactInfo']/ul/li/text()").extract()
        clear_space(contact_phone)
        item['contact_phone'] = ','.join(contact_phone)
        print("item['contact_phone']: ", item['contact_phone'])

        # contact_email
        # print(item)
        yield item
    except Exception as e:
        with open("./error/rentSpider.txt", 'w', encoding="utf-8") as f:
            f.write(str(e) + "\n" + response.url + "\n========================")
        print("Exception:", str(e))
        print("Failing url:", response.url)
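# The except blocks above all repeat the same "print + write to ./error/"
# pattern. A minimal sketch of a shared helper that could replace them;
# log_error is an illustrative name, and the ./error/ directory is assumed to
# exist, as in the original code.
# def log_error(self, exc, response, filename="rentSpider"):
#     print("Exception:", str(exc))
#     print("Failing url:", response.url)
#     with open("./error/" + filename + ".txt", 'a+', encoding="utf-8") as f:
#         f.write(str(exc) + "\n=====================" + response.url + "\n")
#
# Usage inside any of the parse methods:
#     except Exception as e:
#         self.log_error(e, response, filename=item['city'])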