예제 #1
0
 def parse_json_url(self, response):
     """Yield one rent item per listing found in a search-API JSON page.

     The response body is decoded as JSON; listings are read from the
     ``results`` list of the first entry of ``tieredResults``.  For every
     listing that carries a ``prettyUrl`` a fresh item is created, tagged
     with the ``supplier_name`` and ``city`` passed through
     ``response.meta``, handed to ``self.test(url, item)`` and yielded.

     Args:
         response: response whose ``body`` is the JSON document and whose
             ``meta`` may hold ``supplier_name`` and ``city``.

     Yields:
         The item built for each listing.
     """
     payload = json.loads(response.body)
     # An absent or empty "tieredResults" means "no listings", not an error.
     tiers = payload.get("tieredResults") or [{}]
     listings = tiers[0].get("results") or []
     supplier_name = response.meta.get("supplier_name")
     city = response.meta.get("city")
     for listing in listings:
         pretty_url = listing.get("prettyUrl")
         if not pretty_url:
             # No path means no detail URL can be built; skip the entry.
             continue
         url = "https://www.realestate.com.au/" + pretty_url
         item = get_item(ScrapymoduleRentItem)
         item['supplier_name'] = supplier_name
         item['city'] = city
         # self.test is defined elsewhere; presumably it fills the item from
         # the detail page -- TODO confirm.
         self.test(url, item)
         yield item
예제 #2
0
 def parse_json(self, response):
     """Handle the first search-API page: emit its listings, then request every page.

     The JSON body provides ``totalResultsCount`` and, under
     ``tieredResults[0]["results"]``, the listings of this page.  One item
     is yielded per listing (tagged with ``supplier_name`` and ``city``
     from ``response.meta``), followed by one ``scrapy.Request`` per result
     page (12 listings per page) whose callback is ``parse_json_url``.

     Args:
         response: response whose ``body`` is the JSON document and whose
             ``meta`` holds ``supplier_name``, ``city`` and ``last`` (the
             agency id inserted into the paging URL).

     Yields:
         Items for the listings of this page, then the paging requests.
     """
     urljson = json.loads(response.body)
     # A missing count is treated as "no results" instead of failing on
     # the None > 0 comparison below.
     totalcount = urljson.get("totalResultsCount") or 0
     print("totalcount: ", totalcount, "--", response.url)
     # Supplier name and city are forwarded to every item and request.
     supplier_name = response.meta.get("supplier_name")
     city = response.meta.get("city")
     # Listings of the first page are contained in this very response.
     if totalcount > 0:
         tiers = urljson.get("tieredResults") or [{}]
         for results in tiers[0].get("results") or []:
             rent_url = results.get("prettyUrl")
             if not rent_url:
                 # No path means no detail URL can be built; skip the entry.
                 continue
             url = "https://www.realestate.com.au/" + rent_url
             item = get_item(ScrapymoduleRentItem)
             item['supplier_name'] = supplier_name
             item['city'] = city
             self.test(url, item)
             yield item
     # Suffix parameter of the API URL; if it is absent from meta the
     # concatenation below raises TypeError.
     last = response.meta.get("last")
     page_count = math.ceil(totalcount / 12)
     if totalcount % 12 == 0:
         print("page: ", page_count)
     else:
         print("tmppage: ", page_count)
     # NOTE(review): paging starts at 1 although the first page was already
     # parsed above; unless the duplicate filter drops that request the
     # first page's items are emitted twice -- confirm before changing.
     for p in range(1, page_count + 1):
         url = ("https://services.realestate.com.au/services/listings/search?query={%22channel%22:%22rent%22,%22page%22:"
                + str(p)
                + ",%22pageSize%22:12,%22filters%22:{%22agencyIds%22:[%22"
                + last + "%22]}}")
         yield scrapy.Request(url,
                              callback=self.parse_json_url,
                              meta={
                                  "supplier_name": supplier_name,
                                  "city": city
                              })
예제 #3
0
    def parse_data(self, response):
        """Scrape one rental detail page into an item and yield it.

        Fills country (fixed to ``'England'``), url, city (from
        ``response.meta``), house_name, detaile_address, housing_introduce,
        price, supporting_facilities, lease, available_time and picture.
        Any exception raised while extracting is printed and appended to
        ``./error/<city>.txt``; in that case nothing is yielded.

        Args:
            response: detail-page response; ``meta`` may carry ``city``.

        Yields:
            The populated item when extraction succeeds.
        """
        item = get_item(ScrapymoduleRentItem)
        print("---------------------------", response.url)
        item['country'] = 'England'
        item['url'] = response.url
        item['city'] = response.meta.get('city')
        print("item['city']: ", item['city'])

        def joined_text(xpath):
            # clear_space's return value is never used, so it is presumed to
            # clean the list in place (defined elsewhere -- TODO confirm).
            texts = response.xpath(xpath).extract()
            clear_space(texts)
            return ''.join(texts).strip()

        def collapsed_text(xpath):
            return clear_lianxu_space(response.xpath(xpath).extract())

        try:
            # Page labels the fields are read from:
            #   listing name      e.g. "2 Bed Flat, Rupert Street, W1D"
            #   location          Location
            #   bedrooms          Bedrooms
            #   bathrooms         Bathrooms
            #   max. occupants    Maximum Tenants
            #   bills included    Bills Included
            #   description       Description
            #   price             Price
            #   monthly rent      Rent PCM
            #   facilities        Features
            #   student friendly  Student Friendly
            #   pets allowed      Pets Allowed
            #   minimum tenancy   Minimum Tenancy
            #   available from    Available From
            item['house_name'] = joined_text(
                "//h1[@class='property-title']//text()")
            print("item['house_name']: ", item['house_name'])

            item['detaile_address'] = joined_text(
                "//html//div[@class='card manage-card mb-0']//tr[1]/td[contains(text(),'Location:')]/following-sibling::td[1]//text()"
            )
            print("item['detaile_address']: ", item['detaile_address'])

            item['housing_introduce'] = collapsed_text(
                "//div[@class='description']//text()")

            item['price'] = joined_text(
                "//strong[contains(text(),'Rent PCM')]/../following-sibling::td[1]//text()"
            )
            print("item['price']: ", item['price'])

            item['supporting_facilities'] = collapsed_text(
                "//h2[contains(text(),'Features')]/..//text()")

            item['lease'] = collapsed_text(
                "//strong[contains(text(),'Minimum Tenancy')]/../following-sibling::td[1]//text()"
            )
            print("item['lease']: ", item['lease'])

            month_numbers = {
                "January": "01",
                "February": "02",
                "March": "03",
                "April": "04",
                "May": "05",
                "June": "06",
                "July": "07",
                "August": "08",
                "September": "09",
                "October": "10",
                "November": "11",
                "December": "12",
            }
            available_time = collapsed_text(
                "//strong[contains(text(),'Available From')]/../following-sibling::td[1]//text()"
            )
            if available_time == "Today":
                item['available_time'] = "now"
            elif "Months" in available_time:
                item['available_time'] = available_time
            else:
                # "<day> <Month>, <year>" -> "<year>-<mm>-<day>".  An unknown
                # month raises KeyError naming the offending text, which is
                # what ends up in the error log below.
                parts = available_time.split(" ")
                month_name = parts[1].replace(',', '').strip()
                item['available_time'] = (parts[-1] + "-" +
                                          month_numbers[month_name] + "-" +
                                          parts[0])
            print("item['available_time']: ", item['available_time'])

            picture_list = response.xpath(
                "//div//div[@class='property-thumb']//a[@class='photos thumbnail mfp-image']/@href"
            ).extract()
            item['picture'] += ''.join(
                picture + "; " for picture in picture_list)
            print("item['picture']: ", item['picture'])

            yield item
        except Exception as e:
            print("异常:", str(e))
            print("报错url:", response.url)
            # city comes from response.meta.get() and may be None; fall back
            # to a fixed name so the handler cannot itself raise TypeError
            # and hide the original error.
            error_path = './error/' + (item['city'] or 'unknown') + '.txt'
            with open(error_path, 'a+', encoding='utf-8') as f:
                f.write(
                    str(e) + "\n=====================" + item['url'] + "\n")
예제 #4
0
    def parse_data(self, response):
        """Scrape one rental detail page into an item and yield it.

        Fills country (fixed to ``'England'``), url, city (from
        ``response.meta``), house_name, detaile_address, housing_introduce,
        price, supporting_facilities and picture.  Any exception raised
        while extracting is printed and appended to ``./error/<city>.txt``;
        in that case nothing is yielded.

        Args:
            response: detail-page response; ``meta`` may carry ``city``.

        Yields:
            The populated item when extraction succeeds.
        """
        item = get_item(ScrapymoduleRentItem)
        print("-------------详情页链接--------------", response.url)
        item['country'] = 'England'
        item['url'] = response.url
        item['city'] = response.meta.get('city')
        print(" item['city']: ", item['city'])

        def joined_text(xpath):
            # clear_space's return value is never used, so it is presumed to
            # clean the list in place (defined elsewhere -- TODO confirm).
            texts = response.xpath(xpath).extract()
            clear_space(texts)
            return ''.join(texts).strip()

        def collapsed_text(xpath):
            return clear_lianxu_space(response.xpath(xpath).extract())

        try:
            # Page content the fields are read from:
            #   listing name  e.g. "1 bedroom flat to rent"
            #   location      e.g. "Park Street, London"
            #   rent          e.g. "£3,250 pw"
            #   description   "Full description" section
            item['house_name'] = joined_text("//h1[@class='fs-22']//text()")
            print("item['house_name']: ", item['house_name'])

            item['detaile_address'] = joined_text(
                "//address[@class='pad-0 fs-16 grid-25']/text()")
            print("item['detaile_address']: ", item['detaile_address'])

            item['housing_introduce'] = collapsed_text(
                "//h3[contains(text(),'Full description')]/..//text()")
            print("item['housing_introduce']: ", item['housing_introduce'])

            item['price'] = joined_text(
                "//p[@id='propertyHeaderPrice']//strong/text()")
            print("item['price']: ", item['price'])

            item['supporting_facilities'] = collapsed_text(
                "//h3[contains(text(),'Key features')]/..//text()")

            # A single capturing group makes findall return the URL strings
            # themselves.  The previous pattern had three groups, so findall
            # returned tuples and the following .replace() call raised
            # AttributeError on every page that had a picture.  [^"]+ keeps
            # each match inside one content="..." attribute.
            picture_urls = re.findall(
                r'<meta\sitemprop="contentUrl"\scontent="([^"]+\.(?:jpg|JPG))',
                response.text)
            item['picture'] = ''.join(
                url.strip() + '; ' for url in picture_urls)
            print("item['picture']: ", item['picture'])

            yield item
        except Exception as e:
            print("异常:", str(e))
            print("报错url:", response.url)
            # city comes from response.meta.get() and may be None; fall back
            # to a fixed name so the handler cannot itself raise TypeError
            # and hide the original error.
            error_path = './error/' + (item['city'] or 'unknown') + '.txt'
            with open(error_path, 'a+', encoding='utf-8') as f:
                f.write(
                    str(e) + "\n=====================" + item['url'] + "\n")
    def testdetail(self, response):
        """Scrape one listing detail page into an item and yield it.

        Country and city are fixed to ``'Australia'`` / ``'Perth'``.  The
        remaining fields (housing_type, available_time, house_name,
        room_type, car_spaces, address, detaile_address, housing_introduce,
        price, picture, supplier_name, supplier_logo, contact_name,
        contact_phone) are read from the page.  Any exception raised while
        extracting is appended to ``./error/rentSpider.txt`` and printed; in
        that case nothing is yielded.

        Args:
            response: detail-page response.

        Yields:
            The populated item when extraction succeeds.
        """
        item = get_item(ScrapymoduleRentItem)
        item['country'] = 'Australia'
        item['city'] = 'Perth'
        item['url'] = response.url
        print("===========================")
        print(response.url)

        def cleaned(xpath):
            # clear_space's return value is never used, so it is presumed to
            # clean the list in place (defined elsewhere -- TODO confirm).
            texts = response.xpath(xpath).extract()
            clear_space(texts)
            return texts

        try:
            # housing_type
            item['housing_type'] = ''.join(cleaned(
                "//div[@id='listing_info']/ul[@class='info']/li[@class='property_info']/span[@class='propertyType']//text()"
            ))
            print("item['housing_type']: ", item['housing_type'])

            # available_time
            available_time = cleaned(
                "//div[@id='listing_info_secondary']/div[@class='available_date']/span//text()"
            )
            month_numbers = {
                "Jan": "01",
                "Feb": "02",
                "Mar": "03",
                "Apr": "04",
                "May": "05",
                "Jun": "06",
                "Jul": "07",
                "Aug": "08",
                "Sep": "09",
                "Oct": "10",
                "Nov": "11",
                "Dec": "12",
            }
            if available_time[0] == "Available Now":
                item['available_time'] = 'now'
            else:
                # The last word is split on "-" as <day>-<Mon>-<yy> and
                # rebuilt as 20<yy>-<mm>-<day>.
                date_parts = available_time[0].split(" ")[-1].split("-")
                item['available_time'] = ("20" + date_parts[-1] + "-" +
                                          month_numbers[date_parts[1]] + "-" +
                                          date_parts[0])

            # house_name
            item['house_name'] = ''.join(
                cleaned("//div[@id='description']/p[@class='title']//text()"))

            # room_type / car_spaces: the feature list is a flat sequence in
            # which each label is followed by its value.
            features = cleaned(
                "//div[@id='features']/div/div[@class='featureList']/ul[1]/li//text()"
            )

            def value_after(label):
                return features[features.index(label) + 1]

            if item['housing_type'] == "Studio":
                item['room_type'] = 'Studio'
            else:
                room_type = ''
                if "Bedrooms:" in features:
                    room_type = value_after("Bedrooms:")
                if "Bathrooms:" in features:
                    room_type = room_type + "-" + value_after("Bathrooms:")
                item['room_type'] = room_type

            if "Garage Spaces:" in features:
                item['car_spaces'] = value_after("Garage Spaces:")
            elif "Open Car Spaces:" in features:
                item['car_spaces'] = value_after("Open Car Spaces:")

            # address
            item['address'] = ','.join(cleaned(
                "//div[@id='listing_address']/h1/span[@class='detail-address']//text()"
            ))

            # detaile_address
            item['detaile_address'] = ''.join(cleaned(
                "//div[@id='description']/h3[@class='address']//text()"))

            # Inspection times, prefixed with their heading when present.
            opentime = ' '.join(
                response.xpath("//a[@itemprop='events']//text()").extract())
            if opentime:
                opentime = ''.join(cleaned(
                    "//div[@id='inspectionTimes']/h3//text()")) + opentime

            # housing_introduce = inspection times + description body +
            # features text + floor-plan text
            description = cleaned(
                "//div[@id='description']/p[@class='body']//text()")
            feature_text = cleaned("//div[@id='features']//text()")
            floorplans = cleaned("//div[@id='floorplans']//text()")
            item['housing_introduce'] = (opentime + ' '.join(description) +
                                         ''.join(feature_text) +
                                         ''.join(floorplans))

            # price
            item['price'] = ''.join(cleaned(
                "//div[@id='listing_info']/ul[@class='info']/li[@class='price']/p[@class='priceText']//text()"
            ))

            # picture: image paths appear as {src:"<path>jpg" inside the
            # page's <script> elements; the group captures the path so no
            # character-set stripping is needed afterwards.
            script_text = ''.join(response.xpath("//script").extract())
            picture_paths = re.findall(r'\{src:"([\w/.]*jpg)"', script_text)
            item['picture'] = ';'.join(
                "https://i3.au.reastatic.net/800x600-resize,extend,r=33,g=40,b=46"
                + path for path in picture_paths)

            # supplier_name
            item['supplier_name'] = ''.join(cleaned(
                "//div[@id='agentInfoExpanded']/div/a/img[@class='logo']/@alt|//div[@id='agentInfoExpanded']/div[1]/text()"
            ))

            # supplier_logo
            item['supplier_logo'] = ''.join(cleaned(
                "//div[@id='agentInfoExpanded']/div/a/img[@class='logo']/@src"
            ))

            # contact_name
            contact_name = cleaned(
                "//div[@class='agentContactInfo'][1]/p//text()")
            if contact_name:
                item['contact_name'] = contact_name[0]
            print("item['contact_name']: ", item['contact_name'])

            # contact_phone
            contact_phone = cleaned(
                "//div[@class='agentContactInfo']/ul/li/text()")
            if contact_phone:
                item['contact_phone'] = contact_phone[0]
            print("item['contact_phone']: ", item['contact_phone'])

            yield item
        except Exception as e:
            # Append rather than truncate: with mode 'w' every failure erased
            # the record of all earlier ones.  The trailing newline keeps
            # consecutive entries apart.
            with open("./error/rentSpider.txt", 'a', encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
예제 #6
0
    def parse_data(self, response):
        """Scrape a property page and yield one item per room offer.

        Property-level fields (country fixed to ``'England'``, city from
        ``response.meta``, url, house_name, supporting_facilities,
        housing_introduce, detaile_address, picture) are filled once.  The
        page may then list long-term rooms, short-term rooms and student
        rooms; these differ in price, available_time, lease, housing_type
        and room_type.  One item is yielded for the long-term section (if
        present) and one per room wrapper in the short-term and student
        sections.

        Each yield hands out a copy of the working item: the working item
        keeps being mutated for the next room, and yielding the same object
        repeatedly would let later rooms overwrite earlier ones wherever a
        consumer still holds a reference.

        Any exception raised while extracting is printed and ends the
        generator.

        Args:
            response: property-page response; ``meta`` may carry ``city``.

        Yields:
            A copy of the item for every room offer found.
        """
        item = get_item(ScrapymoduleRentItem)
        print("==========================================", response.url)
        item['country'] = 'England'
        item['city'] = response.meta.get('city')
        print("item['city']: ", item['city'])
        item['url'] = response.url

        # Path from a room wrapper down to the row holding its details.
        row_xpath = ("//div[@class='row']"
                     "//div[@class='medium-gray-background clearfix']")

        def cleaned(xpath):
            # clear_space's return value is never used, so it is presumed to
            # clean the list in place (defined elsewhere -- TODO confirm).
            texts = response.xpath(xpath).extract()
            clear_space(texts)
            return texts

        def to_iso_date(texts):
            # "dd/mm/yyyy" -> "yyyy-mm-dd"; raises IndexError when the text
            # does not contain a "/".
            parts = ''.join(texts).split('/')
            return parts[-1] + "-" + parts[1] + "-" + parts[0]

        def room_xpath(container_id, position):
            # Details row of the position-th room wrapper of a section.
            return ("//div[@id='" + container_id +
                    "']//div[@class='room-wrapper'][" + str(position) + "]" +
                    row_xpath)

        def price_text(base):
            return ' '.join(
                cleaned(base + "//div[@class='col-sm-5']//text()")).replace(
                    'From:', "").strip()

        try:
            # Page content the fields are read from:
            #   listing name    e.g. "CRESCENT PLACE"
            #   rent            e.g. "FROM: £ 134.00 WEEKLY"
            #   facilities      FEATURES
            #   description     Description
            #   location        e.g. "63 St Mary's Road"
            #   room type name  e.g. "8 BEDROOM SHARED HOUSE ROOM"
            #   available from  e.g. "AVAILABLE FROM 01/08/2018"
            #   tenancy length  e.g. "TENANCY LENGTH 45 and 51 week"
            item['house_name'] = ''.join(
                cleaned("//div[@class='col-sm-8']/h1//text()")).strip()
            print("item['house_name']: ", item['house_name'])

            item['supporting_facilities'] = clear_lianxu_space(
                response.xpath(
                    "//div[@class='property-features']//text()").extract())

            item['housing_introduce'] = '\n'.join(
                cleaned("//div[@id='property-description']//text()")).strip()

            item['detaile_address'] = ''.join(
                cleaned("//i[@class='fa fa-map-marker']/../text()")).strip()
            print("item['detaile_address']: ", item['detaile_address'])

            picture_list = response.xpath(
                "//div/div[1]/div[1]/img[1]/@src").extract()
            item['picture'] += ''.join(p + "; " for p in picture_list)
            print("item['picture']: ", item['picture'])

            # Long-term section: any text at all counts as "present".
            long_term = response.xpath(
                '//div[@id="property-rooms"]//text()').extract()
            # Short-term and student sections: one wrapper per room.
            short_term = response.xpath(
                "//div[@id='short-term-rooms']/div[@class='room-wrapper']"
            ).extract()
            student_room = response.xpath(
                '//div[@id="property-student-rooms"]/div[@class="room-wrapper"]'
            ).extract()

            if long_term:
                print("==============长期出租")
                base = ("//div[@id='property-rooms']"
                        "//div[@class='room-wrapper']" + row_xpath)
                item['price'] = price_text(base)
                print("item['price']: ", item['price'])

                item['available_time'] = to_iso_date(cleaned(
                    base +
                    "//div[@class='col-sm-4']//ul[@class='room-features-list list-unstyled']//li[@class='room-feature']/h5[contains(text(),'Available From')]/../following-sibling::li[1]//text()"
                ))
                print("item['available_time']: ", item['available_time'])

                item['lease'] = 'Long Term'
                print("item['lease']: ", item['lease'])

                item['housing_type'] = ''.join(
                    cleaned(base +
                            "//div[@class='col-sm-3']/h5//text()")).strip()
                print("item['housing_type']: ", item['housing_type'])
                # Assumes get_item returns a Scrapy Item or dict, both of
                # which provide copy() -- TODO confirm.
                yield item.copy()

            if short_term:
                print("==============短期出租")
                for div_n in range(1, len(short_term) + 1):
                    base = room_xpath('short-term-rooms', div_n)
                    item['price'] = price_text(base)
                    print("item['price']: ", item['price'])

                    item['available_time'] = to_iso_date(cleaned(
                        base + "//div[@class='col-sm-4']/ul/li[2]//text()"))
                    print("item['available_time']: ", item['available_time'])

                    item['lease'] = 'Short Term'

                    item['housing_type'] = ''.join(
                        cleaned(base +
                                "//div[@class='col-sm-3']/h6//text()")).strip()
                    print("item['housing_type']: ", item['housing_type'])

                    item['room_type'] = ''.join(
                        cleaned(base +
                                "//div[@class='col-sm-3']/h5//text()")).strip()
                    print("item['room_type']: ", item['room_type'])
                    yield item.copy()

            if student_room:
                print("==============学生房间")
                for div_n in range(1, len(student_room) + 1):
                    print("***************第" + str(div_n) + "房***************")
                    base = room_xpath('property-student-rooms', div_n)
                    item['price'] = price_text(base)
                    print("item['price']: ", item['price'])

                    item['available_time'] = to_iso_date(cleaned(
                        base + "//div[@class='col-sm-4']/ul/li[2]//text()"))
                    print("item['available_time']: ", item['available_time'])

                    item['lease'] = ''.join(
                        cleaned(base +
                                "//div[@class='col-sm-4']/ul/li[6]/text()")
                    ).strip()
                    print("item['lease']: ", item['lease'])

                    item['housing_type'] = ''.join(
                        cleaned(base +
                                "//div[@class='col-sm-3']/h6//text()")).strip()
                    print("item['housing_type']: ", item['housing_type'])

                    item['room_type'] = ''.join(
                        cleaned(base +
                                "//div[@class='col-sm-3']/h5//text()")).strip()
                    print("item['room_type']: ", item['room_type'])
                    yield item.copy()

        except Exception as e:
            print("异常:", str(e))
            print("报错url:", response.url)
예제 #7
0
    def parse_data(self, response):
        """Scrape one rental detail page into an item and yield it.

        Fills country (fixed to ``'England'``), url, city (from
        ``response.meta``), house_name, detaile_address, price,
        housing_introduce, picture and supporting_facilities.  Any exception
        raised while extracting is printed and appended to
        ``./error/<city>.txt``; in that case nothing is yielded.

        Args:
            response: detail-page response; ``meta`` may carry ``city``.

        Yields:
            The populated item when extraction succeeds.
        """
        item = get_item(ScrapymoduleRentItem)
        print("==========================================", response.url)
        item['country'] = 'England'
        item['url'] = response.url
        item['city'] = response.meta.get('city')
        print("item['city']: ", item['city'])

        def joined_text(xpath):
            # clear_space's return value is never used, so it is presumed to
            # clean the list in place (defined elsewhere -- TODO confirm).
            texts = response.xpath(xpath).extract()
            clear_space(texts)
            return ''.join(texts)

        def collapsed_text(xpath):
            return clear_lianxu_space(response.xpath(xpath).extract())

        try:
            # Page content the fields are read from:
            #   listing name   e.g. "2 bed flat to rent"
            #   address        e.g. "Princes Court, Brompton Road SW3"
            #   weekly price   e.g. "£3,250"
            #   monthly price  e.g. "£3,250"
            #   description    "Property description" section
            #   pictures       photos
            #   facilities     "Property features" section
            #   floor plan     Floorplan
            #   availability   e.g. "Available immediately"
            #   furnishing     e.g. "Furnished"
            item['house_name'] = joined_text(
                "//h2[@class='listing-details-h1']//text()").strip()
            print("item['house_name']: ", item['house_name'])

            item['detaile_address'] = joined_text(
                "//div[@class='listing-details-address']/h2//text()").strip()
            print("item['detaile_address']: ", item['detaile_address'])

            item['price'] = joined_text(
                "//div[@class='listing-details-price text-price']//strong/span//text()"
            ).replace('(', "").replace(')', '').strip()
            print("item['price']: ", item['price'])

            item['housing_introduce'] = collapsed_text(
                "//h3[contains(text(),'Property description')]/..//text()")

            # The group captures the URL directly, so no prefix has to be
            # stripped afterwards (the old literal-space prefix did not match
            # when the tag used other whitespace).  [^"]+ keeps each match
            # inside one content="..." attribute, where the greedy ".+"
            # could run across several tags on the same line.
            picture_urls = re.findall(
                r'<meta\sproperty="og:image"\scontent="([^"]+\.jpg)',
                response.text)
            item['picture'] = ''.join(
                url.strip() + '; ' for url in picture_urls)
            print("item['picture']: ", item['picture'])

            item['supporting_facilities'] = collapsed_text(
                "//h3[contains(text(),'Property features')]/..//text()")

            yield item
        except Exception as e:
            print("异常:", str(e))
            print("报错url:", response.url)
            # city comes from response.meta.get() and may be None; fall back
            # to a fixed name so the handler cannot itself raise TypeError
            # and hide the original error.
            error_path = './error/' + (item['city'] or 'unknown') + '.txt'
            with open(error_path, 'a+', encoding='utf-8') as f:
                f.write(
                    str(e) + "\n=====================" + item['url'] + "\n")
예제 #8
0
    def parse_data(self, response):
        """Scrape one rental-listing detail page into a ``ScrapymoduleRentItem``.

        Every scraped field follows the same recipe: run an XPath query on
        the response, pass the extracted strings through ``clear_space`` and
        join them into one string stored on the item.  ``country`` is fixed
        to ``'Australia'`` and ``url`` is taken from the response.

        Args:
            response: The downloaded detail page; ``response.url`` and
                ``response.xpath`` are used.

        Yields:
            The populated item when all extractions succeed.  On any
            exception nothing is yielded; the error is printed and, on a
            best-effort basis, appended to a text file under ``./error/``.
        """
        item = get_item(ScrapymoduleRentItem)
        item['country'] = 'Australia'
        item['url'] = response.url
        print("===========================")
        print(response.url)

        def fill(field, query, sep='', echo=False):
            # Extract -> clean -> join -> store for a single item field.
            # NOTE(review): clear_space is defined elsewhere; its return
            # value is ignored, so it presumably cleans the list in place -
            # confirm against its definition.
            values = response.xpath(query).extract()
            clear_space(values)
            item[field] = sep.join(values)
            if echo:
                print("item['%s']: " % field, item[field])

        try:
            # Fields with no extraction on this page: room_type, lease,
            # isRent, postal_code, housing_introduce, supplier_type,
            # contact_email.
            fill(
                'housing_type',
                "//div[@id='listing_info']/ul[@class='info']/li[@class='property_info']/span[@class='propertyType']//text()"
            )
            fill(
                'available_time',
                "//div[@id='listing_info_secondary']/div[@class='available_date']/span//text()"
            )
            fill('house_name',
                 "//div[@id='description']/p[@class='title']//text()")
            # Address parts are kept distinguishable with a comma separator.
            fill(
                'address',
                "//div[@id='listing_address']/h1/span[@class='detail-address']//text()",
                sep=',')
            fill('detaile_address',
                 "//div[@id='description']/h3[@class='address']//text()")
            fill('supporting_facilities',
                 "//div[@id='description']/p[@class='body']//text()")
            fill(
                'price',
                "//div[@id='listing_info']/ul[@class='info']/li[@class='price']/p[@class='priceText']//text()"
            )
            fill(
                'picture',
                "//div[@id='mainPhoto']/div[@class='hero-image__image-wrapper']/a[@class='hero-image__link']/img[@class='hero-image__image']/@src"
            )
            fill(
                'supplier_name',
                "//div[@class='branding-banner-content']/a/img[@class='logo']/@alt",
                echo=True)
            fill(
                'supplier_logo',
                "//div[@class='branding-banner-content']/a/img[@class='logo']/@src",
                echo=True)
            fill('contact_name',
                 "//div[@class='agentContactInfo']/p//text()",
                 sep=',',
                 echo=True)
            fill('contact_phone',
                 "//div[@class='agentContactInfo']/ul/li/text()",
                 sep=',',
                 echo=True)

            yield item
        except Exception as e:
            # Report on stdout first so the diagnostic survives even when
            # writing the error file fails.
            print("异常:", str(e))
            print("报错url:", response.url)

            # The file name historically came from these two fields, which
            # this method never sets.  Keep that name when they are usable
            # and fall back to a fixed label so the handler cannot raise a
            # second exception that hides the original one.
            try:
                label = item['university'] + item['degree_level']
            except (KeyError, TypeError):
                label = 'parse_data'

            # Append rather than truncate so earlier failures are kept.
            try:
                with open("./error/" + label + ".txt",
                          'a',
                          encoding="utf-8") as f:
                    f.write(
                        str(e) + "\n" + response.url +
                        "\n========================\n")
            except OSError as log_error:
                print("failed to write error log:", str(log_error))