示例#1
0
    def parse_csv(self, sold_houses_content):
        '''
        Build House models from the text of a sold-houses CSV export.

        Args:
            sold_houses_content: CSV text; the first line is used as the
                header row by ``csv.DictReader``.

        Returns:
            A list holding one ``House.from_dict`` result per data row, in
            file order. A field whose column is absent from the row is
            reported as 'N/A'.
        '''
        # (house field, CSV column header) pairs, in the order the fields are
        # handed to House.from_dict. Keeping the column name in a single place
        # guarantees the presence check and the lookup use the same key (the
        # previous code tested "YEAR BUIL" but read "YEAR BUILT", so
        # year_built was never populated).
        field_columns = (
            # address
            ("street_address", "ADDRESS"),
            ("city", "CITY"),
            ("state", "STATE"),
            # rooms
            ("beds", "BEDS"),
            ("baths", "BATHS"),
            ("sq_ft", "SQUARE FEET"),
            # sale info
            ("sold_price", "PRICE"),
            ("year_built", "YEAR BUILT"),
            ("property_type", "PROPERTY TYPE"),
            # days on market
            ("days_on_market", "DAYS ON MARKET"),
        )

        reader = csv.DictReader(StringIO(sold_houses_content))

        all_sold_houses = []
        for row in reader:
            house_info = {
                field: row.get(column, 'N/A')
                for field, column in field_columns
            }
            all_sold_houses.append(House.from_dict(house_info))
        return all_sold_houses
示例#2
0
    def parse_single_page(self, page_source):
        '''
        Parse one results page and return the houses listed on it.

        Args:
            page_source: HTML text of the page.

        Returns:
            A list with one ``House.from_dict`` result per
            ``div.HomeCardContainer`` element found, in document order.
            Fields that cannot be parsed are set to 'N/A'; 'year_built' and
            'days_on_market' are always 'N/A' here.
        '''
        soup = BeautifulSoup(page_source, 'html.parser')

        all_sold_houses = []

        # Every home card contains the info for one house
        # (price, address, type).
        sold_houses_info = soup.find_all('div',
                                         attrs={"class": "HomeCardContainer"})

        for sold_house in sold_houses_info:
            house_json = {}

            # Home stats (beds, baths, square feet). The card may use the V1
            # or the V2 markup; try V1 first and fall back to V2.
            try:
                home_stat = sold_house.find(
                    'div', {"class": "HomeStats font-size-smaller"})
                if home_stat is None:
                    home_stat = sold_house.find(
                        'div', {"class": "HomeStatsV2 font-size-small"})
                    self.parse_home_stat2_v2(home_stat, house_json)
                else:
                    self.parse_home_stats_v1(home_stat, house_json)
            except Exception:
                # Best effort: a card with missing or unexpected stats markup
                # must not abort the whole page. Only Exception is caught so
                # KeyboardInterrupt / SystemExit still propagate.
                print("parse home stat failed.")
                house_json['beds'] = 'N/A'
                house_json['baths'] = 'N/A'
                house_json['sq_ft'] = 'N/A'

            # Address: handed to parse_house_address together with the dict
            # being built (presumably it fills in the address fields -
            # confirm against that helper).
            sold_house_address = sold_house.find('script')
            self.parse_house_address(sold_house_address, house_json)

            # Price: V1 markup first, then V2.
            try:
                price = sold_house.find('span', {
                    "class":
                    "homecardPrice font-size-small font-weight-bold"
                })
                if price is None:
                    price = sold_house.find('span',
                                            {"class": "homecardV2Price"})
                # Drop the first character (presumably the currency symbol)
                # and the thousands separators.
                house_json['sold_price'] = price.text[1:].replace(',', '')
            except Exception:
                # Reached when neither price span exists (price is None).
                print('parse house price failed.')
                house_json['sold_price'] = 'N/A'

            # NOTE(review): a "sold time" lookup on the
            # 'HomeSash font-weight-bold roundedCorners' span was sketched
            # here but disabled; the sold time is not extracted.

            # Not filled in by this method.
            house_json['year_built'] = 'N/A'
            house_json['days_on_market'] = 'N/A'

            house = House.from_dict(house_json)
            all_sold_houses.append(house)
        return all_sold_houses