def parse_csv(self, sold_houses_content):
    '''
    Build House models from the text of a sold-houses CSV export.

    Args:
        sold_houses_content: the CSV document as a single string; its first
            row is the header that names the columns.

    Returns:
        A list with one House (created through House.from_dict) per data
        row, in file order. A field whose column is absent from the row is
        set to 'N/A'.
    '''
    # House field name -> CSV column header it is read from. Keeping the
    # header in exactly one place guarantees that the key we test for and
    # the key we read are the same (the old "YEAR BUIL" / "YEAR BUILT"
    # mismatch made year_built always come out as 'N/A').
    field_to_column = {
        # Address
        "street_address": "ADDRESS",
        "city": "CITY",
        "state": "STATE",
        # rooms
        "beds": "BEDS",
        "baths": "BATHS",
        "sq_ft": "SQUARE FEET",
        # info
        "sold_price": "PRICE",
        "year_built": "YEAR BUILT",
        "property_type": "PROPERTY TYPE",
        # time on market
        "days_on_market": "DAYS ON MARKET",
    }

    all_sold_houses = []
    reader = csv.DictReader(StringIO(sold_houses_content))
    for row in reader:
        house_info = {
            field: row.get(column, 'N/A')
            for field, column in field_to_column.items()
        }
        all_sold_houses.append(House.from_dict(house_info))
    return all_sold_houses
def parse_single_page(self, page_source):
    '''
    Parse one page of results and collect the houses listed on it.

    Args:
        page_source: HTML text of the page.

    Returns:
        A list with one House (created through House.from_dict) per
        "HomeCardContainer" div found on the page, in document order.
        Parsing is best-effort: when the home stats or the price of a card
        cannot be read, the affected fields are set to 'N/A' and the card is
        still returned. 'year_built' and 'days_on_market' are always 'N/A'.
    '''
    soup = BeautifulSoup(page_source, 'html.parser')
    all_sold_houses = []

    # Every home card contains the home info: price, address, type.
    sold_houses_info = soup.find_all('div', attrs={"class": "HomeCardContainer"})
    for sold_house in sold_houses_info:
        house_json = {}

        # Home stats (beds, baths, square feet). A card uses either the V1
        # or the V2 markup, so fall back to V2 when V1 is not present.
        try:
            home_stat = sold_house.find(
                'div', {"class": "HomeStats font-size-smaller"})
            if home_stat is None:
                home_stat = sold_house.find(
                    'div', {"class": "HomeStatsV2 font-size-small"})
                self.parse_home_stat2_v2(home_stat, house_json)
            else:
                self.parse_home_stats_v1(home_stat, house_json)
        except Exception:
            # Best effort: keep the card, mark the stats as unknown.
            # `Exception` (not a bare except) so Ctrl-C / SystemExit still
            # propagate.
            print("parse home stat failed.")
            house_json['beds'] = 'N/A'
            house_json['baths'] = 'N/A'
            house_json['sq_ft'] = 'N/A'

        # The first <script> tag of the card is handed to the address
        # helper (presumably it fills in the address fields of house_json;
        # verify against parse_house_address).
        sold_house_address = sold_house.find('script')
        self.parse_house_address(sold_house_address, house_json)

        # Price: try the V1 markup first, then V2.
        try:
            price = sold_house.find('span', {
                "class": "homecardPrice font-size-small font-weight-bold"
            })
            if price is None:
                price = sold_house.find('span', {"class": "homecardV2Price"})
            # Drop the first character (presumably the currency symbol) and
            # the thousands separators. A missing price tag raises
            # AttributeError here and is handled below.
            house_json['sold_price'] = price.text[1:].replace(',', '')
        except Exception:
            print('parse house price failed.')
            house_json['sold_price'] = 'N/A'

        # NOTE: the sale-date badge of the card is not parsed here.

        # Not read from the card by this method.
        house_json['year_built'] = 'N/A'
        house_json['days_on_market'] = 'N/A'

        all_sold_houses.append(House.from_dict(house_json))
    return all_sold_houses