Example #1
def list2frame(data_list):
    data_frame = pd.DataFrame(
        data_list,
        columns=[
            "Address",
            "Sell Price",
            "Beds",
            "Baths",
            "Home Size",
            "Home Type",
            "Year Built",
            "Heating",
            "Cooling",
            "Parking",
            "Lot Size",
        ],
    )

    # Configure pandas display options so the full data_frame can be printed/logged
    pd.set_option("display.max_rows", None)
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", None)

    # Log dirty data_frame
    logtofile(__name__, data_frame, "INFO")

    return data_frame
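A minimal usage sketch for list2frame (it assumes pandas is imported as pd and the project's logtofile helper is available in the module where the function lives); the two rows below are fabricated for illustration:

# Two hypothetical scraped rows, one value per column that list2frame expects
sample_rows = [
    ["123 Main St", "350000", "3", "2", "1500 sqft", "SingleFamily",
     "1995", "Forced air", "Central", "2 spaces", "0.25 acres"],
    ["456 Oak Ave", "No Data", "--", "n/a", "--", "Condo",
     "2005", "Off", "No Data", "n/a", "--"],
]

dirty_frame = list2frame(sample_rows)  # builds and logs the "dirty" DataFrame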
Example #2
def laundry(dirty_laundry):
    # loop over each row in the data frame for cleaning
    for i, row in dirty_laundry.iterrows():
        # loop over each column in each row for cleaning
        for j, col in row.items():
            # first convert all "n/a", "No Data", "--", "Off", and empty entries to np.nan, then continue to the next loop iteration
            if ("n/a" in col or "No Data" in col or "--" in col or "Off" in col
                    or col == ""):
                dirty_laundry.loc[i, j] = np.nan
                continue
            # remove the "sqft" string from each item that contains "sqft", then continue to next loop iteration
            if "sqft" in col.lower():
                # first split the item into a list that contains the value and the string
                col = col.lower().split()
                # then remove the item in the new list that matches "sqft"
                col.remove("sqft")
                # store the remaining item from the split list (i.e. the value itself) into the original df location
                dirty_laundry.loc[i, j] = float(col[0])
                continue
            # remove the "acres" string from each item that contains "acres" and convert to sqft, then continue to next
            # loop iteration
            if "acres" in col.lower():
                # first split the item into a list that contains the value and the string
                col = col.lower().split()
                # then remove the item in the new list that matches "acres"
                col.remove("acres")
                # store the remaining item from the split list (i.e. the value itself) into the original df location
                dirty_laundry.loc[i, j] = float(col[0]) * 43560.0
                continue

    # convert Sell Price, Beds, Baths, Home Size, Year Built, and Lot Size columns into float type
    dirty_laundry["Sell Price"] = dirty_laundry["Sell Price"].astype(float)
    dirty_laundry["Beds"] = dirty_laundry["Beds"].astype(float)
    dirty_laundry["Baths"] = dirty_laundry["Baths"].astype(float)
    dirty_laundry["Home Size"] = dirty_laundry["Home Size"].astype(float)
    dirty_laundry["Year Built"] = dirty_laundry["Year Built"].astype(float)
    dirty_laundry["Lot Size"] = dirty_laundry["Lot Size"].astype(float)

    # Log cleaned data frame
    logtofile(__name__, dirty_laundry, "INFO")
    # Log cleaned data frame data types
    logtofile(__name__, dirty_laundry.dtypes, "INFO")

    print(dirty_laundry)
    print()
    # return dirty_laundry, which is actually now cleaned
    return dirty_laundry
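Continuing that sketch, the dirty frame from list2frame can be run through laundry; the expected conversions (assuming the numeric strings carry no thousands separators, which float() would reject) are:

clean_frame = laundry(dirty_frame)  # cleans in place and returns the same DataFrame
# "1500 sqft"  -> 1500.0   ("sqft" stripped, value cast to float)
# "0.25 acres" -> 10890.0  (0.25 acres * 43,560 sq ft per acre)
# "n/a", "No Data", "--", "Off", and "" -> np.nan
print(clean_frame.dtypes)  # the six numeric columns are now float64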
Example #3
def scrapepagecount(html, page_type):
    if page_type.lower() == "zillow":
        # Zillow keeps the page numbers in the "search-pagination" class
        page_list = html.find("div", class_="search-pagination")
        if page_list is None:
            mess = "Only 1 page of Zillow search results exist in this search box."
            logtofile(__name__, mess, "INFO")
            page_count = 1
        else:
            page_links = page_list.find_all("a")
            mess = (
                str(page_links[-2].text) +
                " pages of Zillow search results exist in this search box.")
            logtofile(__name__, mess, "INFO")
            page_count = int(page_links[-2].text)
    else:
        sys.exit(
            "ERROR!!! scrapepagecount.py currently only works for Zillow page types. \n"
            "Please ensure 'Zillow' is being passed as the page_type argument for this function"
        )
    return page_count
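A self-contained sketch of scrapepagecount run against a hand-written fragment (the markup only mimics the "search-pagination" structure the function expects and is not real Zillow output; logtofile must still be importable):

from bs4 import BeautifulSoup

fake_html = BeautifulSoup(
    '<div class="search-pagination">'
    '<a href="#">1</a><a href="#">2</a><a href="#">3</a><a href="#">Next</a>'
    '</div>',
    "html.parser",
)
print(scrapepagecount(fake_html, "Zillow"))  # -> 3 (text of the second-to-last link)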
Example #4
def scraperesultcount(html, page_type, zillow_max_result):
    if page_type.lower() == "zillow":
        # Zillow keeps the result count in the "result-count" class
        result_count = int(
            (html.find("span", class_="result-count")).text.split()[0].replace(
                ",", ""))
        mess = (str(result_count) +
                " Zillow search results were found within the search box.")
        logtofile(__name__, mess, "INFO")
        # Zillow limits the number of pages of a search to 20, and the number of results/page to 40.
        # Therefore, the maximum number of results one can obtain is 800.
        if result_count > zillow_max_result:
            mess = (
                "Warning! The maximum number of Zillow search results was met or exceeded. \n"
                "Therefore, only the first " + str(zillow_max_result) +
                " search result links will be scraped.")
            logtofile(__name__, mess, "INFO")
            result_count = zillow_max_result
    else:
        sys.exit(
            "ERROR!!! scraperesultcount.py currently only works for Zillow page types. \n"
            "Please ensure 'Zillow' is being passed as the page_type argument for this function"
        )
    return result_count
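A similar sketch for scraperesultcount, again with fabricated markup that only mimics the "result-count" span:

from bs4 import BeautifulSoup

fake_html = BeautifulSoup('<span class="result-count">1,234 results</span>', "html.parser")
# 1,234 exceeds the 800-result cap, so the returned count is clamped to zillow_max_result
print(scraperesultcount(fake_html, "Zillow", 800))  # -> 800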
Example #5
def buildsearchbox(address, search_box_half_width):

    # Nominatim is the GeoCoder class used in this code
    geolocator = Nominatim(user_agent="hbf")

    # Using the GeoCoder, we obtain the coordinates of the user supplied address. If the address cannot be
    # found, geocode() returns None, so accessing .latitude/.longitude raises an AttributeError; we catch it
    # and prompt the user for a corrected address before trying again.
    while True:
        try:
            location = geolocator.geocode(address)
            origin = geopy.Point(location.latitude, location.longitude)
        except AttributeError:
            print(
                "ERROR!!! The user supplied address cannot be GeoCoded with Nominatim.\n"
                "In some instances, a mailing address is different from its physical address.\n"
                "If this is the case, please try supplying only the street number, name, state, and zip code."
            )
            address = input(
                "Try typing in the specific physical address using method above: "
            )
            print("Trying to GeoCode the new address: ", address)
        else:
            logtofile(__name__, "The address has been GeoCoded!", "INFO")
            break
    mess = "The geocoded origin of the address is: " + str(repr(origin))
    logtofile(__name__, mess, "INFO")
    # The search box coordinates are obtained via a geodesic measurement from the origin (e.g. user supplied address)
    # to the direct North, direct East, direct South, and direct West using the user supplied search_box_half_width.
    # The North Boundary thus becomes the latitude of the geodesic measurement from the origin, and so on.
    half_width_km = search_box_half_width * 1.60934  # convert the half-width from miles to kilometers for geopy
    north_boundary = geodesic(kilometers=half_width_km).destination(origin, 0.0).latitude
    east_boundary = geodesic(kilometers=half_width_km).destination(origin, 90.0).longitude
    south_boundary = geodesic(kilometers=half_width_km).destination(origin, 180.0).latitude
    west_boundary = geodesic(kilometers=half_width_km).destination(origin, 270.0).longitude

    mess = ("Searching within a square area of half-width " +
            str(search_box_half_width) + " miles.")
    logtofile(__name__, mess, "INFO")

    return north_boundary, south_boundary, east_boundary, west_boundary
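A usage sketch for buildsearchbox (Nominatim is an online geocoder, so this needs network access; the address and half-width below are only examples):

north, south, east, west = buildsearchbox(
    "1600 Pennsylvania Ave NW, Washington, DC", 2.5
)
# north/south are the latitudes 2.5 miles due north/south of the geocoded origin;
# east/west are the longitudes 2.5 miles due east/west of it.
print(north, south, east, west)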
Example #6
def getzillowurls(
    page_type,
    north_boundary,
    south_boundary,
    east_boundary,
    west_boundary,
    zillow_max_result,
    zillow_req_headers,
):
    # The first/initial Zillow search page url is constructed with buildzillowsearchpageurl.py
    zillow_search_page_url = buildzillowsearchpageurl(page_type, None,
                                                      north_boundary,
                                                      south_boundary,
                                                      east_boundary,
                                                      west_boundary)

    # Open up the Zillow search page to view the search box, results, and see how many pages there are.
    # import webbrowser
    # webbrowser.open_new(zillow_search_page_url)

    # Using the zillow_search_page_url, its HTML is requested and parsed using gethtml function
    zillow_search_page_html = gethtml(zillow_search_page_url,
                                      zillow_req_headers)

    # Find the total number of pages of results on this first/initial Zillow search page.
    zillow_page_count = scrapepagecount(zillow_search_page_html, "Zillow")

    # Find the total number of Zillow results.
    zillow_result_count = scraperesultcount(zillow_search_page_html, "Zillow",
                                            zillow_max_result)

    # Loop over the number of pages found from the original Zillow search to obtain each home's unique url.
    zillow_home_urls = []
    for i in range(0, zillow_page_count):
        if i == 0:
            # The page one search URL was already requested and parsed above; therefore, we scrape it to
            # get all of the unique home URLs on this page.
            zillow_home_urls.extend(
                scrapehomeurls(zillow_search_page_html, zillow_result_count,
                               "Zillow"))

        else:
            # For each page after page one, the Zillow search page URL must be constructed, requested, parsed, and
            # scraped.
            zillow_search_page_url = buildzillowsearchpageurl(
                page_type,
                i,
                north_boundary,
                south_boundary,
                east_boundary,
                west_boundary,
            )

            # Using the zillow_search_page_url, its HTML is requested and parsed using gethtml function
            zillow_search_page_html = gethtml(zillow_search_page_url,
                                              zillow_req_headers)

            # Scrape the current Zillow search page to obtain all unique home URLs.
            zillow_home_urls.extend(
                scrapehomeurls(zillow_search_page_html, zillow_result_count,
                               "Zillow"))

    # All unique home URLs are printed to log, in case the user needs to quickly visit any one of them.
    mess = "\n".join(zillow_home_urls)
    logtofile(__name__, mess, "INFO")

    # An equality check is completed to make sure the number of zillow_home_urls is the same as the
    # zillow_result_count. The number of zillow_home_urls is a count of each unique home url we scraped. The
    # zillow_result_count is the total number of search results from the original Zillow search web page. If they are
    # not equal, more or fewer home URLs may have been scraped than the result count reported on the original
    # Zillow search page, indicating a potential error.
    if len(zillow_home_urls) == zillow_result_count:
        mess = (
            "The number of unique home urls scraped from all pages equals \n"
            "the Zillow Result Count that was scraped from the initial page.")
        logtofile(__name__, mess, "INFO")
    else:
        mess = (
            "WARNING!!! The number of unique home urls scraped from all pages DOES NOT EQUAL \n"
            "the Zillow Result Count that was scraped from the initial page. There may be an issue with Zillow \n"
            "recommending/providing additional urls on subsequent pages that weren't originally included in \n"
            "the Zillow Result Count, or additional pages of results were not scraped because they are hidden \n"
            "behind Java Script.")
        logtofile(__name__, mess, "WARNING")

    return zillow_home_urls
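Finally, a hedged end-to-end sketch tying the pieces together (the request headers are placeholders, and real scraping is subject to Zillow's anti-bot measures and terms of service):

# Hypothetical request headers; a real browser User-Agent string would normally go here.
zillow_req_headers = {"User-Agent": "Mozilla/5.0 (example)"}

north, south, east, west = buildsearchbox(
    "1600 Pennsylvania Ave NW, Washington, DC", 2.5
)
home_urls = getzillowurls(
    "Zillow", north, south, east, west,
    zillow_max_result=800,
    zillow_req_headers=zillow_req_headers,
)
print(len(home_urls), "home URLs scraped")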