Пример #1
0
def get_data():
    """Build the filtered weather DataFrame for the Alameda county grid.

    The grid points from get_alameda_county_points() are passed through
    get_weather_data, clean_data and filter_ranson_criteria, in that order.

    return:
        DataFrame: whatever filter_ranson_criteria returns for the cleaned
            weather data (Ranson's criteria applied)
    """
    grid_points = get_alameda_county_points()
    raw_weather = get_weather_data(grid_points)
    cleaned_weather = clean_data(raw_weather)
    return filter_ranson_criteria(cleaned_weather)
def main(elements=None, points=None):
    """Bin the bias-corrected daily data for each requested element.

    Binning idea received by:
    https://stackoverflow.com/questions/34317149/pandas-groupby-with-bin-counts

    args:
        elements: element codes to compute bins for; each must be a key of
            the bin map defined below ('PRCP', 'TMAX' or 'TMIN')
            (default None, which means ['TMAX', 'TMIN'])
        points: grid points passed on to get_weather_data
            (default None, which means get_alameda_county_points())
    return:
        list: one dataframe of bin counts per element, in the order of
            `elements`; rows are (year, month), columns are the bins
    """
    # Resolve defaults at call time: a list default would be shared between
    # calls, and a function-call default would run once at import time.
    if elements is None:
        elements = ['TMAX', 'TMIN']
    if points is None:
        points = get_alameda_county_points()

    df = filter_ranson_criteria(clean_data(get_weather_data(points)))

    # bin edges per element, used with right-closed intervals
    bin_map = {
        'PRCP': (0, 4, 14, 29, 2000),
        'TMAX': range(0, 100, 10),
        'TMIN': range(0, 100, 10)
    }

    result = []
    for element in elements:
        filtered = df[df['ELEMENT'] == element]
        corrected = bias_correction(filtered)

        # average data from the same day
        daily = corrected.groupby(['year', 'month', 'day']).aggregate({
            'adjusted_data': 'mean'
        }).reset_index()

        # count, per (year, month), how many daily means fall in each bin;
        # bins with no observations become 0.0 instead of NaN
        binned = daily.groupby([
            'year', 'month',
            pd.cut(daily['adjusted_data'], bin_map[element], right=True)
        ]).size().unstack().fillna(0.0)
        result.append(binned)
    return result
def divide_alameda_points():
    """ Return a list of West Alameda points and a list of East Alameda points

    Points whose longitude is clearly west or east of the two borders are
    classified directly. Points in between are reverse-geocoded and
    classified by zipcode, or by hamlet name when no zipcode is available.
    In-between points that match neither rule are left out of both lists.

    return:
        list: list of West Alameda (lat, lon) points
        list: list of East Alameda (lat, lon) points
    """
    grid_points = get_alameda_county_points()
    border_west = -122.06  # westernmost point of "East Alameda", points to the left are in West Alameda
    border_east = -121.82  # easternmost point in "West Alameda", points to the right are in East Alameda

    # zipcodes identified to be in West or East Alameda based on Google Maps
    zip_loc = {
        'west': [94539, 94552, 94544, 94536, 94537, 94538],
        'east': [94568, 94566, 94586]
    }

    west_alameda = []
    east_alameda = []
    for point in grid_points:
        longitude = point[1]
        if longitude < border_west:
            west_alameda.append(point)
            continue
        if longitude > border_east:
            east_alameda.append(point)
            continue

        # classify ambiguous points that lie between border_west and border_east
        # with a single reverse-geocoding lookup; a missing result or a
        # missing 'address' entry is treated as an empty address
        location = geolocator.reverse(point)
        address = location.raw.get('address', {}) if location is not None else {}

        # get zipcode of unknown point (first run of digits in the postcode)
        zip_digits = re.findall(r'\d+', str(address.get('postcode', '')))
        if zip_digits:
            unknown_zip = int(zip_digits[0])
            if unknown_zip in zip_loc['west']:
                west_alameda.append(point)
            elif unknown_zip in zip_loc['east']:
                east_alameda.append(point)
        elif address.get('hamlet') == 'Kilkare Woods':
            # Some points represent landmarks or roads that stretch multiple
            # zipcodes and therefore carry no usable postcode. We handle
            # these cases separately by hamlet name.
            west_alameda.append(point)

    return west_alameda, east_alameda
def get_weather_data(points=None, max_distance=10):
    """Gets weather data for a given set of grid points

    args:
        points: list of grid points (lat, lon)
            (default None, which means get_alameda_county_points())
        max_distance: max distance to search around each grid point, passed
            straight to get_stations (default 10)
    return:
        pd.DataFrame: inner join of the station records and the weather
            records on the 'ID' column
    """
    # Resolve the default grid at call time rather than at definition time,
    # so importing this module does not trigger the grid computation.
    if points is None:
        points = get_alameda_county_points()

    # stations come from get_stations; weather observations come from the csv
    stations = pd.DataFrame(get_stations(points, max_distance))
    weather = pd.read_csv('Assignment3/data/weather_data_ca.csv')

    # merge data on the ID; stations without weather rows (and vice versa)
    # are dropped by the inner join
    return stations.merge(weather, on='ID', how='inner')
class TestAssignment2(unittest.TestCase):
    """Tests for the Alameda County grid, station lookup and main pipeline."""

    @classmethod
    def setUpClass(cls):
        # Build the grid once for the whole class, when the tests run
        # (not when the module is imported).
        cls.grid = get_alameda_county_points()

    def test_get_alameda_county_points(self):
        """ Test get_alameda_county_points function and confirms border of Alameda County points
        1) Tests that all points are within bounding box
        2) Tests that points within grid are actually in Alameda County
        """
        grid = self.grid
        north, west, south, east = 38, -122.4, 37.4, -121.4  # bounding box
        # check that all points are strictly within bounding box
        for coord in grid:
            self.assertLess(west, coord[1])
            self.assertLess(coord[1], east)
            self.assertLess(south, coord[0])
            self.assertLess(coord[0], north)
        # check that sampled grid points (indices 3, 6, 9, 12) are actually
        # in Alameda County
        for i in range(1, 5):
            self.assertEqual(
                rg.search(grid[i * 3])[0]['admin2'], "Alameda County")

    def test_get_stations(self):
        """ Take "randomly" handpicked weather stations from stations_ca and use Google Maps
        to measure distance from Alameda County. Check that get_stations confirms this.
        """
        stations = get_stations(self.grid, 6)
        #USC00046333, 37.8, -122.2667, should be inside
        #US1CACC0003, 37.9485, -122.0541, should be outside
        #US1CACC0001, 37.9898, -122.1085, should be outside
        #US1CAAL0031, 37.5685, -121.9654, should be inside
        #USC00046332, 37.7833, -122.1667, should be inside
        #USC00046336, 37.7983, -122.2642, should be inside
        #US1CABT0002, 39.514, -121.518, should be outside

        station_ids = {station['ID'] for station in stations}

        inside = ["USC00046333", "US1CAAL0031", "USC00046332", "USC00046336"]
        outside = ["US1CACC0003", "US1CACC0001", "US1CABT0002"]

        for station_id in inside:
            self.assertIn(station_id, station_ids)

        for station_id in outside:
            self.assertNotIn(station_id, station_ids)

    def test_get_station_weights(self):
        """
        Test the station weights produced by get_station_weights
        """
        weights = get_station_weights(self.grid)
        # check that station weights are strictly positive
        for weight in weights:
            self.assertGreater(weight, 0)

    def test_main(self):
        """ Test the main function in assignment2 to make sure it runs.
        The functions used in main are already tested above.
        """
        main()
def bias_correction(df):
    """Adjusts the data by doing a bias correction

    A per-station intercept is estimated iteratively: on every pass a
    reference station is drawn at random and each station's intercept is
    moved by the mean difference to that reference over the rows returned
    by station_intersection. The intercepts are then combined with the
    inverse-distance station weights to produce the adjusted values.

    Notes:
        * `df` is modified in place: an 'adjusted_data' column is added.
        * The reference station is picked with np.random, so the iteration
          path is not deterministic unless the caller seeds NumPy.
        * The loop has no iteration cap; it only stops once the squared
          change of the intercepts drops below 1e-6.

    args:
        df: dataframe to run bias correction on; must provide the columns
            'NAME', 'DATA VALUE', 'LATITUDE' and 'LONGITUDE'
    return:
        df: the same dataframe with the bias corrected 'adjusted_data' column
    """

    stations = df['NAME'].unique()
    n_stations = len(stations)

    # Nothing to correct: without this guard np.random.randint(0, 0) raises.
    if n_stations == 0:
        df['adjusted_data'] = df['DATA VALUE'].astype(float)
        return df

    # initialize intercepts
    intercepts = [0.0] * n_stations
    old_intercepts = intercepts[:]
    count = 0

    # iterate till convergence
    while True:

        # pick random point in reference
        reference_idx = np.random.randint(0, n_stations)
        reference_station = stations[reference_idx]

        # iterate through all stations to update intercept
        for station_idx, station in enumerate(stations):

            # get intersection of stations
            # (presumably n is the number of shared rows and k the merged
            # frame holding both stations' values -- confirm against
            # station_intersection)
            n, k = station_intersection(reference_station, station, df)
            if n == 0:
                # no overlap with the reference: intercept stays as it is
                continue

            # Sum over all intersected rows of
            #   (ref_value + ref_intercept) - (value + intercept).
            # Both intercepts are constant for this sum, so they are
            # factored out; NumPy's sum keeps NaN propagation.
            value_diffs = (k['DATA VALUE_x'].to_numpy(dtype=float) -
                           k['DATA VALUE_y'].to_numpy(dtype=float))
            curr_sum = value_diffs.sum() + len(value_diffs) * (
                intercepts[reference_idx] - intercepts[station_idx])

            # update the stations intercept
            intercepts[station_idx] = intercepts[station_idx] + curr_sum / n

        if count % (2 * n_stations) == 0 and count != 0:
            # check if it converges by calculating loss
            loss = np.sum((np.array(intercepts) - np.array(old_intercepts))**2)
            old_intercepts = intercepts[:]
            # set convergence criteria
            if loss < 0.000001:
                break
        count += 1

    # map station name to intercept value
    bias = dict(zip(stations, intercepts))

    # get latitude and longitude of stations (taken from each station's
    # first row)
    lat_lon = []
    for station in stations:
        first_row = df[df['NAME'] == station].iloc[0]
        lat_lon.append((first_row['LATITUDE'], first_row['LONGITUDE']))

    # get station weights
    weights = np.array(
        calc_inv_weighted_avg(get_alameda_county_points(), lat_lon))
    weights = weights * np.array(intercepts) / np.sum(weights)

    # calculate C
    C = dict(zip(stations, weights))

    # apply formula to calculate adjusted data:
    # value + station intercept - station's weighted share of the intercepts
    station_names = df['NAME']
    df['adjusted_data'] = (df['DATA VALUE'] + station_names.map(bias) -
                           station_names.map(C))

    return df