# Example #1
def get_data(points=None):
    """Return the cleaned weather data for Alameda County that meets Ranson's criteria.

    Args:
        points: optional list of (lat, lon) grid points to fetch data for.
            When omitted, the grid from get_alameda_county_points() is used.

    Returns:
        DataFrame: weather data after clean_data and filter_ranson_criteria
        have been applied, in that order.
    """
    if points is None:
        points = get_alameda_county_points()
    return filter_ranson_criteria(clean_data(get_weather_data(points)))
def main(elements=['TMAX', 'TMIN'], points=get_alameda_county_points()):
    """Bin the bias-corrected data for each requested element.

    Binning idea taken from:
    https://stackoverflow.com/questions/34317149/pandas-groupby-with-bin-counts

    Args:
        elements: specifies elements to calculate data for
            (default ['TMAX', 'TMIN']).
        points: grid points handed to get_weather_data (default: the result
            of get_alameda_county_points()).

    Returns:
        list: list of binned data as dataframes.
    """
    # NOTE(review): both defaults are evaluated once, when the function is
    # defined: the list is shared between calls and get_alameda_county_points()
    # runs at import time. Consider None defaults resolved inside the body.

    df = filter_ranson_criteria(clean_data(get_weather_data(points)))
    result = []
    # bin edges used by pd.cut below, keyed by element code
    bin_map = {
        'PRCP': (0, 4, 14, 29, 2000),
        'TMAX': range(0, 100, 10),
        'TMIN': range(0, 100, 10)
    # NOTE(review): the closing brace of bin_map appears to be missing here.
    for element in elements:
        filtered = df[df['ELEMENT'] == element]
        corrected = bias_correction(filtered)

        # average data from the same day
        grouped = corrected.groupby(['year', 'month', 'day']).aggregate({
        # NOTE(review): the aggregation spec and the closing "})" are missing;
        # the comment above suggests a per-day mean -- confirm.

        # bin the data
        final = grouped.groupby([
            'year', 'month',
            pd.cut(grouped['adjusted_data'], bin_map[element], right=True)
        # NOTE(review): the closing "])" of the groupby, the aggregation applied
        # to the bins and the append of `final` to `result` are missing; nothing
        # visible here ever adds to `result`.
    return result
def divide_alameda_points():
    """Return a list of West Alameda points and a list of East Alameda points.

    Points are classified by comparing point[1] against two fixed borders;
    points that fall between the borders are resolved by zipcode or by a
    reverse-geocoded location name.

    Returns:
        list: list of West Alameda (lat, lon) points
        list: list of East Alameda (lat, lon) points
    """
    grid_points = get_alameda_county_points()
    border_west = -122.06  # westernmost point of "East Alameda", points to the left are in West Alameda
    border_east = -121.82  # easternmost point in "West Alameda", points to the right are in East Alameda

    # zipcodes identified to be in West or East Alameda based on Google Maps
    zip_loc = {
        'west': [94539, 94552, 94544, 94536, 94537, 94538],
        'east': [94568, 94566, 94586]
    # NOTE(review): the closing brace of zip_loc appears to be missing here.

    west_alameda = []
    east_alameda = []
    for point in grid_points:
        # point[1] is the second coordinate of a (lat, lon) pair
        if point[1] < border_west:
        # NOTE(review): branch body missing -- presumably appends to west_alameda.
        elif point[1] > border_east:
        # NOTE(review): branch body missing -- presumably appends to east_alameda;
        # the `else:` / `try:` lines introducing the block below are also missing.
            # classify ambiguous points that lie between border_west and border_east
                # get zipcode of unknown point
                unknown_zip = int(
                # NOTE(review): the expression that produces the zipcode is cut off.
                if unknown_zip in zip_loc['west']:
                elif unknown_zip in zip_loc['east']:
                # NOTE(review): both branch bodies are missing.
                # Some points represent landmarks or roads that stretch multiple zipcodes,
                # causing the above code to error. We handle these cases separately.
                unknown_loc = geolocator.reverse(
                # NOTE(review): the reverse() arguments and the matching `except`
                # clause are cut off.
                if unknown_loc == 'Kilkare Woods':
                # NOTE(review): branch body missing.

    return west_alameda, east_alameda
def get_weather_data(points=None, max_distance=10,
                     weather_csv='Assignment3/data/weather_data_ca.csv'):
    """Get weather data for the stations around a given set of grid points.

    Args:
        points: list of grid points (lat, lon). Defaults to the Alameda
            County grid, resolved on each call rather than once at
            definition time.
        max_distance: max distance to search around each grid point
            (default 10).
        weather_csv: path of the CSV file holding the weather records
            (default 'Assignment3/data/weather_data_ca.csv').

    Returns:
        pd.DataFrame: dataframe with data from weather and stations, joined
        (inner) on their shared 'ID' column.
    """
    if points is None:
        points = get_alameda_county_points()

    # stations found by get_stations for the requested grid points
    stations = pd.DataFrame(get_stations(points, max_distance))

    # load the weather records from csv
    weather = pd.read_csv(weather_csv)

    # merge data on the ID; the inner join keeps only stations that have records
    return stations.merge(weather, on='ID', how='inner')
class TestAssignment2(unittest.TestCase):
    """Tests for the grid, station lookup and station weight helpers."""

    # computed once, when the class body is executed, and shared by all tests
    grid = get_alameda_county_points()

    def test_get_alameda_county_points(self):
        """ Test get_alameda_county_points function and confirms border of Alameda County points
        1) Tests that all points are within bounding box
        3) Tests that points within grid are actually in Alameda County
        """
        # NOTE(review): item 2) of the list above is missing from the docstring.
        grid = TestAssignment2.grid
        north, west, south, east = 38, -122.4, 37.4, -121.4  # bounding box
        # check that all points are within bounding box
        for coord in grid:
            self.assertTrue(west < coord[1] < east)
            self.assertTrue(south < coord[0] < north)
        # check that points within grid are actually in Alameda County
        # (samples grid[3], grid[6], grid[9] and grid[12])
        for i in range(1, 5):
                # NOTE(review): the opening of this call (presumably
                # `self.assertTrue(`) is missing.
                rg.search(grid[i * 3])[0]['admin2'] == "Alameda County")

    def test_get_stations(self):
        """ Take "randomly" handpicked weather stations from stations_ca and use Google Maps
        to measure distance from Alameda County. Check that test_stations confirms this.
        """
        grid = TestAssignment2.grid
        stations = get_stations(grid, 6)
        #USC00046333, 37.8, -122.2667, should be inside
        #US1CACC0003, 37.9485, -122.0541, should be outside
        #US1CACC0001, 37.9898, -122.1085, should be outside
        #US1CAAL0031, 37.5685, -121.9654, should be inside
        #USC00046332, 37.7833, -122.1667, should be inside
        #USC00046336, 37.7983, -122.2642, should be inside
        #US1CABT0002, 39.514, -121.518, should be outside

        stations_indices = []
        for i in stations:
        # NOTE(review): loop body missing -- presumably collects each station's
        # ID into stations_indices; as written the list stays empty.

        inside = ["USC00046333", "US1CAAL0031", "USC00046332", "USC00046336"]
        outside = ["US1CACC0003", "US1CACC0001", "US1CABT0002"]

        # every hand-picked nearby station must be among the returned stations
        for index in inside:
            self.assertTrue(index in stations_indices)

        # every hand-picked distant station must be absent
        for index in outside:
            self.assertTrue(index not in stations_indices)

    def test_get_station_weights(self):
        """ Test the station weights produced by get_station_weights """
        grid = TestAssignment2.grid
        weights = get_station_weights(grid)
        # check that every station weight is strictly positive
        # (assertGreater fails on a weight of exactly 0)
        for weight in weights:
            self.assertGreater(weight, 0)

    def test_main(self):
        """ Test the main function in assignment2 to make sure it runs.
        The functions used in main are already tested above.
        """
        # NOTE(review): the body of this test is missing.
def bias_correction(df, tolerance=0.000001):
    """Adjust the data by doing an iterative bias correction.

    One intercept per station (unique 'NAME') is refined by repeatedly picking
    a random reference station and moving every other station's intercept
    towards agreement with it over their shared rows. Iteration stops once
    the squared change of the intercepts between two checks drops below
    `tolerance`.

    Args:
        df: dataframe to run bias correction on. Must provide the columns
            'NAME', 'DATA VALUE', 'LATITUDE' and 'LONGITUDE'.
        tolerance: convergence threshold on the summed squared change of the
            intercepts (default 0.000001).

    Returns:
        df: the same dataframe, with the bias corrected data added in place
        as the column 'adjusted_data'.
    """
    stations = df['NAME'].unique()

    # Nothing to correct for an empty frame; this also avoids calling
    # np.random.randint(0, 0), which raises ValueError.
    if len(stations) == 0:
        df['adjusted_data'] = df['DATA VALUE']
        return df

    # initialize intercepts
    intercepts = [0.0] * len(stations)
    old_intercepts = intercepts[:]
    count = 0

    # iterate till convergence
    while True:

        # pick random point in reference
        reference_idx = np.random.randint(0, len(stations))
        reference_station = stations[reference_idx]

        # iterate through all stations to update intercept
        for station_idx, station in enumerate(stations):

            # get intersection of stations
            # NOTE(review): n is presumably the number of shared rows and k the
            # merged rows ('_x' = reference, '_y' = station) -- confirm against
            # station_intersection.
            n, k = station_intersection(reference_station, station, df)
            if n == 0:
                continue

            # sum, over all intersected rows, of the difference between the
            # corrected reference value and the corrected station value
            residuals = (k['DATA VALUE_x'] + intercepts[reference_idx]) - (
                k['DATA VALUE_y'] + intercepts[station_idx])
            curr_sum = float(residuals.sum())

            # update the stations intercept
            intercepts[station_idx] += curr_sum / n

        # only test for convergence every 2 * len(stations) passes
        if count % (2 * len(stations)) == 0 and count != 0:
            # check if it converges by calculating loss
            loss = np.sum((np.array(intercepts) - np.array(old_intercepts))**2)
            old_intercepts = intercepts[:]
            # set convergence criteria
            if loss < tolerance:
                break
        count += 1

    # map station name to intercept value
    bias = dict(zip(stations, intercepts))

    # get latitude and longitude of stations (taken from each station's first row)
    lat_lon = []
    for station in stations:
        data = df[df['NAME'] == station].iloc[0]
        lat_lon.append((data['LATITUDE'], data['LONGITUDE']))

    # get station weights
    weights = np.array(
        calc_inv_weighted_avg(get_alameda_county_points(), lat_lon))
    weights = weights * np.array(intercepts) / np.sum(weights)

    # calculate C
    # NOTE(review): this yields one term per station (w_i * b_i / sum(w)); if a
    # single weighted-mean constant was intended the terms would need to be
    # summed -- confirm.
    C = dict(zip(stations, weights))

    # apply formula to calculate adjusted data: value + bias - C, per station
    names = df['NAME']
    df['adjusted_data'] = df['DATA VALUE'] + names.map(bias) - names.map(C)

    return df