Exemplo n.º 1
0
 def test_combine_first_both_none(self):
     # Row 0 of the first fixture is combined with row 3 of the second one.
     # NOTE(review): the test name suggests row 0 holds no values in either
     # column -- confirm against the fixture definition.
     first_rows = COMBINE_TEST_DATA_1.copy()[0:1]
     second_rows = COMBINE_TEST_DATA_2.copy()[3:4]
     combined = combine_tables([first_rows, second_rows], ["key"])

     # A single record is expected, carrying 2 in both value columns
     self.assertEqual(1, len(combined))
     for column in ("value_column_1", "value_column_2"):
         self.assertEqual(2, combined.loc[0, column])
Exemplo n.º 2
0
 def test_combine_second_right_none(self):
     # Row 3 of the first fixture is combined with row 2 of the second one.
     # NOTE(review): the test name suggests the second row lacks its
     # right-hand value -- confirm against the fixture definition.
     first_rows = COMBINE_TEST_DATA_1.copy()[3:4]
     second_rows = COMBINE_TEST_DATA_2.copy()[2:3]
     combined = combine_tables([first_rows, second_rows], ["key"])

     # A single record is expected, with a different value per column
     self.assertEqual(1, len(combined))
     expectations = (("value_column_1", 2), ("value_column_2", 1))
     for column, expected in expectations:
         self.assertEqual(expected, combined.loc[0, column])
Exemplo n.º 3
0
 def test_combine_all_none(self):
     # Row 0 of each fixture is combined; the result must stay null
     first_rows = COMBINE_TEST_DATA_1.copy()[0:1]
     second_rows = COMBINE_TEST_DATA_2.copy()[0:1]
     combined = combine_tables([first_rows, second_rows], ["key"])

     # A single record is expected, with a null in both value columns
     self.assertEqual(1, len(combined))
     for column in ("value_column_1", "value_column_2"):
         self.assertTrue(isnull(combined.loc[0, column]))
Exemplo n.º 4
0
 def test_combine_second_left_none(self):
     # Row 3 of the first fixture is combined with row 1 of the second one.
     # NOTE(review): the test name suggests the second row lacks its
     # left-hand value -- confirm against the fixture definition.
     first_rows = TEST_DATA_1.copy()[3:4]
     second_rows = TEST_DATA_2.copy()[1:2]
     combined = combine_tables([first_rows, second_rows], ["key"])

     # A single record is expected; values are compared as strings
     self.assertEqual(1, len(combined))
     expectations = (("value_column_1", "1"), ("value_column_2", "2"))
     for column, expected in expectations:
         self.assertEqual(expected, combined.loc[0, column])
Exemplo n.º 5
0
 def test_combine_first_right_none(self):
     # Row 2 of the first fixture is combined with row 3 of the second one.
     # NOTE(review): the test name suggests the first row lacks its
     # right-hand value -- confirm against the fixture definition.
     first_rows = TEST_DATA_1.copy()[2:3]
     second_rows = TEST_DATA_2.copy()[3:4]
     combined = combine_tables([first_rows, second_rows], ["key"])

     # A single record is expected, carrying "2" in both value columns
     self.assertEqual(1, len(combined))
     for column in ("value_column_1", "value_column_2"):
         self.assertEqual("2", combined.loc[0, column])
Exemplo n.º 6
0
    def process_location(station_cache: Dict[str, DataFrame],
                         stations: DataFrame, location: Series):
        """Combine cached records of the stations closest to a location.

        Args:
            station_cache: Station records already loaded, keyed by station ID.
            stations: Table of known stations; it is copied, never mutated.
            location: Record providing the `key`, `lat` and `lon` attributes.

        Returns:
            The combined records restricted to the columns of _OUTPUT_COLUMNS
            that are present, or an empty table with those columns when no
            station is in range or none of the nearby ones is cached.
        """
        candidates = stations.copy()
        candidates["key"] = location.key

        # Distance from every known station to the requested coordinates
        candidates["distance"] = NoaaGsodDataSource.haversine_distance(
            candidates, location.lat, location.lon)

        # Keep at most the 10 closest stations within the distance threshold
        in_range = candidates[candidates.distance < _DISTANCE_THRESHOLD]
        closest = in_range.sort_values("distance").iloc[:10]

        # Early exit: nothing in range
        if len(closest) == 0:
            return DataFrame(columns=_OUTPUT_COLUMNS)

        # Early exit: none of the nearby stations has cached records
        if not any(station_id in station_cache
                   for station_id in closest.id.values):
            return DataFrame(columns=_OUTPUT_COLUMNS)

        # Expose station ID and distance under their output column names
        closest = closest.rename(columns={
            "id": "noaa_station",
            "distance": "noaa_distance"
        })

        # Attach the station metadata to each cached table; skip cache misses
        station_tables = []
        for station_id in closest.noaa_station.values:
            records = station_cache.get(station_id)
            if records is not None:
                station_tables.append(
                    records.merge(closest, on="noaa_station"))

        # Tables are handed over farthest-first
        data = combine_tables(reversed(station_tables), ["date", "key"])

        # Return all the available data from the records
        available = [col for col in _OUTPUT_COLUMNS if col in data.columns]
        return data[available]
Exemplo n.º 7
0
    def station_records(station_cache: Dict[str, DataFrame],
                        stations: DataFrame, location: Series):
        """Download (if needed) and combine records of the closest stations.

        Args:
            station_cache: Station records keyed by station ID; stations that
                are missing get downloaded and stored here as a side effect.
            stations: Table of known stations; it is copied, never mutated.
            location: Record providing the `key`, `lat` and `lon` attributes.

        Returns:
            The combined records restricted to the columns of _OUTPUT_COLUMNS
            that are present, or an empty table with those columns when no
            station lies within the distance threshold.
        """
        candidates = stations.copy()
        candidates["key"] = location.key

        # Distance from every known station to the requested coordinates
        candidates["distance"] = NoaaGhcnDataSource.haversine_distance(
            candidates, location.lat, location.lon)

        # Keep at most the 20 closest stations within the distance threshold
        in_range = candidates[candidates.distance < _DISTANCE_THRESHOLD]
        closest = in_range.sort_values("distance").iloc[:20]

        # Early exit: no stations found within distance threshold
        if len(closest) == 0:
            return DataFrame(columns=_OUTPUT_COLUMNS)

        # Pull data only for stations which are not already cached; the check
        # happens per iteration so a repeated ID is downloaded only once
        for station_id in closest.id.values:
            if station_id in station_cache:
                continue

            # Use our mirror since NOAA's website is very flaky
            records = read_csv(
                _STATION_URL_TPL.format(station_id),
                usecols=lambda name: name in _COLUMN_MAPPING)
            records = records.rename(columns=_COLUMN_MAPPING)

            # Pass both temperature columns through the fix_temp conversion
            for column in ("minimum_temperature", "maximum_temperature"):
                records[column] = records[column].apply(
                    NoaaGhcnDataSource.fix_temp)

            # Cache only the records dated after 2019-12-31
            station_cache[station_id] = records[records.date > "2019-12-31"]

        # Expose station ID and distance under their output column names
        closest = closest.rename(columns={
            "id": "noaa_station",
            "distance": "noaa_distance"
        })

        # Attach the station metadata to every cached table
        station_tables = [
            station_cache[station_id].merge(closest)
            for station_id in closest.noaa_station.values
        ]

        # Tables are handed over farthest-first
        data = combine_tables(reversed(station_tables), ["date", "key"])

        # Return all the available data from the records
        available = [col for col in _OUTPUT_COLUMNS if col in data.columns]
        return data[available]