# Exemplo n.º 1
def test_compute_moving_avg_from_daily_data():
    """Moving-average column must equal a hand-computed mean for one zip code.

    Pulls the vaccination dataset, runs the moving-average transform, then
    checks the generated column against statistics.mean over the first
    MOVING_AVG_WINDOW daily values for zip 60637.
    """
    vacc_dataset = soda_data.VACCINATION_DATA_OBJ
    api_client = socrata_api_requests.SocrataAPIClient(vacc_dataset.request_url)
    df = api_client.data_df

    col_to_avg = 'total_doses_daily'

    # Transform mutates df in place, adding the prefixed moving-average column.
    data_transformations.compute_moving_avg_from_daily_data(
        df, 'zip_code', 'date', [col_to_avg])

    zip_rows = df.loc[df['zip_code'] == "60637"]
    window = data_transformations.MOVING_AVG_WINDOW
    true_avg = statistics.mean(zip_rows['total_doses_daily'][:window])

    # Row index 6 holds the first complete window's average for this zip.
    avg_col = data_transformations.MOVING_AVG_COL_PREFIX + col_to_avg
    assert true_avg == zip_rows[avg_col][6:7].values[0]
# Exemplo n.º 2
def test_soda_data_groupby_query():
    """A SodaData with group_by/limit must build the expected SoQL URL and fetch OK."""
    groupby_dataset = soda_data.SodaData(
        "Traffic Crashes - Crashes",
        "TRAFFIC_CRASHES",
        "85ca-t3if", ["COUNT(CRASH_RECORD_ID)", "CRASH_DATE"],
        group_by=['CRASH_DATE'],
        limit=100)

    api_response = socrata_api_requests.SocrataAPIClient(
        groupby_dataset.request_url)

    expected_url = ("https://data.cityofchicago.org/resource/85ca-t3if.json"
                    "?$query=SELECT COUNT(CRASH_RECORD_ID), "
                    "CRASH_DATE GROUP BY CRASH_DATE LIMIT 100")

    # URL construction is deterministic; the live request should succeed.
    assert groupby_dataset.request_url == expected_url
    assert api_response.response.status_code == 200
# Smoke-test script: pull the vaccination dataset from the Socrata API,
# standardize it, compute moving averages, load it into the TEST sqlite
# database, and print diagnostics at each step.
# NOTE(review): assumes dbclient, soda_data, socrata_api_requests and
# data_transformations are imported earlier in this file — confirm.
db = dbclient.DBClient(db_path=dbclient.DB_PATH_TEST)

# SOCRATA DATA PROCESS [data from https://data.cityofchicago.org]
# 1. get SodaData obj (representing single dataset) from soda_data global const
# 2. use SocrataAPIClient to get dataset, using SodaData.request_url
#    this returns a json that is converted to pandas dataframe
#    by default, all data values are of type str
# 3. standardize
# 4. compute weekly averages
# 5. use dbclient to create sql table from the pandas df

# Vaccinations
data_obj = soda_data.VACCINATION_DATA_OBJ  # 1
print(f" ##### making api request and create table for {data_obj.dataset_name} ####")
print(f"    sqlite table will be named {data_obj.sql_table_name}")
api_resp = socrata_api_requests.SocrataAPIClient(data_obj.request_url)  # 2
# The standardize_* helpers appear to mutate api_resp.data_df in place.
data_transformations.standardize_zip_code_col(api_resp.data_df, soda_data.VACC_ZIP_COL_NAME)  # 3
data_transformations.standardize_date_col(api_resp.data_df, soda_data.VACC_DATE_COL_NAME)
data_transformations.\
    compute_moving_avg_from_daily_data(api_resp.data_df,
                                       data_transformations.STD_ZIP_COL_NAME,  # should store this
                                       data_transformations.STD_DATE_COL_NAME,  # this too
                                       data_obj.COLS_TO_AVG)  # 4
db.create_table_from_pandas(api_resp.data_df, data_obj.sql_table_name)  # 5
# Diagnostics: echo request details, dataframe dtypes, and the sqlite schema.
print(f"    request url: {api_resp.request_url}")
print(f"    request headers {api_resp.header_fields}")
print(f"    request header dtypes {api_resp.header_dtypes}")
print("~~~~ pandas df dtypes ~~~~")
print(api_resp.data_df.dtypes)
print("~~~~ sql table info ~~~~~")
print(db.get_table_info(data_obj.sql_table_name))
# Exemplo n.º 4
def _transform_daily_data(df, zip_col, date_col, cols_to_avg):
    """Standardize zip/date columns in place, then add moving-average columns.

    Shared pipeline step for every daily data source:
        standardize zip column -> standardize date column ->
        compute moving averages over cols_to_avg.
    Mutates df in place (the underlying transforms appear to work in place —
    NOTE(review): confirm against data_transformations).
    """
    data_transformations.standardize_zip_code_col(df, zip_col)
    data_transformations.standardize_date_col(df, date_col)
    data_transformations.compute_moving_avg_from_daily_data(
        df,
        data_transformations.STD_ZIP_COL_NAME,
        data_transformations.STD_DATE_COL_NAME,
        cols_to_avg)


def build_back2normal_db():
    """
    Builds database from various datasources
    For each source:
        1. get data from source (API, CSV, etc) and convert to pandas DatFrame
        2. standardize zip code col name and format
        3. standardize date col name and format
        4. compute 7 day moving average
        5. create table in sqlite db
    """
    # Rebuild from scratch: drop any stale database file first.
    if os.path.exists(dbclient.DB_PATH):
        print("Deleting existing db and recreating with build_db_script\n")
        os.remove(dbclient.DB_PATH)
    db = dbclient.DBClient()

    # Vaccinations (Socrata API)
    vacc_data_obj = soda_data.VACCINATION_DATA_OBJ
    vacc_api_resp = socrata_api_requests.SocrataAPIClient(vacc_data_obj.request_url)
    _transform_daily_data(vacc_api_resp.data_df,
                          soda_data.VACC_ZIP_COL_NAME,
                          soda_data.VACC_DATE_COL_NAME,
                          vacc_data_obj.COLS_TO_AVG)
    db.create_table_from_pandas(vacc_api_resp.data_df, VACC_TBL)

    # Daily Covid-19 case data by zip from IDPH
    print("...Downloading daily Covid-19 data...")
    daily_covid_data = daily_case_data_by_zip.get_daily_covid_data_from_api()
    _transform_daily_data(daily_covid_data,
                          daily_case_data_by_zip.ZIP_COL_NAME,
                          daily_case_data_by_zip.DATE_COL_NAME,
                          daily_case_data_by_zip.COLS_TO_AVG)
    db.create_table_from_pandas(daily_covid_data, CASE_TBL)

    # Ground truth foot traffic data
    daily_foot_traffic_data = process_ground_truth_data.get_combined_ground_truth_data()
    _transform_daily_data(daily_foot_traffic_data,
                          process_ground_truth_data.ZIP_COL_NAME,
                          process_ground_truth_data.DATE_COL_NAME,
                          process_ground_truth_data.COLS_TO_AVG)
    db.create_table_from_pandas(daily_foot_traffic_data, FOOT_TRAFF_TBL)

    # Socrata crash data, read from a bundled CSV snapshot
    crash_file = os.path.join("core", "resources", "zipcode_crash_data_1_1_2019-3_7_20201.csv")
    crash_data = pd.read_csv(crash_file)
    _transform_daily_data(crash_data,
                          soda_data.CRASH_ZIP_COL_NAME,
                          soda_data.CRASH_DATE_COL_NAME,
                          ['crash_count'])
    db.create_table_from_pandas(crash_data, CRASHES_TBL)

    # Census demographic data: zip standardization only — no date column,
    # no moving average (snapshot data, not a daily series).
    census_data = census_api_pull.get_census_data_from_api()
    data_transformations.standardize_zip_code_col(census_data, census_api_pull.ZIP_COL_NAME)
    db.create_table_from_pandas(census_data, CENSUS_TBL)
# Exemplo n.º 5
from core.data import data_transformations
from core.data.socrata import soda_data, socrata_api_requests

# Exploration script: fetch the first registered dataset, compute moving
# averages, and print the rows for one zip code.
daily_vacc_data = soda_data.datasets[0]
response = socrata_api_requests.SocrataAPIClient(daily_vacc_data.request_url)
daily_data_df = response.data_df

# Adds one moving-average column per listed column, mutating the frame in place.
data_transformations.compute_moving_avg_from_daily_data(
    daily_data_df, 'zip_code', 'date', ['total_doses_daily'])

# BUG FIX: the original indexed with daily_data_df['zip_code' == "60637"],
# where 'zip_code' == "60637" compares two string literals and is always
# False — so the frame was indexed by the constant False instead of a
# boolean mask. The intended filter compares the column to the zip code.
print(daily_data_df[daily_data_df['zip_code'] == "60637"])