예제 #1
0
파일: fldem.py 프로젝트: hmanuel1/covid
def get_data(download=False):
    """Download data from web or read it back from the database

    Keyword Arguments:
        download {bool} -- scrape the data with PdfScraper and store it in
            the database; otherwise read the stored tables
            (default: {False})

    Returns:
        tuple -- (cases, deaths) florida covid19 cases and deaths data

    NOTE(review): when download is True the scraped objects are returned
    as-is (no parse_dates step), so the 'date' column may differ in dtype
    from the database path -- confirm callers do not rely on it.
    """
    if download:
        # instantiate covid19 pdf scraper
        pdf = PdfScraper()
        pdf.get_pages()

        # covid19 cases and deaths, selected by marker regex
        cases = pdf.get_data(marker=r'Case[^s]')
        deaths = pdf.get_data(marker=r'Death[^s]')

        # store raw tables; always release the database handle
        _db = DataBase()
        try:
            _db.add_table(FL_CASES_TABLE, cases, index=False)
            _db.add_table(FL_DEATHS_TABLE, deaths, index=False)
        finally:
            _db.close()

    else:

        # read from database; always release the database handle
        _db = DataBase()
        try:
            cases = _db.get_table(FL_CASES_TABLE, parse_dates=['date'])
            deaths = _db.get_table(FL_DEATHS_TABLE, parse_dates=['date'])
        finally:
            _db.close()

    return cases, deaths
예제 #2
0
파일: nytimes.py 프로젝트: hmanuel1/covid
def add_metadata():
    """Updates the us map pivot view and the options, dates and levels
    database tables

    Updates:
        database view -- US_MAP_PIVOT_VIEW (dropped and recreated)
        database table -- LEVELS
        database table -- DATES
        database table -- OPTIONS (dropped, recreated, USA option inserted)
    """
    _db = DataBase()
    try:
        _db.update(DROP_US_MAP_PIVOT_VIEW)
        _db.update(US_MAP_PIVOT_VIEW)
        # the column is named 'date' (it is read as data['date'] below);
        # the previous 'dates' did not match any column
        data = _db.get_table(NYTIMES_COUNTIES_TABLE, parse_dates=['date'])
    finally:
        _db.close()

    # kept as a safety net in case get_table did not parse the column
    data['date'] = pd.to_datetime(data['date'])

    # last 15 days, newest first
    latest_date = data['date'].max()
    dates = [latest_date - pd.to_timedelta(day, 'days') for day in range(15)]

    # meta data
    _db = DataBase()
    try:
        _db.add_table(LEVELS_TABLE, pd.DataFrame({'level': LEVELS}),
                      index=False)
        _db.add_table(DATES_TABLE, pd.DataFrame({'date': dates}), index=False)
        _db.update(DROP_OPTIONS_TABLE)
        _db.update(CREATE_OPTIONS_TABLE)
        _db.update(INSERT_USA_OPTION)
    finally:
        _db.close()
예제 #3
0
파일: nytimes.py 프로젝트: hmanuel1/covid
def clean_states_data():
    """Clean US States data from NY Times

    Returns:
        DataFrame -- clean us states data with columns
            state_id, date, day, cases, deaths

    Updates:
        database table -- NYTIMES_STATES_TABLE
        database view -- STATES_VIEW
    """
    # covid19 data and state metadata; always release the database handle
    _db = DataBase()
    try:
        data = _db.get_table(US_STATES_TABLE, parse_dates=['date'])
        states = _db.get_geotable(STATE_MAP_TABLE)
    finally:
        _db.close()

    start = len(data)

    # add state ids (state names are matched case-insensitively)
    states['name'] = states['name'].str.lower()
    lookup = states.set_index('name')['state_id'].to_dict()
    data['state_id'] = data['state'].str.lower().map(lookup)

    # get rid of data that is not in state meta data
    data = data[data['state_id'].isin(states['state_id'])].copy(deep=True)

    # days from latest reported date: dividing by a one-day timedelta
    # yields a day count, stored as nullable integer
    delta_day = pd.to_timedelta(1, unit='days')
    data['day'] = (data['date'].max() - data['date']) / delta_day
    data['day'] = data['day'].astype('Int32')

    end = len(data)

    # ny times table
    data = data[['state_id', 'date', 'day', 'cases', 'deaths']].copy(deep=True)
    data.reset_index(drop=True, inplace=True)

    # ignored lines; an empty source table reports 0% instead of raising
    # ZeroDivisionError
    ignored = start - end
    percent = 100 * ignored / start if start else 0.0
    print(
        f'ignored lines: {ignored}/{start} = {percent:.01f}%'
    )

    # table to database
    _db = DataBase()
    try:
        _db.add_table(NYTIMES_STATES_TABLE, data.set_index('state_id'))
        _db.update(DROP_STATES_VIEW)
        _db.update(STATES_VIEW)
    finally:
        _db.close()

    return data
예제 #4
0
파일: fldem.py 프로젝트: hmanuel1/covid
def download_fldem():
    """Get, clean and store covid19 data from FL DEM

    Updates:
        database table -- FLDEM_CASES_TABLE
        database table -- FLDEM_DEATHS_TABLE
        database view -- FLDEM_VIEW (dropped and recreated)
    """
    # download and store the raw tables; the return value is not needed
    # because clean_data is given the table names
    get_data(True)

    # (raw table, clean table) pairs, processed in this order
    tables = (
        (FL_CASES_TABLE, FLDEM_CASES_TABLE),
        (FL_DEATHS_TABLE, FLDEM_DEATHS_TABLE),
    )

    _db = DataBase()
    try:
        for raw_table, clean_table in tables:
            data = clean_data(raw_table)
            _db.add_table(clean_table, data.set_index('case_id'))

        # view
        _db.update(DROP_FLDEM_VIEW)
        _db.update(FLDEM_VIEW)
    finally:
        # release the database handle even if cleaning or storing fails
        _db.close()
예제 #5
0
파일: nytimes.py 프로젝트: hmanuel1/covid
def download_nytimes():
    """Read NY Times data from URL_COUNTIES and URL_STATES, store it and
    run the cleaning and metadata steps

    Updates:
        database table -- US_COUNTIES_TABLE
        database table -- US_STATES_TABLE
        plus everything updated by clean_counties_data, clean_states_data
        and add_metadata
    """
    # (csv url, raw table) pairs: county by county, then state by state
    sources = (
        (URL_COUNTIES, US_COUNTIES_TABLE),
        (URL_STATES, US_STATES_TABLE),
    )

    for url, table in sources:
        # keep fips as text so it is not converted to a number
        data = pd.read_csv(url, dtype={'fips': 'str'})

        _db = DataBase()
        try:
            _db.add_table(table, data, index=False)
        finally:
            # release the database handle even if the write fails
            _db.close()

    clean_counties_data()
    clean_states_data()
    add_metadata()
예제 #6
0
def predict():
    """main module function to predict covid19 cases and deaths

    Inputs from database:
        STATES_VIEW_TABLE {database table} -- covid19 data by state

    Outputs to database:
        ARIMA_CASES_TABLE {database table} -- run_arima result for cases
        ARIMA_DEATHS_TABLE {database table} -- run_arima result for deaths
    """
    _db = DataBase()
    try:
        data = _db.get_table(STATES_VIEW_TABLE, parse_dates=['date'])
    finally:
        _db.close()

    # only data after 3/15/2020
    cutoff = pd.to_datetime('3/15/2020')

    # (column to predict, output table) pairs: cases first, then deaths
    targets = (
        ('cases', ARIMA_CASES_TABLE),
        ('deaths', ARIMA_DEATHS_TABLE),
    )

    for column, table in targets:
        result = run_arima(data, column)

        slicer = result['date'] > cutoff
        result = result.loc[slicer, :].copy(deep=True)
        result.sort_values(['date', 'state'], inplace=True)
        result = result.round(0)

        _db = DataBase()
        try:
            _db.add_table(table, data=result, index=False)
        finally:
            # release the database handle even if the write fails
            _db.close()
예제 #7
0
파일: nytimes.py 프로젝트: hmanuel1/covid
def clean_counties_data():
    """Clean US Counties data from NY Times

    Returns:
        DataFrame -- clean us counties data with columns
            county_id, state_id, date, day, cases, deaths, case_level

    Updates:
        database table -- NYTIMES_COUNTIES_TABLE
        database view -- COUNTIES_VIEW
    """
    # covid19 data and map metadata; always release the database handle
    _db = DataBase()
    try:
        data = _db.get_table(US_COUNTIES_TABLE, parse_dates=['date'])
        counties = _db.get_geotable(US_MAP_TABLE)
        states = _db.get_geotable(STATE_MAP_TABLE)
    finally:
        _db.close()

    start = len(data)

    # use new york county fips for new york city
    data.loc[data['county'] == 'New York City', 'fips'] = '36061'

    # add state ids
    lookup = states.set_index('name')['state_id'].to_dict()
    data['state_id'] = data['state'].map(lookup)

    # add county ids - first attempt: match on lowercase county name
    # concatenated with the state id
    data['id'] = data['county'].str.lower() + data['state_id']
    counties['id'] = counties['name'].str.lower() + counties['state_id']
    lookup = counties[['id',
                       'county_id']].set_index('id')['county_id'].to_dict()
    data['county_id'] = data['id'].map(lookup)

    # add county ids - last attempt: fall back to fips where the name
    # match failed
    condition = (~data['fips'].isna()) & (data['county_id'].isna())
    data.loc[condition, 'county_id'] = data.loc[condition, 'fips']

    # get rid of data that is not in county meta data
    data = data[data['county_id'].isin(counties['county_id'])].copy(deep=True)

    # state ids based on county_ids
    lookup = counties.set_index('county_id')['state_id'].to_dict()
    data['state_id'] = data['county_id'].map(lookup)

    # days from latest day: dividing by a one-day timedelta yields a day
    # count, stored as nullable integer
    delta_day = pd.to_timedelta(1, unit='days')
    data['day'] = (data['date'].max() - data['date']) / delta_day
    data['day'] = data['day'].astype('Int32')

    end = len(data)

    # ny times counties table
    cols = ['county_id', 'state_id', 'date', 'day', 'cases', 'deaths']
    data = data[cols].copy(deep=True)
    data.reset_index(drop=True, inplace=True)

    # bin cases with LEVELS as bin edges, labelled 1..len(LEVELS)-1;
    # values that fall in no bin become level 0
    data['case_level'] = pd.cut(data['cases'],
                                LEVELS,
                                labels=range(1, len(LEVELS)))
    data['case_level'] = pd.to_numeric(data['case_level'], 'coerce').fillna(0)
    data['case_level'] = data['case_level'].astype('Int32')

    # ignored lines; an empty source table reports 0% instead of raising
    # ZeroDivisionError
    ignored = start - end
    percent = 100 * ignored / start if start else 0.0
    print(
        f'ignored lines: {ignored}/{start} = {percent:.01f}%'
    )

    # tables to database
    _db = DataBase()
    try:
        _db.add_table(NYTIMES_COUNTIES_TABLE,
                      data.set_index(['county_id', 'day']))
        _db.update(DROP_COUNTIES_VIEW)
        _db.update(COUNTIES_VIEW)
    finally:
        _db.close()

    return data