def generate():
    """Fetch a set of state-level COVIDcast signals for November 2020 and
    combine them into one wide DataFrame.

    Each predictor signal is shifted forward 3 days (dt=3) so it aligns with
    the target signal, ``confirmed_incidence_num``, which is left unshifted
    (dt=0).

    Returns:
        pandas.DataFrame: result of ``covidcast.aggregate_signals`` over all
        fetched signals.
    """
    start, end = date(2020, 11, 1), date(2020, 11, 30)
    # (source, signal) pairs used as predictors.
    predictor_signals = [
        ("safegraph", "part_time_work_prop"),
        ("fb-survey", "smoothed_cli"),
        ("fb-survey", "smoothed_hh_cmnty_cli"),
        ("doctor-visits", "smoothed_adj_cli"),
        ("indicator-combination", "confirmed_7dav_incidence_num"),
    ]
    frames = []
    shifts = []
    for source, signal in predictor_signals:
        frames.append(covidcast.signal(source, signal, start, end,
                                       geo_type="state"))
        shifts.append(3)  # lag predictors 3 days relative to the target
    # Target signal, unshifted.
    frames.append(covidcast.signal("indicator-combination",
                                   "confirmed_incidence_num",
                                   start, end, geo_type="state"))
    shifts.append(0)
    # NOTE(review): the original called df.head() and discarded the result;
    # that no-op has been removed.
    return covidcast.aggregate_signals(frames, dt=shifts)
def fetch(dat):
    """Download one county-level COVIDcast signal for 2020-10-01..2020-12-31.

    ``dat`` selects the signal:
      3 -> safegraph bars_visit_prop: daily visits to bar-related POIs per
           100,000 population
      4 -> safegraph restaurants_visit_prop: daily visits to restaurant-related
           POIs per 100,000 population
      5 -> fb-survey smoothed_hh_cmnty_cli: estimated % reporting illness in
           their local community (household included), no survey weighting
      anything else -> fb-survey smoothed_worried_become_ill: estimated %
           very/somewhat worried that they or immediate family might become
           seriously ill from COVID-19
    """
    signal_by_code = {
        3: ("safegraph", "bars_visit_prop"),
        4: ("safegraph", "restaurants_visit_prop"),
        5: ("fb-survey", "smoothed_hh_cmnty_cli"),
    }
    source, signal = signal_by_code.get(
        dat, ("fb-survey", "smoothed_worried_become_ill"))
    return covidcast.signal(source, signal,
                            date(2020, 10, 1), date(2020, 12, 31), "county")
def get_covidcast_data():
    """Download state-level mask-wearing and vaccine-acceptance survey
    signals and hand each one to ``get_national_data`` for aggregation."""
    fmt = '%Y-%m-%d'
    # (signal name, hard-coded start date, module-level end date, label)
    jobs = [
        ("smoothed_wearing_mask", "2020-10-09", MASK_DATE, "mask"),
        ("smoothed_accept_covid_vaccine", "2020-12-27", VACCINE_DATE,
         "vaccine"),
    ]
    for signal_name, since_str, until_str, label in jobs:
        since = datetime.strptime(since_str, fmt)
        until = datetime.strptime(until_str, fmt)
        state_df = covidcast.signal("fb-survey", signal_name,
                                    since, until, "state")
        get_national_data(state_df, label)
def fetch_api_reference(data_source, start_date, end_date, geo_type,
                        signal_type):
    """
    Get and process API data for use as a reference.

    Formatting is changed to match that of source data CSVs.

    Raises:
        APIDataFetchError: when ``covidcast.signal`` does not return a
        DataFrame (e.g. no data for the request).
    """
    api_df = covidcast.signal(data_source, signal_type, start_date, end_date,
                              geo_type)
    if not isinstance(api_df, pd.DataFrame):
        raise APIDataFetchError(
            "Error fetching data from " + str(start_date) +
            " to " + str(end_date) +
            " for data source: " + data_source +
            ", signal type: " + signal_type +
            ", geo type: " + geo_type)

    # Replace None with NA to make numerical manipulation easier, rename
    # columns to match those in df_to_test, drop API-only metadata, and fix
    # the column order.
    api_df = api_df.replace(to_replace=[None], value=np.nan)
    api_df = api_df.rename(
        columns={'geo_value': "geo_id", 'stderr': 'se', 'value': 'val'})
    api_df = api_df.drop(['issue', 'lag'], axis=1)
    return api_df.reindex(
        columns=["geo_id", "val", "se", "sample_size", "time_value"])
def load_data_cov19(geo_type = 'state', start_day = date(2020, 6, 10),
                    end_day = date(2020, 6, 10)):
    """Download every listed ``indicator-combination`` case/death signal.

    Args:
        geo_type: 'state' or 'county'.
        start_day, end_day: inclusive date range passed to covidcast.signal.

    Returns:
        dict mapping signal name -> DataFrame returned by covidcast.signal.
    """
    measures = [
        'confirmed_cumulative_num', 'confirmed_cumulative_prop',
        'confirmed_incidence_num', 'confirmed_incidence_prop',
        'deaths_cumulative_num', 'deaths_cumulative_prop',
        'deaths_incidence_num',
        'confirmed_7dav_cumulative_num', 'confirmed_7dav_cumulative_prop',
        'confirmed_7dav_incidence_num', 'confirmed_7dav_incidence_prop',
        'deaths_7dav_cumulative_num', 'deaths_7dav_cumulative_prop',
        'deaths_7dav_incidence_num',
    ]
    return {
        measure: covidcast.signal('indicator-combination', measure,
                                  start_day, end_day, geo_type)
        for measure in measures
    }
def load_remote_signal_data(remote_source_name, signal_type, start_day,
                            end_day, geo_type):
    """
    This function is a caching wrapper for the covidcast signal function.

    You can force clear the cache like so:

    >>> load_remote_signal_data.clear()

    NOTE(review): the caching decorator is applied outside this block —
    confirm it is still attached at the definition site.

    Returns the signal as a DataFrame indexed by (geo_value, time_value),
    or None when the API returned no data.
    """
    remote_data = covidcast.signal(remote_source_name, signal_type,
                                   start_day, end_day, geo_type)
    # covidcast.signal returns None when no data is available; only
    # post-process a real DataFrame. (Replaces the original
    # `not isinstance(remote_data, type(None))` anti-idiom.)
    if remote_data is not None:
        remote_data["time_value"] = pd.to_datetime(remote_data["time_value"])
        remote_data["geo_value"] = remote_data["geo_value"].astype(str)
        remote_data = remote_data.set_index(
            ["geo_value", "time_value"]).sort_index()
    return remote_data
def query(self, data_source, signal, forecast_date, geo_type,
          start_date=None, geo_values="*"):
    """Query a single signal as it was known on the forecast date.

    The window runs from ``start_date`` (earliest available day when None)
    up to ``forecast_date``; ``as_of=forecast_date`` restricts the response
    to data issued by that date.
    """
    return covidcast.signal(
        data_source,
        signal,
        start_day=start_date,
        end_day=forecast_date,
        as_of=forecast_date,
        geo_type=geo_type,
        geo_values=geo_values,
    )
def fetch(dat):
    """Download one county-level COVIDcast signal for 2020-10-01..2020-12-31.

    ``dat`` selects the signal:
      1 -> safegraph part_time_work_prop: fraction of devices spending 3-6
           hours away from home during the daytime
      2 -> safegraph full_time_work_prop: fraction of devices spending more
           than 6 hours away from home during the daytime
      3 -> safegraph bars_visit_prop: daily visits to bar-related POIs per
           100,000 population
      4 -> safegraph restaurants_visit_prop: daily visits to
           restaurant-related POIs per 100,000 population
      5 -> fb-survey smoothed_hh_cmnty_cli: estimated % reporting illness in
           their local community (household included), no survey weighting
      6 -> fb-survey smoothed_worried_become_ill: estimated % very/somewhat
           worried that they or immediate family might become seriously ill
      7 -> fb-survey smoothed_cli: estimated % with COVID-like illness, no
           survey weighting
      anything else -> doctor-visits smoothed_cli
    """
    dispatch = {
        1: ("safegraph", "part_time_work_prop"),
        2: ("safegraph", "full_time_work_prop"),
        3: ("safegraph", "bars_visit_prop"),
        4: ("safegraph", "restaurants_visit_prop"),
        5: ("fb-survey", "smoothed_hh_cmnty_cli"),
        6: ("fb-survey", "smoothed_worried_become_ill"),
        7: ("fb-survey", "smoothed_cli"),
    }
    source, signal = dispatch.get(dat, ("doctor-visits", "smoothed_cli"))
    return covidcast.signal(source, signal,
                            date(2020, 10, 1), date(2020, 12, 31), "county")
def load_cmu_covidcast(data_dir='.'):
    '''
    Load in CMU COVIDcast (county-level) data set (pulled directly from source)

    Parameters
    ----------
    data_dir : str; path to the data directory to write raw cmu_covidcast.csv

    Returns
    -------
    data frame
    '''
    signal_dict = {
        # "safegraph": ["full_time_work_prop", "part_time_work_prop", "completely_home_prop", "median_home_dwell_time"],
        "fb-survey": ["smoothed_hh_cmnty_cli", "smoothed_cli"],
        "doctor-visits": ["smoothed_adj_cli"],
        "hospital-admissions": ["smoothed_adj_covid19"],
        "indicator-combination": ["nmf_day_doc_fbc_fbs_ght"]
    }
    print("Loading CMU signals:")
    signals_ls = []
    for source, signals in signal_dict.items():
        for signal in signals:
            print(source + " " + signal)
            # No start/end day given: pulls the full available history.
            frame = covidcast.signal(source, signal, geo_type="county")
            frame = frame.rename(columns={
                "geo_value": "countyFIPS",
                "time_value": "date",
                "value": source + "-" + signal
            }).drop(columns=["direction", "issue", "lag", "stderr",
                             "sample_size"])
            signals_ls.append(frame)
    # Outer-merge all signals on county and date. (Replaces the original
    # index-based `for i in range(len(...))` loop; same result.)
    raw = signals_ls[0]
    for extra in signals_ls[1:]:
        raw = pd.merge(raw, extra, on=['countyFIPS', 'date'], how="outer")
    raw = raw.sort_values("date")
    raw.to_csv(oj(data_dir, "cmu_covidcast.csv"), header=True, index=False)
    return raw
def get_covid_cast_signal(data_source, signal, start_day=None, end_day=None,
                          geo_type="county"):
    """
    Get data from covid cast.

    Parameters:
    - data_source: String identifying the data source to query, such as
      ``"fb-survey"``.
    - signal: String identifying the signal from that source to query, such
      as ``"smoothed_cli"``.
    - start_day: Query data beginning on this date. Provided as a
      ``datetime.date`` object. If ``start_day`` is ``None``, defaults to the
      first day data is available for this signal.
    - end_day: Query data up to this date, inclusive. Provided as a
      ``datetime.date`` object. If ``end_day`` is ``None``, defaults to the
      most recent day data is available for this signal.
    - geo_type: The geography type for which to request this data, such as
      ``"county"`` or ``"state"``. Available types are described in the
      COVIDcast signal documentation. Defaults to ``"county"``.

    Returns the DataFrame from ``covidcast.signal``, or ``None`` when the
    download fails (the error is logged).
    """
    # NOTE(review): the original signature used type objects as default
    # values (``data_source=str``, ``start_day=datetime.date`` ...), which
    # would pass the classes themselves into the API if arguments were
    # omitted. start_day/end_day now default to None, which covidcast treats
    # as "all available data".
    try:
        data = covidcast.signal(data_source, signal, start_day, end_day,
                                geo_type)
    # `except Exception` instead of a bare `except:` so KeyboardInterrupt /
    # SystemExit still propagate.
    except Exception:
        logger.error(
            f"Problem downloading: SOURCE={data_source}, SIGNAL={signal}",
            exc_info=True)
        return None
    logger.info(
        f"~~ Downloaded Successfully: SOURCE={data_source}, SIGNAL={signal} ~~"
    )
    return data
import covidcast
from datetime import date
from matplotlib import pyplot as plt

# Fetch the Facebook-survey COVID-like-illness signal for a single day
# (2020-08-04) at county resolution.
data = covidcast.signal("fb-survey", "smoothed_cli",
                        start_day=date(2020, 8, 4),
                        end_day=date(2020, 8, 4), geo_type="county")
# Attach geographic boundary data to the signal values.
geo_data = covidcast.get_geo_df(data)
# Keep California rows only (state FIPS "06") and reproject to
# World Mercator (EPSG:3395) before plotting.
CA = geo_data.loc[geo_data.state_fips == "06", :]
CA = CA.to_crs("EPSG:3395")
# Choropleth of the signal value; hide the axes and display the figure.
CA.plot(column="value", figsize=(5, 5), legend=True)
plt.axis("off")
plt.show()
class TestDelay:
    # This is mostly code copied from Maria's notebooks

    # Test fixtures: Florida line-list CSV and a ZIP-to-FIPS crosswalk file.
    fl_line_data = "tests/test_data/FL_line_list.csv"
    us_zip_data_path = "tests/test_data/02_20_uszips.csv"

    # Report-date minus onset-date delay in days, restricted to (0, 60).
    florida_df = pd.read_csv(fl_line_data,
                             parse_dates=["Case_", "EventDate", "ChartDate"])
    florida_delays = (florida_df.ChartDate - florida_df.EventDate).dt.days
    florida_delays = florida_delays[florida_delays.gt(0)
                                    & florida_delays.lt(60)]
    # Empirical delay distribution built from the Florida line list.
    fl_delay_dist = Delay.get_delay_distribution(florida_delays)

    start_date = datetime(2020, 4, 15)
    end_date = datetime(2020, 7, 15)

    # County-level 7-day-average incident cases over the study window
    # (fetched at class-definition time — requires network access).
    cases_df = covidcast.signal(
        'indicator-combination',
        'confirmed_7dav_incidence_num',
        start_date,
        end_date,
        geo_type='county',
    )
    # Cumulative cases on the final day, used to threshold counties below.
    cumulative_cases_df = covidcast.signal(
        'indicator-combination',
        'confirmed_7dav_cumulative_num',
        end_date,
        end_date,
        geo_type='county',
    )
    thresh_geos = cumulative_cases_df[
        cumulative_cases_df.value > 500].geo_value

    # get all florida fips codes
    geo_map = pd.read_csv(
        us_zip_data_path,
        usecols=["fips", "state_id", "population"],
        dtype={"state_id": str},
        converters={"fips": lambda x: str(x).zfill(5)},
    )
    florida_geo = geo_map[geo_map.state_id.eq("FL")]
    florida_population = florida_geo.groupby(
        "fips").population.sum().reset_index()
    florida_fips = florida_geo.fips.unique()

    cases_df = cases_df.set_index(["geo_value"])
    geos = cases_df.index.unique()
    geos = geos[geos.isin(florida_fips)]  # only keep florida geos
    geos = geos[geos.isin(thresh_geos)]  # counties with >500 cumulative cases

    def test_deconv(self):
        # Compare a hand-rolled dow-adjust + deconvolution pipeline against
        # Delay.deconv for a single county (geo "12086").
        delay_dist = self.fl_delay_dist
        geo = "12086"
        cases = self.cases_df.loc[geo].sort_values(by='time_value')
        n = cases.value.shape[0]
        train_time = cases.time_value[:]
        train_cases_ = cases[cases.time_value.isin(train_time)]
        # dow_adjust and deconvolution will both be done in Delay.deconv
        train_cases = dow_adjust_cases(train_cases_, lam=10)
        sub_infections = np.clip(
            admm_deconvolution_v2(train_cases, delay_dist, 3000, 3000,
                                  n_iters=500, k=2), 0, np.inf)
        assert np.array_equal(
            sub_infections,
            Delay.deconv(
                train_cases_,
                delay_dist)), "The two deconvolved arrays do not match"

    def test_conv(self):
        # Smoke test: convolve random input against the delay distribution.
        x = tf.random.uniform((100, 1), minval=0, maxval=100)
        Delay.conv(x, self.fl_delay_dist)
def check_source(data_source, meta, params, grace, logger):
    """Iterate over all signals from a source and check for problems.

    Possible problems:

    - Newest available data exceeds max age.
    - Gap between subsequent data points exceeds max gap.

    For example, consider a source with a max age of 5 days and max gap of 1
    day. If today is 2020-10-15, and the latest available data is from
    2020-10-09, the max age is exceeded.

    If there is no data available on 2020-10-07, but there is on 2020-10-06
    and 2020-10-08, there is a gap of 2 days and the max gap is exceeded.

    The gap window controls how much data we check for gaps -- a gap window
    of 10 days means we check the most recent 10 days of data. Defaults to 7.
    """
    source_config = params[data_source]
    gap_window = pd.Timedelta(days=source_config.get("gap_window", 7))
    max_allowed_gap = source_config.get("max_gap", 1)

    signals = meta[meta.data_source == data_source]

    now = pd.Timestamp.now()
    age_complaints = {}
    gap_complaints = {}

    for _, row in signals.iterrows():
        # Skip signals explicitly retired in the source's config.
        if "retired-signals" in source_config and \
                row["signal"] in source_config["retired-signals"]:
            continue

        # Check max age
        age = (now - row["max_time"]).days
        if age > source_config["max_age"] + grace:
            if row["signal"] not in age_complaints:
                age_complaints[row["signal"]] = Complaint(
                    "is more than {age} days old".format(age=age),
                    data_source,
                    row["signal"],
                    [row["geo_type"]],
                    row["max_time"],
                    source_config["maintainers"])
            else:
                age_complaints[row["signal"]].geo_types.append(
                    row["geo_type"])

        # Check max gap
        if max_allowed_gap == -1:
            # No gap detection for this source
            continue

        logger.info("Retrieving signal",
                    source=data_source,
                    signal=row["signal"],
                    start_day=(row["max_time"] -
                               gap_window).strftime("%Y-%m-%d"),
                    end_day=row["max_time"].strftime("%Y-%m-%d"),
                    geo_type=row["geo_type"])

        latest_data = covidcast.signal(data_source, row["signal"],
                                       start_day=row["max_time"] - gap_window,
                                       end_day=row["max_time"],
                                       geo_type=row["geo_type"])
        if latest_data is None:
            # covidcast.signal returns None when the API has no data in the
            # window; nothing to gap-check. (Fixes a TypeError crash when
            # indexing None below.)
            continue

        # convert numpy datetime values to pandas datetimes and then to
        # datetime.date, so we can work with timedeltas after
        unique_dates = [
            pd.to_datetime(val).date()
            for val in latest_data["time_value"].unique()
        ]
        gap_days = [
            (day - prev_day).days
            for day, prev_day in zip(unique_dates[1:], unique_dates[:-1])
        ]
        if not gap_days:
            # Only one distinct day of data in the window, so there are no
            # consecutive-day differences; max() on the empty list would
            # raise ValueError. (Fixes a crash.)
            continue
        gap = max(gap_days)
        if gap > max_allowed_gap:
            if row["signal"] not in gap_complaints:
                gap_complaints[row["signal"]] = Complaint(
                    "has a {gap}-day gap of missing data in its most recent "
                    "{gap_window} days of data".format(
                        gap=gap, gap_window=gap_window.days),
                    data_source,
                    row["signal"],
                    [row["geo_type"]],
                    row["max_time"],
                    source_config["maintainers"])
            else:
                gap_complaints[row["signal"]].geo_types.append(
                    row["geo_type"])

    return list(age_complaints.values()) + list(gap_complaints.values())
def check_source(data_source, meta, params, grace, logger):
    """Iterate over all signals from a source and check for problems.

    Possible problems:

    - Newest available data exceeds max age.
    - Gap between subsequent data points exceeds max gap.

    For example, consider a source with a max age of 5 days and max gap of 1
    day. If today is 2020-10-15, and the latest available data is from
    2020-10-09, the max age is exceeded.

    If there is no data available on 2020-10-07, but there is on 2020-10-06
    and 2020-10-08, there is a gap of 2 days and the max gap is exceeded.

    The gap window controls how much data we check for gaps -- a gap window
    of 10 days means we check the most recent 10 days of data. Defaults to 7.
    """
    source_config = params[data_source]
    gap_window = pd.Timedelta(days=source_config.get("gap_window", 7))
    max_allowed_gap = source_config.get("max_gap", 1)

    signals = meta[meta.data_source == data_source]

    now = pd.Timestamp.now()
    age_complaints = {}
    gap_complaints = {}

    for _, row in signals.iterrows():
        # Skip signals explicitly retired in the source's config.
        if "retired-signals" in source_config and \
                row["signal"] in source_config["retired-signals"]:
            continue

        # Pull the most recent 14 days of data from the API for this signal.
        logger.info("Retrieving signal",
                    source=data_source,
                    signal=row["signal"],
                    start_day=(datetime.now() -
                               timedelta(days = 14)).strftime("%Y-%m-%d"),
                    end_day=datetime.now().strftime("%Y-%m-%d"),
                    geo_type=row["geo_type"])
        latest_data = covidcast.signal(
            data_source,
            row["signal"],
            start_day=datetime.now() - timedelta(days = 14),
            end_day=datetime.now(),
            geo_type=row["geo_type"]
        )

        # Lag from the metadata table; overridden below with the fresher
        # API-derived value when the 14-day fetch returned data.
        current_lag_in_days = (now - row["max_time"]).days
        lag_calculated_from_api = False
        if latest_data is not None:
            unique_dates = [pd.to_datetime(val).date()
                            for val in latest_data["time_value"].unique()]
            current_lag_in_days = (datetime.now().date() -
                                   max(unique_dates)).days
            lag_calculated_from_api = True

        logger.info("Signal lag",
                    current_lag_in_days = current_lag_in_days,
                    data_source = data_source,
                    signal = row["signal"],
                    geo_type=row["geo_type"],
                    lag_calculated_from_api = lag_calculated_from_api)

        # Check max age.
        if current_lag_in_days > source_config["max_age"] + grace:
            if row["signal"] not in age_complaints:
                age_complaints[row["signal"]] = Complaint(
                    "is {current_lag_in_days} days old".format(
                        current_lag_in_days=current_lag_in_days),
                    data_source,
                    row["signal"],
                    [row["geo_type"]],
                    row["max_time"],
                    source_config["maintainers"])
            else:
                age_complaints[row["signal"]].geo_types.append(
                    row["geo_type"])

        # Check max gap
        if max_allowed_gap == -1 or latest_data is None:
            # No gap detection for this source
            continue

        # convert numpy datetime values to pandas datetimes and then to
        # datetime.date, so we can work with timedeltas after
        unique_issues = [pd.to_datetime(val).date()
                         for val in latest_data["issue"].unique()]
        gap_days = [(day - prev_day).days
                    for day, prev_day in zip(unique_dates[1:],
                                             unique_dates[:-1])]
        # If we only have a single day of data available then gap days will
        # be empty.
        if not gap_days:
            logger.info(
                "Not enough data to calculate gap days.",
                data_source=data_source,
                signal=row["signal"],
                geo_type=row["geo_type"])
            continue
        # Consecutive days differ by 1, so subtract 1 to count missing days.
        gap = max(gap_days) - 1
        logger.info("Detecting days with data present",
                    data_source = data_source,
                    signal = row["signal"],
                    geo_type=row["geo_type"],
                    most_recent_dates_with_data = [
                        x.strftime("%Y-%m-%d") for x in unique_dates],
                    gap_days = gap_days,
                    max_gap = gap,
                    issue_dates = [
                        x.strftime("%Y-%m-%d") for x in unique_issues])
        if gap > max_allowed_gap:
            if row["signal"] not in gap_complaints:
                gap_complaints[row["signal"]] = Complaint(
                    "has a {gap}-day gap of missing data in its most recent "
                    "{gap_window} days of data".format(
                        gap=gap, gap_window=gap_window.days),
                    data_source,
                    row["signal"],
                    [row["geo_type"]],
                    datetime.now(),
                    source_config["maintainers"])
            else:
                gap_complaints[row["signal"]].geo_types.append(
                    row["geo_type"])

    return list(age_complaints.values()) + list(gap_complaints.values())