Example #1
import covidcast
from datetime import date


def generate():
    possible_factors = [["safegraph", "part_time_work_prop"],
                        ["fb-survey", "smoothed_cli"],
                        ["fb-survey", "smoothed_hh_cmnty_cli"],
                        ["doctor-visits", "smoothed_adj_cli"],
                        [
                            "indicator-combination",
                            "confirmed_7dav_incidence_num"
                        ]]
    x = []        # fetched signal DataFrames
    dt_temp = []  # per-signal lag in days, passed to aggregate_signals
    for source, signal in possible_factors:
        temp = covidcast.signal(source,
                                signal,
                                date(2020, 11, 1),
                                date(2020, 11, 30),
                                geo_type="state")
        x.append(temp)
        dt_temp.append(3)

    x.append(
        covidcast.signal("indicator-combination",
                         "confirmed_incidence_num",
                         date(2020, 11, 1),
                         date(2020, 11, 30),
                         geo_type="state"))
    dt_temp.append(0)
    df = covidcast.aggregate_signals(x, dt=dt_temp)
    return df
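A usage sketch (assuming the covidcast client is installed and the Epidata API is reachable): aggregate_signals joins the fetched frames on geography and date, and each entry of dt shifts the matching signal in time by that many days, so here the five predictors are offset by three days relative to the unlagged case counts.

# Usage sketch: fetch, combine, and preview the aggregated frame.
df = generate()
print(df.head())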
Example #2
import covidcast
from datetime import date


def fetch(dat):

    # safegraph: The number of daily visits made by those with SafeGraph’s apps to bar-related POIs in a certain region, per 100,000 population
    if dat == 3:
        data1 = covidcast.signal("safegraph", "bars_visit_prop",
                                 date(2020, 10, 1), date(2020, 12, 31),
                                 "county")
    # safegraph: The number of daily visits made by those with SafeGraph’s apps to restaurant-related POIs in a certain region, per 100,000 population
    elif dat == 4:
        data1 = covidcast.signal("safegraph", "restaurants_visit_prop",
                                 date(2020, 10, 1), date(2020, 12, 31),
                                 "county")

    # fb-survey: Estimated percentage of people reporting illness in their local community, including their household, with no survey weighting
    elif dat == 5:
        data1 = covidcast.signal("fb-survey", "smoothed_hh_cmnty_cli",
                                 date(2020, 10, 1), date(2020, 12, 31),
                                 "county")

    else:  # dat == 6
        # fb-survey: Estimated percentage of respondents who reported feeling very or somewhat worried that “you or someone in your immediate family might become seriously ill from COVID-19”
        data1 = covidcast.signal("fb-survey", "smoothed_worried_become_ill",
                                 date(2020, 10, 1), date(2020, 12, 31),
                                 "county")
    return data1
Example #3
import covidcast
from datetime import datetime


def get_covidcast_data():
    mask_since_date = datetime.strptime("2020-10-09", '%Y-%m-%d')
    vaccine_since_date = datetime.strptime("2020-12-27", '%Y-%m-%d')
    mask_until_date = datetime.strptime(MASK_DATE, '%Y-%m-%d')
    vaccine_until_date = datetime.strptime(VACCINE_DATE, '%Y-%m-%d')

    mask_state = covidcast.signal("fb-survey", "smoothed_wearing_mask",
                                  mask_since_date, mask_until_date, "state")
    vaccine_state = covidcast.signal("fb-survey",
                                     "smoothed_accept_covid_vaccine",
                                     vaccine_since_date, vaccine_until_date,
                                     "state")

    get_national_data(mask_state, "mask")
    get_national_data(vaccine_state, "vaccine")
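The snippet leans on module-level constants and a helper defined elsewhere in its project; the hypothetical stubs below only make those dependencies explicit (the names match the snippet, but the values and behavior are illustrative, not from the original project):

# Hypothetical stand-ins for the project-level dependencies.
MASK_DATE = "2021-01-31"     # illustrative end date for the mask signal
VACCINE_DATE = "2021-01-31"  # illustrative end date for the vaccine signal

def get_national_data(state_df, label):
    # Placeholder: the original presumably aggregates the state-level
    # frame into a national series and stores or plots it.
    print(label, state_df.shape)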
Example #4
import covidcast
import numpy as np
import pandas as pd


# APIDataFetchError is an exception class defined in the surrounding package.
def fetch_api_reference(data_source, start_date, end_date, geo_type,
                        signal_type):
    """
    Get and process API data for use as a reference. Formatting is changed
    to match that of source data CSVs.
    """
    api_df = covidcast.signal(data_source, signal_type, start_date, end_date,
                              geo_type)

    if not isinstance(api_df, pd.DataFrame):
        custom_msg = "Error fetching data from " + str(start_date) + \
                     " to " + str(end_date) + \
                     " for data source: " + data_source + \
                     ", signal type: " + signal_type + \
                     ", geo type: " + geo_type

        raise APIDataFetchError(custom_msg)

    column_names = ["geo_id", "val", "se", "sample_size", "time_value"]

    # Replace None with NA to make numerical manipulation easier.
    # Rename and reorder columns to match those in df_to_test.
    api_df = (api_df
              .replace(to_replace=[None], value=np.nan)
              .rename(columns={"geo_value": "geo_id",
                               "stderr": "se",
                               "value": "val"})
              .drop(["issue", "lag"], axis=1)
              .reindex(columns=column_names))

    return api_df
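A usage sketch for the error path (APIDataFetchError comes from the surrounding package; the source/signal pair here is illustrative):

from datetime import date

try:
    # APIDataFetchError is importable from the surrounding package.
    ref = fetch_api_reference("fb-survey", date(2020, 11, 1),
                              date(2020, 11, 7), "state", "smoothed_cli")
except APIDataFetchError as err:
    print("reference fetch failed:", err)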
Example #5
import covidcast
from datetime import date


def load_data_cov19(geo_type='state', start_day=date(2020, 6, 10),
                    end_day=date(2020, 6, 10)):
    # geo_type = 'state' or 'county'
    all_measures = [
        'confirmed_cumulative_num', 'confirmed_cumulative_prop',
        'confirmed_incidence_num', 'confirmed_incidence_prop',
        'deaths_cumulative_num', 'deaths_cumulative_prop',
        'deaths_incidence_num',
        'confirmed_7dav_cumulative_num', 'confirmed_7dav_cumulative_prop',
        'confirmed_7dav_incidence_num', 'confirmed_7dav_incidence_prop',
        'deaths_7dav_cumulative_num', 'deaths_7dav_cumulative_prop',
        'deaths_7dav_incidence_num',
    ]
    
    data_source = 'indicator-combination'
    covid_data = {}
    for measure in all_measures:
        measure_data = covidcast.signal(data_source, measure, start_day, end_day, geo_type)
        covid_data[measure] = measure_data
    
    return covid_data
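A usage sketch (network access required; the returned dictionary is keyed by the entries of all_measures):

covid = load_data_cov19(geo_type='state')
print(covid['confirmed_7dav_incidence_num'].head())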
Example #6
import covidcast
import pandas as pd


def load_remote_signal_data(remote_source_name, signal_type, start_day,
                            end_day, geo_type):
    """
    This function is a caching wrapper for the covidcast signal function. You can force clear the cache like so:
    >>> load_remote_signal_data.clear()
    """
    remote_data = covidcast.signal(remote_source_name, signal_type, start_day,
                                   end_day, geo_type)
    if remote_data is not None:
        remote_data["time_value"] = pd.to_datetime(remote_data["time_value"])
        remote_data["geo_value"] = remote_data["geo_value"].astype(str)
        remote_data = remote_data.set_index(["geo_value",
                                             "time_value"]).sort_index()

    return remote_data
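The docstring refers to a cache and a .clear() method, but the decorator is not part of the snippet. One standard way to get exactly that interface is joblib's Memory.cache; a minimal sketch under that assumption (the original project may well use a different cache):

from joblib import Memory

memory = Memory("./covidcast_cache", verbose=0)

# Wrapping the fetcher memoizes it on disk and gives it the .clear()
# method the docstring mentions. (Assumed setup, not from the original.)
load_remote_signal_data = memory.cache(load_remote_signal_data)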
Example #7
def query(self,
          data_source,
          signal,
          forecast_date,
          geo_type,
          start_date=None,
          geo_values="*"):
    """Query a single signal as of the forecast date."""
    sig = covidcast.signal(data_source,
                           signal,
                           start_day=start_date,
                           end_day=forecast_date,
                           as_of=forecast_date,
                           geo_type=geo_type,
                           geo_values=geo_values)
    return sig
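The notable argument is as_of=forecast_date: the API returns the signal as it was published on that date, so a backtest built on this query cannot peek at later revisions. A usage sketch, with client standing in for a hypothetical instance of the class that defines query:

from datetime import date

# `client` is a hypothetical instance of the class defining query().
sig = client.query("doctor-visits", "smoothed_adj_cli",
                   forecast_date=date(2020, 12, 1), geo_type="state")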
Example #8
import covidcast
from datetime import date


def fetch(dat):
    if dat == 1:
        # safegraph: The fraction of devices that spent between 3 and 6 hours at a location other than their home during the daytime
        data1 = covidcast.signal("safegraph", "part_time_work_prop",
                                 date(2020, 10, 1), date(2020, 12, 31),
                                 "county")
    # safegraph: The fraction of mobile devices that spent more than 6 hours at a location other than their home during the daytime
    elif dat == 2:
        data1 = covidcast.signal("safegraph", "full_time_work_prop",
                                 date(2020, 10, 1), date(2020, 12, 31),
                                 "county")
    # safegraph: The number of daily visits made by those with SafeGraph’s apps to bar-related POIs in a certain region, per 100,000 population
    elif dat == 3:
        data1 = covidcast.signal("safegraph", "bars_visit_prop",
                                 date(2020, 10, 1), date(2020, 12, 31),
                                 "county")
    # safegraph: The number of daily visits made by those with SafeGraph’s apps to restaurant-related POIs in a certain region, per 100,000 population
    elif dat == 4:
        data1 = covidcast.signal("safegraph", "restaurants_visit_prop",
                                 date(2020, 10, 1), date(2020, 12, 31),
                                 "county")

    # fb-survey: Estimated percentage of people reporting illness in their local community, including their household, with no survey weighting
    elif dat == 5:
        data1 = covidcast.signal("fb-survey", "smoothed_hh_cmnty_cli",
                                 date(2020, 10, 1), date(2020, 12, 31),
                                 "county")

    # fb-survey: Estimated percentage of respondents who reported feeling very or somewhat worried that “you or someone in your immediate family might become seriously ill from COVID-19”
    elif dat == 6:
        data1 = covidcast.signal("fb-survey", "smoothed_worried_become_ill",
                                 date(2020, 10, 1), date(2020, 12, 31),
                                 "county")

    # fb-survey: Estimated percentage of people with COVID-like illness, with no survey weighting
    elif dat == 7:
        data1 = covidcast.signal("fb-survey",
                                 "smoothed_cli", date(2020, 10, 1),
                                 date(2020, 12, 31), "county")

    else:
        data1 = covidcast.signal("doctor-visits", "smoothed_cli",
                                 date(2020, 10, 1), date(2020, 12, 31),
                                 "county")
    return data1
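A table-driven alternative keeps the code-to-signal mapping in one place; the sketch below mirrors the behavior above, with any unlisted code falling through to doctor-visits just like the else branch:

import covidcast
from datetime import date

SIGNALS = {
    1: ("safegraph", "part_time_work_prop"),
    2: ("safegraph", "full_time_work_prop"),
    3: ("safegraph", "bars_visit_prop"),
    4: ("safegraph", "restaurants_visit_prop"),
    5: ("fb-survey", "smoothed_hh_cmnty_cli"),
    6: ("fb-survey", "smoothed_worried_become_ill"),
    7: ("fb-survey", "smoothed_cli"),
}

def fetch_by_code(dat):
    # Unlisted codes fall back to doctor-visits, matching the else branch.
    source, signal = SIGNALS.get(dat, ("doctor-visits", "smoothed_cli"))
    return covidcast.signal(source, signal, date(2020, 10, 1),
                            date(2020, 12, 31), "county")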
Example #9
import covidcast
import pandas as pd
from os.path import join as oj  # this project's alias for os.path.join


def load_cmu_covidcast(data_dir='.'):
    ''' Load in CMU COVIDcast (county-level) data set (pulled directly from source)
    
    Parameters
    ----------
    data_dir : str; path to the data directory to write raw cmu_covidcast.csv
    
    Returns
    -------
    data frame
    '''
    
    signals_ls = []
    
    signal_dict = {
#         "safegraph": ["full_time_work_prop", "part_time_work_prop", "completely_home_prop", "median_home_dwell_time"],
        "fb-survey": ["smoothed_hh_cmnty_cli", "smoothed_cli"],
        "doctor-visits": ["smoothed_adj_cli"],
        "hospital-admissions": ["smoothed_adj_covid19"],
        "indicator-combination": ["nmf_day_doc_fbc_fbs_ght"]
    }
    
    print("Loading CMU signals:")
    for source, signals in signal_dict.items():
        for signal in signals:
            print(source + " " + signal)
            df = covidcast.signal(source, signal, geo_type="county")
            df = df.rename(columns={
                "geo_value": "countyFIPS",
                "time_value": "date",
                "value": source + "-" + signal,
            }).drop(columns=["direction", "issue", "lag", "stderr",
                             "sample_size"])
            signals_ls.append(df)
    
    for i in range(len(signals_ls)):
        if i == 0:
            raw = signals_ls[i]
        else:
            raw = pd.merge(raw, signals_ls[i], on=['countyFIPS', 'date'], how="outer")  # merge data
    raw = raw.sort_values("date")
    raw.to_csv(oj(data_dir, "cmu_covidcast.csv"), header=True, index=False)

    return raw
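The incremental merge loop is a fold over the list of frames; an equivalent sketch with functools.reduce:

from functools import reduce

import pandas as pd

# Outer-join every signal frame on county and date, left to right.
raw = reduce(
    lambda left, right: pd.merge(left, right, on=["countyFIPS", "date"],
                                 how="outer"),
    signals_ls)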
Example #10
import datetime
import logging

import covidcast

logger = logging.getLogger(__name__)  # stand-in; the original module configures its own logger


def get_covid_cast_signal(data_source: str,
                          signal: str,
                          start_day: datetime.date = None,
                          end_day: datetime.date = None,
                          geo_type: str = "county"):
    """
    Get data from COVIDcast.

    Parameters:
    - data_source: String identifying the data source to query, such 
        as ``"fb-survey"``.
    - signal: String identifying the signal from that source to query,
        such as ``"smoothed_cli"``.
    - start_day: Query data beginning on this date. Provided as a
        ``datetime.date`` object. If ``start_day`` is ``None``, defaults
        to the first day data is available for this signal.
    - end_day: Query data up to this date, inclusive. Provided as a
        ``datetime.date`` object. If ``end_day`` is ``None``, defaults
        to the most recent day data is available for this signal.
    - geo_type: The geography type for which to request this data, such as
        ``"county"`` or ``"state"``. Available types are described in the
        COVIDcast signal documentation. Defaults to ``"county"``.

    """
    try:
        data = covidcast.signal(data_source, signal, start_day, end_day,
                                geo_type)
        logger.info(
            f"~~ Downloaded Successfully: SOURCE={data_source}, SIGNAL={signal} ~~"
        )
        return data

    except Exception:
        logger.error(
            f"Problem downloading: SOURCE={data_source}, SIGNAL={signal}",
            exc_info=True)
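A usage sketch (network access required; on a failed download the exception is logged and the function falls through to return None):

from datetime import date

df = get_covid_cast_signal("fb-survey", "smoothed_cli",
                           date(2020, 11, 1), date(2020, 11, 7))
if df is not None:
    print(df.head())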
Example #11
import covidcast
from datetime import date
from matplotlib import pyplot as plt
data = covidcast.signal("fb-survey",
                        "smoothed_cli",
                        start_day=date(2020, 8, 4),
                        end_day=date(2020, 8, 4),
                        geo_type="county")
geo_data = covidcast.get_geo_df(data)
CA = geo_data.loc[geo_data.state_fips == "06", :]
CA = CA.to_crs("EPSG:3395")
CA.plot(column="value", figsize=(5, 5), legend=True)
plt.axis("off")
plt.show()
Example #12
import covidcast
import numpy as np
import pandas as pd
import tensorflow as tf
from datetime import datetime

# Delay, dow_adjust_cases, and admm_deconvolution_v2 come from the
# project under test.


class TestDelay:
    # This is mostly code copied from Maria's notebooks.
    fl_line_data = "tests/test_data/FL_line_list.csv"
    us_zip_data_path = "tests/test_data/02_20_uszips.csv"

    florida_df = pd.read_csv(fl_line_data,
                             parse_dates=["Case_", "EventDate", "ChartDate"])
    florida_delays = (florida_df.ChartDate - florida_df.EventDate).dt.days
    florida_delays = florida_delays[florida_delays.gt(0)
                                    & florida_delays.lt(60)]
    fl_delay_dist = Delay.get_delay_distribution(florida_delays)
    start_date = datetime(2020, 4, 15)
    end_date = datetime(2020, 7, 15)

    cases_df = covidcast.signal(
        'indicator-combination',
        'confirmed_7dav_incidence_num',
        start_date,
        end_date,
        geo_type='county',
    )

    cumulative_cases_df = covidcast.signal(
        'indicator-combination',
        'confirmed_7dav_cumulative_num',
        end_date,
        end_date,
        geo_type='county',
    )

    thresh_geos = cumulative_cases_df[
        cumulative_cases_df.value > 500].geo_value

    # get all florida fips codes
    geo_map = pd.read_csv(
        us_zip_data_path,
        usecols=["fips", "state_id", "population"],
        dtype={"state_id": str},
        converters={"fips": lambda x: str(x).zfill(5)},
    )

    florida_geo = geo_map[geo_map.state_id.eq("FL")]
    florida_population = florida_geo.groupby(
        "fips").population.sum().reset_index()
    florida_fips = florida_geo.fips.unique()

    cases_df = cases_df.set_index(["geo_value"])
    geos = cases_df.index.unique()
    geos = geos[geos.isin(florida_fips)]  # only keep florida geos
    geos = geos[geos.isin(thresh_geos)]  # counties with >500 cumulative cases

    def test_deconv(self):
        delay_dist = self.fl_delay_dist

        geo = "12086"
        cases = self.cases_df.loc[geo].sort_values(by='time_value')
        n = cases.value.shape[0]
        train_time = cases.time_value[:]
        train_cases_ = cases[cases.time_value.isin(train_time)]

        # dow_adjust and deconvolution will both be done in Delay.deconv
        train_cases = dow_adjust_cases(train_cases_, lam=10)
        sub_infections = np.clip(
            admm_deconvolution_v2(train_cases,
                                  delay_dist,
                                  3000,
                                  3000,
                                  n_iters=500,
                                  k=2), 0, np.inf)

        assert np.array_equal(
            sub_infections, Delay.deconv(
                train_cases_,
                delay_dist)), "The two deconvolved arrays do not match"

    def test_conv(self):
        x = tf.random.uniform((100, 1), minval=0, maxval=100)
        Delay.conv(x, self.fl_delay_dist)
Example #13
import covidcast
import pandas as pd

# Complaint is a record type defined in the surrounding package.


def check_source(data_source, meta, params, grace, logger):
    """Iterate over all signals from a source and check for problems.

    Possible problems:

    - Newest available data exceeds max age.
    - Gap between subsequent data points exceeds max gap.

    For example, consider a source with a max age of 5 days and max gap of 1
    day. If today is 2020-10-15, and the latest available data is from
    2020-10-09, the max age is exceeded. If there is no data available on
    2020-10-07, but there is on 2020-10-06 and 2020-10-08, there is a gap of 2
    days and the max gap is exceeded.

    The gap window controls how much data we check for gaps -- a gap window of
    10 days means we check the most recent 10 days of data. Defaults to 7.

    """

    source_config = params[data_source]
    gap_window = pd.Timedelta(days=source_config.get("gap_window", 7))
    max_allowed_gap = source_config.get("max_gap", 1)

    signals = meta[meta.data_source == data_source]

    now = pd.Timestamp.now()

    age_complaints = {}
    gap_complaints = {}

    for _, row in signals.iterrows():
        if "retired-signals" in source_config and \
           row["signal"] in source_config["retired-signals"]:
            continue

        # Check max age
        age = (now - row["max_time"]).days

        if age > source_config["max_age"] + grace:
            if row["signal"] not in age_complaints:
                age_complaints[row["signal"]] = Complaint(
                    "is more than {age} days old".format(age=age), data_source,
                    row["signal"], [row["geo_type"]], row["max_time"],
                    source_config["maintainers"])
            else:
                age_complaints[row["signal"]].geo_types.append(row["geo_type"])

        # Check max gap
        if max_allowed_gap == -1:
            # No gap detection for this source
            continue

        logger.info("Retrieving signal",
                    source=data_source,
                    signal=row["signal"],
                    start_day=(row["max_time"] -
                               gap_window).strftime("%Y-%m-%d"),
                    end_day=row["max_time"].strftime("%Y-%m-%d"),
                    geo_type=row["geo_type"])

        latest_data = covidcast.signal(data_source,
                                       row["signal"],
                                       start_day=row["max_time"] - gap_window,
                                       end_day=row["max_time"],
                                       geo_type=row["geo_type"])

        # convert numpy datetime values to pandas datetimes and then to
        # datetime.date, so we can work with timedeltas after
        unique_dates = [
            pd.to_datetime(val).date()
            for val in latest_data["time_value"].unique()
        ]

        gap_days = [
            (day - prev_day).days
            for day, prev_day in zip(unique_dates[1:], unique_dates[:-1])
        ]
        if not gap_days:
            # Only one day of data; there is nothing to measure a gap against.
            continue
        gap = max(gap_days)

        if gap > max_allowed_gap:
            if row["signal"] not in gap_complaints:
                gap_complaints[row["signal"]] = Complaint(
                    "has a {gap}-day gap of missing data in its most recent "
                    "{gap_window} days of data".format(
                        gap=gap, gap_window=gap_window.days), data_source,
                    row["signal"], [row["geo_type"]], row["max_time"],
                    source_config["maintainers"])
            else:
                gap_complaints[row["signal"]].geo_types.append(row["geo_type"])

    return list(age_complaints.values()) + list(gap_complaints.values())
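The gap rule from the docstring is easy to check in isolation; this self-contained sketch reproduces the worked example, where data on 2020-10-06 and 2020-10-08 but not 2020-10-07 shows up as a 2-day step:

from datetime import date

observed = [date(2020, 10, 6), date(2020, 10, 8), date(2020, 10, 9)]
gap_days = [(day - prev).days
            for day, prev in zip(observed[1:], observed[:-1])]
print(max(gap_days))  # 2 -- exceeds a max gap of 1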
Example #14
import covidcast
import pandas as pd
from datetime import datetime, timedelta

# Complaint is a record type defined in the surrounding package.


def check_source(data_source, meta, params, grace, logger):
    """Iterate over all signals from a source and check for problems.

    Possible problems:

    - Newest available data exceeds max age.
    - Gap between subsequent data points exceeds max gap.

    For example, consider a source with a max age of 5 days and max gap of 1
    day. If today is 2020-10-15, and the latest available data is from
    2020-10-09, the max age is exceeded. If there is no data available on
    2020-10-07, but there is on 2020-10-06 and 2020-10-08, there is a gap of 2
    days and the max gap is exceeded.

    The gap window controls how much data we check for gaps -- a gap window of
    10 days means we check the most recent 10 days of data. Defaults to 7.

    """

    source_config = params[data_source]
    gap_window = pd.Timedelta(days=source_config.get("gap_window", 7))
    max_allowed_gap = source_config.get("max_gap", 1)

    signals = meta[meta.data_source == data_source]

    now = pd.Timestamp.now()

    age_complaints = {}
    gap_complaints = {}

    for _, row in signals.iterrows():
        if "retired-signals" in source_config and \
           row["signal"] in source_config["retired-signals"]:
            continue

        logger.info("Retrieving signal",
            source=data_source,
            signal=row["signal"],
            start_day=(datetime.now() - timedelta(days = 14)).strftime("%Y-%m-%d"),
            end_day=datetime.now().strftime("%Y-%m-%d"),
            geo_type=row["geo_type"])

        latest_data = covidcast.signal(
            data_source, row["signal"],
            start_day=datetime.now() - timedelta(days = 14),
            end_day=datetime.now(),
            geo_type=row["geo_type"]
        )

        current_lag_in_days = (now - row["max_time"]).days
        lag_calculated_from_api = False

        if latest_data is not None:
            unique_dates = [pd.to_datetime(val).date()
                            for val in latest_data["time_value"].unique()]
            current_lag_in_days = (datetime.now().date() - max(unique_dates)).days
            lag_calculated_from_api = True

        logger.info("Signal lag",
                    current_lag_in_days = current_lag_in_days,
                    data_source = data_source,
                    signal = row["signal"],
                    geo_type=row["geo_type"],
                    lag_calculated_from_api = lag_calculated_from_api)

        if current_lag_in_days > source_config["max_age"] + grace:
            if row["signal"] not in age_complaints:
                age_complaints[row["signal"]] = Complaint(
                    "is {current_lag_in_days} days old".format(current_lag_in_days=current_lag_in_days),
                    data_source,
                    row["signal"],
                    [row["geo_type"]],
                    row["max_time"],
                    source_config["maintainers"])
            else:
                age_complaints[row["signal"]].geo_types.append(row["geo_type"])

        # Check max gap
        if max_allowed_gap == -1 or latest_data is None:
            # No gap detection for this source
            continue

        # convert numpy datetime values to pandas datetimes and then to
        # datetime.date, so we can work with timedeltas after
        unique_issues = [pd.to_datetime(val).date()
                         for val in latest_data["issue"].unique()]

        gap_days = [(day - prev_day).days
                    for day, prev_day in zip(unique_dates[1:],
                                             unique_dates[:-1])]

        # If we only have a single day of data available then gap days will be
        # empty.
        if not gap_days:
            logger.info(
                "Not enough data to calculate gap days.",
                data_source=data_source,
                signal=row["signal"],
                geo_type=row["geo_type"])
            continue

        gap = max(gap_days) - 1
        logger.info("Detecting days with data present",
                    data_source = data_source,
                    signal = row["signal"],
                    geo_type=row["geo_type"],
                    most_recent_dates_with_data = [x.strftime("%Y-%m-%d") for x in unique_dates],
                    gap_days = gap_days,
                    max_gap = gap,
                    issue_dates = [x.strftime("%Y-%m-%d") for x in unique_issues])

        if gap > max_allowed_gap:
            if row["signal"] not in gap_complaints:
                gap_complaints[row["signal"]] = Complaint(
                    "has a {gap}-day gap of missing data in its most recent "
                    "{gap_window} days of data".format(gap=gap, gap_window=gap_window.days),
                    data_source,
                    row["signal"],
                    [row["geo_type"]],
                    datetime.now(),
                    source_config["maintainers"])
            else:
                gap_complaints[row["signal"]].geo_types.append(row["geo_type"])

    return list(age_complaints.values()) + list(gap_complaints.values())
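Note the two conventions: this revision reports max(gap_days) - 1 (days actually missing between observations) where Example #13 reported the raw step size. Its other addition, computing lag from the API response rather than from metadata, reduces to a one-liner; a sketch on fixed dates:

from datetime import date, datetime

# Lag = days between today and the newest time_value in the response.
unique_dates = [date(2020, 10, 8), date(2020, 10, 9)]
current_lag_in_days = (datetime.now().date() - max(unique_dates)).days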