Example #1
def assemble_time_series(df):
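    """Attach cumulative totals (D/T/R) to the daily deltas (dD/dT/dR), filling missing dates with zeros."""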
    ts = get_time_series(df)
    deltas = ts[schema.keys()]\
        .rename(columns = schema)
    deltas = deltas.reindex(pd.date_range(deltas.index.min(),
                                          deltas.index.max()),
                            fill_value=0)
    merged = deltas.merge(
        deltas.cumsum(axis=0).rename(columns=lambda col: col[1]),  # "dD" -> "D", etc.
        left_index=True,
        right_index=True).astype(int)
    merged.index.name = "date"
    merged.columns.name = None
    return merged
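A minimal usage sketch (hypothetical: assumes df comes from load_all_data and schema is the module-level column mapping):

schema = {"Deceased": "dD", "Hospitalized": "dT", "Recovered": "dR"}
daily = assemble_time_series(df)  # dD/dT/dR deltas alongside cumulative D/T/R, indexed by date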
Example #2
def get_state_timeseries(states=["Tamil Nadu"],
                         download: bool = False) -> pd.DataFrame:
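    """Return daily dD/dT/dR deltas by state and district, downloading source files first if requested."""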
    paths = {
        "v3": [data_path(i) for i in (1, 2)],
        "v4": [data_path(i) for i in range(3, 25)]
    }
    if download:
        for target in paths['v3'] + paths['v4']:
            download_data(data, target)
    return load_all_data(
            v3_paths = [data/filepath for filepath in paths['v3']],
            v4_paths = [data/filepath for filepath in paths['v4']])\
        .query("detected_state in @states" if states != "*" else "detected_state != 'NULL'", engine = "python")\
        .pipe(lambda _: get_time_series(_, ["detected_state", "detected_district"]))\
        .drop(columns = ["date", "time", "delta", "logdelta"])\
        .rename(columns = {
            "Deceased":     "dD",
            "Hospitalized": "dT",
            "Recovered":    "dR"
        })
Example #3
def get_state_timeseries(
    states = "*", 
    download: bool = False, 
    aggregation_cols = ["detected_state", "detected_district"], 
    last_API_file: int = 27) -> pd.DataFrame:
    """ load state- and district-level data, downloading source files if specified """
    paths = {"v3": [data_path(i) for i in (1, 2)], "v4": [data_path(i) for i in range(3, last_API_file)]}
    if download:
        for target in paths['v3'] + paths['v4']: 
            download_data(data, target)
    return load_all_data(
            v3_paths = [data/filepath for filepath in paths['v3']],
            v4_paths = [data/filepath for filepath in paths['v4']])\
        .query("detected_state in @states" if states != "*" else "detected_state != 'NULL'")\
        .pipe(lambda _: get_time_series(_, aggregation_cols))\
        .drop(columns = ["date", "time", "delta", "logdelta"])\
        .rename(columns = {
            "Deceased":     "dD",
            "Hospitalized": "dT",
            "Recovered":    "dR"
        })
Example #4
plt.set_theme("substack")
# define data versions for api files
paths = {
    "v3": [data_path(i) for i in (1, 2)],
    "v4": [data_path(i) for i in range(3, 26)]
}

# for target in paths['v3'] + paths['v4']:
#     download_data(data, target)

df = load_all_data(v3_paths=[data / filepath for filepath in paths['v3']],
                   v4_paths=[data / filepath for filepath in paths['v4']])
data_recency = str(df["date_announced"].max()).split()[0]
run_date = str(pd.Timestamp.now()).split()[0]

ts = get_time_series(df, "detected_state")

states = [
    "Maharashtra", "Punjab", "West Bengal", "Bihar", "Delhi", "Andhra Pradesh",
    "Telangana", "Tamil Nadu", "Madhya Pradesh"
]

for state in states[:1]:  # [:1] restricts this run to the first state in the list
    print(state)
    print("  + running estimation...")

    (inf_dates, inf_Rt_pred, inf_Rt_CI_upper, inf_Rt_CI_lower, inf_T_pred,
     inf_T_CI_upper, inf_T_CI_lower, inf_total_cases, inf_new_cases_ts,
     inf_anomalies, inf_anomaly_dates) = analytical_MPVS(
         ts.loc[state].Hospitalized,
         CI=CI,
         smoothing=notched_smoothing(window=smoothing),  # completed to match the call in Example #12
         totals=False)
Example #5
plt.sca(lax.twinx())
plt.plot(df["TT"][:, "delta", "confirmed"].index, smoothed(df["TT"][:, "delta", "confirmed"].values), label = "Daily Cases", color = plt.PRED_PURPLE)
plt.legend(loc = 'upper right')
plt.PlotDevice().ylabel("new cases", rotation = -90, labelpad = 50)
plt.ylim(bottom = 0)
plt.sca(lax)
plt.show()

# cases vs deaths
from pathlib import Path
data = Path("./data")
paths = {"v3": [data_path(i) for i in (1, 2)], "v4": [data_path(i) for i in range(3, 27)]}
for target in paths['v3'] + paths['v4']: 
    download_data(data, target)
df = load_all_data(
        v3_paths = [data/filepath for filepath in paths['v3']],
        v4_paths = [data/filepath for filepath in paths['v4']])\
    .pipe(lambda _: get_time_series(_, ["detected_state"]))\
    .drop(columns = ["date", "time", "delta", "logdelta"])\
    .rename(columns = {
        "Deceased":     "dD",
        "Hospitalized": "dT",
        "Recovered":    "dR"
    }).groupby(level = -1).sum().sort_index()  # .sum(level=...) was removed in recent pandas

plt.plot(df.index, smoothed(df.dD.values), label = "Daily Deaths", color = plt.RED)
plt.text(s = "national lockdown", x = pd.to_datetime("April 27, 2020"), y = 200, fontdict = plt.theme.note, ha = "center", va = "top")
plt.legend(loc = 'upper left')
plt.ylim(bottom = 0)
lax = plt.gca()
plt.sca(lax.twinx())
plt.plot(df.index, smoothed(df.dT.values), label = "Daily Cases", color = plt.PRED_PURPLE)
plt.legend(loc = 'upper right')
Example #6
CI = 0.95

paths = {
    "v3": [data_path(_) for _ in (1, 2)],
    "v4": [data_path(_) for _ in range(3, 18)]
}

for target in paths['v3'] + paths['v4']:
    download_data(data, target)

dfn = load_all_data(v3_paths=[data / filepath for filepath in paths['v3']],
                    v4_paths=[data / filepath for filepath in paths['v4']])

delay = pd.read_csv(data / "bihar_delay.csv").set_index("delay")
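# (the delay table holds the empirical reporting-delay distribution used by delay_adjust below)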

state_ts = get_time_series(dfn, "detected_state").loc["Bihar"].Hospitalized
# state_ts = delay_adjust(state_ts, np.squeeze(delay.values))
state_ts = state_ts[state_ts.index >= "2020-03-26"]
district_names, population_counts, _ = etl.district_migration_matrix(
    data / "Migration Matrix - District.csv")
populations = dict(zip(district_names, population_counts))

# first, look at state level predictions
(dates, Rt_pred, Rt_CI_upper, Rt_CI_lower, T_pred, T_CI_upper, T_CI_lower,
 total_cases, new_cases_ts, anomalies, anomaly_dates) = analytical_MPVS(
     state_ts,
     CI=CI,
     smoothing=notched_smoothing(window=smoothing),
     totals=False)

plt.Rt(dates, Rt_pred[1:], Rt_CI_upper[1:], Rt_CI_lower[1:], CI, ymin=0, ymax=3)
Example #7
    # define data versions for api files
    paths = {
        "v3": [data_path(i) for i in (1, 2)],
        "v4": [data_path(i) for i in (3, 4, 5, 6, 7, 8)]
    }

    # download data from india covid 19 api
    for target in paths['v3'] + paths['v4']:
        download_data(data, target)

    df = load_all_data(v3_paths=[data / filepath for filepath in paths['v3']],
                       v4_paths=[data / filepath for filepath in paths['v4']])
    data_recency = str(df["date_announced"].max()).split()[0]
    run_date = str(pd.Timestamp.now()).split()[0]

    ts = get_time_series(df[df.detected_state == "Delhi"])

    (dates, RR_pred, RR_CI_upper, RR_CI_lower, T_pred, T_CI_upper, T_CI_lower,
     total_cases, new_cases_ts, anomalies,
     anomaly_dates) = analytical_MPVS(ts.delta[ts.delta > 0],
                                      CI=CI,
                                      smoothing=convolution(window=smoothing))
    #= analytical_MPVS(ts.Hospitalized[ts.Hospitalized > 0], CI = CI, smoothing = lambda ts: box_filter(ts, smoothing, 10))

    np.random.seed(33)
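    # single-unit compartmental model for Delhi, initialized from the latest prevalence and Rt estimates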
    delhi = Model([
        ModelUnit("Delhi",
                  18_000_000,
                  I0=T_pred[-1],
                  RR0=RR_pred[-1],
                  mobility=0)
    ])
Example #8
        "v4": [
            "raw_data3.csv", "raw_data4.csv", "raw_data5.csv", "raw_data6.csv",
            "raw_data7.csv", "raw_data8.csv", "raw_data9.csv",
            "raw_data10.csv", "raw_data11.csv"
        ]
    }

    # download data from india covid 19 api
    for target in paths['v3'] + paths['v4']:
        download_data(data, target)

    # run rolling regressions on historical national case data
    dfn = load_all_data(v3_paths=[data / filepath for filepath in paths['v3']],
                        v4_paths=[data / filepath for filepath in paths['v4']])
    data_recency = str(dfn["date_announced"].max()).split()[0]
    tsn = get_time_series(dfn)
    grn = estimate(tsn, smoothing)

    # disaggregate down to states
    tss = get_time_series(dfn, 'detected_state').loc[states]

    grs = tss.groupby(level=0).apply(lambda ts: estimate(ts, smoothing))

    # voluntary and mandatory reproductive numbers
    Rvn = np.mean(grn["2020-03-24":"2020-03-31"].R)
    Rmn = np.mean(grn["2020-04-01":].R)

    Rvs = {
        s: np.mean(grs.loc[s].loc["2020-03-24":"2020-03-31"].R)
        if s in grs.index else Rvn
        for s in states
    }
Example #9
    "v3": [data_path(i) for i in (1, 2)],
    "v4": [data_path(i) for i in range(3, 26)]
}

for target in paths['v3'] + paths['v4']:
    try:
        download_data(data, target)
    except Exception:
        pass  # tolerate download failures and use any previously fetched files

df = load_all_data(v3_paths=[data / filepath for filepath in paths['v3']],
                   v4_paths=[data / filepath for filepath in paths['v4']])
data_recency = str(df["date_announced"].max()).split()[0]
run_date = str(pd.Timestamp.now()).split()[0]

ts = get_time_series(df, ["detected_state", "detected_district"])

focus = ts.loc[[
    "Maharashtra", "Madhya Pradesh", "Gujarat", "West Bengal", "Tamil Nadu"
]]
district_estimates = []

for (state, district) in focus.index.droplevel(-1).unique():
    if district in ["Unknown", "Other State"]:
        continue
    print(state, district)
    try:
        (dates, Rt_pred, RR_CI_upper, RR_CI_lower, T_pred, T_CI_upper,
         T_CI_lower, total_cases, new_cases_ts, anomalies,
         anomaly_dates) = analytical_MPVS(
             focus.loc[state, district].Hospitalized,
             CI=CI,
             smoothing=notched_smoothing(window=smoothing),  # completed to match Example #12
             totals=False)
    except Exception:
        continue  # assumed handling: skip districts where estimation fails
Example #10
    "v3": [data_path(i) for i in (1, 2)],
    "v4": [data_path(i) for i in range(3, 26)]
}

# for target in paths['v3'] + paths['v4']:
#     try:
#         download_data(data, target)
#     except Exception:
#         pass

df = load_all_data(v3_paths=[data / filepath for filepath in paths['v3']],
                   v4_paths=[data / filepath for filepath in paths['v4']])
data_recency = str(df["date_announced"].max()).split()[0]
run_date = str(pd.Timestamp.now()).split()[0]

ts = get_time_series(df)  #, ["detected_state", "detected_district"])

one_day = pd.Timedelta(days=1)

# fig 1

infections = ts[ts.date >= "May 01, 2020"].Hospitalized  #.sum(level = 2).sort_index()
smoothed = convolution("uniform")
scatter = plt.scatter(infections.index[:-7],
                      infections.values[:-7],
                      color="#CC4C75",
                      marker="s",
                      s=5,
                      alpha=0.5)
lineplot, = plt.plot(infections.index[:-7],
                     smoothed(infections.values[:-7]),  # assumed: the line shows the smoothed series
                     color="#CC4C75")
Example #11

# set to cloud temp directory if not explicitly told to run locally
root = cwd() if len(sys.argv) > 1 and sys.argv[1] == "--local" else Path("/tmp")
data = root / "data"

# model details
gamma = 0.2
smoothing = 10
CI = 0.95

download_data(data, 'state_wise_daily.csv')

state_df = load_statewise_data(data / "state_wise_daily.csv")
country_time_series = get_time_series(state_df)
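# with no grouping column, get_time_series aggregates to the national level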

estimates = []
timeseries = []

# country level
(dates, RR_pred, RR_CI_upper, RR_CI_lower, T_pred, T_CI_upper, T_CI_lower,
 total_cases, new_cases_ts, anomalies, anomaly_dates) = analytical_MPVS(
     country_time_series["Hospitalized"].iloc[:-1],
     CI=CI,
     smoothing=notched_smoothing(window=smoothing))

country_code = state_name_lookup["India"]
for row in zip(dates, RR_pred, RR_CI_upper, RR_CI_lower):
    timeseries.append((country_code, *row))
Example #12
    "v3": [data_path(i) for i in (1, 2)],
    "v4": [data_path(i) for i in range(3, 27)]
}

for target in paths['v3'] + paths['v4']:
    try:
        download_data(data, target)
    except Exception:
        pass

df = load_all_data(v3_paths=[data / filepath for filepath in paths['v3']],
                   v4_paths=[data / filepath for filepath in paths['v4']])
data_recency = str(df["date_announced"].max()).split()[0]
run_date = str(pd.Timestamp.now()).split()[0]

ts = get_time_series(df, "detected_state")

# full list: ["Maharashtra", "Punjab", "West Bengal", "Bihar", "Delhi", "Andhra Pradesh", "Telangana", "Tamil Nadu", "Madhya Pradesh"]
states = ["Tamil Nadu", "Karnataka"]

for state in states:
    print(state)
    print("  + running estimation...")
    (dates, Rt_pred, RR_CI_upper, RR_CI_lower, T_pred, T_CI_upper, T_CI_lower,
     total_cases, new_cases_ts, anomalies, anomaly_dates) = analytical_MPVS(
         ts.loc[state].Hospitalized,
         CI=CI,
         smoothing=notched_smoothing(window=smoothing),
         totals=False)
    estimates = pd.DataFrame(
Example #13
    b = convolve(b1, b2)
    a = convolve(a1, a2)
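    # filtfilt runs the filter forward and backward, giving a zero-phase (unshifted) notched series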
    notched = pd.Series(filtfilt(b, a, ts))
    notched.index = ts.index
    return notched


root = cwd()
data = mkdir(root / "data")
figs = mkdir(root / "figs")

###########################################################
# download latest case data
download_data(data, 'state_wise_daily.csv')
df = load_statewise_data(data / "state_wise_daily.csv")
ts = get_time_series(df, "state")

###########################################################
# load delay data
api_diff = pd.read_csv(data / "daily_diff.csv",
                       parse_dates=["status_change_date", "report_date"],
                       dayfirst=True)
delay = api_diff[(api_diff.current_status == "Hospitalized")
                 & (api_diff.report_date > "2020-08-02")].copy()
delay = delay.drop(
    columns=[col for col in delay.columns if col.startswith("Unnamed")] +
    ["rowhash"])
delay["newhash"] = delay[[
    "patient_number", "date_announced", "detected_district", "detected_state",
    "current_status", "status_change_date", "num_cases"
]].apply(lambda x: hash(tuple(x)), axis=1)
Example #14
    ywf = fft(y * w)
    xf = np.linspace(0.0, 1.0 / (2.0 * T), N // 2)
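    # plot the one-sided amplitude spectrum, dropping the DC bin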
    plt.plot(xf[1:N // 2],
             2 / N * np.abs(ywf[1:N // 2]),
             ".",
             alpha=0.7,
             label=label)


root = cwd()
data = root / "data"
figs = root / "figs"

download_data(data, 'state_wise_daily.csv')
state_df = load_statewise_data(data / "state_wise_daily.csv")
natl_time_series = get_time_series(state_df)
time_series = get_time_series(state_df, 'state')

# is there chunking in reporting?
print("checking average infection differentials...")
time_series["delta_I"] = time_series.groupby(level=0)['Hospitalized'].diff()
time_series["dow"] = time_series.index.get_level_values(1).dayofweek
plot_average_change(time_series,
                    "(All India)",
                    filename=figs / "avg_delta_I_DoW_India.png")
for state in tqdm(time_series.index.get_level_values(0).unique()):
    plot_average_change(time_series.loc[state],
                        f"({state})",
                        filename=figs / f"avg_delta_I_DoW_{state}.png")

# are anomalies falling on certain days?
Example #15
# public data 
paths = { 
    "v3": [data_path(_) for _ in (1, 2)],
    "v4": [data_path(_) for _ in range(3, 13)]
}

for target in paths['v3'] + paths['v4']:
    download_data(data, target)

dfn = load_all_data(
    v3_paths = [data/filepath for filepath in paths['v3']], 
    v4_paths = [data/filepath for filepath in paths['v4']]
)
 
state_ts = get_time_series(dfn, "detected_state").loc["Bihar"]
district_names, population_counts, _ = etl.district_migration_matrix(data/"Migration Matrix - District.csv")
populations = dict(zip(district_names, population_counts))

# first, look at state level predictions
(dates_public, RR_pred_public, RR_CI_upper_public, RR_CI_lower_public,
 T_pred_public, T_CI_upper_public, T_CI_lower_public, total_cases_public,
 new_cases_ts_public, anomalies_public, anomaly_dates_public) = analytical_MPVS(
     state_ts.Hospitalized, CI = CI, smoothing = convolution(window = smoothing))
plt.plot(dates_public, RR_pred_public, label = "Estimated $R_t$", color = "midnightblue")
plt.fill_between(dates_public, RR_CI_lower_public, RR_CI_upper_public, label = f"{100*CI}% CI", color = "midnightblue", alpha = 0.3)
plt.legend(["private data estimate", "public data estimate"])
plt.show()


np.random.seed(33)
Bihar = Model([ModelUnit("Bihar", 99_000_000, I0 = T_pred[-1], RR0 = RR_pred[-1], mobility = 0)])  # T_pred/RR_pred come from the omitted private-data estimation
Bihar.run(14, np.zeros((1,1)))
Example #16
# cutoff = None
# cutoff = "April 7, 2021"
cutoff = "April 14, 2021"

if cutoff:
    df = df[df.date_announced <= cutoff]
data_recency = str(df["date_announced"].max()).split()[0]
run_date     = str(pd.Timestamp.now()).split()[0]

ts = get_time_series(
    df[df.detected_state == "Tamil Nadu"], 
    ["detected_state", "detected_district"]
)\
.drop(columns = ["date", "time", "delta", "logdelta"])\
.rename(columns = {
            "Deceased":     "dD",
            "Hospitalized": "dT",
            "Recovered":    "dR"
}).droplevel(0)\
.drop(labels = ["Other State", "Railway Quarantine", "Airport Quarantine"])


district_estimates = []

simulation_initial_conditions = pd.read_csv(data/f"all_india_coalesced_initial_conditions{simulation_start.strftime('%b%d')}.csv")\
    .drop(columns = ["Unnamed: 0"])\
    .set_index(["state", "district"])\
    .loc["Tamil Nadu"]

def setup(district) -> Tuple[Callable[[str], SIR], pd.DataFrame]:
Example #17
sero["hr"] = sero.hom_region.map(hom_regions_numeric)

# pull down COVID 19 India data
paths = {
    "v3": [data_path(i) for i in (1, 2)],
    "v4": [data_path(i) for i in range(3, 19)]
}
# for target in paths['v3'] + paths['v4']:
#     download_data(data, target)
df = load_all_data(
        v3_paths = [data/filepath for filepath in paths['v3']],
        v4_paths = [data/filepath for filepath in paths['v4']])\
    .query("detected_state == 'Karnataka'")

# get all deaths in KA on Aug 29 by district
get_time_series(df, "detected_district")\
    .query("status_change_date <= 'Aug 29, 2020'", engine = "python")\
    .Deceased.groupby(level = 0).sum()\
    .drop("Other State")\
    .astype(int)\
    .to_csv(data/"ka_cumulative_deaths_aug29.csv")

# aggregate time series by hom_region
df["detected_region"] = df.detected_district.map(hom_regions_rev)
ka_ts = get_time_series(df.dropna(subset=["detected_region"]),
                        "detected_region").rename(columns={
                            "Deceased": "dD",
                            "Hospitalized": "dT",
                            "Recovered": "dR"
                        }).unstack(1).fillna(0).stack()

cols = ["dD", "dT", "dR"]
ka_ts_all = pd.concat([ka_ts, ka_ts[cols].cumsum().rename(columns = {col: col[1:] for col in cols})], axis = 1)\
    .drop(columns = ["date", "time", "delta", "logdelta"])