Exemplo n.º 1
0
def score_all_predictions(pred_file,
                          date,
                          model_date,
                          mse=False,
                          key='cases',
                          bin_cutoffs=None):
    """Score the predictions in ``pred_file`` for one date.

    Args:
        pred_file: path to a comma-separated prediction file. The first line
            is skipped as a header; the first column is an identifier whose
            text after the last '-' is used as the FIPS code, and the
            remaining columns are parsed as floats.
        date: date string. Only rows whose first column contains this string
            are scored; it is also passed to ``utils.process_date`` to select
            the ground-truth values.
        model_date: date string passed to ``utils.process_date`` to select
            the cumulative values handed to ``get_scores``.
        mse: forwarded unchanged to ``get_scores``.
        key: column name forwarded to ``utils.get_region_data``.
        bin_cutoffs: forwarded to ``get_scores``. ``None`` (the default)
            means ``[20, 1000]``; a fresh list is built per call so the
            default can never be mutated across calls.

    Returns:
        Whatever ``get_scores`` returns for the selected rows.
    """
    if bin_cutoffs is None:
        bin_cutoffs = [20, 1000]

    true_df = utils.get_processed_df('nyt_us_counties_daily.csv')
    cum_df = utils.get_processed_df('nyt_us_counties.csv')
    # Both dates are processed against the daily frame, as in the original.
    proc_score_date = utils.process_date(date, true_df)
    proc_model_date = utils.process_date(model_date, true_df)

    # Read everything as text first: the id column is not numeric.
    # The builtin ``str``/``float`` replace the ``np.str``/``np.float``
    # aliases, which were removed in NumPy 1.24.
    raw_pred_data = np.genfromtxt(pred_file,
                                  delimiter=',',
                                  skip_header=1,
                                  dtype=str)
    # Keep only the rows whose identifier mentions the requested date.
    date_preds = np.array([row for row in raw_pred_data if date in row[0]])
    all_fips = np.array([row[0].split('-')[-1] for row in date_preds])
    all_preds = date_preds[:, 1:].astype(float)

    # One ground-truth / cumulative lookup per FIPS, in prediction-row order.
    true_data = np.array([
        utils.get_region_data(true_df,
                              fips,
                              proc_date=proc_score_date,
                              key=key) for fips in all_fips
    ])
    cum_data = np.array([
        utils.get_region_data(cum_df,
                              fips,
                              proc_date=proc_model_date,
                              key=key) for fips in all_fips
    ])
    return get_scores(all_fips,
                      all_preds,
                      true_data,
                      cum_data,
                      mse=mse,
                      bin_cutoffs=bin_cutoffs)
Exemplo n.º 2
0
def main():
    """Entry point: load the poll columns, normalise dates, report by month."""
    # Read only the selected columns from the data file.
    wanted_columns = ['enddate', 'rawpoll_clinton', 'rawpoll_trump']
    raw_polls = utils.load_data(filename, wanted_columns)

    # Process the date-formatted data, converting it to yyyy-mm strings.
    polls_by_month = utils.process_date(raw_polls)

    # Compute the poll statistics for each month.
    utils.get_month_stats(polls_by_month)
Exemplo n.º 3
0
 def search_more(self, dateInput='2020-10-04'):
     """Return the wallet rows whose dateInput matches the given date.

     The argument is passed through process_date() and bound as the single
     query parameter; the rows come back as the result of pd.read_sql on
     self.conn.
     """
     return pd.read_sql(
         "select * from wallet where dateInput=?",
         self.conn,
         params=(process_date(dateInput),),
     )
Exemplo n.º 4
0
 def search_range(self, start, end):
     """Return the wallet rows with dateInput between start and end, inclusive.

     Both bounds are passed through process_date() before being bound as
     query parameters; the rows are ordered by dateInput and come back as
     the result of pd.read_sql on self.conn.
     """
     return pd.read_sql(
         "select * from wallet where dateInput>=? and dateInput<=? order by dateInput",
         self.conn,
         params=(process_date(start), process_date(end)),
     )
Exemplo n.º 5
0
def make_erf_point_predictions(df,
                               county_fips,
                               key='deaths',
                               last_date_pred='2020-06-30',
                               start_date='2020-03-31',
                               boundary_date=None,
                               do_diff=True):
    '''
    Fit a curve (erf_curve, or lin_curve when data is sparse) to one county's
    series and return the fitted/predicted values over a date range.

    df: main nyt data frame
    county_fips: fips code of the county to be fit
    key: 'deaths' for COVID-19 deaths, 'cases' for COVID-19 confirmed cases
    last_date_pred: last day to make predictions for. If 'None', the latest
        date_processed among the rows used for fitting is used
    start_date: first date to list fitted values for. If 'None', the earliest
        date_processed in this county's data is used. If do_diff is True,
        this should be one day before the first day you want difference values for
    boundary_date: date at which to cut off data used for fitting
    do_diff: if true, report the daily increase in cases/deaths rather than cumulative values

    Returns np.diff(run_model(...)) when do_diff is true, otherwise the
    run_model(...) output itself. When the county has no rows, no rows before
    the boundary, or an all-zero series, np.zeros(num_days) is returned
    without fitting.

    NOTE(review): num_days is computed from last_date_pred and start_date
    before the None handling below, so passing None for either reaches
    utils.process_date directly -- confirm that helper accepts None.
    NOTE(review): the zero-filled returns have length num_days, which equals
    len(t) - 1; that presumably matches the do_diff=True output but may be one
    shorter than the do_diff=False output -- confirm against run_model.
    '''
    # Length of the zero-filled fallback: processed last date minus processed
    # start date.
    num_days = int(
        utils.process_date(last_date_pred, df) -
        utils.process_date(start_date, df))
    data = utils.get_region_data(df, county_fips)
    if len(data) == 0:  # If there's no data for this FIPS, just return zeroes
        return np.zeros(num_days)
    first_date_obv_proc = np.min(data['date_processed'].values)
    # Slice end for the fitting data: offset of boundary_date from the first
    # observed date, plus one. A None boundary keeps every row.
    # NOTE(review): this is used as a positional slice, so it presumably
    # assumes one row per consecutive day -- TODO confirm.
    boundary = None if boundary_date is None else int(
        utils.process_date(boundary_date, df) - first_date_obv_proc + 1)

    x = data['date_processed'].values[:boundary]
    if len(x) == 0:  # If no rows remain after the boundary cut, just return zeroes
        return np.zeros(num_days)
    # Resolve the first date of the output range.
    if start_date is None:
        start_date_proc = first_date_obv_proc
    else:
        start_date_proc = utils.process_date(start_date, df)
    # Resolve the last date of the output range; the None fallback is the
    # latest date among the rows kept for fitting.
    last_date_obv_proc = np.max(x)
    if last_date_pred is None:
        last_date_pred_proc = last_date_obv_proc
    else:
        last_date_pred_proc = utils.process_date(last_date_pred, df)

    y = data[key].values[:boundary]
    if np.max(
            y
    ) == 0:  # If all data we have for this FIPS is zeroes, just return zeroes
        return np.zeros(num_days)
    thresh_y = y[y >= 10]  # Isolate all days with at least 10 cases/deaths
    # If we have fewer than 5 days with substantial numbers of cases/deaths there isn't enough information to do an
    # erf fit, so just do a simple linear fit instead
    do_lin_model = len(thresh_y) < 5
    if do_lin_model:
        fit_func = lin_curve
        # Perform a linear fit on the latest 5 days of data
        fit_x, fit_y = x[-5:], y[-5:]
        # Pad with zeroes if we have fewer than 5 days of data
        if len(fit_x) < 5:
            fit_x = np.concatenate((np.zeros(5 - len(fit_x)), fit_x))
            fit_y = np.concatenate((np.zeros(5 - len(fit_y)), fit_y))
        # Initial guess; the parameter order follows the bounds below
        # (slope first, then intercept).
        fit_params0 = [0, 0]
        # The slope should be at least 0 and at most the largest 1-day increase
        # The intercept can be very low but shouldn't be above the minimum data value
        fit_bounds = [[0, -100 * np.max(y)],
                      [max(1, np.max(np.diff(fit_y))),
                       np.min(y)]]
    else:
        fit_func = erf_curve
        fit_x, fit_y = x, y
        # NOTE(review): the initial guess and bounds below use data[key] (all
        # rows), not the boundary-truncated y, so values after boundary_date
        # influence them -- confirm this is intended.
        fit_params0 = [np.log10(2 * np.max(data[key])), 0.1, 30]
        # The max value should be between the current max and 100x the current max
        # The slope was given a wide range around common values
        # The infection shouldn't peak before the data started or after the end of ~July
        # zip(*...) turns the per-parameter [low, high] pairs into the
        # (all lows, all highs) layout passed to curve_fit.
        fit_bounds = [
            bnd for bnd in zip(*[[
                np.log10(np.max(data[key])),
                np.log10(100 * np.max(data[key]))
            ], [0.001, 10], [0, 200]])
        ]
    # Use scipy to fit either a linear or erf model to the data
    popt, pcov = curve_fit(fit_func,
                           fit_x,
                           fit_y,
                           p0=fit_params0,
                           bounds=fit_bounds)
    # Evaluate the fitted model on every processed date in the inclusive
    # range [start_date_proc, last_date_pred_proc].
    t = np.arange(start_date_proc, last_date_pred_proc + 1)
    if do_diff:
        # Successive differences of the model output (one fewer element).
        return np.diff(run_model(fit_func, popt, t))
    return run_model(fit_func, popt, t)