Exemplo n.º 1
0
def predict_ensembles(ensembles: List[api.Ensemble], dataset, run_id,
                      n_cores: int) -> None:
    """Predict for all ensembles in parallel and store the result on disk.

    One ``predict_ensemble`` job is submitted per ensemble. Each job returns
    the path of a parquet file, which is read back and assigned into a
    single frame. That frame is joined with the predictions already on disk
    for ``run_id`` and the combined frame is stored.
    """
    # Column-less frame restricted to the times we predict for; the ensemble
    # predictions are assigned into it as the jobs finish.
    df_empty = dataset.get_df_cols(cols=[])
    times_predict = get_all_times_predict(get_period_pairs(run_id))
    df_pred_ensembles = df_empty.loc[times_predict]

    # NOTE(review): an earlier comment here said only half the cores should
    # be used because using all of them caused MemoryErrors on a rackham
    # node. The pool is sized by n_cores exactly as passed in, so any such
    # halving has to be done by the caller.
    with mp.Pool(processes=n_cores, maxtasksperchild=1) as pool, \
            tempfile.TemporaryDirectory() as tempdir:
        pending = [
            pool.apply_async(
                predict_ensemble,
                (ensemble, dataset, run_id, tempdir),
            ) for ensemble in ensembles
        ]
        # Results are fetched in submission order; get() blocks until the
        # job is done and the files are read before tempdir is removed.
        for job in pending:
            df_part = io.parquet_to_df(job.get())
            df_pred_ensembles = assign_into_df(df_to=df_pred_ensembles,
                                               df_from=df_part)

    # Join ensemble and constituent predictions and write them all to disk
    df_constituent = get_predictions_from_disk(run_id=run_id, dataset=dataset)
    store_prediction_on_disk(
        df=df_pred_ensembles.join(df_constituent),
        run_id=run_id,
        dataset=dataset,
    )
def test_assign_into_df() -> None:
    """Check assign_into_df when filling a column-less frame slice by slice.

    Assigning slices that together cover every index label must reproduce
    the source frame; leaving some labels out must not.
    """

    df_a = DfMocker(n_t=20).df

    # Test we get the full frame if we give all times
    df_into = df_a.loc[:, []].copy()
    df_into = data.assign_into_df(df_to=df_into, df_from=df_a.loc[0:9])
    df_into = data.assign_into_df(df_to=df_into, df_from=df_a.loc[10:19])
    pd.testing.assert_frame_equal(df_a, df_into, check_dtype=False)

    # Test the frames differ if we don't give all times: labels 4 through 9
    # are never assigned, so the comparison itself must fail.
    df_into = df_a.loc[:, []].copy()
    df_into = data.assign_into_df(df_to=df_into, df_from=df_a.loc[0:3])
    df_into = data.assign_into_df(df_to=df_into, df_from=df_a.loc[10:19])
    with pytest.raises(AssertionError):
        pd.testing.assert_frame_equal(df_a, df_into, check_dtype=False)
Exemplo n.º 3
0
def predict_ensemble(
    ensemble: api.Ensemble,
    dataset: data_api.Dataset,
    run_id: str,
    tempdir: str,
) -> str:
    """Predict for a single ensemble and write the result to a parquet file.

    The columns in ``ensemble.cols_needed`` are read partly from the stored
    predictions for ``run_id`` and partly from the dataset. The ensemble is
    then predicted for every calib/test period pair of the run and the
    predictions are written to a randomly named file in ``tempdir``.

    Args:
        ensemble: The ensemble to predict for.
        dataset: Dataset providing the non-prediction columns.
        run_id: Identifies the run whose period pairs and stored
            predictions are used.
        tempdir: Directory the resulting parquet file is written to.

    Returns:
        Path of the parquet file holding the ensemble's predictions.

    Raises:
        RuntimeError: If any needed column is found neither among the
            dataset's cached columns nor among the stored predictions.
    """
    cols_needed = ensemble.cols_needed
    path_pred = path_prediction(run_id, dataset)

    # Sets give constant-time membership tests for the checks below.
    cols_data = set(dataset.list_cols_cached())
    cols_pred = set(io.list_columns_in_parquet(path=path_pred))

    # Check we have all we need. Collect the missing column names (not a
    # list of booleans) so the error message says what is missing.
    cols_available = cols_data | cols_pred
    cols_missing = [col for col in cols_needed if col not in cols_available]
    if cols_missing:
        raise RuntimeError(
            f"Ensemble {ensemble.name} missing cols {cols_missing}")

    # Get constituent predictions and features (outcomes) needed for ensemble
    df_constituent = io.parquet_to_df(
        path=path_pred,
        cols=[col for col in cols_needed if col in cols_pred],
    )
    df_data = dataset.get_df_cols(
        cols=[col for col in cols_needed if col in cols_data])
    df = df_constituent.join(df_data)

    period_pairs = get_period_pairs(run_id)

    # Empty df to hold predictions
    df_pred = df.loc[get_all_times_predict(period_pairs), []]

    for period_pair in period_pairs:
        # period_pair[0] is the calib period and [1] the test period
        df_pred = assign_into_df(
            df_to=df_pred,
            df_from=ensemble.predict(df=df,
                                     period_calib=period_pair[0],
                                     period_test=period_pair[1]),
        )

    log.debug(
        f"Done predicting for ensemble {ensemble.name}, writing results.")
    # Generate a random filename in the tempdir
    path = os.path.join(tempdir, f"{uuid.uuid4().hex}.parquet")
    io.df_to_parquet(df=df_pred, path=path)
    return path
Exemplo n.º 4
0
def predict_models(
    models: List[api.Model],
    dataset: data_api.Dataset,
    run_id: str,
    n_cores: int,
) -> None:
    """Predict for all models over all period pairs and store the result.

    One ``predict_model`` job is submitted per (period pair, model)
    combination to a pool of ``n_cores`` spawned worker processes. Each job
    returns the path of a parquet file, which is read back and assigned
    into a single frame that is finally stored for ``run_id``.
    """

    # Get our calib/test period pairs
    period_pairs = get_period_pairs(run_id)

    log.info(f"Predicting for {len(models)} models "
             f"for {len(period_pairs)} period pairs.")

    # Create predictions df with predict times and no cols
    df_empty = dataset.get_df_cols(cols=[])
    df_pred = df_empty.loc[get_all_times_predict(period_pairs), []]

    # Predict the models in parallel, in worker processes started with the
    # "spawn" method.
    spawn_ctx = mp.get_context("spawn")
    with spawn_ctx.Pool(processes=n_cores, maxtasksperchild=1) as pool, \
            tempfile.TemporaryDirectory() as tempdir:
        # pair[0] is the calib period and pair[1] the test period
        jobs = [
            pool.apply_async(
                predict_model,
                (model, dataset, pair[0], pair[1], tempdir),
            ) for pair in period_pairs for model in models
        ]
        # Results are fetched in submission order; get() blocks until the
        # job is done and the files are read before tempdir is removed.
        for job in jobs:
            path = job.get()
            log.debug(f"Insert from {path}")
            df_part = io.parquet_to_df(path)
            df_pred = assign_into_df(df_to=df_pred, df_from=df_part)

    log.debug("Done collecting.")
    store_prediction_on_disk(df=df_pred, run_id=run_id, dataset=dataset)
Exemplo n.º 5
0
def predict_model(
    model: api.Model,
    dataset: data_api.Dataset,
    period_calib: api.Period,
    period_test: api.Period,
    tempdir: str,
) -> str:
    """Predict for a single model and write the result to a parquet file.

    Produces the model's predictions for the calib period, for the test
    period, and the calibrated predictions for the pair, collects them in
    one frame covering the predict times of both periods and writes that
    frame to a randomly named file in ``tempdir``.

    Returns:
        Path of the parquet file holding the predictions.
    """

    log.info(f"Started predicting for model {model.name} "
             f"period_calib: {period_calib.name} "
             f"period_test: {period_test.name}.")

    # Read in only features needed to predict for this model
    # @TODO: remove fillna(0), make sure input data is missing-free.
    cols_model = model.cols_features + [model.col_outcome]
    df_raw = dataset.get_df_cols(cols_model)
    df = df_raw.fillna(0)

    # UserWarnings raised while predicting are silenced.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", UserWarning)
        pred_calib = model.predict(df=df, period=period_calib)
        pred_test = model.predict(df=df, period=period_test)
        pred_calibrated = model.predict_calibrated(
            df=df,
            period_calib=period_calib,
            period_test=period_test,
        )

    # Column-less frame over the predict times of both periods, filled with
    # the three sets of predictions in turn.
    times = period_calib.times_predict + period_test.times_predict
    df_pred = rebuild_index(df.loc[times, []])
    for df_part in (pred_calib, pred_test, pred_calibrated):
        df_pred = assign_into_df(df_to=df_pred, df_from=df_part)
    log.info(f"Fininshed predicting for {model.name}")

    # Generate a random filename in the tempdir
    filename = f"{uuid.uuid4().hex}.parquet"
    path = os.path.join(tempdir, filename)
    io.df_to_parquet(df=df_pred, path=path)
    return path
Exemplo n.º 6
0
                                                    n_estimators=estimators),
                    tags=["sb"])

# Lists of models are convenient
models = [model_0, model_1, model_2]
#models = [model_d0, model_d1, model_d2]
#models = [model_baseline]
# Train all models
# Note that fitting happens on the full df, before the in_africa filter below.
for model in models:
    model.fit_estimators(df)

# Keep only rows where in_africa == 1 for prediction and evaluation
df = df.loc[df.in_africa == 1]

# Store uncalibrated and then calibrated predictions of each model in df.
# The calibrated call receives the df that already holds the uncalibrated
# predictions assigned just before it.
for model in models:
    df_predictions = model.predict(df)
    df = assign_into_df(df, df_predictions)
    df_predictions = model.predict_calibrated(df=df,
                                              period_calib=period_calib,
                                              period_test=period_test)
    df = assign_into_df(df, df_predictions)

# Save every model
for model in models:
    model.save()

# Evaluate every model against the df holding the predictions
for model in models:
    model.evaluate(df)
partition = "test"

for model in models:
    for calib in ["uncalibrated", "calibrated"]:
    periods=periods
)
# Lists of ensembles, mirroring the list of models
ensembles = [avg_ensemble]
ensembles_delta = [avg_ensemble_delta]

#for model in models:
#    model.fit_estimators(df)

# Predict and store predictions for their specified steps and periods in df
for model in models:
    # Uncalibrated predictions
    df_pred = model.predict(df)
    # assign_into_df takes care to only overwrite rows with actual values
    # This way we can keep all periods in the same df
    # It's also idempotent, no joining, so run as many times as you like.
    df = assign_into_df(df_to=df, df_from=df_pred)

    # Calibrated predictions
    df_pred = model.predict_calibrated(
        df=df,
        period_calib=period_calib,
        period_test=period_test,
    )
    df = assign_into_df(df_to=df, df_from=df_pred)
    # NOTE(review): this repeats the calibrated prediction step directly
    # above with the same arguments; it looks redundant - confirm and remove.
    df_pred = model.predict_calibrated(
        df=df,
        period_calib = period_calib,
        period_test = period_test
    )
    df = assign_into_df(df_to=df, df_from=df_pred)