def predict_ensembles(ensembles: List[api.Ensemble], dataset, run_id,
                      n_cores: int) -> None:
    """Predict for every ensemble in parallel and store the results.

    Each ensemble is handed to ``predict_ensemble`` in a worker process,
    which writes its predictions to a parquet file in a shared temporary
    directory and returns the file path. The files are read back in
    submission order and merged into one frame with ``assign_into_df``.
    Finally the ensemble predictions are joined with the predictions
    returned by ``get_predictions_from_disk`` and passed to
    ``store_prediction_on_disk``.

    Args:
        ensembles: Ensembles to predict for.
        dataset: Dataset object passed on to the workers and the
            storage helpers.
        run_id: Identifier used to look up period pairs and predictions.
        n_cores: Number of worker processes in the pool.
    """
    # Column-less frame restricted to the prediction times; it receives
    # the columns produced by the individual ensembles.
    df_no_cols = dataset.get_df_cols(cols=[])
    times_predict = get_all_times_predict(get_period_pairs(run_id))
    df_pred_ensembles = df_no_cols.loc[times_predict]

    # NOTE(review): an earlier comment here said to use only half the
    # cores because using all of them caused MemoryErrors on a rackham
    # node. The pool is sized to exactly n_cores, so any such halving
    # has to be done by the caller.
    # maxtasksperchild=1 gives every task a fresh worker process.
    with mp.Pool(processes=n_cores, maxtasksperchild=1) as pool:
        with tempfile.TemporaryDirectory() as tempdir:
            pending = [
                pool.apply_async(
                    predict_ensemble,
                    (ensemble, dataset, run_id, tempdir),
                )
                for ensemble in ensembles
            ]

            # The files must be read while tempdir still exists.
            for job in pending:
                parquet_path = job.get()
                df_pred_ensembles = assign_into_df(
                    df_to=df_pred_ensembles,
                    df_from=io.parquet_to_df(parquet_path),
                )

    # Join ensemble and constituent predictions and write them all to disk
    df_all = df_pred_ensembles.join(
        get_predictions_from_disk(run_id=run_id, dataset=dataset))
    store_prediction_on_disk(df=df_all, run_id=run_id, dataset=dataset)
def test_assign_into_df() -> None:
    """Check that assign_into_df fills a column-less frame piece by piece."""
    df_a = DfMocker(n_t=20).df

    # Assigning the label slices 0-9 and 10-19 covers the whole mocked
    # frame, so the result must equal the original.
    df_into = df_a.loc[:, []].copy()
    df_into = data.assign_into_df(df_to=df_into, df_from=df_a.loc[0:9])
    df_into = data.assign_into_df(df_to=df_into, df_from=df_a.loc[10:19])
    pd.testing.assert_frame_equal(df_a, df_into, check_dtype=False)

    # Leaving the labels 4-9 unassigned must leave the result different
    # from the original, so the equality assertion is expected to fail.
    df_into = df_a.loc[:, []].copy()
    df_into = data.assign_into_df(df_to=df_into, df_from=df_a.loc[0:3])
    df_into = data.assign_into_df(df_to=df_into, df_from=df_a.loc[10:19])
    with pytest.raises(AssertionError):
        pd.testing.assert_frame_equal(df_a, df_into, check_dtype=False)
def predict_ensemble(
    ensemble: api.Ensemble,
    dataset: data_api.Dataset,
    run_id: str,
    tempdir: str,
) -> str:
    """Predict for a single ensemble and write the result to a parquet file.

    The columns the ensemble needs are read from the stored predictions
    of the run and from the dataset, joined into one frame, and passed
    to ``ensemble.predict`` once per calibration/test period pair.

    Args:
        ensemble: Ensemble to predict for.
        dataset: Dataset supplying the non-prediction columns.
        run_id: Identifier of the run; selects the prediction file and
            the period pairs.
        tempdir: Directory in which the output file is created.

    Returns:
        Path of the parquet file holding the ensemble's predictions.

    Raises:
        RuntimeError: If a needed column is found neither among the
            dataset's cached columns nor in the prediction file. The
            message lists the missing column names.
    """
    cols_needed = ensemble.cols_needed
    cols_data = dataset.list_cols_cached()
    path_pred = path_prediction(run_id, dataset)
    cols_pred = io.list_columns_in_parquet(path=path_pred)

    # Check we have all we need. Collect the names of the missing columns
    # (not a list of booleans) so the error message is actionable.
    cols_available = set(cols_data) | set(cols_pred)
    cols_missing = [col for col in cols_needed if col not in cols_available]
    if cols_missing:
        raise RuntimeError(
            f"Ensemble {ensemble.name} missing cols {cols_missing}")

    # Get constituent predictions and features (outcomes) needed for ensemble
    df_constituent = io.parquet_to_df(
        path=path_pred,
        cols=[col for col in cols_needed if col in cols_pred],
    )
    df_data = dataset.get_df_cols(
        cols=[col for col in cols_needed if col in cols_data])
    df = df_constituent.join(df_data)

    period_pairs = get_period_pairs(run_id)

    # Empty df (prediction times, no columns) to hold predictions
    df_pred = df.loc[get_all_times_predict(period_pairs), []]
    for period_pair in period_pairs:
        # period_pair[0] is the calib period and [1] the test period
        df_pred = assign_into_df(
            df_to=df_pred,
            df_from=ensemble.predict(df=df,
                                     period_calib=period_pair[0],
                                     period_test=period_pair[1]),
        )

    log.debug(
        f"Done predicting for ensemble {ensemble.name}, writing results.")

    # Generate a random filename in the tempdir
    path = os.path.join(tempdir, f"{uuid.uuid4().hex}.parquet")
    io.df_to_parquet(df=df_pred, path=path)
    return path
def predict_models(
    models: List[api.Model],
    dataset: data_api.Dataset,
    run_id: str,
    n_cores: int,
) -> None:
    """Predict for all models over all period pairs and store the result.

    One ``predict_model`` task is submitted per (period pair, model)
    combination. Each task writes a parquet file into a shared temporary
    directory and returns its path; the files are merged into a single
    frame with ``assign_into_df`` and handed to
    ``store_prediction_on_disk``.

    Args:
        models: Models to predict for.
        dataset: Dataset passed on to the workers and the storage helper.
        run_id: Identifier used to look up period pairs and to store
            the predictions.
        n_cores: Number of worker processes in the pool.
    """
    # Calibration/test period pairs of this run
    period_pairs = get_period_pairs(run_id)
    log.info(f"Predicting for {len(models)} models "
             f"for {len(period_pairs)} period pairs.")

    # Frame with the prediction times as rows and no columns yet
    df_no_cols = dataset.get_df_cols(cols=[])
    df_pred = df_no_cols.loc[get_all_times_predict(period_pairs), []]

    # Workers are started with the "spawn" method; maxtasksperchild=1
    # gives every task a fresh worker process.
    spawn_ctx = mp.get_context("spawn")
    with spawn_ctx.Pool(processes=n_cores, maxtasksperchild=1) as pool:
        with tempfile.TemporaryDirectory() as tempdir:
            # pair[0] is the calib period and pair[1] the test period
            pending = [
                pool.apply_async(
                    predict_model,
                    (model, dataset, pair[0], pair[1], tempdir),
                )
                for pair in period_pairs
                for model in models
            ]

            # Collect in submission order; get() blocks until the task
            # is done. Must happen while tempdir still exists.
            for job in pending:
                parquet_path = job.get()
                log.debug(f"Insert from {parquet_path}")
                df_pred = assign_into_df(
                    df_to=df_pred,
                    df_from=io.parquet_to_df(parquet_path),
                )
            log.debug("Done collecting.")

    store_prediction_on_disk(df=df_pred, run_id=run_id, dataset=dataset)
def predict_model(
    model: api.Model,
    dataset: data_api.Dataset,
    period_calib: api.Period,
    period_test: api.Period,
    tempdir: str,
) -> str:
    """Predict for a single model and write the result to a parquet file.

    Produces uncalibrated predictions for the calibration and the test
    period plus calibrated predictions, merges them into one frame and
    writes that frame to a randomly named file in ``tempdir``.

    Args:
        model: Model to predict with.
        dataset: Dataset supplying the feature and outcome columns.
        period_calib: Calibration period.
        period_test: Test period.
        tempdir: Directory in which the output file is created.

    Returns:
        Path of the parquet file holding the model's predictions.
    """
    log.info(f"Started predicting for model {model.name} "
             f"period_calib: {period_calib.name} "
             f"period_test: {period_test.name}.")

    # Load only the columns this model uses: its features and its outcome.
    # @TODO: remove fillna(0), make sure input data is missing-free.
    cols_model = model.cols_features + [model.col_outcome]
    df = dataset.get_df_cols(cols_model).fillna(0)

    # UserWarnings raised while predicting are suppressed.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", UserWarning)
        df_calib = model.predict(df=df, period=period_calib)
        df_test = model.predict(df=df, period=period_test)
        df_calibrated = model.predict_calibrated(
            df=df,
            period_calib=period_calib,
            period_test=period_test,
        )

    # Column-less frame over the prediction times of both periods
    times_both = period_calib.times_predict + period_test.times_predict
    df_pred = rebuild_index(df.loc[times_both, []])
    for df_part in (df_calib, df_test, df_calibrated):
        df_pred = assign_into_df(df_to=df_pred, df_from=df_part)

    log.info(f"Fininshed predicting for {model.name}")

    # Random file name so parallel workers never collide in tempdir
    fname = f"{uuid.uuid4().hex}.parquet"
    path = os.path.join(tempdir, fname)
    io.df_to_parquet(df=df_pred, path=path)
    return path
n_estimators=estimators), tags=["sb"]) # Lists of models are convenient models = [model_0, model_1, model_2] #models = [model_d0, model_d1, model_d2] #models = [model_baseline] # Train all models for model in models: model.fit_estimators(df) df = df.loc[df.in_africa == 1] for model in models: df_predictions = model.predict(df) df = assign_into_df(df, df_predictions) df_predictions = model.predict_calibrated(df=df, period_calib=period_calib, period_test=period_test) df = assign_into_df(df, df_predictions) for model in models: model.save() for model in models: model.evaluate(df) partition = "test" for model in models: for calib in ["uncalibrated", "calibrated"]:
periods=periods ) ensembles = [avg_ensemble] ensembles_delta = [avg_ensemble_delta] #for model in models: # model.fit_estimators(df) # Predict and store predictions for their specified steps and periods in df for model in models: # Uncalibrated predictions df_pred = model.predict(df) # assign_into_df takes care to only overwrite rows with actual values # This way we can keep all periods in the same df # It's also idempotent, no joining, so run as many times as you like. df = assign_into_df(df_to=df, df_from=df_pred) # Calibrated predictions df_pred = model.predict_calibrated( df=df, period_calib=period_calib, period_test=period_test, ) df = assign_into_df(df_to=df, df_from=df_pred) df_pred = model.predict_calibrated( df=df, period_calib = period_calib, period_test = period_test ) df = assign_into_df(df_to=df, df_from=df_pred)