def predict_ensembles(ensembles: List[api.Ensemble],
                      dataset: data_api.Dataset, run_id: str,
                      n_cores: int) -> None:
    """ Predict for ensembles """

    # Get empty df of times predict to hold predictions
    df_pred_ensembles = dataset.get_df_cols(cols=[]).loc[get_all_times_predict(
        get_period_pairs(run_id))]

    # Use only half the cores: using all of them causes memory errors on a
    # rackham node.
    with mp.Pool(processes=n_cores, maxtasksperchild=1) as pool:
        with tempfile.TemporaryDirectory() as tempdir:
            results = []
            for ensemble in ensembles:
                results.append(
                    pool.apply_async(
                        predict_ensemble,
                        (
                            ensemble,
                            dataset,
                            run_id,
                            tempdir,
                        ),
                    ))
            for result in results:
                path = result.get()
                df_pred_ensembles = assign_into_df(
                    df_to=df_pred_ensembles, df_from=io.parquet_to_df(path))

    # Join ensemble and constituent predictions and write them all to disk
    store_prediction_on_disk(
        df=df_pred_ensembles.join(
            get_predictions_from_disk(run_id=run_id, dataset=dataset)),
        run_id=run_id,
        dataset=dataset,
    )
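The worker pattern above (each process writes its result to a uniquely named parquet file in a shared tempdir and returns the path; the parent reads the files back as results become ready) can be tried in isolation. A minimal self-contained sketch, assuming nothing from the codebase; square_to_parquet and the demo frame are illustrative only:

import multiprocessing as mp
import os
import tempfile
import uuid

import pandas as pd


def square_to_parquet(value: int, tempdir: str) -> str:
    """ Worker: compute, write result to parquet, return the path """
    df = pd.DataFrame({"value": [value], "square": [value ** 2]})
    path = os.path.join(tempdir, f"{uuid.uuid4().hex}.parquet")
    df.to_parquet(path)
    return path


if __name__ == "__main__":
    with mp.get_context("spawn").Pool(processes=2, maxtasksperchild=1) as pool:
        with tempfile.TemporaryDirectory() as tempdir:
            results = [
                pool.apply_async(square_to_parquet, (value, tempdir))
                for value in range(4)
            ]
            # Collect paths as workers finish, read frames before the
            # tempdir is cleaned up
            frames = [pd.read_parquet(result.get()) for result in results]
    print(pd.concat(frames, ignore_index=True))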
def get_df_cols(self, cols):
    """ Get a subset of cols from cached df """
    if os.path.isfile(self.path):
        log.debug(f"Found df on file for {self.name} at {self.path}")
        return io.parquet_to_df(path=self.path, cols=cols)
    else:
        raise RuntimeError(
            "get_df_cols() called but no cached df, run refresh() first.")
def get_predictions_from_disk(
        run_id: str,
        dataset: data_api.Dataset,
        cols: Optional[List[str]] = None) -> pd.DataFrame:
    """ Read predictions from disk """
    log.info("Reading predictions from disk")
    return io.parquet_to_df(path=path_prediction(run_id=run_id,
                                                 dataset=dataset),
                            cols=cols)
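The cols argument presumably maps onto a column-pruned parquet read, which avoids loading the full predictions file. A sketch of the equivalent with plain pandas; the frame and column names are made up:

import tempfile

import pandas as pd

with tempfile.NamedTemporaryFile(suffix=".parquet") as f:
    pd.DataFrame({"a": [1, 2], "b": [3, 4]}).to_parquet(f.name)
    # columns= reads only the requested subset from the file
    df = pd.read_parquet(f.name, columns=["a"])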
def get_df_cols(self, cols):
    """ Get a dataframe with a subset of columns

    TODO: Optimise to only read subset of cols from cache
    """
    log.debug(f"Getting {cols} cols from {self.name}")
    if os.path.isfile(self.path):
        df = io.parquet_to_df(path=self.path, cols=cols)
    else:
        df = self.df[cols]
    return df
def df(self):
    """ Get the dataframe from the dataset """
    if os.path.isfile(self.path):
        df = io.parquet_to_df(self.path)
    else:
        df = self.refresh()
    if self.cols:
        for col in self.cols:
            if col not in df:
                log.warning(f"Col {col} missing. Not in the sources?")
    return df
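The cache-or-refresh pattern behind df() and get_df_cols() can be shown in miniature. A toy sketch; CachedFrame and its refresh() body are hypothetical stand-ins for the real Dataset class, not its implementation:

import os
import tempfile

import pandas as pd


class CachedFrame:
    """ Serve from a parquet cache when present, otherwise rebuild """

    def __init__(self, path: str):
        self.path = path

    def refresh(self) -> pd.DataFrame:
        # Stand-in for the real (expensive) build step
        df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
        df.to_parquet(self.path)
        return df

    @property
    def df(self) -> pd.DataFrame:
        if os.path.isfile(self.path):
            return pd.read_parquet(self.path)
        return self.refresh()


with tempfile.TemporaryDirectory() as tmp:
    cache = CachedFrame(os.path.join(tmp, "cache.parquet"))
    print(cache.df)  # first access builds and writes the cache
    print(cache.df)  # second access reads the parquet cache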
def predict_ensemble(
    ensemble: api.Ensemble,
    dataset: data_api.Dataset,
    run_id: str,
    tempdir: str,
) -> str:
    """ Predict for a single ensemble """

    cols_needed = ensemble.cols_needed
    cols_data = dataset.list_cols_cached()
    cols_pred = io.list_columns_in_parquet(
        path=path_prediction(run_id, dataset))

    # Check we have all we need: collect the names of any missing cols
    cols_missing = [
        col for col in cols_needed if col not in cols_data + cols_pred
    ]
    if cols_missing:
        raise RuntimeError(
            f"Ensemble {ensemble.name} missing cols {cols_missing}")

    # Get constituent predictions and features (outcomes) needed for ensemble
    df_constituent = io.parquet_to_df(
        path=path_prediction(run_id, dataset),
        cols=[col for col in cols_needed if col in cols_pred],
    )
    df_data = dataset.get_df_cols(
        cols=[col for col in cols_needed if col in cols_data])
    df = df_constituent.join(df_data)

    period_pairs = get_period_pairs(run_id)

    # Empty df to hold predictions
    df_pred = df.loc[get_all_times_predict(period_pairs), []]
    for period_pair in period_pairs:
        df_pred = assign_into_df(
            df_to=df_pred,
            df_from=ensemble.predict(df=df,
                                     period_calib=period_pair[0],
                                     period_test=period_pair[1]),
        )

    log.debug(
        f"Done predicting for ensemble {ensemble.name}, writing results.")

    # Generate a random filename in the tempdir so parallel workers
    # never collide
    path = os.path.join(tempdir, f"{uuid.uuid4().hex}.parquet")
    io.df_to_parquet(df=df_pred, path=path)
    return path
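The implementation of assign_into_df is not shown here; as used above it appears to write the columns of df_from into df_to aligned on their shared index, so later period pairs fill in further rows of the same prediction columns. A rough stand-in under that assumption, not the actual helper:

import pandas as pd


def assign_into_df_sketch(df_to: pd.DataFrame,
                          df_from: pd.DataFrame) -> pd.DataFrame:
    """ Assumed behaviour: copy df_from's columns into df_to, aligned
    on the rows df_from covers """
    df_to = df_to.copy()
    for col in df_from.columns:
        # .loc restricted to df_from's index only touches those rows,
        # creating the column on first assignment
        df_to.loc[df_from.index, col] = df_from[col]
    return df_to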
def predict_models(
    models: List[api.Model],
    dataset: data_api.Dataset,
    run_id: str,
    n_cores: int,
) -> None:
    """ Predict for models """

    # Get our calib/test period pairs
    period_pairs = get_period_pairs(run_id)

    log.info(f"Predicting for {len(models)} models "
             f"for {len(period_pairs)} period pairs.")

    # Create predictions df with predict times and no cols
    df_pred = dataset.get_df_cols(
        cols=[]).loc[get_all_times_predict(period_pairs), []]

    # Predict the models in parallel
    with mp.get_context("spawn").Pool(processes=n_cores,
                                      maxtasksperchild=1) as pool:
        with tempfile.TemporaryDirectory() as tempdir:
            results = []
            for period_pair in period_pairs:
                for model in models:
                    # period_pair[0] is calib period and [1] the test period
                    results.append(
                        pool.apply_async(
                            predict_model,
                            (
                                model,
                                dataset,
                                period_pair[0],
                                period_pair[1],
                                tempdir,
                            ),
                        ))
            # Collect as results become ready
            for result in results:
                path = result.get()
                log.debug(f"Insert from {path}")
                df_pred = assign_into_df(df_to=df_pred,
                                         df_from=io.parquet_to_df(path))

    log.debug("Done collecting.")
    store_prediction_on_disk(df=df_pred, run_id=run_id, dataset=dataset)
import pandas as pd

log = logging.getLogger(__name__)

level = "cm"
model_path = "./models/{sub}"
out_paths = {
    "evaluation": model_path.format(sub="evaluation"),
    "features": model_path.format(sub="features"),
}
for k, v in out_paths.items():
    if not os.path.isdir(v):
        os.makedirs(v)

path = "~/OpenViEWS2/storage/data/datasets/manual.parquet"  # change to your path
cm_global_imp_0 = io.parquet_to_df(path)
df = cm_global_imp_0

# Add month and year dummy columns
df_mdums = pd.get_dummies(df["month"], prefix="mdum")
df_ydums = pd.get_dummies(df["year"], prefix="ydum")
df = df.join(df_mdums)
df = df.join(df_ydums)

konstanz_df = pd.read_csv("~/OpenViEWS2/storage/data/konstanz/konstanz.csv",
                          low_memory=False)
# konstanz_df.head()
list(konstanz_df.columns)
# konstanz_df.index
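For reference, a tiny self-contained view of what the get_dummies join above adds, on a made-up frame:

import pandas as pd

demo = pd.DataFrame({"month": [1, 2, 1]})
# Adds one indicator column per distinct month value: mdum_1, mdum_2
print(demo.join(pd.get_dummies(demo["month"], prefix="mdum")))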
def df(self) -> pd.DataFrame:
    """ Get the dataframe """
    if not os.path.isfile(self.path):
        self.refresh()
    return io.parquet_to_df(self.path)