def score_model_performance_daily(self, date_from, date_to):
    """Compute the model score for each day in the [date_from, date_to] range.

    For every day that has actuals files, all matching
    ``_*_actuals.feather.zstd`` files are concatenated into a single frame,
    matched against predictions via ``_process_actuals``, and scored.

    Args:
        date_from: start of the date range (inclusive).
        date_to: end of the date range (inclusive).

    Returns:
        dict mapping ``str(date)`` -> score value for the metric named by
        ``self.options['score_name']``. Days with no actuals are omitted.
    """
    features = ['prediction_id', self.target_feature]
    res = {}
    for (curr_date, files) in ModelReview._prediction_files_by_day(
            self.model_path, date_from, date_to, "_*_actuals.feather.zstd"):
        df_actuals = DataFrame({})
        # Concatenate all actuals files recorded for this day into one frame.
        # (file path is not needed here, hence `_`)
        for (_, df) in DataFrame.load_from_files(files, features):
            df_actuals.df = pd.concat([df_actuals.df, df.df])

        if df_actuals.count() > 0:
            # _process_actuals expects the actual values in 'a2ml_actual'.
            df_actuals.df.rename(
                columns={self.target_feature: 'a2ml_actual'}, inplace=True)
            scores = self._process_actuals(ds_actuals=df_actuals,
                                           calc_score=True)
            res[str(curr_date)] = scores[self.options.get('score_name')]

    return res
def _process_actuals(self, ds_actuals,
                     prediction_group_id=None,
                     primary_prediction_group_id=None,
                     primary_model_path=None,
                     actual_date=None,
                     actuals_id=None,
                     calc_score=False,
                     raise_not_found=False):
    """Join actual values onto stored prediction results and optionally score.

    Matches rows of ``ds_actuals`` (keyed by ``prediction_id``) against the
    model's stored prediction files, merges them with ``combine_first``,
    restores the original dtypes that the merge may have widened to float64,
    and, when ``calc_score`` is set, computes metric scores.

    Args:
        ds_actuals: project DataFrame wrapper holding at least
            ``prediction_id`` and the actual target values (column
            ``actual`` or already-renamed ``a2ml_actual``).
        prediction_group_id: restrict matching to one prediction group.
        primary_prediction_group_id / primary_model_path: when set, actual
            prediction ids are remapped from the primary model's predictions
            to this (candidate) model's predictions.
        actual_date, actuals_id: unused here; kept for interface
            compatibility with callers.
        calc_score: when True, return metric scores instead of ``True``.
        raise_not_found: raise if no actual ids matched any prediction.

    Returns:
        ``True`` on a plain merge, or a dict of scores when ``calc_score``.

    Raises:
        Exception: when ``raise_not_found`` is set, nothing matched, and no
            primary model was used.
    """
    ds_actuals.df.rename(columns={"actual": 'a2ml_actual'}, inplace=True)
    actuals_count = ds_actuals.count()

    # Optional "primary" model predictions, used only to translate actual
    # prediction ids into this model's prediction ids.
    primary_ds = None
    if primary_prediction_group_id:
        files = ModelReview._get_prediction_files(
            primary_model_path, primary_prediction_group_id)

        for (_, df) in DataFrame.load_from_files(files,
                                                 features=['prediction_id']):
            primary_ds = df
            break  # should be only one file

    origin_dtypes = []
    origin_columns = []
    prediction_files = ModelReview._get_prediction_files(
        self.model_path, prediction_group_id)
    actual_index = False
    # Fix: initialize before the loop; otherwise an empty prediction_files
    # iterable makes the `raise_not_found` check below fail with NameError.
    match_count = 0

    for (file, df_prediction_results) in DataFrame.load_from_files(
            prediction_files):
        origin_dtypes = df_prediction_results.df.dtypes
        origin_columns = df_prediction_results.df.columns

        if primary_ds is not None:
            ds_actuals.df['prediction_id'] = \
                ModelReview._map_primary_prediction_id_to_candidate(
                    ds_actuals.df['prediction_id'],
                    primary_ds.df['prediction_id'],
                    df_prediction_results.df['prediction_id'])

        # Index actuals by prediction_id once, on the first file.
        if not actual_index:
            ds_actuals.df.set_index('prediction_id', inplace=True)
            actual_index = True

        underscore_split = os.path.basename(file['path']).split('_')

        if len(underscore_split) == 3:
            # date_group-id_suffix (new file name with date)
            prediction_group_id = underscore_split[1]
        else:
            # group-id_suffix (old file name without date)
            prediction_group_id = underscore_split[0]

        df_prediction_results.df['prediction_group_id'] = prediction_group_id

        matched_scope = df_prediction_results.df[
            df_prediction_results.df['prediction_id'].isin(
                ds_actuals.df.index)]
        # Explicit copy: matched_scope is a filtered slice, and an in-place
        # set_index on it would trigger pandas' SettingWithCopyWarning.
        matched_scope = matched_scope.copy()
        matched_scope.set_index('prediction_id', inplace=True)
        ds_actuals.df = ds_actuals.df.combine_first(matched_scope)

        match_count = ds_actuals.df.count()[self.target_feature]
        # Stop once every actual is matched; with a primary model a single
        # pass is expected to be sufficient.
        if actuals_count == match_count or primary_ds is not None:
            break

    if raise_not_found and match_count == 0 and primary_ds is None:
        raise Exception(
            "Actual Prediction IDs not found in model predictions.")

    ds_actuals.df.reset_index(inplace=True)
    ds_actuals.dropna(columns=[self.target_feature, 'a2ml_actual'])

    # combine_first changes original non-float64 types to float64 when NaN
    # values appear during merging tables.
    # Good explanation: https://stackoverflow.com/a/15353297/898680
    # Fix: store original dtypes and force them after merging.
    for col in origin_columns:
        if col != 'prediction_id':
            ds_actuals.df[col] = ds_actuals.df[col].astype(
                origin_dtypes[col], copy=False)

    ds_actuals.df['a2ml_actual'] = ds_actuals.df['a2ml_actual'].astype(
        origin_dtypes[self.target_feature], copy=False)

    result = True
    if calc_score:
        # Build a frame that carries the actuals under the target feature
        # name so both predictions and actuals go through the same
        # preprocessing.
        ds_true = DataFrame({})
        ds_true.df = ds_actuals.df[['a2ml_actual']].rename(
            columns={'a2ml_actual': self.target_feature})

        y_pred, _ = ModelHelper.preprocess_target_ds(
            self.model_path, ds_actuals)
        y_true, _ = ModelHelper.preprocess_target_ds(
            self.model_path, ds_true)

        result = ModelHelper.calculate_scores(
            self.options, y_test=y_true, y_pred=y_pred)

    return result