Example #1
0
    def score_model_performance_daily(self, date_from, date_to):
        """Compute the model score for each day in [date_from, date_to].

        For every day that has an "*_actuals.feather.zstd" file under
        self.model_path, loads the actuals (prediction_id + target column),
        merges them with the stored predictions via _process_actuals and
        calculates the configured score.

        Args:
            date_from: start of the date range (passed through to
                ModelReview._prediction_files_by_day).
            date_to: end of the date range.

        Returns:
            dict mapping str(day) -> score value for the metric named by
            self.options['score_name']; days with no actuals are omitted.
        """
        features = ['prediction_id', self.target_feature]
        res = {}

        for (curr_date, files) in ModelReview._prediction_files_by_day(
                self.model_path, date_from, date_to,
                "_*_actuals.feather.zstd"):
            df_actuals = DataFrame({})
            # Collect all per-file frames first and concatenate once:
            # repeated pairwise pd.concat inside the loop is quadratic
            # in the total number of rows.
            frames = [df.df for (_file, df) in DataFrame.load_from_files(
                files, features)]
            if frames:
                df_actuals.df = pd.concat([df_actuals.df] + frames)

            if df_actuals.count() > 0:
                # _process_actuals expects the target column under the
                # 'a2ml_actual' name.
                df_actuals.df.rename(
                    columns={self.target_feature: 'a2ml_actual'}, inplace=True)
                scores = self._process_actuals(ds_actuals=df_actuals,
                                               calc_score=True)
                res[str(curr_date)] = scores[self.options.get('score_name')]

        return res
Example #2
0
    def _process_actuals(self,
                         ds_actuals,
                         prediction_group_id=None,
                         primary_prediction_group_id=None,
                         primary_model_path=None,
                         actual_date=None,
                         actuals_id=None,
                         calc_score=False,
                         raise_not_found=False):
        """Join actual target values onto stored prediction results.

        Matches rows of ds_actuals (must contain a 'prediction_id' column and
        the actual values, either as 'actual' or already as 'a2ml_actual')
        against the model's saved prediction files, merging the prediction
        columns into ds_actuals in place.

        Args:
            ds_actuals: project DataFrame wrapper holding the actuals;
                mutated in place.
            prediction_group_id: optional group id used to narrow the set of
                prediction files to search.
            primary_prediction_group_id / primary_model_path: when set, the
                actuals' prediction ids are first remapped from a primary
                model's predictions to this (candidate) model's predictions.
            actual_date, actuals_id: unused in this method; presumably kept
                for interface compatibility with callers — TODO confirm.
            calc_score: when True, compute and return the score dict instead
                of a bare success flag.
            raise_not_found: when True, raise if no actual prediction id
                matched any stored prediction (and no primary model is used).

        Returns:
            dict of scores (from ModelHelper.calculate_scores) when
            calc_score is True, otherwise True.

        Raises:
            Exception: when raise_not_found is True and nothing matched.
        """
        # Normalize the actuals column name used throughout the merge.
        ds_actuals.df.rename(columns={"actual": 'a2ml_actual'}, inplace=True)

        actuals_count = ds_actuals.count()

        primary_ds = None
        if primary_prediction_group_id:
            files = ModelReview._get_prediction_files(
                primary_model_path, primary_prediction_group_id)
            for (_,
                 df) in DataFrame.load_from_files(files,
                                                  features=['prediction_id']):
                primary_ds = df
                # should be only one file
                break

        origin_dtypes = []
        origin_columns = []
        prediction_files = ModelReview._get_prediction_files(
            self.model_path, prediction_group_id)
        actual_index = False
        # Initialize before the loop: if there are no prediction files the
        # loop body never runs and the raise_not_found check below would
        # otherwise hit a NameError instead of reporting "not found".
        match_count = 0

        for (file, df_prediction_results
             ) in DataFrame.load_from_files(prediction_files):
            origin_dtypes = df_prediction_results.df.dtypes
            origin_columns = df_prediction_results.df.columns

            if primary_ds is not None:
                # Translate primary-model prediction ids to this model's ids
                # so the actuals can be matched against candidate predictions.
                ds_actuals.df[
                    'prediction_id'] = ModelReview._map_primary_prediction_id_to_candidate(
                        ds_actuals.df['prediction_id'],
                        primary_ds.df['prediction_id'],
                        df_prediction_results.df['prediction_id'])

            if not actual_index:
                # Index by prediction_id once; must happen after any id
                # remapping above.
                ds_actuals.df.set_index('prediction_id', inplace=True)
                actual_index = True

            underscore_split = os.path.basename(file['path']).split('_')

            if len(underscore_split
                   ) == 3:  # date_group-id_suffix (new file name with date)
                prediction_group_id = underscore_split[1]
            else:  # group-id_suffix (old file name without date)
                prediction_group_id = underscore_split[0]

            df_prediction_results.df[
                'prediction_group_id'] = prediction_group_id

            # Only merge the prediction rows whose ids appear in the actuals.
            matched_scope = df_prediction_results.df[
                df_prediction_results.df['prediction_id'].isin(
                    ds_actuals.df.index)]
            matched_scope.set_index('prediction_id', inplace=True)
            ds_actuals.df = ds_actuals.df.combine_first(matched_scope)

            match_count = ds_actuals.df.count()[self.target_feature]
            # Stop early once every actual matched; with a primary model only
            # the first prediction file is considered.
            if actuals_count == match_count or primary_ds is not None:
                break

        if raise_not_found and match_count == 0 and primary_ds is None:
            raise Exception(
                "Actual Prediction IDs not found in model predictions.")

        ds_actuals.df.reset_index(inplace=True)
        # Wrapper-level dropna; drops rows missing the target or the actual.
        ds_actuals.dropna(columns=[self.target_feature, 'a2ml_actual'])

        # combine_first changes orginal non float64 types to float64 when NaN values appear during merging tables
        # Good explanations https://stackoverflow.com/a/15353297/898680
        # Fix: store original datypes and force them after merging
        for col in origin_columns:
            if col != 'prediction_id':
                ds_actuals.df[col] = ds_actuals.df[col].astype(
                    origin_dtypes[col], copy=False)

        ds_actuals.df['a2ml_actual'] = ds_actuals.df['a2ml_actual'].astype(
            origin_dtypes[self.target_feature], copy=False)

        result = True
        if calc_score:
            # Build a frame where the actuals stand in for the target column,
            # so both y_true and y_pred go through the same preprocessing.
            ds_true = DataFrame({})
            ds_true.df = ds_actuals.df[[
                'a2ml_actual'
            ]].rename(columns={'a2ml_actual': self.target_feature})

            y_pred, _ = ModelHelper.preprocess_target_ds(
                self.model_path, ds_actuals)
            y_true, _ = ModelHelper.preprocess_target_ds(
                self.model_path, ds_true)

            result = ModelHelper.calculate_scores(self.options,
                                                  y_test=y_true,
                                                  y_pred=y_pred)

        return result