    def as_dmd(self):
        train = DMD(x=self.training_data[0], y=self.training_data[1],
                    samples_meta=None, columns_meta={DMD.FEATURE_NAMES: self.column_names(),
                                                     DMD.FEATURE_TYPES: self.feature_types()})

        test = DMD(x=self.testing_data[0], y=self.testing_data[1],
                   samples_meta=None, columns_meta={DMD.FEATURE_NAMES: self.column_names(),
                                                    DMD.FEATURE_TYPES: self.feature_types()})
        return train, test
Example #2
    @classmethod
    def get_shuffled_x(cls,
                       dmdx: DMD,
                       index=None,
                       dmd_train=None,
                       method=SensitivityTypes.shuffled,
                       seed=0,
                       model_support_dmd=False):
        if index is None:
            return dmdx.values

        x_copy = numpy.copy(dmdx.values)
        if method == SensitivityTypes.shuffled:
            x_copy = cls.shuffle_x(x_copy,
                                   dmd_train=dmd_train,
                                   index=index,
                                   seed=index + seed)
        elif method == SensitivityTypes.missing:
            x_copy[:, index] = numpy.nan

        if model_support_dmd:
            return DMD(x=x_copy,
                       samples_meta=dmdx._samples_meta,
                       columns_meta=dmdx._columns_meta,
                       splitter=dmdx.splitter)
        else:
            return x_copy
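The "shuffled" branch above is the standard permutation-importance perturbation: copy X, shuffle a single column, and measure how much the model's score degrades. A self-contained sketch of the same idea without the DMD wrapper (the data, model, and names below are illustrative, not from the snippet):

# standalone sketch of the "shuffled" perturbation; names and data are illustrative
import numpy
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

rs = numpy.random.RandomState(0)
x = rs.rand(500, 3)
y = 3 * x[:, 0] + x[:, 2]  # feature 1 carries no signal
model = RandomForestRegressor(random_state=0).fit(x, y)

for index in range(x.shape[1]):
    x_copy = numpy.copy(x)
    x_copy[:, index] = numpy.random.RandomState(index).permutation(x_copy[:, index])
    score = mean_absolute_error(y, model.predict(x_copy))
    print('feature', index, 'shuffled -> MAE', round(score, 3))  # higher MAE = more important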
Example #3
    def predict(self, dmd: DMD):
        if self.dmd_supported:
            if not isinstance(dmd, DMD):
                dmd = DMD(x=dmd)
            return self.model.predict(dmd)
        else:
            if isinstance(dmd, DMD):
                x = dmd.values
            else:
                x = dmd
            return self.model.predict(x)
Example #4
    def get_data(self, is_classification, seed=0):
        rs = numpy.random.RandomState(seed)
        x = rs.rand(200, 3)
        x[:, 1] = 0
        # 1st has double importance, 2nd has no importance
        y = numpy.sum(x, axis=1) + 2 * x[:, 0]
        if is_classification:
            y = numpy.round(y, 0).astype(int)
        return DMD(x=x, y=y,
                   columns_meta={DMD.FEATURE_NAMES: ['f_' + str(k) for k in
                                                     range(x.shape[1])]})
Example #5
    def get_data(self, is_classification):
        x = numpy.random.rand(1000, 10)

        # 1st has double importance, 2nd has no importance
        y = self._func(x, is_classification=is_classification)
        return DMD(
            x=x,
            y=y,
            columns_meta={
                DMD.FEATURE_NAMES: ['f_' + str(k) for k in range(x.shape[1])]
            },
            samples_meta={'sample_weight': numpy.random.rand(x.shape[0])})
Example #6
    def as_dmd(self):
        train = DMD(x=self.training_data[0],
                    y=self.training_data[1],
                    samples_meta=None,
                    columns_meta={
                        DMD.FEATURE_NAMES: self.column_names(),
                        DMD.FEATURE_TYPES: self.feature_types()
                    },
                    target_labels=self.labels,
                    categorical_encoding=self.categorical_encoding)

        test = DMD(x=self.testing_data[0],
                   y=self.testing_data[1],
                   samples_meta=None,
                   columns_meta={
                       DMD.FEATURE_NAMES: self.column_names(),
                       DMD.FEATURE_TYPES: self.feature_types()
                   },
                   target_labels=self.labels,
                   categorical_encoding=self.categorical_encoding)
        return train, test
Example #7
    def test_concat(self):
        dmd1 = self.get_data(is_classification=False)
        dmd2 = self.get_data(is_classification=True)

        self.assertEqual(dmd1.n_samples, dmd2.n_samples)

        dmd = DMD.concat([dmd1, dmd2], axis=0)

        self.assertEqual(dmd.n_samples, 2 * dmd2.n_samples)
        self.assertEqual(dmd._x.shape[0], 2 * dmd1._y.shape[0])
        self.assertEqual(dmd._x.shape[0], 2 * dmd1._samples_meta.shape[0])

        self.assertEqual(dmd.n_features, dmd2.n_features)
Example #8
    @classmethod
    def prepare_dataset_for_score_quality(cls, dmd_train: DMD, dmd_test: DMD):
        '''
        :param dmd_train: train set
        :param dmd_test: test set
        :return: train/test split of the combined dataset, relabeled by origin (0 for train, 1 for test)
        '''

        dmd = DMD.concat([dmd_train, dmd_test])
        new_label = [0] * dmd_train.n_samples + [1] * dmd_test.n_samples
        dmd.set_target(new_label)

        train, test = dmd.split(ratio=dmd_test.n_samples / (dmd_train.n_samples + dmd_test.n_samples))
        return train, test
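prepare_dataset_for_score_quality relabels the concatenated samples by origin (0 for train, 1 for test), so a classifier trained on the result measures how separable the two sets are; high separability indicates covariate shift. A minimal standalone sketch of the same check (the data, model choice, and names are illustrative assumptions):

# standalone sketch of the train-vs-test separability check; names are illustrative
import numpy
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

rs = numpy.random.RandomState(0)
xtrain_orig = rs.randn(500, 5)
xtest_orig = rs.randn(200, 5) + 0.5  # deliberately shifted distribution

x = numpy.vstack([xtrain_orig, xtest_orig])
new_label = [0] * len(xtrain_orig) + [1] * len(xtest_orig)

xa, xb, ya, yb = train_test_split(x, new_label, test_size=0.3, random_state=0)
clf = RandomForestClassifier(random_state=0).fit(xa, ya)
auc = roc_auc_score(yb, clf.predict_proba(xb)[:, 1])
print('separability AUC:', round(auc, 3))  # ~0.5 = no shift, near 1.0 = strong shift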
Example #9
    def setUp(self) -> None:
        rs = numpy.random.RandomState(0)
        x = rs.randn(1000, 10)

        x[:, 0:5] = numpy.round(x[:, 0:5], 0)
        x[0, 0] = 10
        x[0, 9] = 10
        x[1, 9] = -4

        y = numpy.copy(x[:, 0])

        x[10, :] = numpy.nan
        x[:, 1] = numpy.nan
        x[:700, 2] = numpy.nan

        self.dataset = DMD(x=x,
                           y=y,
                           columns_meta={
                               DMD.FEATURE_TYPES:
                               5 * [FeatureTypes.categorical] +
                               5 * [FeatureTypes.numerical]
                           })
Example #10
    def _gen_data(self, seed, offset=0.):
        rs = numpy.random.RandomState(seed)
        x = rs.randn(1000, 10)

        x[:, 0:5] = numpy.round(x[:, 0:5], 0)
        x[0, 0] = 10
        x[0, 9] = 10
        x[1, 9] = -4

        x[:, 7] += offset
        y = numpy.copy(x[:, 0])

        x[10, :] = numpy.nan
        x[:, 1] = numpy.nan
        x[:700, 2] = numpy.nan

        return DMD(x=x,
                   y=y,
                   columns_meta={
                       DMD.FEATURE_TYPES:
                       5 * [FeatureTypes.categorical] +
                       5 * [FeatureTypes.numerical]
                   })
Example #11
    def __init__(self,
                 model,
                 xtrain=None,
                 ytrain=None,
                 sample_meta_train: dict = None,
                 xtest=None,
                 ytest=None,
                 sample_meta_test: dict = None,
                 columns_meta: dict = None,
                 feature_names: list = None,
                 feature_types: list = None,
                 categorical_encoding: dict = None,
                 metric: [str, Metric] = None,
                 splitter: str = 'shuffled',
                 target_labels: dict = None):
        """

        :param model: Model trained on training data provided

        :param xtrain: X training data. if DMD is provided, ytrain and any additional metadata is ignored.
        :param ytrain: Y training data.
        :param sample_meta_train: generic way to provide meta information on each sample in train data (e.g. sample weight) {key : [list of values]}.

        :param xtest: X test data. if DMD is provided, ytest and any additional metadata is ignored..
        :param ytest: Y test data. if DMD is provided,
        :param sample_meta_test: generic way to provide meta information on each sample in test data (e.g. sample weight) {key : [list of values]}.

        :param columns_meta: generic way to provide meta information on each feature (e.g. feature name) {key : [list of values]}.
        :param feature_names: feature name for each feature
        :param feature_types: feature type for each feature: NUMERICAL or CATEGORICAL
        :param categorical_encoding: For each column of categorical feature type, provide a dictionary of the structure
        {feature_names: {index: class name}}. This information will allow providing more readable reports.

        :param metric: Target metric
        :param splitter: Splitter
        :param target_labels: categorical encoding for target variable in the format of {index: class name}.
        """
        self.model = model

        if isinstance(splitter, str):
            if splitter == 'shuffled':
                splitter = ShuffleSplitter
            elif splitter == 'stratified':
                splitter = StratifiedSplitter
            else:
                raise NotImplementedError(
                    "splitter='{}' is not supported".format(splitter))
        else:
            # custom splitter objects are not supported yet; only the string options above are accepted
            if not hasattr(splitter, 'split'):
                raise ValueError(
                    "splitter='{}' does not support the split() operation".
                    format(splitter))
            else:
                raise NotImplementedError(
                    "splitter='{}' is not supported".format(splitter))

        if ytrain is not None:
            shape = getattr(ytrain, 'shape', (1, 1))
            if len(shape) == 2 and shape[1] > 1:
                raise NotImplementedError(
                    "Pytrust does not support multilabel (ytrain.shape[1]>1) analysis. "
                    "In order to use the Pytolemaic package, please wrap your model so that model.predict(X) returns a single vector."
                )
        if ytest is not None:
            shape = getattr(ytest, 'shape', (1, 1))
            if len(shape) == 2 and shape[1] > 1:
                raise NotImplementedError(
                    "Pytrust does not support multilabel (ytest.shape[1]>1) analysis. "
                    "In order to use the Pytolemaic package, please wrap your model so that model.predict(X) returns a single vector."
                )

        self.train = xtrain
        if self.train is not None and not isinstance(self.train, DMD):
            self.train = DMD(x=xtrain,
                             y=ytrain,
                             samples_meta=sample_meta_train,
                             columns_meta=columns_meta,
                             feature_names=feature_names,
                             feature_types=feature_types,
                             categorical_encoding=categorical_encoding,
                             splitter=splitter,
                             target_labels=target_labels)

        self.test = xtest
        if self.test is not None and not isinstance(self.test, DMD):
            self.test = DMD(x=xtest,
                            y=ytest,
                            samples_meta=sample_meta_test,
                            columns_meta=columns_meta,
                            feature_names=feature_names,
                            feature_types=feature_types,
                            categorical_encoding=categorical_encoding,
                            splitter=splitter,
                            target_labels=target_labels)

        if metric is None:
            if GeneralUtils.is_classification(model):
                metric = Metrics.recall
            else:
                metric = Metrics.mae

        self.metric = metric.name if isinstance(metric, Metric) else metric

        # todo
        self._validate_input()

        self.sensitivity = SensitivityAnalysis()

        self._uncertainty_models = {}
        self.covariance_shift = None
        self._cache = {}
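A minimal usage sketch for the constructor above, assuming it belongs to pytolemaic's PyTrust class; the import path, class name, and the scikit-learn model are assumptions based on the parameter list, not taken from the snippet:

# hypothetical usage sketch; the PyTrust import path is an assumption
import numpy
from sklearn.ensemble import RandomForestRegressor
from pytolemaic import PyTrust

rs = numpy.random.RandomState(0)
x = rs.rand(500, 4)
y = x[:, 0] + 0.1 * rs.randn(500)
xtrain, xtest, ytrain, ytest = x[:400], x[400:], y[:400], y[400:]

model = RandomForestRegressor(random_state=0).fit(xtrain, ytrain)
pytrust = PyTrust(model=model,
                  xtrain=xtrain, ytrain=ytrain,
                  xtest=xtest, ytest=ytest,
                  metric='mae',          # matches the regression default chosen above
                  splitter='shuffled')   # one of the two supported string options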
Example #12
    def sensitivity_analysis(self,
                             model,
                             dmd_test: DMD,
                             metric,
                             dmd_train=None,
                             method=SensitivityTypes.shuffled,
                             raw_scores=False,
                             y_pred=None) -> SensitivityOfFeaturesReport:

        self.model_support_dmd = GeneralUtils.dmd_supported(model, dmd_test)
        x = dmd_test if self.model_support_dmd else dmd_test.values

        y_pred = y_pred if y_pred is not None else model.predict(x)  # 'or' would fail on numpy arrays
        ytest = dmd_test.target

        score_function = self.metrics[metric].function
        if metric in ['auc', 'logloss'] and ytest is not None:
            base_score = score_function(ytest, model.predict_proba(x))
            y_pred = ytest
        else:
            base_score = 0

        predict_function = (model.predict_proba
                            if self.metrics[metric].is_proba
                            else model.predict)

        scores = {}
        for i, name in enumerate(dmd_test.feature_names):
            if dmd_test.n_samples > self.max_samples_to_use:
                rs = numpy.random.RandomState(i)
                subset = rs.permutation(
                    dmd_test.n_samples)[:self.max_samples_to_use]
                dmd_test_ = dmd_test.split_by_indices(subset)
                y_pred_ = y_pred[subset]
            else:
                dmd_test_ = dmd_test
                y_pred_ = y_pred

            shuffled_x = self.get_shuffled_x(
                dmd_test_,
                i,
                dmd_train=dmd_train,
                method=method,
                model_support_dmd=self.model_support_dmd)
            shuffled_pred = predict_function(shuffled_x)

            if base_score > 0:
                scores[name] = 1 - abs(
                    base_score - score_function(y_pred_, shuffled_pred)
                )  # a larger difference means more impact, hence the "1 -" in front
            else:
                scores[name] = score_function(
                    y_pred_, shuffled_pred)  # a higher score means less impact

        if raw_scores:
            # description = "The raw scores of how each feature affects the model's predictions."
            return SensitivityOfFeaturesReport(
                method=method,
                sensitivities=scores,
                stats_report=self._sensitivity_stats_report(scores))

        # a higher score / lower loss means the shuffled feature had less impact
        if self.metrics[metric].is_loss:
            impact = scores
        else:
            impact = {name: 1 - score for name, score in scores.items()}

        total_impact = sum(impact.values())
        impact = {
            name: float(score / total_impact)
            for name, score in impact.items()
        }
        impact = GeneralUtils.round_values(impact)

        # description="The impact of each feature on model's predictions. "
        #             "Higher value mean larger impact (0 means no impact at all). "
        #             "Values are normalized to 1.")
        return SensitivityOfFeaturesReport(
            method=method,
            sensitivities=impact,
            stats_report=self._sensitivity_stats_report(sensitivities=impact))
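The tail of sensitivity_analysis turns the raw per-feature scores into impacts that are normalized to sum to 1. A tiny worked example of that step, with made-up scores for a non-loss metric:

# illustrative normalization of raw sensitivity scores; the values are made up
scores = {'f_0': 0.2, 'f_1': 0.9, 'f_2': 0.5}

# for a non-loss metric, a higher score means less impact, so invert first
impact = {name: 1 - score for name, score in scores.items()}  # ~{'f_0': 0.8, 'f_1': 0.1, 'f_2': 0.5}
total_impact = sum(impact.values())                           # ~1.4
impact = {name: score / total_impact for name, score in impact.items()}
print(impact)  # f_0 ~ 0.571, f_1 ~ 0.071, f_2 ~ 0.357 (sums to 1)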