def as_dmd(self):
    """Package the stored train/test arrays as a pair of DMD objects.

    :return: (train_dmd, test_dmd)
    """

    def _wrap(xy_pair):
        # xy_pair is an (x, y) tuple; column metadata is rebuilt per DMD
        return DMD(x=xy_pair[0],
                   y=xy_pair[1],
                   samples_meta=None,
                   columns_meta={DMD.FEATURE_NAMES: self.column_names(),
                                 DMD.FEATURE_TYPES: self.feature_types()})

    return _wrap(self.training_data), _wrap(self.testing_data)
def get_shuffled_x(cls, dmdx: DMD, index=None, dmd_train=None,
                   method=SensitivityTypes.shuffled, seed=0,
                   model_support_dmd=False):
    """Return a copy of dmdx's feature matrix with feature `index` perturbed.

    :param dmdx: dataset whose feature matrix is perturbed.
    :param index: column index to perturb; if None the matrix is returned untouched.
    :param dmd_train: training data forwarded to shuffle_x (used for shuffling source values).
    :param method: SensitivityTypes.shuffled (permute column) or
                   SensitivityTypes.missing (set column to NaN).
    :param seed: base random seed; combined with `index` so each feature
                 gets a distinct, reproducible shuffle.
    :param model_support_dmd: if True, wrap the perturbed matrix back into a
                              DMD carrying the original metadata.
    :return: numpy array, or DMD when model_support_dmd is True.
    """
    if index is None:
        return dmdx.values

    x_copy = numpy.copy(dmdx.values)
    if method == SensitivityTypes.shuffled:
        x_copy = cls.shuffle_x(x_copy, dmd_train=dmd_train, index=index,
                               seed=index + seed)
    if method == SensitivityTypes.missing:
        # BUGFIX: was `np.nan` — the rest of this code uses the `numpy`
        # alias; `np` is not a name in scope here.
        x_copy[:, index] = numpy.nan

    if model_support_dmd:
        # preserve sample/column metadata so DMD-aware models get a full object
        return DMD(x=x_copy,
                   samples_meta=dmdx._samples_meta,
                   columns_meta=dmdx._columns_meta,
                   splitter=dmdx.splitter)
    else:
        return x_copy
def predict(self, dmd: DMD):
    """Run the wrapped model's predict, adapting the input to what it supports.

    DMD-aware models receive a DMD (raw arrays are wrapped); plain models
    receive a bare array (DMDs are unwrapped via .values).
    """
    if self.dmd_supported:
        wrapped = dmd if isinstance(dmd, DMD) else DMD(x=dmd)
        return self.model.predict(wrapped)

    x = dmd.values if isinstance(dmd, DMD) else dmd
    return self.model.predict(x)
def get_data(self, is_classification, seed=0):
    """Build a small synthetic DMD dataset with known feature importances."""
    rng = numpy.random.RandomState(seed)
    x = rng.rand(200, 3)
    x[:, 1] = 0
    # 1st is double importance, 2nd has no importance
    y = numpy.sum(x, axis=1) + 2 * x[:, 0]
    if is_classification:
        # discretize the regression target into integer classes
        y = numpy.round(y, 0).astype(int)

    names = ['f_' + str(k) for k in range(x.shape[1])]
    return DMD(x=x, y=y, columns_meta={DMD.FEATURE_NAMES: names})
def get_data(self, is_classification):
    """Build a random 1000x10 DMD dataset with per-sample weights."""
    x = numpy.random.rand(1000, 10)
    # 1st is double importance, 2nd has no importance
    y = self._func(x, is_classification=is_classification)

    names = ['f_' + str(k) for k in range(x.shape[1])]
    weights = numpy.random.rand(x.shape[0])
    return DMD(x=x,
               y=y,
               columns_meta={DMD.FEATURE_NAMES: names},
               samples_meta={'sample_weight': weights})
def as_dmd(self):
    """Package train/test arrays as DMDs, including labels and encodings.

    :return: (train_dmd, test_dmd)
    """

    def _wrap(xy_pair):
        # xy_pair is an (x, y) tuple; metadata is rebuilt per DMD
        return DMD(x=xy_pair[0],
                   y=xy_pair[1],
                   samples_meta=None,
                   columns_meta={DMD.FEATURE_NAMES: self.column_names(),
                                 DMD.FEATURE_TYPES: self.feature_types()},
                   target_labels=self.labels,
                   categorical_encoding=self.categorical_encoding)

    return _wrap(self.training_data), _wrap(self.testing_data)
def test_concat(self):
    """Concatenating two equally-sized DMDs along axis 0 doubles the samples."""
    regression_dmd = self.get_data(is_classification=False)
    classification_dmd = self.get_data(is_classification=True)
    self.assertEqual(regression_dmd.n_samples, classification_dmd.n_samples)

    combined = DMD.concat([regression_dmd, classification_dmd], axis=0)

    # sample count doubles; internal x / y / samples_meta stay aligned
    self.assertEqual(combined.n_samples, 2 * classification_dmd.n_samples)
    self.assertEqual(combined._x.shape[0], 2 * regression_dmd._y.shape[0])
    self.assertEqual(combined._x.shape[0],
                     2 * regression_dmd._samples_meta.shape[0])
    # feature count is unchanged
    self.assertEqual(combined.n_features, classification_dmd.n_features)
def prepare_dataset_for_score_quality(cls, dmd_train: DMD, dmd_test: DMD):
    '''
    :param dmd_train: train set
    :param dmd_test: test set
    :return: dataset with target of test/train
    '''
    combined = DMD.concat([dmd_train, dmd_test])

    # relabel target: 0 = came from train set, 1 = came from test set
    origin_label = [0] * dmd_train.n_samples + [1] * dmd_test.n_samples
    combined.set_target(origin_label)

    # split back with the same train/test proportion as the inputs
    n_total = dmd_train.n_samples + dmd_test.n_samples
    train, test = combined.split(ratio=dmd_test.n_samples / n_total)
    return train, test
def setUp(self) -> None:
    """Build a fixture dataset with categorical/numerical columns,
    a few outliers, and several patterns of missing values."""
    rng = numpy.random.RandomState(0)
    x = rng.randn(1000, 10)
    # first five columns are rounded to integers (treated as categorical)
    x[:, 0:5] = numpy.round(x[:, 0:5], 0)
    # inject outliers
    x[0, 0] = 10
    x[0, 9] = 10
    x[1, 9] = -4
    # target copies feature 0 (captured before NaNs are injected below)
    y = numpy.copy(x[:, 0])
    # missing values: one whole row, one whole column, one partial column
    x[10, :] = numpy.nan
    x[:, 1] = numpy.nan
    x[:700, 2] = numpy.nan

    types = 5 * [FeatureTypes.categorical] + 5 * [FeatureTypes.numerical]
    self.dataset = DMD(x=x, y=y, columns_meta={DMD.FEATURE_TYPES: types})
def _gen_data(self, seed, offset=0.):
    """Generate a seeded synthetic DMD; `offset` shifts column 7 to
    simulate distribution drift between datasets."""
    rng = numpy.random.RandomState(seed)
    x = rng.randn(1000, 10)
    # first five columns rounded to integers (categorical)
    x[:, 0:5] = numpy.round(x[:, 0:5], 0)
    # inject outliers
    x[0, 0] = 10
    x[0, 9] = 10
    x[1, 9] = -4
    # shift one numerical column by the requested offset
    x[:, 7] += offset
    # target copies feature 0 (before NaNs are injected below)
    y = numpy.copy(x[:, 0])
    # missing values: one whole row, one whole column, one partial column
    x[10, :] = numpy.nan
    x[:, 1] = numpy.nan
    x[:700, 2] = numpy.nan

    types = 5 * [FeatureTypes.categorical] + 5 * [FeatureTypes.numerical]
    return DMD(x=x, y=y, columns_meta={DMD.FEATURE_TYPES: types})
def __init__(self, model,
             xtrain=None, ytrain=None, sample_meta_train: dict = None,
             xtest=None, ytest=None, sample_meta_test: dict = None,
             columns_meta: dict = None,
             feature_names: list = None,
             feature_types: list = None,
             categorical_encoding: dict = None,
             metric: [str, Metric] = None,
             splitter: str = 'shuffled',
             target_labels: dict = None):
    """
    :param model: Model trained on training data provided
    :param xtrain: X training data. if DMD is provided, ytrain and any additional metadata is ignored.
    :param ytrain: Y training data.
    :param sample_meta_train: generic way to provide meta information on each sample in train data (e.g. sample weight) {key : [list of values]}.
    :param xtest: X test data. if DMD is provided, ytest and any additional metadata is ignored.
    :param ytest: Y test data.
    :param sample_meta_test: generic way to provide meta information on each sample in test data (e.g. sample weight) {key : [list of values]}.
    :param columns_meta: generic way to provide meta information on each feature (e.g. feature name) {key : [list of values]}.
    :param feature_names: feature name for each feature
    :param feature_types: feature type for each feature: NUMERICAL or CATEGORICAL
    :param categorical_encoding: For each column of categorical feature type, provide a dictionary of the structure {feature_names: {index: class name}}. This information will allow providing more readable reports.
    :param metric: Target metric
    :param splitter: Splitter - 'shuffled', 'stratified', or any object exposing a split() method.
    :param target_labels: categorical encoding for target variable in the format of {index: class name}.
    """
    self.model = model

    # Resolve the splitter: named strings map to built-in splitters;
    # anything else must quack like a splitter (expose split()).
    if isinstance(splitter, str):
        if splitter == 'shuffled':
            splitter = ShuffleSplitter
        elif splitter == 'stratified':
            splitter = StratifiedSplitter
        else:
            raise NotImplementedError(
                "splitter='{}' is not supported".format(splitter))
    elif not hasattr(splitter, 'split'):
        raise ValueError(
            "splitter='{}' does not supported split() operation".format(
                splitter))
    # BUGFIX: previously a non-string splitter raised unconditionally
    # (NotImplementedError even when it *did* have split()), so custom
    # splitter objects could never be used. A splitter with split() is
    # now accepted as-is.

    def _reject_multilabel(y, name):
        # Pytrust supports only single-output targets.
        if y is None:
            return
        shape = getattr(y, 'shape', (1, 1))
        if len(shape) == 2 and shape[1] > 1:
            raise NotImplementedError(
                "Pytrust does not support multilabel ({}.shape[1]>1) analysis. "
                "In order to use Pytolemaic package, please wrap you model so model.predict(X) will return a single vector. ".format(name))

    _reject_multilabel(ytrain, 'ytrain')
    _reject_multilabel(ytest, 'ytest')

    # Wrap raw train/test arrays in DMD; pass pre-built DMDs through.
    self.train = xtrain
    if self.train is not None and not isinstance(self.train, DMD):
        self.train = DMD(x=xtrain, y=ytrain,
                         samples_meta=sample_meta_train,
                         columns_meta=columns_meta,
                         feature_names=feature_names,
                         feature_types=feature_types,
                         categorical_encoding=categorical_encoding,
                         splitter=splitter,
                         target_labels=target_labels)

    self.test = xtest
    if self.test is not None and not isinstance(self.test, DMD):
        self.test = DMD(x=xtest, y=ytest,
                        samples_meta=sample_meta_test,
                        columns_meta=columns_meta,
                        feature_names=feature_names,
                        feature_types=feature_types,
                        categorical_encoding=categorical_encoding,
                        splitter=splitter,
                        target_labels=target_labels)

    # Default metric depends on the task type.
    if metric is None:
        if GeneralUtils.is_classification(model):
            metric = Metrics.recall
        else:
            metric = Metrics.mae
    self.metric = metric.name if isinstance(metric, Metric) else metric

    # todo
    self._validate_input()

    self.sensitivity = SensitivityAnalysis()
    self._uncertainty_models = {}
    self.covariance_shift = None
    self._cache = {}
def sensitivity_analysis(self, model, dmd_test: DMD, metric,
                         dmd_train=None, method=SensitivityTypes.shuffled,
                         raw_scores=False,
                         y_pred=None) -> SensitivityOfFeaturesReport:
    """Estimate per-feature sensitivity by perturbing one feature at a time
    and measuring how much the model's predictions change.

    :param model: trained model exposing predict (and predict_proba for proba metrics).
    :param dmd_test: evaluation dataset.
    :param metric: key into self.metrics selecting the scoring function.
    :param dmd_train: optional training data forwarded to get_shuffled_x.
    :param method: perturbation strategy (SensitivityTypes.shuffled / missing).
    :param raw_scores: if True, return unnormalized per-feature scores.
    :param y_pred: optional precomputed predictions on dmd_test (avoids a predict call).
    :return: SensitivityOfFeaturesReport with per-feature sensitivities.
    """
    self.model_support_dmd = GeneralUtils.dmd_supported(model, dmd_test)
    x = dmd_test if self.model_support_dmd else dmd_test.values

    # BUGFIX: was `y_pred = y_pred or model.predict(x)` — for a numpy
    # array `y_pred`, truth-testing raises "The truth value of an array
    # with more than one element is ambiguous". Test for None explicitly.
    if y_pred is None:
        y_pred = model.predict(x)

    ytest = dmd_test.target
    score_function = self.metrics[metric].function

    if metric in ['auc', 'logloss'] and ytest is not None:
        # proba-based metrics are scored against the true labels
        base_score = score_function(ytest, model.predict_proba(x))
        y_pred = ytest
    else:
        base_score = 0

    predict_function = model.predict_proba \
        if self.metrics[metric].is_proba else model.predict

    scores = {}
    for i, name in enumerate(dmd_test.feature_names):
        if dmd_test.n_samples > self.max_samples_to_use:
            # deterministic subsample, seeded per feature index
            rs = numpy.random.RandomState(i)
            subset = rs.permutation(
                dmd_test.n_samples)[:self.max_samples_to_use]
            dmd_test_ = dmd_test.split_by_indices(subset)
            y_pred_ = y_pred[subset]
        else:
            dmd_test_ = dmd_test
            y_pred_ = y_pred

        shuffled_x = self.get_shuffled_x(
            dmd_test_, i, dmd_train=dmd_train, method=method,
            model_support_dmd=self.model_support_dmd)
        shuffled_pred = predict_function(shuffled_x)

        if base_score > 0:
            # higher difference - more impact so add 1- in front
            scores[name] = 1 - abs(
                base_score - score_function(y_pred_, shuffled_pred))
        else:
            scores[name] = score_function(
                y_pred_, shuffled_pred)  # higher score - less impact

    if raw_scores:
        # The raw scores of how each feature affects the model's predictions.
        return SensitivityOfFeaturesReport(
            method=method,
            sensitivities=scores,
            stats_report=self._sensitivity_stats_report(scores))

    # higher score / lower loss means the shuffled feature did less impact
    if self.metrics[metric].is_loss:
        impact = scores
    else:
        impact = {name: 1 - score for name, score in scores.items()}

    # normalize impacts so they sum to 1
    total_impact = sum(impact.values())
    impact = {name: float(score / total_impact)
              for name, score in impact.items()}
    impact = GeneralUtils.round_values(impact)

    # The impact of each feature on model's predictions.
    # Higher value mean larger impact (0 means no impact at all).
    # Values are normalized to 1.
    return SensitivityOfFeaturesReport(
        method=method,
        sensitivities=impact,
        stats_report=self._sensitivity_stats_report(sensitivities=impact))