# Example #1
class KDE():
    '''
    Kernel density estimator with a pluggable backend and an optional
    space transformation applied before fitting.

    Data is first passed through ``space_transformer`` (default: PCA with
    whitening), a single scalar bandwidth is estimated in the transformed
    space, and one of three KDE backends is fitted: sklearn's
    ``KernelDensity``, scipy's ``gaussian_kde`` or awkde's ``GaussianKDE``.
    '''

    # bandwidth-selection methods accepted as the ``bw`` string argument
    # (name kept with its original misspelling for backward compatibility)
    AVALIBLE_BW_METHODS = [
        'ISJ', 'scott', 'silverman', 'mean_distance', 'std_distance',
        'median_distance'
    ]
    # correctly-spelled alias for new code
    AVAILABLE_BW_METHODS = AVALIBLE_BW_METHODS

    def __init__(self,
                 bw='std_distance',
                 space_transformer=PCA,
                 implementation='sklearn',
                 st_kws=None,
                 **kde_kws):
        '''
        Parameters
        ----------
        bw : str or float
            Bandwidth-selection method (one of ``AVALIBLE_BW_METHODS``) or a
            fixed bandwidth value.
        space_transformer : class, transformer instance or None
            Transformer applied to the data before the KDE fit. ``None``
            means the identity transform. A class is instantiated lazily
            in :meth:`fit`.
        implementation : str
            KDE backend: ``'sklearn'``, ``'scipy'`` or ``'awkde'``.
        st_kws : dict, optional
            Keyword arguments for the space-transformer constructor.
        **kde_kws
            Extra keyword arguments forwarded to the KDE backend.

        Raises
        ------
        ValueError
            If ``bw`` is an unknown method name or ``implementation`` is
            not supported.
        TypeError
            If ``bw`` is neither a str nor a float.
        '''
        # None default avoids the shared-mutable-default-argument pitfall
        st_kws = {} if st_kws is None else st_kws
        if isinstance(bw, str):
            if bw not in self.AVALIBLE_BW_METHODS:
                raise ValueError(
                    f"if str, bw should be one of {self.AVALIBLE_BW_METHODS}, not {bw}"
                )
        # np.floating covers np.float64/np.float32; the old check used
        # np.float, which was removed from NumPy (1.20 deprecation, 1.24 removal)
        elif not isinstance(bw, (float, np.floating)):
            raise TypeError(f'bw should be str or float, not {bw.__class__}')
        self.bw = bw
        self._space_transformer = (space_transformer
                                   if space_transformer is not None else
                                   IDENTITY_TRANSFORMER)
        self.kde_kws = kde_kws
        self.st_kws = st_kws
        if implementation not in ['scipy', 'sklearn', 'awkde']:
            raise ValueError(
                f'implementation should be one of ["sklearn","scipy","awkde"], not {implementation}'
            )

        self.implementation = implementation

    def _check_X_2d(self, X):
        '''Coerce X to a 2-D array, reshaping (n_samples,) input to (n_samples, 1).'''
        X = np.array(X)
        if X.ndim < 2:
            X = X.reshape(-1, 1)
        return X

    def _check_input_dims_match(self, X):
        '''Raise ValueError if X's last dimension differs from the fitted one.'''
        if X.shape[-1] != self.n_dim:
            raise ValueError(
                f'X dimensions space should be the same size as fitted distribution ({self.n_dim}), got {X.shape[-1]} instead'
            )

    def _get_bw_each_dim(self, X, bw_method):
        '''
        Estimate a bandwidth for each column of X independently.

        Returns
        -------
        np.ndarray of shape (n_dims,)

        Raises
        ------
        ValueError
            If ``bw_method`` is unknown (the old code silently returned None).
        '''
        if bw_method in ['ISJ', 'scott', 'silverman']:
            # delegate to KDEpy's classical bandwidth selectors, per column
            return np.array([
                kdepy.FFTKDE(bw=bw_method).bw(X[:, i:i + 1])
                for i in range(X.shape[-1])
            ])
        # distance-based selectors: aggregate nearest-point spacings with
        # the matching reduction (table replaces three copy-pasted branches)
        agg_funcs = {
            'mean_distance': np.mean,
            'median_distance': np.median,
            'std_distance': np.std,
        }
        if bw_method in agg_funcs:
            agg = agg_funcs[bw_method]
            return np.array([
                agg_smallest_distance(X[:, i].reshape(1, X.shape[0], 1), agg)
                for i in range(X.shape[-1])
            ])
        # unreachable when bw was validated in __init__; kept for safety
        raise ValueError(
            f'bw_method should be one of {self.AVALIBLE_BW_METHODS}, not {bw_method}'
        )

    def _preprocess_fit(self, X):
        '''
        Preprocess data prior to fit: ensure len >= 2 and add some white
        noise to avoid eigenvalue errors in the space transform.
        '''
        X = self._check_X_2d(X)
        if len(X) < 2:
            # duplicate the sample so downstream estimators can fit
            X = np.concatenate([X, X])
        X = add_noise(X, 1e-9)
        return X

    def fit(self, X, y=None, sample_weight=None):
        '''
        Fit the density estimator on X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or (n_samples,)
        y : ignored (sklearn API compatibility), forwarded to the backend.
        sample_weight : array-like, optional
            Per-sample weights, forwarded to the backend.

        Returns
        -------
        self
        '''
        X = self._preprocess_fit(X)
        # instantiate the space transformer lazily if a class was passed;
        # n_components/whiten intentionally override user-supplied st_kws
        if isinstance(self._space_transformer, type):
            self._space_transformer = self._space_transformer(**{
                **self.st_kws,
                'n_components': X.shape[-1],
                'whiten': True,
            })

        X = self._space_transformer.fit_transform(X)
        # bandwidth: estimate per dimension, then combine as the L2 norm
        if isinstance(self.bw, str):
            bw = self._get_bw_each_dim(X, self.bw)
            bw = float(np.sqrt(np.sum(bw**2)))
        else:
            warn(
                'passing a float value for bw is not recommended since X will be transformed by space_transformer before fitting and bw value may not make sense in new transformed space'
            )
            bw = self.bw

        # ensure bw is strictly positive
        bw = max(1e-6, bw)
        # fit the selected backend; user kde_kws may override the bandwidth
        if self.implementation == 'sklearn':
            self.estimator = KernelDensity(**{
                'bandwidth': bw,
                **self.kde_kws
            }).fit(X, y, sample_weight=sample_weight)
        elif self.implementation == 'scipy':
            # gaussian_kde expects (n_dims, n_samples); forward the weights
            # instead of silently dropping them like the other backends don't
            self.estimator = stats.gaussian_kde(X.T,
                                                bw_method=bw,
                                                weights=sample_weight)
        elif self.implementation == 'awkde':
            self.estimator = awkde.GaussianKDE(**{
                'glob_bw': bw,
                **self.kde_kws
            })
            self.estimator.fit(X=X, weights=sample_weight)
        else:
            raise ValueError(
                f'self.implementation should be one of ["sklearn","scipy","awkde"], not {self.implementation}'
            )

        self._transformed_bw_value = bw
        self.n_dim = X.shape[-1]
        return self

    def evaluate(self, data):
        '''
        Return the estimated density (likelihood) at each row of ``data``.

        Input is mapped through the fitted space transformer first.
        '''
        data = self._check_X_2d(data)
        data = self._space_transformer.transform(data)
        self._check_input_dims_match(data)
        if self.implementation == 'sklearn':
            # score_samples returns log-density
            likelihood = np.exp(self.estimator.score_samples(data))
        elif self.implementation == 'scipy':
            likelihood = self.estimator.pdf(data.T)
        elif self.implementation == 'awkde':
            likelihood = self.estimator.predict(data)
        else:
            raise ValueError(
                f'self.implementation should be one of ["sklearn","scipy","awkde"], not {self.implementation}'
            )

        return likelihood

    def predict(self, X):
        '''Alias of :meth:`evaluate` (sklearn-style API).'''
        return self.evaluate(X)

    def pdf(self, data):
        '''Alias of :meth:`evaluate` (scipy-style API).'''
        return self.evaluate(data)

    def rvs(self, size=1, random_state=None):
        '''
        Draw ``size`` samples from the fitted density and map them back to
        the original space via the transformer's inverse transform.
        '''
        if self.implementation == 'sklearn':
            samples = self.estimator.sample(n_samples=size,
                                            random_state=random_state)
        elif self.implementation == 'scipy':
            # resample returns (n_dims, n_samples); transpose to row-samples
            samples = self.estimator.resample(size, random_state).T
        elif self.implementation == 'awkde':
            samples = self.estimator.sample(n_samples=size,
                                            random_state=random_state)
        else:
            raise ValueError(
                f'self.implementation should be one of ["sklearn","scipy","awkde"], not {self.implementation}'
            )
        return self._space_transformer.inverse_transform(samples)

    def sample(self, sample_size=1, random_state=None):
        '''Alias of :meth:`rvs` (sklearn-style API).'''
        return self.rvs(sample_size, random_state)

    def entropy(self, sample_size=100):
        '''Monte-Carlo entropy estimate (in bits, base-2 log).'''
        return np.mean(-np.log2(self.evaluate(self.rvs(size=sample_size))))

    def cdf(self, data, sample_size=1000):
        '''Empirical CDF of ``data`` estimated from samples drawn from the KDE.'''
        data = np.asarray(data)  # robustness: accept plain array-likes
        samples = self.sample(sample_size=sample_size)
        # add a leading batch axis so shapes work with _quantile
        samples = samples.reshape(1, *samples.shape)
        return _quantile(data.reshape(1, *data.shape), samples)

    def ppf(self, data, sample_size=100):
        '''
        Approximate inverse CDF (quantile function) of the fitted density.

        Estimated via sampling + QuantileTransformer since direct numerical
        integration is too costly.

        Raises
        ------
        ValueError
            If ``data`` contains values outside [0, 1]. (The old ``assert``
            disappeared under ``python -O``.)
        '''
        data = np.array(data)
        if data.min() < 0 or data.max() > 1:
            raise ValueError('data contains values < 0 or > 1')
        samples = self.sample(sample_size=sample_size)
        return QuantileTransformer(n_quantiles=min(
            1000, samples.shape[0])).fit(samples).inverse_transform(data)

    def _make_conditioning_grid(self, condition_dict=None, resolution=None):
        '''
        Build a grid over the fitted space: dimensions listed in
        ``condition_dict`` are pinned to their given value; the remaining
        dimensions span the interval covering ~99% of the sampled
        likelihood mass.
        '''
        condition_dict = {} if condition_dict is None else condition_dict
        # estimate min and max intervals from samples.
        # BUG FIX: sample() returns only samples — the old code unpacked
        # two values and always raised; likelihoods must be computed here.
        samples = self.sample(1000)
        likelihood = self.evaluate(samples)
        argsrt = np.argsort(likelihood)[::-1]
        likelihood_msk = likelihood[argsrt].cumsum() < 0.99 * likelihood.sum()
        likelihood_msk = argsrt[likelihood_msk]
        # ignore points with low likelihood
        grid_min = samples[likelihood_msk].min(axis=0)
        grid_max = samples[likelihood_msk].max(axis=0)
        dim_grid = []
        for dim in range(grid_min.shape[0]):
            if dim in condition_dict:
                # pinned dimension: constant value repeated `resolution` times
                dim_grid.append(
                    np.linspace(condition_dict[dim], condition_dict[dim],
                                resolution))
            else:
                dim_grid.append(
                    np.linspace(grid_min[dim], grid_max[dim], resolution))
        return np.array(dim_grid).T