class KDE():
    """Kernel density estimator fitted in a transformed feature space.

    The training data is first passed through ``space_transformer`` and the
    density is then estimated in that transformed space with one of three
    back ends (``'sklearn'``, ``'scipy'`` or ``'awkde'``). Samples drawn from
    the estimator are mapped back with the transformer's
    ``inverse_transform``.

    Parameters
    ----------
    bw : str or float
        Either one of ``AVALIBLE_BW_METHODS`` or an explicit bandwidth value.
        An explicit value is interpreted in the *transformed* space.
    space_transformer : class, instance or None
        If a class, it is instantiated in ``fit`` with ``st_kws`` plus
        ``n_components`` and ``whiten=True``. If ``None``,
        ``IDENTITY_TRANSFORMER`` is used.
    implementation : {'sklearn', 'scipy', 'awkde'}
        Back end used for the density estimate.
    st_kws : dict or None
        Extra keyword arguments for the space transformer constructor.
    **kde_kws
        Extra keyword arguments for the sklearn / awkde estimator
        constructor (not used by the scipy back end).

    Raises
    ------
    TypeError
        If ``bw`` is neither a string nor a float.
    ValueError
        If ``bw`` is an unknown method name or ``implementation`` is not
        supported.
    """

    # NOTE: the attribute name keeps its historical spelling because it is
    # part of the public interface.
    AVALIBLE_BW_METHODS = [
        'ISJ', 'scott', 'silverman', 'mean_distance', 'std_distance',
        'median_distance'
    ]

    def __init__(self,
                 bw='std_distance',
                 space_transformer=PCA,
                 implementation='sklearn',
                 st_kws=None,
                 **kde_kws):
        # np.floating covers every NumPy float width; the former alias
        # ``np.float`` no longer exists in recent NumPy releases.
        if not isinstance(bw, (str, float, np.floating)):
            raise TypeError(f'bw should be str or float, not {bw.__class__}')
        # Explicit raise instead of ``assert`` so the check survives ``-O``.
        if isinstance(bw, str) and bw not in self.AVALIBLE_BW_METHODS:
            raise ValueError(
                f"if str, bw should be one of {self.AVALIBLE_BW_METHODS}, not {bw}"
            )
        if implementation not in ['scipy', 'sklearn', 'awkde']:
            raise ValueError(
                f'implementation should be one of ["sklearn","scipy","awkde"], not {implementation}'
            )

        self.bw = bw
        self._space_transformer = (space_transformer
                                   if space_transformer is not None else
                                   IDENTITY_TRANSFORMER)
        self.kde_kws = kde_kws
        # A fresh dict per instance avoids sharing one mutable default.
        self.st_kws = st_kws if st_kws is not None else {}
        self.implementation = implementation

    def _check_X_2d(self, X):
        """Return ``X`` as an array, reshaping 1-D input to ``(n_samples, 1)``."""
        X = np.array(X)
        if len(X.shape) > 1:
            return X
        return X.reshape(-1, 1)

    def _check_input_dims_match(self, X):
        """Raise ``ValueError`` if the last axis of ``X`` differs from ``self.n_dim``."""
        if X.shape[-1] != self.n_dim:
            raise ValueError(
                f'X dimensions space should be the same size as fitted distribution ({self.n_dim}), got {X.shape[-1]} instead'
            )

    def _get_bw_each_dim(self, X, bw_method):
        """Compute one bandwidth per column of ``X``.

        ``'ISJ'``, ``'scott'`` and ``'silverman'`` are delegated to
        ``kdepy.FFTKDE``; the ``*_distance`` methods call
        ``agg_smallest_distance`` with ``np.mean``, ``np.median`` or
        ``np.std`` respectively.

        Raises
        ------
        ValueError
            If ``bw_method`` is not one of ``AVALIBLE_BW_METHODS``.
        """
        n_dims = X.shape[-1]

        if bw_method in ('ISJ', 'scott', 'silverman'):
            return np.array([
                kdepy.FFTKDE(bw=bw_method).bw(X[:, i:i + 1])
                for i in range(n_dims)
            ])

        aggregators = {
            'mean_distance': np.mean,
            'median_distance': np.median,
            'std_distance': np.std,
        }
        if bw_method not in aggregators:
            # Previously fell through and returned None, which only failed
            # later with an unrelated TypeError.
            raise ValueError(
                f"if str, bw should be one of {self.AVALIBLE_BW_METHODS}, not {bw_method}"
            )
        aggregate = aggregators[bw_method]
        n_samples = X.shape[0]
        return np.array([
            agg_smallest_distance(X[:, i].reshape(1, n_samples, 1), aggregate)
            for i in range(n_dims)
        ])

    def _preprocess_fit(self, X):
        '''
        Preprocess data prior to fit.

        Ensures X is 2-D and has at least two rows (a single row is
        duplicated), then adds a small amount of noise via ``add_noise`` to
        avoid eigenvalue errors in the space transform.
        '''
        X = self._check_X_2d(X)
        if len(X) < 2:
            X = np.concatenate([X, X])
        X = add_noise(X, 1e-9)
        return X

    def fit(self, X, y=None, sample_weight=None):
        """Fit the space transformer and the density estimator on ``X``.

        Parameters
        ----------
        X : array-like
            Training data; 1-D input is treated as a single feature.
        y : ignored by this class
            Only forwarded to the sklearn estimator's ``fit``.
        sample_weight : array-like or None
            Per-sample weights forwarded to the selected back end.

        Returns
        -------
        self
        """
        X = self._preprocess_fit(X)

        # A transformer passed as a class is instantiated lazily here because
        # ``n_components`` depends on the data dimensionality.
        if isinstance(self._space_transformer, type):
            transformer_kws = {
                **self.st_kws,
                'n_components': X.shape[-1],
                'whiten': True,
            }
            self._space_transformer = self._space_transformer(
                **transformer_kws)
        X = self._space_transformer.fit_transform(X)

        if isinstance(self.bw, str):
            # Collapse the per-dimension bandwidths into a single scalar
            # (their Euclidean norm).
            per_dim_bw = self._get_bw_each_dim(X, self.bw)
            bw = np.sqrt(np.sum(per_dim_bw**2))
        else:
            warn(
                'passing a float value for bw is not recomended since X will be transformed by space_transformer before fitting and bw value may not make sence in new trnasformed space'
            )
            bw = self.bw
        # ensure bw is strictly positive
        bw = max(1e-6, bw)

        if self.implementation == 'sklearn':
            # kde_kws comes last so callers can override the bandwidth.
            self.estimator = KernelDensity(**{
                'bandwidth': bw,
                **self.kde_kws
            }).fit(X, y, sample_weight=sample_weight)
        elif self.implementation == 'scipy':
            # Forward weights only when supplied so the call is unchanged
            # for callers that never passed sample_weight.
            scipy_kws = {}
            if sample_weight is not None:
                scipy_kws['weights'] = sample_weight
            self.estimator = stats.gaussian_kde(X.T,
                                                bw_method=bw,
                                                **scipy_kws)
        elif self.implementation == 'awkde':
            self.estimator = awkde.GaussianKDE(**{
                'glob_bw': bw,
                **self.kde_kws
            })
            self.estimator.fit(X=X, weights=sample_weight)
        else:
            raise ValueError(
                f'self.implementation should be one of ["sklearn","scipy","awkde"], not {self.implementation}'
            )

        self._transformed_bw_value = bw
        self.n_dim = X.shape[-1]
        return self

    def evaluate(self, data):
        """Return the estimated density at each row of ``data``.

        ``data`` is transformed with the fitted space transformer before the
        back end is queried.

        Raises
        ------
        ValueError
            If the transformed data does not have ``self.n_dim`` columns.
        """
        data = self._check_X_2d(data)
        data = self._space_transformer.transform(data)
        self._check_input_dims_match(data)

        if self.implementation == 'sklearn':
            # score_samples returns log-density
            likelihood = np.exp(self.estimator.score_samples(data))
        elif self.implementation == 'scipy':
            likelihood = self.estimator.pdf(data.T)
        elif self.implementation == 'awkde':
            likelihood = self.estimator.predict(data)
        else:
            raise ValueError(
                f'self.implementation should be one of ["sklearn","scipy","awkde"], not {self.implementation}'
            )
        return likelihood

    def predict(self, X):
        """Alias of :meth:`evaluate`."""
        return self.evaluate(X)

    def pdf(self, data):
        """Alias of :meth:`evaluate`."""
        return self.evaluate(data)

    def rvs(self, size=1, random_state=None):
        """Draw ``size`` samples and map them back to the original space."""
        if self.implementation in ('sklearn', 'awkde'):
            samples = self.estimator.sample(n_samples=size,
                                            random_state=random_state)
        elif self.implementation == 'scipy':
            # scipy returns (n_dims, n_samples); transpose to rows = samples
            samples = self.estimator.resample(size, random_state).T
        else:
            raise ValueError(
                f'self.implementation should be one of ["sklearn","scipy","awkde"], not {self.implementation}'
            )
        # inverse transform samples
        return self._space_transformer.inverse_transform(samples)

    def sample(self, sample_size=1, random_state=None):
        """Alias of :meth:`rvs` using the ``sample_size`` keyword."""
        return self.rvs(sample_size, random_state)

    def entropy(self, sample_size=100):
        """Estimate entropy in bits as the mean of ``-log2(pdf)`` over drawn samples."""
        drawn = self.rvs(size=sample_size)
        return np.mean(-np.log2(self.evaluate(drawn)))

    def cdf(self, data, sample_size=1000):
        """Estimate the CDF at ``data`` from ``sample_size`` drawn samples.

        The computation is delegated to ``_quantile``; both arrays get a
        leading axis of length 1 because that helper expects it.
        """
        data = np.asarray(data)
        samples = self.sample(sample_size=sample_size)
        # fix shape in order to work with _quantile
        samples = samples.reshape(1, *samples.shape)
        return _quantile(data.reshape(1, *data.shape), samples)

    def ppf(self, data, sample_size=100):
        """Estimate the inverse CDF at probabilities ``data``.

        Estimated by sampling and ``QuantileTransformer`` since integration
        is too costly.

        Raises
        ------
        ValueError
            If ``data`` contains values outside ``[0, 1]``.
        """
        data = np.array(data)
        if not ((data.min() >= 0) and (data.max() <= 1)):
            raise ValueError('data contains values < 0 or > 1')
        samples = self.sample(sample_size=sample_size)
        transformer = QuantileTransformer(
            n_quantiles=min(1000, samples.shape[0])).fit(samples)
        return transformer.inverse_transform(data)

    def _make_conditioning_grid(self, condition_dict=None, resolution=None):
        """Build a ``(resolution, n_dims)`` grid for conditioning.

        Free dimensions span the range of the sampled points that carry the
        top ~99% of the sampled likelihood mass; dimensions present in
        ``condition_dict`` (keyed by dimension index) are held constant at
        the given value.

        Parameters
        ----------
        condition_dict : dict or None
            Maps dimension index to the fixed value for that dimension.
        resolution : int or None
            Number of grid points per dimension; ``None`` falls back to 50,
            the default of ``np.linspace``.
        """
        if condition_dict is None:
            condition_dict = {}
        if resolution is None:
            resolution = 50

        # ``sample`` only returns the points, so the likelihoods have to be
        # evaluated separately.
        samples = self.sample(1000)
        likelihood = self.evaluate(samples)

        # estimate min and max intervals, ignoring points with low likelihood
        order = np.argsort(likelihood)[::-1]
        keep = likelihood[order].cumsum() < 0.99 * likelihood.sum()
        kept_idx = order[keep]
        grid_min = samples[kept_idx].min(axis=0)
        grid_max = samples[kept_idx].max(axis=0)

        dim_grid = []
        for dim in range(grid_min.shape[0]):
            if dim in condition_dict:
                fixed = condition_dict[dim]
                dim_grid.append(np.linspace(fixed, fixed, resolution))
            else:
                dim_grid.append(
                    np.linspace(grid_min[dim], grid_max[dim], resolution))
        return np.array(dim_grid).T