def test_fit_sample(self):
    model = GaussianKDE()
    model.fit(self.data)

    sampled_data = model.sample(50)

    assert isinstance(sampled_data, np.ndarray)
    assert sampled_data.shape == (50, )
def test_to_dict_sample_size(self):
    model = GaussianKDE(sample_size=10)
    model.fit(self.constant)

    params = model.to_dict()

    assert params['type'] == 'copulas.univariate.gaussian_kde.GaussianKDE'
    assert len(params['dataset']) == 10
def test_to_dict_constant(self):
    model = GaussianKDE()
    model.fit(self.constant)

    params = model.to_dict()

    assert params == {
        'type': 'copulas.univariate.gaussian_kde.GaussianKDE',
        'dataset': [5] * 100
    }
def test_fit_sample_distribution_dict(self):
    data = sample_trivariate_xyz()
    model = GaussianMultivariate(distribution={'x': GaussianKDE()})
    model.fit(data)

    sampled_data = model.sample(10)

    assert sampled_data.shape == (10, 3)
def test_save_load(self):
    model = GaussianKDE()
    model.fit(self.data)
    sampled_data = model.sample(50)

    path_to_model = os.path.join(self.test_dir.name, "model.pkl")
    model.save(path_to_model)
    model2 = GaussianKDE.load(path_to_model)

    pdf = model.probability_density(sampled_data)
    pdf2 = model2.probability_density(sampled_data)
    assert np.all(np.isclose(pdf, pdf2, atol=0.01))

    cdf = model.cumulative_distribution(sampled_data)
    cdf2 = model2.cumulative_distribution(sampled_data)
    assert np.all(np.isclose(cdf, cdf2, atol=0.01))
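# The fixtures referenced above (``self.data``, ``self.constant``, ``self.test_dir``) are
# assumed to be created in a ``setUp`` along these lines. This is an illustrative sketch,
# not the suite's actual fixture code: the distribution and size of ``self.data`` are
# assumptions, while ``self.constant`` must be one hundred 5s to match the expected
# ``[5] * 100`` dataset in ``test_to_dict_constant``. It also assumes ``numpy`` and
# ``tempfile`` are imported at module level.


def setUp(self):
    self.data = np.random.normal(size=1000)        # continuous fixture (assumed shape)
    self.constant = np.full(100, fill_value=5)     # constant fixture expected by to_dict tests
    self.test_dir = tempfile.TemporaryDirectory()  # scratch directory for save/load tests

def tearDown(self):
    self.test_dir.cleanup()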
def test_bimodal(self):
    """Suppose the data follows a bimodal distribution.

    The KS statistic should be larger for a Gaussian model than for a
    GaussianKDE model, since a single Gaussian cannot capture two modes.
    """
    kde_ks = ks_statistic(GaussianKDE(), self.bimodal_data)
    gaussian_ks = ks_statistic(GaussianUnivariate(), self.bimodal_data)

    assert kde_ks < gaussian_ks
def test_binary(self):
    """Suppose the data follows a Bernoulli distribution.

    The KS statistic should be larger for a TruncatedGaussian model than for a
    GaussianKDE model, since the KDE can somewhat capture a Bernoulli
    distribution, which resembles a bimodal distribution. As a result,
    ``select_univariate`` should pick the GaussianKDE.
    """
    model = select_univariate(self.binary_data, [GaussianKDE(), TruncatedGaussian()])

    assert isinstance(model, GaussianKDE)
def test_binary(self):
    """Suppose the data follows a Bernoulli distribution.

    The KS statistic should be larger for a TruncatedGaussian model than for a
    GaussianKDE model, since the KDE can somewhat capture a Bernoulli
    distribution, which resembles a bimodal distribution.
    """
    kde_ks = ks_statistic(GaussianKDE(), self.binary_data)
    truncated_ks = ks_statistic(TruncatedGaussian(), self.binary_data)

    assert kde_ks < truncated_ks
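# The ``ks_statistic`` helper used by the two tests above is defined elsewhere in the
# test suite. A minimal sketch of what it could look like, assuming it fits the model
# and compares the model CDF against the data with SciPy's KS test; the implementation
# below is an assumption, not the library's actual helper, and it requires
# ``from scipy.stats import kstest``.


def ks_statistic(model, data):
    """Fit ``model`` to ``data`` and return the Kolmogorov-Smirnov statistic.

    A smaller statistic means the fitted model's CDF tracks the empirical
    CDF of ``data`` more closely.
    """
    model.fit(data)
    statistic, _p_value = kstest(data, model.cumulative_distribution)
    return statistic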
def test_to_dict_from_dict(self):
    model = GaussianKDE()
    model.fit(self.data)
    sampled_data = model.sample(50)

    params = model.to_dict()
    model2 = GaussianKDE.from_dict(params)

    pdf = model.probability_density(sampled_data)
    pdf2 = model2.probability_density(sampled_data)
    assert np.all(np.isclose(pdf, pdf2, atol=0.01))

    cdf = model.cumulative_distribution(sampled_data)
    cdf2 = model2.cumulative_distribution(sampled_data)
    assert np.all(np.isclose(cdf, cdf2, atol=0.01))
def test_fit_sample_distribution_dict_multiple(self):
    data = sample_trivariate_xyz()
    model = GaussianMultivariate(distribution={
        'x': Univariate(parametric=ParametricType.PARAMETRIC),
        'y': BetaUnivariate(),
        'z': GaussianKDE()
    })
    model.fit(data)

    sampled_data = model.sample(10)

    assert sampled_data.shape == (10, 3)
def test_to_dict(self):
    """to_dict returns the internal parameters to replicate one instance."""
    # Setup
    instance = VineCopula('regular')
    instance.fitted = True
    instance.n_sample = 100
    instance.n_var = 10
    instance.depth = 3
    instance.truncated = 3

    tree = Tree('regular')
    instance.trees = [tree]

    uni = GaussianKDE()
    instance.unis = [uni]

    tau_mat = np.array([[0, 1], [1, 0]])
    instance.tau_mat = tau_mat

    u_matrix = np.array([[0, 1], [1, 0]])
    instance.u_matrix = u_matrix

    expected_result = {
        'type': 'copulas.multivariate.vine.VineCopula',
        'fitted': True,
        'vine_type': 'regular',
        'n_sample': 100,
        'n_var': 10,
        'depth': 3,
        'truncated': 3,
        'trees': [{
            'type': 'copulas.multivariate.tree.RegularTree',
            'tree_type': 'regular',
            'fitted': False
        }],
        'tau_mat': [[0, 1], [1, 0]],
        'u_matrix': [[0, 1], [1, 0]],
        'unis': [{
            'type': 'copulas.univariate.gaussian_kde.GaussianKDE',
            'fitted': False,
        }]
    }

    # Run
    result = instance.to_dict()

    # Check
    assert result == expected_result
def test_pdf(self):
    model = GaussianKDE()
    model.fit(self.data)

    sampled_data = model.sample(50)

    # Test PDF
    pdf = model.probability_density(sampled_data)
    assert (0 < pdf).all()
def test_cdf(self):
    model = GaussianKDE()
    model.fit(self.data)

    sampled_data = model.sample(50)

    # Test the CDF
    cdf = model.cumulative_distribution(sampled_data)
    assert (0 < cdf).all() and (cdf < 1).all()

    # Test that the CDF is a non-decreasing function
    sorted_data = sorted(sampled_data)
    cdf = model.cumulative_distribution(sorted_data)
    assert (np.diff(cdf) >= 0).all()
def test_fit_sample_constant(self):
    model = GaussianKDE()
    model.fit(self.constant)

    sampled_data = model.sample(50)

    assert isinstance(sampled_data, np.ndarray)
    assert sampled_data.shape == (50, )

    assert model._constant_value == 5
    np.testing.assert_equal(np.full(50, 5), model.sample(50))
def test_get_parameters_non_parametric(self):
    """Test the ``get_parameters`` method when the model is not parametric.

    If at least one distribution in the model is not parametric, a
    NonParametricError should be raised.

    Setup:
        - ``self._model`` is set to a ``GaussianMultivariate`` that uses
          ``GaussianKDE`` as its ``distribution``.

    Side Effects:
        - A NonParametricError is raised.
    """
    # Setup
    gm = GaussianMultivariate(distribution=GaussianKDE())
    data = pd.DataFrame([1, 1, 1])
    gm.fit(data)
    gc = Mock()
    gc._model = gm

    # Run, Assert
    with pytest.raises(NonParametricError):
        GaussianCopula.get_parameters(gc)
def _gaussian(self, dataset):
    """Run "everything but the kitchen sink" on the given dataset.

    This exercises every officially supported feature of GaussianMultivariate
    and makes sure none of it crashes.
    """
    model = GaussianMultivariate({
        dataset.columns[0]: GaussianKDE()  # Use a KDE for the first column
    })
    model.fit(dataset)

    for N in [10, 100, 50]:
        assert len(model.sample(N)) == N

    sampled_data = model.sample(10)
    pdf = model.probability_density(sampled_data)
    cdf = model.cumulative_distribution(sampled_data)

    # Test Save/Load from Dictionary
    config = model.to_dict()
    model2 = GaussianMultivariate.from_dict(config)

    for N in [10, 100, 50]:
        assert len(model2.sample(N)) == N

    pdf2 = model2.probability_density(sampled_data)
    cdf2 = model2.cumulative_distribution(sampled_data)
    assert np.all(np.isclose(pdf, pdf2, atol=0.01))
    assert np.all(np.isclose(cdf, cdf2, atol=0.01))

    # Test Save/Load from disk
    path_to_model = os.path.join(self.test_dir.name, "model.pkl")
    model.save(path_to_model)
    model2 = GaussianMultivariate.load(path_to_model)

    for N in [10, 100, 50]:
        assert len(model2.sample(N)) == N

    pdf2 = model2.probability_density(sampled_data)
    cdf2 = model2.cumulative_distribution(sampled_data)
    assert np.all(np.isclose(pdf, pdf2, atol=0.01))
    assert np.all(np.isclose(cdf, cdf2, atol=0.01))