def test_productkde_bandwidth():
    """Check automatic bandwidth selection and manual bandwidth assignment.

    For each variable set and training size, the bandwidth produced by the
    default selector and by ``pbn.ScottsBandwidth`` is compared with the
    Python reference helpers, first on the double-precision data and then on
    the single-precision data (with a looser absolute tolerance). Finally,
    the ``bandwidth`` attribute is overwritten on a fitted model and read
    back.
    """
    def check_rule(make_cpd, reference_rule, variables, instances, message):
        # Fit on float64 data and compare with the default tolerance.
        cpd = make_cpd(variables)
        cpd.fit(df.iloc[:instances])
        assert np.all(np.isclose(cpd.bandwidth,
                                 reference_rule(df[:instances], variables))), message

        # Refit the same object on float32 data; the reference is still
        # computed from the float64 frame, so a looser tolerance is used.
        cpd.fit(df_float.iloc[:instances])
        assert np.all(np.isclose(cpd.bandwidth,
                                 reference_rule(df[:instances], variables),
                                 atol=0.0005)), message

    # NOTE: only the three- and four-variable sets are exercised here; the
    # one- and two-variable sets were disabled in the original loop.
    variable_sets = [['c', 'a', 'b'], ['d', 'a', 'b', 'c']]
    training_sizes = [50, 150, 500]

    for variables in variable_sets:
        for instances in training_sizes:
            check_rule(lambda v: pbn.ProductKDE(v),
                       py_nr_bandwidth, variables, instances,
                       "Wrong bandwidth computed with normal reference rule.")
            check_rule(lambda v: pbn.ProductKDE(v, pbn.ScottsBandwidth()),
                       py_scott_bandwidth, variables, instances,
                       "Wrong bandwidth computed with Scott's rule.")

    # The bandwidth must be writable after fitting, for both data types.
    cpd = pbn.ProductKDE(['a'])
    for training in (df, df_float):
        cpd.fit(training)
        cpd.bandwidth = [1]
        assert cpd.bandwidth == np.asarray([1]), "Could not change bandwidth."
def test_productkde_slogl():
    """Compare ``ProductKDE.slogl`` with a scipy ``gaussian_kde`` reference.

    The scipy estimator is forced to use a diagonal covariance built from the
    fitted ``ProductKDE`` bandwidth, and the sum of its log-densities on the
    test data must match ``slogl``. The test also checks that the order in
    which the variables are given does not change the result.
    """
    def check_against_scipy(variables, _df, _test_df):
        cpd = pbn.ProductKDE(variables)
        cpd.fit(_df)

        train_values = _df.loc[:, variables].to_numpy()
        reference = gaussian_kde(train_values.T)

        # Replace scipy's own covariance by the diagonal one implied by the
        # fitted bandwidth vector.
        bandwidth = cpd.bandwidth
        reference.covariance = np.diag(bandwidth)
        reference.inv_cov = np.diag(1. / bandwidth)
        reference.log_det = (bandwidth.shape[0] * np.log(2*np.pi)
                             + np.log(bandwidth).sum())

        test_values = _test_df.loc[:, variables].to_numpy()
        expected = reference.logpdf(test_values.T).sum()
        assert np.all(np.isclose(cpd.slogl(_test_df), expected))

    test_df = util_test.generate_normal_data(50, seed=1)
    test_df_float = test_df.astype('float32')

    for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]:
        check_against_scipy(variables, df, test_df)
        check_against_scipy(variables, df_float, test_df_float)

    # Permuting the variables must not change the summed log-likelihood.
    # float64 uses the default tolerance, float32 a looser one.
    precision_cases = (
        (df, test_df, {}),
        (df_float, test_df_float, {'atol': 0.0005}),
    )
    for training, testing, tolerance in precision_cases:
        cpd = pbn.ProductKDE(['d', 'a', 'b', 'c'])
        cpd.fit(training)
        cpd2 = pbn.ProductKDE(['a', 'c', 'd', 'b'])
        cpd2.fit(training)
        assert np.all(np.isclose(cpd.slogl(testing), cpd2.slogl(testing),
                                 **tolerance)), "Order of evidence changes slogl() result."
def test_productkde_logl_null():
    """Compare ``ProductKDE.logl`` with scipy on test data containing NaNs.

    Ten randomly chosen row positions per column (seeded) are set to NaN in
    the test data. Per-row log-likelihoods must match a scipy ``gaussian_kde``
    configured with the fitted bandwidth, with NaN results treated as equal,
    and must not depend on the order of the variables.
    """
    def check_against_scipy(variables, _df, _test_df):
        cpd = pbn.ProductKDE(variables)
        cpd.fit(_df)
        logl = cpd.logl(_test_df)

        train_values = _df.loc[:, variables].to_numpy()
        reference = gaussian_kde(train_values.T)

        # Force the scipy estimator to use the diagonal covariance implied
        # by the fitted bandwidth vector.
        bandwidth = cpd.bandwidth
        reference.covariance = np.diag(bandwidth)
        reference.inv_cov = np.diag(1. / bandwidth)
        reference.log_det = (bandwidth.shape[0] * np.log(2*np.pi)
                             + np.log(bandwidth).sum())

        test_values = _test_df.loc[:, variables].to_numpy()
        scipy = reference.logpdf(test_values.T)

        # Single-precision training data gets a looser absolute tolerance.
        tolerance = {'atol': 0.0005} if train_values.dtype == "float32" else {}
        assert np.all(np.isclose(logl, scipy, equal_nan=True, **tolerance))

    TEST_SIZE = 50

    test_df = util_test.generate_normal_data(TEST_SIZE, seed=1)
    test_df_float = test_df.astype('float32')

    # Draw the null positions in the fixed order a, b, c, d so the random
    # stream is consumed exactly as the seed dictates.
    np.random.seed(0)
    null_rows = {column: np.random.randint(0, TEST_SIZE, size=10)
                 for column in ('a', 'b', 'c', 'd')}

    def with_nulls(frame):
        # Return a copy of ``frame`` with the drawn positions set to NaN.
        holed = frame.copy()
        for column, rows in null_rows.items():
            holed.loc[holed.index[rows], column] = np.nan
        return holed

    df_null = with_nulls(test_df)
    df_null_float = with_nulls(test_df_float)

    for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]:
        check_against_scipy(variables, df, df_null)
        check_against_scipy(variables, df_float, df_null_float)

    # Permuting the variables must not change the per-row log-likelihood.
    precision_cases = (
        (df, df_null, {}),
        (df_float, df_null_float, {'atol': 0.0005}),
    )
    for training, testing, tolerance in precision_cases:
        cpd = pbn.ProductKDE(['d', 'a', 'b', 'c'])
        cpd.fit(training)
        cpd2 = pbn.ProductKDE(['a', 'c', 'd', 'b'])
        cpd2.fit(training)
        assert np.all(np.isclose(cpd.logl(testing), cpd2.logl(testing),
                                 equal_nan=True, **tolerance)), "Order of evidence changes logl() result."
def test_productkde_new_bandwidth():
    """Check that ``ProductKDE`` uses a custom bandwidth selector.

    ``UnitaryBandwidth`` is defined elsewhere in this file; presumably it
    always yields a bandwidth of ones (TODO confirm). The resulting bandwidth
    is checked for one and for four variables, on both data precisions.
    """
    datasets = (df, df_float)

    # Single variable: the bandwidth is a length-1 vector.
    kde = pbn.ProductKDE(["a"], UnitaryBandwidth())
    for training in datasets:
        kde.fit(training)
        assert kde.bandwidth == np.ones((1,))

    # Four variables: every component must be one.
    kde = pbn.ProductKDE(["a", "b", "c", "d"], UnitaryBandwidth())
    for training in datasets:
        kde.fit(training)
        assert np.all(kde.bandwidth == np.ones((4,)))
def test_productkde_data_type():
    """Check ``ProductKDE.data_type`` before and after fitting.

    An unfitted model must raise ``ValueError``; after fitting, the reported
    Arrow type must follow the precision of the training data.
    """
    k = pbn.ProductKDE(["a"])

    with pytest.raises(ValueError) as ex:
        k.data_type()
    # The message check has to be an ``assert`` placed after the ``with``
    # block: a bare ``in`` expression is a no-op, and any statement following
    # the raising call inside the block would never run.
    assert "KDE factor not fitted" in str(ex.value)

    k.fit(df)
    assert k.data_type() == pa.float64()

    k.fit(df_float)
    assert k.data_type() == pa.float32()
def _test_productkde_slogl_null_iter(variables, _df, _test_df):
    """Check ``slogl`` on test data with NaNs against a scipy reference.

    Fits a ``ProductKDE`` over ``variables`` on ``_df``, configures a scipy
    ``gaussian_kde`` with the fitted bandwidth as a diagonal covariance, and
    asserts that ``slogl(_test_df)`` equals the NaN-ignoring sum of the scipy
    log-densities.
    """
    cpd = pbn.ProductKDE(variables)
    cpd.fit(_df)

    train_values = _df.loc[:, variables].to_numpy()
    reference = gaussian_kde(train_values.T)

    # Replace scipy's covariance with the diagonal one implied by the fitted
    # bandwidth vector.
    bandwidth = cpd.bandwidth
    reference.covariance = np.diag(bandwidth)
    reference.inv_cov = np.diag(1. / bandwidth)
    reference.log_det = (bandwidth.shape[0] * np.log(2*np.pi)
                         + np.log(bandwidth).sum())

    test_values = _test_df.loc[:, variables].to_numpy()
    # nansum skips the NaN log-densities when building the expected total.
    expected = np.nansum(reference.logpdf(test_values.T))
    assert np.all(np.isclose(cpd.slogl(_test_df), expected))
def _test_productkde_fit_iter(variables, _df, instances):
    """Fit a ``ProductKDE`` on the first ``instances`` rows and check it.

    Verifies the fitted flag, the number of training instances and variables,
    and the bandwidth against ``py_nr_bandwidth``. An absolute tolerance of
    0.0005 is used when every column of ``_df`` is float32.
    """
    training = _df.iloc[:instances, :]

    cpd = pbn.ProductKDE(variables)
    assert not cpd.fitted()
    cpd.fit(training)
    assert cpd.fitted()

    assert instances == cpd.num_instances(), "Wrong number of training instances."
    assert len(variables) == cpd.num_variables(), "Wrong number of training variables."

    # Single-precision data gets a looser tolerance; otherwise use the
    # np.isclose defaults.
    tolerance = {'atol': 0.0005} if np.all(_df.dtypes == 'float32') else {}
    expected = py_nr_bandwidth(training, variables)
    assert np.all(np.isclose(cpd.bandwidth, expected, **tolerance)), "Wrong bandwidth."
def test_check_type():
    """Check that scoring data of a different precision raises ``ValueError``.

    The model is fitted on one precision and then asked for ``logl`` and
    ``slogl`` on the other; both directions are covered.
    """
    mismatch_message = "Data type of training and test datasets is different."

    cpd = pbn.ProductKDE(['a'])
    # (training data, mismatching test data) for each direction.
    for training, testing in ((df, df_float), (df_float, df)):
        cpd.fit(training)
        for score in (cpd.logl, cpd.slogl):
            with pytest.raises(ValueError) as ex:
                score(testing)
            assert mismatch_message in str(ex.value)
def _test_productkde_fit_null_iter(variables, _df, instances):
    """Fit a ``ProductKDE`` on data with NaNs and check the fitted state.

    The expected number of training instances is the count of rows, among the
    first ``instances``, with no NaN in any of ``variables``. The bandwidth is
    compared with ``py_nr_bandwidth`` computed on those complete rows, using an
    absolute tolerance of 0.0005 when every column of ``_df`` is float32.
    """
    cpd = pbn.ProductKDE(variables)
    assert not cpd.fitted()
    cpd.fit(_df.iloc[:instances, :])
    assert cpd.fitted()

    # Find the rows of the training slice that have no missing value.
    head_values = _df.loc[:, variables].to_numpy()[:instances, :]
    complete_rows = ~np.isnan(head_values).any(axis=1)
    complete_positions = np.where(complete_rows)[0]

    assert complete_rows.sum() == cpd.num_instances(), "Wrong number of training instances with null values."
    assert len(variables) == cpd.num_variables(), "Wrong number of training variables with null values."

    tolerance = {'atol': 0.0005} if np.all(_df.dtypes == 'float32') else {}
    expected = py_nr_bandwidth(_df.iloc[complete_positions, :], variables)
    assert np.all(np.isclose(cpd.bandwidth, expected, **tolerance)), "Wrong bandwidth with null values."
def _test_productkde_logl_null_iter(variables, _df, _test_df):
    """Check per-row ``logl`` on test data with NaNs against scipy.

    Fits a ``ProductKDE`` over ``variables`` on ``_df`` and compares
    ``logl(_test_df)`` element-wise with a scipy ``gaussian_kde`` configured
    with the fitted bandwidth as a diagonal covariance. NaN results are
    treated as equal; float32 training data uses an absolute tolerance of
    0.0005.
    """
    cpd = pbn.ProductKDE(variables)
    cpd.fit(_df)
    logl = cpd.logl(_test_df)

    train_values = _df.loc[:, variables].to_numpy()
    reference = gaussian_kde(train_values.T)

    # Replace scipy's covariance with the diagonal one implied by the fitted
    # bandwidth vector.
    bandwidth = cpd.bandwidth
    reference.covariance = np.diag(bandwidth)
    reference.inv_cov = np.diag(1. / bandwidth)
    reference.log_det = (bandwidth.shape[0] * np.log(2*np.pi)
                         + np.log(bandwidth).sum())

    test_values = _test_df.loc[:, variables].to_numpy()
    scipy = reference.logpdf(test_values.T)

    tolerance = {'atol': 0.0005} if train_values.dtype == "float32" else {}
    assert np.all(np.isclose(logl, scipy, equal_nan=True, **tolerance))
def test_productkde_variables():
    """Check that ``variables()`` returns the list given at construction."""
    variable_sets = (
        ['a'],
        ['b', 'a'],
        ['c', 'a', 'b'],
        ['d', 'a', 'b', 'c'],
    )
    for variables in variable_sets:
        assert pbn.ProductKDE(variables).variables() == variables