def test_maximum_for_categorical_model(self):
    """Check ``_maximum()`` on a model fitted to purely categorical data.

    In the fixture 'foo' is the most frequent value of column B (4 of 7
    rows) and 'hey' the most frequent value of column C (6 of 7 rows);
    ('foo', 'hey') is also the most frequent row. The model is expected
    to report ``['foo', 'hey']``.
    """
    data = pd.DataFrame(
        {
            'B': np.array(['foo', 'bar', 'foo', 'foo', 'bar', 'foo', 'bar']),
            'C': np.array(['hey', 'hey', 'hey', 'hey', 'hey', 'hey', 'ho']),
        },
        columns=['B', 'C'],
    )
    kde_model = KDEModel('kde_model')
    kde_model.fit(data)
    # assertEqual passes/fails on the same ``==`` comparison as before, but
    # on failure it reports both values instead of "False is not true".
    self.assertEqual(
        kde_model._maximum(),
        ['foo', 'hey'],
        'maximum was not correctly calculated',
    )
def test_mixed_categorical_numerical_model(self):
    """Check the density of a model fitted to one numeric and one categorical column.

    The density at the point ``[1, 'foo']`` is expected to equal 0.1 when
    rounded to two decimal places.
    """
    numeric_col = np.array([1, 2, 3, 2, 3, 4, 5])
    label_col = np.array(['foo', 'bar', 'foo', 'foo', 'bar', 'foo', 'bar'])
    frame = pd.DataFrame({'A': numeric_col, 'B': label_col}, columns=['A', 'B'])

    model = KDEModel('kde_model')
    model.fit(frame)

    observed = model._density([1, 'foo'])
    self.assertAlmostEqual(
        observed,
        0.1,
        places=2,
        msg='density is not calculated correctly',
    )
def test_maximum(self):
    """Check ``_maximum()`` on a model fitted to two numeric columns.

    The value 3 is the most frequent entry of both columns in the
    fixture, and the model's maximum is expected to be ``[3., 3.]``.
    Coordinates are compared one by one with ``assertAlmostEqual`` at
    its default precision.
    """
    col_b = np.array([0, 2, 2, 3, 3, 3, 4, 4, 6])
    col_a = np.array([1, 2, 3, 3, 3, 3, 3, 4, 5])
    frame = pd.DataFrame({'B': col_b, 'A': col_a}, columns=['B', 'A'])
    expected = np.array([3., 3.])

    model = KDEModel('kde_model')
    model.fit(frame)
    found = model._maximum()

    # One comparison per model field, indexed by field position.
    field_count = len(model.fields)
    for position in range(field_count):
        self.assertAlmostEqual(found[position], expected[position])
def test_kde_storage(self):
    """Check that evaluating densities leaves an entry in ``kde_model.kde``.

    After two density evaluations that share the categorical values
    'foo' and 'hey', the entry stored under the string key
    ``"['foo', 'hey']"`` must be truthy.
    """
    frame = pd.DataFrame(
        {
            'A': np.array([1, 2, 3, 2, 3, 4, 5]),
            'B': np.array(['foo', 'bar', 'foo', 'foo', 'bar', 'foo', 'bar']),
            'C': np.array(['hey', 'hey', 'hey', 'hey', 'hey', 'hey', 'ho']),
        },
        columns=['A', 'B', 'C'],
    )
    model = KDEModel('kde_model')
    model.fit(frame)

    # NOTE(review): the points list the categorical values first although
    # the frame's columns are A, B, C -- presumably the model reorders its
    # fields; confirm against KDEModel.
    for numeric_value in (2, 3):
        model._density(['foo', 'hey', numeric_value])

    storage_key = "['foo', 'hey']"
    self.assertTrue(model.kde[storage_key], 'kde storage does not work properly')
def test_conditionout(self):
    """Check that ``_conditionout`` restricts the model data to the domain of A.

    First verifies that the fitted model's data (sorted by A, index
    reset) equals the input frame. Then narrows the domain of the field
    at index 1 to [2, 4], calls ``_conditionout(keep='B', remove='A')``
    and expects the model data to contain only the rows with A in 2..4.
    """
    column_order = ['B', 'A']
    frame = pd.DataFrame(
        {
            'B': np.array([2, 4, 7, 7, 7, 4, 1]),
            'A': np.array([1, 2, 3, 3, 3, 4, 5]),
        },
        columns=column_order,
    )
    model = KDEModel('kde_model')
    model.fit(frame)

    stored = model.data.sort_values(by='A').reset_index(drop=True)
    self.assertTrue(
        stored.equals(frame),
        "input data was not passed properly to the model",
    )

    # Restrict the domain of dimension A (the field at index 1).
    domain_of_a = model.fields[1]['domain']
    domain_of_a.setlowerbound(2)
    domain_of_a.setupperbound(4)

    # Condition on A and marginalize it out, keeping B.
    model._conditionout(keep='B', remove='A')

    # Reference data: the input rows whose A value lies in [2, 4].
    expected = pd.DataFrame(
        {
            'B': np.array([4, 7, 7, 7, 4]),
            'A': np.array([2, 3, 3, 3, 4]),
        },
        columns=column_order,
    )
    remaining = model.data.sort_values(by='A').reset_index(drop=True)
    self.assertTrue(
        remaining.equals(expected),
        "model data was not marginalized and conditioned properly",
    )
def test_predict(self):
    """Check a prediction obtained by marginalizing and conditioning.

    A four-column numeric model is reduced to columns C and D, the
    domain of C is bounded to [2, 3], and the model is then reduced to
    D alone. The first coordinate of ``_maximum()`` is expected to equal
    3.0 to two decimal places.
    """
    columns = {
        'A': np.array([1, 2, 3, 2, 3, 4, 5]),
        'B': np.array([1, 2, 2, 3, 3, 4, 5]),
        'C': np.array([1, 2, 3, 3, 3, 4, 5]),
        'D': np.array([0, 3, 3, 4, 4, 6, 7]),
    }
    frame = pd.DataFrame(columns, columns=['A', 'B', 'C', 'D'])
    model = KDEModel('kde_model')
    model.fit(frame)

    # Marginalize out everything except two dimensions.
    model.marginalize(keep=['C', 'D'])

    # Condition out C: bound its domain, then keep only D.
    model.byname('C')['domain'].setupperbound(3)
    model.byname('C')['domain'].setlowerbound(2)
    model.marginalize(keep=['D'])

    # Point of maximum probability density for the remaining dimension.
    predicted = model._maximum()[0]
    self.assertAlmostEqual(
        predicted,
        3.0,
        places=2,
        msg='prediction is not correct',
    )