def test_DNF(self):
    """Check DNF construction, equality and the rows each DNF selects.

    The same expression is built through the constructor, ``append_or`` and
    ``append_and``; the variants must compare equal. Afterwards the rows
    selected by several DNFs are verified with ``self.check_dataframe_query``.
    """
    A1 = ps.EqualitySelector("A1", 1)
    A2 = ps.EqualitySelector("A2", 1, "AA")
    B1 = ps.EqualitySelector("B1", 1)
    B2 = ps.EqualitySelector("B2", "1")

    # Constructor with a list vs. append_or with a list.
    dnf1 = ps.DNF()
    dnf1.append_or([A1, A2])
    dnf2 = ps.DNF([A1, A2])
    # assertEqual (instead of assertTrue(a == b)) reports both operands on failure.
    self.assertEqual(dnf1, dnf2)

    # Conjunction passed to the constructor vs. append_and (list / one by one).
    dnf3 = ps.DNF(ps.Conjunction([A1, A2]))
    dnf4 = ps.DNF()
    dnf4.append_and([A1, A2])
    dnf5 = ps.DNF()
    dnf5.append_and(A1)
    dnf5.append_and(A2)
    self.assertEqual(dnf3, dnf4)
    self.assertEqual(dnf4, dnf5)

    dnf6 = ps.DNF([])
    dnf6.append_and([B1, B2])
    dnf7 = ps.DNF([])
    dnf7.append_and([A1, A2])
    dnf7.append_or(ps.Conjunction([B1, B2]))

    self.df = pd.DataFrame.from_dict({  # pylint: disable=attribute-defined-outside-init
        "A1": [1, 1, 1, 2, 2, 2, 2, 0, 0, 0],
        "A2": [0, 1, 1, 1, 2, 2, 2, 0, 0, 0],
        "B1": [0, 0, 0, 0, 1, 1, 1, 0, 1, 1],
        "B2": ["0", "0", "0", "0", "1", "1", "2", "0", "0", "1"]
    })

    # Expected values are 0/1 flags per row of self.df.
    self.check_dataframe_query(dnf1, [1, 1, 1, 1, 0, 0, 0, 0, 0, 0])
    self.check_dataframe_query(dnf3, [0, 1, 1, 0, 0, 0, 0, 0, 0, 0])
    self.check_dataframe_query(dnf6, [0, 0, 0, 0, 1, 1, 0, 0, 0, 1])
    self.check_dataframe_query(dnf7, [0, 1, 1, 0, 1, 1, 0, 0, 0, 1])
def setUp(self):
    """Store the expected subgroups/qualities and build the credit-data task."""
    checking = ps.EqualitySelector("checking_status", b"<0")
    foreign_worker = ps.EqualitySelector("foreign_worker", b"yes")
    other_parties = ps.EqualitySelector("other_parties", b"none")
    savings_status = ps.EqualitySelector("savings_status", b"<100")
    job = ps.EqualitySelector("job", b"skilled")

    # One entry per expected result, in the same order as self.qualities.
    expected_selectors = [
        [checking, foreign_worker],
        [checking],
        [checking, other_parties, foreign_worker],
        [checking, other_parties],
        [checking, savings_status, foreign_worker],
        [checking, savings_status],
        [checking, savings_status, other_parties, foreign_worker],
        [checking, job, foreign_worker],
        [checking, savings_status, other_parties],
        [checking, job],
    ]
    self.result = [ps.Conjunction(selectors) for selectors in expected_selectors]
    self.qualities = [
        0.055299999999999995,
        0.05280000000000001,
        0.052300000000000006,
        0.05059999999999999,
        0.04959999999999999,
        0.048299999999999996,
        0.04660000000000001,
        0.04550000000000001,
        0.0452,
        0.044399999999999995,
    ]

    data = get_credit_data()
    target = ps.BinaryTarget('class', b'bad')
    # The target column itself is excluded from the search space.
    search_space = ps.create_nominal_selectors(data, ignore=['class'])
    self.task = ps.SubgroupDiscoveryTask(
        data,
        target,
        search_space,
        result_set_size=10,
        depth=5,
        qf=ps.StandardQF(1.0),
    )
def test_str_representations(self):
    """Verify the ``str()`` and ``repr()`` output of several selectors."""
    # (selector, expected str(), expected repr())
    cases = [
        (ps.EqualitySelector("A", 1), "A==1", "A==1"),
        (ps.EqualitySelector("BB", 2), "BB==2", "BB==2"),
        (ps.EqualitySelector("CCC", True), "CCC==True", "CCC==True"),
        (
            ps.NegatedSelector(ps.EqualitySelector("CCC", True)),
            "NOT CCC==True",
            "(not CCC==True)",
        ),
        (ps.IntervalSelector("test", 10, 15), "test: [10:15[", "test: [10:15["),
        (
            ps.IntervalSelector("test2", np.sqrt(2), np.sqrt(3)),
            "test2: [1.41:1.73[",
            "test2: [1.4142135623730951:1.7320508075688772[",
        ),
    ]
    for selector, expected_str, expected_repr in cases:
        self.assertEqual(str(selector), expected_str)
        self.assertEqual(repr(selector), expected_repr)
def test_equality_expressions(self):
    """Check ``==`` and ``hash`` consistency for Disjunction and Conjunction.

    Objects built from the same selectors (directly or by appending) must be
    equal and hash equally; a Conjunction and a Disjunction over the same
    selectors must differ in both.
    """
    A1 = ps.EqualitySelector("A", 1)
    A2 = ps.EqualitySelector("A", 2, "AA")
    B1 = ps.EqualitySelector("B", 1)

    D1 = ps.Disjunction([A1, A2])
    D1_clone = ps.Disjunction([A1, A2])
    # assertEqual (instead of assertTrue(a == b)) reports both operands on failure.
    self.assertEqual(D1, D1_clone)
    self.assertEqual(hash(D1), hash(D1_clone))

    D_all = ps.Disjunction([A1, A2, B1])
    D1_clone.append_or(B1)
    self.assertEqual(D_all, D1_clone)
    self.assertEqual(hash(D_all), hash(D1_clone))

    C1 = ps.Conjunction([A1, A2])
    C1_clone = ps.Conjunction([A1, A2])
    self.assertEqual(C1, C1_clone)
    self.assertEqual(hash(C1), hash(C1_clone))

    C_all = ps.Conjunction([A1, A2, B1])
    C1_clone.append_and(B1)
    self.assertEqual(C_all, C1_clone)
    self.assertEqual(hash(C_all), hash(C1_clone))

    # Kept as an explicit `==` check: assertNotEqual would exercise `!=`
    # (i.e. __ne__) rather than the __eq__ implementation tested here.
    self.assertFalse(C1 == D1)
    self.assertNotEqual(hash(C1), hash(D1))
def setUp(self):
    """Store expected results and build the task for the prediction target.

    Two sets of expectations are kept: ``DFSresult``/``DFSqualities`` (whose
    first entry is the literal ``True``) and ``result``/``qualities``.
    """
    payment = ps.EqualitySelector("other_payment_plans", b"none")
    foreign_worker = ps.EqualitySelector("foreign_worker", b"yes")
    other_parties = ps.EqualitySelector("other_parties", b"none")
    housing = ps.EqualitySelector("housing", b'own')
    class_good = ps.EqualitySelector("class", b"good")

    dfs_selectors = [
        [foreign_worker],
        [other_parties],
        [foreign_worker, other_parties],
        [payment],
        [foreign_worker, payment],
        [other_parties, payment],
        [housing],
        [class_good],
        [foreign_worker, other_parties, payment],
    ]
    self.DFSresult = [ps.Conjunction(selectors) for selectors in dfs_selectors]
    # The first expected entry is `True` rather than a Conjunction.
    self.DFSresult.insert(0, True)
    self.DFSqualities = [
        500.4980179286455,
        483.3153195123844,
        459.2862838915471,
        444.60343785358896,
        398.25539855072464,
        384.0460358056267,
        362.090608537693,
        355.0749649843413,
        355.010575658835,
        349.8188702669149,
    ]

    expected_selectors = [
        [foreign_worker],
        [other_parties],
        [foreign_worker, other_parties],
        [payment],
        [foreign_worker, payment],
        [other_parties, payment],
        [housing],
        [class_good],
        [foreign_worker, other_parties, payment],
        [foreign_worker, housing],
    ]
    self.result = [ps.Conjunction(selectors) for selectors in expected_selectors]
    self.qualities = [
        483.3153195123844,
        459.2862838915471,
        444.60343785358896,
        398.25539855072464,
        384.0460358056267,
        362.090608537693,
        355.0749649843413,
        355.010575658835,
        349.8188702669149,
        342.20780439530444,
    ]

    # Fixed seed; the two draws must stay in this order to be reproducible.
    np.random.seed(1111)
    self.target_variables = np.random.randint(low=0, high=2, size=1000)
    self.target_estimates = np.random.uniform(size=1000)

    data = get_credit_data()
    target = ps.PredictionTarget(
        self.target_variables, self.target_estimates, roc_auc_score
    )
    nominal_selectors = ps.create_nominal_selectors(data, ignore=['credit_amount'])
    # Numeric selectors are currently not part of the search space.
    numeric_selectors = []
    search_space = nominal_selectors + numeric_selectors
    self.task = ps.SubgroupDiscoveryTask(
        data,
        target,
        search_space,
        result_set_size=10,
        depth=5,
        qf=ps.CountCallsInterestingMeasure(ps.PredictionQFNumeric(1, False)),
    )
def test_create_selectors_with_nan(self):
    """Check that ``create_selectors`` yields selectors for NaN and 10.0."""
    frame = pd.DataFrame.from_dict({
        'A': np.array([np.nan, np.nan, np.nan]),
        'B': np.array([10, np.nan, np.nan]),
    })
    selectors = ps.create_selectors(frame)

    expected = (
        ps.EqualitySelector('A', np.nan),
        ps.EqualitySelector('B', np.nan),
        ps.EqualitySelector('B', 10.),
    )
    for selector in expected:
        assert selector in selectors
def prepare_df(self):
    """Create ``self.df`` and the selectors ``A1``, ``A0``, ``BA`` and ``BC``."""
    bool_column = np.array([0, 0, 1, 1, 0, 0, 1, 1, 1, 1], dtype=bool)
    label_column = np.array(["A", "B", "C", "C", "B", "A", "D", "A", "A", "A"])
    # Five repetitions of the pair (0, 1), flattened to ten values.
    alternating_column = np.array([[0, 1]] * 5).flatten()

    self.A1 = ps.EqualitySelector("columnA", True)
    self.A0 = ps.EqualitySelector("columnA", False)
    self.BA = ps.EqualitySelector("columnB", "A")
    self.BC = ps.EqualitySelector("columnB", "C")

    self.df = pd.DataFrame.from_dict({
        'columnA': bool_column,
        'columnB': label_column,
        'columnC': alternating_column,
    })
def test_EqualitySelector_ordering(self):
    """Check identity, ordering, equality and hashing of EqualitySelector."""
    A1 = ps.EqualitySelector("A", 1)
    A1_clone = ps.EqualitySelector("A", 1)
    A2 = ps.EqualitySelector("A", 2, "AA")
    B1 = ps.EqualitySelector("B", 1)
    # Constructing the same selector twice must return the very same object.
    self.assertIs(A1_clone, A1)

    B1_clone = ps.EqualitySelector("B", 1)
    # assertLess / assertEqual report both operands when the check fails,
    # unlike assertTrue(a < b) / assertTrue(a == b).
    self.assertLess(A1, B1)
    self.assertLess(A1, A2)
    self.assertLess(A2, B1)
    self.assertEqual(B1, B1_clone)
    self.assertEqual(hash(B1), hash(B1_clone))

    # Same checks for selectors on a bytes value.
    C1 = ps.EqualitySelector("checking_status", b"<0")
    C2 = ps.EqualitySelector("checking_status", b"<0")
    self.assertEqual(C1, C2)
    self.assertEqual(hash(C1), hash(C2))

    # list.index relies on ==, so each selector must match only itself.
    selectors = [A1, A2, B1]
    self.assertEqual(selectors.index(A1), 0)
    self.assertEqual(selectors.index(A2), 1)
    self.assertEqual(selectors.index(B1), 2)
def assert_class_ordering(self, cls):
    """Assert equality, hashing and ordering of objects built by ``cls``.

    Parameters
    ----------
    cls
        Callable that builds the object under test from a list of selectors.
    """
    A1 = ps.EqualitySelector("A", 1)
    A2 = ps.EqualitySelector("A", 2, "AA")
    B1 = ps.EqualitySelector("B", 1)

    SGD1 = cls([A1, A2])
    SGD1_clone = cls([A1, A2])
    SGD1_order = cls([A2, A1])
    # assertEqual reports both operands on failure, unlike assertTrue(a == b).
    self.assertEqual(SGD1, SGD1_clone)
    self.assertEqual(hash(SGD1), hash(SGD1_clone))
    # The order in which selectors are passed must not matter.
    self.assertEqual(SGD1, SGD1_order)
    self.assertEqual(hash(SGD1), hash(SGD1_order))

    SGD2 = cls([A1, A2, B1])
    SGD3 = cls([B1])
    self.assertGreater(SGD1, SGD2)
    self.assertLess(SGD2, SGD3)
def test_get_cover_array_and_size(self):
    """Check the size returned by ``get_cover_array_and_size`` per input kind."""
    no_checking = ps.EqualitySelector('checking_status', b'no checking')
    n_rows = len(self.data)

    # (positional arguments, expected size)
    cases = [
        ((no_checking, None, self.data), 394),
        ((slice(None), n_rows, None), n_rows),
        ((slice(0, 10), n_rows), 10),
        ((np.array([1, 3, 5, 7, 11], dtype=int),), 5),
    ]
    for args, expected_size in cases:
        _, size = ps.get_cover_array_and_size(*args)
        self.assertEqual(size, expected_size)
def test_nominal_selector_covers(self):
    """Check ``EqualitySelector.covers`` on bool, string and float/NaN columns."""
    bool_column = np.array([0, 0, 1, 1, 0, 0, 1, 1, 1, 1], dtype=bool)
    label_column = np.array(["A", "B", "C", "C", "B", "A", "D", "A", "A", "A"])
    float_column = np.array([np.nan, np.nan, 1.1, 1.1, 2, 2, 2, 2, 2, 2])

    is_true = ps.EqualitySelector("columnA", True)
    is_false = ps.EqualitySelector("columnA", False)
    label_a = ps.EqualitySelector("columnB", "A")
    label_c = ps.EqualitySelector("columnB", "C")
    value_1_1 = ps.EqualitySelector("columnC", 1.1)
    value_nan = ps.EqualitySelector("columnC", np.nan)

    frame = pd.DataFrame.from_dict({
        "columnA": bool_column,
        "columnB": label_column,
        "columnC": float_column,
    })

    # (selector, expected cover per row)
    expectations = [
        (is_true, bool_column),
        (is_false, np.logical_not(bool_column)),
        (label_a, [1, 0, 0, 0, 0, 1, 0, 1, 1, 1]),
        (label_c, [0, 0, 1, 1, 0, 0, 0, 0, 0, 0]),
        (value_1_1, [0, 0, 1, 1, 0, 0, 0, 0, 0, 0]),
        (value_nan, [1, 1, 0, 0, 0, 0, 0, 0, 0, 0]),
    ]
    for selector, expected in expectations:
        np.testing.assert_array_equal(selector.covers(frame), expected)
def setUp(self):
    """Store expected subgroups/qualities and build the titanic-data task.

    The hard-coded qualities are cross-checked against
    ``count_nonzero(cover) * depth`` recomputed from the data.
    """
    cabin_nan = ps.EqualitySelector("Cabin", np.nan)
    embarked_s = ps.EqualitySelector("Embarked", 'S')
    embarked_c = ps.EqualitySelector("Embarked", 'C')
    male = ps.EqualitySelector("Sex", 'male')
    female = ps.EqualitySelector("Sex", 'female')

    # One entry per expected result, in the same order as self.qualities.
    expected_selectors = [
        [cabin_nan, embarked_s],
        [cabin_nan, male],
        [embarked_s, male],
        [cabin_nan],
        [embarked_s],
        [male],
        [cabin_nan, female],
        [embarked_s, female],
        [female],
        [cabin_nan, embarked_c],
    ]
    self.result = [ps.Conjunction(selectors) for selectors in expected_selectors]
    self.qualities = [178, 164, 146, 125, 110, 100, 86, 74, 56, 46]

    data = get_titanic_data()
    recomputed = []
    for conjunction in self.result:
        covered = np.count_nonzero(conjunction.covers(data))
        recomputed.append(covered * conjunction.depth)
    self.qualities2 = recomputed
    self.assertEqual(self.qualities, self.qualities2)

    search_space = ps.create_nominal_selectors(data)
    self.task = ps.SubgroupDiscoveryTask(
        data,
        ps.FITarget,
        search_space,
        result_set_size=10,
        depth=2,
        qf=ps.AreaQF(),
    )
def setUp(self):
    """Store expected results and build the credit-data task with a constraint.

    The task is created with ``ps.MinSupportConstraint(200)``.
    """
    checking = ps.EqualitySelector("checking_status", b"<0")
    foreign_worker = ps.EqualitySelector("foreign_worker", b"yes")
    other_parties = ps.EqualitySelector("other_parties", b"none")
    savings_status = ps.EqualitySelector("savings_status", b"<100")
    payment_plans = ps.EqualitySelector("other_payment_plans", b"none")

    # One entry per expected result, in the same order as self.qualities.
    expected_selectors = [
        [checking, foreign_worker],
        [checking],
        [checking, other_parties, foreign_worker],
        [checking, other_parties],
        [checking, savings_status, foreign_worker],
        [checking, savings_status],
        [checking, foreign_worker, payment_plans],
        [checking, payment_plans],
        [foreign_worker, savings_status],
        [foreign_worker, other_parties, savings_status],
    ]
    self.result = [ps.Conjunction(selectors) for selectors in expected_selectors]
    self.qualities = [
        0.055299999999999995,
        0.05280000000000001,
        0.052300000000000006,
        0.05059999999999999,
        0.04959999999999999,
        0.048299999999999996,
        0.0426,
        0.04,
        0.03869999999999999,
        0.03750000000000001,
    ]

    data = get_credit_data()
    target = ps.BinaryTarget('class', b'bad')
    # The target column itself is excluded from the search space.
    search_space = ps.create_nominal_selectors(data, ignore=['class'])
    min_support = ps.MinSupportConstraint(200)
    self.task = ps.SubgroupDiscoveryTask(
        data,
        target,
        search_space,
        result_set_size=10,
        depth=5,
        qf=ps.StandardQF(1.0),
        constraints=[min_support],
    )
def test_CountQf(self):
    """Check ``CountQF.evaluate`` for a selector, slices and an index array."""
    task = ps.SubgroupDiscoveryTask(self.data, ps.FITarget, None, None)
    count_qf = ps.CountQF()
    count_qf.calculate_constant_statistics(task)

    no_checking = ps.EqualitySelector('checking_status', b'no checking')
    # Diagnostic output kept from the original test.
    print(self.data.columns)
    print(self.data.checking_status.value_counts())

    # The selector is evaluated with the data passed explicitly ...
    self.assertEqual(count_qf.evaluate(no_checking, self.data), 394)

    # ... whereas slices and index arrays are evaluated without it.
    cases = [
        (slice(None), len(self.data)),
        (slice(0, 10), 10),
        (np.array([1, 3, 5, 7, 11], dtype=int), 5),
    ]
    for subgroup, expected_size in cases:
        self.assertEqual(count_qf.evaluate(subgroup), expected_size)
def setUp(self):
    """Create the three fixture columns, their selectors and ``self.df``."""
    # Raw column values.
    self.A = np.array([0, 0, 1, 1, 0, 0, 1, 1, 1, 1], dtype=bool)
    self.B = np.array(["A", "B", "C", "C", "B", "A", "D", "A", "A", "A"])
    self.C = np.array([np.nan, np.nan, 1.1, 1.1, 2, 2, 2, 2, 2, 2])

    # Selectors on columnA (bool), columnB (str) and columnC (float / NaN).
    self.A1 = ps.EqualitySelector("columnA", True)
    self.A0 = ps.EqualitySelector("columnA", False)
    self.BA = ps.EqualitySelector("columnB", "A")
    self.BC = ps.EqualitySelector("columnB", "C")
    self.CA = ps.EqualitySelector("columnC", 1.1)
    self.CNan = ps.EqualitySelector("columnC", np.nan)

    columns = {"columnA": self.A, "columnB": self.B, "columnC": self.C}
    self.df = pd.DataFrame.from_dict(columns)
def setUp(self):
    """Store the 12 expected results and build the nominal + numeric task."""
    checking = ps.EqualitySelector("checking_status", b"<0")
    foreign_worker = ps.EqualitySelector("foreign_worker", b"yes")
    other_parties = ps.EqualitySelector("other_parties", b"none")
    savings_status = ps.EqualitySelector("savings_status", b"<100")
    job = ps.EqualitySelector("job", b"skilled")
    dependents = ps.EqualitySelector("num_dependents", 1.0)

    # One entry per expected result, in the same order as self.qualities.
    expected_selectors = [
        [checking, foreign_worker, job, other_parties, savings_status],
        [checking, foreign_worker, job, savings_status],
        [checking, foreign_worker, job],
        [checking, job, other_parties, savings_status],
        [checking, foreign_worker, job, other_parties],
        [checking, job, savings_status],
        [checking, foreign_worker, other_parties, savings_status],
        [checking, foreign_worker, other_parties],
        [checking, foreign_worker, savings_status],
        [checking, foreign_worker],
        [checking, foreign_worker, job, dependents, savings_status],
        [checking, job, other_parties],
    ]
    self.result = [ps.Conjunction(selectors) for selectors in expected_selectors]
    self.qualities = [
        0.11457431093955019,
        0.113713540226172,
        0.11201325679119281,
        0.1117538749727658,
        0.11161046793076415,
        0.11145710640046322,
        0.11045259291161472,
        0.10929088624672183,
        0.10875519439407161,
        0.10866138825404954,
        0.10832735026213287,
        0.10813405094128754,
    ]

    data = get_credit_data()
    target = ps.BinaryTarget('class', b'bad')
    # The target column is excluded from both selector families.
    nominal_selectors = ps.create_nominal_selectors(data, ignore=['class'])
    numeric_selectors = ps.create_numeric_selectors(data, ignore=['class'])
    search_space = nominal_selectors + numeric_selectors
    self.task = ps.SubgroupDiscoveryTask(
        data,
        target,
        search_space,
        result_set_size=12,
        depth=5,
        qf=ps.StandardQF(0.5),
    )
def setUp(self):
    """Store expected results and build the task for the numeric target."""
    telephone = ps.EqualitySelector("own_telephone", b"yes")
    foreign_worker = ps.EqualitySelector("foreign_worker", b"yes")
    other_parties = ps.EqualitySelector("other_parties", b"none")
    personal = ps.EqualitySelector("personal_status", b'male single')
    job = ps.EqualitySelector("job", b'high qualif/self emp/mgmt')
    class_bad = ps.EqualitySelector("class", b"bad")

    # One entry per expected result, in the same order as self.qualities.
    expected_selectors = [
        [telephone],
        [foreign_worker, telephone],
        [other_parties, telephone],
        [foreign_worker, telephone, personal],
        [telephone, personal],
        [foreign_worker, other_parties, telephone],
        [job],
        [class_bad, telephone],
        [foreign_worker, job],
        [foreign_worker, other_parties, telephone, personal],
    ]
    self.result = [ps.Conjunction(selectors) for selectors in expected_selectors]
    self.qualities = [
        383476.7679999999,
        361710.05800000014,
        345352.9920000001,
        338205.08,
        336857.8220000001,
        323586.28200000006,
        320306.81600000005,
        300963.84599999996,
        299447.332,
        297422.98200000013,
    ]

    data = get_credit_data()
    target = ps.NumericTarget('credit_amount')
    # The target column itself is excluded from the search space.
    nominal_selectors = ps.create_nominal_selectors(data, ignore=['credit_amount'])
    # Numeric selectors are currently not part of the search space.
    numeric_selectors = []
    search_space = nominal_selectors + numeric_selectors

    quality_function = ps.CountCallsInterestingMeasure(
        ps.StandardQFNumeric(1, False, 'sum')
    )
    self.task = ps.SubgroupDiscoveryTask(
        data,
        target,
        search_space,
        result_set_size=10,
        depth=5,
        qf=quality_function,
    )
def setUp(self):
    """Store the expected disjunctions/qualities and build the credit-data task."""
    checking_neg = ps.EqualitySelector("checking_status", b"<0")
    checking_low = ps.EqualitySelector("checking_status", b"0<=X<200")
    co_applicant = ps.EqualitySelector("other_parties", b"co applicant")
    purpose_other = ps.EqualitySelector("purpose", b'other')
    purpose_repairs = ps.EqualitySelector("purpose", b'repairs')
    purpose_business = ps.EqualitySelector("purpose", b'business')
    history_no_credits = ps.EqualitySelector("credit_history", b"no credits/all paid")
    history_all_paid = ps.EqualitySelector("credit_history", b"all paid")
    unemployed = ps.EqualitySelector("employment", b"unemployed")
    job_unskilled = ps.EqualitySelector("job", b"unemp/unskilled non res")
    bank_plan = ps.EqualitySelector("other_payment_plans", b"bank")

    # Every expected disjunction starts with the two checking_status
    # selectors; only the (optional) trailing selector differs.
    trailing_selectors = [
        [bank_plan],
        [history_no_credits],
        [],
        [purpose_other],
        [purpose_repairs],
        [unemployed],
        [co_applicant],
        [history_all_paid],
        [purpose_business],
        [job_unskilled],
    ]
    self.result = [
        ps.Disjunction([checking_neg, checking_low] + extra)
        for extra in trailing_selectors
    ]
    self.qualities = [
        0.0779,
        0.07740000000000002,
        0.0771,
        0.07680000000000001,
        0.07670000000000002,
        0.0767,
        0.07660000000000003,
        0.07650000000000003,
        0.07650000000000001,
        0.07600000000000001,
    ]

    data = get_credit_data()
    target = ps.BinaryTarget('class', b'bad')
    # The target column itself is excluded from the search space.
    search_space = ps.create_nominal_selectors(data, ignore=['class'])
    self.task = ps.SubgroupDiscoveryTask(
        data,
        target,
        search_space,
        result_set_size=10,
        depth=3,
        qf=ps.StandardQF(1.0),
    )
import pprint

import numpy as np
import pandas as pd

import pysubgroup as ps

pp = pprint.PrettyPrinter(indent=4)

# Five-row frame: a numeric "Target" column and a nominal column "A".
# The mixed int/str array stores every value as a string, which is why
# "Target" is converted back to a numeric dtype right afterwards.
data = pd.DataFrame(
    np.array([[1, 2, 3, 4, 5], ["F", "F", "F", "Tr", "Tr"]]).T,
    columns=["Target", "A"],
)
data["Target"] = pd.to_numeric(data["Target"])

target = ps.NumericTarget('Target')
print(data[target.target_variable])

# Subgroup description: rows where A == "Tr".
sgd = ps.EqualitySelector("A", "Tr")
target.calculate_statistics(sgd, data)

# Print the quality and the optimistic estimate of that subgroup.
qf = ps.StandardQFNumeric(1.0)
print(qf.evaluate(sgd, target, data))
print(qf.optimistic_estimate(sgd, target, data))
import unittest

import pysubgroup as ps
from pysubgroup.tests.DataSets import get_credit_data
from pysubgroup.tests.algorithms_testing import TestAlgorithmsBase

data = get_credit_data()
target = ps.NumericTarget('credit_amount')

# Subgroup description: rows where purpose == b"other".
sgd = ps.EqualitySelector("purpose", b"other")
stats = target.calculate_statistics(sgd, data)
print(stats)

qf = ps.StandardQFNumeric(1.0)
# Evaluate twice: first without, then with the precomputed statistics.
for _extra_args in ((), (stats,)):
    score = qf.evaluate(sgd, target, data, *_extra_args)
    print(score)