示例#1
0
def plot_roc(result_df, data, qf=ps.StandardQF(0.5), levels=40, annotate=False):
    """Plot subgroups as points in ROC space on top of a quality contour map.

    The background is ``qf.evaluate`` sampled on a 100 x 100 grid of
    (false positive rate, true positive rate) pairs in [0.01, 0.99]. Each row
    of ``result_df`` is then drawn as a black dot at its own rates.

    Args:
        result_df: DataFrame providing the columns ``positives_dataset``,
            ``positives_sg`` and ``size_sg`` (one row per subgroup).
        data: The dataset; only ``len(data)`` is used.
        qf: Quality function. Its ``evaluate`` is called with four positional
            arguments: dataset size, dataset positives, and the size and
            positives of the (virtual) subgroup at a grid point.
        levels: Forwarded to ``contourf`` as the contour levels argument.
        annotate: When true, each point is labelled with its ``result_df``
            index.

    Returns:
        The matplotlib figure that was created.
    """
    instances_dataset = len(data)
    positives_dataset = np.max(result_df['positives_dataset'])
    negatives_dataset = instances_dataset - positives_dataset

    # Grid over (FPR, TPR); the borders 0 and 1 are left out on purpose.
    rates = np.linspace(0.01, 0.99, 100)
    X, Y = np.meshgrid(rates, rates)

    # The builtin ``float`` replaces the ``np.float`` alias, which was
    # removed from NumPy.
    evaluate_grid = np.vectorize(
        partial(qf.evaluate, instances_dataset, positives_dataset),
        otypes=[float],
    )
    # A grid point (x, y) corresponds to a subgroup with y * positives true
    # positives and x * negatives false positives.
    Z = evaluate_grid(X * negatives_dataset + Y * positives_dataset,
                      Y * positives_dataset)

    # Symmetric colour limits so that zero maps to the centre of the
    # diverging colormap.
    max_val = np.max(np.abs(Z))

    fig, ax = plt.subplots()
    # ``plt.get_cmap`` replaces ``plt.cm.get_cmap``, which newer Matplotlib
    # releases no longer provide.
    colormap = plt.get_cmap("bwr")
    ax.contourf(X, Y, Z, levels, cmap=colormap, vmin=-max_val, vmax=max_val)

    label_margin = 0.01
    for i, sg in result_df.iterrows():
        rel_positives_sg = sg['positives_sg'] / positives_dataset
        rel_negatives_sg = (sg['size_sg'] - sg['positives_sg']) / negatives_dataset
        # Marker-only format string: the colour is given once, via the
        # keyword, so Matplotlib does not warn about a redundant definition.
        ax.plot(rel_negatives_sg, rel_positives_sg, 'o', color='black')
        if annotate:
            ax.annotate(str(i), (rel_negatives_sg + label_margin,
                                 rel_positives_sg + label_margin))

    ax.set_title('Discovered subgroups')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')

    return fig
    def test_simple(self):
        """Check statistics, repeated lookups and the score of ``self.ga_qf``.

        The description under test is the conjunction of ``self.A1`` and
        ``self.BA`` evaluated on ``self.df``.
        """

        def fresh_conjunction():
            # A new Conjunction object per call, so repeated lookups never
            # share one description instance.
            return ps.Conjunction([self.A1, self.BA])

        task = task_dummy(self.df, ps.BinaryTarget('columnC', 1))
        standard_qf = ps.StandardQF(0)
        standard_qf.calculate_constant_statistics(task)

        self.ga_qf.calculate_constant_statistics(task)

        first_stats = self.ga_qf.calculate_statistics(fresh_conjunction(),
                                                      self.df)
        self.assertEqual(first_stats.subgroup_stats,
                         ps.SimplePositivesQF.tpl(3, 2))
        self.assertEqual(first_stats.generalisation_stats,
                         ps.SimplePositivesQF.tpl(5, 3))

        # Ensure cache works properly
        second_stats = self.ga_qf.calculate_statistics(fresh_conjunction(),
                                                       self.df)
        self.assertEqual(first_stats, second_stats)

        # Evaluating twice must give the same score.
        first_score = self.ga_qf.evaluate(fresh_conjunction(), self.df)
        second_score = self.ga_qf.evaluate(fresh_conjunction(), self.df)

        self.assertEqual(first_score, second_score)
        self.assertAlmostEqual(first_score, 0.06666666666666)
示例#3
0
 def setUpClass(cls):
     """Build a credit-data task and run SimpleDFS once for the whole class.

     The task is stored on ``cls.task`` and the search output on
     ``cls.result``.
     """
     credit_data = get_credit_data()
     bad_class = ps.BinaryTarget('class', b'bad')
     nominal_selectors = ps.create_nominal_selectors(credit_data,
                                                     ignore=['class'])
     quality_function = ps.StandardQF(1.0)
     cls.task = ps.SubgroupDiscoveryTask(
         credit_data,
         bad_class,
         nominal_selectors,
         result_set_size=10,
         depth=5,
         qf=quality_function,
     )
     dfs = ps.SimpleDFS()
     cls.result = dfs.execute(cls.task)
 def setUp(self):
     """Store reference conjunctions, reference qualities and the task.

     ``self.result`` and ``self.qualities`` hold twelve entries each;
     ``self.task`` searches nominal plus numeric selectors of the credit
     data with ``StandardQF(1.0)``, depth 5 and a result set of 12.
     """
     equals = ps.EqualitySelector
     checking = equals("checking_status", b"<0")
     foreign_worker = equals("foreign_worker", b"yes")
     other_parties = equals("other_parties", b"none")
     savings_status = equals("savings_status", b"<100")
     job = equals("job", b"skilled")
     dependents = equals("num_dependents", 1.0)

     # One tuple per reference description, in the stored order.
     reference_descriptions = (
         (checking, foreign_worker),
         (checking,),
         (checking, other_parties, foreign_worker),
         (checking, other_parties),
         (checking, savings_status, foreign_worker),
         (checking, foreign_worker, dependents),
         (checking, savings_status),
         (checking, dependents),
         (checking, savings_status, other_parties, foreign_worker),
         (checking, job, foreign_worker),
         (checking, savings_status, other_parties),
         (checking, job),
     )
     self.result = [
         ps.Conjunction(list(selectors))
         for selectors in reference_descriptions
     ]
     self.qualities = [
         0.055299999999999995,
         0.05280000000000001,
         0.052300000000000006,
         0.05059999999999999,
         0.04959999999999999,
         0.04870000000000001,
         0.048299999999999996,
         0.0474,
         0.04660000000000001,
         0.04550000000000001,
         0.0452,
         0.044399999999999995,
     ]

     credit_data = get_credit_data()
     bad_class = ps.BinaryTarget('class', b'bad')
     nominal = ps.create_nominal_selectors(credit_data, ignore=['class'])
     numeric = ps.create_numeric_selectors(credit_data, ignore=['class'])
     all_selectors = nominal + numeric
     self.task = ps.SubgroupDiscoveryTask(
         credit_data,
         bad_class,
         all_selectors,
         result_set_size=12,
         depth=5,
         qf=ps.StandardQF(1.0),
     )
    def setUp(self):
        """Store reference disjunctions, reference qualities and the task.

        Every reference description contains the two ``checking_status``
        selectors, optionally followed by one more selector.
        ``self.task`` searches the nominal selectors of the credit data with
        ``StandardQF(1.0)``, depth 3 and a result set of 10.
        """
        equals = ps.EqualitySelector
        checking = equals("checking_status", b"<0")
        checking2 = equals("checking_status", b"0<=X<200")
        other_parties = equals("other_parties", b"co applicant")
        purpose_other = equals("purpose", b'other')
        purpose_repairs = equals("purpose", b'repairs')
        purpose_business = equals("purpose", b'business')

        history = equals("credit_history", b"no credits/all paid")
        history2 = equals("credit_history", b"all paid")
        employment = equals("employment", b"unemployed")
        job = equals("job", b"unemp/unskilled non res")
        bank = equals("other_payment_plans", b"bank")

        # The selector(s) appended after the shared checking_status pair,
        # in the stored order; the empty entry is the bare pair.
        third_selectors = (
            [bank],
            [history],
            [],
            [purpose_other],
            [purpose_repairs],
            [employment],
            [other_parties],
            [history2],
            [purpose_business],
            [job],
        )
        self.result = [
            ps.Disjunction([checking, checking2] + extra)
            for extra in third_selectors
        ]
        self.qualities = [
            0.0779,
            0.07740000000000002,
            0.0771,
            0.07680000000000001,
            0.07670000000000002,
            0.0767,
            0.07660000000000003,
            0.07650000000000003,
            0.07650000000000001,
            0.07600000000000001,
        ]

        credit_data = get_credit_data()
        bad_class = ps.BinaryTarget('class', b'bad')
        nominal = ps.create_nominal_selectors(credit_data, ignore=['class'])
        self.task = ps.SubgroupDiscoveryTask(
            credit_data,
            bad_class,
            nominal,
            result_set_size=10,
            depth=3,
            qf=ps.StandardQF(1.0),
        )
 def setUp(self):
     """Store reference conjunctions, reference qualities and the task.

     ``self.task`` searches the nominal selectors of the credit data with
     ``StandardQF(1.0)``, depth 5, a result set of 10 and a
     ``MinSupportConstraint(200)``.
     """
     equals = ps.EqualitySelector
     checking = equals("checking_status", b"<0")
     foreign_worker = equals("foreign_worker", b"yes")
     other_parties = equals("other_parties", b"none")
     savings_status = equals("savings_status", b"<100")
     payment_plans = equals("other_payment_plans", b"none")

     # One tuple per reference description, in the stored order.
     reference_descriptions = (
         (checking, foreign_worker),
         (checking,),
         (checking, other_parties, foreign_worker),
         (checking, other_parties),
         (checking, savings_status, foreign_worker),
         (checking, savings_status),
         (checking, foreign_worker, payment_plans),
         (checking, payment_plans),
         (foreign_worker, savings_status),
         (foreign_worker, other_parties, savings_status),
     )
     self.result = [
         ps.Conjunction(list(selectors))
         for selectors in reference_descriptions
     ]
     self.qualities = [
         0.055299999999999995,
         0.05280000000000001,
         0.052300000000000006,
         0.05059999999999999,
         0.04959999999999999,
         0.048299999999999996,
         0.0426,
         0.04,
         0.03869999999999999,
         0.03750000000000001,
     ]

     credit_data = get_credit_data()
     bad_class = ps.BinaryTarget('class', b'bad')
     nominal = ps.create_nominal_selectors(credit_data, ignore=['class'])
     quality_function = ps.StandardQF(1.0)
     support_constraints = [ps.MinSupportConstraint(200)]
     self.task = ps.SubgroupDiscoveryTask(
         credit_data,
         bad_class,
         nominal,
         result_set_size=10,
         depth=5,
         qf=quality_function,
         constraints=support_constraints,
     )
示例#7
0
import pandas as pd
import pysubgroup as ps


# Load the Titanic table and look for subgroups of the target Survived == 0.
data = pd.read_table("../data/titanic.csv")
target = ps.BinaryTarget('Survived', 0)
search_space = ps.create_selectors(data, ignore=['Survived'])

# Both measures are wrapped into a single combined quality function.
combined_qf = ps.CombinedInterestingnessMeasure([
    ps.StandardQF(1),
    ps.GeneralizationAware_StandardQF(1),
])
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    search_space,
    result_set_size=5,
    depth=2,
    qf=combined_qf,
)

# Depth-first search with optimistic estimates switched off.
result = ps.SimpleDFS().execute(task, use_optimistic_estimates=False)

print(result.to_dataframe())
示例#8
0
from scipy.io import arff

import pysubgroup as ps
import pandas as pd

# loadarff returns a (records, metadata) pair; only the records are used.
arff_records = arff.loadarff("../data/credit-g.arff")[0]
data = pd.DataFrame(arff_records)

target = ps.NominalTarget('class', b'bad')
searchSpace = ps.createNominalSelectors(data, ignore=['class'])
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    searchSpace,
    resultSetSize=10,
    depth=3,
    qf=ps.StandardQF(1.0),
)

# First run: beam search, printed as "<quality>:<TAB><description>".
result = ps.BeamSearch(beamWidth=10).execute(task)
for (q, sg) in result:
    print(f"{q!s}:\t{sg.subgroupDescription!s}")

print("******")

# Second run on the same task: depth-first search, same output format.
result = ps.SimpleDFS().execute(task)
for (q, sg) in result:
    print(f"{q!s}:\t{sg.subgroupDescription!s}")
示例#9
0
from scipy.io import arff

import pysubgroup as ps
import pandas as pd
from timeit import default_timer as timer

# loadarff returns a (records, metadata) pair; only the records are used.
arff_records = arff.loadarff("../data/credit-g.arff")[0]
data = pd.DataFrame(arff_records)

target = ps.NominalTarget('class', b'bad')
search_space = ps.create_nominal_selectors(data, ignore=['class'])
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    search_space,
    result_set_size=10,
    depth=5,
    qf=ps.StandardQF(0.5),
)

# NOTE: a disabled variant of this script timed ps.BSD_Bitarray() in exactly
# the same way as the ps.BSD() run below.

# Time a single BSD run on the task.
start = timer()
result = ps.BSD().execute(task)
end = timer()
elapsed = end - start
print("Time elapsed: ", elapsed)

# One line per result: "<quality>:<TAB><description>".
for (q, sg) in result:
    print(f"{q!s}:\t{sg.subgroup_description!s}")
示例#10
0
import pandas as pd
import pysubgroup as ps

# Load the Titanic table and look for subgroups of the target Survived == 0.
data = pd.read_table("../data/titanic.csv")
target = ps.BinaryTarget('Survived', 0)
search_space = ps.create_selectors(data, ignore=['Survived'])

# Both measures are wrapped into a single combined quality function.
combined_qf = ps.CombinedInterestingnessMeasure([
    ps.StandardQF(1),
    ps.GAStandardQF(1),
])
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    search_space,
    result_set_size=5,
    depth=2,
    qf=combined_qf,
)

# Depth-first search with optimistic estimates switched off.
result = ps.SimpleDFS().execute(task, use_optimistic_estimates=False)

# One line per result: "<quality>:<TAB><description>".
for (q, sg) in result:
    print(f"{q!s}:\t{sg.subgroup_description!s}")
示例#11
0
    def setUp(self):
        """Store reference conjunctions, reference qualities and the task.

        ``self.result`` and ``self.qualities`` hold twelve entries each;
        ``self.task`` searches nominal plus numeric selectors of the credit
        data with ``StandardQF(0.5)``, depth 5 and a result set of 12.
        """
        equals = ps.EqualitySelector
        checking = equals("checking_status", b"<0")
        foreign_worker = equals("foreign_worker", b"yes")
        other_parties = equals("other_parties", b"none")
        savings_status = equals("savings_status", b"<100")
        job = equals("job", b"skilled")
        dependents = equals("num_dependents", 1.0)

        # One tuple per reference description, in the stored order.
        reference_descriptions = (
            (checking, foreign_worker, job, other_parties, savings_status),
            (checking, foreign_worker, job, savings_status),
            (checking, foreign_worker, job),
            (checking, job, other_parties, savings_status),
            (checking, foreign_worker, job, other_parties),
            (checking, job, savings_status),
            (checking, foreign_worker, other_parties, savings_status),
            (checking, foreign_worker, other_parties),
            (checking, foreign_worker, savings_status),
            (checking, foreign_worker),
            (checking, foreign_worker, job, dependents, savings_status),
            (checking, job, other_parties),
        )
        self.result = [
            ps.Conjunction(list(selectors))
            for selectors in reference_descriptions
        ]

        self.qualities = [
            0.11457431093955019,
            0.113713540226172,
            0.11201325679119281,
            0.1117538749727658,
            0.11161046793076415,
            0.11145710640046322,
            0.11045259291161472,
            0.10929088624672183,
            0.10875519439407161,
            0.10866138825404954,
            0.10832735026213287,
            0.10813405094128754,
        ]

        credit_data = get_credit_data()
        bad_class = ps.BinaryTarget('class', b'bad')
        nominal = ps.create_nominal_selectors(credit_data, ignore=['class'])
        numeric = ps.create_numeric_selectors(credit_data, ignore=['class'])
        all_selectors = nominal + numeric
        self.task = ps.SubgroupDiscoveryTask(
            credit_data,
            bad_class,
            all_selectors,
            result_set_size=12,
            depth=5,
            qf=ps.StandardQF(0.5),
        )
示例#12
0
import pysubgroup as ps
import pandas as pd

# Load the Titanic table and look for subgroups of the target survived == True.
data = pd.read_csv("~/datasets/titanic.csv")
target = ps.NominalTarget('survived', True)
searchSpace = ps.createSelectors(data, ignore=['survived'])
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    searchSpace,
    resultSetSize=5,
    depth=2,
    qf=ps.StandardQF(1),
)

# Beam search with its default settings.
result = ps.BeamSearch().execute(task)

# One line per result: "<quality>:<TAB><description>".
for (q, sg) in result:
    print(f"{q!s}:\t{sg.subgroupDescription!s}")