示例#1
0
    def test_predict_probability(self):
        """Smoke test: predict_probability runs on an M2-trained classifier."""
        dataframe = pd.read_csv(dataset_file, sep=",")

        train_db = TransactionDB.from_DataFrame(dataframe)
        holdout_db = TransactionDB.from_DataFrame(dataframe[:2])

        classifier = CBA(algorithm="m2")
        classifier.fit(train_db)

        classifier.predict_probability(holdout_db)
示例#2
0
    def test_predict_probability_works(self):
        """Each predicted probability must equal the matched rule's confidence."""
        cba = CBA(algorithm="m1")

        test_dataframe = pd.read_csv(dataset_file, sep=",")

        transactions = TransactionDB.from_DataFrame(test_dataframe)
        transactions_test = TransactionDB.from_DataFrame(test_dataframe[:2])

        cba.fit(transactions)

        probabilities = cba.predict_probability(transactions_test)
        matched_rules = cba.predict_matched_rules(transactions_test)

        # zip pairs each probability with its matched rule directly —
        # clearer than the previous range(len(...)) index bookkeeping
        for probability, rule in zip(probabilities, matched_rules):
            self.assertEqual(probability, rule.confidence)
示例#3
0
    def test_accuracy(self):
        """M2-trained CBA reaches the expected accuracy on two held-out rows."""
        dataframe = pd.read_csv(dataset_file, sep=",")

        train_db = TransactionDB.from_DataFrame(dataframe)
        holdout_db = TransactionDB.from_DataFrame(dataframe[:2])

        classifier = CBA(algorithm="m2")
        classifier.fit(train_db)

        accuracy = classifier.rule_model_accuracy(holdout_db)
        expected_accuracy = 0.5

        self.assertAlmostEqual(accuracy, expected_accuracy, places=3)
示例#4
0
    def test_top_rules(self):
        """Every mined top rule must appear in the hand-computed expectation."""
        header1 = ["A", "B", "Y"]
        rows1 = [
            [1, 1, 0],
            [1, 1, 0],
            [1, 1, 1],
            [0, 0, 0],
            [0, 0, 1],
            [0, 0, 1]
        ]

        transactionDB1 = TransactionDB(rows1, header1)

        # Fix: dropped the redundant `rules = None` pre-assignment — the
        # `with` body always binds `rules`.
        with HiddenPrints():
            rules = top_rules(transactionDB1.string_representation,
                              appearance=transactionDB1.appeardict)

        # (consequent, antecedent, support, confidence) tuples
        expected_rules = [
            ('Y:=:1', ('A:=:1',), 1/6, 1/3),
            ('Y:=:0', ('A:=:1',), 1/3, 2/3),
            ('Y:=:1', ('B:=:1',), 1/6, 1/3),
            ('Y:=:0', ('B:=:1',), 1/3, 2/3),
            ('Y:=:1', ('B:=:1', 'A:=:1'), 1/6, 1/3),
            ('Y:=:0', ('B:=:1', 'A:=:1'), 1/3, 2/3),
            ('Y:=:1', ('A:=:0',), 1/3, 2/3),
            ('Y:=:0', ('A:=:0',), 1/6, 1/3),
            ('Y:=:1', ('B:=:0',), 1/3, 2/3),
            ('Y:=:0', ('B:=:0',), 1/6, 1/3),
            ('Y:=:1', ('B:=:0', 'A:=:0'), 1/3, 2/3),
            ('Y:=:0', ('B:=:0', 'A:=:0'), 1/6, 1/3)
        ]

        for rule in rules:
            # assertIn gives an informative failure message, unlike bare assert,
            # and matches the unittest style used throughout this file
            self.assertIn(rule, expected_rules)
示例#5
0
    def test_len(self):
        """len(TransactionDB) equals the number of input rows."""
        header = ["A", "B", "C", "Y"]
        rows = [
            [1, 1, 0, 0],
            [1, 1, 0, 1],
            [0, 0, 1, 1],
            [0, 1, 0, 1],
        ]

        db = TransactionDB(rows, header)

        assert len(db) == 4
示例#6
0
    def test_default_rule_correct(self):
        """Default-rule class, support and confidence must be sane for M1 and M2."""
        cba = CBA(support=0.9)
        # Fix: the second instance was constructed identically to the first
        # (default algorithm), so the M2 code path was never exercised despite
        # the `_m2` naming throughout this test.
        cba_m2 = CBA(support=0.9, algorithm="m2")

        header1 = ["A", "B", "Y"]
        rows1 = [
            [1, 1, 0],
            [0, 0, 1],
        ]

        transactions = TransactionDB(rows1, header1)

        cba.fit(transactions)
        cba_m2.fit(transactions)

        default_class = cba.clf.default_class
        default_class_m2 = cba_m2.clf.default_class

        # assertIn reports the offending value on failure
        self.assertIn(default_class, ["0", "1"])
        self.assertIn(default_class_m2, ["0", "1"])

        default_class_support = cba.clf.default_class_support
        default_class_confidence = cba.clf.default_class_confidence

        default_class_support_m2 = cba_m2.clf.default_class_support
        default_class_confidence_m2 = cba_m2.clf.default_class_confidence

        # support and confidence are proportions and must lie in [0, 1]
        self.assertTrue(0 <= default_class_support <= 1)
        self.assertTrue(0 <= default_class_support_m2 <= 1)
        self.assertTrue(0 <= default_class_confidence <= 1)
        self.assertTrue(0 <= default_class_confidence_m2 <= 1)
示例#7
0
    def fit(self,
            quant_dataframe,
            cars=None,
            rule_cutoff=30,
            lambda_array=7 * [1],
            class_name=None,
            debug=False,
            algorithm="SLS"):
        """Train one sub-classifier per class label.

        :param quant_dataframe: training data (QuantitativeDataFrame).
        :param cars: optional pre-mined class association rules; when None,
            rules are mined per class from that class's dataframe.
        :param rule_cutoff: number of top rules passed to each sub-classifier.
        :param lambda_array: IDS objective weights (7 terms).
        :param class_name: target column name forwarded to ``_prepare``.
        :param debug: forwarded to each sub-classifier's ``fit``.
        :param algorithm: optimization algorithm name, forwarded to ``fit``.
        """
        self.quant_dataframe_train = quant_dataframe

        self._prepare(quant_dataframe, class_name)

        for class_, clf_dict in self.ids_classifiers.items():
            print("training class:", class_)

            clf = clf_dict["clf"]
            # distinct name: the original rebound `quant_dataframe`, shadowing
            # the training-frame parameter inside the loop
            class_quant_dataframe = clf_dict["quant_dataframe"]
            pandas_dataframe = class_quant_dataframe.dataframe

            # Fix: the `cars` argument used to be ignored and rules were
            # always re-mined; honor caller-supplied rules when present.
            if cars is not None:
                class_cars = cars
            else:
                txns = TransactionDB.from_DataFrame(pandas_dataframe)
                rules = top_rules(txns.string_representation,
                                  appearance=txns.appeardict)
                class_cars = createCARs(rules)
                class_cars.sort(reverse=True)

            clf.fit(class_quant_dataframe,
                    class_cars[:rule_cutoff],
                    lambda_array=lambda_array,
                    debug=debug,
                    algorithm=algorithm)
示例#8
0
    def fit(self, rule_cutoff):
        """Train and score the classifier on each prepared train/test split.

        :param rule_cutoff: number of top CARs kept for training each fold.
        :return: list of per-fold scores (AUC when ``self.score_auc`` is set,
            plain accuracy otherwise).
        """
        dataframes = self._prepare_dataframes()

        scores = []

        for dataframe_train, dataframe_test in dataframes:
            txns_train = TransactionDB.from_DataFrame(dataframe_train)

            rules = top_rules(txns_train.string_representation,
                              appearance=txns_train.appeardict)
            cars = createCARs(rules)[:rule_cutoff]

            quant_dataframe_train = QuantitativeDataFrame(dataframe_train)
            quant_dataframe_test = QuantitativeDataFrame(dataframe_test)

            self.classifier.fit(quant_dataframe_train, cars, debug=self.debug)

            # conditional expression replaces the redundant `score = None`
            # pre-assignment followed by an if/else
            score = (self.classifier.score_auc(quant_dataframe_test)
                     if self.score_auc
                     else self.classifier.score(quant_dataframe_test))

            scores.append(score)

        return scores
示例#9
0
    def mine_frequent_itemsets(self, pandas_df, minsup):
        """Mine frequent itemsets from the attribute columns via apriori."""
        # drop the last column, which holds the class attribute
        attribute_df = pandas_df.iloc[:, :-1]
        txns = TransactionDB.from_DataFrame(attribute_df)

        # fim.apriori takes support as a percentage; report="s" returns supports
        return fim.apriori(txns.string_representation,
                           supp=minsup * 100,
                           report="s")
示例#10
0
    def test_fitting(self):
        """Smoke test: CBA.fit completes on the full dataset."""
        dataframe = pd.read_csv(dataset_file, sep=",")
        transactions = TransactionDB.from_DataFrame(dataframe)

        classifier = CBA()
        classifier.fit(transactions)
示例#11
0
def mine_CARs(df, rule_cutoff, sample=False):
    """Mine class association rules from a DataFrame.

    :param df: pandas DataFrame with the class in its last column.
    :param rule_cutoff: number of rules to return.
    :param sample: when True, draw a random sample of ``rule_cutoff`` rules
        instead of taking the top-ranked ones.
    :return: list of CARs of length ``rule_cutoff`` (or fewer when slicing).
    """
    txns = TransactionDB.from_DataFrame(df)
    rules = top_rules(txns.string_representation, appearance=txns.appeardict)
    cars = createCARs(rules)

    # Fix: the top-k slice used to be computed and then discarded whenever
    # sample=True; branch instead of doing both.
    if sample:
        # NOTE(review): raises ValueError when fewer than `rule_cutoff`
        # rules were mined — confirm that is the intended behavior.
        return random.sample(cars, rule_cutoff)
    return cars[:rule_cutoff]
示例#12
0
    def test_predict_probablity(self):
        """predict_probability_all returns a result for the training data.

        NOTE: the method name keeps the historical "probablity" typo so any
        external test selections by name keep working.
        """
        header1 = ["A", "B", "Y"]
        rows1 = [
            [1, 1, 0],
            [1, 1, 0],
            [1, 1, 1],
            [0, 0, 0],
            [0, 0, 1],
            [0, 0, 1],
        ]

        transactions = TransactionDB(rows1, header1)

        cba = CBA()
        cba.fit(transactions)

        probs = cba.clf.predict_probability_all(transactions)

        # Fix: the result used to be computed and then silently ignored,
        # so the test could never fail on a broken return value.
        self.assertIsNotNone(probs)
示例#13
0
    def test_target_class_works(self):
        """Rules must predict the column selected via `target`."""
        dataframe = pd.read_csv(dataset_file, sep=",")
        transactions = TransactionDB.from_DataFrame(dataframe,
                                                    target="Gender")

        classifier = CBA(algorithm="m2")
        classifier.fit(transactions)

        first_rule = classifier.clf.rules[0]

        self.assertEqual(first_rule.consequent[0], "Gender")
示例#14
0
    def test_rule_class_label_works(self):
        """By default the rule consequent targets the dataframe's last column."""
        dataframe = pd.read_csv(dataset_file, sep=",")
        transactions = TransactionDB.from_DataFrame(dataframe)

        classifier = CBA(algorithm="m2")
        classifier.fit(transactions)

        first_rule = classifier.clf.rules[0]
        expected_attribute = dataframe.columns.values[-1]

        self.assertEqual(first_rule.consequent[0], expected_attribute)
示例#15
0
    def test_inspect(self):
        """inspect() yields a DataFrame: one row per rule plus the default rule."""
        # NOTE(review): this test reads the dataset with sep=";" while the
        # other tests use sep="," — confirm which separator the file uses.
        dataframe = pd.read_csv(dataset_file, sep=";")
        transactions = TransactionDB.from_DataFrame(dataframe)

        classifier = CBA()
        classifier.fit(transactions)

        model = classifier.clf
        inspected = model.inspect()

        self.assertEqual(type(inspected), pd.DataFrame)
        # +1 accounts for the default (fallback) rule appended last
        self.assertEqual(len(inspected), len(model.rules) + 1)

        # the default rule has an empty antecedent
        self.assertEqual(inspected["lhs"].iloc[-1], "{}")
示例#16
0
    def test_init(self):
        """TransactionDB keeps duplicate rows and exposes labels/classes/data."""
        header = ["A", "B", "C", "Y"]
        rows = [
            [1, 1, 0, 0],
            [1, 1, 0, 1],
            [0, 0, 1, 1],
            [0, 1, 0, 1],
        ]

        db = TransactionDB(rows, header, unique_transactions=False)

        expected_first = Transaction([1, 1, 0], "ABC", Item("Y", 0))
        expected_labels = [
            Item("Y", 0),
            Item("Y", 1),
            Item("Y", 1),
            Item("Y", 1),
        ]

        assert db.class_labels == expected_labels
        assert db.classes == ["0", "1", "1", "1"]
        assert db.data[0] == expected_first
示例#17
0
def mine_CARs(df,
              rule_cutoff,
              sample=False,
              random_seed=None,
              **top_rules_kwargs):
    """Mine class association rules, optionally with a reproducible seed.

    :param df: pandas DataFrame with the class in its last column.
    :param rule_cutoff: number of rules to return.
    :param sample: when True, draw a random sample of ``rule_cutoff`` rules
        instead of the top-ranked ones.
    :param random_seed: seeds both ``random`` and ``numpy.random`` when given.
    :param top_rules_kwargs: forwarded to ``top_rules``.
    :return: list of CARs of length ``rule_cutoff`` (or fewer when slicing).
    """
    # Fix: `if random_seed:` skipped seeding for the legitimate seed 0
    if random_seed is not None:
        random.seed(random_seed)
        np.random.seed(random_seed)

    txns = TransactionDB.from_DataFrame(df)
    rules = top_rules(txns.string_representation,
                      appearance=txns.appeardict,
                      **top_rules_kwargs)
    cars = createCARs(rules)

    # Fix: the top-k slice used to be computed and then discarded whenever
    # sample=True; branch instead of doing both.
    if sample:
        return random.sample(cars, rule_cutoff)
    return cars[:rule_cutoff]
示例#18
0
    def test_generateCARs(self):
        """generateCARs must produce the two expected single-consequent rules."""
        header1 = ["A", "B", "Y"]
        rows1 = [
            [1, 1, 0],
            [1, 1, 0],
            [1, 1, 1],
            [0, 0, 0],
            [0, 0, 1],
            [0, 0, 1]
        ]

        transactionDB1 = TransactionDB(rows1, header1)

        rules = generateCARs(transactionDB1, support=50)

        # copy the auto-assigned ids so equality compares only rule content
        car1 = ClassAssocationRule([], Consequent("Y", 1), support=0.5, confidence=0.5)
        car1.id = rules[0].id

        car2 = ClassAssocationRule([], Consequent("Y", 0), support=0.5, confidence=0.5)
        # Fix: previously car1.id was assigned twice and car2.id never set
        car2.id = rules[1].id

        # Fix: the `==` comparison results used to be discarded, so the test
        # could never fail — assert them instead
        self.assertEqual(car1, rules[0])
        self.assertEqual(car2, rules[1])

# Benchmark driver: for each dataset, iterate its train/test splits and
# tune CBA's support/confidence via an objective function.
# NOTE(review): this snippet is truncated — the optimizer that consumes
# `fmax` and the code appending to `benchmark_data` are outside this view.
benchmark_data = []

for dataset_name, rule_cutoff in dataset_rulenum.items():
    print(dataset_name)

    train_files = get_dataset_files(train_path, dataset_name)
    test_files = get_dataset_files(test_path, dataset_name)
    # the [:] slice copies the full pairing; presumably a leftover hook for
    # limiting the number of folds while debugging
    for train_file, test_file in list(zip(train_files, test_files))[:]:
        dataset_path = os.path.join(train_path, train_file)
        dataset_test_path = os.path.join(test_path, test_file)

        df = pd.read_csv(dataset_path)
        quant_df = QuantitativeDataFrame(df)
        txns = TransactionDB.from_DataFrame(df)

        df_test = pd.read_csv(dataset_test_path)
        quant_df_test = QuantitativeDataFrame(df_test)
        txns_test = TransactionDB.from_DataFrame(df_test)

        def fmax(param_dict):
            # Objective for the hyperparameter optimizer: the optimizer
            # supplies integer parameters, interpreted here as per-mille
            # (value / 1000) support and confidence fractions.
            print(param_dict)

            support, confidence = param_dict["support"] / 1000, param_dict[
                "confidence"] / 1000
            print(dict(support=support, confidence=confidence))

            cba = CBA(support=support, confidence=confidence)
            cba.fit(txns)
示例#20
0
# Benchmark script: train CBA on each dataset, then seed an IDS classifier
# with the CBA-mined rules (CBA rules converted to an IDSRuleSet).
# NOTE(review): this snippet appears truncated — `benchmark_list`,
# `txns_test`, and `quant_df_*` are built but never used in the visible code.
dataset_files = [f"{dataset_name}0.csv" for dataset_name in datasets]

dataset_path = "C:/code/python/machine_learning/assoc_rules/"
dataset_path_train = os.path.join(dataset_path, "train")
dataset_path_test = os.path.join(dataset_path, "../../../test")

benchmark_list = []

for dataset_filename in dataset_files:
    print(dataset_filename)

    df_train = pd.read_csv(os.path.join(dataset_path_train, dataset_filename))
    df_test = pd.read_csv(os.path.join(dataset_path_test, dataset_filename))

    txns_train = TransactionDB.from_DataFrame(df_train)
    txns_test = TransactionDB.from_DataFrame(df_test)

    quant_df_train = QuantitativeDataFrame(df_train)
    quant_df_test = QuantitativeDataFrame(df_test)

    # low thresholds so CBA mines a large rule pool for IDS to reuse
    cba = CBA(support=0.1, confidence=0.1)
    cba.fit(txns_train)

    rules = cba.clf.rules
    ids_ruleset = IDSRuleSet.from_cba_rules(rules)

    # build the IDS classifier directly from CBA's rules, reusing CBA's
    # default (fallback) class instead of re-optimizing
    ids = IDS()
    ids.clf = IDSClassifier(ids_ruleset.ruleset)
    ids.clf.default_class = cba.clf.default_class
示例#21
0
import numpy as np

from pyarc.qcba.data_structures import QuantitativeDataFrame
import time

from pyids.algorithms.ids_classifier import mine_CARs
from pyids.algorithms.ids import IDS
from pyarc.data_structures import TransactionDB
from pyarc.algorithms import M1Algorithm

#logging.basicConfig(level=logging.DEBUG)

# Timing benchmark on the iris dataset over growing rule counts.
# NOTE(review): `pd` is used below but no pandas import is visible in this
# chunk — presumably imported earlier in the file; confirm.
# NOTE(review): the snippet is truncated mid-loop (the body after
# `times = []` is outside this view).
iris_file = "c:/code/python/machine_learning/assoc_rules/train/iris0.csv"

df = pd.read_csv(iris_file)
txns = TransactionDB.from_DataFrame(df)

iris_benchmark = []

# rule counts 10, 20, ..., 100
for i in range(10, 110, 10):
    rule_count = i

    # NOTE(review): `rules` is overwritten by the second mine_CARs call
    # below (`cars`) — the first call looks redundant; confirm.
    rules = mine_CARs(df, rule_count)

    quant_df = QuantitativeDataFrame(df)

    cars = mine_CARs(df, rule_count)
    print(len(cars))

    for algorithm in ["DLS", "SLS"]:
        times = []