def test_predict_probability(self):
    """Smoke test: predict_probability on an m2-trained CBA must not raise."""
    model = CBA(algorithm="m2")
    frame = pd.read_csv(dataset_file, sep=",")
    train_txns = TransactionDB.from_DataFrame(frame)
    probe_txns = TransactionDB.from_DataFrame(frame[:2])
    model.fit(train_txns)
    model.predict_probability(probe_txns)
def test_predict_probability_works(self):
    """Each predicted probability equals the confidence of its matched rule."""
    cba = CBA(algorithm="m1")
    test_dataframe = pd.read_csv(dataset_file, sep=",")
    transactions = TransactionDB.from_DataFrame(test_dataframe)
    transactions_test = TransactionDB.from_DataFrame(test_dataframe[:2])
    cba.fit(transactions)
    probabilities = cba.predict_probability(transactions_test)
    matched_rules = cba.predict_matched_rules(transactions_test)
    # both outputs must cover the same instances; zip alone would silently
    # truncate to the shorter sequence
    self.assertEqual(len(probabilities), len(matched_rules))
    # pair each probability with its rule instead of indexing via range(len(...))
    for probability, rule in zip(probabilities, matched_rules):
        self.assertEqual(probability, rule.confidence)
def test_accuracy(self):
    """rule_model_accuracy on the first two rows is expected to be ~0.5 (m2)."""
    expected_accuracy = 0.5
    model = CBA(algorithm="m2")
    frame = pd.read_csv(dataset_file, sep=",")
    model.fit(TransactionDB.from_DataFrame(frame))
    accuracy = model.rule_model_accuracy(TransactionDB.from_DataFrame(frame[:2]))
    self.assertAlmostEqual(accuracy, expected_accuracy, places=3)
def test_top_rules(self):
    """Every rule mined by top_rules on a toy dataset is from the known set."""
    header = ["A", "B", "Y"]
    rows = [
        [1, 1, 0],
        [1, 1, 0],
        [1, 1, 1],
        [0, 0, 0],
        [0, 0, 1],
        [0, 0, 1],
    ]
    txn_db = TransactionDB(rows, header)

    with HiddenPrints():
        mined = top_rules(txn_db.string_representation, appearance=txn_db.appeardict)

    # (consequent, antecedent, support, confidence) tuples; a set makes the
    # membership checks below O(1)
    expected_rules = {
        ('Y:=:1', ('A:=:1',), 1/6, 1/3),
        ('Y:=:0', ('A:=:1',), 1/3, 2/3),
        ('Y:=:1', ('B:=:1',), 1/6, 1/3),
        ('Y:=:0', ('B:=:1',), 1/3, 2/3),
        ('Y:=:1', ('B:=:1', 'A:=:1'), 1/6, 1/3),
        ('Y:=:0', ('B:=:1', 'A:=:1'), 1/3, 2/3),
        ('Y:=:1', ('A:=:0',), 1/3, 2/3),
        ('Y:=:0', ('A:=:0',), 1/6, 1/3),
        ('Y:=:1', ('B:=:0',), 1/3, 2/3),
        ('Y:=:0', ('B:=:0',), 1/6, 1/3),
        ('Y:=:1', ('B:=:0', 'A:=:0'), 1/3, 2/3),
        ('Y:=:0', ('B:=:0', 'A:=:0'), 1/6, 1/3),
    }
    for rule in mined:
        assert rule in expected_rules
def test_len(self):
    """len() of a TransactionDB equals the number of transaction rows."""
    db = TransactionDB(
        [[1, 1, 0, 0], [1, 1, 0, 1], [0, 0, 1, 1], [0, 1, 0, 1]],
        ["A", "B", "C", "Y"],
    )
    assert len(db) == 4
def test_default_rule_correct(self):
    """Both M1 and M2 classifiers must pick a valid default class with
    support/confidence in [0, 1]."""
    cba = CBA(support=0.9)
    # bug fix: the m2 variant was constructed identically to the m1 one
    # (plain CBA(support=0.9)), so the m2 branch was never exercised
    cba_m2 = CBA(support=0.9, algorithm="m2")

    header1 = ["A", "B", "Y"]
    rows1 = [
        [1, 1, 0],
        [0, 0, 1],
    ]
    transactions = TransactionDB(rows1, header1)

    cba.fit(transactions)
    cba_m2.fit(transactions)

    for clf in (cba.clf, cba_m2.clf):
        self.assertTrue(clf.default_class in ["0", "1"])
        self.assertTrue(0 <= clf.default_class_support <= 1)
        self.assertTrue(0 <= clf.default_class_confidence <= 1)
def fit(self, quant_dataframe, cars=None, rule_cutoff=30, lambda_array=None,
        class_name=None, debug=False, algorithm="SLS"):
    """Train one sub-classifier per class.

    For each class prepared by ``_prepare``, mines top association rules
    from that class's dataframe, converts them to CARs, and fits the
    per-class classifier on the best ``rule_cutoff`` of them.

    :param quant_dataframe: training QuantitativeDataFrame
    :param cars: NOTE(review): currently ignored — rules are always
        re-mined per class below; confirm whether pre-mined CARs should
        be honored when supplied.
    :param rule_cutoff: number of top-sorted CARs kept per class
    :param lambda_array: 7 weights forwarded to the sub-classifier;
        defaults to seven 1s
    :param class_name: forwarded to ``_prepare``
    :param debug: forwarded to the sub-classifier fit
    :param algorithm: optimization algorithm name forwarded to the
        sub-classifier fit (default "SLS")
    """
    # bug fix: the default was the mutable `7 * [1]`, shared across calls;
    # use a None sentinel and build a fresh list per invocation
    if lambda_array is None:
        lambda_array = 7 * [1]

    self.quant_dataframe_train = quant_dataframe
    self._prepare(quant_dataframe, class_name)

    for class_, clf_dict in self.ids_classifiers.items():
        print("training class:", class_)

        clf = clf_dict["clf"]
        # renamed: the original rebound the `quant_dataframe` parameter here,
        # shadowing the training frame stored above
        class_quant_df = clf_dict["quant_dataframe"]
        pandas_dataframe = class_quant_df.dataframe

        txns = TransactionDB.from_DataFrame(pandas_dataframe)
        rules = top_rules(txns.string_representation, appearance=txns.appeardict)
        # renamed from `cars` so the (currently unused) parameter is not clobbered
        mined_cars = createCARs(rules)
        mined_cars.sort(reverse=True)

        clf.fit(class_quant_df, mined_cars[:rule_cutoff],
                lambda_array=lambda_array, debug=debug, algorithm=algorithm)
def fit(self, rule_cutoff):
    """Fit and score the classifier on each prepared train/test split.

    Mines CARs from every training split, fits ``self.classifier`` on the
    top ``rule_cutoff`` of them, and collects one score (AUC or accuracy,
    depending on ``self.score_auc``) per split.

    :return: list of per-split scores
    """
    scores = []
    for dataframe_train, dataframe_test in self._prepare_dataframes():
        txns_train = TransactionDB.from_DataFrame(dataframe_train)
        mined = top_rules(txns_train.string_representation,
                          appearance=txns_train.appeardict)
        cars = createCARs(mined)[:rule_cutoff]

        quant_train = QuantitativeDataFrame(dataframe_train)
        quant_test = QuantitativeDataFrame(dataframe_test)

        self.classifier.fit(quant_train, cars, debug=self.debug)

        if self.score_auc:
            score = self.classifier.score_auc(quant_test)
        else:
            score = self.classifier.score(quant_test)
        scores.append(score)

    return scores
def mine_frequent_itemsets(self, pandas_df, minsup):
    """Mine frequent itemsets from a dataframe, excluding the class column.

    :param pandas_df: dataframe whose last column is the class attribute
    :param minsup: minimum support as a fraction in [0, 1]
    :return: itemsets as produced by ``fim.apriori`` with their supports
    """
    # drop the last (class) column so itemsets cover antecedent attributes only
    classless_txns = TransactionDB.from_DataFrame(pandas_df.iloc[:, :-1])
    # fim.apriori takes support as a percentage; report="s" attaches supports
    return fim.apriori(classless_txns.string_representation,
                       supp=minsup * 100, report="s")
def test_fitting(self):
    """Smoke test: fitting a default CBA on the dataset must not raise."""
    frame = pd.read_csv(dataset_file, sep=",")
    model = CBA()
    model.fit(TransactionDB.from_DataFrame(frame))
def mine_CARs(df, rule_cutoff, sample=False):
    """Mine class association rules from a dataframe.

    :param df: pandas DataFrame to mine (last column treated as the class
        by TransactionDB.from_DataFrame)
    :param rule_cutoff: maximum number of CARs to return
    :param sample: if True, return a random sample of the mined CARs
        instead of the leading slice
    :return: list of at most ``rule_cutoff`` CARs
    """
    txns = TransactionDB.from_DataFrame(df)
    rules = top_rules(txns.string_representation, appearance=txns.appeardict)
    cars = createCARs(rules)

    if sample:
        # bug fix: random.sample raises ValueError when fewer CARs were
        # mined than rule_cutoff, while the slice path tolerates that —
        # cap the sample size so both paths behave consistently
        return random.sample(cars, min(rule_cutoff, len(cars)))
    return cars[:rule_cutoff]
def test_predict_probablity(self):
    """Smoke test: predict_probability_all runs on the training transactions."""
    transactions = TransactionDB(
        [[1, 1, 0], [1, 1, 0], [1, 1, 1], [0, 0, 0], [0, 0, 1], [0, 0, 1]],
        ["A", "B", "Y"],
    )
    model = CBA()
    model.fit(transactions)
    # return value intentionally unused — the test only checks the call succeeds
    model.clf.predict_probability_all(transactions)
def test_target_class_works(self):
    """Rules mined with an explicit target use that column as consequent."""
    model = CBA(algorithm="m2")
    frame = pd.read_csv(dataset_file, sep=",")
    model.fit(TransactionDB.from_DataFrame(frame, target="Gender"))
    first_rule = model.clf.rules[0]
    self.assertEqual(first_rule.consequent[0], "Gender")
def test_rule_class_label_works(self):
    """Without an explicit target, the last dataframe column is the consequent."""
    model = CBA(algorithm="m2")
    frame = pd.read_csv(dataset_file, sep=",")
    model.fit(TransactionDB.from_DataFrame(frame))
    first_rule = model.clf.rules[0]
    last_column = frame.columns.values[-1]
    self.assertEqual(first_rule.consequent[0], last_column)
def test_inspect(self):
    """inspect() returns a DataFrame of all rules plus the default rule row."""
    model = CBA()
    # NOTE(review): this test reads the CSV with sep=";" while the sibling
    # tests use sep="," — confirm the dataset delimiter.
    frame = pd.read_csv(dataset_file, sep=";")
    model.fit(TransactionDB.from_DataFrame(frame))

    clf = model.clf
    inspect_df = clf.inspect()

    self.assertEqual(type(inspect_df), pd.DataFrame)
    # one row per rule, plus a trailing row for the default rule
    self.assertEqual(len(inspect_df), len(clf.rules) + 1)
    # the default rule has an empty antecedent
    self.assertEqual(inspect_df["lhs"].iloc[-1], "{}")
def test_init(self):
    """TransactionDB exposes class labels, class strings and Transaction rows."""
    db = TransactionDB(
        [[1, 1, 0, 0], [1, 1, 0, 1], [0, 0, 1, 1], [0, 1, 0, 1]],
        ["A", "B", "C", "Y"],
        unique_transactions=False,
    )
    expected_first_row = Transaction([1, 1, 0], "ABC", Item("Y", 0))
    expected_labels = [
        Item("Y", 0),
        Item("Y", 1),
        Item("Y", 1),
        Item("Y", 1),
    ]

    assert db.class_labels == expected_labels
    assert db.classes == ["0", "1", "1", "1"]
    assert db.data[0] == expected_first_row
def mine_CARs(df, rule_cutoff, sample=False, random_seed=None, **top_rules_kwargs):
    """Mine class association rules from a dataframe.

    :param df: pandas DataFrame to mine (last column treated as the class
        by TransactionDB.from_DataFrame)
    :param rule_cutoff: maximum number of CARs to return
    :param sample: if True, return a random sample of the mined CARs
        instead of the leading slice
    :param random_seed: optional seed for both ``random`` and NumPy RNGs,
        making the (sampled) result reproducible
    :param top_rules_kwargs: extra keyword arguments forwarded to ``top_rules``
    :return: list of at most ``rule_cutoff`` CARs
    """
    # bug fix: `if random_seed:` skipped seeding for the perfectly valid
    # seed value 0 — test against None instead
    if random_seed is not None:
        random.seed(random_seed)
        np.random.seed(random_seed)

    txns = TransactionDB.from_DataFrame(df)
    rules = top_rules(txns.string_representation,
                      appearance=txns.appeardict, **top_rules_kwargs)
    cars = createCARs(rules)

    if sample:
        # bug fix: random.sample raises ValueError when fewer CARs were
        # mined than rule_cutoff, while the slice path tolerates that —
        # cap the sample size so both paths behave consistently
        return random.sample(cars, min(rule_cutoff, len(cars)))
    return cars[:rule_cutoff]
def test_generateCARs(self):
    """generateCARs at 50% support yields the two empty-antecedent CARs."""
    header1 = ["A", "B", "Y"]
    rows1 = [
        [1, 1, 0],
        [1, 1, 0],
        [1, 1, 1],
        [0, 0, 0],
        [0, 0, 1],
        [0, 0, 1],
    ]
    transactionDB1 = TransactionDB(rows1, header1)
    rules = generateCARs(transactionDB1, support=50)

    # copy the generated ids so equality is not defeated by fresh id counters
    car1 = ClassAssocationRule([], Consequent("Y", 1), support=0.5, confidence=0.5)
    car1.id = rules[0].id
    car2 = ClassAssocationRule([], Consequent("Y", 0), support=0.5, confidence=0.5)
    # bug fix: this id was assigned to car1 (twice), leaving car2 with a new id
    car2.id = rules[1].id

    # bug fix: the comparisons were computed but their results discarded,
    # so the test asserted nothing
    self.assertEqual(car1, rules[0])
    self.assertEqual(car2, rules[1])
# Benchmark driver: for every dataset, load its paired train/test CSVs and
# define an objective (`fmax`) over CBA support/confidence hyperparameters.
benchmark_data = []

for dataset_name, rule_cutoff in dataset_rulenum.items():
    print(dataset_name)

    # matching train/test file lists for this dataset name
    train_files = get_dataset_files(train_path, dataset_name)
    test_files = get_dataset_files(test_path, dataset_name)

    # `[:]` keeps the full list; presumably left over from slicing a subset
    # while debugging — TODO confirm
    for train_file, test_file in list(zip(train_files, test_files))[:]:
        dataset_path = os.path.join(train_path, train_file)
        dataset_test_path = os.path.join(test_path, test_file)

        df = pd.read_csv(dataset_path)
        quant_df = QuantitativeDataFrame(df)
        txns = TransactionDB.from_DataFrame(df)

        df_test = pd.read_csv(dataset_test_path)
        quant_df_test = QuantitativeDataFrame(df_test)
        txns_test = TransactionDB.from_DataFrame(df_test)

        # Objective for a hyperparameter search: params arrive scaled by
        # 1000 (integer grid), converted back to fractions here.
        # NOTE(review): the function body appears to continue beyond this
        # excerpt (no return/score visible yet).
        def fmax(param_dict):
            print(param_dict)
            support, confidence = param_dict["support"] / 1000, param_dict[
                "confidence"] / 1000
            print(dict(support=support, confidence=confidence))

            cba = CBA(support=support, confidence=confidence)
            cba.fit(txns)
# Benchmark driver: train CBA per dataset and wrap its rules in an IDS
# classifier that reuses CBA's default class.
dataset_files = [f"{dataset_name}0.csv" for dataset_name in datasets]

# NOTE(review): hard-coded absolute Windows path; the test dir mixes an
# absolute base with "../../../test" — confirm the intended layout.
dataset_path = "C:/code/python/machine_learning/assoc_rules/"
dataset_path_train = os.path.join(dataset_path, "train")
dataset_path_test = os.path.join(dataset_path, "../../../test")

benchmark_list = []

for dataset_filename in dataset_files:
    print(dataset_filename)

    # paired train/test CSVs share the same file name in both directories
    df_train = pd.read_csv(os.path.join(dataset_path_train, dataset_filename))
    df_test = pd.read_csv(os.path.join(dataset_path_test, dataset_filename))

    txns_train = TransactionDB.from_DataFrame(df_train)
    txns_test = TransactionDB.from_DataFrame(df_test)

    quant_df_train = QuantitativeDataFrame(df_train)
    quant_df_test = QuantitativeDataFrame(df_test)

    # train CBA, then rebuild an IDS classifier from the same rule set
    cba = CBA(support=0.1, confidence=0.1)
    cba.fit(txns_train)
    rules = cba.clf.rules

    ids_ruleset = IDSRuleSet.from_cba_rules(rules)
    ids = IDS()
    ids.clf = IDSClassifier(ids_ruleset.ruleset)
    ids.clf.default_class = cba.clf.default_class
    # NOTE(review): benchmark_list is never appended in this excerpt — the
    # loop body presumably continues beyond what is visible here.
import numpy as np
from pyarc.qcba.data_structures import QuantitativeDataFrame
import time
from pyids.algorithms.ids_classifier import mine_CARs
from pyids.algorithms.ids import IDS
from pyarc.data_structures import TransactionDB
from pyarc.algorithms import M1Algorithm

#logging.basicConfig(level=logging.DEBUG)

# NOTE(review): `pd` is used below but no pandas import is visible in this
# excerpt — presumably imported elsewhere; verify.
iris_file = "c:/code/python/machine_learning/assoc_rules/train/iris0.csv"

df = pd.read_csv(iris_file)
txns = TransactionDB.from_DataFrame(df)

# Benchmark: time IDS training on iris at rule counts 10..100 in steps of 10.
iris_benchmark = []

for i in range(10, 110, 10):
    rule_count = i
    # NOTE(review): rules are mined twice (`rules` then `cars`) with the
    # same arguments; `rules` is unused in the visible portion — confirm.
    rules = mine_CARs(df, rule_count)

    quant_df = QuantitativeDataFrame(df)
    cars = mine_CARs(df, rule_count)
    print(len(cars))

    # NOTE(review): the loop body appears truncated at this excerpt's edge
    # (`times` is initialized but never filled here).
    for algorithm in ["DLS", "SLS"]:
        times = []