def _calculate_rule_confidence(self, car, pandas_df):
    quant_dataframe = QuantitativeDataFrame(pandas_df)
    support, confidence = quant_dataframe.calculate_rule_statistics(car)

    return confidence
def fit(self, rule_cutoff):
    dataframes = self._prepare_dataframes()

    scores = []

    for dataframe_train, dataframe_test in dataframes:
        txns_train = TransactionDB.from_DataFrame(dataframe_train)
        rules = top_rules(txns_train.string_representation,
                          appearance=txns_train.appeardict)
        cars = createCARs(rules)[:rule_cutoff]

        quant_dataframe_train = QuantitativeDataFrame(dataframe_train)
        quant_dataframe_test = QuantitativeDataFrame(dataframe_test)

        self.classifier.fit(quant_dataframe_train, cars, debug=self.debug)

        if self.score_auc:
            score = self.classifier.score_auc(quant_dataframe_test)
        else:
            score = self.classifier.score(quant_dataframe_test)

        scores.append(score)

    return scores
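# A minimal sketch of what _prepare_dataframes() might yield, assuming plain
# random holdout splits built with pyids' train_test_split_pd (hypothetical
# name and split strategy; the real method may use pre-made CV folds instead):
def _prepare_dataframes_sketch(df, n_splits=3):
    from pyids.model_selection import train_test_split_pd

    # each element is a (train, test) pair of pandas DataFrames
    return [train_test_split_pd(df, prop=0.25) for _ in range(n_splits)]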
def _prepare_data_sample(self, quant_dataframe):
    pandas_dataframe = quant_dataframe.dataframe
    class_column = (pandas_dataframe[self.class_name]
                    if self.class_name else pandas_dataframe.iloc[:, -1])
    unique_classes = np.unique(class_column.values)

    restricted_quant_dataframes = []

    for class_ in unique_classes:
        # TODO: find a better way than copying the whole dataframe
        dataframe_restricted = pandas_dataframe.copy()

        dataframe_class_column_restricted = np.where(
            class_column == class_, class_, self.other_class_label)

        if self.class_name:
            dataframe_restricted[self.class_name] = dataframe_class_column_restricted
        else:
            dataframe_restricted.iloc[:, -1] = dataframe_class_column_restricted

        dataframe = QuantitativeDataFrame(dataframe_restricted)
        restricted_quant_dataframes.append(dataframe)

    return restricted_quant_dataframes
def split_data_by_class(self, quant_dataframe):
    pandas_dataframe = quant_dataframe.dataframe
    class_column = (pandas_dataframe[self.class_name]
                    if self.class_name else pandas_dataframe.iloc[:, -1])
    unique_classes = np.unique(class_column.values)

    restricted_quant_dataframes = dict()

    for class_ in unique_classes:
        dataframe_restricted = pandas_dataframe.copy()

        dataframe_class_column_restricted = np.where(
            class_column == class_, class_, self.other_class_label)

        if self.class_name:
            dataframe_restricted[self.class_name] = dataframe_class_column_restricted
        else:
            dataframe_restricted.iloc[:, -1] = dataframe_class_column_restricted

        dataframe = QuantitativeDataFrame(dataframe_restricted)
        restricted_quant_dataframes[class_] = dataframe

    return restricted_quant_dataframes
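# Standalone illustration (with made-up toy labels) of the np.where
# binarization performed above: rows of the currently selected class keep
# their label, all other rows collapse to the "OTHER" placeholder class.
import numpy as np
import pandas as pd

toy_class_column = pd.Series(["setosa", "versicolor", "virginica", "setosa"])
print(np.where(toy_class_column == "setosa", "setosa", "OTHER"))
# -> ['setosa' 'OTHER' 'OTHER' 'setosa']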
def test_optimization(self):
    df = pd.read_csv("C:/code/python/interpretable_decision_sets/data/titanic.csv")
    ids_ruleset = mine_IDS_ruleset(df, rule_cutoff=40)

    quant_df = QuantitativeDataFrame(df)

    ascent = CoordinateAscentOptimizer(IDS(), maximum_consecutive_iterations=1)
    lambdas = ascent.fit(ids_ruleset, quant_df, quant_df)
def _prepare(self, quant_dataframe, class_name):
    if type(quant_dataframe) != QuantitativeDataFrame:
        raise Exception("Type of quant_dataframe must be QuantitativeDataFrame")

    self.quant_dataframe = quant_dataframe
    self.pandas_dataframe = self.quant_dataframe.dataframe
    self.ids_classifiers = dict()

    self.class_name = class_name
    self.other_class_label = "OTHER"

    class_column = (self.pandas_dataframe[class_name]
                    if class_name else self.pandas_dataframe.iloc[:, -1])
    unique_classes = np.unique(class_column.values)

    if len(unique_classes) < 3:
        raise Exception("Number of distinct classes must be greater than 2, otherwise use a binary classifier")

    for class_ in unique_classes:
        # TODO: find a better way than copying the whole dataframe
        dataframe_restricted = self.pandas_dataframe.copy()
        dataframe_class_column_restricted = np.where(
            class_column == class_, class_, self.other_class_label)

        if class_name:
            dataframe_restricted[class_name] = dataframe_class_column_restricted
        else:
            dataframe_restricted.iloc[:, -1] = dataframe_class_column_restricted

        ids_class_clf = IDS()

        self.ids_classifiers.update({class_: dict(
            quant_dataframe=QuantitativeDataFrame(dataframe_restricted),
            clf=ids_class_clf
        )})
def test_model_fitting(self):
    df = pd.read_csv("C:/code/python/interpretable_decision_sets/data/iris0.csv")
    quant_df = QuantitativeDataFrame(df)

    ids = IDSOneVsAll()
    ids.fit(quant_df, quant_df, debug=False)

    auc = ids.score_auc(quant_df)
def test_dls_algorithm(self):
    df = pd.read_csv("C:/code/python/interpretable_decision_sets/data/iris0.csv")
    cars = mine_CARs(df, rule_cutoff=40)

    quant_df = QuantitativeDataFrame(df)

    ids = IDSOneVsAll()
    ids.fit(quant_df, quant_df, algorithm="SLS", debug=False)

    auc = ids.score_auc(quant_df)
def test_model_fitting(self):
    df = pd.read_csv("C:/code/python/interpretable_decision_sets/data/titanic.csv")
    cars = mine_CARs(df, rule_cutoff=40)

    quant_df = QuantitativeDataFrame(df)

    ids = IDS()
    ids.fit(quant_df, cars, debug=False)

    auc = ids.score_auc(quant_df)
def test_random_seed(self):
    replications_n = 10
    cars_to_mine = 10

    df = pd.read_csv("data/iris0.csv")
    quant_df = QuantitativeDataFrame(df)

    mined_cars_multiple = []
    mined_cars_comparison_results = []

    for _ in range(replications_n):
        ClassAssocationRule.id = 0
        cars = mine_CARs(df, cars_to_mine)
        mined_cars_multiple.append(cars)

    for idx in range(replications_n):
        same = _all_rules_same(mined_cars_multiple[0], mined_cars_multiple[idx])
        mined_cars_comparison_results.append(same)

    self.assertTrue(np.all(mined_cars_comparison_results))

    ids_models_multiple = []
    ids_comparison_results = []

    for _ in range(replications_n):
        ids = IDS()
        ids = ids.fit(quant_dataframe=quant_df,
                      class_association_rules=cars,
                      debug=False,
                      random_seed=2)
        ids_models_multiple.append(ids.clf.rules)

    for idx in range(replications_n):
        same = _all_rules_same(ids_models_multiple[0], ids_models_multiple[idx])
        ids_comparison_results.append(same)

    self.assertTrue(np.all(ids_comparison_results))
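# A plausible sketch of the _all_rules_same helper referenced above
# (hypothetical; the real helper is defined elsewhere in the test suite).
# It compares two rule collections pairwise via their string form.
def _all_rules_same(rules_a, rules_b):
    if len(rules_a) != len(rules_b):
        return False

    return all(str(a) == str(b) for a, b in zip(rules_a, rules_b))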
def _prepare(self, quant_dataframe: QuantitativeDataFrame, class_name: str):
    if type(quant_dataframe) != QuantitativeDataFrame:
        raise Exception("Type of quant_dataframe must be QuantitativeDataFrame")

    self.quant_dataframe = quant_dataframe
    self.pandas_dataframe = self.quant_dataframe.dataframe

    self.class_name = class_name if class_name else self.pandas_dataframe.columns[-1]
    class_column = self.pandas_dataframe[self.class_name]
    unique_classes = np.unique(class_column.values)

    if len(unique_classes) < 3:
        raise Exception("Number of distinct classes must be greater than 2, otherwise use a binary classifier")

    for class_ in unique_classes:
        dataframe_restricted = self.pandas_dataframe.copy()
        dataframe_class_column_restricted = np.where(
            class_column == class_, class_, self.other_class_label)

        dataframe_restricted[self.class_name] = dataframe_class_column_restricted

        ids_class_clf = IDS(algorithm=self.algorithm)

        self.ids_classifiers.update({class_: dict(
            quant_dataframe=QuantitativeDataFrame(dataframe_restricted),
            rules=None,
            clf=ids_class_clf
        )})
import json
import logging
import random
import time

import pandas as pd

from pyids.algorithms import mine_CARs
from pyids.data_structures import IDSRuleSet
from pyarc.qcba.data_structures import QuantitativeDataFrame

logging.basicConfig(level=logging.INFO)

df = pd.read_csv("../../../data/iris0.csv")
cars = mine_CARs(df, 10, sample=False)
ids_ruleset = IDSRuleSet.from_cba_rules(cars).ruleset

quant_dataframe = QuantitativeDataFrame(df)


def generate_lambda_arr(one_idx):
    total_params = 7

    start_arr = one_idx * [0]
    end_arr = (total_params - one_idx - 1) * [0]

    return start_arr + [1] + end_arr
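# generate_lambda_arr builds a one-hot lambda vector: it switches on exactly
# one of the seven weights of the IDS objective while zeroing out the rest.
print(generate_lambda_arr(0))  # [1, 0, 0, 0, 0, 0, 0]
print(generate_lambda_arr(3))  # [0, 0, 0, 1, 0, 0, 0]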
import pandas as pd

from pyids.ids_classifier import IDSOneVsAll, mine_IDS_ruleset
from pyids.model_selection import CoordinateAscentOptimizer, train_test_split_pd
from pyarc.qcba.data_structures import QuantitativeDataFrame

df = pd.read_csv("./data/iris0.csv")
df_train, df_test = train_test_split_pd(df, prop=0.2)

ids_ruleset = mine_IDS_ruleset(df_train, rule_cutoff=50)

quant_dataframe_train = QuantitativeDataFrame(df_train)
quant_dataframe_test = QuantitativeDataFrame(df_test)

coordinate_ascent = CoordinateAscentOptimizer(
    IDSOneVsAll(),
    debug=True,
    maximum_delta_between_iterations=200,
    maximum_score_estimation_iterations=3)

coordinate_ascent.fit(ids_ruleset, quant_dataframe_train, quant_dataframe_test)

best_lambda_array = coordinate_ascent.current_best_params
import pandas as pd

from pyids.ids_classifier import IDS, mine_IDS_ruleset
from pyids.ids_ruleset import IDSRuleSet
from pyids.model_selection import RandomSearchOptimizer, train_test_split_pd
from pyids.rule_mining import RuleMiner
from pyarc.qcba.data_structures import QuantitativeDataFrame

df = pd.read_csv("../../data/titanic.csv")

rm = RuleMiner()
cars = rm.mine_rules(df, minsup=0.005)
ids_ruleset = IDSRuleSet.from_cba_rules(cars)

df_train, df_test = train_test_split_pd(df, prop=0.25)
quant_df_train = QuantitativeDataFrame(df_train)
quant_df_test = QuantitativeDataFrame(df_test)

random_optimizer = RandomSearchOptimizer(
    IDS(),
    maximum_score_estimation_iterations=5,
    maximum_iterations=500)

lambda_array = random_optimizer.fit(ids_ruleset, quant_df_train, quant_df_test)
all_params = random_optimizer.score_params_dict

print(lambda_array)

with open("results/random_search_lambda_array.txt", "w") as file:
    file.write(str(lambda_array))

with open("results/random_search_all_score_params.txt", "w") as file:
    file.write(str(random_optimizer.score_params_dict))
def get_dataset_files(path, dataset_name):
    return [x for x in os.listdir(path) if x.startswith(dataset_name)]


benchmark_data = []

for dataset_name, rule_cutoff in dataset_rulenum.items():
    print(dataset_name)

    train_files = get_dataset_files(train_path, dataset_name)
    test_files = get_dataset_files(test_path, dataset_name)

    for train_file, test_file in zip(train_files, test_files):
        dataset_path = os.path.join(train_path, train_file)
        dataset_test_path = os.path.join(test_path, test_file)

        df = pd.read_csv(dataset_path)
        quant_df = QuantitativeDataFrame(df)
        txns = TransactionDB.from_DataFrame(df)

        df_test = pd.read_csv(dataset_test_path)
        quant_df_test = QuantitativeDataFrame(df_test)
        txns_test = TransactionDB.from_DataFrame(df_test)

        def fmax(param_dict):
            print(param_dict)

            support = param_dict["support"] / 1000
            confidence = param_dict["confidence"] / 1000
            print(dict(support=support, confidence=confidence))

            cba = CBA(support=support, confidence=confidence)
            cba.fit(txns)
import random
import time

import numpy as np
import pandas as pd

from pyarc import CBA, TransactionDB
from pyarc.qcba import QCBA
from pyarc.qcba.data_structures import QuantitativeDataFrame
from pyarc.algorithms import (
    top_rules,
    createCARs,
    M1Algorithm,
    M2Algorithm,
)

df = pd.read_csv("c:/code/python/machine_learning/assoc_rules/train/lymph0.csv")
df_undiscr = pd.read_csv("c:/code/python/machine_learning/assoc_rules/folds_undiscr/train/lymph0.csv")

quant_df = QuantitativeDataFrame(df)
quant_df_undiscr = QuantitativeDataFrame(df_undiscr)

benchmark_data = []
time_estimation_iterations = 10
max_rules = 100


def generate_lambda_array():
    lambdas = [generate_lambda_parameter() for i in range(7)]

    return lambdas


def generate_lambda_parameter():
    return random.randint(0, 1000)
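# Usage sketch: each benchmark iteration draws a fresh 7-element lambda
# vector, one random weight per term of the IDS objective.
random.seed(42)  # seeded only to make this sketch reproducible
lambdas = generate_lambda_array()
print(len(lambdas))                          # 7
print(all(0 <= l <= 1000 for l in lambdas))  # True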
import pandas as pd

from pyids.algorithms.ids import IDS
from pyids.algorithms import mine_CARs
from pyids.model_selection.coordinate_ascent import CoordinateAscent
from pyarc.qcba.data_structures import QuantitativeDataFrame

lambda_dict = {
    'l1': 124.16415180612711,
    'l2': 38.896662094192955,
    'l3': 557.0996799268405,
    'l4': 638.188385916781,
    'l5': 136.48056698673983,
    'l6': 432.1760402377687,
    'l7': 452.1563786008231
}

lambda_array = [
    665.9341563786008, 271.7242798353909, 212.34156378600824,
    20.489711934156375, 648.5761316872428, 911, 560
]

df = pd.read_csv("C:/code/python/machine_learning/assoc_rules/train/iris0.csv")
quant_df = QuantitativeDataFrame(df)
quant_df_test = QuantitativeDataFrame(
    pd.read_csv("C:/code/python/machine_learning/assoc_rules/test/iris0.csv"))

cars = mine_CARs(df, 20)

ids = IDS(algorithm="DUSM")
ids.fit(quant_df, cars, lambda_array=lambda_array)

print(ids.score_auc(quant_df))
def generate_lambda_array():
    lambdas = [generate_lambda_parameter() for i in range(7)]

    return lambdas


def generate_lambda_parameter():
    return random.randint(0, 1000)


for i in range(100, 3000, 100):
    df = df_all.iloc[:i, :].copy()
    data_count = df.shape[0]

    quant_df = QuantitativeDataFrame(df)

    for algorithm in ["DLS", "SLS", "DUSM", "RUSM"]:
        durations = []

        for _ in range(time_estimation_iterations):
            print(_)
            lambda_array = generate_lambda_array()

            print(f"data count: {data_count}")
            print(f"algorithm: {algorithm}")
            print(f"using lambda: {lambda_array}")

            cars = mine_CARs(df, rule_cutoff=rule_cutoff)

            ids = IDS(algorithm=algorithm)
def run1fold(basepath,
             datasetname,
             unique_transactions=True,
             runQCBA=False,
             saveIDSRules=True,
             useConfidenceForCandidateGeneration=True):
    df_stat = pd.DataFrame(
        columns=['ids', 'idsqcba'],
        index=["accuracy", "rulecount", "rulelength", "buildtime"])

    if runQCBA:
        # the Python QCBA implementation uses a custom discretization format
        data_train_disc = pd.read_csv(
            basepath + "data/folds_discr/train/{}.csv".format(datasetname))
        data_test_disc = pd.read_csv(
            basepath + "data/folds_discr/test/{}.csv".format(datasetname))
        data_test_undisc = pd.read_csv(
            basepath + "data/folds_nodiscr/test/{}.csv".format(datasetname))
        data_train_undisc = pd.read_csv(
            basepath + "data/folds_nodiscr/train/{}.csv".format(datasetname))
        quant_dataframe_test_undisc = QuantitativeDataFrame(data_test_undisc)
        quant_dataframe_train_undisc = QuantitativeDataFrame(data_train_undisc)
    else:
        # the R QCBA implementation uses a different discretization format;
        # folds are generated with preprocess_for_ids.R
        data_train_disc = pd.read_csv(
            basepath + "data/folds_discr2/train/{}.csv".format(datasetname))
        data_test_disc = pd.read_csv(
            basepath + "data/folds_discr2/test/{}.csv".format(datasetname))

    quant_dataframe_train_disc = QuantitativeDataFrame(data_train_disc)
    quant_dataframe_test_disc = QuantitativeDataFrame(data_test_disc)

    actual = quant_dataframe_test_disc.dataframe.iloc[:, -1].values

    if useConfidenceForCandidateGeneration:
        # mine_CARs learns initial candidate rules with a CBA-like approach.
        # It uses unsupervised parameter tuning to determine the confidence,
        # support and length thresholds, as described in Kliegr & Kuchar, 2019.
        # Because the subsequent optimization is slow, not all initial
        # candidate rules can be passed to IDS. The sample parameter controls
        # how the subset of N rules is selected from the initial candidates:
        #   sample=False: take the top N rules according to CBA criteria.
        #                 According to our experiments, this gives better results.
        #   sample=True:  take N rules at random.
        cars = mine_CARs(data_train_disc, 50, sample=False)
    else:
        # learn candidate rules without a minimum-confidence threshold,
        # following the approach described in Lakkaraju et al., 2016
        print("WARNING save any unsaved work")
        print("WARNING candidate generation without minimum confidence and sampling may be too slow or memory intensive")

        rm = RuleMiner()
        cars = rm.mine_rules(data_train_disc, minsup=0.01)  # the 0.01 threshold is from the IDS paper
        print(len(cars))
        print("rule mining finished")

    # train the IDS model
    ids = IDS()
    start = time.time()
    # all lambdas are set to the same value
    ids.fit(class_association_rules=cars,
            lambda_array=7 * [1],
            quant_dataframe=quant_dataframe_train_disc,
            debug=False,
            random_seed=1)
    end = time.time()
    df_stat.loc["buildtime", "ids"] = end - start

    # apply the IDS model
    df_stat.loc["accuracy", "ids"] = ids.score(quant_dataframe_test_disc)
    print("Acc IDS:", df_stat.loc["accuracy", "ids"])

    df_stat.loc["rulecount", "ids"] = len(ids.clf.rules)
    antLengths = list(
        map(lambda r: len(r.car.antecedent.itemset.items()), ids.clf.rules))
    df_stat.loc["rulelength", "ids"] = sum(antLengths) / len(antLengths)
    print("Rule Count IDS:", df_stat.loc["rulecount", "ids"])

    if saveIDSRules:
        idsRulesPath = basepath + modelFolder + "/{}.csv".format(datasetname)
        txtexport = "rules,support,confidence,lift\n"

        # Before export, IDS sorts the rules by the harmonic mean of support
        # and confidence (st.hmean([self.car.support, self.car.confidence])).
        # Rules are also applied for prediction in this order.
        for r in ids.clf.rules:
            args = [
                r.car.antecedent.string(),
                "{" + r.car.consequent.string() + "}",
                r.car.support, r.car.confidence, 0
            ]
            txtexport += "\"{} => {}\",{:.2f},{:.2f},{:.2f} \n".format(*args)

        # add the default rule
        classname = data_train_disc.columns.values[-1]
        txtexport += "\"{ } => " + "{" + classname + "=" + mode(
            data_train_disc[data_train_disc.columns[-1]]) + "}\", 0,0,0"

        print(txtexport)
        with open(idsRulesPath, "w") as file:
            file.write(txtexport)

    if runQCBA:
        # postprocess the IDS model with QCBA
        rules_to_optimize = ids.clf.rules
        start = time.time()
        quant_rules = [QuantitativeCAR(r.car) for r in rules_to_optimize]
        qcba_transformation = QCBATransformation(quant_dataframe_train_undisc)
        transformed_rules = qcba_transformation.transform(quant_rules)
        end = time.time()
        df_stat.loc["buildtime", "idsqcba"] = end - start

        rules, default_class = transformed_rules
        antLengths = list(
            map(lambda r: len(r.car.antecedent.itemset.items()), ids.clf.rules))
        # +1 because the default rule is not counted
        df_stat.loc["rulelength", "idsqcba"] = sum(antLengths) / (len(antLengths) + 1)

        # apply the QCBA model
        qclf = QuantitativeClassifier(rules, default_class)
        pred = qclf.predict(quant_dataframe_test_undisc)

        # evaluate the QCBA model
        df_stat.loc["accuracy", "idsqcba"] = accuracy_score(actual, pred)
        df_stat.loc["rulecount", "idsqcba"] = len(rules)
        print("Acc IDS-QCBA:", df_stat.loc["accuracy", "idsqcba"])
        print("Rule Count IDS-QCBA:", df_stat.loc["rulecount", "idsqcba"])

    return df_stat
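# Hypothetical single-fold invocation. The basepath layout (the
# data/folds_discr2/train and test CSVs read above) and the dataset name are
# assumptions taken from the reads in run1fold and must exist locally.
if __name__ == "__main__":
    stats = run1fold(basepath="./", datasetname="iris0",
                     runQCBA=False, saveIDSRules=False)
    print(stats)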