예제 #1
0
    def _calculate_rule_confidence(self, car, pandas_df):
        """Return the confidence of the rule *car* measured on *pandas_df*."""
        quant_df = QuantitativeDataFrame(pandas_df)
        _support, confidence = quant_df.calculate_rule_statistics(car)
        return confidence
예제 #2
0
    def fit(self, rule_cutoff):
        """Train and score the wrapped classifier on every train/test fold.

        For each fold produced by ``self._prepare_dataframes()``, mines rules
        from the training split, keeps at most *rule_cutoff* of them, fits
        ``self.classifier`` and scores it on the test split.

        Returns a list with one score per fold.
        """
        scores = []

        for train_df, test_df in self._prepare_dataframes():
            train_txns = TransactionDB.from_DataFrame(train_df)

            mined = top_rules(train_txns.string_representation,
                              appearance=train_txns.appeardict)
            cars = createCARs(mined)[:rule_cutoff]

            quant_train = QuantitativeDataFrame(train_df)
            quant_test = QuantitativeDataFrame(test_df)

            self.classifier.fit(quant_train, cars, debug=self.debug)

            # either AUC or plain accuracy, depending on configuration
            if self.score_auc:
                scores.append(self.classifier.score_auc(quant_test))
            else:
                scores.append(self.classifier.score(quant_test))

        return scores
예제 #3
0
    def _prepare_data_sample(self, quant_dataframe):
        """Build one binarized QuantitativeDataFrame per distinct class.

        Each returned dataframe keeps one class label intact and replaces
        every other label with ``self.other_class_label``.
        """
        source_df = quant_dataframe.dataframe

        if self.class_name:
            labels = source_df[self.class_name]
        else:
            labels = source_df.iloc[:, -1]

        binarized_dataframes = []

        for label in np.unique(labels.values):
            # TODO: find a better way than copy
            restricted = source_df.copy()
            binarized_column = np.where(labels == label, label,
                                        self.other_class_label)

            if self.class_name:
                restricted[self.class_name] = binarized_column
            else:
                restricted.iloc[:, -1] = binarized_column

            binarized_dataframes.append(QuantitativeDataFrame(restricted))

        return binarized_dataframes
예제 #4
0
    def split_data_by_class(self, quant_dataframe):
        """Return a dict mapping each class label to a binarized dataframe.

        For every distinct label, the corresponding QuantitativeDataFrame
        keeps that label and maps all others to ``self.other_class_label``.
        """
        source_df = quant_dataframe.dataframe

        if self.class_name:
            labels = source_df[self.class_name]
        else:
            labels = source_df.iloc[:, -1]

        dataframes_by_class = dict()

        for label in np.unique(labels.values):
            binarized_df = source_df.copy()
            binarized_column = np.where(labels == label, label,
                                        self.other_class_label)

            if self.class_name:
                binarized_df[self.class_name] = binarized_column
            else:
                binarized_df.iloc[:, -1] = binarized_column

            dataframes_by_class[label] = QuantitativeDataFrame(binarized_df)

        return dataframes_by_class
예제 #5
0
    def test_optimization(self):
        """Smoke-test CoordinateAscentOptimizer on the titanic dataset."""
        data = pd.read_csv("C:/code/python/interpretable_decision_sets/data/titanic.csv")
        ruleset = mine_IDS_ruleset(data, rule_cutoff=40)

        quant_data = QuantitativeDataFrame(data)
        optimizer = CoordinateAscentOptimizer(IDS(), maximum_consecutive_iterations=1)
        lambdas = optimizer.fit(ruleset, quant_data, quant_data)
예제 #6
0
    def _prepare(self, quant_dataframe, class_name):
        """Register one binarized dataframe and IDS classifier per class.

        For every distinct class label a copy of the data is created in which
        all other labels are replaced by ``self.other_class_label``, and a
        fresh IDS classifier is stored for it (one-vs-all setup).

        Args:
            quant_dataframe: QuantitativeDataFrame wrapping the training data.
            class_name: name of the class column; when falsy, the last
                column is treated as the class.

        Raises:
            Exception: if quant_dataframe has the wrong type, or fewer than
                3 distinct classes are present.
        """
        # isinstance is the idiomatic type check; the original
        # type(...) != ... comparison would also reject legitimate subclasses.
        if not isinstance(quant_dataframe, QuantitativeDataFrame):
            raise Exception("Type of quant_dataframe must be QuantitativeDataFrame")

        self.quant_dataframe = quant_dataframe
        self.pandas_dataframe = self.quant_dataframe.dataframe
        self.ids_classifiers = dict()

        self.class_name = class_name
        self.other_class_label = "OTHER"

        class_column = self.pandas_dataframe[class_name] if class_name else self.pandas_dataframe.iloc[:, -1]
        unique_classes = np.unique(class_column.values)

        if len(unique_classes) < 3:
            raise Exception("Number of distinct classes must be greater than 2, otherwise use binary classifier")

        for class_ in unique_classes:
            # TODO: find a better way than copy
            dataframe_restricted = self.pandas_dataframe.copy()
            dataframe_class_column_restricted = np.where(class_column == class_, class_, self.other_class_label)

            if class_name:
                dataframe_restricted[class_name] = dataframe_class_column_restricted
            else:
                dataframe_restricted.iloc[:, -1] = dataframe_class_column_restricted

            ids_class_clf = IDS()

            # one classifier + one restricted dataframe per class label
            self.ids_classifiers.update({class_: dict(
                quant_dataframe=QuantitativeDataFrame(dataframe_restricted),
                clf=ids_class_clf
            )})
예제 #7
0
    def test_model_fitting(self):
        """Smoke-test fitting IDSOneVsAll and scoring AUC on iris."""
        data = pd.read_csv(
            "C:/code/python/interpretable_decision_sets/data/iris0.csv")

        quant_data = QuantitativeDataFrame(data)
        classifier = IDSOneVsAll()
        classifier.fit(quant_data, quant_data, debug=False)
        auc = classifier.score_auc(quant_data)
예제 #8
0
    def test_dls_algorithm(self):
        """Fit a one-vs-all IDS with the SLS algorithm on the iris data."""
        # NOTE(review): the test name says "dls" but algorithm="SLS" is
        # passed below, and the mined `cars` are never handed to fit() —
        # confirm both are intended.
        data = pd.read_csv(
            "C:/code/python/interpretable_decision_sets/data/iris0.csv")
        cars = mine_CARs(data, rule_cutoff=40)

        quant_data = QuantitativeDataFrame(data)
        classifier = IDSOneVsAll()
        classifier.fit(quant_data, quant_data, algorithm="SLS", debug=False)
        auc = classifier.score_auc(quant_data)
예제 #9
0
    def test_model_fitting(self):
        """Smoke-test fitting a plain IDS classifier on the titanic data."""
        data = pd.read_csv(
            "C:/code/python/interpretable_decision_sets/data/titanic.csv")
        rules = mine_CARs(data, rule_cutoff=40)

        quant_data = QuantitativeDataFrame(data)
        classifier = IDS()
        classifier.fit(quant_data, rules, debug=False)
        auc = classifier.score_auc(quant_data)
예제 #10
0
    def test_random_seed(self):
        """Rule mining and seeded IDS fitting must be reproducible."""
        replications_n = 10
        cars_to_mine = 10

        df = pd.read_csv("data/iris0.csv")
        quant_df = QuantitativeDataFrame(df)

        mined_runs = []
        for _ in range(replications_n):
            # reset the global rule id counter so runs are comparable
            ClassAssocationRule.id = 0
            cars = mine_CARs(df, cars_to_mine)
            mined_runs.append(cars)

        # every mining run must yield the same rules as the first one
        mining_same = [
            _all_rules_same(mined_runs[0], mined_runs[idx])
            for idx in range(replications_n)
        ]
        self.assertTrue(np.all(mining_same))

        # `cars` intentionally refers to the last mining run above
        fitted_rule_sets = []
        for _ in range(replications_n):
            ids = IDS()
            ids = ids.fit(quant_dataframe=quant_df,
                          class_association_rules=cars,
                          debug=False,
                          random_seed=2)
            fitted_rule_sets.append(ids.clf.rules)

        # every seeded fit must select the same rules as the first one
        fitting_same = [
            _all_rules_same(fitted_rule_sets[0], fitted_rule_sets[idx])
            for idx in range(replications_n)
        ]
        self.assertTrue(np.all(fitting_same))
예제 #11
0
    def _prepare(self, quant_dataframe: QuantitativeDataFrame,
                 class_name: str):
        """Register one binarized dataframe and IDS classifier per class.

        For every distinct class label a copy of the data is created in
        which all other labels are replaced by ``self.other_class_label``;
        each entry in ``self.ids_classifiers`` then holds that dataframe,
        a slot for mined rules, and its own IDS classifier.

        Args:
            quant_dataframe: QuantitativeDataFrame wrapping the training data.
            class_name: name of the class column; when falsy, the last
                column is used.

        Raises:
            Exception: if quant_dataframe has the wrong type, or fewer than
                3 distinct classes are present.
        """
        # isinstance is the idiomatic type check; the original
        # type(...) != ... comparison would also reject legitimate subclasses.
        if not isinstance(quant_dataframe, QuantitativeDataFrame):
            raise Exception(
                "Type of quant_dataframe must be QuantitativeDataFrame")

        self.quant_dataframe = quant_dataframe
        self.pandas_dataframe = self.quant_dataframe.dataframe

        # fall back to the last column when no class name is given
        self.class_name = class_name if class_name else self.pandas_dataframe.columns[
            -1]
        class_column = self.pandas_dataframe[self.class_name]
        unique_classes = np.unique(class_column.values)

        if len(unique_classes) < 3:
            raise Exception(
                "Number of distinct classes must be greater than 2, otherwise use binary classifier"
            )

        for class_ in unique_classes:
            dataframe_restricted = self.pandas_dataframe.copy()
            dataframe_class_column_restricted = np.where(
                class_column == class_, class_, self.other_class_label)

            dataframe_restricted[
                self.class_name] = dataframe_class_column_restricted

            ids_class_clf = IDS(algorithm=self.algorithm)

            # one-vs-all: each class gets its own restricted dataframe,
            # a rules slot (filled later) and its own classifier
            self.ids_classifiers.update({
                class_:
                dict(quant_dataframe=QuantitativeDataFrame(
                    dataframe_restricted),
                     rules=None,
                     clf=ids_class_clf)
            })
예제 #12
0
import json
import logging
import random
import time

import pandas as pd
from pyarc.qcba.data_structures import QuantitativeDataFrame

from pyids.algorithms import mine_CARs
from pyids.data_structures import IDSRuleSet

# Script setup: mine a small rule set from iris and wrap the data for IDS.
# NOTE(review): `pd` is used below but pandas is not among this snippet's
# visible imports — confirm it is imported at the top of the file.
logging.basicConfig(level=logging.INFO)

df = pd.read_csv("../../../data/iris0.csv")
# sample=False: take the top rules by CBA criteria rather than a random sample
cars = mine_CARs(df, 10, sample=False)
ids_ruleset = IDSRuleSet.from_cba_rules(cars).ruleset

quant_dataframe = QuantitativeDataFrame(df)


def generate_lambda_arr(one_idx, total_params=7):
    """Return a one-hot lambda vector of length *total_params*.

    The 1 is placed at index *one_idx*; every other entry is 0.  The length
    was previously hard-coded to 7 (one weight per IDS objective) and is now
    a keyword parameter with the same default.

    The original special-cased ``one_idx == 0``, but ``0 * [0]`` is already
    ``[]``, so a single expression suffices.
    """
    return one_idx * [0] + [1] + (total_params - one_idx - 1) * [0]

예제 #13
0
import pandas as pd
from pyids.ids_classifier import IDSOneVsAll, mine_IDS_ruleset
from pyids.model_selection import CoordinateAscentOptimizer, train_test_split_pd

from pyarc.qcba.data_structures import QuantitativeDataFrame

# Tune IDS lambda weights with coordinate ascent on a train/test split.
df = pd.read_csv("./data/iris0.csv")
# prop=0.2: hold out 20% of the rows for testing
df_train, df_test = train_test_split_pd(df, prop=0.2)

ids_ruleset = mine_IDS_ruleset(df_train, rule_cutoff=50)

quant_dataframe_train = QuantitativeDataFrame(df_train)
quant_dataframe_test = QuantitativeDataFrame(df_test)

coordinate_ascent = CoordinateAscentOptimizer(
    IDSOneVsAll(),
    debug=True,
    maximum_delta_between_iterations=200,
    maximum_score_estimation_iterations=3)
coordinate_ascent.fit(ids_ruleset, quant_dataframe_train, quant_dataframe_test)

# best lambda vector found by the optimizer
best_lambda_array = coordinate_ascent.current_best_params
예제 #14
0
from pyids.ids_classifier import IDS, mine_IDS_ruleset
from pyids.ids_ruleset import IDSRuleSet
from pyids.model_selection import RandomSearchOptimizer, train_test_split_pd
from pyids.rule_mining import RuleMiner


# Random search over IDS lambda parameters on the titanic data.
# NOTE(review): `pd`, `QuantitativeDataFrame` and the results/ directory are
# not visible in this snippet's imports/setup — confirm they exist at file
# level.
df = pd.read_csv("../../data/titanic.csv")

rm = RuleMiner()
# minsup-only candidate mining (no confidence threshold)
cars = rm.mine_rules(df, minsup=0.005)

ids_ruleset = IDSRuleSet.from_cba_rules(cars)

df_train, df_test = train_test_split_pd(df, prop=0.25)
quant_df_train, quant_df_test = QuantitativeDataFrame(df_train), QuantitativeDataFrame(df_test)



random_optimizer = RandomSearchOptimizer(IDS(), maximum_score_estimation_iterations=5, maximum_iterations=500)
lambda_array = random_optimizer.fit(ids_ruleset, quant_df_train, quant_df_test)
all_params = random_optimizer.score_params_dict

print(lambda_array)

# persist the best lambda vector and all evaluated parameter sets
with open("results/random_search_lambda_array.txt", "w") as file:
    file.write(str(lambda_array))

with open("results/random_search_all_score_params.txt", "w") as file:
    file.write(str(random_optimizer.score_params_dict))
    return [x for x in os.listdir(path) if x.startswith(dataset_name)]


# Benchmark loop over dataset folds; the body appears truncated in this
# excerpt (the `for` body and `fmax` continue past what is visible).
benchmark_data = []

for dataset_name, rule_cutoff in dataset_rulenum.items():
    print(dataset_name)

    train_files = get_dataset_files(train_path, dataset_name)
    test_files = get_dataset_files(test_path, dataset_name)
    for train_file, test_file in list(zip(train_files, test_files))[:]:
        dataset_path = os.path.join(train_path, train_file)
        dataset_test_path = os.path.join(test_path, test_file)

        df = pd.read_csv(dataset_path)
        quant_df = QuantitativeDataFrame(df)
        txns = TransactionDB.from_DataFrame(df)

        df_test = pd.read_csv(dataset_test_path)
        quant_df_test = QuantitativeDataFrame(df_test)
        txns_test = TransactionDB.from_DataFrame(df_test)

        # objective for a parameter tuner: fit CBA with the candidate
        # support/confidence (both encoded as integers scaled by 1000)
        def fmax(param_dict):
            print(param_dict)

            support, confidence = param_dict["support"] / 1000, param_dict[
                "confidence"] / 1000
            print(dict(support=support, confidence=confidence))

            cba = CBA(support=support, confidence=confidence)
            cba.fit(txns)
예제 #16
0
import pandas as pd
import time
import numpy as np
import random
from pyarc import CBA
from pyarc.qcba import QCBA
from pyarc import TransactionDB
from pyarc.algorithms import (top_rules, createCARs, M1Algorithm, M2Algorithm)

# Load a discretized and an undiscretized variant of the same fold.
df = pd.read_csv(
    "c:/code/python/machine_learning/assoc_rules/train/lymph0.csv")
df_undiscr = pd.read_csv(
    "c:/code/python/machine_learning/assoc_rules/folds_undiscr/train/lymph0.csv"
)

quant_df = QuantitativeDataFrame(df)
quant_df_undiscr = QuantitativeDataFrame(df_undiscr)

benchmark_data = []

# how many timing repetitions per configuration, and the rule-count cap
time_estimation_iterations = 10
max_rules = 100


def generate_lambda_array():
    """Return a list of 7 random lambda weights (one per IDS objective)."""
    # `_` replaces the unused loop variable `i`
    return [generate_lambda_parameter() for _ in range(7)]


def generate_lambda_parameter():
예제 #17
0
from pyids.algorithms.ids import IDS
from pyids.algorithms import mine_CARs
from pyarc.qcba.data_structures import QuantitativeDataFrame
from pyids.model_selection.coordinate_ascent import CoordinateAscent

# Fit IDS with the DUSM algorithm using a fixed, pre-tuned lambda vector.
# NOTE(review): `lambda_dict` is defined but not used in this visible
# snippet — confirm whether it is consumed elsewhere.
lambda_dict = {
    'l1': 124.16415180612711,
    'l2': 38.896662094192955,
    'l3': 557.0996799268405,
    'l4': 638.188385916781,
    'l5': 136.48056698673983,
    'l6': 432.1760402377687,
    'l7': 452.1563786008231
}
lambda_array = [
    665.9341563786008, 271.7242798353909, 212.34156378600824,
    20.489711934156375, 648.5761316872428, 911, 560
]

df = pd.read_csv("C:/code/python/machine_learning/assoc_rules/train/iris0.csv")
quant_df = QuantitativeDataFrame(df)
quant_df_test = QuantitativeDataFrame(
    pd.read_csv("C:/code/python/machine_learning/assoc_rules/test/iris0.csv"))

cars = mine_CARs(df, 20)

ids = IDS(algorithm="DUSM")
ids.fit(quant_df, cars, lambda_array=lambda_array)

# AUC is reported on the training dataframe here, not quant_df_test
print(ids.score_auc(quant_df))
예제 #18
0
def generate_lambda_array():
    """Return a list of 7 random lambda weights (one per IDS objective)."""
    # `_` replaces the unused loop variable `i`
    return [generate_lambda_parameter() for _ in range(7)]


def generate_lambda_parameter():
    """Draw one lambda weight uniformly from the integers 0..1000."""
    # randrange(0, 1001) is exactly randint(0, 1000)
    return random.randrange(0, 1001)


# Timing benchmark over growing data sizes and all four IDS algorithms.
# The loop body appears truncated in this excerpt (`durations` is filled
# and `ids` is fitted past what is visible).
for i in range(100, 3000, 100):
    # take the first i rows as the benchmark dataset
    df = df_all.iloc[:i, :].copy()
    data_count = df.shape[0]

    quant_df = QuantitativeDataFrame(df)

    for algorithm in ["DLS", "SLS", "DUSM", "RUSM"]:
        durations = []

        for _ in range(time_estimation_iterations):
            print(_)
            lambda_array = generate_lambda_array()

            print(f"data count: {data_count}")
            print(f"algorithm: {algorithm}")
            print(f"using lambda: {lambda_array}")

            cars = mine_CARs(df, rule_cutoff=rule_cutoff)

            ids = IDS(algorithm=algorithm)
예제 #19
0
def run1fold(basepath,
             datasetname,
             unique_transactions=True,
             runQCBA=False,
             saveIDSRules=True,
             useConfidenceForCandidateGeneration=True):
    """Train and evaluate IDS (optionally post-processed by QCBA) on one fold.

    Args:
        basepath: root directory containing the ``data/`` folder.
        datasetname: fold file name without the ``.csv`` suffix.
        unique_transactions: not used in this function's visible body;
            kept for interface compatibility.
        runQCBA: when True, load undiscretized folds too, run the python
            QCBA post-processing and evaluate the resulting classifier.
        saveIDSRules: when True, export the learned rules as CSV.
        useConfidenceForCandidateGeneration: choose CBA-style candidate
            mining (True) or min-support-only mining (False).

    Returns:
        A DataFrame with rows accuracy/rulecount/rulelength/buildtime and
        columns 'ids' / 'idsqcba'.
    """
    df_stat = pd.DataFrame(
        columns=['ids', 'idsqcba'],
        index=["accuracy", "rulecount", "rulelength", "buildtime"])

    if runQCBA:
        # python QCBA implementation uses custom discretization format
        data_train_disc = pd.read_csv(
            basepath + "data/folds_discr/train/{}.csv".format(datasetname))
        data_test_disc = pd.read_csv(
            basepath + "data/folds_discr/test/{}.csv".format(datasetname))
        data_test_undisc = pd.read_csv(
            basepath + "data/folds_nodiscr/test/{}.csv".format(datasetname))
        data_train_undisc = pd.read_csv(
            basepath + "data/folds_nodiscr/train/{}.csv".format(datasetname))

        quant_dataframe_test_undisc = QuantitativeDataFrame(data_test_undisc)
        quant_dataframe_train_undisc = QuantitativeDataFrame(data_train_undisc)
    else:
        # R QCBA implementation uses different discretization format,
        # folds are generated with preprocess_for_ids.R
        data_train_disc = pd.read_csv(
            basepath + "data/folds_discr2/train/{}.csv".format(datasetname))
        data_test_disc = pd.read_csv(
            basepath + "data/folds_discr2/test/{}.csv".format(datasetname))

    quant_dataframe_train_disc = QuantitativeDataFrame(data_train_disc)
    quant_dataframe_test_disc = QuantitativeDataFrame(data_test_disc)

    # ground-truth labels come from the last column of the test fold
    actual = quant_dataframe_test_disc.dataframe.iloc[:, -1].values

    if useConfidenceForCandidateGeneration:
        # mine_CARs learns initial candidate rules with CBA-like approach
        # it uses unsupervised paramter tuning to determine conf, supp and len thresholds,
        # as described in Kliegr & Kuchar, 2019
        # Because the subsequent optimization is slow, not all initial candidate rules can be passed to IDS.
        # the sample parameter controls, how the subset of N rules will be selected from the initial candidates:
        # sample=False: take top N rules according to CBA criteria. According to our experiments, this has better results
        # sample=True: take random N rules
        cars = mine_CARs(data_train_disc, 50, sample=False)
    else:
        # learn candidate rules using approach without min confidence described in Lakkaraju et al, 2-16
        print("WARNING save any unsaved work")
        print(
            "WARNING candidate generation without minimum confidence and sampling may be too slow or memory intensive"
        )
        rm = RuleMiner()
        cars = rm.mine_rules(
            data_train_disc,
            minsup=0.01)  # the 0.01 threshold is from the IDS paper
        print(len(cars))
        print("rule mining finished")

    # train IDS model
    ids = IDS()
    start = time.time()
    # all lambdas are set to the same value
    ids.fit(class_association_rules=cars,
            lambda_array=7 * [1],
            quant_dataframe=quant_dataframe_train_disc,
            debug=False,
            random_seed=1)
    end = time.time()
    df_stat.loc["buildtime", "ids"] = end - start

    # apply IDS model
    df_stat.loc["accuracy", "ids"] = ids.score(quant_dataframe_test_disc)
    print("Acc IDS:", df_stat.loc["accuracy", "ids"])
    df_stat.loc["rulecount", "ids"] = len(ids.clf.rules)
    antLengths = list(
        map(lambda r: len(r.car.antecedent.itemset.items()), ids.clf.rules))
    df_stat.loc["rulelength", "ids"] = sum(antLengths) / len(antLengths)

    print("Rule Count IDS:", df_stat.loc["rulecount", "ids"])
    if saveIDSRules:
        idsRulesPath = basepath + modelFolder + "/{}.csv".format(datasetname)
        # NOTE(review): the header keeps the original "suppport" spelling so
        # existing consumers of these CSV files keep working.
        txtexport = "rules,suppport,confidence,lift\n"
        # Before export, IDS sorts the rule by harmonic mean of support and confidence (st.hmean([self.car.support, self.car.confidence]))
        # In this order, rules are also applied for prediction
        for r in ids.clf.rules:
            args = [
                r.car.antecedent.string(),
                "{" + r.car.consequent.string() + "}", r.car.support,
                r.car.confidence, 0
            ]
            txtexport = txtexport + "\"{} => {}\",{:.2f},{:.2f},{:.2f} \n".format(
                *args)
        # add default rule predicting the majority class of the training data
        classname = data_train_disc.columns.values[-1]
        txtexport = txtexport + "\"{ } => " + "{" + classname + "=" + mode(
            data_train_disc[data_train_disc.columns[-1]]) + "}\", 0,0,0"

        print(txtexport)

        # context manager guarantees the handle is closed even if the write
        # fails (the original used a bare open()/close() pair)
        with open(idsRulesPath, "w") as file:
            file.write(txtexport)

    if runQCBA:
        # postprocess IDS model with QCBA
        rules_to_optimize = ids.clf.rules
        start = time.time()
        quant_rules = [QuantitativeCAR(r.car) for r in rules_to_optimize]
        qcba_transformation = QCBATransformation(quant_dataframe_train_undisc)
        transformed_rules = qcba_transformation.transform(quant_rules)
        end = time.time()
        df_stat.loc["buildtime", "idsqcba"] = end - start
        rules, default_class = transformed_rules
        # NOTE(review): rule lengths are computed from the pre-QCBA rules
        # (ids.clf.rules), not the transformed ones — confirm intended.
        antLengths = list(
            map(lambda r: len(r.car.antecedent.itemset.items()),
                ids.clf.rules))
        # +1 because the default rule is not counted
        df_stat.loc["rulelength",
                    "idsqcba"] = sum(antLengths) / (len(antLengths) + 1)

        # apply QCBA model
        qclf = QuantitativeClassifier(rules, default_class)
        pred = qclf.predict(quant_dataframe_test_undisc)

        # evaluate model - QCBA
        df_stat.loc["accuracy", "idsqcba"] = accuracy_score(actual, pred)
        df_stat.loc["rulecount", "idsqcba"] = len(rules)
        print("Acc IDS-QCBA:", df_stat.loc["accuracy", "idsqcba"])
        print("Rule Count IDS-QCBA:", df_stat.loc["rulecount", "idsqcba"])
    return df_stat