Example #1
results = ""
for datasetname in list_datasets:
    # load data
    print(f"{datasetname} dataset")
    filename = "./data/single-nominal/" + datasetname + ".csv"
    df = pd.read_csv(filename, delimiter=delim)
    X = df.iloc[:, :-1]
    Y = df.iloc[:, -1]
    Y = pd.DataFrame(Y)
    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.50,
                                                        random_state=42)
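    # fit an MDL-based rule list; beam_width sets the width of the candidate
    # beam search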
    model = MDLRuleList(target_type, task=task_name, beam_width=100)
    model.fit(X_train, Y_train)
    folder_path = makefolder_name(folder2save_name)
    save_rulelist_path = os.path.join(folder_path,
                                      datasetname + "_rulelist.pickle")
    with open(save_rulelist_path, 'wb') as f:
        pickle.dump(model, f)
    measures = nominal_discovery_measures(model._rulelist, X_train, Y_train)
    measures["runtime"] = model.runtime
    measures["nsamples_train"] = X.shape[0]
    measures["swkl_train_norm"] = (measures["wkl_sum"] /
                                   measures["nsamples_train"])

    # add more measures on generalisation
    loss_train, loss_train_norm = model.swkl_generalise(X_train, Y_train)
    measures["loss_train"] = loss_train
    measures["loss_train_norm"] = loss_train_norm
Example #2
# Imports assumed by this snippet; the project-specific helpers
# (makefolder_name, writedf2arff, the run_*_wrapper functions, the measure
# functions and the CSV readers) come from the surrounding codebase.
import numpy as np
import pandas as pd
from scipy.io.arff import loadarff


def runOtherSSDalgorithms(task, algorithmname, list_datasets, arff_algorithm):
    """Run one competitor subgroup-discovery algorithm on every dataset in
    list_datasets and append its quality measures to a summary CSV file."""
    directory_arff = "data/" + task + "-arff/"
    directory_csv = "data/" + task + "/"

    depthmax = 5.0
    beam_width = 100

    savefile = makefolder_name(algorithmname + task)
    savefile = savefile + "/summary.csv"

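    # multi-target tasks read the per-dataset number of target columns from a
    # CSV; single-target tasks trivially have one target column per dataset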
    if task == "multi-numeric" or task == "multi-nominal":
        dataset_number_targets = pd.read_csv(directory_arff +
                                             "number_targets.csv",
                                             index_col=0)
    elif task == "single-numeric" or task == "single-nominal":
        dataset_number_targets = pd.DataFrame(
            index=[datasetname for datasetname in list_datasets],
            columns=["number_targets"],
            data=[1 for datasetname in list_datasets])

    number_rules_SSD = pd.read_csv(directory_arff + "number_rules_SSD.csv",
                                   index_col=0)
    for dataset_number, datasetname in enumerate(list_datasets):
        print("Dataset name: " + str(datasetname))
        number_targets = dataset_number_targets.loc[datasetname,
                                                    "number_targets"]
        # both paths are built up front: the branches below need
        # file_arff_data (DSSD/MCTS4DM) and file_csv_data (FSSD/CN2SD)
        # regardless of which loader was used
        file_arff_data = directory_arff + datasetname + ".arff"
        file_csv_data = directory_csv + datasetname + ".csv"
        if arff_algorithm:
            data_arff_scipy = loadarff(file_arff_data)
            attribute_names = [att for att in data_arff_scipy[1]._attributes]
            df = pd.DataFrame(data_arff_scipy[0])
        else:
            df = pd.read_csv(file_csv_data)
            attribute_names = df.columns[:-number_targets]
        # train test split
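        # every row is used for training and the test set is left empty;
        # the commented-out call below restores a genuine split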
        indeces_train, indeces_test = np.arange(df.shape[0]), np.arange(0)
        # indeces_train, indeces_test = train_test_split(indexes_alldataset, test_size = 0.33, random_state = 42)
        X_train = df.iloc[indeces_train, :-number_targets]
        Y_train = df.iloc[indeces_train, -number_targets:]
        X_test = df.iloc[indeces_test, :-number_targets]
        Y_test = df.iloc[indeces_test, -number_targets:]
        Y_train = pd.DataFrame(Y_train)
        Y_test = pd.DataFrame(Y_test)

        # change configuration file of DSSD
        if algorithmname in ['top-k', 'seq-cover', 'DSSD']:
            save_file_tmp_arff = 'otheralgorithms/DSSD/data/datasets/tmp/tmp.arff'
            writedf2arff(df, indeces_train, file_arff_data, save_file_tmp_arff)
            nitems, subgroup_sets_support_bitset, timespent = run_DSSD_wrapper(
                algorithmname, beam_width, number_rules_SSD, datasetname, df,
                task, depthmax, attribute_names, number_targets)
        elif algorithmname == 'MCTS4DM':
            save_file_tmp_arff = 'otheralgorithms/MCTS4DM/data/datasets/tmp/tmp.arff'
            writedf2arff(df, indeces_train, file_arff_data, save_file_tmp_arff)
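            # NOTE: the call that runs MCTS4DM and fills nitems,
            # subgroup_sets_support_bitset and timespent is not shown here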

        elif algorithmname == 'FSSD':
            class_attribute = 'class'
            attributes, types = transform_dataset_to_attributes(
                file_csv_data, class_attribute, delimiter=',')
            dataset, header = readCSVwithHeader(
                file_csv_data,
                numberHeader=[
                    a for a, t in zip(attributes, types) if t == 'numeric'
                ],
                delimiter=',')
            #dataset_train = [dataset[irow] for irow in indeces_train] # for nursery something weird happens here...
            dataset_train = dataset
            nitems, subgroup_sets_support_bitset, timespent = run_FSSD_wrapper(
                dataset_train, attributes, class_attribute, types, depthmax)
        elif algorithmname in ['CN2SD-entro', 'CN2SD-wracc']:
            class_attribute = 'class'
            attributes, types = transform_dataset_to_attributes(
                file_csv_data, class_attribute, delimiter=',')
            dataset, header = readCSVwithHeader(
                file_csv_data,
                numberHeader=[
                    a for a, t in zip(attributes, types) if t == 'numeric'
                ],
                delimiter=',')
            #dataset_train = [dataset[irow] for irow in indeces_train]
            dataset_train = dataset
            if algorithmname == 'CN2SD-entro':
                quality = 'entropy'
            elif algorithmname == 'CN2SD-wracc':
                quality = 'wracc'
            nitems, subgroup_sets_support_bitset, timespent = run_CN2SD_wrapper(
                dataset_train, attributes, types, class_attribute, beam_width,
                depthmax, quality)
        else:
            raise ValueError(
                "Wrong algorithmname selected. Please try one from this list: "
                "['top-k', 'seq-cover', 'DSSD', 'MCTS4DM', 'FSSD', "
                "'CN2SD-entro', 'CN2SD-wracc']")
        # Train dataset
        nrows_train = Y_train.shape[0]
        if task == "single-nominal" or task == "multi-nominal":
            default_prob_per_class_train = {
                name: {
                    category: sum(columnvals == category) / nrows_train
                    for category in columnvals.unique()
                }
                for name, columnvals in Y_train.items()
            }
            measures_train = nominal_discovery_measures(
                default_prob_per_class_train, subgroup_sets_support_bitset,
                X_train, Y_train)

        elif task == "single-numeric" or task == "multi-numeric":
            # other measures
            measures_train = numeric_discovery_measures(
                subgroup_sets_support_bitset, X_train, Y_train)
        else:
            raise ValueError("Wrong task name")
        measures_train["avg_items"] = sum(nitems) / len(nitems)
        measures_train["runtime"] = timespent

        if dataset_number == 0:
            # write the CSV header once, before the first dataset's row
            header_line = "datasetname," + ",".join(measures_train) + "\n"
            with open(savefile, 'w') as file:
                file.write(header_line)
        append_result2file(measures_train, datasetname, savefile)
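

# A minimal usage sketch (the dataset names here are hypothetical; task and
# algorithm names must be among the options handled above):
#
# runOtherSSDalgorithms("single-nominal", "top-k",
#                       ["adult", "mushroom"], arff_algorithm=True)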
Example #3
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as tick
import matplotlib.markers as marker
import matplotlib.axes as axes

from matplotlib.ticker import FormatStrFormatter
from RSD.util.results2folder import makefolder_name

###############################################################################
# runtime plot
###############################################################################

name_save = "plot_runtime"
datatype= '-nominal'
algorithms = ["RSD","top-k","seq-cover","CN2-SD"]
folder_path = makefolder_name(name_save)
variable = "runtime"
s=50
alp = 0.7
fig = plt.figure()
ax = plt.gca()
list_markers=['s','D','v','^','<',"o",'>']
# load data
results = dict()
for ialg, alg in enumerate(algorithms):
    folder_load = os.path.join("results", alg + datatype, "summary.csv")
    results[alg] = pd.read_csv(folder_load, index_col=False)

labelstotal = results["top-k"].datasetname.to_numpy()
#ax.axvline(10.5,linewidth =1,linestyle="-.", color =(0,0,0))
for ialg, alg in enumerate(algorithms):
    # NOTE: the loop body is truncated in the source; a plausible
    # reconstruction scatters each algorithm's runtime over the datasets
    ax.scatter(np.arange(len(labelstotal)), results[alg][variable],
               s=s, alpha=alp, marker=list_markers[ialg], label=alg)
Example #4
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as tick
import matplotlib.markers as marker
import matplotlib.axes as axes

from matplotlib.ticker import FormatStrFormatter
from RSD.util.results2folder import makefolder_name

###############################################################################
# beam size, categorical targets
###############################################################################

folder_load = os.path.join("results", "hyperparameter testing",
                           "categorical_beam_width_results", "summary.csv")
folder_save = "categorical_hyperparameters_plots"
folder_path = makefolder_name(folder_save)
df_beam = pd.read_csv(folder_load, index_col=False)

datasetsnames = np.unique(df_beam.datasetname)
results2plot = dict()
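# collect, per dataset, the beam widths tried and the resulting compression
# (length_ratio) and wkl_sum scores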
for datname in datasetsnames:
    mask = df_beam.datasetname == datname
    results2plot[datname] = dict()
    results2plot[datname]["beamsize"] = df_beam[mask].beam_width.to_numpy()
    results2plot[datname]["compression"] = df_beam[mask].length_ratio.to_numpy()
    results2plot[datname]["wkl_sum"] = df_beam[mask].wkl_sum.to_numpy()

fig, lgd = make_graph(results2plot,
                      "beamsize",