Exemplo n.º 1
0
def compare_models(true_model, trained_models_folder, training_data,
                   test_data):
    """Report the final PC-learned GBN models against the ground truth.

    For each conditional-independence test (LinearCorrelation, RCoT) the
    last saved model is loaded, refit on the training data, and its test
    log-likelihood, SHD and Hamming distance are printed.
    """
    print("Ground truth loglik: " + str(true_model.slogl(test_data)))

    for subdir, label in (('/PC/GBN/LinearCorrelation', 'LinearCorrelation'),
                          ('/PC/GBN/RCoT', 'RCoT')):
        # Saved models are numbered, so the lexicographically last pickle
        # is the final iteration.
        model_files = sorted(
            glob.glob(trained_models_folder + subdir + '/*.pickle'))
        gbn = load(model_files[-1])
        gbn.fit(training_data)

        print("GBN " + label + " results:")
        print("Loglik: " + str(gbn.slogl(test_data)))
        print("SHD: " + str(experiments_helper.shd(gbn, true_model)))
        print("Hamming: " + str(experiments_helper.hamming(gbn, true_model)))
        print()
Exemplo n.º 2
0
def compare_models(true_model, trained_models_folder, training_data,
                   test_data):
    """Report the final hill-climbing GBN models against the ground truth.

    For each score (BIC, BGe) the last saved model is loaded, refit on the
    training data, and its test log-likelihood, SHD and Hamming distance
    are printed.
    """
    print("Ground truth loglik: " + str(true_model.slogl(test_data)))

    for subdir, label in (('/HillClimbing/GBN_BIC/', 'BIC'),
                          ('/HillClimbing/GBN_BGe/', 'BGe')):
        # The lexicographically last pickle is the final iteration.
        model_files = sorted(
            glob.glob(trained_models_folder + subdir + '/*.pickle'))
        gbn = load(model_files[-1])
        gbn.fit(training_data)

        print("GBN " + label + " results:")
        print("Loglik: " + str(gbn.slogl(test_data)))
        print("SHD: " + str(experiments_helper.shd(gbn, true_model)))
        print("Hamming: " + str(experiments_helper.hamming(gbn, true_model)))
        print()
Exemplo n.º 3
0
def test_pc_lc_gbn(train_data, test_data, result_folder, idx_fold):
    """Return the test log-likelihood of the last saved PC/LinearCorrelation
    Gaussian model for the given cross-validation fold."""
    fold_dir = (result_folder + '/PC/Gaussian/LinearCorrelation/' +
                str(idx_fold))
    # The lexicographically last pickle is the final iteration.
    model = load(sorted(glob.glob(fold_dir + '/*.pickle'))[-1])
    model.fit(train_data)
    return model.slogl(test_data)
Exemplo n.º 4
0
def compare_models(true_model, trained_models_folder, training_data, test_data,
                   patience):
    """Report the final hill-climbing SPBN models against the ground truth.

    For each patience value the last saved model is loaded, refit on the
    training data, and its loglik, SHD, Hamming and type-Hamming distances
    are printed.
    """
    print("Ground truth loglik: " + str(true_model.slogl(test_data)))
    print("SPBN results:")
    for p in patience:
        # The lexicographically last pickle is the final iteration.
        model_files = sorted(
            glob.glob(trained_models_folder + '/HillClimbing/SPBN/' + str(p) +
                      '/*.pickle'))
        spbn = load(model_files[-1])
        spbn.fit(training_data)

        print("Loglik, p " + str(p) + ": " + str(spbn.slogl(test_data)))
        print("SHD, p " + str(p) + ": " +
              str(experiments_helper.shd(spbn, true_model)))
        print("Hamming, p " + str(p) + ": " +
              str(experiments_helper.hamming(spbn, true_model)))
        print("Type Hamming, p " + str(p) + ": " +
              str(experiments_helper.hamming_type(spbn)))
        print()
Exemplo n.º 5
0
def test_bge_gaussian(train_data, test_data, result_folder, idx_fold):
    """Return the test log-likelihood of the last saved hill-climbing BGe
    Gaussian model for the given cross-validation fold."""
    fold_dir = result_folder + '/HillClimbing/Gaussian/BGe/' + str(idx_fold)
    # The lexicographically last pickle is the final iteration.
    model = load(sorted(glob.glob(fold_dir + '/*.pickle'))[-1])
    model.fit(train_data)
    return model.slogl(test_data)
Exemplo n.º 6
0
def compare_models(true_model, trained_models_folder, training_data,
                   test_data):
    """Report the final PC-learned SPBN models against the ground truth.

    For each independence test (LinearCorrelation, then RCoT) and each
    patience value, the last saved model is loaded, refit on the training
    data, and its loglik, SHD, Hamming and type-Hamming are printed.
    """
    print("Ground truth loglik: " + str(true_model.slogl(test_data)))

    for subdir, label in (('/PC/SPBN/LinearCorrelation/',
                           'LinearCorrelation'),
                          ('/PC/SPBN/RCoT/', 'RCoT')):
        for p in experiments_helper.PATIENCE:
            # The lexicographically last pickle is the final iteration.
            model_files = sorted(
                glob.glob(trained_models_folder + subdir + str(p) +
                          '/*.pickle'))
            spbn = load(model_files[-1])
            spbn.fit(training_data)

            print("SPBN " + label + " results:")
            print("Loglik, p " + str(p) + ": " + str(spbn.slogl(test_data)))
            print("SHD, p " + str(p) + ": " +
                  str(experiments_helper.shd(spbn, true_model)))
            print("Hamming, p " + str(p) + ": " +
                  str(experiments_helper.hamming(spbn, true_model)))
            print("Type Hamming, p " + str(p) + ": " +
                  str(experiments_helper.hamming_type(spbn)))
            print()
Exemplo n.º 7
0
def test_pc_lc_spbn(train_data, test_data, folds, patience, result_folder, idx_fold):
    """Return a (len(folds), len(patience)) array of test log-likelihoods of
    the last saved PC/LinearCorrelation SPBN-CKDE model per configuration."""
    scores = np.full((len(folds), len(patience)), np.nan)

    for i, k in enumerate(folds):
        for j, p in enumerate(patience):
            models_dir = (result_folder +
                          '/PC/SPBN_CKDE/LinearCorrelation/Validation_' +
                          str(k) + "_" + str(p) + '/' + str(idx_fold))
            # The lexicographically last pickle is the final iteration.
            model = load(sorted(glob.glob(models_dir + '/*.pickle'))[-1])
            model.fit(train_data)
            scores[i, j] = model.slogl(test_data)

    return scores
Exemplo n.º 8
0
def run_pc_rcot_kdebn(result_folder, idx_fold):
    """Build the initial KDE network from the fold's PC/RCoT graph and save
    it as iteration 000000."""
    fold_folder = result_folder + '/PC/KDEBN/RCoT/' + str(idx_fold)
    pathlib.Path(fold_folder).mkdir(parents=True, exist_ok=True)

    pdag = load(result_folder + '/PC/graph-rcot-' + str(idx_fold) + ".pickle")

    # Extending the PDAG to a DAG can fail; fall back to an approximate
    # extension in that case.
    try:
        dag = pdag.to_dag()
    except ValueError:
        dag = pdag.to_approximate_dag()

    KDENetwork(dag).save(fold_folder + "/000000")
Exemplo n.º 9
0
def test_validation_gaussian(train_data, test_data, folds, patience,
                             result_folder, idx_fold):
    """Return a (len(folds), len(patience)) array of test log-likelihoods of
    the last saved hill-climbing Gaussian model per configuration."""
    scores = np.full((len(folds), len(patience)), np.nan)

    for i, k in enumerate(folds):
        for j, p in enumerate(patience):
            models_dir = (result_folder +
                          '/HillClimbing/Gaussian/Validation_' + str(k) +
                          "_" + str(p) + '/' + str(idx_fold))
            # The lexicographically last pickle is the final iteration.
            model = load(sorted(glob.glob(models_dir + '/*.pickle'))[-1])
            model.fit(train_data)
            scores[i, j] = model.slogl(test_data)

    return scores
Exemplo n.º 10
0
def run_pc_lc_gbn(result_folder, idx_fold):
    """Build the initial Gaussian network from the fold's
    PC/LinearCorrelation graph and save it as iteration 000000."""
    fold_folder = (result_folder + '/PC/Gaussian/LinearCorrelation/' +
                   str(idx_fold))
    pathlib.Path(fold_folder).mkdir(parents=True, exist_ok=True)

    pdag = load(result_folder + '/PC/graph-lc-' + str(idx_fold) + ".pickle")

    # Extending the PDAG to a DAG can fail; fall back to an approximate
    # extension in that case.
    try:
        dag = pdag.to_dag()
    except ValueError:
        dag = pdag.to_approximate_dag()

    GaussianNetwork(dag).save(fold_folder + "/000000")
Exemplo n.º 11
0
def test_spbn(df, model_folder, patience, dag_type):
    """Print loglik/SHD/Hamming/type-Hamming of the final PC-based SPBN
    model for each patience value.

    NOTE(review): reads the module-level globals ``df_test`` and
    ``true_model`` — confirm they are defined in the using module.
    """
    print("Dag Type " + dag_type)
    for p in patience:
        result_folder = model_folder + '/PC/SPBN/' + dag_type + '/' + str(p)
        pathlib.Path(result_folder).mkdir(parents=True, exist_ok=True)

        # Saved models are numbered; the lexicographically last pickle is
        # the final iteration.
        all_models = sorted(glob.glob(result_folder + '/*.pickle'))
        final_model = load(all_models[-1])
        final_model.fit(df)

        slogl = final_model.slogl(df_test)

        print("Loglik, p " + str(p) + ": " + str(slogl))
        print("SHD, p " + str(p) + ": " +
              str(experiments_helper.shd(final_model, true_model)))
        print("Hamming, p " + str(p) + ": " +
              str(experiments_helper.hamming(final_model, true_model)))
        # NOTE(review): other snippets call hamming_type with one argument;
        # confirm this two-argument signature is correct.
        print("Hamming type, p " + str(p) + ": " +
              str(experiments_helper.hamming_type(final_model, true_model)))

        print()
Exemplo n.º 12
0
def run_pc_lc_spbn(train_data, folds, patience, result_folder, idx_fold):
    """Learn SPBN-CKDE models starting from the fold's PC/LinearCorrelation
    graph.

    For every (k, patience) combination a greedy hill-climbing search over
    node types is run with a k-fold validated likelihood score. Each
    iteration is saved by the SaveModel callback; an ``end.lock`` file
    marks a configuration as finished so reruns skip it.
    """
    hc = GreedyHillClimbing()
    change_node_type = ChangeNodeTypeSet()

    pdag = load(result_folder + '/PC/graph-lc-' + str(idx_fold) + ".pickle")

    # Extending the PDAG to a DAG can fail; fall back to an approximate
    # extension in that case.
    try:
        dag = pdag.to_dag()
    except ValueError:
        dag = pdag.to_approximate_dag()

    for k in folds:
        vl = ValidatedLikelihood(train_data, k=k, seed=experiments_helper.SEED)

        for p in patience:
            fold_folder = result_folder + '/PC/SPBN_CKDE/LinearCorrelation/Validation_' + str(
                k) + '_' + str(p) + '/' + str(idx_fold)
            pathlib.Path(fold_folder).mkdir(parents=True, exist_ok=True)

            # Skip configurations completed by a previous run.
            if os.path.exists(fold_folder + '/end.lock'):
                continue

            cb_save = SaveModel(fold_folder)

            # Start from an all-CKDE model over the PC-learned structure.
            node_types = [(name, NodeType.CKDE) for name in dag.nodes()]
            start_model = SemiparametricBN(dag, node_types)

            bn = hc.estimate(change_node_type,
                             vl,
                             start_model,
                             callback=cb_save,
                             patience=p,
                             verbose=True)
            # Save the final model under the next sequential file number
            # after the callback's last saved iteration.
            iters = sorted(glob.glob(fold_folder + '/*.pickle'))
            last_file = os.path.basename(iters[-1])
            number = int(os.path.splitext(last_file)[0])
            bn.save(fold_folder + '/' + str(number + 1).zfill(6) + ".pickle")
            with open(fold_folder + '/end.lock', 'w') as f:
                pass
Exemplo n.º 13
0
# Seed NumPy's global RNG before the other imports so any randomness used
# at import time is reproducible.
import numpy as np

np.random.seed(0)
import pandas as pd
import pathlib
import glob
from pybnesian import load
import experiments_helper
from generate_dataset_spbn import slogl_model

# Ground-truth model used as the reference for all comparisons.
true_model = load('true_model.pickle')

# Training sets of increasing size plus a held-out test set.
df_200 = pd.read_csv('synthetic_200.csv')
df_2000 = pd.read_csv('synthetic_2000.csv')
df_10000 = pd.read_csv('synthetic_10000.csv')
df_test = pd.read_csv('synthetic_test.csv')

print("True model logl: " + str(slogl_model(df_test)))

patience = experiments_helper.PATIENCE


def test_spbn(df, model_folder, patience, dag_type):
    """Load and refit the final PC-based SPBN model for each patience value.

    NOTE(review): this snippet appears truncated — the fitted model is
    neither evaluated nor returned here.
    """
    print("Dag Type " + dag_type)
    for p in patience:
        result_folder = model_folder + '/PC/SPBN/' + dag_type + '/' + str(p)
        pathlib.Path(result_folder).mkdir(parents=True, exist_ok=True)

        # The lexicographically last pickle is the final iteration.
        all_models = sorted(glob.glob(result_folder + '/*.pickle'))
        final_model = load(all_models[-1])
        final_model.fit(df)
Exemplo n.º 14
0
def extract_info(train_datasets, test_datasets, model_folders, true_models):
    """Collect evaluation metrics for every learned model variant.

    For each dataset and each instance count, the final saved model of each
    algorithm/score/test/patience combination is loaded, refit on the
    training data, and evaluated: test log-likelihood (slogl), Hamming
    distance (hmd), structural Hamming distance (shd) and node-type Hamming
    distance (thd) against the true model.

    Returns four tuples of numpy arrays: (slogl...), (hmd...), (shd...),
    (thd...), indexed by [dataset, instance, (test), (patience)].
    """
    patience = experiments_helper.PATIENCE

    tests = experiments_helper.TESTS

    # Result arrays; leading axes are (dataset, instance-size), with extra
    # trailing axes for the independence test and/or patience value.
    slogl_true = np.empty((len(train_datasets, )))
    slogl_hc_gbn_bic = np.empty((len(train_datasets), len(train_datasets[0])))
    slogl_hc_gbn_bge = np.empty((len(train_datasets), len(train_datasets[0])))
    slogl_hc_spbn = np.empty(
        (len(train_datasets), len(train_datasets[0]), len(patience)))
    slogl_hc_spbn_ckde = np.empty(
        (len(train_datasets), len(train_datasets[0]), len(patience)))
    slogl_pc_gbn = np.empty(
        (len(train_datasets), len(train_datasets[0]), len(tests)))
    slogl_pc_spbn = np.empty((len(train_datasets), len(train_datasets[0]),
                              len(tests), len(patience)))
    slogl_pc_spbn_ckde = np.empty((len(train_datasets), len(train_datasets[0]),
                                   len(tests), len(patience)))

    hmd_hc_gbn_bic = np.empty((len(train_datasets), len(train_datasets[0])))
    hmd_hc_gbn_bge = np.empty((len(train_datasets), len(train_datasets[0])))
    hmd_hc_spbn = np.empty(
        (len(train_datasets), len(train_datasets[0]), len(patience)))
    hmd_hc_spbn_ckde = np.empty(
        (len(train_datasets), len(train_datasets[0]), len(patience)))
    hmd_pc = np.empty(
        (len(train_datasets), len(train_datasets[0]), len(tests)))

    shd_hc_gbn_bic = np.empty((len(train_datasets), len(train_datasets[0])))
    shd_hc_gbn_bge = np.empty((len(train_datasets), len(train_datasets[0])))
    shd_hc_spbn = np.empty(
        (len(train_datasets), len(train_datasets[0]), len(patience)))
    shd_hc_spbn_ckde = np.empty(
        (len(train_datasets), len(train_datasets[0]), len(patience)))
    shd_pc = np.empty(
        (len(train_datasets), len(train_datasets[0]), len(tests)))

    thd_hc_spbn = np.empty(
        (len(train_datasets), len(train_datasets[0]), len(patience)))
    thd_hc_spbn_ckde = np.empty(
        (len(train_datasets), len(train_datasets[0]), len(patience)))
    thd_pc_spbn = np.empty((len(train_datasets), len(train_datasets[0]),
                            len(tests), len(patience)))
    thd_pc_spbn_ckde = np.empty((len(train_datasets), len(train_datasets[0]),
                                 len(tests), len(patience)))

    for idx_dataset, (instance_datasets, test_data, dataset_folders,
                      true_model) in enumerate(
                          zip(train_datasets, test_datasets, model_folders,
                              true_models)):
        for idx_instances, (training_data, folder) in enumerate(
                zip(instance_datasets, dataset_folders)):

            # NOTE(review): rewritten once per instance size with the same
            # value — the true model's slogl only depends on the dataset.
            slogl_true[idx_dataset] = true_model.slogl(test_data)

            ###########################
            # GBN BIC
            ###########################
            gbn_bic_folder = folder + '/HillClimbing/GBN_BIC/'

            # The lexicographically last pickle is the final iteration.
            all_models = sorted(glob.glob(gbn_bic_folder + '/*.pickle'))
            final_model = all_models[-1]

            bic = load(final_model)
            bic.fit(training_data)

            slogl_hc_gbn_bic[idx_dataset, idx_instances] = bic.slogl(test_data)
            hmd_hc_gbn_bic[idx_dataset,
                           idx_instances] = experiments_helper.hamming(
                               bic, true_model)
            shd_hc_gbn_bic[idx_dataset,
                           idx_instances] = experiments_helper.shd(
                               bic, true_model)

            ###########################
            # GBN BGe
            ###########################
            gbn_bge_folder = folder + '/HillClimbing/GBN_BGe/'

            all_models = sorted(glob.glob(gbn_bge_folder + '/*.pickle'))
            final_model = all_models[-1]

            bge = load(final_model)
            bge.fit(training_data)

            slogl_hc_gbn_bge[idx_dataset, idx_instances] = bge.slogl(test_data)
            hmd_hc_gbn_bge[idx_dataset,
                           idx_instances] = experiments_helper.hamming(
                               bge, true_model)
            shd_hc_gbn_bge[idx_dataset,
                           idx_instances] = experiments_helper.shd(
                               bge, true_model)

            ###########################
            # HC SPBN
            ###########################
            for idx_p, p in enumerate(patience):
                spbn_hc_folder = folder + '/HillClimbing/SPBN/' + str(p)

                all_models = sorted(glob.glob(spbn_hc_folder + '/*.pickle'))
                final_model = all_models[-1]

                spbn = load(final_model)
                spbn.fit(training_data)

                slogl_hc_spbn[idx_dataset, idx_instances,
                              idx_p] = spbn.slogl(test_data)
                hmd_hc_spbn[idx_dataset, idx_instances,
                            idx_p] = experiments_helper.hamming(
                                spbn, true_model)
                shd_hc_spbn[idx_dataset, idx_instances,
                            idx_p] = experiments_helper.shd(spbn, true_model)
                thd_hc_spbn[idx_dataset, idx_instances,
                            idx_p] = experiments_helper.hamming_type(spbn)

            ###########################
            # HC SPBN CKDE
            ###########################
            for idx_p, p in enumerate(patience):
                spbn_ckde_hc_folder = folder + '/HillClimbing/SPBN_CKDE/' + str(
                    p)

                all_models = sorted(
                    glob.glob(spbn_ckde_hc_folder + '/*.pickle'))
                final_model = all_models[-1]

                spbn_ckde = load(final_model)
                spbn_ckde.fit(training_data)

                slogl_hc_spbn_ckde[idx_dataset, idx_instances,
                                   idx_p] = spbn_ckde.slogl(test_data)
                hmd_hc_spbn_ckde[idx_dataset, idx_instances,
                                 idx_p] = experiments_helper.hamming(
                                     spbn_ckde, true_model)
                shd_hc_spbn_ckde[idx_dataset, idx_instances,
                                 idx_p] = experiments_helper.shd(
                                     spbn_ckde, true_model)
                thd_hc_spbn_ckde[idx_dataset, idx_instances,
                                 idx_p] = experiments_helper.hamming_type(
                                     spbn_ckde)

            ###########################
            # PC GBN and PC Graph
            ###########################
            for idx_t, test in enumerate(tests):
                gbn_pc_folder = folder + '/PC/GBN/' + test

                all_models = sorted(glob.glob(gbn_pc_folder + '/*.pickle'))
                final_model = all_models[-1]

                gbn_pc = load(final_model)
                gbn_pc.fit(training_data)

                slogl_pc_gbn[idx_dataset, idx_instances,
                             idx_t] = gbn_pc.slogl(test_data)
                # Structural metrics of the PC graph itself are shared by
                # all PC-based models, so they are stored once per test.
                hmd_pc[idx_dataset, idx_instances,
                       idx_t] = experiments_helper.hamming(gbn_pc, true_model)
                shd_pc[idx_dataset, idx_instances,
                       idx_t] = experiments_helper.shd(gbn_pc, true_model)

            ###########################
            # PC SPBN
            ###########################
            for idx_t, test in enumerate(tests):
                for idx_p, p in enumerate(patience):
                    spbn_pc_folder = folder + '/PC/SPBN/' + test + '/' + str(p)

                    all_models = sorted(glob.glob(spbn_pc_folder +
                                                  '/*.pickle'))
                    final_model = all_models[-1]

                    spbn_pc = load(final_model)
                    spbn_pc.fit(training_data)

                    slogl_pc_spbn[idx_dataset, idx_instances, idx_t,
                                  idx_p] = spbn_pc.slogl(test_data)
                    thd_pc_spbn[idx_dataset, idx_instances, idx_t,
                                idx_p] = experiments_helper.hamming_type(
                                    spbn_pc)

            ###########################
            # PC SPBN CKDE
            ###########################
            for idx_t, test in enumerate(tests):
                for idx_p, p in enumerate(patience):
                    spbn_ckde_pc_folder = folder + '/PC/SPBN_CKDE/' + test + '/' + str(
                        p)

                    all_models = sorted(
                        glob.glob(spbn_ckde_pc_folder + '/*.pickle'))
                    final_model = all_models[-1]

                    spbn_ckde_pc = load(final_model)
                    spbn_ckde_pc.fit(training_data)

                    slogl_pc_spbn_ckde[idx_dataset, idx_instances, idx_t,
                                       idx_p] = spbn_ckde_pc.slogl(test_data)
                    thd_pc_spbn_ckde[idx_dataset, idx_instances, idx_t,
                                     idx_p] = experiments_helper.hamming_type(
                                         spbn_ckde_pc)


    return (slogl_true, slogl_hc_gbn_bic, slogl_hc_gbn_bge, slogl_hc_spbn, slogl_hc_spbn_ckde, slogl_pc_gbn, slogl_pc_spbn, slogl_pc_spbn_ckde), \
           (hmd_hc_gbn_bic, hmd_hc_gbn_bge, hmd_hc_spbn, hmd_hc_spbn_ckde, hmd_pc), \
           (shd_hc_gbn_bic, shd_hc_gbn_bge, shd_hc_spbn, shd_hc_spbn_ckde, shd_pc),\
           (thd_hc_spbn, thd_hc_spbn_ckde, thd_pc_spbn, thd_pc_spbn_ckde)
Exemplo n.º 15
0
def kdeness_ckde():
    """Plot, per dataset, the ratio of CKDE nodes in the final SPBN-CKDE
    models for each algorithm (HC, PC-PLC, PC-RCoT) and training-fold
    count, averaged over 10 cross-validation folds. Output is written to
    plots/kdeness.tex via tikzplotlib.
    """
    folds = experiments_helper.TRAINING_FOLDS
    # NOTE(review): `patience` is unused below — the folder names hardcode
    # a patience of 5 ('_5/').
    patience = experiments_helper.PATIENCE

    files = experiments_helper.find_crossvalidation_datasets()
    valid_files = [
        f for f in files
        if experiments_helper.validate_dataset(f, folds) is not None
    ]

    # CKDE node counts indexed by (dataset, fold-count, algorithm, cv-fold).
    n_ckde = np.full((len(valid_files), len(folds), 3, 10), np.nan)
    datasets = []
    n_vars = []
    for idx_file, file in enumerate(valid_files):
        x = experiments_helper.validate_dataset(
            file, experiments_helper.TRAINING_FOLDS)
        dataset, result_folder = x

        basefolder = os.path.basename(os.path.dirname(file))
        datasets.append(basefolder)
        n_vars.append(dataset.shape[1])

        # Algorithm slot 0: hill climbing.
        for idx_f, f in enumerate(experiments_helper.TRAINING_FOLDS):
            for idx_fold in range(10):
                models_folder = result_folder + '/HillClimbing/SPBN_CKDE/Validation_' + str(
                    f) + '_5/' + str(idx_fold)
                all_models = sorted(glob.glob(models_folder + '/*.pickle'))
                final_model = load(all_models[-1])

                n_ckde[idx_file, idx_f, 0, idx_fold] = \
                    sum(map(lambda kv: kv[1] == NodeType.CKDE, final_model.node_types().items()))

        # Algorithm slot 1: PC with LinearCorrelation.
        for idx_f, f in enumerate(experiments_helper.TRAINING_FOLDS):
            for idx_fold in range(10):
                models_folder = result_folder + '/PC/SPBN_CKDE/LinearCorrelation/Validation_' + str(
                    f) + '_5/' + str(idx_fold)
                all_models = sorted(glob.glob(models_folder + '/*.pickle'))
                final_model = load(all_models[-1])

                n_ckde[idx_file, idx_f, 1, idx_fold] = \
                    sum(map(lambda kv: kv[1] == NodeType.CKDE, final_model.node_types().items()))

        # Algorithm slot 2: PC with RCoT.
        for idx_f, f in enumerate(experiments_helper.TRAINING_FOLDS):
            for idx_fold in range(10):
                models_folder = result_folder + '/PC/SPBN_CKDE/RCoT/Validation_' + str(
                    f) + '_5/' + str(idx_fold)
                all_models = sorted(glob.glob(models_folder + '/*.pickle'))
                final_model = load(all_models[-1])

                n_ckde[idx_file, idx_f, 2, idx_fold] = \
                    sum(map(lambda kv: kv[1] == NodeType.CKDE, final_model.node_types().items()))

    # Average over the 10 cv folds and flatten (fold-count, algorithm).
    mean_ckde = np.mean(n_ckde, axis=3).reshape(len(valid_files), -1)
    algorithms = ["HC", "PC-PLC", "PC-RCoT"]
    names = [
        "CKDE_" + str(f) + "_" + algorithm for f in folds
        for algorithm in algorithms
    ]

    df = pd.DataFrame(mean_ckde, columns=names, index=datasets)
    df['n_vars'] = n_vars
    # Normalize counts into ratios of CKDE variables per model.
    for f in folds:
        for algorithm in algorithms:
            df['%CKDE_' + str(f) + "_" +
               algorithm] = df.loc[:, 'CKDE_' + str(f) + "_" +
                                   algorithm] / df.loc[:, 'n_vars']

    # Grouped bar chart: one group per dataset, one bar per
    # (fold-count, algorithm) pair; 0.3 of each unit is left as spacing.
    N = df.shape[0]
    ind = np.arange(N)
    num_bars = len(folds) * len(algorithms)
    width = (1 - 0.3) / num_bars

    fig = plt.figure()
    ax = fig.add_subplot(111)

    offset = 0

    b = []

    color = {
        algorithms[0]: "#729CF5",
        algorithms[1]: "#FFB346",
        algorithms[2]: "#B5EA7F"
    }
    for f in folds:
        for algorithm in algorithms:
            t = ax.bar(ind + width * offset,
                       df['%CKDE_' + str(f) + "_" + algorithm].to_numpy(),
                       width,
                       align='edge',
                       linewidth=0.5,
                       edgecolor="black",
                       color=color[algorithm])
            offset += 1
            b.append(t)

    ax.set_ylabel('Ratio of CKDE variables')
    ax.set_xticks(ind + (1 - 0.3) / 2)
    ax.set_xticklabels(df.index)
    ax.tick_params(axis='x', rotation=90)

    plt.legend([t[0] for t in b], algorithms)
    tikzplotlib.save("plots/kdeness.tex",
                     standalone=True,
                     axis_width="25cm",
                     axis_height="10cm")
Exemplo n.º 16
0
# Seed NumPy's global RNG before the other imports for reproducibility.
import numpy as np
np.random.seed(0)
import pandas as pd
import pathlib
import glob
from pybnesian import load
import experiments_helper

# Reference (ground-truth) networks for the four benchmark datasets.
ecoli70_true = load('ecoli70.pickle')
magic_niab_true = load('magic_niab.pickle')
magic_irri_true = load('magic_irri.pickle')
arth150_true = load('arth150.pickle')

# Training sets of increasing size plus a held-out test set per dataset.
ecoli70_200 = pd.read_csv("ecoli70_200.csv")
ecoli70_2000 = pd.read_csv("ecoli70_2000.csv")
ecoli70_10000 = pd.read_csv("ecoli70_10000.csv")
ecoli70_test = pd.read_csv("ecoli70_test.csv")

magic_niab_200 = pd.read_csv("magic_niab_200.csv")
magic_niab_2000 = pd.read_csv("magic_niab_2000.csv")
magic_niab_10000 = pd.read_csv("magic_niab_10000.csv")
magic_niab_test = pd.read_csv("magic_niab_test.csv")

magic_irri_200 = pd.read_csv("magic_irri_200.csv")
magic_irri_2000 = pd.read_csv("magic_irri_2000.csv")
magic_irri_10000 = pd.read_csv("magic_irri_10000.csv")
magic_irri_test = pd.read_csv("magic_irri_test.csv")

# NOTE(review): arth150_test is not loaded here — snippet may be truncated.
arth150_200 = pd.read_csv("arth150_200.csv")
arth150_2000 = pd.read_csv("arth150_2000.csv")
arth150_10000 = pd.read_csv("arth150_10000.csv")
Exemplo n.º 17
0
# Trace training/validation/test scores of every saved model along one
# hill-climbing run, starting from the all-CKDE initial model.
# NOTE(review): relies on `pd` and `load` being imported elsewhere in the
# original file — confirm before running standalone.
from pybnesian.factors import NodeType
from pybnesian.learning.scores import ValidatedLikelihood
from pybnesian.models import SemiparametricBN
import glob

df_10000 = pd.read_csv('synthetic_10000.csv')
df_test = pd.read_csv('synthetic_test.csv')

models = sorted(glob.glob('models/10000/HillClimbing/SPBN_CKDE/0/*.pickle'))

vl = ValidatedLikelihood(df_10000, k=10, seed=0)

# Initial model: every variable modeled with a CKDE factor.
node_types = [(name, NodeType.CKDE) for name in df_10000.columns.values]
start_model = SemiparametricBN(list(df_10000.columns.values), node_types)

print("Start model")
print("\tTraining score: " + str(vl.score(start_model)))
print("\tValidation score: " + str(vl.vscore(start_model)))

start_model.fit(df_10000)
print("\tTest score: " + str(start_model.slogl(df_test)))

# Score every saved iteration of the search in order.
for m in models:
    bn = load(m)
    print("Model " + m)
    print("\tTraining score: " + str(vl.score(bn)))
    print("\tValidation score: " + str(vl.vscore(bn)))

    bn.fit(df_10000)
    print("\tTest score: " + str(bn.slogl(df_test)))
Exemplo n.º 18
0

def draw_model(model, filename):
    """Render a Bayesian network as a Graphviz .dot file and compile it to
    a PDF with the ``dot`` command.

    CKDE nodes of semiparametric networks are drawn filled gray.

    :param model: pybnesian network (nodes() and arcs() are used).
    :param filename: output path; a '.dot' suffix is appended if missing.
    """
    DG = nx.DiGraph()
    DG.add_nodes_from(model.nodes())
    DG.add_edges_from(model.arcs())

    # BUG FIX: the original tested the module-level loop variable `m`
    # instead of the `model` parameter, which only worked by accident
    # when called from the script below.
    if isinstance(model, BayesianNetworkBase
                  ) and model.type == BayesianNetworkType.Semiparametric:
        for node in DG.nodes:
            if model.node_type(node) == NodeType.CKDE:
                DG.nodes[node]['style'] = 'filled'
                DG.nodes[node]['fillcolor'] = 'gray'

    a = nx.nx_agraph.to_agraph(DG)
    if filename[-4:] != '.dot':
        filename += '.dot'
    a.write(filename)
    a.clear()

    pdf_out = filename[:-4] + '.pdf'
    subprocess.run(["dot", "-Tpdf", filename, "-o", pdf_out])


# Render every saved model found anywhere under models/ to .dot and .pdf.
all_models = glob.glob('models/**/*.pickle', recursive=True)
for model in all_models:
    print(model)
    m = load(model)
    # Replace the '.pickle' suffix (7 characters) with '.dot'.
    filename = model[:-7] + '.dot'
    draw_model(m, filename)
Exemplo n.º 19
0
# Learn SPBN-CKDE models from PC/LinearCorrelation graphs for every
# dataset and instance count.
# NOTE(review): relies on `pd` and `load` imported elsewhere in the
# original file, and the loop body appears truncated after SaveModel.
from pybnesian.learning.algorithms import GreedyHillClimbing
from pybnesian.learning.algorithms.callbacks import SaveModel
from pybnesian.learning.operators import OperatorPool, ArcOperatorSet, ChangeNodeTypeSet
from pybnesian.learning.scores import ValidatedLikelihood
import pathlib
import os
import experiments_helper

hc = GreedyHillClimbing()
change_node = ChangeNodeTypeSet()

for d in experiments_helper.DATASETS:
    for i in experiments_helper.INSTANCES:
        df = pd.read_csv(d + "_" + str(i) + '.csv')

        pdag_lc = load('models/' + d + '/' + str(i) + '/PC/graph-lc.pickle')

        # PDAG extension can fail; fall back to an approximate extension.
        try:
            dag_lc = pdag_lc.to_dag()
        except ValueError:
            dag_lc = pdag_lc.to_approximate_dag()

        vl = ValidatedLikelihood(df, k=10, seed=experiments_helper.SEED)

        for p in experiments_helper.PATIENCE:
            result_folder = 'models/' + d + '/' + str(
                i) + '/PC/SPBN_CKDE/LinearCorrelation/' + str(p)
            pathlib.Path(result_folder).mkdir(parents=True, exist_ok=True)

            # end.lock marks a finished configuration; skip those.
            if not os.path.exists(result_folder + '/end.lock'):
                cb_save = SaveModel(result_folder)
Exemplo n.º 20
0
def test_pc_rcot_kdebn(train_data, test_data, result_folder, idx_fold):
    """Return the test log-likelihood of the last saved PC/RCoT KDE network
    for the given cross-validation fold."""
    fold_dir = result_folder + '/PC/KDEBN/RCoT/' + str(idx_fold)
    # The lexicographically last pickle is the final iteration.
    model = load(sorted(glob.glob(fold_dir + '/*.pickle'))[-1])
    model.fit(train_data)
    return model.slogl(test_data)
Exemplo n.º 21
0
        number = int(os.path.splitext(last_file)[0])
        bn.save(result_folder + '/' + str(number+1).zfill(6) + ".pickle")

        with open(result_folder + '/end.lock', 'w') as f:
            pass



# Run node-type discovery on the PC-learned graphs for each training size.
df_200 = pd.read_csv('synthetic_200.csv')
df_2000 = pd.read_csv('synthetic_2000.csv')
df_10000 = pd.read_csv('synthetic_10000.csv')

patience = experiments_helper.PATIENCE

for df, model_folder in [(df_200, 'models/200'), (df_2000, 'models/2000'), (df_10000, 'models/10000')]:
    print("Folder " + model_folder)

    pdag_lc = load(model_folder + '/PC/graph-lc.pickle')
    try:
        dag_lc = pdag_lc.to_dag()
    except ValueError:
        # BUG FIX: the original called to_approximate_dag() on the
        # undefined name `pdag` instead of `pdag_lc`.
        dag_lc = pdag_lc.to_approximate_dag()
    find_node_types(df, dag_lc, model_folder, 'LinearCorrelation', patience)

    pdag_rcot = load(model_folder + '/PC/graph-rcot.pickle')
    try:
        dag_rcot = pdag_rcot.to_dag()
    except ValueError:
        # BUG FIX: the original called to_approximate_dag() on the
        # undefined name `pdag` instead of `pdag_rcot`.
        dag_rcot = pdag_rcot.to_approximate_dag()
    find_node_types(df, dag_rcot, model_folder, 'RCoT', patience)
Exemplo n.º 22
0
# Evaluate the final hill-climbing SPBN-CKDE model for each training size
# and patience value on the held-out test set.
# NOTE(review): the final loop appears truncated — the computed slogl is
# never printed or stored here.
# Seed NumPy's global RNG before the other imports for reproducibility.
import numpy as np
np.random.seed(0)
import pandas as pd
import pathlib
import glob
from pybnesian import load
import experiments_helper
from generate_dataset_spbn import slogl_model

# Ground-truth model used as the reference for all comparisons.
true_model = load('true_model.pickle')

# Training sets of increasing size plus a held-out test set.
df_200 = pd.read_csv('synthetic_200.csv')
df_2000 = pd.read_csv('synthetic_2000.csv')
df_10000 = pd.read_csv('synthetic_10000.csv')
df_test = pd.read_csv('synthetic_test.csv')

print("True model logl: " + str(slogl_model(df_test)))

patience = experiments_helper.PATIENCE

for df, model_folder in [(df_200, 'models/200'), (df_2000, 'models/2000'), (df_10000, 'models/10000')]:
    print("Folder " + model_folder)
    for p in patience:
        result_folder = model_folder + '/HillClimbing/SPBN_CKDE/' + str(p)
        pathlib.Path(result_folder).mkdir(parents=True, exist_ok=True)

        # The lexicographically last pickle is the final iteration.
        all_models = sorted(glob.glob(result_folder + '/*.pickle'))
        final_model = load(all_models[-1])
        final_model.fit(df)

        slogl = final_model.slogl(df_test)