def compare_models(true_model, trained_models_folder, training_data, test_data):
    """Report the final PC-learned GBN models against the ground-truth model.

    For each conditional independence test (LinearCorrelation and RCoT), the
    last saved model of the search is loaded, its parameters are fitted on
    `training_data`, and its test log-likelihood, SHD and Hamming distance to
    `true_model` are printed.

    Args:
        true_model: Ground-truth network providing `slogl` and the reference
            structure for SHD/Hamming.
        trained_models_folder: Root folder holding the trained models.
        training_data: DataFrame used to fit each model's parameters.
        test_data: DataFrame used to compute test log-likelihoods.
    """
    ground_truth_slogl = true_model.slogl(test_data)
    print("Ground truth loglik: " + str(ground_truth_slogl))

    # The original body duplicated the same evaluation for each independence
    # test; only the folder name and the report label differed.
    for test_name in ['LinearCorrelation', 'RCoT']:
        models_folder = trained_models_folder + '/PC/GBN/' + test_name
        all_models = sorted(glob.glob(models_folder + '/*.pickle'))
        # The highest-numbered pickle is the final model of the search.
        final_model = all_models[-1]
        gbn = load(final_model)
        gbn.fit(training_data)
        slogl = gbn.slogl(test_data)
        print("GBN " + test_name + " results:")
        print("Loglik: " + str(slogl))
        print("SHD: " + str(experiments_helper.shd(gbn, true_model)))
        print("Hamming: " + str(experiments_helper.hamming(gbn, true_model)))
        print()
def compare_models(true_model, trained_models_folder, training_data, test_data):
    """Print test log-likelihood, SHD and Hamming distance of the final
    hill-climbing GBN models (BIC and BGe scores) versus the true model."""
    print("Ground truth loglik: " + str(true_model.slogl(test_data)))

    # Evaluate both score variants with identical logic.
    for subdir, label in (('GBN_BIC', 'BIC'), ('GBN_BGe', 'BGe')):
        models_dir = trained_models_folder + '/HillClimbing/' + subdir + '/'
        # Last file in sorted order is the final model of the search.
        saved = sorted(glob.glob(models_dir + '/*.pickle'))
        gbn = load(saved[-1])
        gbn.fit(training_data)
        print("GBN " + label + " results:")
        print("Loglik: " + str(gbn.slogl(test_data)))
        print("SHD: " + str(experiments_helper.shd(gbn, true_model)))
        print("Hamming: " + str(experiments_helper.hamming(gbn, true_model)))
        print()
def test_pc_lc_gbn(train_data, test_data, result_folder, idx_fold):
    """Return the test log-likelihood of the final PC-LinearCorrelation GBN
    for one cross-validation fold."""
    fold_dir = result_folder + '/PC/Gaussian/LinearCorrelation/' + str(idx_fold)
    # The highest-numbered pickle is the final model of the search.
    pickles = sorted(glob.glob(fold_dir + '/*.pickle'))
    model = load(pickles[-1])
    model.fit(train_data)
    return model.slogl(test_data)
def compare_models(true_model, trained_models_folder, training_data, test_data, patience):
    """Report the final hill-climbing SPBN model for every patience value.

    Prints test log-likelihood, SHD, Hamming distance and node-type Hamming
    distance relative to `true_model`."""
    print("Ground truth loglik: " + str(true_model.slogl(test_data)))
    print("SPBN results:")

    for p in patience:
        models_dir = trained_models_folder + '/HillClimbing/SPBN/' + str(p)
        candidates = sorted(glob.glob(models_dir + '/*.pickle'))
        # Load the last saved (final) model and fit its parameters.
        spbn = load(candidates[-1])
        spbn.fit(training_data)
        test_logl = spbn.slogl(test_data)
        print("Loglik, p " + str(p) + ": " + str(test_logl))
        print("SHD, p " + str(p) + ": " + str(experiments_helper.shd(spbn, true_model)))
        print("Hamming, p " + str(p) + ": " + str(experiments_helper.hamming(spbn, true_model)))
        print("Type Hamming, p " + str(p) + ": " + str(experiments_helper.hamming_type(spbn)))
        print()
def test_bge_gaussian(train_data, test_data, result_folder, idx_fold):
    """Return the test log-likelihood of the final BGe-scored Gaussian
    network for one cross-validation fold."""
    fold_dir = result_folder + '/HillClimbing/Gaussian/BGe/' + str(idx_fold)
    pickles = sorted(glob.glob(fold_dir + '/*.pickle'))
    # The last pickle in sorted order is the final model of the search.
    gbn = load(pickles[-1])
    gbn.fit(train_data)
    return gbn.slogl(test_data)
def compare_models(true_model, trained_models_folder, training_data, test_data):
    """Report the final PC-learned SPBN models (LinearCorrelation and RCoT
    independence tests) for every patience value against the true model.

    Prints test log-likelihood, SHD, Hamming distance and node-type Hamming
    distance for each (test, patience) combination.
    """
    ground_truth_slogl = true_model.slogl(test_data)
    print("Ground truth loglik: " + str(ground_truth_slogl))

    # The original had two copy-pasted loops that differed only in the folder
    # name and the printed label; evaluate both tests uniformly.
    for test_name in ['LinearCorrelation', 'RCoT']:
        for p in experiments_helper.PATIENCE:
            folder = trained_models_folder + '/PC/SPBN/' + test_name + '/' + str(p)
            all_models = sorted(glob.glob(folder + '/*.pickle'))
            # The highest-numbered pickle is the final model of the search.
            spbn = load(all_models[-1])
            spbn.fit(training_data)
            slogl = spbn.slogl(test_data)
            print("SPBN " + test_name + " results:")
            print("Loglik, p " + str(p) + ": " + str(slogl))
            print("SHD, p " + str(p) + ": " + str(experiments_helper.shd(spbn, true_model)))
            print("Hamming, p " + str(p) + ": " + str(experiments_helper.hamming(spbn, true_model)))
            print("Type Hamming, p " + str(p) + ": " + str(experiments_helper.hamming_type(spbn)))
            print()
def test_pc_lc_spbn(train_data, test_data, folds, patience, result_folder, idx_fold):
    """Return a (len(folds), len(patience)) array of test log-likelihoods for
    the final PC-LinearCorrelation SPBN-CKDE models of one CV fold."""
    scores = np.full((len(folds), len(patience)), np.nan)
    for row, k in enumerate(folds):
        for col, p in enumerate(patience):
            fold_dir = (result_folder + '/PC/SPBN_CKDE/LinearCorrelation/Validation_'
                        + str(k) + "_" + str(p) + '/' + str(idx_fold))
            pickles = sorted(glob.glob(fold_dir + '/*.pickle'))
            # Load the final (highest-numbered) model and fit its parameters.
            model = load(pickles[-1])
            model.fit(train_data)
            scores[row, col] = model.slogl(test_data)
    return scores
def run_pc_rcot_kdebn(result_folder, idx_fold):
    """Create and save the initial KDEBN for the PC-RCoT graph of one fold."""
    target_dir = result_folder + '/PC/KDEBN/RCoT/' + str(idx_fold)
    pathlib.Path(target_dir).mkdir(parents=True, exist_ok=True)

    graph = load(result_folder + '/PC/graph-rcot-' + str(idx_fold) + ".pickle")
    try:
        dag = graph.to_dag()
    except ValueError:
        # The PDAG may not extend to a consistent DAG; fall back to an
        # approximate extension.
        dag = graph.to_approximate_dag()

    KDENetwork(dag).save(target_dir + "/000000")
def test_validation_gaussian(train_data, test_data, folds, patience, result_folder, idx_fold):
    """Return a (len(folds), len(patience)) array of test log-likelihoods for
    the validated hill-climbing Gaussian models of one CV fold."""
    results = np.full((len(folds), len(patience)), np.nan)
    for i, k in enumerate(folds):
        for j, p in enumerate(patience):
            model_dir = (result_folder + '/HillClimbing/Gaussian/Validation_'
                         + str(k) + "_" + str(p) + '/' + str(idx_fold))
            saved = sorted(glob.glob(model_dir + '/*.pickle'))
            # Final model of the search is the last in sorted order.
            gbn = load(saved[-1])
            gbn.fit(train_data)
            results[i, j] = gbn.slogl(test_data)
    return results
def run_pc_lc_gbn(result_folder, idx_fold):
    """Create and save the initial GBN for the PC-LinearCorrelation graph of
    one cross-validation fold."""
    target_dir = result_folder + '/PC/Gaussian/LinearCorrelation/' + str(idx_fold)
    pathlib.Path(target_dir).mkdir(parents=True, exist_ok=True)

    graph = load(result_folder + '/PC/graph-lc-' + str(idx_fold) + ".pickle")
    try:
        dag = graph.to_dag()
    except ValueError:
        # Fall back when the PDAG cannot be extended to a consistent DAG.
        dag = graph.to_approximate_dag()

    GaussianNetwork(dag).save(target_dir + "/000000")
def test_spbn(df, model_folder, patience, dag_type):
    """Evaluate the final PC-learned SPBN for each patience value.

    Fits the last saved model on `df` and prints its test log-likelihood,
    SHD, Hamming and type-Hamming distances.

    NOTE(review): this function reads the module-level globals `df_test` and
    `true_model` instead of taking them as parameters — confirm both are
    defined before this is called.
    """
    print("Dag Type " + dag_type)
    for p in patience:
        result_folder = model_folder + '/PC/SPBN/' + dag_type + '/' + str(p)
        pathlib.Path(result_folder).mkdir(parents=True, exist_ok=True)
        # The highest-numbered pickle is the final model of the search.
        all_models = sorted(glob.glob(result_folder + '/*.pickle'))
        final_model = load(all_models[-1])
        final_model.fit(df)
        slogl = final_model.slogl(df_test)
        print("Loglik, p " + str(p) + ": " + str(slogl))
        print("SHD, p " + str(p) + ": " + str(experiments_helper.shd(final_model, true_model)))
        print("Hamming, p " + str(p) + ": " + str(experiments_helper.hamming(final_model, true_model)))
        # NOTE(review): other call sites in this project call hamming_type with
        # a single argument — confirm the two-argument form is supported.
        print("Hamming type, p " + str(p) + ": " + str(experiments_helper.hamming_type(final_model, true_model)))
        print()
def run_pc_lc_spbn(train_data, folds, patience, result_folder, idx_fold):
    """Learn SPBN-CKDE models from the PC-LinearCorrelation graph of one fold.

    For every (k, patience) combination a greedy hill-climbing search over
    node types is run, starting from an all-CKDE network on the PC DAG.
    Intermediate models are saved by the SaveModel callback; the final model
    is saved with the next sequential number and an `end.lock` file marks the
    combination as finished so re-runs skip it.
    """
    hc = GreedyHillClimbing()
    change_node_type = ChangeNodeTypeSet()
    pdag = load(result_folder + '/PC/graph-lc-' + str(idx_fold) + ".pickle")
    try:
        dag = pdag.to_dag()
    except ValueError:
        # The PDAG may not be extendable to a consistent DAG.
        dag = pdag.to_approximate_dag()
    for k in folds:
        # Validation likelihood with k folds; fixed seed for reproducibility.
        vl = ValidatedLikelihood(train_data, k=k, seed=experiments_helper.SEED)
        for p in patience:
            fold_folder = result_folder + '/PC/SPBN_CKDE/LinearCorrelation/Validation_' + str(
                k) + '_' + str(p) + '/' + str(idx_fold)
            pathlib.Path(fold_folder).mkdir(parents=True, exist_ok=True)
            # Skip combinations that already finished in a previous run.
            if os.path.exists(fold_folder + '/end.lock'):
                continue
            cb_save = SaveModel(fold_folder)
            # Start from an all-CKDE network on the PC-derived DAG; the search
            # only changes node types.
            node_types = [(name, NodeType.CKDE) for name in dag.nodes()]
            start_model = SemiparametricBN(dag, node_types)
            bn = hc.estimate(change_node_type, vl, start_model, callback=cb_save,
                             patience=p, verbose=True)
            # Save the final model under the next sequential iteration number.
            iters = sorted(glob.glob(fold_folder + '/*.pickle'))
            last_file = os.path.basename(iters[-1])
            number = int(os.path.splitext(last_file)[0])
            bn.save(fold_folder + '/' + str(number + 1).zfill(6) + ".pickle")
            # Touch the lock file to mark this combination as complete.
            with open(fold_folder + '/end.lock', 'w') as f:
                pass
import numpy as np
np.random.seed(0)  # fix the RNG seed before anything stochastic runs
import pandas as pd
import pathlib
import glob
from pybnesian import load
import experiments_helper
from generate_dataset_spbn import slogl_model

# Ground-truth model and the synthetic train/test splits.
true_model = load('true_model.pickle')
df_200 = pd.read_csv('synthetic_200.csv')
df_2000 = pd.read_csv('synthetic_2000.csv')
df_10000 = pd.read_csv('synthetic_10000.csv')
df_test = pd.read_csv('synthetic_test.csv')

print("True model logl: " + str(slogl_model(df_test)))

patience = experiments_helper.PATIENCE


# NOTE(review): this definition appears truncated in this chunk — the
# evaluation/printing after fit() is presumably elsewhere in the file.
def test_spbn(df, model_folder, patience, dag_type):
    print("Dag Type " + dag_type)
    for p in patience:
        result_folder = model_folder + '/PC/SPBN/' + dag_type + '/' + str(p)
        pathlib.Path(result_folder).mkdir(parents=True, exist_ok=True)
        # The highest-numbered pickle is the final model of the search.
        all_models = sorted(glob.glob(result_folder + '/*.pickle'))
        final_model = load(all_models[-1])
        final_model.fit(df)
def extract_info(train_datasets, test_datasets, model_folders, true_models):
    """Collect evaluation metrics for every trained model of every experiment.

    For each dataset and instance count, the final saved model of each
    algorithm (HC GBN BIC/BGe, HC SPBN, HC SPBN-CKDE, PC GBN, PC SPBN,
    PC SPBN-CKDE) is loaded, fitted on the training data and evaluated.

    Returns four tuples of numpy arrays: log-likelihoods, Hamming distances,
    SHD values and node-type Hamming distances, indexed by
    (dataset, instance[, test][, patience]).
    """
    patience = experiments_helper.PATIENCE
    tests = experiments_helper.TESTS

    # 1-D: one ground-truth log-likelihood per dataset.
    slogl_true = np.empty((len(train_datasets, )))
    slogl_hc_gbn_bic = np.empty((len(train_datasets), len(train_datasets[0])))
    slogl_hc_gbn_bge = np.empty((len(train_datasets), len(train_datasets[0])))
    slogl_hc_spbn = np.empty(
        (len(train_datasets), len(train_datasets[0]), len(patience)))
    slogl_hc_spbn_ckde = np.empty(
        (len(train_datasets), len(train_datasets[0]), len(patience)))
    slogl_pc_gbn = np.empty(
        (len(train_datasets), len(train_datasets[0]), len(tests)))
    slogl_pc_spbn = np.empty((len(train_datasets), len(train_datasets[0]),
                              len(tests), len(patience)))
    slogl_pc_spbn_ckde = np.empty((len(train_datasets), len(train_datasets[0]),
                                   len(tests), len(patience)))

    # Hamming distances (structure only, undirected).
    hmd_hc_gbn_bic = np.empty((len(train_datasets), len(train_datasets[0])))
    hmd_hc_gbn_bge = np.empty((len(train_datasets), len(train_datasets[0])))
    hmd_hc_spbn = np.empty(
        (len(train_datasets), len(train_datasets[0]), len(patience)))
    hmd_hc_spbn_ckde = np.empty(
        (len(train_datasets), len(train_datasets[0]), len(patience)))
    hmd_pc = np.empty(
        (len(train_datasets), len(train_datasets[0]), len(tests)))

    # Structural Hamming distances.
    shd_hc_gbn_bic = np.empty((len(train_datasets), len(train_datasets[0])))
    shd_hc_gbn_bge = np.empty((len(train_datasets), len(train_datasets[0])))
    shd_hc_spbn = np.empty(
        (len(train_datasets), len(train_datasets[0]), len(patience)))
    shd_hc_spbn_ckde = np.empty(
        (len(train_datasets), len(train_datasets[0]), len(patience)))
    shd_pc = np.empty(
        (len(train_datasets), len(train_datasets[0]), len(tests)))

    # Node-type ("type Hamming") distances for semiparametric models.
    thd_hc_spbn = np.empty(
        (len(train_datasets), len(train_datasets[0]), len(patience)))
    thd_hc_spbn_ckde = np.empty(
        (len(train_datasets), len(train_datasets[0]), len(patience)))
    thd_pc_spbn = np.empty((len(train_datasets), len(train_datasets[0]),
                            len(tests), len(patience)))
    thd_pc_spbn_ckde = np.empty((len(train_datasets), len(train_datasets[0]),
                                 len(tests), len(patience)))

    for idx_dataset, (instance_datasets, test_data, dataset_folders, true_model) in enumerate(
            zip(train_datasets, test_datasets, model_folders, true_models)):
        for idx_instances, (training_data, folder) in enumerate(
                zip(instance_datasets, dataset_folders)):
            # NOTE: written once per instance with the same value; harmless
            # but redundant inside the inner loop.
            slogl_true[idx_dataset] = true_model.slogl(test_data)

            ###########################
            # GBN BIC
            ###########################
            gbn_bic_folder = folder + '/HillClimbing/GBN_BIC/'
            # The highest-numbered pickle is the final model of the search.
            all_models = sorted(glob.glob(gbn_bic_folder + '/*.pickle'))
            final_model = all_models[-1]
            bic = load(final_model)
            bic.fit(training_data)
            slogl_hc_gbn_bic[idx_dataset, idx_instances] = bic.slogl(test_data)
            hmd_hc_gbn_bic[idx_dataset,
                           idx_instances] = experiments_helper.hamming(
                               bic, true_model)
            shd_hc_gbn_bic[idx_dataset,
                           idx_instances] = experiments_helper.shd(
                               bic, true_model)

            ###########################
            # GBN BGe
            ###########################
            gbn_bge_folder = folder + '/HillClimbing/GBN_BGe/'
            all_models = sorted(glob.glob(gbn_bge_folder + '/*.pickle'))
            final_model = all_models[-1]
            bge = load(final_model)
            bge.fit(training_data)
            slogl_hc_gbn_bge[idx_dataset, idx_instances] = bge.slogl(test_data)
            hmd_hc_gbn_bge[idx_dataset,
                           idx_instances] = experiments_helper.hamming(
                               bge, true_model)
            shd_hc_gbn_bge[idx_dataset,
                           idx_instances] = experiments_helper.shd(
                               bge, true_model)

            ###########################
            # HC SPBN
            ###########################
            for idx_p, p in enumerate(patience):
                spbn_hc_folder = folder + '/HillClimbing/SPBN/' + str(p)
                all_models = sorted(glob.glob(spbn_hc_folder + '/*.pickle'))
                final_model = all_models[-1]
                spbn = load(final_model)
                spbn.fit(training_data)
                slogl_hc_spbn[idx_dataset, idx_instances,
                              idx_p] = spbn.slogl(test_data)
                hmd_hc_spbn[idx_dataset, idx_instances,
                            idx_p] = experiments_helper.hamming(
                                spbn, true_model)
                shd_hc_spbn[idx_dataset, idx_instances,
                            idx_p] = experiments_helper.shd(spbn, true_model)
                thd_hc_spbn[idx_dataset, idx_instances,
                            idx_p] = experiments_helper.hamming_type(spbn)

            ###########################
            # HC SPBN CKDE
            ###########################
            for idx_p, p in enumerate(patience):
                spbn_ckde_hc_folder = folder + '/HillClimbing/SPBN_CKDE/' + str(
                    p)
                all_models = sorted(
                    glob.glob(spbn_ckde_hc_folder + '/*.pickle'))
                final_model = all_models[-1]
                spbn_ckde = load(final_model)
                spbn_ckde.fit(training_data)
                slogl_hc_spbn_ckde[idx_dataset, idx_instances,
                                   idx_p] = spbn_ckde.slogl(test_data)
                hmd_hc_spbn_ckde[idx_dataset, idx_instances,
                                 idx_p] = experiments_helper.hamming(
                                     spbn_ckde, true_model)
                shd_hc_spbn_ckde[idx_dataset, idx_instances,
                                 idx_p] = experiments_helper.shd(
                                     spbn_ckde, true_model)
                thd_hc_spbn_ckde[idx_dataset, idx_instances,
                                 idx_p] = experiments_helper.hamming_type(
                                     spbn_ckde)

            ###########################
            # PC GBN and PC Graph
            ###########################
            for idx_t, test in enumerate(tests):
                gbn_pc_folder = folder + '/PC/GBN/' + test
                all_models = sorted(glob.glob(gbn_pc_folder + '/*.pickle'))
                final_model = all_models[-1]
                gbn_pc = load(final_model)
                gbn_pc.fit(training_data)
                slogl_pc_gbn[idx_dataset, idx_instances,
                             idx_t] = gbn_pc.slogl(test_data)
                # Structure metrics of the PC graph are shared by all
                # PC-based models, so they are recorded only once here.
                hmd_pc[idx_dataset, idx_instances,
                       idx_t] = experiments_helper.hamming(gbn_pc, true_model)
                shd_pc[idx_dataset, idx_instances,
                       idx_t] = experiments_helper.shd(gbn_pc, true_model)

            ###########################
            # PC SPBN
            ###########################
            for idx_t, test in enumerate(tests):
                for idx_p, p in enumerate(patience):
                    spbn_pc_folder = folder + '/PC/SPBN/' + test + '/' + str(p)
                    all_models = sorted(glob.glob(spbn_pc_folder + '/*.pickle'))
                    final_model = all_models[-1]
                    spbn_pc = load(final_model)
                    spbn_pc.fit(training_data)
                    slogl_pc_spbn[idx_dataset, idx_instances, idx_t,
                                  idx_p] = spbn_pc.slogl(test_data)
                    thd_pc_spbn[idx_dataset, idx_instances, idx_t,
                                idx_p] = experiments_helper.hamming_type(
                                    spbn_pc)

            ###########################
            # PC SPBN CKDE
            ###########################
            for idx_t, test in enumerate(tests):
                for idx_p, p in enumerate(patience):
                    spbn_ckde_pc_folder = folder + '/PC/SPBN_CKDE/' + test + '/' + str(
                        p)
                    all_models = sorted(
                        glob.glob(spbn_ckde_pc_folder + '/*.pickle'))
                    final_model = all_models[-1]
                    spbn_ckde_pc = load(final_model)
                    spbn_ckde_pc.fit(training_data)
                    slogl_pc_spbn_ckde[idx_dataset, idx_instances, idx_t,
                                       idx_p] = spbn_ckde_pc.slogl(test_data)
                    thd_pc_spbn_ckde[idx_dataset, idx_instances, idx_t,
                                     idx_p] = experiments_helper.hamming_type(
                                         spbn_ckde_pc)

    # Group results: (logliks), (hamming), (SHD), (type hamming).
    return (slogl_true, slogl_hc_gbn_bic, slogl_hc_gbn_bge, slogl_hc_spbn,
            slogl_hc_spbn_ckde, slogl_pc_gbn, slogl_pc_spbn, slogl_pc_spbn_ckde), \
           (hmd_hc_gbn_bic, hmd_hc_gbn_bge, hmd_hc_spbn, hmd_hc_spbn_ckde, hmd_pc), \
           (shd_hc_gbn_bic, shd_hc_gbn_bge, shd_hc_spbn, shd_hc_spbn_ckde, shd_pc),\
           (thd_hc_spbn, thd_hc_spbn_ckde, thd_pc_spbn, thd_pc_spbn_ckde)
def kdeness_ckde():
    """Plot the ratio of CKDE-type nodes in the learned SPBN-CKDE models.

    For every valid cross-validation dataset, counts CKDE nodes in the final
    models of three algorithms (HC, PC-PLC, PC-RCoT) across 10 folds, averages
    them, and renders a grouped bar chart exported via tikzplotlib to
    plots/kdeness.tex.
    """
    folds = experiments_helper.TRAINING_FOLDS
    patience = experiments_helper.PATIENCE
    files = experiments_helper.find_crossvalidation_datasets()
    valid_files = [
        f for f in files
        if experiments_helper.validate_dataset(f, folds) is not None
    ]

    # CKDE counts indexed by (dataset, training fold size, algorithm, CV fold).
    n_ckde = np.full((len(valid_files), len(folds), 3, 10), np.nan)
    datasets = []
    n_vars = []
    for idx_file, file in enumerate(valid_files):
        x = experiments_helper.validate_dataset(
            file, experiments_helper.TRAINING_FOLDS)
        dataset, result_folder = x
        basefolder = os.path.basename(os.path.dirname(file))
        datasets.append(basefolder)
        n_vars.append(dataset.shape[1])

        # Algorithm index 0: hill-climbing SPBN-CKDE (patience hard-coded to 5
        # in the folder name).
        for idx_f, f in enumerate(experiments_helper.TRAINING_FOLDS):
            for idx_fold in range(10):
                models_folder = result_folder + '/HillClimbing/SPBN_CKDE/Validation_' + str(
                    f) + '_5/' + str(idx_fold)
                all_models = sorted(glob.glob(models_folder + '/*.pickle'))
                final_model = load(all_models[-1])
                # Count nodes whose learned type is CKDE.
                n_ckde[idx_file, idx_f, 0, idx_fold] = \
                    sum(map(lambda kv: kv[1] == NodeType.CKDE, final_model.node_types().items()))

        # Algorithm index 1: PC with the linear-correlation test.
        for idx_f, f in enumerate(experiments_helper.TRAINING_FOLDS):
            for idx_fold in range(10):
                models_folder = result_folder + '/PC/SPBN_CKDE/LinearCorrelation/Validation_' + str(
                    f) + '_5/' + str(idx_fold)
                all_models = sorted(glob.glob(models_folder + '/*.pickle'))
                final_model = load(all_models[-1])
                n_ckde[idx_file, idx_f, 1, idx_fold] = \
                    sum(map(lambda kv: kv[1] == NodeType.CKDE, final_model.node_types().items()))

        # Algorithm index 2: PC with the RCoT test.
        for idx_f, f in enumerate(experiments_helper.TRAINING_FOLDS):
            for idx_fold in range(10):
                models_folder = result_folder + '/PC/SPBN_CKDE/RCoT/Validation_' + str(
                    f) + '_5/' + str(idx_fold)
                all_models = sorted(glob.glob(models_folder + '/*.pickle'))
                final_model = load(all_models[-1])
                n_ckde[idx_file, idx_f, 2, idx_fold] = \
                    sum(map(lambda kv: kv[1] == NodeType.CKDE, final_model.node_types().items()))

    # Average over CV folds; flatten (fold-size, algorithm) to columns in the
    # same fold-major order used for `names` below.
    mean_ckde = np.mean(n_ckde, axis=3).reshape(len(valid_files), -1)
    algorithms = ["HC", "PC-PLC", "PC-RCoT"]
    names = [
        "CKDE_" + str(f) + "_" + algorithm for f in folds
        for algorithm in algorithms
    ]
    df = pd.DataFrame(mean_ckde, columns=names, index=datasets)
    df['n_vars'] = n_vars
    # Normalize CKDE counts by the number of variables in each dataset.
    for f in folds:
        for algorithm in algorithms:
            df['%CKDE_' + str(f) + "_" +
               algorithm] = df.loc[:, 'CKDE_' + str(f) + "_" +
                                   algorithm] / df.loc[:, 'n_vars']

    # Grouped bar chart: one group per dataset, one bar per (fold, algorithm),
    # with a 0.3 gap between groups.
    N = df.shape[0]
    ind = np.arange(N)
    num_bars = len(folds) * len(algorithms)
    width = (1 - 0.3) / num_bars

    fig = plt.figure()
    ax = fig.add_subplot(111)
    offset = 0
    b = []
    color = {
        algorithms[0]: "#729CF5",
        algorithms[1]: "#FFB346",
        algorithms[2]: "#B5EA7F"
    }
    for f in folds:
        for algorithm in algorithms:
            t = ax.bar(ind + width * offset,
                       df['%CKDE_' + str(f) + "_" + algorithm].to_numpy(),
                       width,
                       align='edge',
                       linewidth=0.5,
                       edgecolor="black",
                       color=color[algorithm])
            offset += 1
            b.append(t)
    ax.set_ylabel('Ratio of CKDE variables')
    ax.set_xticks(ind + (1 - 0.3) / 2)
    ax.set_xticklabels(df.index)
    ax.tick_params(axis='x', rotation=90)
    plt.legend([t[0] for t in b], algorithms)
    tikzplotlib.save("plots/kdeness.tex",
                     standalone=True,
                     axis_width="25cm",
                     axis_height="10cm")
import numpy as np
np.random.seed(0)  # fix the RNG seed before anything stochastic runs
import pandas as pd
import pathlib
import glob
from pybnesian import load
import experiments_helper

# Ground-truth reference networks for the four benchmark datasets.
ecoli70_true = load('ecoli70.pickle')
magic_niab_true = load('magic_niab.pickle')
magic_irri_true = load('magic_irri.pickle')
arth150_true = load('arth150.pickle')

# Training splits of increasing size plus a held-out test split per dataset.
ecoli70_200 = pd.read_csv("ecoli70_200.csv")
ecoli70_2000 = pd.read_csv("ecoli70_2000.csv")
ecoli70_10000 = pd.read_csv("ecoli70_10000.csv")
ecoli70_test = pd.read_csv("ecoli70_test.csv")

magic_niab_200 = pd.read_csv("magic_niab_200.csv")
magic_niab_2000 = pd.read_csv("magic_niab_2000.csv")
magic_niab_10000 = pd.read_csv("magic_niab_10000.csv")
magic_niab_test = pd.read_csv("magic_niab_test.csv")

magic_irri_200 = pd.read_csv("magic_irri_200.csv")
magic_irri_2000 = pd.read_csv("magic_irri_2000.csv")
magic_irri_10000 = pd.read_csv("magic_irri_10000.csv")
magic_irri_test = pd.read_csv("magic_irri_test.csv")

# NOTE(review): arth150 has no test split loaded in this chunk — presumably
# read further down the file.
arth150_200 = pd.read_csv("arth150_200.csv")
arth150_2000 = pd.read_csv("arth150_2000.csv")
arth150_10000 = pd.read_csv("arth150_10000.csv")
from pybnesian.factors import NodeType
from pybnesian.learning.scores import ValidatedLikelihood
from pybnesian.models import SemiparametricBN
import glob

# NOTE(review): `pd` and `load` are used below but not imported in this
# chunk — presumably imported earlier in the file; confirm.
df_10000 = pd.read_csv('synthetic_10000.csv')
df_test = pd.read_csv('synthetic_test.csv')

# All intermediate models saved during the hill-climbing search (fold 0).
models = sorted(glob.glob('models/10000/HillClimbing/SPBN_CKDE/0/*.pickle'))

vl = ValidatedLikelihood(df_10000, k=10, seed=0)

# Baseline: an all-CKDE network over the dataset's variables with no arcs.
node_types = [(name, NodeType.CKDE) for name in df_10000.columns.values]
start_model = SemiparametricBN(list(df_10000.columns.values), node_types)
print("Start model")
print("\tTraining score: " + str(vl.score(start_model)))
print("\tValidation score: " + str(vl.vscore(start_model)))
start_model.fit(df_10000)
print("\tTest score: " + str(start_model.slogl(df_test)))

# Report training/validation/test scores for every saved search iteration.
for m in models:
    bn = load(m)
    print("Model " + m)
    print("\tTraining score: " + str(vl.score(bn)))
    print("\tValidation score: " + str(vl.vscore(bn)))
    bn.fit(df_10000)
    print("\tTest score: " + str(bn.slogl(df_test)))
def draw_model(model, filename):
    """Render `model` as a Graphviz .dot file and compile it to a PDF.

    CKDE nodes of semiparametric networks are drawn with a gray fill.
    `filename` gets a '.dot' suffix appended if it does not already have one.
    """
    DG = nx.DiGraph()
    DG.add_nodes_from(model.nodes())
    DG.add_edges_from(model.arcs())

    # BUG FIX: the original tested the module-level global `m` instead of the
    # `model` parameter, so the type check only worked by accident when the
    # driver loop below happened to have set `m` to the same object.
    if isinstance(model, BayesianNetworkBase
                  ) and model.type == BayesianNetworkType.Semiparametric:
        for node in DG.nodes:
            if model.node_type(node) == NodeType.CKDE:
                DG.nodes[node]['style'] = 'filled'
                DG.nodes[node]['fillcolor'] = 'gray'

    a = nx.nx_agraph.to_agraph(DG)
    if filename[-4:] != '.dot':
        filename += '.dot'
    a.write(filename)
    a.clear()

    # Compile the .dot file to PDF with the external Graphviz `dot` binary.
    pdf_out = filename[:-4] + '.pdf'
    subprocess.run(["dot", "-Tpdf", filename, "-o", pdf_out])


# Render every saved model found under models/.
all_models = glob.glob('models/**/*.pickle', recursive=True)
for model in all_models:
    print(model)
    m = load(model)
    filename = model[:-7] + '.dot'  # strip '.pickle'
    draw_model(m, filename)
from pybnesian.learning.algorithms import GreedyHillClimbing
from pybnesian.learning.algorithms.callbacks import SaveModel
from pybnesian.learning.operators import OperatorPool, ArcOperatorSet, ChangeNodeTypeSet
from pybnesian.learning.scores import ValidatedLikelihood
import pathlib
import os
import experiments_helper

hc = GreedyHillClimbing()
change_node = ChangeNodeTypeSet()

# NOTE(review): `pd` and `load` are used below but not imported in this
# chunk — presumably imported earlier in the file; confirm.
for d in experiments_helper.DATASETS:
    for i in experiments_helper.INSTANCES:
        df = pd.read_csv(d + "_" + str(i) + '.csv')
        pdag_lc = load('models/' + d + '/' + str(i) + '/PC/graph-lc.pickle')
        try:
            dag_lc = pdag_lc.to_dag()
        except ValueError:
            # The PDAG may not extend to a consistent DAG.
            dag_lc = pdag_lc.to_approximate_dag()
        vl = ValidatedLikelihood(df, k=10, seed=experiments_helper.SEED)

        for p in experiments_helper.PATIENCE:
            result_folder = 'models/' + d + '/' + str(
                i) + '/PC/SPBN_CKDE/LinearCorrelation/' + str(p)
            pathlib.Path(result_folder).mkdir(parents=True, exist_ok=True)

            # Skip combinations already marked as finished by `end.lock`.
            # NOTE(review): this chunk is truncated — the hill-climbing call
            # using `cb_save` presumably follows in the original file.
            if not os.path.exists(result_folder + '/end.lock'):
                cb_save = SaveModel(result_folder)
def test_pc_rcot_kdebn(train_data, test_data, result_folder, idx_fold):
    """Return the test log-likelihood of the final PC-RCoT KDEBN model for
    one cross-validation fold."""
    fold_dir = result_folder + '/PC/KDEBN/RCoT/' + str(idx_fold)
    # The last pickle in sorted order is the final model.
    saved = sorted(glob.glob(fold_dir + '/*.pickle'))
    kdebn = load(saved[-1])
    kdebn.fit(train_data)
    return kdebn.slogl(test_data)
number = int(os.path.splitext(last_file)[0]) bn.save(result_folder + '/' + str(number+1).zfill(6) + ".pickle") with open(result_folder + '/end.lock', 'w') as f: pass df_200 = pd.read_csv('synthetic_200.csv') df_2000 = pd.read_csv('synthetic_2000.csv') df_10000 = pd.read_csv('synthetic_10000.csv') patience = experiments_helper.PATIENCE for df, model_folder in [(df_200, 'models/200'), (df_2000, 'models/2000'), (df_10000, 'models/10000')]: print("Folder " + model_folder) pdag_lc = load(model_folder + '/PC/graph-lc.pickle') try: dag_lc = pdag_lc.to_dag() except ValueError: dag_lc = pdag.to_approximate_dag() find_node_types(df, dag_lc, model_folder, 'LinearCorrelation', patience) pdag_rcot = load(model_folder + '/PC/graph-rcot.pickle') try: dag_rcot = pdag_rcot.to_dag() except ValueError: dag_rcot = pdag.to_approximate_dag() find_node_types(df, dag_rcot, model_folder, 'RCoT', patience)
import numpy as np
np.random.seed(0)  # fix the RNG seed before anything stochastic runs
import pandas as pd
import pathlib
import glob
from pybnesian import load
import experiments_helper
from generate_dataset_spbn import slogl_model

# Ground-truth model and the synthetic train/test splits.
true_model = load('true_model.pickle')
df_200 = pd.read_csv('synthetic_200.csv')
df_2000 = pd.read_csv('synthetic_2000.csv')
df_10000 = pd.read_csv('synthetic_10000.csv')
df_test = pd.read_csv('synthetic_test.csv')

print("True model logl: " + str(slogl_model(df_test)))

patience = experiments_helper.PATIENCE

# Evaluate the final HC SPBN-CKDE model for each sample size and patience.
for df, model_folder in [(df_200, 'models/200'), (df_2000, 'models/2000'),
                         (df_10000, 'models/10000')]:
    print("Folder " + model_folder)
    for p in patience:
        result_folder = model_folder + '/HillClimbing/SPBN_CKDE/' + str(p)
        pathlib.Path(result_folder).mkdir(parents=True, exist_ok=True)
        # The highest-numbered pickle is the final model of the search.
        all_models = sorted(glob.glob(result_folder + '/*.pickle'))
        final_model = load(all_models[-1])
        final_model.fit(df)
        # NOTE(review): this chunk is truncated — the reporting that uses
        # `slogl` presumably follows in the original file.
        slogl = final_model.slogl(df_test)