# Validation scenario: build HybridBN descriptors from data that was only
# label-encoded (no discretizer), then from discrete-only columns, and watch
# the library raise its warning/error paths before running the normal flow.
p = Preprocessor([('encoder', encoder)])
discretized_data, est = p.apply(h)
info = p.info

# --------------------- VALIDATION TEST -------------------------------
nodes_type_mixed = gru.nodes_types(h)
columns = [
    col for col in h.columns.to_list()
    if nodes_type_mixed[col] in ['disc', 'disc_num']
]  # GET ONLY DISCRETE
discrete_data = h[columns]
discretized_data, est = p.apply(discrete_data)  # warning
info = p.info

bn = Networks.HybridBN()
bn.add_nodes(descriptor=info)  # error
# ------------------------------

# Normal flow: encoder + discretizer over the full mixed frame.
p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)])
discretized_data, est = p.apply(h)
info = p.info

# ---------------------------------------
print("has_logit=False, use_mixture=False")
bn = Networks.HybridBN()
bn.add_nodes(descriptor=info)
for node in bn.nodes:
    print(f"{node.name}: {node.type}")  # only gaussian and discrete nodes

print("#" * 150)
bn.add_edges(data=discretized_data, optimizer='HC', scoring_function=('MI', ))
# Build a hybrid BN on a 150-row VK sample: label-encode, learn structure
# with hill-climbing (MI score) seeded by expert init nodes/edges, then fit
# parameters on the raw (non-discretized) data.
import pandas as pd
from sklearn import preprocessing as pp

import bamt.Networks as Networks

# Fix: forward slashes keep the path portable; the original raw backslash
# path (r"data\real data\vk_data.csv") only resolves on Windows and was
# inconsistent with the other scripts in this file.
vk_data = pd.read_csv("data/real data/vk_data.csv").sample(150)

encoder = pp.LabelEncoder()
discretizer = pp.KBinsDiscretizer(n_bins=5, encode='ordinal',
                                  strategy='uniform')

p = Preprocessor([('encoder', encoder)])
discretized_data, est = p.apply(vk_data)
info = p.info

bn = Networks.HybridBN(has_logit=False, use_mixture=False)
bn.add_nodes(descriptor=info)

# Expert knowledge used to seed the hill-climbing search.
params = {
    "init_nodes": ["sex", "has_pets", "is_parent", "relation",
                   "tr_per_month"],
    "init_edges": [("age", "mean_tr"), ("sex", "mean_tr"),
                   ("sex", "has_pets"), ("is_parent", "has_pets"),
                   ("has_pets", "median_tr"), ("is_driver", "tr_per_month"),
                   ("tr_per_month", "median_tr"),
                   ("tr_per_month", "relation")]
}
bn.add_edges(data=discretized_data, optimizer='HC',
             scoring_function=('MI', ), params=params)
bn.fit_parameters(data=vk_data)
# Hybrid BN with logit nodes: learn structure, fit parameters on raw data,
# then run prediction on rows with the target columns dropped.
print(h.describe())
print("-----")
p2 = time.time()
print(f"Time elapsed for preparing data: {p2 - p1}")

encoder = pp.LabelEncoder()
discretizer = pp.KBinsDiscretizer(n_bins=5, encode='ordinal',
                                  strategy='quantile')
p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)])

# -----------
discrete_data, est = p.apply(h)
info = p.info

bn = Networks.HybridBN(has_logit=True)  # all may vary
bn.add_nodes(descriptor=info)
bn.add_edges(data=discrete_data, optimizer='HC', scoring_function=('MI',))
bn.get_info(as_df=False)

t1 = time.time()
bn.fit_parameters(data=h)
t2 = time.time()
print(f'PL elapsed: {t2 - t1}')

# Hold out four columns as the prediction target.
columns = ['Lithology', 'Structural setting', 'Porosity', 'Depth']
validY = h[columns].dropna()
validX = h.drop(columns, axis=1).dropna()

time_1 = time.time()
pred_param = bn.predict(validX, parall_count=3)
# Split each dataset into features (all but the last column) and a target
# (the last column), then run the full discrete-BN pipeline: learn, fit,
# weight, predict, sample, and round-trip through save/load.
cont_test_data = cont_data[cont_data.columns[:-1]]
cont_target = cont_data[cont_data.columns[-1]]
disc_test_data = disc_data[disc_data.columns[:-1]]
disc_target = disc_data[disc_data.columns[-1]]
hybrid_test_data = hybrid_data[hybrid_data.columns[:-1]]
hybrid_target = hybrid_data[hybrid_data.columns[-1]]

encoder = pp.LabelEncoder()
discretizer = pp.KBinsDiscretizer(n_bins=5, encode='ordinal',
                                  strategy='uniform')
p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)])

# Discrete pipeline
discretized_data, _ = p.apply(disc_data)
disc_bn = Networks.DiscreteBN()
info = p.info
disc_bn.add_nodes(info)
disc_bn.add_edges(data=discretized_data, scoring_function=('K2', K2Score))
disc_bn.fit_parameters(data=disc_data)
disc_bn.calculate_weights(discretized_data)

disc_predicted_values = disc_bn.predict(test=disc_test_data)
disc_predicted_values = pd.DataFrame.from_dict(disc_predicted_values,
                                               orient='columns')
synth_disc_data = disc_bn.sample(50)

# Round-trip the learned network through JSON.
disc_bn.save('./disc_bn.json')
disc_bn2 = Networks.DiscreteBN()
disc_bn2.load('./disc_bn.json')
synth_disc_data2 = disc_bn2.sample(50)
# print(disc_bn.weights)
# Hybrid BN with logit nodes backed by a RandomForest classifier.
print("-----")
p2 = time.time()
print(f"Time elapsed for preparing data: {p2 - p1}")

encoder = pp.LabelEncoder()
discretizer = pp.KBinsDiscretizer(n_bins=5, encode='ordinal',
                                  strategy='quantile')
p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)])

# -----------
discrete_data, est = p.apply(h)
info = p.info

bn = Networks.HybridBN(use_mixture=False, has_logit=True)  # all may vary
bn.add_nodes(descriptor=info)
bn.add_edges(data=discrete_data, optimizer='HC', scoring_function=('MI', ),
             classifier=RandomForestClassifier())
bn.get_info(as_df=False)

t1 = time.time()
bn.fit_parameters(data=h)
t2 = time.time()
# Fix: corrected misspelled "elaspsed" in the timing message (the sibling
# script above already prints 'PL elapsed').
print(f'PL elapsed: {t2 - t1}')

# for num, el in enumerate(bn.sample(10, as_df=False), 1):
#     print('\n', num)
#     for name, val in el.items():
# Deliberately corrupt the type descriptor and feed invalid node lists to
# exercise HybridBN's validation/error paths.
encoder = pp.LabelEncoder()
discretizer = pp.KBinsDiscretizer(n_bins=5, encode='ordinal',
                                  strategy='uniform')
p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)])

discretized_data, est = p.apply(vk_data)
info = p.info

# Make some errors
info['types']['relation'] = 'unknown'
info['types']['sex'] = 'helicopter'

bn = Networks.HybridBN()
bn.add_nodes(descriptor=info)

# ----------
bn.set_nodes(['A node'])
print(bn.nodes == [])


class MyNode:
    # NOTE(review): in the mangled paste, __repr__ dangled after
    # `class MyNode: pass` with indentation lost; it only makes sense as a
    # method of MyNode — confirm the original intent.
    def __repr__(self):
        return 'MyNode'
# Discrete BN end-to-end: learn structure with K2, fit parameters, then dump
# every node's learned distribution.
p2 = time.time()
print(f"Time elapsed for uploading data: {p2 - p1}")

encoder = pp.LabelEncoder()
discretizer = pp.KBinsDiscretizer(n_bins=5, encode='ordinal',
                                  strategy='uniform')
p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)])

nodes_type_mixed = gru.nodes_types(data)
discretized_data, est = p.apply(data)
# info
info = p.info

bn = Networks.DiscreteBN()
bn.add_nodes(descriptor=info)
bn.add_edges(data=discretized_data, optimizer='HC',
             scoring_function=('K2', K2Score))
bn.get_info()

t1 = time.time()
bn.fit_parameters(data=data)
t2 = time.time()
# Fix: corrected misspelled "elaspsed" in the timing message.
print(f'PL elapsed: {t2 - t1}')

for node, d in bn.distributions.items():
    print(node)
    for param, value in d.items():
        print(f"{param}:{value}")

# for num, el in enumerate(bn.sample(20), 1):
# Continuous BN with Gaussian mixtures: keep only non-discrete columns,
# learn structure with MI, fit parameters, and time the fit.
p2 = time.time()
print(f"Time elapsed for uploading data: {p2 - p1}")

discretizer = pp.KBinsDiscretizer(n_bins=5, encode='ordinal',
                                  strategy='quantile')
p = Preprocessor([('discretizer', discretizer)])

# -----------
nodes_type_mixed = p.get_nodes_types(h)
columns = [col for col in h.columns.to_list()
           if not nodes_type_mixed[col] in ['disc', 'disc_num']]  # GET ONLY CONT
discrete_data = h[columns]
discretized_data, est = p.apply(discrete_data)
# info
info = p.info

bn = Networks.ContinuousBN(use_mixture=True)  # use_mixture = False as well
bn.add_nodes(descriptor=info)
bn.add_edges(data=discretized_data, optimizer='HC', scoring_function=('MI',))
bn.get_info(as_df=False)

t1 = time.time()
bn.fit_parameters(data=h)
t2 = time.time()
# Fix: corrected misspelled "elaspsed" in the timing message.
print(f'PL elapsed: {t2 - t1}')

# Without async: 0.00699925422668457
# With: 0.0019998550415039062
# Fix: normalized the malformed '%.d' conversion to '%d' (same output —
# the argument is an integer-valued float from floor division).
print('Improvement: %d' % (0.00699925422668457 // 0.0019998550415039062))
# After rebuilding: 0.0
# Error scenario for ContinuousBN: first feed it a mixed-type descriptor
# (expected to fail), then retry with continuous-only columns.
p2 = time.time()
print(f"Time elapsed for uploading data: {p2 - p1}")

encoder = pp.LabelEncoder()
discretizer = pp.KBinsDiscretizer(n_bins=5, encode='ordinal',
                                  strategy='uniform')
p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)])

print("#" * 1000)
discretized_data, est = p.apply(h)
info = p.info

bn = Networks.ContinuousBN()
bn.add_nodes(descriptor=info)  # Error

# -----------
nodes_type_mixed = p.get_nodes_types(h)
columns = [
    col for col in h.columns.to_list()
    if not nodes_type_mixed[col] in ['disc', 'disc_num']
]  # GET ONLY CONT
discrete_data = h[columns]
discretized_data, est = p.apply(discrete_data)
# info
info = p.info
bn = Networks.ContinuousBN()
# Build a hybrid BN on the hackathon dataset with a hand-set structure and
# parameters loaded from JSON (no structure learning).
hack_data = pd.read_csv("data/real data/hack_processed_with_rf.csv")[[
    'Tectonic regime', 'Period', 'Lithology', 'Structural setting',
    'Gross', 'Netpay', 'Porosity', 'Permeability', 'Depth'
]]

encoder = pp.LabelEncoder()
discretizer = pp.KBinsDiscretizer(n_bins=5, encode='ordinal',
                                  strategy='uniform')
p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)])
discretized_data, est = p.apply(hack_data)

bn = Networks.HybridBN(use_mixture=True, has_logit=True)
info = p.info
bn.add_nodes(info)

# Expert-defined edges instead of a learned structure.
structure = [("Tectonic regime", "Structural setting"),
             ("Gross", "Netpay"),
             ("Lithology", "Permeability")]
bn.set_structure(edges=structure)
bn.get_info(as_df=False)

with open("hack_p.json") as params:
    params = json.load(params)
bn.set_parameters(params)
# Load a previously saved structure (and parameters) from JSON and apply the
# structure to a fresh HybridBN over the hackathon dataset.
from sklearn import preprocessing as pp

import bamt.Networks as Networks
import json

hack_data = pd.read_csv("data/real data/hack_processed_with_rf.csv")[[
    'Tectonic regime', 'Period', 'Lithology', 'Structural setting',
    'Gross', 'Netpay', 'Porosity', 'Permeability', 'Depth'
]]

encoder = pp.LabelEncoder()
discretizer = pp.KBinsDiscretizer(n_bins=5, encode='ordinal',
                                  strategy='uniform')
p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)])
discretized_data, est = p.apply(hack_data)
info = p.info

bn2 = Networks.HybridBN(use_mixture=True)
bn2.add_nodes(info)

with open("hack_p.json") as params:
    with open("hack_s.json") as structure:
        edges = json.load(structure)
        params = json.load(params)
        bn2.set_structure(edges=edges)
# bn2.get_info(as_df=False)
# bn2.plot("gg2.html")