# Learn a hybrid Bayesian network on a 150-row sample of the VK dataset:
# label-encode categoricals, learn structure via Hill-Climbing with a
# Mutual-Information score (seeded with expert init nodes/edges), then
# fit parameters on the raw (non-encoded) data.
from bamt.Preprocessors import Preprocessor
import pandas as pd
from sklearn import preprocessing as pp
import bamt.Networks as Networks

vk_data = pd.read_csv(r"data\real data\vk_data.csv").sample(150)

# Encoder-only pipeline: categoricals are label-encoded, continuous
# columns pass through.  (The original also built a KBinsDiscretizer
# here but never used it — removed as dead code.)
encoder = pp.LabelEncoder()
p = Preprocessor([('encoder', encoder)])
discretized_data, est = p.apply(vk_data)
info = p.info

bn = Networks.HybridBN(has_logit=False, use_mixture=False)
bn.add_nodes(descriptor=info)

# Expert knowledge to seed the structure search.
params = {
    "init_nodes": ["sex", "has_pets", "is_parent", "relation", "tr_per_month"],
    "init_edges": [
        ("age", "mean_tr"), ("sex", "mean_tr"), ("sex", "has_pets"),
        ("is_parent", "has_pets"), ("has_pets", "median_tr"),
        ("is_driver", "tr_per_month"), ("tr_per_month", "median_tr"),
        ("tr_per_month", "relation"),
    ],
}

bn.add_edges(data=discretized_data, optimizer='HC',
             scoring_function=('MI', ), params=params)
bn.fit_parameters(data=vk_data)
# Feature/target split for each dataset flavour: every column except the
# last is a feature, the last column is the prediction target.
cont_test_data = cont_data[cont_data.columns[:-1]]
cont_target = cont_data[cont_data.columns[-1]]
disc_test_data = disc_data[disc_data.columns[:-1]]
disc_target = disc_data[disc_data.columns[-1]]
hybrid_test_data = hybrid_data[hybrid_data.columns[:-1]]
hybrid_target = hybrid_data[hybrid_data.columns[-1]]

# Shared preprocessing: label-encode categoricals, then 5-bin uniform
# ordinal discretization of continuous columns.
encoder = pp.LabelEncoder()
discretizer = pp.KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)])

# --- Discrete pipeline: structure + parameter learning ---------------
discretized_data, _ = p.apply(disc_data)
disc_bn = Networks.DiscreteBN()
info = p.info
disc_bn.add_nodes(info)
disc_bn.add_edges(data=discretized_data, scoring_function=('K2', K2Score))
disc_bn.fit_parameters(data=disc_data)
disc_bn.calculate_weights(discretized_data)

# Predict on held-out features and normalize into a DataFrame.
disc_predicted_values = disc_bn.predict(test=disc_test_data)
disc_predicted_values = pd.DataFrame.from_dict(disc_predicted_values, orient='columns')
synth_disc_data = disc_bn.sample(50)

# Round-trip save/load, then sample from the reloaded network to check
# that serialization produced a usable model.
disc_bn.save('./disc_bn.json')
disc_bn2 = Networks.DiscreteBN()
disc_bn2.load('./disc_bn.json')
synth_disc_data2 = disc_bn2.sample(50)
cols = [
    'Tectonic regime', 'Period', 'Lithology', 'Structural setting',
    'Gross', 'Netpay', 'Porosity', 'Permeability', 'Depth',
]
h = h[cols]

p2 = time.time()
print(f"Time elapsed for uploading data: {p2 - p1}")

# Encoder-only preprocessing (no discretizer in the pipeline; the
# KBinsDiscretizer below is constructed but intentionally not attached).
encoder = pp.LabelEncoder()
discretizer = pp.KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
p = Preprocessor([('encoder', encoder)])
discretized_data, est = p.apply(h)
info = p.info

# --------------------- VALIDATION TEST-------------------------------
# Restrict to discrete / discrete-numeric columns only, then feed the
# resulting descriptor to a HybridBN to exercise its validation paths.
nodes_type_mixed = gru.nodes_types(h)
columns = [
    col for col in h.columns.to_list()
    if nodes_type_mixed[col] in ['disc', 'disc_num']
]
discrete_data = h[columns]
discretized_data, est = p.apply(discrete_data)  # warning
info = p.info

bn = Networks.HybridBN()
bn.add_nodes(descriptor=info)  # error
# Hybrid BN over the geological "hack" dataset: quantile discretization,
# logit nodes enabled, MI-scored Hill-Climbing structure search.
cols = ['Tectonic regime', 'Period', 'Lithology', 'Structural setting',
        'Gross', 'Netpay', 'Porosity', 'Permeability', 'Depth']
h = h[cols]
print(h.describe())
print("-----")

p2 = time.time()
print(f"Time elapsed for preparing data: {p2 - p1}")

encoder = pp.LabelEncoder()
discretizer = pp.KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)])

# -----------
discrete_data, est = p.apply(h)
info = p.info

bn = Networks.HybridBN(has_logit=True)  # all may vary
bn.add_nodes(descriptor=info)
bn.add_edges(data=discrete_data, optimizer='HC', scoring_function=('MI',))
bn.get_info(as_df=False)

# Time the parameter-learning step on the raw data.
t1 = time.time()
bn.fit_parameters(data=h)
t2 = time.time()
print(f'PL elapsed: {t2 - t1}')

# Validation split: these four columns become targets, the rest features.
columns = ['Lithology', 'Structural setting', 'Porosity', 'Depth']
validY = h[columns].dropna()
validX = h.drop(columns, axis=1).dropna()
# Compare Preprocessor output with and without the label-encoding step.
from bamt.Preprocessors import Preprocessor
import pandas as pd
from sklearn import preprocessing as pp

vk_data = pd.read_csv("data/real data/vk_data.csv")

encoder = pp.LabelEncoder()
discretizer = pp.KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')

# Full pipeline: encode categoricals, then discretize continuous columns.
p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)])
data, en = p.apply(vk_data)

# ------------------------
# Discretizer-only pipeline (no label encoding) for comparison.
p2 = Preprocessor([('discretizer', discretizer)])
data, en = p2.apply(vk_data)
print(data)
'Tectonic regime', 'Period', 'Lithology', 'Structural setting' ]] p2 = time.time() print(f"Time elapsed for uploading data: {p2 - p1}") encoder = pp.LabelEncoder() discretizer = pp.KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform') p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)]) nodes_type_mixed = gru.nodes_types(data) discretized_data, est = p.apply(data) # info info = p.info bn = Networks.DiscreteBN() bn.add_nodes(descriptor=info) bn.add_edges(data=discretized_data, optimizer='HC', scoring_function=('K2', K2Score)) bn.get_info() t1 = time.time() bn.fit_parameters(data=data) t2 = time.time() print(f'PL elaspsed: {t2 - t1}') for node, d in bn.distributions.items(): print(node)
# ROWS = 50
# h = h.iloc[:ROWS, :]
p2 = time.time()
print(f"Time elapsed for uploading data: {p2 - p1}")

encoder = pp.LabelEncoder()
discretizer = pp.KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)])
print("#" * 1000)

discretized_data, est = p.apply(h)
info = p.info

# Feeding a mixed-type descriptor into a ContinuousBN is expected to
# trip its validation.
bn = Networks.ContinuousBN()
bn.add_nodes(descriptor=info)  # Error

# -----------
# Keep only columns that are NOT discrete / discrete-numeric.
# NOTE(review): despite its name, `discrete_data` therefore holds the
# CONTINUOUS columns — kept as-is since later code may reference it.
nodes_type_mixed = p.get_nodes_types(h)
columns = [
    col for col in h.columns.to_list()
    if nodes_type_mixed[col] not in ['disc', 'disc_num']
]
discrete_data = h[columns]
discretized_data, est = p.apply(discrete_data)
import bamt.Networks as Networks
import json

hack_data = pd.read_csv("data/real data/hack_processed_with_rf.csv")[[
    'Tectonic regime', 'Period', 'Lithology', 'Structural setting',
    'Gross', 'Netpay', 'Porosity', 'Permeability', 'Depth'
]]

# Standard preprocessing: label-encode categoricals, 5-bin uniform
# ordinal discretization of continuous columns.
encoder = pp.LabelEncoder()
discretizer = pp.KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)])
discretized_data, est = p.apply(hack_data)

bn = Networks.HybridBN(use_mixture=True, has_logit=True)
info = p.info
bn.add_nodes(info)

# Expert-supplied structure instead of a learned one.
structure = [("Tectonic regime", "Structural setting"),
             ("Gross", "Netpay"),
             ("Lithology", "Permeability")]
bn.set_structure(edges=structure)
bn.get_info(as_df=False)

# Fix: the original bound the open file handle to `params` and then
# immediately shadowed it with the parsed JSON; use a distinct handle name.
with open("hack_p.json") as params_file:
    params = json.load(params_file)