def test_parallel_structure_learning():
    """Exact structure learning must give the same fit with one job and with two."""
    expected_totals = (-19.8282, -345.9527, -4847.59688, -604.0190)
    for dataset, expected in zip(datasets, expected_totals):
        serial = BayesianNetwork.from_samples(dataset, algorithm='exact')
        parallel = BayesianNetwork.from_samples(dataset,
                                                algorithm='exact',
                                                n_jobs=2)

        # The serial and the parallel search must score identically ...
        assert_equal(
            serial.log_probability(dataset).sum(),
            parallel.log_probability(dataset).sum(),
        )
        # ... and match the recorded reference value to 4 decimal places.
        assert_almost_equal(serial.log_probability(dataset).sum(), expected, 4)
def test_exact_nan_structure_learning():
    """'exact' and 'exact-dp' must agree on the NaN datasets and hit the reference scores."""
    expected_totals = (-6.13764, -159.6505, -2055.76364, -201.73615)
    for dataset, expected in zip(datasets_nan, expected_totals):
        exact = BayesianNetwork.from_samples(dataset, algorithm='exact')
        exact_dp = BayesianNetwork.from_samples(dataset, algorithm='exact-dp')

        assert_equal(
            exact.log_probability(dataset).sum(),
            exact_dp.log_probability(dataset).sum(),
        )
        assert_almost_equal(exact.log_probability(dataset).sum(), expected, 4)
def test_io_from_samples():
    """A model learned from (X, weights) must score X like one learned from data_generator."""
    from_arrays = BayesianNetwork.from_samples(X, weights=weights)
    from_generator = BayesianNetwork.from_samples(data_generator)

    assert_array_almost_equal(
        from_arrays.log_probability(X),
        from_generator.log_probability(X),
    )
def createModel(train, test):
    """Learn a greedy Bayesian network on `train` and score its predictions on `test`.

    The 'classes' column of `test` is saved as the ground truth, blanked out
    (set to None) in place, and then predicted by the model. The ground truth
    and the predictions are handed to `main.calculate_accuracy`.

    Note: `test` is modified in place (its 'classes' column is overwritten).
    """
    print("I am in create model")
    column_names = [
        'acceleration_mean', 'acceleration_stdev',
        'pitch1', 'pitch2', 'pitch3',
        'roll1', 'roll2', 'roll3',
        'classes',
        'total_accel_sensor_1', 'total_accel_sensor_2', 'total_accel_sensor_4',
    ]

    started_at = datetime.now()
    print("Start time: ", started_at)
    network = BayesianNetwork.from_samples(train,
                                           algorithm='greedy',
                                           state_names=column_names)
    print("doing model.bake")
    network.bake()
    elapsed = datetime.now() - started_at
    print("Time: ", elapsed)

    # Keep the true labels, then hide them so the model has to fill them in.
    true_labels = test['classes'].tolist()
    test['classes'] = None
    print("Evaluating predict...")
    completed_rows = network.predict(test.to_numpy())
    # NOTE(review): item(2) reads the third entry of every predicted row, while
    # 'classes' sits at index 8 of the header above -- confirm the column order.
    predicted_labels = [row.item(2) for row in completed_rows]
    main.calculate_accuracy(true_labels, predicted_labels)
def _likelihoods(cls, real_data, synthetic_data, metadata=None, structure=None): metadata = cls._validate_inputs(real_data, synthetic_data, metadata) structure = metadata.get('structure', structure) fields = cls._select_fields(metadata, ('categorical', 'boolean')) if not fields: return np.full(len(real_data), np.nan) LOGGER.debug('Fitting the BayesianNetwork to the real data') if structure: if isinstance(structure, dict): structure = BayesianNetwork.from_json( json.dumps(structure)).structure bn = BayesianNetwork.from_structure(real_data[fields].to_numpy(), structure) else: bn = BayesianNetwork.from_samples(real_data[fields].to_numpy(), algorithm='chow-liu') LOGGER.debug('Evaluating likelihood of the synthetic data') probabilities = [] for _, row in synthetic_data[fields].iterrows(): try: probabilities.append(bn.probability([row.to_numpy()])) except ValueError: probabilities.append(0) return np.asarray(probabilities)
def __naive_algorithm(self, X):
    """Learn a network whose constraint graph links variable group (0,) to every other one.

    The constraint graph has an edge from the single-variable group (0,) to
    each group (i,) for i = 1 .. len(self.state_names) - 1.
    """
    star = networkx.DiGraph()
    star.add_edges_from(
        ((0, ), (child, )) for child in range(1, len(self.state_names)))
    return BayesianNetwork.from_samples(
        X,
        algorithm=self.algorithm_name,
        state_names=self.state_names,
        root=0,
        constraint_graph=star,
    )
def fit(self, features, prediction, **kwargs):
    """Learn a Bayesian network from the features joined column-wise with the prediction.

    Extra keyword arguments are forwarded to `BayesianNetwork.from_samples`.
    The learned model is stored on `self.model` and frozen.
    """
    training_frame = pd.concat([features, prediction], axis='columns')
    self.model = BayesianNetwork.from_samples(
        X=training_frame,
        state_names=training_frame.columns,
        name="Insurance Advisor",
        **kwargs
    )
    self.model.freeze()
    print("Training finished")
def fit_chow_liu(self, X_train, y_train, sequence_length_train):
    """Fit a Chow-Liu network on the encoded labels followed by the features.

    The labels are encoded with `self.le`, turned into a single column and
    placed in front of `X_train`, so the label is variable 0 (the root).
    """
    # TODO: use sequence_length_train
    encoded = self.le.fit_transform(y_train)
    self.formatted_labels = encoded
    self.formatted_labels = encoded.reshape(encoded.shape[0], 1)

    labelled_rows = np.concatenate((self.formatted_labels, X_train), axis=1)
    self.model = BayesianNetwork.from_samples(
        labelled_rows,
        algorithm='chow-liu',
        state_names=self.state_names,
        root=0,
    )
def pomegranate_test():
    """Smoke-test pomegranate: learn a tiny network and print a few queries."""
    observations = np.array([
        [1, 1, 1],
        [1, 1, 1],
        [0, 1, 1],
    ])
    network = BayesianNetwork.from_samples(observations)
    # print(mymodel.node_count())
    # mymodel.plot()

    # One fully observed row, then the same row with its first value missing.
    for query in ([[1, 1, 1]], [[None, 1, 1]]):
        print(network.probability(query))
    print(network.predict_proba({}))
def pomegranadeMethod():
    """Learn an exact-search Bayesian network from the features CSV, print and plot it.

    The CSV is read through `postmaster.readCSVIntoList` and converted to an
    int32 array before structure learning.
    """
    # filename
    features_file = './../data/features.csv'

    # reading data
    data = postmaster.readCSVIntoList(features_file)
    data = np.array(data, dtype='int32')

    # learn model
    model = BayesianNetwork.from_samples(data, algorithm='exact')
    # Call form of print: valid on Python 2 and 3 (the statement form is a
    # SyntaxError on Python 3).
    print(model.structure)
    model.plot()
def generateSkeleton(data):
    """Learn a greedy network from `data` and write its JSON form to the skeleton file.

    The output file name contains the bucket count reported by the config.
    """
    config = Config()
    frames = getDataFrames(data)
    print('LOG: Generate Skeleton')

    network = BayesianNetwork.from_samples(frames,
                                           algorithm='greedy',
                                           state_names=config.variables())
    network.bake()

    skeleton_path = ('generatedSkeleton/skeletonGraph' +
                     str(config.nOfBuckets()) + 'buckets.txt')
    with open(skeleton_path, "w+") as out:
        out.write(network.to_json())
def produceModelsForValidationToJSON(data,
                                     train_indices,
                                     dirname='./',
                                     filename_base='model_bn_'):
    """Learn one Chow-Liu network per training split and save each as JSON.

    Args:
        data: DataFrame whose first column is the target and whose remaining
            columns are the features.
        train_indices: iterable of row-index collections, one per split.
        dirname: output directory prefix (concatenated as-is, so it should
            end with a path separator).
        filename_base: prefix of every generated file name.

    For split number i the model is written to
    ``dirname + filename_base + str(i) + '.json'``. The (cpu, wall) timings of
    structure learning and of parameter fitting are saved to
    ``dirname + filename_base + 'times'`` (numpy appends ``.npz``).
    """
    # time.clock() was removed in Python 3.8; process_time() is the CPU-time
    # replacement. Fall back to clock() on interpreters that lack it.
    cpu_clock = getattr(time, 'process_time', None) or time.clock

    y = data.iloc[:, 0].values
    X = data.iloc[:, 1:].values
    state_names = data.columns.values

    model_estimating_times = []
    model_fitting_time = []

    for index, train_index in enumerate(train_indices):
        X_train = X[train_index, :].copy()
        y_train = y[train_index].copy()
        # Target becomes column 0 so it can be used as the tree root.
        y_train = np.reshape(y_train, [-1, 1])
        X_train = np.hstack([y_train, X_train])

        # Two artificial rows with target -1: one with every feature set to 1
        # and one with every feature set to 0.
        dummy = np.ones([2, X_train.shape[1]])
        dummy[:, 0] = -1
        dummy[1, 1:] = 0
        X_train = np.vstack([X_train, dummy])
        X_train = X_train.astype(int)

        # Learning structure
        print("Learning...")
        tic = time.time()
        tic2 = cpu_clock()
        model = BayesianNetwork.from_samples(X_train,
                                             root=0,
                                             state_names=state_names,
                                             algorithm='chow-liu',
                                             n_jobs=8)
        toc2 = cpu_clock()
        toc = time.time()
        model_estimating_times.append([toc2 - tic2, toc - tic])
        print('Model estimated in %.5f clock, %.5f time' %
              (toc2 - tic2, toc - tic))

        # Re-estimating the parameters with a pseudocount of 1
        tic = time.time()
        tic2 = cpu_clock()
        model.fit(X_train, pseudocount=1, verbose=True)
        toc2 = cpu_clock()
        toc = time.time()
        model_fitting_time.append([toc2 - tic2, toc - tic])
        print('Model fitted in %.5f clock, %.5f time' %
              (toc2 - tic2, toc - tic))

        model_filename = dirname + filename_base + str(index) + '.json'
        with open(model_filename, 'w+') as f:
            f.write(model.to_json())

    np.savez_compressed(dirname + filename_base + 'times',
                        model_estimating_times=model_estimating_times,
                        model_fitting_time=model_fitting_time)
def fit_naive(self, X_train, y_train, sequence_length_train):
    """Fit an 'exact' network constrained so that group (0,) links to every other variable.

    The labels are encoded with `self.le`, turned into a single column and
    placed in front of `X_train`, so the label is variable 0.
    """
    encoded = self.le.fit_transform(y_train)
    self.formatted_labels = encoded
    self.formatted_labels = encoded.reshape(encoded.shape[0], 1)

    # Constraint graph: an edge from (0,) to each (i,), i >= 1.
    star = nx.DiGraph()
    star.add_edges_from(
        ((0, ), (child, )) for child in range(1, len(self.state_names)))

    labelled_rows = np.concatenate((self.formatted_labels, X_train), axis=1)
    self.model = BayesianNetwork.from_samples(
        labelled_rows,
        algorithm='exact',
        state_names=self.state_names,
        root=0,
        constraint_graph=star,
    )
def _get_structure(self, X_plus, root=0):
    """Return, for each feature, the index of its first parent in a Chow-Liu tree.

    The tree is learned on `X_plus` (the minority-class samples) with the
    given root. A feature without parents is reported as -1.
    """
    tree = BayesianNetwork.from_samples(X_plus,
                                        algorithm='chow-liu',
                                        root=root)
    return [parents[0] if parents else -1 for parents in tree.structure]
def run():
    """Learn and plot an exact network on random binary data with duplicated columns."""
    seaborn.set_style('whitegrid')

    samples = numpy.random.randint(2, size=(2000, 7))
    # Make some columns exact copies of others so there is structure to find.
    # The source columns (1, 2, 5) are never overwritten.
    for target, source in ((3, 1), (6, 1), (0, 2), (4, 5)):
        samples[:, target] = samples[:, source]

    network = BayesianNetwork.from_samples(samples, algorithm='exact')
    network.structure
    network.plot()
def setup_random_mixed():
    """Build the module-level fixtures: mixed-type X, weights, data_generator and model.

    The random seed is fixed to 0, so the fixtures are reproducible.
    """
    global X, weights, data_generator, model

    numpy.random.seed(0)

    # Three columns of 50 values each: booleans, letters and small integers.
    columns = [
        numpy.random.choice([True, False], size=50),
        numpy.random.choice(['A', 'B'], size=50),
        numpy.random.choice(2, size=50),
    ]
    X = numpy.array(columns, dtype=object).T.copy()

    weights = numpy.abs(numpy.random.randn(50))
    data_generator = DataGenerator(X, weights)
    model = BayesianNetwork.from_samples(X)
def predict_from_data(self, samples_file_name: str):
    """
    Predict a disease from user-supplied symptoms with a network learned from a CSV.

    The dataset must hold the symptoms in columns 0 to last-1 and the disease
    in the last column. Every symptom is forced to have an edge to the disease
    column, and edges between pairs of symptoms are excluded.

    The reported program runtime leaves out the time spent waiting for the
    user's input.

    :param samples_file_name: the name of the csv_file in the csv folder
    :return: None
    """
    load_started = default_timer()
    samples = pandas.read_csv(f"../csv/{samples_file_name}.csv",
                              delimiter=",",
                              header=None)
    load_finished = default_timer()

    user_symptoms = self.__get_symptoms_from_user(samples)

    compute_started = default_timer()
    number_symptoms = samples.shape[1] - 1  # the last column is the disease
    symptom_ids = range(number_symptoms)

    model_start_time = default_timer()
    symptom_to_disease = [(symptom, number_symptoms) for symptom in symptom_ids]
    symptom_pairs = list(itertools.combinations(symptom_ids, 2))
    model = BayesianNetwork.from_samples(
        X=samples.values,
        include_edges=symptom_to_disease,
        exclude_edges=symptom_pairs,
    )
    model.bake()
    model_end_time = default_timer()
    print(
        f"Model finished construction in {model_end_time - model_start_time} seconds"
    )

    predicted_disease = model.predict([user_symptoms])[0]
    prediction_probability = model.probability([predicted_disease])
    print(
        f"The predicted disease is {predicted_disease[-1]} with probability of {prediction_probability}"
    )
    compute_finished = default_timer()

    # Loading time plus post-input time; the interactive part is excluded.
    full_program_runtime = (load_finished - load_started) + (
        compute_finished - compute_started)
    print(
        f"The Bayes's Net implementation completed in {full_program_runtime} seconds"
    )
def test_exact_structure_learning_slap_constraints():
    """With edges g1->g2 and g2->g2 only, no node of the first group may have parents."""
    for dataset in datasets:
        n_columns = numpy.shape(dataset)[1]
        split = int(numpy.ceil(n_columns / 2))

        # Node groups: first half and second half of the columns.
        first_group = tuple(range(0, split))
        second_group = tuple(range(split, n_columns))

        # Constraint graph
        constraints = DiGraph()
        constraints.add_edge(first_group, second_group)
        constraints.add_edge(second_group, second_group)

        # Learn constrained network
        network = BayesianNetwork.from_samples(dataset,
                                               algorithm='exact',
                                               constraint_graph=constraints)

        # Check structure constraints satisfied
        parents_of = network.structure
        for node in first_group:
            assert_equal(0, len(parents_of[node]))
def test_pom():
    """Learn an exact network on the test data and print normalized scores for four rows.

    The four rows differ only in their last value. Returns the learned network.
    """
    records = get_test_data()
    records = 4 * records
    field_names = list(records[0].keys())
    matrix = data_to_matrix(records, field_names)

    network = BayesianNetwork.from_samples(matrix,
                                           algorithm='exact',
                                           pseudocount=0)

    example = [['1', 'Sara Smith', 'Boston', '100000', item]
               for item in ('bat', 'ball', 'hat', 'glove')]
    prob = network.probability(example)
    prob /= prob.sum()  # normalize so the four candidates sum to 1
    print(prob)
    return network
def inference(data, infs):
    """Learn a greedy network from `data` and predict the missing 13th value of each `infs` item.

    With more than one item the predictions are written to the file named by
    `config.outInference()`, one per line; otherwise the single predicted
    value is printed.
    """
    config = Config()
    frames = getDataFrames(data)
    network = BayesianNetwork.from_samples(frames,
                                           algorithm='greedy',
                                           state_names=config.variables())
    network.bake()

    def as_query(item):
        """Twelve coordinates followed by None for the value to predict."""
        return [
            item.x1, item.y1, item.z1,
            item.x2, item.y2, item.z2,
            item.x3, item.y3, item.z3,
            item.x4, item.y4, item.z4,
            None,
        ]

    def as_line(row):
        """Predicted row followed by its parsed 13th value."""
        return str(row) + ' ' + parseVal(row[12])

    queries = np.array(Enumerable(infs).select(as_query).to_list())
    print('LOG: Predicting')
    prediction = network.predict(queries)

    if len(infs) <= 1:
        print('Predicted value is "' + parseVal(prediction[0][12]) + '"')
        return

    print('LOG: Printing predictions in "' + config.outInference() + '"')
    with open(config.outInference(), "w+") as out:
        out.write('\n'.join(Enumerable(prediction).select(as_line).to_list()))
def testModel(data, tests):
    """Learn a greedy network from `data` and print its accuracy on `tests`.

    For every test item the 13th value is hidden, predicted, and compared
    with the item's `harClass`.
    """
    config = Config()
    frames = getDataFrames(data)
    network = BayesianNetwork.from_samples(frames,
                                           algorithm='greedy',
                                           state_names=config.variables())
    network.bake()

    def as_query(item):
        """Twelve coordinates followed by None for the value to predict."""
        return [
            item.x1, item.y1, item.z1,
            item.x2, item.y2, item.z2,
            item.x3, item.y3, item.z3,
            item.x4, item.y4, item.z4,
            None,
        ]

    queries = np.array(Enumerable(tests).select(as_query).to_list())
    tags = np.array(
        Enumerable(tests).select(lambda item: item.harClass).to_list())

    print('LOG: Testing')
    prediction = network.predict(queries)

    corrects = 0
    for position, row in enumerate(prediction):
        if row[12] == tags[position]:
            corrects += 1
    print('Score: ' + str(corrects * 100 / len(tests)) + '%')
def build_bn(df_app, output_dir, options):
    """Learn a Chow-Liu network on `df_app`, report it, plot it and pickle it.

    Args:
        df_app: DataFrame with an 'ANNOTATE' column (always dropped before
            learning) and, when ``options['CLASS']`` is not empty, a 'CLASS'
            column that is dropped as well. The caller's frame is not
            modified.
        output_dir: directory for the graph image and the pickled model. An
            empty string shows the plot interactively instead of saving it.
        options: option mapping; read for 'CLASS' and used by
            `ea_decode.options_filename` to derive the output file names.
    """
    df_app_tmp = df_app.copy()
    df_app_tmp.drop('ANNOTATE', axis=1, inplace=True)
    if options['CLASS'] != '':
        df_app_tmp.drop('CLASS', axis=1, inplace=True)

    X = df_app_tmp
    model = BayesianNetwork.from_samples(X, algorithm='chow-liu')

    print("\nModel Structure:\n")
    print(model.structure)
    # Node indices refer to the columns the model was trained on, i.e. the
    # reduced frame, not the original one that still has the dropped columns.
    columns = X.columns
    for idx, parent in enumerate(model.structure):
        if len(parent) == 0:
            print('Singleton: {}'.format(columns[idx]))
        elif len(parent) == 1:
            print('Parent: {} - Child: {}'.format(columns[parent[0]],
                                                  columns[idx]))

    file_out = ea_decode.options_filename(options) + '_' + 'bn_graph'
    plt.figure(figsize=(9, 7))
    model.plot()
    if output_dir == '':
        plt.show()
    else:
        plt.savefig(os.path.join(output_dir, file_out))
    plt.close()

    file_out = ea_decode.options_filename(options) + '_bn.mdl'
    model_file = os.path.join(output_dir, file_out)
    with open(model_file, 'wb') as f:
        # Persist the model itself (the path string used to be pickled here).
        pickle.dump(model, f)

    logging.info('\n%s: Loglikelihood: %.2f\n', 'BN',
                 model.log_probability(X).sum())
def test_greedy_structure_learning():
    """Greedy structure learning must reproduce the recorded log-probability totals."""
    expected_totals = (-19.8282, -345.9527, -4847.59688, -611.0356)
    for dataset, expected in zip(datasets, expected_totals):
        network = BayesianNetwork.from_samples(dataset, algorithm='greedy')
        total = network.log_probability(dataset).sum()
        assert_almost_equal(total, expected, 4)
def test_chow_liu_structure_learning():
    """Chow-Liu structure learning must reproduce the recorded log-probability totals."""
    expected_totals = (-19.8282, -344.248785, -4842.40158, -603.2370)
    for dataset, expected in zip(datasets, expected_totals):
        network = BayesianNetwork.from_samples(dataset, algorithm='chow-liu')
        total = network.log_probability(dataset).sum()
        assert_almost_equal(total, expected, 4)
'Personal Email', 'Professional Email', 'Religion', 'Sexual Orientation', 'Illnesses', 'Hobby/Pastime', 'Hurt Sentiments - Movie', 'Holiday Destination', 'Music Genre', 'Age for Adult movie', 'Favourite Movie', 'Money on cinema weekly', 'Illegal streaming/downloading', 'Favourite Pornstar' ] if block == '3': column_names = [ 'Name', 'Country of Residence', 'Home Postcode', 'Employer Name', 'Work Address', 'Phone Number', 'Relationship Status', 'Lied to Partner', 'Languages', 'Annual Income', 'Shared X-rated movies', 'Lied about Age', 'Musician', 'Favourite Movie Genre', 'Favourite Soundtrack', 'Online rental subscriptions' ] print('Generating Bayesian Network for Question Block ' + block + '.') model = BayesianNetwork.from_samples(subset, state_names=column_names, algorithm=bayes_algorithm) if block not in bayesian_net_models: bayesian_net_models[block] = model plt.title('Truthfulness \n Bayesian Network \n' + 'Block-' + block, fontsize=30, fontweight='bold') model.plot(with_labels=True) save_fig(BAYESIAN_DIR, bayes_algorithm + '_bayesian_net_likert_' + 'block_' + block) print('Saving Bayesian Network for Question Block ' + block + ' in ' + BAYESIAN_DIR + ' directory.')
def train(self, samples, weights, state_names=None):
    '''
    Train the configured backend on the weighted samples.

    @samples: 2d array. Each row represents a unique point in the joint
    distribution, with each column representing a random variable.
    @weights: one count per row of samples; converted to int32.
    @state_names: names of the random variables; required (must not be None).

    Every column's values are first mapped to consecutive integer ids (the
    mapping is appended to self.word2index, one dict per column). The mapped
    samples are then handed to the backend selected by self.backend
    ("ourpgm" or "pomegranate"). For the pomegranate backend with
    self.alg_name == "greg", an SVD is additionally computed (or loaded from
    the on-disk cache) for every edge and stored in self.edge_svds.
    '''
    start = time.time()
    assert state_names is not None
    self.state_names = state_names
    weights = np.array(weights, dtype=np.int32)

    # Build the value -> integer id mapping of every column.
    for col in range(samples.shape[1]):
        self.word2index.append({})
        col_alphabets = np.unique(samples[:, col])
        for i, alph in enumerate(col_alphabets):
            self.word2index[col][alph] = i

    mapped_samples = np.zeros(samples.shape, dtype=np.int32)
    for i in range(mapped_samples.shape[0]):
        for j in range(mapped_samples.shape[1]):
            mapped_samples[i][j] = self.word2index[j][samples[i][j]]

    if self.save_csv:
        np.set_printoptions(
            formatter={'float': lambda x: "{0:0.3f}".format(x)})
        np.savetxt("data.csv", mapped_samples, delimiter=",")
        np.savetxt("counts.csv", weights, delimiter=",")

    if self.backend == "ourpgm":
        pgm.py_init.restype = c_void_p
        print("before py_init")
        self.ourpgm_model = pgm.py_init(
            mapped_samples.ctypes.data_as(c_void_p),
            c_long(mapped_samples.shape[0]),
            c_long(mapped_samples.shape[1]),
            weights.ctypes.data_as(c_void_p), c_long(weights.shape[0]),
            self.use_svd, c_long(self.num_singular_vals), self.recompute)
        pgm.py_train(c_void_p(self.ourpgm_model))
    elif self.backend == "pomegranate":
        # TODO: cache the trained model, based on hash of mapped samples?
        # TODO: mapped samples should be extended to include all 0's.
        self.pom_model = BayesianNetwork.from_samples(
            mapped_samples,
            weights=weights,
            state_names=self.state_names,
            algorithm="chow-liu",
            n_jobs=-1)
        print("pomegranate training done!")

        if self.alg_name == "greg":
            # compute all the appropriate SVD's
            self.edge_svds = {}
            # TODO: might want to store this globally
            state_to_idx = {}
            for i, s in enumerate(self.pom_model.states):
                state_to_idx[s.name] = i

            # Expensive computation, so save it if possible
            misc_cache = klepto.archives.dir_archive(
                "./misc_cache/edge_svds/")
            misc_cache.load()

            # The model does not change inside the loop, so the marginals
            # are computed once instead of four times per edge.
            marginals = self.pom_model.marginal()

            for edge in self.pom_model.edges:
                node1 = state_to_idx[edge[0].name]
                node2 = state_to_idx[edge[1].name]
                edge_nodes = [node1, node2]
                edge_nodes.sort()
                edge_key = (edge_nodes[0], edge_nodes[1])

                # FIXME: check
                cond_dist = edge[1].distribution
                assert "ConditionalProbabilityTable" in str(
                    type(cond_dist))

                marg1 = marginals[node1].values()
                node1_vals = list(marginals[node1].parameters[0].keys())
                dim1 = len(marg1)
                marg2 = marginals[node2].values()
                node2_vals = list(marginals[node2].parameters[0].keys())
                dim2 = len(marg2)

                # The cache key depends only on the marginals and value sets.
                svd_key = str(marg1) + str(marg2) + str(node1_vals) + str(
                    node2_vals)
                svd_key = deterministic_hash(svd_key)

                if svd_key in misc_cache:
                    self.edge_svds[edge_key] = misc_cache[svd_key]
                    print("found edge key {} in cache".format(edge_key))
                    print(np.max(self.edge_svds[edge_key][1]))
                    continue

                # Fixed: this message used an undefined name and raised
                # NameError on every cache miss.
                print("did not find edge key {} in cache".format(edge_key))

                joint_mat = np.zeros((dim1, dim2))
                for i in range(dim1):
                    for j in range(dim2):
                        ind_term = marg1[i] * marg2[j]
                        # FIXME: assuming that these are state values
                        assert node1_vals[i] == i
                        assert node2_vals[j] == j
                        # FIXME: assuming marg1 is always the parent in the
                        # conditional dist
                        sample = [node1_vals[i], node2_vals[j]]
                        joint_term = cond_dist.probability(
                            sample) * marg1[i]
                        joint_mat[i, j] = (joint_term -
                                           ind_term) / math.sqrt(ind_term)

                # TODO: replace this by scipy.sparse svd's so can only
                # compute for top-k values
                uh, sv, vh = np.linalg.svd(joint_mat, full_matrices=False)
                # print(np.max(sv))
                assert np.max(sv) < 1.1
                # pdb.set_trace()

                # TODO: check if this computation is what we need
                # compute the f and g vectors
                for xi in range(dim1):
                    uh[xi, :] /= math.sqrt(marg1[xi])
                for xj in range(dim2):
                    vh[:, xj] /= math.sqrt(marg2[xj])

                assert edge_key not in self.edge_svds
                self.edge_svds[edge_key] = (uh, sv, vh)
                misc_cache[svd_key] = self.edge_svds[edge_key]

            misc_cache.dump()
            misc_cache.clear()
        elif self.alg_name == "chow-liu":
            # should not need to do anything here.
            print("trained chow-liu using pomegranate")
        else:
            assert False

    print("pgm model took {} seconds to train".format(time.time() - start))
def run_research(df):
    """Compare a naive Bayes model with three Bayesian-network models on `df`.

    Models:
      * Naive Bayes on the full frame (evaluated with `test_split`);
      * network #1: Chow-Liu structure learned on the full frame;
      * network #2: Chow-Liu structure learned without the 'odor' column;
      * network #3: hand-specified DAG over a subset of the columns.
    The three networks are evaluated with `test_cross`.

    Results are printed, appended to ``research_results.txt`` and passed to
    `generate_plots` and `generate_plots_std`.
    """
    # Prepare data
    #df = get_factorized_dataset(path="../data").drop(['veil-type', 'stalk-root'],axis=1)
    target_column = "class"

    # Prepare models
    nb = CategoricalNB(num_epochs=20)

    df1 = df
    pgm1 = PGM(df1, num_epochs=20)
    m1 = BayesianNetwork.from_samples(df1, algorithm='chow-liu')
    pgm1.import_pomegranate_model(m1, df1.columns)

    df2 = df.drop(['odor'], axis=1)
    pgm2 = PGM(df2, num_epochs=20)
    m2 = BayesianNetwork.from_samples(df2, algorithm='chow-liu')
    pgm2.import_pomegranate_model(m2, df2.columns)

    df3 = df.get([
        'odor', 'class', 'spore-print-color', 'gill-color', 'cap-color',
        'cap-shape', 'cap-surface', 'gill-size', 'gill-spacing',
        'gill-attachment', 'stalk-color-above-ring',
        'stalk-surface-above-ring', 'stalk-surface-below-ring',
        'stalk-shape'
    ])
    DAG = nx.DiGraph()
    edges = [('odor', 'class'), ('spore-print-color', 'class'),
             ('gill-color', 'class'), ('cap-color', 'class'),
             ('cap-shape', 'cap-color'), ('cap-surface', 'cap-color'),
             ('gill-size', 'gill-color'), ('gill-spacing', 'gill-color'),
             ('gill-attachment', 'gill-color'),
             ('stalk-color-above-ring', 'class'),
             ('stalk-surface-below-ring', 'stalk-color-above-ring'),
             ('stalk-surface-above-ring', 'stalk-color-above-ring'),
             ('stalk-shape', 'stalk-color-above-ring')]
    DAG.add_edges_from(edges)
    pgm3 = PGM(df3, graph=DAG, num_epochs=20)

    models = [
        (df, nb, "Naive Bayes"),
        (df1, pgm1, "Bayesian Net #1"),
        (df2, pgm2, "Bayesian Net #2"),
        (df3, pgm3, "Bayesian Net #3"),
    ]

    # Results structure
    all_results = {}

    # The context manager closes the results file even if an experiment
    # raises part-way through.
    with open("research_results.txt", "a") as results_file:
        # Run experiments: the naive Bayes baseline uses a plain split.
        for (data, model, model_name) in models[0:1]:
            print(f"##### {model_name} #####\n")
            results_file.write(f"##### {model_name} #####\n")
            result = test_split(model=model,
                                n_splits=5,
                                df=data,
                                class_column=target_column)
            print(result)
            results_file.write(str(result) + "\n")
            all_results[model_name] = result

        results_file.write("##### Partial results after test_split #####\n")
        results_file.write(str(all_results) + "\n")
        print("##### Partial results after test_split #####\n")
        print(str(all_results) + "\n")

        # Run experiments: the Bayesian networks use cross-validation.
        for (data, model, model_name) in models[1:]:
            print(f"##### {model_name} #####\n")
            results_file.write(f"##### {model_name} #####\n")
            result = test_cross(model=model,
                                n_splits=5,
                                df=data,
                                class_column=target_column)
            print(result)
            results_file.write(str(result) + "\n")
            all_results[model_name] = result

        results_file.write("##### Combined results #####\n")
        results_file.write(str(all_results) + "\n")
        print("##### Combined results #####\n")
        print(str(all_results) + "\n")

        generate_plots(all_results)
        generate_plots_std(all_results)

    print("Done.")
def test_greedy_nan_structure_learning():
    """Greedy learning on the NaN datasets must reproduce the recorded log-probability totals."""
    expected_totals = (-7.5239, -159.6505, -2058.5706, -203.7662)
    for dataset, expected in zip(datasets_nan, expected_totals):
        network = BayesianNetwork.from_samples(dataset, algorithm='greedy')
        total = network.log_probability(dataset).sum()
        assert_almost_equal(total, expected, 4)
# Demo: learn the exact structure of random binary data in which several
# columns are copies of others, then print and plot the learned structure.
from pomegranate import BayesianNetwork
import seaborn
import time
import numpy

seaborn.set_style('whitegrid')

X = numpy.random.randint(2, size=(2000, 7))
# Columns 3 and 6 duplicate column 1; column 0 duplicates column 2;
# column 4 duplicates column 5.
X[:, [3, 6]] = X[:, [1]]
X[:, 0] = X[:, 2]
X[:, 4] = X[:, 5]

model = BayesianNetwork.from_samples(X, algorithm='exact')
print(model.structure)
model.plot()
def __init__(
        self,
        # dataset,
        table,
        num_samples,
        algorithm="greedy",
        max_parents=-1,
        topological_sampling_order=True,
        use_pgm=True,
        discretize=None,
        discretize_method="equal_size",
        root=None):
    """Discretize `table`, learn a Bayesian network on it and prepare it for sampling.

    Args:
        table: table object; deep-copied. Its columns discretize their own
            data to build the dataset.
        num_samples: stored on the instance.
        algorithm: structure-learning algorithm passed to pomegranate.
        max_parents: passed to pomegranate.
        topological_sampling_order: when true, variables are ordered so that
            every variable comes after all of its parents; otherwise the
            natural column order is used.
        use_pgm: when true, a pgmpy model with the same structure is also
            fitted and stored on `self.pgm_model`.
        discretize, discretize_method: passed to `build_discrete_mapping`.
        root: passed to pomegranate.
    """
    from pomegranate import BayesianNetwork

    self.discretize = discretize
    self.discretize_method = discretize_method
    self.table = copy.deepcopy(table)
    discretized_columns = [
        col.discretize(self.table.data[cname])
        for cname, col in self.table.columns.items()
    ]
    self.dataset = np.stack(discretized_columns, axis=1)
    self.algorithm = algorithm
    self.topological_sampling_order = topological_sampling_order
    self.num_samples = num_samples
    self.discrete_mapping = self.build_discrete_mapping(
        self.dataset, discretize, discretize_method)
    self.discrete_table = self.apply_discrete_mapping(
        self.dataset, self.discrete_mapping)

    L.info('calling BayesianNetwork.from_samples...')
    t = time.time()
    self.model = BayesianNetwork.from_samples(self.discrete_table,
                                              algorithm=self.algorithm,
                                              max_parents=max_parents,
                                              n_jobs=NUM_THREADS,
                                              root=root)
    L.info(f'done! took {(time.time() - t)/60:.2f} mins')

    def size(states):
        """Count the table entries of the given serialized states/distributions."""
        total = 0
        for state in states:
            dist = state["distribution"] if "distribution" in state else state
            kind = dist["name"]
            if kind == "DiscreteDistribution":
                total += sum(len(p) for p in dist["parameters"])
            elif kind == "ConditionalProbabilityTable":
                total += sum(len(row) for row in dist["table"])
                if "parents" in dist:
                    # NOTE(review): the whole parent list is counted once per
                    # parent (the loop variable is unused) -- confirm that
                    # this repeated counting is intended.
                    for _parent in dist["parents"]:
                        total += size(dist["parents"])
            else:
                assert False, dist["name"]
        return total

    # 4 units per counted entry (presumably bytes per value -- TODO confirm).
    self.size = 4 * size(json.loads(self.model.to_json())["states"])
    L.info(f'model size is {self.size/1024/1024:.2f}MB')

    # print('json:\n', self.model.to_json())
    self.json_size = len(self.model.to_json())
    self.use_pgm = use_pgm
    # print(self.model.to_json())

    if topological_sampling_order:
        # Repeatedly sweep the variables, appending each one as soon as all
        # of its parents have been placed.
        self.sampling_order = []
        while len(self.sampling_order) < len(self.model.structure):
            for i, deps in enumerate(self.model.structure):
                if i in self.sampling_order:
                    continue  # already ordered
                if all(d in self.sampling_order for d in deps):
                    self.sampling_order.append(i)
            L.debug(f"Building sampling order {self.sampling_order}")
    else:
        self.sampling_order = list(range(len(self.model.structure)))
    L.info(f"Using sampling order {self.sampling_order} {str(self)}")

    if use_pgm:
        from pgmpy.models import BayesianModel

        data = pd.DataFrame(self.discrete_table.astype(np.int64))
        structure = self.model.structure
        # Edges as (parent, child); parentless variables are added separately.
        spec = [(p, i) for i, parents in enumerate(structure)
                for p in parents]
        orphans = [i for i, parents in enumerate(structure) if not parents]
        L.info(f"Model spec {spec}")

        model = BayesianModel(spec)
        for o in orphans:
            model.add_node(o)

        L.info('calling pgm.BayesianModel.fit...')
        t = time.time()
        model.fit(data)
        L.info(f'done! took {(time.time() - t)/60:.2f} mins')
        self.pgm_model = model
def fit(self, data, categorical_columns=tuple(), ordinal_columns=tuple()):
    """Discretize `data` into 15 bins and learn a Chow-Liu network on the result.

    The fitted discretizer is kept on `self.discretizer` and the learned
    network on `self.model`.
    """
    self.discretizer = DiscretizeTransformer(n_bins=15)
    self.discretizer.fit(data, categorical_columns, ordinal_columns)

    binned = self.discretizer.transform(data)
    self.model = BayesianNetwork.from_samples(binned, algorithm='chow-liu')