def create(self, network_factory):
    """Build a network whose cluster node is the parent of every data column.

    A node named "Cluster" is reused when the factory's network already
    contains one; otherwise a new cluster variable is created with
    ``self._latent_states`` states and the name
    ``self._latent_variable_name``. One continuous variable is created per
    column of ``self._continuous`` and one discrete variable per column of
    ``self._discrete``; each is linked from the cluster node.

    Failures while creating or linking a variable are logged as warnings and
    the column is skipped, so a single bad column does not abort the build.

    :param network_factory: object whose ``create()`` returns the network to
        populate.
    :return: the populated network.
    """
    network = network_factory.create()

    cluster = builder.try_get_node(network, "Cluster")
    if cluster is None:
        cluster = builder.create_cluster_variable(
            network,
            self._latent_states,
            variable_name=self._latent_variable_name)

    if not dk.empty(self._continuous):
        for c_name in self._continuous.columns:
            self._logger.info("Pre-processing {} column".format(c_name))
            c = builder.create_continuous_variable(network, c_name)
            try:
                builder.create_link(network, cluster, c)
            except ValueError as e:
                # Logger.warn is a deprecated alias of Logger.warning.
                self._logger.warning(e)

    if not dk.empty(self._discrete):
        for d_name in self._discrete.columns:
            # Use one key form for both the membership test and the lookup;
            # the original tested `d_name` but indexed with `str(d_name)`.
            name = str(d_name)
            if name in self._discrete_states:
                states = self._discrete_states[name]
            else:
                # No explicit states supplied: derive them from the
                # distinct non-null values present in the column.
                states = dk.compute(
                    self._discrete[name].dropna().unique()).tolist()

            try:
                c = builder.create_discrete_variable(
                    network, self._discrete, name, states)
                builder.create_link(network, cluster, c)
            except Exception as e:
                # Exception rather than BaseException, so KeyboardInterrupt
                # and SystemExit are not swallowed by the best-effort skip.
                self._logger.warning(e)

    return network
def main():
    """Train a clustered iris model and plot each measurement given the class.

    The network has a 4-state cluster variable linked to a joint multivariate
    continuous node (every column except ``iris_class``) and to a discrete
    ``iris_class`` node. After training, one conditional joint probability
    query per measurement (tail variable ``iris_class``) is run and the
    results are drawn on a 2x2 grid of subplots.
    """
    log = logging.getLogger()
    log.addHandler(logging.StreamHandler())
    log.setLevel(logging.INFO)
    bayesianpy.jni.attach(log)

    data_dir = bayesianpy.utils.get_path_to_parent_dir(__file__)
    csv_path = os.path.join(data_dir, "data/iris.csv")
    iris = pd.read_csv(csv_path, index_col=False)

    # Structure: Cluster -> joint(continuous columns), Cluster -> iris_class.
    net = bayesianpy.network.create_network()
    latent = builder.create_cluster_variable(net, 4)
    feature_names = iris.drop('iris_class', axis=1).columns.tolist()
    joint = builder.create_multivariate_continuous_node(
        net, feature_names, "joint")
    builder.create_link(net, latent, joint)

    class_states = iris['iris_class'].unique()
    class_node = builder.create_discrete_variable(
        net, iris, 'iris_class', class_states)
    builder.create_link(net, latent, class_node)

    measurements = [
        'sepal_length', 'sepal_width', 'petal_length', 'petal_width'
    ]

    with bayesianpy.data.DataSet(iris, data_dir, log) as dataset:
        trained = bayesianpy.model.NetworkModel(net, log)
        trained.train(dataset)

        # One query per measurement, each conditioned on the class.
        queries = []
        for name in measurements:
            queries.append(
                bayesianpy.model.QueryConditionalJointProbability(
                    head_variables=[name],
                    tail_variables=['iris_class']))

        engine, _, _ = bayesianpy.model.InferenceEngine(net).create()
        runner = bayesianpy.model.SingleQuery(net, engine, log)
        results = runner.query(queries, aslist=True)

        plotter = bayesianpy.visual.JointDistribution()
        figure = plt.figure(figsize=(10, 10))
        # Subplot positions are 1-based; results are matched to queries by
        # position.
        for position, distribution in enumerate(list(results), start=1):
            axes = figure.add_subplot(2, 2, position)
            plotter.plot_distribution_with_variance(
                axes, iris,
                queries[position - 1].get_head_variables(),
                distribution)

        plt.show()
def create(self, network_factory: bayesianpy.network.NetworkFactory):
    """Build a cluster-parented network with discretised continuous columns.

    A cluster variable with ``self._latent_states`` states is created. Each
    column of ``self._continuous`` becomes a discretised variable (using
    ``self._bin_count``, ``self._binning_mode`` and ``self._zero_crossing``),
    and each column of ``self._discrete`` becomes a discrete variable whose
    states are the distinct non-null values of that column. Every variable is
    linked from the cluster node.

    :param network_factory: factory whose ``create()`` returns the network
        to populate.
    :return: the populated network.
    """
    net = network_factory.create()
    parent = builder.create_cluster_variable(net, self._latent_states)

    if not dk.empty(self._continuous):
        for column in self._continuous.columns:
            binned = builder.create_discretised_variable(
                net,
                self._continuous,
                column,
                bin_count=self._bin_count,
                mode=self._binning_mode,
                zero_crossing=self._zero_crossing)
            builder.create_link(net, parent, binned)

    if not dk.empty(self._discrete):
        for column in self._discrete.columns:
            distinct = self._discrete[column].dropna().unique()
            observed_states = dk.compute(distinct)
            variable = builder.create_discrete_variable(
                net, self._discrete, column, observed_states)
            builder.create_link(net, parent, variable)

    return net
def analyse(self, df: pd.DataFrame, continuous_variable_names: List[str]):
    """Flag which continuous variables are best fitted with many clusters.

    For every variable, a two-node network (cluster -> variable) is trained
    once per fold for each cluster count in ``[1, 5, 10, 20, 30]``. The
    training log-likelihoods of the folds are averaged, weighted by the
    number of training rows, and the cluster count with the highest average
    is selected.

    :param df: data used to build the ``bayesianpy.data.DataSet``.
    :param continuous_variable_names: names of the variables to analyse.
    :return: dict mapping each variable name to ``True`` when the selected
        cluster count is greater than 5, otherwise ``False``.
    """
    # NOTE(review): the splitter is iterated directly and once per cluster
    # count; this assumes NewKFold is re-iterable and yields
    # (train_indexes, test_indexes) pairs. Confirm against its definition.
    kf = NewKFold(n_splits=3, shuffle=self._shuffle)
    network_factory = bayesianpy.network.NetworkFactory(self._logger)
    variations = [1, 5, 10, 20, 30]
    results = {}

    with bayesianpy.data.DataSet(df, logger=self._logger) as dataset:
        for variable in continuous_variable_names:
            likelihoods = []
            for cluster_count in variations:
                weighted = []
                weights = []
                # Only the training indexes are used; the score is the
                # log-likelihood reported by training itself.
                for train_indexes, _ in kf:
                    nt = network_factory.create()
                    cluster = builder.create_cluster_variable(
                        nt, cluster_count)
                    node = builder.create_continuous_variable(nt, variable)
                    builder.create_link(nt, cluster, node)
                    model = bayesianpy.model.NetworkModel(nt, self._logger)

                    try:
                        ll = model.train(dataset.subset(
                            train_indexes)).get_metrics()['loglikelihood']
                    except Exception as e:
                        # Best effort: a fold that fails to train is
                        # logged and left out of the average.
                        self._logger.warning(e)
                        continue

                    weighted.append(ll)
                    weights.append(len(train_indexes))

                if weighted:
                    likelihoods.append(
                        np.average(weighted, weights=weights))
                else:
                    # Every fold failed: np.average would raise on empty
                    # input, so record the worst possible score instead and
                    # let another cluster count win the argmax.
                    likelihoods.append(float('-inf'))

            max_index = np.argmax(likelihoods)
            results[variable] = variations[max_index] > 5

    return results
def create(self, network_factory: bayesianpy.network.NetworkFactory):
    """Wrap the inner template's network with a new parent cluster node.

    The network is produced by ``self._template.create``. A cluster variable
    with ``self._latent_states`` states is added and linked to every other
    node. For each name in ``self._target_nodes`` (when not ``None``) the
    node's links are deleted via ``builder.delete_links_from``; when
    ``self._remove_target_node`` is set, ``bayesianpy.network.remove_node``
    is also called.

    :param network_factory: factory forwarded to the wrapped template.
    :return: the modified network.
    """
    net = self._template.create(network_factory)
    latent = builder.create_cluster_variable(net, self._latent_states)

    # Link the cluster node to every node except itself.
    for candidate in bayesianpy.network.get_nodes(net):
        if not (candidate == latent):
            builder.create_link(net, latent, candidate)

    if self._target_nodes is None:
        return net

    for target_name in self._target_nodes:
        target_node = builder.get_node(net, target_name)
        builder.delete_links_from(net, target_node)
        if self._remove_target_node:
            # NOTE(review): the whole target collection is passed here, not
            # the node just looked up - confirm what remove_node expects.
            bayesianpy.network.remove_node(net, self._target_nodes)

    return net
def main():
    """Train a clustered iris model and plot pairwise joint distributions.

    The network has a 4-state cluster variable linked to a joint multivariate
    continuous node (every column except ``iris_class``) and to a discrete
    ``iris_class`` node. After training, the joint distribution of the four
    measurements is queried twice - conditioned on ``iris_class`` and
    ``Cluster``, then on ``Cluster`` alone - and each result is plotted for
    every pair of measurements.
    """
    logger = logging.getLogger()
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)

    bayesianpy.jni.attach(logger)

    db_folder = bayesianpy.utils.get_path_to_parent_dir(__file__)
    iris = pd.read_csv(os.path.join(db_folder, "data/iris.csv"),
                       index_col=False)

    network = bayesianpy.network.create_network()
    cluster = builder.create_cluster_variable(network, 4)
    node = builder.create_multivariate_continuous_node(
        network, iris.drop('iris_class', axis=1).columns.tolist(), "joint")
    builder.create_link(network, cluster, node)

    class_variable = builder.create_discrete_variable(
        network, iris, 'iris_class', iris['iris_class'].unique())
    builder.create_link(network, cluster, class_variable)

    jd = bayesianpy.visual.JointDistribution()

    def plot(head_variables, results):
        """Draw one covariance subplot per unordered pair of head variables."""
        fig = plt.figure(figsize=(10, 10))
        n = len(head_variables) - 1
        # Number of unordered pairs, laid out two per row. Integer
        # arithmetic (rounding the row count up) keeps the value an int;
        # the original `total / 2` produced a float under Python 3 and too
        # few rows when the pair count was odd.
        pair_count = n * (n + 1) // 2
        rows = (pair_count + 1) // 2

        k = 1
        for i, first in enumerate(head_variables):
            for second in head_variables[i + 1:]:
                ax = fig.add_subplot(rows, 2, k)
                jd.plot_distribution_with_covariance(
                    ax, iris, (first, second), results)
                k += 1
        plt.show()

    with bayesianpy.data.DataSet(iris, db_folder, logger) as dataset:
        model = bayesianpy.model.NetworkModel(network, logger)
        model.train(dataset)

        head_variables = [
            'sepal_length', 'sepal_width', 'petal_length', 'petal_width'
        ]

        # Joint distribution conditioned on both the class and the cluster.
        query_type_class = bayesianpy.model.QueryConditionalJointProbability(
            head_variables=head_variables,
            tail_variables=['iris_class', 'Cluster'])

        (engine, _, _) = bayesianpy.model.InferenceEngine(network).create()
        query = bayesianpy.model.Query(network, engine, logger)
        results_class = query.execute([query_type_class])
        plot(head_variables, results_class)

        # Same distribution conditioned on the cluster alone.
        query_type_cluster = bayesianpy.model.QueryConditionalJointProbability(
            head_variables=head_variables,
            tail_variables=['Cluster'])
        results_cluster = query.execute([query_type_cluster])
        plot(head_variables, results_cluster)
def main():
    """Score every iris sample against a model trained on two classes.

    A 4-state cluster variable is linked to a joint multivariate continuous
    node over the measurement columns. The model is trained only on the
    "Iris-versicolor" and "Iris-virginica" rows, then queried for model
    statistics on all rows. Each pair of measurements is drawn as a scatter
    plot coloured by the returned ``loglikelihood`` column.
    """
    logger = logging.getLogger()
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)

    bayesianpy.jni.attach(logger)

    db_folder = bayesianpy.utils.get_path_to_parent_dir(__file__)
    iris = pd.read_csv(os.path.join(db_folder, "data/iris.csv"),
                       index_col=False)

    # Manually build the network, leaving out the 'iris_class' variable.
    # The original passed iris.columns, which still contained 'iris_class'
    # even though the training data below drops it.
    features = iris.drop('iris_class', axis=1)
    network = bayesianpy.network.create_network()
    cluster = builder.create_cluster_variable(network, 4)
    node = builder.create_multivariate_continuous_node(
        network, features.columns.tolist(), "joint")
    builder.create_link(network, cluster, node)

    with bayesianpy.data.DataSet(features, db_folder, logger) as dataset:
        # Build the 'normal' model on two of the classes.
        model = bayesianpy.model.NetworkModel(network, logger)
        subset = dataset.subset(
            iris[(iris.iris_class == "Iris-versicolor")
                 | (iris.iris_class == "Iris-virginica")].index.tolist())
        model.train(subset)

        variables = [
            'sepal_length', 'sepal_width', 'petal_length', 'petal_width'
        ]

        # Query the trained model on all the data, including the Iris-setosa
        # class. Get the loglikelihood value for the whole model on each
        # individual sample; the lower the loglikelihood value, the less
        # likely the data point has been generated by the model.
        results = model.batch_query(
            dataset, [bayesianpy.model.QueryModelStatistics()])

        # plt.get_cmap works on old and new Matplotlib; plt.cm.get_cmap was
        # removed in recent releases.
        cmap = plt.get_cmap('Blues_r')
        fig = plt.figure(figsize=(10, 10))

        k = 1
        for i, v in enumerate(variables):
            for j in range(i + 1, len(variables)):
                v1 = variables[j]
                ax = fig.add_subplot(3, 2, k)
                ax.set_title("{} vs {}".format(v, v1))
                h = ax.scatter(x=iris[v].tolist(),
                               y=iris[v1].tolist(),
                               c=results['loglikelihood'].tolist(),
                               vmin=results.loglikelihood.min(),
                               vmax=results.loglikelihood.max(),
                               cmap=cmap)
                k += 1

        # Reserve the right-hand strip for a colour bar shared by all plots,
        # using the handle of the last scatter drawn.
        fig.subplots_adjust(right=0.8)
        cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
        fig.colorbar(h, cax=cbar_ax)
        plt.show()
def main():
    """Score generated iris samples by accuracy-weighted cluster membership.

    Steps, in order:

    1. Train a 3-cluster model (joint continuous node plus ``iris_class``)
       on a random split of the data and batch-query the held-out rows.
    2. Compute one accuracy per cluster from a Bayes Server confusion
       matrix over the query results.
    3. Draw 20 samples from the trained network, query the cluster state
       probabilities of each and combine them into one score per sample,
       weighted by the cluster accuracies.
    4. Plot every pair of measurements, with the samples coloured by score
       over the original data.
    """
    logger = logging.getLogger()
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)

    bayesianpy.jni.attach(logger)

    db_folder = bayesianpy.utils.get_path_to_parent_dir(__file__)
    iris = pd.read_csv(os.path.join(db_folder, "data/iris.csv"),
                       index_col=False)

    network = bayesianpy.network.create_network()
    num_clusters = 3
    cluster = builder.create_cluster_variable(network, num_clusters)
    node = builder.create_multivariate_continuous_node(
        network, iris.drop('iris_class', axis=1).columns.tolist(), "joint")
    builder.create_link(network, cluster, node)

    class_variable = builder.create_discrete_variable(
        network, iris, 'iris_class', iris['iris_class'].unique())
    builder.create_link(network, cluster, class_variable)

    train, test = train_test_split(iris, test_size=0.7)

    # Train the model and query the most likely states and the probability
    # of each latent state.
    with bayesianpy.data.DataSet(iris, db_folder, logger) as dataset:
        model = bayesianpy.model.NetworkModel(network, logger)
        model.train(dataset.subset(train.index.tolist()))

        test_subset = dataset.subset(test.index.tolist())

        results = model.batch_query(
            test_subset,
            [
                # creates columns Cluster$$Cluster0, Cluster$$Cluster1,
                # Cluster$$Cluster2, as suffix is set to an empty string.
                bayesianpy.model.QueryStateProbability("Cluster", suffix=""),
                # creates column 'iris_class_maxlikelihood'
                bayesianpy.model.QueryMostLikelyState("iris_class"),
                # creates column 'Cluster_maxlikelihood'
                bayesianpy.model.QueryMostLikelyState("Cluster")
            ])

    cluster_accuracy = {}
    # Get a list of cluster accuracies, using the Bayes Server confusion
    # matrix class weighted by the cluster probability column.
    with bayesianpy.data.DataSet(results, db_folder, logger) as resultset:
        for c in range(num_clusters):
            matrix = bayesianpy.jni.bayesServerAnalysis() \
                .ConfusionMatrix.create(
                    resultset.create_data_reader_command(),
                    "iris_class",
                    "iris_class_maxlikelihood",
                    "Cluster$$Cluster{}".format(c))
            cluster_accuracy.update(
                {'Cluster{}'.format(c): matrix.getAccuracy()})

    # Generate samples from the trained model, to give us some additional
    # testing data.
    samples = bayesianpy.model.Sampling(network).sample(num_samples=20).drop(
        ["Cluster", "iris_class"], axis=1)
    reader = bayesianpy.data.DataFrameReader(samples)
    inference = bayesianpy.model.InferenceEngine(network).create_engine()
    evidence = bayesianpy.model.Evidence(network, inference)
    query = bayesianpy.model.SingleQuery(network, inference, logger)
    query_type = [bayesianpy.model.QueryStateProbability('Cluster', suffix="")]

    # Query the expected cluster membership, and generate a wrapper for
    # comparing the values, weighted by cluster membership.
    while reader.read():
        result = query.query(query_type,
                             evidence=evidence.apply(reader.to_dict()))
        cv_results = []
        for key, value in result.items():
            n = bayesianpy.network.Discrete.fromstring(key)
            weighting = cluster_accuracy[n.state]
            cv_results.append(
                bayesianpy.jni.bayesServerAnalysis()
                .DefaultCrossValidationTestResult(
                    jp.JDouble(weighting),
                    jp.JObject(value, jp.java.lang.Object),
                    jp.java.lang.Double(jp.JDouble(value))))

        score = bayesianpy.jni.bayesServerAnalysis().CrossValidation.combine(
            jp.java.util.Arrays.asList(cv_results),
            bayesianpy.jni.bayesServerAnalysis()
            .CrossValidationCombineMethod.WEIGHTED_AVERAGE)

        # Append the score to the existing dataframe. DataFrame.set_value
        # has been removed from pandas; `.at` is the scalar setter that
        # replaces it.
        samples.at[reader.get_index(), 'score'] = score

    variables = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

    # plt.get_cmap works on old and new Matplotlib; plt.cm.get_cmap was
    # removed in recent releases.
    cmap = plt.get_cmap('Blues')
    fig = plt.figure(figsize=(10, 10))
    k = 1
    # Plot each pair of measurements: the original data faintly in the
    # background, the generated samples on top coloured by score.
    for i, v in enumerate(variables):
        for j in range(i + 1, len(variables)):
            v1 = variables[j]
            ax = fig.add_subplot(3, 2, k)
            ax.set_title("{} vs {}".format(v, v1))
            ax.scatter(x=iris[v].tolist(),
                       y=iris[v1].tolist(),
                       facecolors='none',
                       alpha=0.1)
            h = ax.scatter(x=samples[v].tolist(),
                           y=samples[v1].tolist(),
                           c=samples['score'].tolist(),
                           vmin=samples.score.min(),
                           vmax=samples.score.max(),
                           cmap=cmap)
            k += 1

    # Reserve the right-hand strip for a colour bar shared by all plots,
    # using the handle of the last sample scatter drawn.
    fig.subplots_adjust(right=0.8)
    cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
    fig.colorbar(h, cax=cbar_ax)
    plt.show()