def test_optimization(self):
    np.random.seed(17)
    d1 = np.random.normal(10, 5, size=2000).tolist()
    d2 = np.random.normal(30, 5, size=2000).tolist()
    data = d1 + d2
    data = np.array(data).reshape((-1, 10))
    data = data.astype(np.float32)

    ds_context = Context(meta_types=[MetaType.REAL] * data.shape[1],
                         parametric_types=[Gaussian] * data.shape[1])

    spn = learn_parametric(data, ds_context)

    spn.weights = [0.8, 0.2]
    spn.children[0].children[0].mean = 3.0

    py_ll = np.sum(log_likelihood(spn, data))
    print(spn.weights, spn.children[0].children[0].mean)

    EM_optimization(spn, data, iterations=10)

    print(spn.weights, spn.children[0].children[0].mean)
    py_ll_opt = np.sum(log_likelihood(spn, data))

    self.assertLessEqual(py_ll, py_ll_opt)
    self.assertAlmostEqual(spn.weights[0], 0.5, 6)
    self.assertAlmostEqual(spn.weights[1], 0.5, 6)
    self.assertAlmostEqual(spn.children[0].children[0].mean, 10.50531, 4)
def learn_parametric_spn(data, parametric_types):
    from spn.algorithms.LearningWrappers import learn_parametric

    ds_context = Context(parametric_types=parametric_types).add_domains(data)
    spn = learn_parametric(data, ds_context, min_instances_slice=100, threshold=0.01)
    return spn
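# A minimal usage sketch for the wrapper above. The synthetic data, column types, and
# the `demo` names are illustrative assumptions, not part of the original code.
import numpy as np
from spn.structure.leaves.parametric.Parametric import Gaussian, Categorical
from spn.algorithms.Inference import log_likelihood

if __name__ == '__main__':
    np.random.seed(0)
    demo = np.c_[np.random.normal(0, 1, (500, 1)),                  # real-valued feature
                 np.random.randint(0, 3, (500, 1)).astype(float)]   # categorical feature
    demo_spn = learn_parametric_spn(demo, [Gaussian, Categorical])
    print(np.mean(log_likelihood(demo_spn, demo)))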
def train(args):
    print('Training...')
    for i in range(len(args.spk_list)):
        spn_path = args.MODEL_DIR + '/' + args.spk_list[i]['spk_id'] + '.p'
        if not os.path.isfile(spn_path):
            # claim the file with an empty placeholder so other runs skip this speaker
            with open(spn_path, 'wb') as f:
                pickle.dump([], f)
            print(chr(27) + "[2J")  # ANSI escape sequence: clear the terminal
            print("Learn structure, spk: %i (%s)... (min_instances_slice: %i, threshold: %1.3f)."
                  % (i, args.spk_list[i]['spk_id'], args.min_instances_slice, args.threshold))
            train_batch = featpy.lsse(
                args.spk_list[i]['train_clean_speech'],
                args.spk_list[i]['train_clean_speech_len'],
                args.Nw, args.Ns, args.NFFT, args.fs, args.H)
            print("Features extracted.")
            ds_context = Context(parametric_types=[Gaussian] * args.M).add_domains(train_batch)
            with silence():
                spn_spk = learn_parametric(
                    train_batch, ds_context,
                    min_instances_slice=args.min_instances_slice,
                    threshold=args.threshold,
                    cpus=args.ncores)
            with open(spn_path, 'wb') as f:
                pickle.dump(spn_spk, f)
def learn_whittle_spn_2d(train_data, n_RV, n_min_slice, init_scope=None):
    from spn.structure.leaves.parametric.Parametric import MultivariateGaussian

    # learn spn
    ds_context = Context(parametric_types=[MultivariateGaussian] * n_RV).add_domains(train_data)
    print('learning WSPN')
    # need to pair RVs; need flag for 2d?
    # l_rfft != None --> 2d/pair gaussian node, is_2d=True --> pairwise gaussian, full covariance matrix
    l_rfft = get_l_rfft(args)
    wspn = learn_parametric(train_data, ds_context,
                            min_instances_slice=n_min_slice,
                            threshold=args.threshold,
                            initial_scope=init_scope,
                            cpus=1,
                            l_rfft=l_rfft,
                            is_2d=True)

    save_path = get_save_path(args)
    check_path(save_path)
    with open(save_path + 'wspn_2d.pkl', 'wb') as f:
        pickle.dump(wspn, f)
    return wspn
def test_learn(self):
    from sklearn.datasets import load_iris

    iris = load_iris()
    X = iris.data
    y = iris.target.reshape(-1, 1)
    train_data = np.hstack((X, y))

    from spn.algorithms.LearningWrappers import learn_parametric, learn_classifier
    from spn.structure.leaves.parametric.Parametric import Categorical, MultivariateGaussian
    from spn.structure.Base import Context

    spn_classification = learn_parametric(
        train_data,
        Context(
            parametric_types=[
                MultivariateGaussian,
                MultivariateGaussian,
                MultivariateGaussian,
                MultivariateGaussian,
                Categorical,
            ]
        ).add_domains(train_data),
        multivariate_leaf=True,
    )
def learn_PSPN():
    import numpy as np

    np.random.seed(123)
    a = np.random.randint(2, size=1000).reshape(-1, 1)
    b = np.random.randint(3, size=1000).reshape(-1, 1)
    c = np.r_[np.random.normal(10, 5, (300, 1)), np.random.normal(20, 10, (700, 1))]
    d = 5 * a + 3 * b + c
    train_data = np.c_[a, b, c, d]

    from spn.structure.Base import Context
    from spn.structure.leaves.parametric.Parametric import Categorical, Gaussian

    ds_context = Context(
        parametric_types=[Categorical, Categorical, Gaussian, Gaussian]).add_domains(train_data)

    from spn.algorithms.LearningWrappers import learn_parametric

    spn = learn_parametric(train_data, ds_context, min_instances_slice=20)

    from spn.algorithms.Statistics import get_structure_stats

    print(get_structure_stats(spn))
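# A small, self-contained sketch (not part of the original examples) of how a learned
# parametric SPN can answer queries: conditional sampling and MPE completion, with
# np.nan marking the entries to be filled in. Data and the function name are illustrative.
def query_PSPN_demo():
    import numpy as np
    from numpy.random import RandomState
    from spn.structure.Base import Context
    from spn.structure.leaves.parametric.Parametric import Categorical, Gaussian
    from spn.algorithms.LearningWrappers import learn_parametric
    from spn.algorithms.Sampling import sample_instances
    from spn.algorithms.MPE import mpe

    np.random.seed(123)
    a = np.random.randint(2, size=1000).reshape(-1, 1)
    c = np.random.normal(10, 5, (1000, 1))
    train_data = np.c_[a, c, 5 * a + c]

    ds_context = Context(parametric_types=[Categorical, Gaussian, Gaussian]).add_domains(train_data)
    spn = learn_parametric(train_data, ds_context, min_instances_slice=50)

    # nan entries are sampled / maximized; observed entries stay fixed
    query = np.array([[0.0, np.nan, np.nan],
                      [1.0, np.nan, np.nan]])
    print(sample_instances(spn, query, RandomState(17)))
    print(mpe(spn, query))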
def learn_CLTSPN():
    import numpy as np

    np.random.seed(123)
    train_data = np.random.binomial(
        1, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.1], size=(100, 10))
    print(np.mean(train_data, axis=0))

    from spn.structure.leaves.cltree.CLTree import create_cltree_leaf
    from spn.structure.Base import Context
    from spn.structure.leaves.parametric.Parametric import Bernoulli

    ds_context = Context(parametric_types=[Bernoulli] * 10).add_domains(train_data)

    from spn.algorithms.LearningWrappers import learn_parametric

    spn = learn_parametric(
        train_data,
        ds_context,
        min_instances_slice=20,
        min_features_slice=1,
        multivariate_leaf=True,
        leaves=create_cltree_leaf,
    )

    from spn.algorithms.Statistics import get_structure_stats

    print(get_structure_stats(spn))

    from spn.io.Text import spn_to_str_equation

    print(spn_to_str_equation(spn))

    from spn.algorithms.Inference import log_likelihood

    ll = log_likelihood(spn, train_data)
    print(np.mean(ll))
def run_oSLRAU(dataset, update_after_no_min_batches, prune_after):
    data = get_data(dataset)
    # replace NaNs with the per-column mean
    data = np.where(np.isnan(data), np.ma.array(data, mask=np.isnan(data)).mean(axis=0), data)

    from sklearn.model_selection import train_test_split

    train_data, test_data = train_test_split(data, test_size=0.33, random_state=42)

    # make first mini_batch from data
    mini_batch_size = 50
    first_mini_batch = data[0:mini_batch_size]

    n = first_mini_batch.shape[1]  # number of variables
    print(n)
    context = [Gaussian] * n
    ds_context = Context(parametric_types=context).add_domains(first_mini_batch)

    # learn initial spn
    spn = learn_parametric(first_mini_batch, ds_context)
    plot_spn(spn, 'initial_spn.pdf')
    print(np.mean(log_likelihood(spn, test_data)))

    oSLRAU_params = oSLRAUParams(mergebatch_threshold=128, corrthresh=0.1, mvmaxscope=1,
                                 equalweight=True, currVals=True)
    no_of_minibatches = int(data.shape[0] / mini_batch_size)

    # update using oSLRAU
    for i in range(1, no_of_minibatches):
        mini_batch = data[i * mini_batch_size:(i + 1) * mini_batch_size]
        update_structure = False
        # structure updates start once i exceeds update_after_no_min_batches
        if update_after_no_min_batches // i == 0:
            print(i)
            update_structure = True
        spn = oSLRAU(spn, mini_batch, oSLRAU_params, update_structure)
        if i == prune_after:
            spn = Prune_oSLRAU(spn)

    print(np.mean(log_likelihood(spn, test_data)))
    plot_spn(spn, 'final_spn.pdf')
def learn_whittle_spn_1d(train_data, n_RV, n_min_slice=2000, init_scope=None):
    from spn.structure.leaves.parametric.Parametric import Gaussian

    # learn spn
    ds_context = Context(parametric_types=[Gaussian] * n_RV).add_domains(train_data)
    print('learning WSPN')
    # l_rfft=None --> 1d gaussian node, is_pair does not work
    wspn = learn_parametric(train_data, ds_context,
                            min_instances_slice=n_min_slice,
                            threshold=ARGS.threshold,
                            initial_scope=init_scope,
                            cpus=1,
                            l_rfft=None,
                            is_pair=False)

    save_path = get_save_path(ARGS)
    check_path(save_path)
    with open(save_path + 'wspn_1d.pkl', 'wb') as f:
        pickle.dump(wspn, f)
    return wspn
def _fit(self, var_types=None, **kwargs):
    df = self.data.copy()
    # Exchange all object columns for their codes
    for key, value in self._categorical_variables.items():
        df[key] = value['categorical'].codes

    self._nameToVarType = var_types

    # Check if variable types are given
    if self._nameToVarType is None:
        raise ValueError("missing argument 'var_types'")

    self._initial_names = self.names.copy()
    self._initial_names_count = len(self._initial_names)
    self._initial_names_to_index = {self._initial_names[i]: i
                                    for i in range(self._initial_names_count)}

    # Initialize _density_mask with np.nan
    self._density_mask = np.array(
        [np.nan for i in self._initial_names]
    ).reshape(-1, self._initial_names_count).astype(float)

    # Initialize _condition with np.nan
    self._condition = np.repeat(
        np.nan, self._initial_names_count
    ).reshape(-1, self._initial_names_count).astype(float)

    self._marginalized = set()
    self._conditioned = set()

    try:
        var_types = [self._nameToVarType[name] for name in self.names]
    except KeyError as err:
        raise ValueError('missing var type information for some dimension {}.'.format(err.args[0]))

    if self._spn_type == 'spn':
        context = Context(parametric_types=var_types).add_domains(df.values)
        self._spn = learn_parametric(df.values, context)
    elif self._spn_type == 'mspn':
        context = Context(meta_types=var_types).add_domains(df.values)
        self._spn = learn_mspn(df.values, context)
    else:
        raise Exception("Type of SPN not known: " + self._spn_type)

    return self._unbound_updater,
def test_eval_gaussian(self):
    np.random.seed(17)
    data = np.random.normal(10, 0.01, size=2000).tolist() + np.random.normal(30, 10, size=2000).tolist()
    data = np.array(data).reshape((-1, 10))
    data = data.astype(np.float32)

    ds_context = Context(meta_types=[MetaType.REAL] * data.shape[1],
                         parametric_types=[Gaussian] * data.shape[1])

    spn = learn_parametric(data, ds_context)

    ll = log_likelihood(spn, data)
    tf_ll = eval_tf(spn, data)

    self.assertTrue(np.all(np.isclose(ll, tf_ll)))
def test_bernoulli_spn_ll(self):
    train_data = get_binary_data("dna")[3]
    train_data = train_data[:, 0:3]

    ds_context = Context(parametric_types=[Bernoulli] * 3,
                         feature_names=["x0", "x1", "x2"]).add_domains(train_data)

    from spn.algorithms.LearningWrappers import learn_parametric

    spn = learn_parametric(train_data, ds_context, min_instances_slice=1500)
    print(get_structure_stats(spn))

    sympyecc = spn_to_sympy(spn)
    print(sympyecc)
def test_optimization(self):
    np.random.seed(17)
    data = np.random.normal(10, 0.01, size=2000).tolist() + np.random.normal(30, 10, size=2000).tolist()
    data = np.array(data).reshape((-1, 10))
    data = data.astype(np.float32)

    ds_context = Context(meta_types=[MetaType.REAL] * data.shape[1],
                         parametric_types=[Gaussian] * data.shape[1])

    spn = learn_parametric(data, ds_context)
    spn.weights = [0.8, 0.2]

    py_ll = log_likelihood(spn, data)

    tf_graph, data_placeholder, variable_dict = spn_to_tf_graph(spn, data)
    loss = likelihood_loss(tf_graph)
    output = tf.train.AdamOptimizer(0.001).minimize(loss)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        for step in range(50):
            session.run(output, feed_dict={data_placeholder: data})
            # print("loss:", step, session.run(-loss, feed_dict={data_placeholder: data}))
        tf_ll_opt = session.run(tf_graph, feed_dict={data_placeholder: data}).reshape(-1, 1)
        tf_graph_to_spn(variable_dict)

    py_ll_opt = log_likelihood(spn, data)
    # print(tf_ll_opt.sum(), py_ll_opt.sum())

    self.assertTrue(np.all(np.isclose(tf_ll_opt, py_ll_opt)))
    self.assertLess(py_ll.sum(), tf_ll_opt.sum())
def learn_parametric_spn(data, parametric_types, rdc_threshold=0.3, min_instances_slice=0.05,
                         clustering='kmeans'):
    ds_context = Context(parametric_types=parametric_types).add_domains(data)

    # min_instances_slice is given as a fraction of the number of rows
    mis = int(len(data) * min_instances_slice)

    t0 = time.time()
    spn = learn_parametric(data, ds_context, threshold=rdc_threshold,
                           min_instances_slice=mis, rows=clustering)
    const_time = time.time() - t0

    return spn, const_time
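# A hedged usage sketch for the timing wrapper above: min_instances_slice is a fraction of
# the rows (0.05 -> 5% of the data) and `clustering` selects the row-splitting method.
# The synthetic data and variable names below are illustrative assumptions.
import numpy as np
from spn.structure.leaves.parametric.Parametric import Gaussian

if __name__ == '__main__':
    demo = np.random.normal(size=(2000, 4))
    demo_spn, secs = learn_parametric_spn(demo, [Gaussian] * 4,
                                          rdc_threshold=0.3,
                                          min_instances_slice=0.05,  # 5% of 2000 = 100 rows
                                          clustering='kmeans')
    print("structure learned in %.2f s" % secs)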
def test_optimization(self):
    np.random.seed(17)
    data = np.random.normal(10, 0.01, size=2000).tolist() + np.random.normal(30, 10, size=2000).tolist()
    data = np.array(data).reshape((-1, 10))
    data = data.astype(np.float32)

    ds_context = Context(meta_types=[MetaType.REAL] * data.shape[1],
                         parametric_types=[Gaussian] * data.shape[1])

    spn = learn_parametric(data, ds_context)
    spn.weights = [0.8, 0.2]

    py_ll = log_likelihood(spn, data)
    print(spn.weights)

    EM_optimization(spn, data)

    print(spn.weights)
    py_ll_opt = log_likelihood(spn, data)
from spn.algorithms.LearningWrappers import learn_parametric
from spn.algorithms.Inference import log_likelihood

train_data = np.random.binomial(
    1, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.1], size=(100, 10))

ds_context = Context(parametric_types=[Bernoulli] * 10).add_domains(train_data)

spn = learn_parametric(
    train_data,
    ds_context,
    min_instances_slice=20,
    min_features_slice=1,
    multivariate_leaf=True,
    leaves=create_cltree_leaf,
)

ll = log_likelihood(spn, train_data)
print(np.mean(ll))
def fit(self, train_X):
    param_type = [Gaussian for _ in range(train_X.shape[1])]
    self.spnfitter = learn_parametric(train_X,
                                      Context(parametric_types=param_type).add_domains(train_X),
                                      min_instances_slice=20)
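# A minimal sketch of how the fit() method above might be used, assuming it lives on a small
# estimator wrapper. `SPNEstimator` and the score() helper are hypothetical names added for
# illustration; only learn_parametric, Context, and log_likelihood come from SPFlow.
import numpy as np
from spn.structure.Base import Context
from spn.structure.leaves.parametric.Parametric import Gaussian
from spn.algorithms.LearningWrappers import learn_parametric
from spn.algorithms.Inference import log_likelihood


class SPNEstimator:
    def fit(self, train_X):
        param_type = [Gaussian for _ in range(train_X.shape[1])]
        self.spnfitter = learn_parametric(train_X,
                                          Context(parametric_types=param_type).add_domains(train_X),
                                          min_instances_slice=20)
        return self

    def score(self, X):
        # mean log-likelihood under the learned SPN
        return float(np.mean(log_likelihood(self.spnfitter, X)))


if __name__ == '__main__':
    X = np.random.normal(size=(500, 3))
    print(SPNEstimator().fit(X).score(X))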
if __name__ == '__main__':
    add_parametric_inference_support()
    add_parametric_text_support()

    np.random.seed(42)
    data = np.random.randint(low=0, high=3, size=600).reshape(-1, 3)
    # print(data)

    ds_context = Context(
        meta_types=[MetaType.DISCRETE, MetaType.DISCRETE, MetaType.DISCRETE])
    ds_context.add_domains(data)
    ds_context.parametric_types = [Poisson, Poisson, Categorical]

    spn = Sum()
    for label, count in zip(*np.unique(data[:, 2], return_counts=True)):
        branch = learn_parametric(data[data[:, 2] == label, :], ds_context,
                                  min_instances_slice=10000)
        spn.children.append(branch)
        spn.weights.append(count / data.shape[0])
        spn.scope.extend(branch.scope)

    print(spn)
    print(spn_to_str_equation(spn))
    print(log_likelihood(spn, data))
def _fit(self, var_types=None, **kwargs):
    if self._spn_type is None:
        raise Exception("No SPN-type provided")

    if var_types is not None:
        self.var_types = var_types
    else:
        var_types = self.var_types

    df = self.data.copy()
    # Exchange all object columns for their codes as SPFlow cannot deal with strings
    for key, value in self._categorical_variables.items():
        df[key] = value['categorical'].codes

    self._nameToVarType = var_types

    # Check if variable types are given
    if self._nameToVarType is None:
        raise ValueError("missing argument 'var_types'")

    self._initial_names = self.names.copy()
    self._initial_names_count = len(self._initial_names)
    self._initial_names_to_index = {
        self._initial_names[i]: i for i in range(self._initial_names_count)
    }

    # Initialize _state_mask with np.nan
    self._state_mask = np.array([
        np.nan for i in self._initial_names
    ]).reshape(-1, self._initial_names_count).astype(float)

    # Initialize _condition with np.nan
    self._condition = np.repeat(np.nan, self._initial_names_count).reshape(
        -1, self._initial_names_count).astype(float)

    self._marginalized = set()
    self._conditioned = set()

    try:
        var_types = [self._nameToVarType[name] for name in self.names]
    except KeyError as err:
        raise ValueError(
            'missing var type information for dimension: {}.'.format(err.args[0]))

    if self._spn_type == 'spn':
        context = Context(parametric_types=var_types).add_domains(df.values)
        self._spn = learn_parametric(df.values, context)
    elif self._spn_type == 'mspn':
        context = Context(meta_types=var_types).add_domains(df.values)
        self._spn = learn_mspn(df.values, context)
    else:
        raise Exception("Type of SPN not known: " + self._spn_type)

    # TODO: DEBUG OUTPUT for NIPS2020
    if self._spn:
        plot_spn(self._spn,
                 fname=Path(f"../../bin/experiments/spn_graphs/{self.name}.pdf"))
        plot_spn_to_svg(self._spn,
                        fname=Path(f"../../bin/experiments/spn_graphs/{self.name}.svg"))

    return self._unbound_updater,
num_mpes = 1
num_samples = 10

cspns = []
mpe_query_blocks = None
sample_query_blocks = None

for i, ((tr_block, block_idx), conditional_blocks) in enumerate(datasets):
    print("learning", i)
    conditional_features_count = (tr_block.shape[1] // len(block_idx)) * conditional_blocks

    if i == 0:
        # spn
        ds_context = Context(meta_types=[MetaType.REAL] * tr_block.shape[1])
        ds_context.add_domains(tr_block)
        ds_context.parametric_types = [Gaussian] * tr_block.shape[1]
        cspn = learn_parametric(tr_block, ds_context, min_instances_slice=20, ohe=False,
                                memory=memory)
    else:
        cspn = learn_conditional(
            tr_block,
            Context(
                meta_types=[MetaType.REAL] * tr_block.shape[1],
                parametric_types=[Conditional_Gaussian] * tr_block.shape[1],
            ).add_domains(tr_block),
            scope=list(range(conditional_features_count)),
            min_instances_slice=30,
            memory=memory,
        )
    cspns.append(cspn)
    print("done")

# for i, ((tr_block, block_idx), conditional_blocks) in enumerate(datasets):
# # spn
# ds_context = Context(meta_types=[MetaType.REAL] * blocked_images[0].shape[1])
# ds_context.add_domains(blocked_images[0])
# ds_context.parametric_type = [Poisson] * blocked_images[0].shape[1]
#
# print("data ready", data.shape)
# # the following two options should be working now.
# # spn = learn_structure(upperimage, ds_context, get_split_rows_random_partition(np.random.RandomState(17)),
# #                       get_split_cols_random_partition(np.random.RandomState(17)), create_parametric_leaf)
# spn = learn_parametric(blocked_images[0], ds_context, min_instances_slice=0.1 * len(data), ohe=False)

# spn
ds_context = Context(meta_types=[MetaType.DISCRETE] * 10)
ds_context.add_domains(data_labels)
ds_context.parametric_types = [Bernoulli] * blocked_images[0].shape[1]
spn = learn_parametric(data_labels, ds_context, min_instances_slice=0.3 * len(data_labels))

# first cspn
dataIn = data_labels
dataOut = blocked_images[0]

ds_context = Context(meta_types=[MetaType.DISCRETE] * dataOut.shape[1])
ds_context.add_domains(dataOut)
ds_context.parametric_types = [Conditional_Poisson] * dataOut.shape[1]

scope = list(range(dataOut.shape[1]))
print(np.shape(dataIn), np.shape(dataOut))
print(dataIn[0], dataOut[0])

cspn_1st = learn_conditional(np.concatenate((dataOut, dataIn), axis=1), ds_context, scope,
cspns.append(spn)
print("loaded %s from cache " % i)
# continue
# except:
#     pass

if i > 40:
    break

print("learning %s " % i)
spn = None
if i == 0:
    ds_context = Context(parametric_types=[CategoricalDictionary])
    ds_context.add_domains(tr_block)
    spn = learn_parametric(tr_block, ds_context, min_instances_slice=min_instances_slice, ohe=False)
else:
    cspn = CSPNClassifier(parametric_types=[Gaussian] * block_size,
                          cluster_univariate=True,
                          min_instances_slice=min_instances_slice,
                          alpha=alpha,
                          allow_sum_nodes=True)
    y = tr_block[:, 0:block_size]
    X = tr_block[:, block_size:]
    cspn.fit(X, y)
    spn = cspn.cspn

pickle.dump(spn, open(fname, "wb"))
def train_spn(window_size=3, min_instances_slice=10000, features=None, number_of_classes=3):
    if features is None:
        features = [20, 120]

    add_parametric_inference_support()
    add_parametric_text_support()

    data = get_data_in_window(window_size=window_size, features=features,
                              three_classes=number_of_classes == 3)

    sss = sk.model_selection.StratifiedShuffleSplit(test_size=0.2, train_size=0.8, random_state=42)
    for train_index, test_index in sss.split(
            data[:, 0:window_size * window_size * len(features)],
            data[:, (window_size * window_size * len(features)) + (int(window_size * window_size / 2))]):
        X_train, X_test = data[train_index], data[test_index]

    context_list = list()
    parametric_list = list()
    number_of_features = len(features)
    for _ in range(number_of_features * window_size * window_size):
        context_list.append(MetaType.REAL)
        parametric_list.append(Gaussian)

    for _ in range(window_size * window_size):
        context_list.append(MetaType.DISCRETE)
        parametric_list.append(Categorical)

    ds_context = Context(meta_types=context_list)
    ds_context.add_domains(data)
    ds_context.parametric_types = parametric_list

    spn = load_spn(window_size, features, min_instances_slice, number_of_classes)
    if spn is None:
        spn = Sum()
        for class_pixel in tqdm(range(-window_size * window_size, 0)):
            for label, count in zip(*np.unique(data[:, class_pixel], return_counts=True)):
                train_data = X_train[X_train[:, class_pixel] == label, :]
                branch = learn_parametric(train_data, ds_context,
                                          min_instances_slice=min_instances_slice)
                spn.children.append(branch)
                spn.weights.append(train_data.shape[0])
                spn.scope.extend(branch.scope)
        spn.weights = (np.array(spn.weights) / sum(spn.weights)).tolist()
        assign_ids(spn)
        save_spn(spn, window_size, features, min_instances_slice, number_of_classes)

    res = np.ndarray((X_test.shape[0], number_of_classes))
    for i in tqdm(range(number_of_classes)):
        tmp = X_test.copy()
        tmp[:, -int((window_size ** 2) / 2)] = i
        res[:, i] = log_likelihood(spn, tmp)[:, 0]

    predicted_classes = np.argmax(res, axis=1).reshape((X_test.shape[0], 1))

    correct_predicted = 0
    for x, y in zip(X_test[:, -5], predicted_classes):
        if x == y[0]:
            correct_predicted += 1
    accuracy = correct_predicted / X_test.shape[0]

    return spn, accuracy
print("_______") zeros[:, :horizontal_middle, :vertical_middle] = dataIn.reshape(len(data), 4, 4) #data[:, :horizontal_middle, :vertical_middle] zeros[:, :horizontal_middle, vertical_middle:] = dataOut.reshape(len(data), 4, 4) #data[:, :horizontal_middle, vertical_middle:] print(zeros[0], np.shape(zeros)) #print(np.concatenate((dataIn, dataOut), axis=1).reshape(len(dataIn), 4, 8)[0]) """ # spn ds_context = Context(meta_types=[MetaType.REAL] * blocked_images[0].shape[1]) ds_context.add_domains(blocked_images[0]) ds_context.parametric_types = [Poisson] * blocked_images[0].shape[1] print("data ready", data.shape) # the following two options should be working now. spn = learn_parametric(blocked_images[0], ds_context, min_instances_slice=0.1 * len(data), ohe=False) # cspn dataIn = blocked_images[ 0] # data[:, :horizontal_middle, :vertical_middle].reshape(len(data), -1) dataOut = blocked_images[ 1] # data[:, :horizontal_middle, vertical_middle:].reshape(len(data), -1) ds_context = Context(meta_types=[MetaType.REAL] * dataOut.shape[1]) ds_context.add_domains(dataOut) ds_context.parametric_types = [Conditional_Poisson] * dataOut.shape[1] scope = list(range(dataOut.shape[1])) print(np.shape(dataIn), np.shape(dataOut))
def __learn_spmn_structure(self, remaining_vars_data, remaining_vars_scope,
                           curr_information_set_scope, index):

    logging.info(f'start of new recursion in __learn_spmn_structure method of SPMN')
    logging.debug(f'remaining_vars_scope: {remaining_vars_scope}')
    logging.debug(f'curr_information_set_scope: {curr_information_set_scope}')

    # rest set is remaining variables excluding the variables in current information set
    rest_set_scope = [var_scope for var_scope in remaining_vars_scope
                      if var_scope not in curr_information_set_scope]
    logging.debug(f'rest_set_scope: {rest_set_scope}')

    scope_index = sum([len(x) for x in self.params.partial_order[:index]])
    next_scope_index = sum([len(x) for x in self.params.partial_order[:index + 1]])

    if remaining_vars_scope == curr_information_set_scope:
        # this is the last information set in the partial order. Base case of recursion

        # test if current information set is a decision node
        if self.params.partial_order[index][0] in self.params.decision_nodes:
            raise Exception(
                f'last information set of partial order either contains random '
                f'and utility variables or just a utility variable. '
                f'This contains decision variable: {self.params.partial_order[index][0]}')
        else:
            # contains just the random and utility variables
            logging.info(
                f'at last information set of this recursive call: {curr_information_set_scope}')
            ds_context_last_information_set = get_ds_context(
                remaining_vars_data, remaining_vars_scope, self.params)

            if self.params.util_to_bin:
                last_information_set_spn = learn_parametric(
                    remaining_vars_data,
                    ds_context_last_information_set,
                    min_instances_slice=20,
                    initial_scope=remaining_vars_scope)
            else:
                last_information_set_spn = learn_mspn_for_spmn(
                    remaining_vars_data,
                    ds_context_last_information_set,
                    min_instances_slice=20,
                    initial_scope=remaining_vars_scope)

        logging.info(f'created spn at last information set')
        return last_information_set_spn

    # test for decision node. test if current information set is a decision node
    elif self.params.partial_order[index][0] in self.params.decision_nodes:

        decision_node = self.params.partial_order[index][0]
        logging.info(f'Encountered Decision Node: {decision_node}')

        # cluster the data from remaining variables w.r.t values of decision node
        clusters_on_next_remaining_vars, dec_vals = split_on_decision_node(remaining_vars_data)

        decision_node_children_spns = []
        index += 1

        next_information_set_scope = np.array(
            range(next_scope_index,
                  next_scope_index + len(self.params.partial_order[index]))).tolist()
        next_remaining_vars_scope = rest_set_scope
        self.set_next_operation('Any')

        logging.info(f'split clusters based on decision node values')
        for cluster_on_next_remaining_vars in clusters_on_next_remaining_vars:
            decision_node_children_spns.append(
                self.__learn_spmn_structure(cluster_on_next_remaining_vars,
                                            next_remaining_vars_scope,
                                            next_information_set_scope, index))

        decision_node_spn_branch = Max(dec_idx=scope_index,
                                       dec_values=dec_vals,
                                       children=decision_node_children_spns,
                                       feature_name=decision_node)

        assign_ids(decision_node_spn_branch)
        rebuild_scopes_bottom_up(decision_node_spn_branch)
        logging.info(f'created decision node')
        return decision_node_spn_branch

    # testing for independence
    else:
        curr_op = self.get_curr_operation()
        logging.debug(f'curr_op at prod node (independence test): {curr_op}')

        if curr_op != 'Sum':
            # fails if correlated variable set found in previous recursive call.
            # Without this condition code keeps looping at this stage

            ds_context = get_ds_context(remaining_vars_data, remaining_vars_scope, self.params)
            split_cols = get_split_cols_RDC_py()
            data_slices_prod = split_cols(remaining_vars_data, ds_context, remaining_vars_scope)

            logging.debug(f'{len(data_slices_prod)} slices found at data_slices_prod: ')

            prod_children = []
            next_remaining_vars_scope = []
            independent_vars_scope = []

            for correlated_var_set_cluster, correlated_var_set_scope, weight in data_slices_prod:

                if any(var_scope in correlated_var_set_scope for var_scope in rest_set_scope):
                    next_remaining_vars_scope.extend(correlated_var_set_scope)
                else:
                    # this variable set of current information set is
                    # not correlated to any variable in the rest set
                    logging.info(f'independent variable set found: {correlated_var_set_scope}')

                    ds_context_prod = get_ds_context(correlated_var_set_cluster,
                                                     correlated_var_set_scope, self.params)

                    if self.params.util_to_bin:
                        independent_var_set_prod_child = learn_parametric(
                            correlated_var_set_cluster,
                            ds_context_prod,
                            min_instances_slice=20,
                            initial_scope=correlated_var_set_scope)
                    else:
                        independent_var_set_prod_child = learn_mspn_for_spmn(
                            correlated_var_set_cluster,
                            ds_context_prod,
                            min_instances_slice=20,
                            initial_scope=correlated_var_set_scope)

                    independent_vars_scope.extend(correlated_var_set_scope)
                    prod_children.append(independent_var_set_prod_child)

            logging.info(f'correlated variables over entire remaining variables '
                         f'at prod, passed for next recursion: '
                         f'{next_remaining_vars_scope}')

            # check if all variables in current information set are consumed
            if all(var_scope in independent_vars_scope
                   for var_scope in curr_information_set_scope):
                index += 1
                next_information_set_scope = np.array(
                    range(next_scope_index,
                          next_scope_index + len(self.params.partial_order[index]))).tolist()

                # since current information set is totally consumed
                next_remaining_vars_scope = rest_set_scope

            else:
                # some variables in current information set still remain
                index = index
                next_information_set_scope = set(curr_information_set_scope) - set(independent_vars_scope)
                next_remaining_vars_scope = next_information_set_scope | set(rest_set_scope)

                # convert unordered sets of scope to sorted lists to keep in sync with partial order
                next_information_set_scope = sorted(list(next_information_set_scope))
                next_remaining_vars_scope = sorted(list(next_remaining_vars_scope))

            self.set_next_operation('Sum')

            next_remaining_vars_data = column_slice_data_by_scope(
                remaining_vars_data, remaining_vars_scope, next_remaining_vars_scope)

            logging.info(
                f'independence test completed for current information set {curr_information_set_scope} '
                f'and rest set {rest_set_scope} ')

            remaining_vars_prod_child = self.__learn_spmn_structure(
                next_remaining_vars_data, next_remaining_vars_scope,
                next_information_set_scope, index)

            prod_children.append(remaining_vars_prod_child)

            product_node = Product(children=prod_children)
            assign_ids(product_node)
            rebuild_scopes_bottom_up(product_node)

            logging.info(f'created product node')
            return product_node

        # Cluster the data
        else:
            curr_op = self.get_curr_operation()
            logging.debug(f'curr_op at sum node (cluster test): {curr_op}')

            split_rows = get_split_rows_KMeans()  # from SPMNHelper.py

            if self.cluster_by_curr_information_set:

                curr_information_set_data = column_slice_data_by_scope(
                    remaining_vars_data, remaining_vars_scope, curr_information_set_scope)

                ds_context_sum = get_ds_context(curr_information_set_data,
                                                curr_information_set_scope, self.params)

                data_slices_sum, km_model = split_rows(curr_information_set_data,
                                                       ds_context_sum,
                                                       curr_information_set_scope)

                logging.info(
                    f'split clusters based on current information set {curr_information_set_scope}')

            else:
                # cluster on whole remaining variables
                ds_context_sum = get_ds_context(remaining_vars_data,
                                                remaining_vars_scope, self.params)

                data_slices_sum, km_model = split_rows(remaining_vars_data,
                                                       ds_context_sum,
                                                       remaining_vars_scope)

                logging.info(
                    f'split clusters based on whole remaining variables {remaining_vars_scope}')

            sum_node_children = []
            weights = []
            index = index
            logging.debug(f'{len(data_slices_sum)} clusters found at data_slices_sum')

            cluster_num = 0
            labels_array = km_model.labels_
            logging.debug(f'cluster labels of rows: {labels_array} used to cluster data on '
                          f'total remaining variables {remaining_vars_scope}')

            for cluster, scope, weight in data_slices_sum:

                self.set_next_operation("Prod")

                # cluster whole remaining variables based on clusters formed.
                # below methods are useful if clusters were formed on just the current information set
                cluster_indices = get_row_indices_of_cluster(labels_array, cluster_num)
                cluster_on_remaining_vars = row_slice_data_by_indices(remaining_vars_data,
                                                                      cluster_indices)

                # logging.debug(np.array_equal(cluster_on_remaining_vars, cluster))

                sum_node_children.append(
                    self.__learn_spmn_structure(cluster_on_remaining_vars,
                                                remaining_vars_scope,
                                                curr_information_set_scope, index))

                weights.append(weight)
                cluster_num += 1

            sum_node = Sum(weights=weights, children=sum_node_children)
            assign_ids(sum_node)
            rebuild_scopes_bottom_up(sum_node)

            logging.info(f'created sum node')
            return sum_node
right = images2d[:, :, middle:].reshape((images.shape[0], -1))

# format: R|L
conditional_training_data = np.concatenate((right.reshape(px, -1), left.reshape(px, -1)),
                                           axis=1)  # In left, OUT right

file_cache_path = "/tmp/cspn.bin"
if not os.path.isfile(file_cache_path):
    spn_training_data = left.reshape(px, -1)
    spn_training_data = np.repeat(spn_training_data, 10, axis=0)
    ds_context = Context(parametric_types=[Bernoulli] * left.shape[1]).add_domains(spn_training_data)
    spn = learn_parametric(spn_training_data, ds_context, min_instances_slice=1)

    ds_context = Context(parametric_types=[Conditional_Bernoulli] * right.shape[1]).add_domains(right)
    scope = list(range(right.shape[1]))
    cspn = learn_conditional(conditional_training_data, ds_context, scope, min_instances_slice=60000000)

    with open(file_cache_path, 'wb') as f:
        pickle.dump((cspn, spn), f, pickle.HIGHEST_PROTOCOL)

with open(file_cache_path, 'rb') as f:
    cspn, spn = pickle.load(f)


def conditional_input_to_LR(input_images_in_rl):
    # format L|R
    images_to_lr = np.concatenate(
        (input_images_in_rl[:, input_images_in_rl.shape[1] // 2:].reshape(input_images_in_rl.shape[0], px, -1),
def learn_spmn_structure(train_data, index, scope_index, params):

    train_data = train_data
    curr_var_set = params.partial_order[index]

    if params.partial_order[index][0] in params.decision_nodes:

        decision_node = params.partial_order[index][0]
        cl, dec_vals = split_on_decision_node(train_data, curr_var_set)

        spn0 = []
        index = index + 1
        set_next_operation("None")

        for c in cl:
            if index < len(params.partial_order):
                spn0.append(learn_spmn_structure(c, index, scope_index, params))
                spn = Max(dec_values=dec_vals, children=spn0, feature_name=decision_node)
            else:
                spn = Max(dec_values=dec_vals, children=None, feature_name=decision_node)

        assign_ids(spn)
        rebuild_scopes_bottom_up(spn)
        return spn

    else:
        curr_train_data_prod, curr_train_data = get_curr_train_data_prod(train_data, curr_var_set)

        split_cols = get_split_cols_RDC_py()
        scope_prod = get_scope_prod(curr_train_data_prod, scope_index, params.feature_names)

        ds_context_prod = get_ds_context_prod(curr_train_data_prod, scope_prod, index,
                                              scope_index, params)
        data_slices_prod = split_cols(curr_train_data_prod, ds_context_prod, scope_prod)

        curr_op = get_next_operation()

        if len(data_slices_prod) > 1 or curr_op == "Prod" or index == len(params.partial_order):

            set_next_operation("Sum")

            if params.util_to_bin:
                spn0 = learn_parametric(curr_train_data_prod, ds_context_prod,
                                        min_instances_slice=20, initial_scope=scope_prod)
            else:
                spn0 = learn_mspn(curr_train_data_prod, ds_context_prod,
                                  min_instances_slice=20, initial_scope=scope_prod)

            index = index + 1
            scope_index = scope_index + curr_train_data_prod.shape[1]

            if index < len(params.partial_order):
                spn1 = learn_spmn_structure(curr_train_data, index, scope_index, params)
                spn = Product(children=[spn0, spn1])
                assign_ids(spn)
                rebuild_scopes_bottom_up(spn)
            else:
                spn = spn0
                assign_ids(spn)
                rebuild_scopes_bottom_up(spn)

        else:
            split_rows = get_split_rows_KMeans()
            scope_sum = list(range(train_data.shape[1]))

            ds_context_sum = get_ds_context_sum(train_data, scope_sum, index, scope_index, params)
            data_slices_sum = split_rows(train_data, ds_context_sum, scope_sum)

            spn0 = []
            weights = []
            index = index

            if index < len(params.partial_order):
                for cl, scop, weight in data_slices_sum:
                    set_next_operation("Prod")
                    spn0.append(learn_spmn_structure(cl, index, scope_index, params))
                    weights.append(weight)

                spn = Sum(weights=weights, children=spn0)
                assign_ids(spn)
                rebuild_scopes_bottom_up(spn)

            assign_ids(spn)
            rebuild_scopes_bottom_up(spn)

        return spn
features = ["birthyear", "gender", "party"] co_keys = [ "corona", "covid", "pandem", "vaccin", "Corona", "Covid", "Pandem", "Vaccin", "impf", "Impf", "Maske", "mask", "Lockdown", "infiz", "Infektio" ] fl_keys = [ "Migrat", "Asyl", "Flücht", "Schlepper", "Seenot", "Einwanderung", "asyl", "flücht", "schlepp", "seenot", "einwander" ] is_keys = ["Islamis", "islamis", "Terror", "terror"] keywords = [co_keys] train_data = get_features(memberlist, features, tweet_list, keywords) spn = build_spn(train_data) print(cross_validate(train_data, 5, label=2)) #print(sample_instances(spn, np.array([0, np.nan] * 50).reshape(-1, 2), RandomState(123))) # tweet_scraping(tweet_list, api) ex = np.array([1976., 1., 4., 0.3]).reshape(-1, 4) ex2 = np.array([4., 0.2]).reshape(-1, 2) ds_context = Context( parametric_types=[Gaussian, Categorical, Categorical, Gaussian ]).add_domains(train_data) spn2 = learn_parametric(train_data, ds_context, min_instances_slice=20) spn_marg = marginalize(spn, [2, 3]) ll = log_likelihood(spn, ex) ll2 = log_likelihood(spn2, ex) llm = log_likelihood(spn_marg, ex) print(ll, np.exp(ll)) print(ll2, np.exp(ll2)) print(llm, np.exp(llm))