def check_obj_and_reconstruction(self, obj):
    str_val = spn_to_str_equation(obj)
    obj_val = str_to_spn(str_val)
    self.assertEqual(str_val, spn_to_str_equation(obj_val))
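# Minimal round-trip sketch of what the helper above checks (assumes SPFlow's
# parametric text support has been registered, as in the __main__ snippet
# further down; the Gaussian leaf here is illustrative):
from spn.io.Text import spn_to_str_equation, str_to_spn
from spn.structure.leaves.parametric.Parametric import Gaussian
from spn.structure.leaves.parametric.Text import add_parametric_text_support

add_parametric_text_support()
g = Gaussian(mean=0.0, stdev=1.0, scope=[0])
# serializing, parsing back, and serializing again must be a fixed point:
assert spn_to_str_equation(g) == spn_to_str_equation(str_to_spn(spn_to_str_equation(g)))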
def test_eval_parametric(self):
    data = np.array([1, 1, 1, 1, 1, 1, 1], dtype=np.float32).reshape((1, 7))

    spn = (Gaussian(mean=1.0, stdev=1.0, scope=[0])
           * Exponential(l=1.0, scope=[1])
           * Gamma(alpha=1.0, beta=1.0, scope=[2])
           * LogNormal(mean=1.0, stdev=1.0, scope=[3])
           * Poisson(mean=1.0, scope=[4])
           * Bernoulli(p=0.6, scope=[5])
           * Categorical(p=[0.1, 0.2, 0.7], scope=[6]))

    ll = log_likelihood(spn, data)
    tf_ll = eval_tf(spn, data)
    self.assertTrue(np.all(np.isclose(ll, tf_ll)))

    spn_copy = Copy(spn)
    tf_graph, data_placeholder, variable_dict = spn_to_tf_graph(spn_copy, data, 1)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        tf_graph_to_spn(variable_dict)

    str_val = spn_to_str_equation(spn)
    str_val2 = spn_to_str_equation(spn_copy)
    self.assertEqual(str_val, str_val2)
def to_str():
    spn = create_SPN()
    spn_marg = marginalize()

    from spn.io.Text import spn_to_str_equation
    print(spn_to_str_equation(spn))
    print(spn_to_str_equation(spn_marg))
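# Sketch (not from the original snippet): spn.io.Text also exposes
# spn_to_str_ref_graph (used in save_exp further down), which renders the same
# SPN as a node/reference graph rather than one nested equation.
def to_str_ref_graph_sketch(spn):
    from spn.io.Text import spn_to_str_ref_graph
    print(spn_to_str_ref_graph(spn))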
def test_multiple_sum(self):
    spn = 0.6 * (0.4 * Gaussian(0.0, 1.0, scope=0) + 0.6 * Gaussian(2.0, 1.0, scope=0)) \
          + 0.4 * Gaussian(2.0, 1.0, scope=0)
    spn_text = ("(0.6*((0.4*(Gaussian(V0|mean=0.0;stdev=1.0)) + "
                "0.6*(Gaussian(V0|mean=2.0;stdev=1.0)))) + "
                "0.4*(Gaussian(V0|mean=2.0;stdev=1.0)))")
    print(spn_to_str_equation(spn))
    self.assertEqual(spn_to_str_equation(spn), spn_text)
def learn_CNET():
    import numpy as np

    np.random.seed(123)
    train_data = np.random.binomial(1, [0.1, 0.2, 0.3, 0.4], size=(1000, 4))
    print(np.mean(train_data, axis=0))

    from spn.structure.leaves.cltree.CLTree import create_cltree_leaf
    from spn.structure.Base import Context
    from spn.structure.leaves.parametric.Parametric import Bernoulli

    ds_context = Context(
        parametric_types=[Bernoulli, Bernoulli, Bernoulli, Bernoulli]
    ).add_domains(train_data)

    from spn.algorithms.LearningWrappers import learn_parametric, learn_cnet

    cnet_naive_mle = learn_cnet(train_data, ds_context, cond="naive_mle",
                                min_instances_slice=20, min_features_slice=1)
    cnet_random = learn_cnet(train_data, ds_context, cond="random",
                             min_instances_slice=20, min_features_slice=1)

    from spn.algorithms.Statistics import get_structure_stats
    from spn.io.Text import spn_to_str_equation
    from spn.algorithms.Inference import log_likelihood

    print(get_structure_stats(cnet_naive_mle))
    print(spn_to_str_equation(cnet_naive_mle))
    ll = log_likelihood(cnet_naive_mle, train_data)
    print(np.mean(ll))

    print(get_structure_stats(cnet_random))
    print(spn_to_str_equation(cnet_random))
    ll = log_likelihood(cnet_random, train_data)
    print(np.mean(ll))

    from spn.algorithms.MPE import mpe

    train_data_mpe = train_data.astype(float)
    train_data_mpe[:, 0] = np.nan
    print(mpe(cnet_random, train_data_mpe)[:30])

    ll = log_likelihood(cnet_random, train_data_mpe)
    print(np.mean(ll))
def test_sum(self):
    spn = 0.5 * Gaussian(0.0, 1.0, scope=0) + 0.5 * Gaussian(2.0, 1.0, scope=0)
    spn_text = "(0.5*(Gaussian(V0|mean=0.0;stdev=1.0)) + 0.5*(Gaussian(V0|mean=2.0;stdev=1.0)))"
    self.assertEqual(spn_to_str_equation(spn), spn_text)
def save_exp(spn, ds_name, size, words, data):
    print(get_structure_stats(spn))

    path = os.path.dirname(__file__)
    outprefix = path + "/spns/%s_%s/" % (ds_name, size)
    if not os.path.exists(outprefix):
        os.makedirs(outprefix)

    with open(outprefix + "eqq.txt", "w") as text_file:
        print(spn_to_str_equation(spn, words), file=text_file)

    with open(outprefix + "spn.txt", "w") as text_file:
        print(spn_to_str_ref_graph(spn, words), file=text_file)

    with codecs.open(outprefix + "spn.json", "w", "utf-8-sig") as text_file:
        text_file.write(to_JSON(spn))

    with codecs.open(outprefix + "stats.txt", "w", "utf-8-sig") as text_file:
        text_file.write(get_structure_stats(spn))
        text_file.write("\n")
        text_file.write("ads=%s \t muls=%s\n" % fpga_count_ops(spn))

    np.savetxt(outprefix + "all_data.txt", data, delimiter=";", header=";".join(words))
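# Sketch (not from the original): the "eqq.txt" written above can be parsed
# back with str_to_spn, provided the matching *_text_support registrations are
# in place when loading; the path argument is illustrative.
def load_exp_sketch(eqq_path):
    from spn.io.Text import str_to_spn
    with open(eqq_path) as f:
        return str_to_spn(f.read())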
def spn_to_ete(spn, context=None, unroll=False, symbols=_symbols):
    assert spn is not None

    tree = Tree()
    tree.id = spn.id
    tree.node_type = type(spn)
    tree.name = symbols.get(tree.node_type, spn.name)

    queue = []
    if not isinstance(spn, Leaf):
        for i, child in enumerate(spn.children):
            if unroll:
                if child in queue:
                    # node ids are ints, so convert before concatenating
                    return "-> " + str(spn.id)
                else:
                    queue.append(child)
            c = spn_to_ete(child, context=context, unroll=unroll)
            if isinstance(spn, Sum):
                c.support = spn.weights[i]
            tree.add_child(c)
    else:
        feature_names = None
        if context is not None:
            feature_names = context.feature_names
        try:
            tree.name = spn_to_str_equation(spn, feature_names=feature_names)
        except Exception:
            if feature_names is None:
                feature_names = []
            tree.name += "(%s)" % ",".join(feature_names)

    return tree
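# Usage sketch (an assumption, not from the original): spn_to_ete is backed by
# the ete3 Tree API, so the result can be rendered as ASCII art with
# get_ascii. The SPN below and the assign_ids call are illustrative.
from spn.structure.Base import assign_ids
from spn.structure.leaves.parametric.Parametric import Gaussian

spn = 0.5 * Gaussian(0.0, 1.0, scope=0) + 0.5 * Gaussian(2.0, 1.0, scope=0)
assign_ids(spn)  # give hand-built nodes distinct ids before visualizing
tree = spn_to_ete(spn)
print(tree.get_ascii(show_internal=True))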
def test_spn(self):
    spn = 0.4 * (Gaussian(0.0, 1.0, scope=0) * Gaussian(2.0, 3.0, scope=1)) + \
          0.6 * (Gaussian(4.0, 5.0, scope=0) * Gaussian(6.0, 7.0, scope=1))
    spn_text = ("(0.4*((Gaussian(V0|mean=0.0;stdev=1.0) * Gaussian(V1|mean=2.0;stdev=3.0))) + "
                "0.6*((Gaussian(V0|mean=4.0;stdev=5.0) * Gaussian(V1|mean=6.0;stdev=7.0))))")
    self.assertEqual(spn_to_str_equation(spn), spn_text)
def learn_CLTSPN():
    import numpy as np

    np.random.seed(123)
    train_data = np.random.binomial(
        1, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.1], size=(100, 10))
    print(np.mean(train_data, axis=0))

    from spn.structure.leaves.cltree.CLTree import create_cltree_leaf
    from spn.structure.Base import Context
    from spn.structure.leaves.parametric.Parametric import Bernoulli

    ds_context = Context(parametric_types=[
        Bernoulli, Bernoulli, Bernoulli, Bernoulli, Bernoulli,
        Bernoulli, Bernoulli, Bernoulli, Bernoulli, Bernoulli,
    ]).add_domains(train_data)

    from spn.algorithms.LearningWrappers import learn_parametric

    spn = learn_parametric(
        train_data,
        ds_context,
        min_instances_slice=20,
        min_features_slice=1,
        multivariate_leaf=True,
        leaves=create_cltree_leaf,
    )

    from spn.algorithms.Statistics import get_structure_stats
    print(get_structure_stats(spn))

    from spn.io.Text import spn_to_str_equation
    print(spn_to_str_equation(spn))

    from spn.algorithms.Inference import log_likelihood
    ll = log_likelihood(spn, train_data)
    print(np.mean(ll))
def to_cpp2(node):
    vartype = "double"

    # render the SPN as a single C++ expression; Histogram leaves become calls
    # to generated leaf_node_* functions over the data row
    spn_eqq = spn_to_str_equation(
        node,
        node_to_str={
            Histogram: lambda node, x, y: "leaf_node_%s(data[i][%s])" % (node.name, node.scope[0])
        })

    spn_function = """
{vartype} likelihood(int i, {vartype} data[][{scope_size}]){{
    return {spn_eqq};
}}
""".format(vartype=vartype, scope_size=len(node.scope), spn_eqq=spn_eqq)

    init_code = ""
    leaves_functions = ""
    for l in get_nodes_by_type(node, Leaf):
        leaf_name = "leaf_node_%s" % (l.name)
        leave_function, leave_init = _leaf_to_cpp[type(l)](l, leaf_name, vartype)
        leaves_functions += leave_function
        init_code += leave_init

    return """
#include <iostream>
#include <string>
#include <vector>
#include <cmath>
#include <boost/algorithm/string.hpp>
#include <boost/lexical_cast.hpp>
#include <iomanip>
#include <chrono>

using namespace std;

{leaves_functions}

{spn_function}

int main() {{
    {init_code}

    vector<string> lines;
    for (string line; getline(std::cin, line);) {{
        lines.push_back(line);
    }}

    int n = lines.size() - 1;
    int f = {scope_size};
    auto data = new {vartype}[n][{scope_size}]();

    for (int i = 0; i < n; i++) {{
        std::vector<std::string> strs;
        boost::split(strs, lines[i + 1], boost::is_any_of(";"));
        for (int j = 0; j < f; j++) {{
            data[i][j] = boost::lexical_cast<{vartype}>(strs[j]);
        }}
    }}

    auto result = new {vartype}[n];
    chrono::high_resolution_clock::time_point begin = chrono::high_resolution_clock::now();
    for (int j = 0; j < 1000; j++) {{
        for (int i = 0; i < n; i++) {{
            result[i] = likelihood(i, data);
        }}
    }}
    chrono::high_resolution_clock::time_point end = chrono::high_resolution_clock::now();
    delete[] data;

    long double avglikelihood = 0;
    for (int i = 0; i < n; i++) {{
        avglikelihood += log(result[i]);
        cout << setprecision(60) << log(result[i]) << endl;
    }}
    delete[] result;

    cout << setprecision(15) << "avg ll " << avglikelihood / n << endl;
    cout << "size of variables " << sizeof({vartype}) * 8 << endl;
    cout << setprecision(15) << "time per instance "
         << (chrono::duration_cast<chrono::nanoseconds>(end - begin).count() / 1000.0) / n
         << " ns" << endl;
    cout << setprecision(15) << "time per task "
         << (chrono::duration_cast<chrono::nanoseconds>(end - begin).count() / 1000.0)
         << " ns" << endl;

    return 0;
}}
""".format(spn_function=spn_function, vartype=vartype, leaves_functions=leaves_functions,
           scope_size=len(node.scope), init_code=init_code)
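# Sketch of driving to_cpp2 (illustrative, not from the original; the file
# name and compiler invocation are assumptions, and `spn` stands for an SPN
# over Histogram leaves):
cpp_source = to_cpp2(spn)
with open("spn_ll.cpp", "w") as f:
    f.write(cpp_source)
# e.g.:  g++ -O3 -std=c++11 spn_ll.cpp -o spn_ll   (requires the Boost headers)
# The binary reads ';'-separated rows on stdin, skipping the first (header) line.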
    scope=[0],
    init_weights=b_lf_1_init_weights)

b_lf_2_init_weights = {Gaussian: 0.3, Gamma: 0.7}
# b_lf_2_init_weights = np.array([.3, .7])
b_fat_right_leaf_2, _priors = type_mixture_leaf_factory(
    leaf_type='pm',
    leaf_meta_type=MetaType.REAL,
    type_to_param_map=pm_continuous_param_map,
    scope=[1],
    init_weights=b_lf_2_init_weights)
l_r_prod.children = [b_fat_right_leaf_1, b_fat_right_leaf_2]

#
# composing
rebuild_scopes_bottom_up(root)
assign_ids(root)
print(root)
print(spn_to_str_equation(root))

global_W = compute_global_type_weights(root)
print('GLOBAL_W', global_W)

global_W = compute_global_type_weights(root, aggr_type=True)
print('GLOBAL_W', global_W)

gw_map = compute_leaf_global_mix_weights(root)
print('G MIX W', gw_map)

part_map = compute_partition_id_map(root)
print('PARTITION MAP', part_map)
def test_prod(self):
    spn = Gaussian(0.0, 1.0, scope=0) * Gaussian(2.0, 1.0, scope=1)
    spn_text = "(Gaussian(V0|mean=0.0;stdev=1.0) * Gaussian(V1|mean=2.0;stdev=1.0))"
    self.assertEqual(spn_to_str_equation(spn), spn_text)
print(np.amax(testdata))
# testdata = whiten(testdata)
print(np.amax(testdata))

testdata = testdata.astype(np.float32)
print(testdata.dtype)

ll = log_likelihood(thespn, testdata)
print(ll, np.exp(ll))

optimized_spn = optimize_tf(thespn, testdata, epochs=100,
                            optimizer=tf.train.RMSPropOptimizer(1e-4))
lloptimized = log_likelihood(optimized_spn, testdata)
print(lloptimized, np.exp(lloptimized))

print(np.mean(lloptimized))
print(np.mean(ll))
# If done right, the first value will be better, as expected, since we optimized!

txt = spn_to_str_equation(optimized_spn)

# Uncomment if you wish to save the optimized structure:
'''
text_file = open("./optca.txt", "w")
text_file.write(txt)
text_file.close()
'''
def check_obj_and_reconstruction(self, obj):
    self.assertEqual(
        spn_to_str_equation(obj),
        spn_to_str_equation(str_to_spn(spn_to_str_equation(obj))))
        (Gamma, {'alpha': 20, 'beta': 2}),
        (Exponential, {'l': 5})],
}
ds_context.param_form_map = type_param_map

spn = learn_rand_spn(data, ds_context,
                     min_instances_slice=500,
                     row_a=2, row_b=5,
                     col_a=2, col_b=5,
                     col_threshold=0.3,
                     memory=None,
                     rand_gen=rand_gen)

add_parametric_text_support()
print(spn_to_str_equation(spn))
print(spn.scope)

#
# sampling again
X, _Z, P = sample_instances(spn, D, N, rand_gen,
                            return_Zs=True, return_partition=True,
                            dtype=np.float64)

#
# visualizing
stats = get_structure_stats_dict(spn)
inv_leaf_map = {l.id: spn_to_str_equation(l)  # l.__class__.__name__
                for l in get_nodes_by_type(spn, Leaf)}
title_str = "{} samples from spn with {} sums {} prods {} leaves".format(
    N, stats['sum'], stats['prod'],
def create_random_unconstrained_type_mixture_leaf(
        data, ds_context, scope,
        min_k=MIN_K_CAT, max_k=MAX_K_CAT,
        max_hyper_p_cat=MAX_HYPER_P_CAT,
        min_alpha=MIN_ALPHA_GAMMA, max_alpha=MAX_ALPHA_GAMMA):
    """
    Method to be employed by a LearnSPN-like pipeline to create a type leaf,
    based on context parameters
    """
    assert len(scope) == 1, "scope of univariate leaf for more than one variable?"
    assert data.shape[1] == 1, "data has more than one feature?"

    idx = scope[0]
    rand_gen = ds_context.rand_gen
    meta_type = ds_context.meta_types[idx]
    true_type = ds_context.types[idx]
    param_map = ds_context.param_form_map[meta_type]
    priors = ds_context.priors

    allowed_param_forms = []
    for tm, t_map in param_map.items():
        for p_class, p_map in t_map.items():
            allowed_param_forms.append((p_class, p_map))

    # n_param_forms = int(np.sum([len(t_map) for tm, t_map in param_map.items()]))
    n_param_forms = len(allowed_param_forms)
    print(n_param_forms, 'meta type', meta_type, 'true type', true_type,
          'allowed forms', allowed_param_forms)

    #
    # random init weights: only 1.0 over the true type
    # rand_init_weights = np.zeros(n_param_forms)
    rand_init_weights = {}
    allowed_types = np.array([
        PARAM_FORM_TYPE_MAP[p_c] == true_type
        for p_c, p_map in allowed_param_forms
    ], dtype=bool)
    n_types = int(allowed_types.sum())
    print('Allowed types', allowed_types, n_types)

    # map the compact index over allowed types back to the original position
    inv_type_map = {}
    j = 0
    for i, t in enumerate(allowed_types):
        if t:
            inv_type_map[j] = i
            j += 1

    nonzero_weight_id = rand_gen.choice(n_types)
    nonzero_weight_id = inv_type_map[nonzero_weight_id]
    for j, (p_c, _p_map) in enumerate(allowed_param_forms):
        if j == nonzero_weight_id:
            rand_init_weights[p_c] = 1.0
        else:
            rand_init_weights[p_c] = 0.0
    print('Selected weights', rand_init_weights)
    assert np.array([v for v in rand_init_weights.values()]).sum() == 1.0

    #
    # random defaults
    defaults = {
        Categorical: {
            'k': rand_gen.choice(range(min_k, max_k)),
            'hyper-p': rand_gen.choice(max_hyper_p_cat) + 1
        },
        Gamma: {
            'alpha': rand_gen.choice(range(min_alpha, max_alpha))
        }
    }
    print('\n\trandom default params for gamma and categorical:\n\t\t{}'.format(defaults))

    #
    # random parameters
    param_map = random_params_from_priors(param_map, defaults, priors, rand_gen)
    print('\n\trandom params drawn from priors:\n\t\t{}'.format(param_map))

    leaf, _leaf_prior = type_mixture_leaf_factory(
        leaf_type='pm',
        leaf_meta_type=meta_type,
        type_to_param_map=param_map,
        scope=scope,
        init_weights=rand_init_weights)

    print('\nCreated random type leaf: {}'.format(spn_to_str_equation(leaf)))

    return leaf
def fmt_chld(w, c):
    return str(w) + "*(" + spn_to_str_equation(c, feature_names, node_to_str) + ")"

children_strs = map(lambda i: fmt_chld(
from spn.algorithms.Inference import likelihood
from spn.io.Text import str_to_spn, to_JSON, spn_to_str_equation
from spn.structure.StatisticalTypes import MetaType
from spn.structure.leaves.parametric.Inference import add_parametric_inference_support
from spn.structure.leaves.parametric.Parametric import *
from spn.structure.leaves.parametric.Text import add_parametric_text_support

if __name__ == '__main__':
    add_parametric_text_support()
    add_parametric_inference_support()

    cat = Categorical(p=[0.1, 0.2, 0.7])
    cat.scope.append(0)
    print(spn_to_str_equation(cat))
    catspn = str_to_spn(spn_to_str_equation(cat))
    print(spn_to_str_equation(catspn))

    original = Gaussian(mean=0, stdev=10)
    original.scope.append(0)
    s = spn_to_str_equation(original)
    print(s)
    recovered = str_to_spn(s)

    print(str_to_spn("Gaussian(V0|mean=1;stdev=10)"))

    gamma = Gamma(alpha=1, beta=2)
    gamma.scope.append(0)
    print(spn_to_str_equation(gamma))
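    # Sketch (not in the original snippet): to_JSON is already imported above,
    # so the same leaf can also be dumped in its JSON form:
    print(to_JSON(gamma))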
def to_text(self):
    return spn_to_str_equation(self.spn)
                     row_b=args.beta_rows[1],
                     col_a=args.beta_cols[0],
                     col_b=args.beta_cols[1],
                     col_threshold=args.col_split_threshold,
                     memory=None,
                     rand_gen=rand_gen)
rebuild_scopes_bottom_up(spn)
assign_ids(spn)
learn_end_t = perf_counter()

stats = get_structure_stats_dict(spn)
logging.info('\n\nLearned spn in {} with stats:\n\t{}'.format(
    learn_end_t - learn_start_t, stats))
print(spn_to_str_equation(spn))
print(spn.scope)

#
# storing the spn on file
spn_output_path = os.path.join(out_path, 'spn.model.pkl')
store_start_t = perf_counter()
with open(spn_output_path, 'wb') as f:
    pickle.dump(spn, f)
store_end_t = perf_counter()
logging.info('Stored spn to {} (in {} secs)'.format(
    spn_output_path, store_end_t - store_start_t))

#
# actual sampling, generating the data
# returning a partition matrix (discarding the Zs?)
)
learn_end_t = perf_counter()
learning_time = learn_end_t - learn_start_t
logging.info('\n\nLearned spn in {} secs\n\t with stats:\n\t{}'.format(
    learning_time, get_structure_stats_dict2(spn)))
dump_obj(out_path, 'spn.model.pkl', spn)

add_typed_leaves_text_support()
add_parametric_inference_support()
add_histogram_inference_support()
add_histogram_text_support()
add_piecewise_text_support()
add_piecewise_inference_support()

logging.info(spn_to_str_equation(spn))
logging.info(spn.scope)

infer_start_t = perf_counter()
infer_end_t = perf_counter()
# print('Done in {}'.format(infer_end_t - infer_start_t))

samples = []
sample = {}

if X_miss is None:
    X = X_train
    logging.info('Training on original train')
else:
    logging.info('Training on train split with MISSING VALUES')
if __name__ == '__main__':
    add_parametric_inference_support()
    add_parametric_text_support()

    np.random.seed(42)
    data = np.random.randint(low=0, high=3, size=600).reshape(-1, 3)
    # print(data)

    ds_context = Context(
        meta_types=[MetaType.DISCRETE, MetaType.DISCRETE, MetaType.DISCRETE])
    ds_context.add_domains(data)
    ds_context.parametric_types = [Poisson, Poisson, Categorical]

    # one mixture branch per class label, weighted by the class frequency
    spn = Sum()
    for label, count in zip(*np.unique(data[:, 2], return_counts=True)):
        branch = learn_parametric(data[data[:, 2] == label, :], ds_context,
                                  min_instances_slice=10000)
        spn.children.append(branch)
        spn.weights.append(count / data.shape[0])
    # extend the scope once, outside the loop, to avoid duplicated entries
    spn.scope.extend(branch.scope)

    print(spn)
    print(spn_to_str_equation(spn))
    print(log_likelihood(spn, data))
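    # Sketch (an assumption, mirroring the mpe usage in learn_CNET above): the
    # class-conditional mixture can act as a classifier by MPE-completing a
    # NaN-ed out label column. The add_parametric_mpe_support registration and
    # the assign_ids call are assumed requirements, not from this snippet.
    from spn.algorithms.MPE import mpe
    from spn.structure.Base import assign_ids
    from spn.structure.leaves.parametric.MPE import add_parametric_mpe_support

    add_parametric_mpe_support()
    assign_ids(spn)  # hand-built nodes need distinct ids before evaluation
    test_data = data.astype(float)
    test_data[:, 2] = np.nan
    print(mpe(spn, test_data)[:10, 2])  # predicted labels for the first 10 rows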