def test_Histogram_discrete_inference(self):
    """Check histogram-leaf likelihoods on a tiny discrete sample.

    The sample is ``[1, 1, 2, 3, 3, 3]``. Each case pairs an ``alpha``
    setting with the probability asserted for every row of the sample.
    """
    cases = (
        (False, [2 / 6, 2 / 6, 1 / 6, 3 / 6, 3 / 6, 3 / 6]),
        (True, [3 / 9, 3 / 9, 2 / 9, 4 / 9, 4 / 9, 4 / 9]),
    )
    for alpha, expected in cases:
        data = np.array([1, 1, 2, 3, 3, 3]).reshape(-1, 1)
        ds_context = Context([MetaType.DISCRETE])
        ds_context.add_domains(data)

        hist = create_histogram_leaf(data, ds_context, [0], alpha=alpha)
        prob = np.exp(log_likelihood(hist, data))

        for row, want in enumerate(expected):
            self.assertAlmostEqual(float(prob[row]), want)
def get_ds_context_sum(curr_train_data, scope, index, scope_index, params):
    """Return the spflow Context used with split_rows when creating an SPMN sum node.

    Feature names are the variables of every set in ``params.partial_order``
    from position ``index`` to the end, flattened into a single list.

    :param curr_train_data: 2-D array; its column count fixes how many types are declared.
    :param scope: passed to ``Context`` unchanged.
    :param index: first position of ``params.partial_order`` to include.
    :param scope_index: not used by this function; kept for interface compatibility.
    :param params: object providing ``partial_order``, ``util_to_bin`` and ``utility_node``.
    :return: the result of ``Context(...).add_domains(curr_train_data)``.
    """
    n = curr_train_data.shape[1]
    feature_names = [
        var for var_set in params.partial_order[index:] for var in var_set
    ]

    if params.util_to_bin:
        # Parametric case: every column is declared Categorical.
        type_kwargs = {"parametric_types": [Categorical] * n}
    elif params.utility_node[0] in feature_names:
        # Mixed case: the utility is meta type REAL and is declared last.
        type_kwargs = {
            "meta_types": [MetaType.DISCRETE] * (n - 1) + [MetaType.REAL]
        }
    else:
        type_kwargs = {"meta_types": [MetaType.DISCRETE] * n}

    return Context(
        scope=scope, feature_names=feature_names, **type_kwargs
    ).add_domains(curr_train_data)
def get_ds_context_prod(curr_train_data, scope, index, scope_index, params):
    """Return the spflow Context used when creating an SPMN product node.

    The docstring of the original names split_cols, learn_mspn and
    learn_parametric as the consumers of this Context. Feature names are the
    ``n`` entries of ``params.feature_names`` starting at ``scope_index``,
    where ``n`` is the column count of ``curr_train_data``.

    :param curr_train_data: 2-D array; its column count fixes how many types are declared.
    :param scope: passed to ``Context`` unchanged.
    :param index: not used by this function; kept for interface compatibility.
    :param scope_index: offset of the first feature name in ``params.feature_names``.
    :param params: object providing ``feature_names``, ``util_to_bin`` and ``utility_node``.
    :return: the result of ``Context(...).add_domains(curr_train_data)``.
    """
    n = curr_train_data.shape[1]
    scope_var = params.feature_names[scope_index:scope_index + n]

    if params.util_to_bin:
        # Parametric case: every column is declared Categorical.
        type_kwargs = {"parametric_types": [Categorical] * n}
    elif params.utility_node[0] in scope_var:
        # Mixed case: the utility is meta type REAL and is declared last.
        type_kwargs = {
            "meta_types": [MetaType.DISCRETE] * (n - 1) + [MetaType.REAL]
        }
    else:
        type_kwargs = {"meta_types": [MetaType.DISCRETE] * n}

    return Context(
        scope=scope, feature_names=scope_var, **type_kwargs
    ).add_domains(curr_train_data)
def run(self, run: int, n_folds: int, fold_log: bool):
    """Load the ten pre-split ARFF folds of ``self.data_name`` and apply KDE.

    :param run: run number, used only to build the results directory.
    :param n_folds: forwarded to ``KDE.apply`` and used in the results path;
        the loader itself always reads folds 1..10 from ``10_folds``.
    :param fold_log: forwarded to ``KDE.apply``.
    """
    fold_dir = "../../../data/continuous/" + self.data_name + "/10_folds/"

    def load_split(fold, split):
        # loadarff returns a sequence whose first item is the record data.
        path = fold_dir + self.data_name + "_" + str(fold) + "_" + split + ".arff"
        return pd.DataFrame(arff.loadarff(path)[0]).values

    train_datasets, test_datasets, ds_contexts = [], [], []

    # Prepare folds' data
    for fold in range(1, 11):
        train_datasets.append(load_split(fold, "train"))
        test_datasets.append(load_split(fold, "test"))

        # Create context for MSPN algorithm (collected here, not passed to KDE)
        ds_contexts.append(Context(self.meta_types))

    # Apply KDE
    results_path = (
        "../../../results/run_" + str(run)
        + "/continuous/" + self.data_name
        + "/" + str(n_folds) + "_folds/KDE/"
    )
    KDE.apply(
        train_datasets,
        self.var_types_string,
        test_datasets,
        n_folds,
        results_path,
        self.data_name,
        fold_log,
    )
def train(args):
    """Learn one parametric SPN per speaker and pickle it under ``args.MODEL_DIR``.

    A speaker whose ``<spk_id>.p`` file already exists is skipped. Otherwise
    an empty-list placeholder is pickled first, features are extracted with
    ``featpy.lsse``, the SPN is learned and the placeholder is overwritten.

    :param args: namespace providing ``spk_list``, ``MODEL_DIR``, the feature
        settings (``Nw``, ``Ns``, ``NFFT``, ``fs``, ``H``, ``M``) and the
        learning settings (``min_instances_slice``, ``threshold``, ``ncores``).
    """
    print('Training...')
    for idx, speaker in enumerate(args.spk_list):
        spn_path = args.MODEL_DIR + '/' + speaker['spk_id'] + '.p'
        if os.path.isfile(spn_path):
            continue

        # Placeholder so the file exists while learning runs
        # (presumably lets concurrent workers skip this speaker — confirm).
        with open(spn_path, 'wb') as f:
            pickle.dump([], f)

        print(chr(27) + "[2J")  # ANSI clear-screen sequence
        print(
            "Learn structure, spk: %i (%s)... (min_instances_slice: %i, threshold: %1.3f)."
            % (idx, speaker['spk_id'], args.min_instances_slice, args.threshold))

        train_batch = featpy.lsse(
            speaker['train_clean_speech'],
            speaker['train_clean_speech_len'],
            args.Nw, args.Ns, args.NFFT, args.fs, args.H)
        print("Features extracted.")

        ds_context = Context(
            parametric_types=[Gaussian] * args.M).add_domains(train_batch)
        with silence():
            spn_spk = learn_parametric(
                train_batch,
                ds_context,
                min_instances_slice=args.min_instances_slice,
                threshold=args.threshold,
                cpus=args.ncores)

        with open(spn_path, 'wb') as f:
            pickle.dump(spn_spk, f)
def learn_parametric_spn(data, parametric_types):
    """Learn a parametric SPN from ``data``.

    :param data: training data passed to ``add_domains`` and ``learn_parametric``.
    :param parametric_types: leaf distribution types, one per column, passed to ``Context``.
    :return: the SPN returned by ``learn_parametric`` with
        ``min_instances_slice=100`` and ``threshold=0.01``.
    """
    from spn.algorithms.LearningWrappers import learn_parametric

    # add_domains is called once; a second call on the same data was redundant.
    ds_context = Context(parametric_types=parametric_types).add_domains(data)
    return learn_parametric(data, ds_context, min_instances_slice=100, threshold=0.01)
def test_Histogram_expectations(self):
    """The expectation of a histogram leaf should match the sample mean to 3 places.

    Checked once for standard-normal data (REAL) and once for uniform
    integers in [0, 100) (DISCRETE).
    """
    def assert_expectation_matches_mean(data, meta_type):
        ds_context = Context(meta_types=[meta_type])
        ds_context.add_domains(data)
        leaf = create_histogram_leaf(data, ds_context, scope=[0])
        expectation = Expectation(leaf, {0})
        self.assertAlmostEqual(np.mean(data[:, 0]), expectation[0, 0], 3)

    assert_expectation_matches_mean(
        np.random.randn(20000).reshape(-1, 1), MetaType.REAL)
    assert_expectation_matches_mean(
        np.random.randint(0, high=100, size=20000).reshape(-1, 1), MetaType.DISCRETE)
def fit(self, X, y=None):
    """Build a conditional SPN for labels ``y`` given features ``X`` and print it.

    ``y`` is reshaped to two dimensions; each label column is modelled as
    Bernoulli. The result is stored on ``self.cspn``; nothing is returned.

    :param X: feature array; its column count becomes ``context.feature_size``.
    :param y: label array. Despite the ``None`` default it is required,
        because it is reshaped immediately.
    """
    y = y.reshape(y.shape[0], -1)
    n_labels = y.shape[1]
    self.num_labels = n_labels

    self.context = Context(parametric_types=[Bernoulli] * n_labels).add_domains(y)
    self.context.feature_size = X.shape[1]
    self.scope = list(range(n_labels))

    data = concatenate_yx(y, X)

    # Developer toggle: True models all labels with one conditional leaf,
    # False builds a sum node whose children are conditional leaves.
    single_leaf = False
    if single_leaf:
        self.cspn = create_conditional_leaf(data, self.context, self.scope)
    else:
        self.cspn, subtasks = create_sum(
            data=data,
            node_id=0,
            parent_id=0,
            pos=0,
            context=self.context,
            scope=self.scope,
            split_rows=get_split_conditional_rows_KMeans(),
        )
        for child_pos, subtask in enumerate(subtasks):
            task_args = subtask[1]
            self.cspn.children[child_pos] = create_conditional_leaf(
                task_args['data'], self.context, task_args['scope'])

    print(self.cspn)
def test_leaf_mpe_bernoulli(self):
    """MPE of a conditional Bernoulli leaf should recover the label tied to each cluster.

    y=0 is associated with X around [10, 10] and y=1 with X around [1, 1].
    A query row whose label is already set (not NaN) must raise AssertionError.
    """
    np.random.seed(17)
    cluster_hi = np.random.multivariate_normal([10, 10], np.eye(2), 5000)
    cluster_lo = np.random.multivariate_normal([1, 1], np.eye(2), 5000)
    x = np.concatenate((cluster_hi, cluster_lo), axis=0)
    y = np.array([0] * 5000 + [1] * 5000).reshape(-1, 1)

    data = concatenate_yx(y, x)

    ds_context = Context(parametric_types=[Bernoulli])
    ds_context.feature_size = 2
    leaf = create_conditional_leaf(data, ds_context, [0])

    at_hi = [np.nan, 10, 10]
    at_lo = [np.nan, 1, 1]

    res = mpe(leaf, np.array([at_hi]))
    self.assertAlmostEqual(res[0, 0], 0)

    res = mpe(leaf, np.array([at_lo]))
    self.assertAlmostEqual(res[0, 0], 1)

    res = mpe(leaf, np.array([at_lo, at_hi]))
    self.assertAlmostEqual(res[0, 0], 1)
    self.assertAlmostEqual(res[1, 0], 0)

    with self.assertRaises(AssertionError):
        mpe(leaf, np.array([at_lo, at_hi, [5, 10, 10]]))
def test_leaf_mpe_gaussian(self):
    """MPE of a Gaussian leaf on y should give the asserted value for each X cluster.

    y around 20 is associated with X around [10, 10] and y around 60 with X
    around [1, 1]. A query row whose y is already set must raise AssertionError.
    """
    np.random.seed(17)
    cluster_hi = np.random.multivariate_normal([10, 10], np.eye(2), 5000)
    cluster_lo = np.random.multivariate_normal([1, 1], np.eye(2), 5000)
    x = np.concatenate((cluster_hi, cluster_lo), axis=0)

    y_for_hi = np.random.normal(20, 2, 5000).tolist()
    y_for_lo = np.random.normal(60, 2, 5000).tolist()
    y = np.array(y_for_hi + y_for_lo).reshape(-1, 1)

    data = concatenate_yx(y, x)

    ds_context = Context(parametric_types=[Gaussian])
    ds_context.feature_size = 2

    # NOTE(review): a create_conditional_leaf call was commented out in favour
    # of create_parametric_leaf here — confirm which factory the expected
    # values below were produced with.
    leaf = create_parametric_leaf(data, ds_context, [0])

    at_hi = [np.nan, 10, 10]
    at_lo = [np.nan, 1, 1]
    expected_hi = 20.435226001909466
    expected_lo = 59.4752193542575

    res = mpe(leaf, np.array([at_hi]))
    self.assertAlmostEqual(res[0, 0], expected_hi)

    res = mpe(leaf, np.array([at_lo]))
    self.assertAlmostEqual(res[0, 0], expected_lo)

    res = mpe(leaf, np.array([at_lo, at_hi]))
    self.assertAlmostEqual(res[0, 0], expected_lo)
    self.assertAlmostEqual(res[1, 0], expected_hi)

    with self.assertRaises(AssertionError):
        mpe(leaf, np.array([at_lo, at_hi, [5, 10, 10]]))
def fit(self, X, y=None):
    """Learn a conditional SPN structure for labels ``y`` given features ``X``.

    The context is built from ``self.parametric_types`` with domains taken
    from ``y``. Rows are split with the conditional Gower splitter, columns
    with ``getCIGroup(alpha=self.alpha)``, and leaves are conditional leaves.
    Extra options in ``self.kwargs`` are forwarded to ``learn_cspn_structure``.

    :param X: feature array; its column count becomes ``context.feature_size``.
    :param y: 2-D label array. Despite the ``None`` default it is required,
        because ``y.shape[1]`` is read.
    :return: ``self``.
    """
    self.context = Context(
        parametric_types=self.parametric_types).add_domains(y)
    self.context.feature_size = X.shape[1]
    self.num_labels = y.shape[1]

    def label_conditional(y, x):
        """Cluster the feature rows ``x`` into two groups with KMeans."""
        from sklearn.cluster import KMeans

        # precompute_distances is not passed: scikit-learn deprecated the
        # argument and removed it in 1.0, where it raises TypeError.
        return KMeans(n_clusters=2, random_state=17).fit_predict(x)

    self.cspn = learn_cspn_structure(
        concatenate_yx(y, X),
        self.context,
        split_rows=get_split_rows_conditional_Gower(),
        split_cols=getCIGroup(alpha=self.alpha),
        create_leaf=create_conditional_leaf,
        label_conditional=label_conditional,
        **self.kwargs)

    return self
def test_sample_range(self):
    """Samples drawn from a piecewise leaf must respect the requested numeric ranges.

    Three ranges are checked: the single point 20, the interval [20, 100],
    and the union of [10, 13] and [20, 100].
    """
    np.random.seed(10)
    data = np.random.normal(20, scale=5, size=1000).reshape((1000, 1))
    numpy_data = np.array(data, np.float64)
    meta_types = [MetaType.REAL]
    domains = [[np.min(numpy_data[:, 0]), np.max(numpy_data[:, 0])]]
    ds_context = Context(meta_types=meta_types, domains=domains)
    rand_gen = np.random.RandomState(100)
    pwl = create_piecewise_leaf(data, ds_context, scope=[0], prior_weight=None)

    # Point range: every sample equals 20.
    ranges = np.array([NumericRange([[20]])])
    samples = SamplingRange.sample_piecewise_node(pwl, 10, rand_gen, ranges)
    self.assertEqual(len(samples), 10)
    self.assertAlmostEqual(np.average(samples), 20)

    # Single interval: compare the values themselves. Indexing with the mask
    # and testing truthiness, as before, could never fail.
    ranges = np.array([NumericRange([[20, 100]])])
    samples = SamplingRange.sample_piecewise_node(pwl, 10, rand_gen, ranges)
    self.assertTrue(np.all(samples >= 20))
    self.assertTrue(np.all(samples <= 100))

    # Two intervals: nothing may fall in the gap (13, 20) or below 10.
    ranges = np.array([NumericRange([[10, 13], [20, 100]])])
    samples = SamplingRange.sample_piecewise_node(pwl, 10, rand_gen, ranges)
    self.assertFalse(np.any((samples > 13) & (samples < 20)))
    self.assertFalse(np.any(samples < 10))
def test_conditional_probability(self):
    """Check conditional_probability against hand-computed values on a fixed SPN.

    The SPN is a two-component mixture over three categorical variables.
    Both components share the same sub-mixture over scopes 1 and 2, which is
    built from fresh nodes for each component.
    """
    # only for generating the ds_context
    train_data = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [2.0, 0.0, 1.0]])

    ds_context = Context(meta_types=[MetaType.DISCRETE] * 3)
    ds_context.add_domains(train_data)
    ds_context.parametric_type = [Categorical] * 3

    def mixture_over_1_and_2():
        # New node objects on every call, exactly as the literal form builds them.
        first = 0.34 * (
            Categorical(p=[7 / 34, 27 / 34], scope=1) * Categorical(p=[1.0, 0.0], scope=2)
        )
        second = 0.66 * (
            Categorical(p=[21 / 22, 1 / 22], scope=1) * Categorical(p=[0.0, 1.0], scope=2)
        )
        return first + second

    component_a = 0.64 * (
        Categorical(p=[0.25, 0.75, 0.0], scope=0) * mixture_over_1_and_2()
    )
    component_b = 0.36 * (
        Categorical(p=[0.0, 0.0, 1.0], scope=0) * mixture_over_1_and_2()
    )
    spn = component_a + component_b

    # tests
    x_instance = np.array([1, 1, 0], dtype=float).reshape(1, -1)
    self.assertAlmostEqual(conditional_probability(spn, 2, x_instance)[0][0], 0.9)
    self.assertAlmostEqual(conditional_probability(spn, 0, x_instance)[0][0], 0.48)

    x_instance = np.array([2, 1, 0], dtype=float).reshape(1, -1)
    self.assertAlmostEqual(conditional_probability(spn, 0, x_instance)[0][0], 0.36)
def test_histogram_samples(self):
    """Learn an MSPN on synthetic mixed data, draw 100 samples and print their ranges.

    Nothing is asserted; the per-column max/min of the samples and the
    context domains are printed for inspection.
    """
    import numpy as np
    from numpy.random.mtrand import RandomState
    from spn.algorithms.Sampling import sample_instances
    from spn.structure.Base import Context
    from spn.structure.StatisticalTypes import MetaType
    from spn.algorithms.LearningWrappers import learn_mspn

    np.random.seed(123)

    # Two discrete columns, one two-component Gaussian column, and a column
    # that is a linear combination of the other three.
    a = np.random.randint(2, size=10000).reshape(-1, 1)
    b = np.random.randint(3, size=10000).reshape(-1, 1)
    c_low = np.random.normal(10, 5, (3000, 1))
    c_high = np.random.normal(20, 10, (7000, 1))
    c = np.r_[c_low, c_high]
    d = 5 * a + 3 * b + c
    train_data = np.c_[a, b, c, d]

    meta_types = [MetaType.DISCRETE, MetaType.DISCRETE, MetaType.REAL, MetaType.REAL]
    ds_context = Context(meta_types=meta_types).add_domains(train_data)
    mspn = learn_mspn(train_data, ds_context, min_instances_slice=200)

    # 100 rows of all-NaN: every variable is to be sampled.
    query = np.full((100, 4), np.nan)
    samples = sample_instances(mspn, query, RandomState(123))

    print(np.max(samples, axis=0), np.min(samples, axis=0))
    print(ds_context.domains)
def test_leaf_bernoulli_bootstrap(self):
    """A conditional Bernoulli leaf must favour the true label and be normalised.

    For every row, the likelihoods of the true label and of the flipped
    label must sum to 1, with the true label at or above 0.5.
    """
    np.random.seed(17)
    cluster_hi = np.random.multivariate_normal([10, 10], np.eye(2), 100)
    cluster_lo = np.random.multivariate_normal([1, 1], np.eye(2), 100)
    x = np.concatenate((cluster_hi, cluster_lo), axis=0)
    y = np.array([1] * 100 + [0] * 100).reshape(-1, 1)

    data = concatenate_yx(y, x)

    ds_context = Context(parametric_types=[Bernoulli])
    ds_context.feature_size = 2
    leaf = create_conditional_leaf(data, ds_context, [0])

    lik_true = likelihood(leaf, data)
    flipped = np.concatenate([1 - y, x], axis=1)
    lik_flipped = likelihood(leaf, flipped)

    np.testing.assert_array_almost_equal(lik_true + lik_flipped, 1.0)
    self.assertTrue(np.all(lik_true >= 0.5))
    self.assertTrue(np.all(lik_flipped < 0.5))
def test_leaf_categorical(self):
    """A conditional Categorical leaf must separate three well-spaced clusters.

    Rows 0-499 carry label 2 (X around [20, 20]), rows 500-999 label 1
    (X around [10, 10]) and rows 1000-1499 label 0 (X around [1, 1]).
    For each label the likelihood must exceed 0.85 on its own cluster and
    stay below 0.15 elsewhere, and the three likelihoods must sum to 1.
    """
    np.random.seed(17)
    x = np.concatenate(
        (
            np.random.multivariate_normal([20, 20], np.eye(2), 500),
            np.random.multivariate_normal([10, 10], np.eye(2), 500),
            np.random.multivariate_normal([1, 1], np.eye(2), 500),
        ),
        axis=0,
    )
    y = np.array([2] * 500 + [1] * 500 + [0] * 500).reshape(-1, 1)

    data = concatenate_yx(y, x)

    ds_context = Context(parametric_types=[Categorical])
    ds_context.feature_size = 2

    leaf = create_conditional_leaf(data, ds_context, [0])

    l0 = likelihood(leaf, concatenate_yx(np.ones_like(y) * 0, x))
    l1 = likelihood(leaf, concatenate_yx(np.ones_like(y) * 1, x))
    l2 = likelihood(leaf, concatenate_yx(np.ones_like(y) * 2, x))

    np.testing.assert_array_almost_equal(l0 + l1 + l2, 1.0)

    self.assertTrue(np.all(l0[1000:1500] > 0.85))
    self.assertTrue(np.all(l0[0:1000] < 0.15))

    self.assertTrue(np.all(l1[500:1000] > 0.85))
    self.assertTrue(np.all(l1[0:500] < 0.15))
    self.assertTrue(np.all(l1[1000:1500] < 0.15))

    self.assertTrue(np.all(l2[0:500] > 0.85))
    # Upper bound is 1500, the row count; it previously read 15000.
    self.assertTrue(np.all(l2[500:1500] < 0.15))
def test_leaf_gaussian(self):
    """A conditional Gaussian leaf must score matching (y, X) pairs above mismatched ones.

    y around 20 is associated with X around [10, 10] and y around 60 with X
    around [1, 1]. Values are read through ``get_ll``, a helper defined
    outside this block.
    """
    np.random.seed(17)
    cluster_hi = np.random.multivariate_normal([10, 10], np.eye(2), 5000)
    cluster_lo = np.random.multivariate_normal([1, 1], np.eye(2), 5000)
    x = np.concatenate((cluster_hi, cluster_lo), axis=0)

    y_for_hi = np.random.normal(20, 2, 5000).tolist()
    y_for_lo = np.random.normal(60, 2, 5000).tolist()
    y = np.array(y_for_hi + y_for_lo).reshape(-1, 1)

    data = concatenate_yx(y, x)

    ds_context = Context(parametric_types=[Gaussian])
    ds_context.feature_size = 2
    leaf = create_conditional_leaf(data, ds_context, [0])

    self.assertFalse(np.any(np.isnan(likelihood(leaf, data))))

    matched_hi = [20, 10, 10]
    matched_lo = [60, 1, 1]
    self.assertGreater(get_ll(leaf, matched_hi), get_ll(leaf, [20, 1, 1]))
    self.assertGreater(get_ll(leaf, matched_lo), get_ll(leaf, [60, 10, 10]))

    self.assertAlmostEqual(get_ll(leaf, matched_lo), 0.3476232862652)
    self.assertAlmostEqual(get_ll(leaf, matched_hi), 0.3628922322773634)
def test_leaf_no_variance_gaussian(self):
    """A conditional Gaussian leaf fitted on constant y must give a constant likelihood.

    The leaf is fitted with y=1, then y=2 (both evaluated on their own
    data), and finally with y=3 but evaluated on the y=2 data.
    """
    np.random.seed(17)
    cluster_hi = np.random.multivariate_normal([10, 10], np.eye(2), 500)
    cluster_lo = np.random.multivariate_normal([1, 1], np.eye(2), 500)
    x = np.concatenate((cluster_hi, cluster_lo), axis=0)
    y = np.array([1] * 1000).reshape(-1, 1)

    data = concatenate_yx(y, x)

    ds_context = Context(parametric_types=[Gaussian])
    ds_context.feature_size = 2

    at_mean = 0.398942280401432

    # y == 1 everywhere
    leaf = create_conditional_leaf(data, ds_context, [0])
    lik = likelihood(leaf, data)
    self.assertEqual(np.var(lik[:, 0]), 0)
    self.assertAlmostEqual(lik[0, 0], at_mean)

    # y == 2 everywhere (data is modified in place)
    data[:, 0] = 2
    leaf = create_conditional_leaf(data, ds_context, [0])
    lik = likelihood(leaf, data)
    self.assertEqual(np.var(lik[:, 0]), 0)
    self.assertAlmostEqual(lik[0, 0], at_mean)

    # fitted on y == 3, evaluated on the y == 2 data
    data3 = np.array(data)
    data3[:, 0] = 3
    leaf = create_conditional_leaf(data3, ds_context, [0])
    lik = likelihood(leaf, data)
    self.assertAlmostEqual(np.var(lik[:, 0]), 0)
    self.assertAlmostEqual(lik[0, 0], 0.241970724519143)
def learn_MSPN():
    """Example: learn a mixed SPN on synthetic data and print its structure statistics."""
    import numpy as np
    from spn.structure.Base import Context
    from spn.structure.StatisticalTypes import MetaType
    from spn.algorithms.LearningWrappers import learn_mspn
    from spn.algorithms.Statistics import get_structure_stats

    np.random.seed(123)

    # Two discrete columns, one two-component Gaussian column, and a column
    # that is a linear combination of the other three.
    a = np.random.randint(2, size=1000).reshape(-1, 1)
    b = np.random.randint(3, size=1000).reshape(-1, 1)
    c_low = np.random.normal(10, 5, (300, 1))
    c_high = np.random.normal(20, 10, (700, 1))
    c = np.r_[c_low, c_high]
    d = 5 * a + 3 * b + c
    train_data = np.c_[a, b, c, d]

    meta_types = [MetaType.DISCRETE, MetaType.DISCRETE, MetaType.REAL, MetaType.REAL]
    ds_context = Context(meta_types=meta_types).add_domains(train_data)

    mspn = learn_mspn(train_data, ds_context, min_instances_slice=20)

    print(get_structure_stats(mspn))
def learn_PSPN():
    """Example: learn a parametric SPN on synthetic data and print its structure statistics."""
    import numpy as np
    from spn.structure.Base import Context
    from spn.structure.leaves.parametric.Parametric import Categorical, Gaussian
    from spn.algorithms.LearningWrappers import learn_parametric
    from spn.algorithms.Statistics import get_structure_stats

    np.random.seed(123)

    # Two discrete columns, one two-component Gaussian column, and a column
    # that is a linear combination of the other three.
    a = np.random.randint(2, size=1000).reshape(-1, 1)
    b = np.random.randint(3, size=1000).reshape(-1, 1)
    c_low = np.random.normal(10, 5, (300, 1))
    c_high = np.random.normal(20, 10, (700, 1))
    c = np.r_[c_low, c_high]
    d = 5 * a + 3 * b + c
    train_data = np.c_[a, b, c, d]

    leaf_types = [Categorical, Categorical, Gaussian, Gaussian]
    ds_context = Context(parametric_types=leaf_types).add_domains(train_data)

    spn = learn_parametric(train_data, ds_context, min_instances_slice=20)

    print(get_structure_stats(spn))
def classification():
    """Example: plot two Gaussian clusters, learn an SPN classifier, and print MPE labels.

    The training scatter plot is saved to ``classification_training_data.png``.
    Two query points with a NaN label are then completed with ``mpe``.
    """
    import numpy as np
    import matplotlib.pyplot as plt

    np.random.seed(123)
    features = np.r_[np.random.normal(5, 1, (500, 2)), np.random.normal(10, 1, (500, 2))]
    labels = np.r_[np.zeros((500, 1)), np.ones((500, 1))]
    train_data = np.c_[features, labels]

    centers = [[5, 5], [10, 10]]
    colors = ['#bda36b', '#7aaab4']

    plt.figure()
    for k, col in enumerate(colors):
        my_members = train_data[:, 2] == k
        plt.plot(train_data[my_members, 0], train_data[my_members, 1],
                 'w', markerfacecolor=col, marker='.')
        center_x, center_y = centers[k][0], centers[k][1]
        plt.plot(center_x, center_y, 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=6)
    plt.title('Training Data')
    plt.grid(True)
    plt.savefig("classification_training_data.png", bbox_inches='tight', pad_inches=0)

    from spn.algorithms.LearningWrappers import learn_parametric, learn_classifier
    from spn.structure.leaves.parametric.Parametric import Categorical, Gaussian
    from spn.structure.Base import Context

    ds_context = Context(
        parametric_types=[Gaussian, Gaussian, Categorical]).add_domains(train_data)
    # The trailing 2 is the index of the label column in train_data.
    spn_classification = learn_classifier(train_data, ds_context, learn_parametric, 2)

    test_classification = np.array(
        [3.0, 4.0, np.nan, 12.0, 18.0, np.nan]).reshape(-1, 3)
    print(test_classification)

    from spn.algorithms.MPE import mpe

    print(mpe(spn_classification, test_classification))
def learn_whittle_spn_2d(train_data, n_RV, n_min_slice, init_scope=None):
    """Learn a Whittle SPN with pairwise Gaussian leaves and pickle it.

    The model is written to ``<save_path>wspn_2d.pkl``, where ``save_path``
    comes from ``get_save_path(args)``. ``args`` is not a parameter: it is
    read from the enclosing scope and supplies the threshold, the rfft
    length and the save location.

    :param train_data: training data passed to ``add_domains`` and ``learn_parametric``.
    :param n_RV: number of random variables; one MultivariateGaussian type each.
    :param n_min_slice: forwarded as ``min_instances_slice``.
    :param init_scope: forwarded as ``initial_scope``.
    :return: the learned SPN.
    """
    from spn.structure.leaves.parametric.Parametric import MultivariateGaussian

    # learn spn
    ds_context = Context(
        parametric_types=[MultivariateGaussian] * n_RV).add_domains(train_data)

    print('learning WSPN')
    # need to pair RVs
    # need flag for 2d?
    l_rfft = get_l_rfft(args)
    # l_rfft!=None --> 2d/pair gaussian node, is_2d=True --> pairwise gaussian, full covariance matrix
    wspn = learn_parametric(
        train_data,
        ds_context,
        min_instances_slice=n_min_slice,
        threshold=args.threshold,
        initial_scope=init_scope,
        cpus=1,
        l_rfft=l_rfft,
        is_2d=True)

    save_path = get_save_path(args)
    check_path(save_path)
    # The context manager closes the file even if pickling raises.
    with open(save_path + 'wspn_2d.pkl', 'wb') as f:
        pickle.dump(wspn, f)

    return wspn
def test_learn(self):
    """Smoke-test learn_parametric with multivariate leaves on the iris data.

    The four feature columns are typed MultivariateGaussian and the target
    column Categorical. Nothing is asserted on the result.
    """
    from sklearn.datasets import load_iris
    from spn.algorithms.LearningWrappers import learn_parametric, learn_classifier
    from spn.structure.leaves.parametric.Parametric import Categorical, MultivariateGaussian
    from spn.structure.Base import Context

    iris = load_iris()
    features = iris.data
    target = iris.target.reshape(-1, 1)
    train_data = np.hstack((features, target))

    leaf_types = [MultivariateGaussian] * 4 + [Categorical]
    ds_context = Context(parametric_types=leaf_types).add_domains(train_data)

    spn_classification = learn_parametric(
        train_data,
        ds_context,
        multivariate_leaf=True,
    )
def test_conditional(self):
    """Smoke-test learn_cspn_structure with Bernoulli labels and Gaussian features.

    Nothing is asserted on the learned structure; the test passes if
    learning completes without raising.
    """
    labels = np.c_[np.zeros((500, 1)), np.ones((500, 1))]
    features = np.c_[
        np.r_[np.random.normal(5, 1, (500, 2)), np.random.normal(10, 1, (500, 2))]
    ]

    # NOTE(review): labels has 500 rows while features has 1000 — confirm
    # that concatenate_yx is meant to receive mismatched row counts here.
    train_data = concatenate_yx(labels, features)

    ds_context = Context(
        parametric_types=[Bernoulli] * labels.shape[1]
    ).add_domains(labels)
    ds_context.feature_size = 2

    def label_conditional(y, x):
        """Cluster the rows of ``y`` into two groups with KMeans."""
        from sklearn.cluster import KMeans

        # precompute_distances is not passed: scikit-learn deprecated the
        # argument and removed it in 1.0, where it raises TypeError.
        return KMeans(n_clusters=2, random_state=17).fit_predict(y)

    spn = learn_cspn_structure(
        train_data,
        ds_context,
        split_rows=get_split_conditional_rows_KMeans(),
        split_cols=getCIGroup(),
        create_leaf=create_conditional_leaf,
        label_conditional=label_conditional,
        cluster_univariate=True,
    )
def build_spn(features):
    """Learn an SPN classifier over ``features``.

    Columns are typed, in order, Gaussian, Categorical, Categorical,
    Gaussian. The trailing ``2`` passed to ``learn_classifier`` selects
    column 2 as the label.

    :param features: training data with four columns.
    :return: the SPN returned by ``learn_classifier``.
    """
    leaf_types = [Gaussian, Categorical, Categorical, Gaussian]
    ds_context = Context(parametric_types=leaf_types).add_domains(features)
    return learn_classifier(features, ds_context, learn_parametric, 2)
def test_optimization(self):
    """EM must not lower the log-likelihood and must recover the asserted parameters.

    The learned SPN is perturbed (weights 0.8/0.2, one mean set to 3.0)
    before ten EM iterations are run.
    """
    np.random.seed(17)
    low_mode = np.random.normal(10, 5, size=2000)
    high_mode = np.random.normal(30, 5, size=2000)
    data = np.concatenate([low_mode, high_mode]).reshape((-1, 10)).astype(np.float32)

    n_cols = data.shape[1]
    ds_context = Context(
        meta_types=[MetaType.REAL] * n_cols,
        parametric_types=[Gaussian] * n_cols,
    )

    spn = learn_parametric(data, ds_context)

    # Perturb the model so that EM has something to correct.
    spn.weights = [0.8, 0.2]
    spn.children[0].children[0].mean = 3.0

    py_ll = np.sum(log_likelihood(spn, data))

    print(spn.weights, spn.children[0].children[0].mean)
    EM_optimization(spn, data, iterations=10)
    print(spn.weights, spn.children[0].children[0].mean)

    py_ll_opt = np.sum(log_likelihood(spn, data))

    self.assertLessEqual(py_ll, py_ll_opt)
    self.assertAlmostEqual(spn.weights[0], 0.5, 6)
    self.assertAlmostEqual(spn.weights[1], 0.5, 6)
    self.assertAlmostEqual(spn.children[0].children[0].mean, 10.50531, 4)
def test_naive_factorization(self):
    """naive_factorization must emit one CREATE_LEAF_NODE task per scope variable.

    It must also leave the input data untouched and place the new Product
    node at the requested position of the parent.
    """
    np.random.seed(17)
    data = np.arange(0, 1000).reshape(-1, 8)

    parent = Sum()
    parent.children.append(None)

    ctx = Context()
    ctx.feature_size = 4

    scope = [1, 3, 4, 6]
    data_copy = np.array(data)
    result = naive_factorization(
        data=data_copy, parent=parent, pos=0, context=ctx, scope=list(scope))

    # The input array must not be modified.
    self.assertListEqual(data.tolist(), data_copy.tolist())

    self.assertEqual(parent.children[0], result[0][1]['parent'])

    y, x = get_YX(data, 4)

    self.assertEqual(len(result), len(scope))
    for pos, (var, task) in enumerate(zip(scope, result)):
        self.assertEqual(len(task), 2)
        operation, task_args = task[0], task[1]
        self.assertEqual(operation, SplittingOperations.CREATE_LEAF_NODE)
        self.assertEqual(type(task_args['parent']), Product)
        self.assertEqual(task_args['pos'], pos)
        self.assertListEqual(task_args['scope'], [var])
        self.assertListEqual(
            task_args['data'].tolist(), concatenate_yx(y[:, pos], x).tolist())
def test_histogram_leaf(self):
    """The MPE of a histogram leaf fitted on [1, 1, 2, 3, 3, 3] must be 3."""
    sample = np.array([1, 1, 2, 3, 3, 3]).reshape(-1, 1)
    ctx = Context([MetaType.DISCRETE])
    ctx.add_domains(sample)

    leaf = create_histogram_leaf(sample, ctx, [0], alpha=False)

    query = np.array([[np.nan]])
    expected = np.array([[3]])
    self.assertTrue(np.array_equal(mpe(leaf, query), expected), "mpe should be 3")
def test_histogram_to_str_and_back(self):
    """Run the object/reconstruction check on a small histogram leaf.

    The check itself is ``self.check_obj_and_reconstruction``, which is
    defined outside this block.
    """
    sample = np.array([1, 1, 2, 3, 3, 3]).reshape(-1, 1)
    ctx = Context([MetaType.DISCRETE])
    ctx.add_domains(sample)

    leaf = create_histogram_leaf(sample, ctx, [0], alpha=False)
    self.check_obj_and_reconstruction(leaf)
def _fit(self, var_types=None, **kwargs):
    """Learn the SPN for ``self.data`` and reset the marginalise/condition state.

    :param var_types: mapping from variable name to its type; leaf types for
        ``'spn'``, meta types for ``'mspn'``. Required despite the default.
    :param kwargs: accepted but not used here.
    :return: a one-element tuple holding ``self._unbound_updater``.
    :raises ValueError: if ``var_types`` is None or lacks an entry for a name.
    :raises Exception: if ``self._spn_type`` is neither ``'spn'`` nor ``'mspn'``.
    """
    df = self.data.copy()
    # Exchange all object columns for their codes
    for column, info in self._categorical_variables.items():
        df[column] = info['categorical'].codes

    self._nameToVarType = var_types

    # Check if variable types are given
    if self._nameToVarType is None:
        raise ValueError("missing argument 'var_types'")

    self._initial_names = self.names.copy()
    n_names = len(self._initial_names)
    self._initial_names_count = n_names
    self._initial_names_to_index = {
        name: pos for pos, name in enumerate(self._initial_names)
    }

    # Initialize _density_mask with np.nan
    self._density_mask = (
        np.array([np.nan for _ in self._initial_names])
        .reshape(-1, n_names)
        .astype(float)
    )

    # Initialize _condition with np.nan
    self._condition = np.repeat(np.nan, n_names).reshape(-1, n_names).astype(float)

    self._marginalized = set()
    self._conditioned = set()

    try:
        var_types = [self._nameToVarType[name] for name in self.names]
    except KeyError as err:
        raise ValueError('missing var type information for some dimension {}.'.format(err.args[0]))

    values = df.values
    spn_type = self._spn_type
    if spn_type == 'spn':
        context = Context(parametric_types=var_types)
        learner = learn_parametric
    elif spn_type == 'mspn':
        context = Context(meta_types=var_types)
        learner = learn_mspn
    else:
        raise Exception("Type of SPN not known: " + self._spn_type)

    self._spn = learner(values, context.add_domains(values))
    return (self._unbound_updater,)