Example #1
    def test_optimization(self):
        np.random.seed(17)
        d1 = np.random.normal(10, 5, size=2000).tolist()
        d2 = np.random.normal(30, 5, size=2000).tolist()
        data = d1 + d2
        data = np.array(data).reshape((-1, 10))
        data = data.astype(np.float32)

        ds_context = Context(meta_types=[MetaType.REAL] * data.shape[1],
                             parametric_types=[Gaussian] * data.shape[1])

        spn = learn_parametric(data, ds_context)

        spn.weights = [0.8, 0.2]
        spn.children[0].children[0].mean = 3.0

        py_ll = np.sum(log_likelihood(spn, data))

        print(spn.weights, spn.children[0].children[0].mean)

        EM_optimization(spn, data, iterations=10)

        print(spn.weights, spn.children[0].children[0].mean)

        py_ll_opt = np.sum(log_likelihood(spn, data))

        self.assertLessEqual(py_ll, py_ll_opt)
        self.assertAlmostEqual(spn.weights[0], 0.5, 6)
        self.assertAlmostEqual(spn.weights[1], 0.5, 6)
        self.assertAlmostEqual(spn.children[0].children[0].mean, 10.50531, 4)
Example #2
def learn_parametric_spn(data, parametric_types):
    
    from spn.algorithms.LearningWrappers import learn_parametric
    ds_context = Context(parametric_types=parametric_types).add_domains(data)
    spn = learn_parametric(data, ds_context, min_instances_slice=100, threshold=0.01)
    return spn
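A minimal usage sketch for the wrapper above (not part of the original snippet), assuming Context is imported in the same module as the wrapper, as in the other examples on this page; the two-column toy data and the choice of Gaussian leaves are purely illustrative:

import numpy as np

from spn.structure.Base import Context
from spn.structure.leaves.parametric.Parametric import Gaussian
from spn.algorithms.Inference import log_likelihood

# Toy data: two correlated real-valued columns (illustrative values only).
rng = np.random.RandomState(0)
x = rng.normal(0, 1, size=(500, 1))
toy_data = np.hstack([x, 2 * x + rng.normal(0, 0.1, size=(500, 1))]).astype(np.float32)

spn = learn_parametric_spn(toy_data, [Gaussian, Gaussian])
print(np.mean(log_likelihood(spn, toy_data)))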
Example #3
def train(args):
    print('Training...')
    for i in range(len(args.spk_list)):
        spn_path = args.MODEL_DIR + '/' + args.spk_list[i]['spk_id'] + '.p'
        if not os.path.isfile(spn_path):
            with open(spn_path, 'wb') as f:
                pickle.dump([], f)
            print(chr(27) + "[2J")
            print(
                "Learn structure, spk: %i (%s)... (min_instances_slice: %i, threshold: %1.3f)."
                % (i, args.spk_list[i]['spk_id'], args.min_instances_slice,
                   args.threshold))
            train_batch = featpy.lsse(
                args.spk_list[i]['train_clean_speech'],
                args.spk_list[i]['train_clean_speech_len'], args.Nw, args.Ns,
                args.NFFT, args.fs, args.H)
            print("Features extracted.")
            ds_context = Context(parametric_types=[Gaussian] *
                                 args.M).add_domains(train_batch)
            with silence():
                spn_spk = learn_parametric(
                    train_batch,
                    ds_context,
                    min_instances_slice=args.min_instances_slice,
                    threshold=args.threshold,
                    cpus=args.ncores)
            with open(spn_path, 'wb') as f:
                pickle.dump(spn_spk, f)
Example #4
def learn_whittle_spn_2d(train_data, n_RV, n_min_slice, init_scope=None):
    from spn.structure.leaves.parametric.Parametric import MultivariateGaussian

    # learn spn
    ds_context = Context(parametric_types=[MultivariateGaussian] *
                         n_RV).add_domains(train_data)

    print('learning WSPN')
    # need to pair RVs
    # need flag for 2d?
    l_rfft = get_l_rfft(args)
    # l_rfft!=None --> 2d/pair gaussian node, is_2d=True --> pairwise gaussian, full covariance matrix
    wspn = learn_parametric(train_data,
                            ds_context,
                            min_instances_slice=n_min_slice,
                            threshold=args.threshold,
                            initial_scope=init_scope,
                            cpus=1,
                            l_rfft=l_rfft,
                            is_2d=True)
    save_path = get_save_path(args)
    check_path(save_path)
    with open(save_path + 'wspn_2d.pkl', 'wb') as f:
        pickle.dump(wspn, f)

    return wspn
Example #5
    def test_learn(self):
        from sklearn.datasets import load_iris

        iris = load_iris()
        X = iris.data
        y = iris.target.reshape(-1, 1)

        train_data = np.hstack((X, y))

        from spn.algorithms.LearningWrappers import learn_parametric, learn_classifier
        from spn.structure.leaves.parametric.Parametric import Categorical, MultivariateGaussian
        from spn.structure.Base import Context

        spn_classification = learn_parametric(
            train_data,
            Context(
                parametric_types=[
                    MultivariateGaussian,
                    MultivariateGaussian,
                    MultivariateGaussian,
                    MultivariateGaussian,
                    Categorical,
                ]
            ).add_domains(train_data),
            multivariate_leaf=True,
        )
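The snippet stops right after structure learning; a hedged follow-up sketch (not part of the original test) that classifies by scoring each candidate label with log_likelihood and taking the argmax, the same trick used in Example #23 below:

        import numpy as np
        from spn.algorithms.Inference import log_likelihood

        # Score every candidate class for every row and pick the most likely one.
        n_classes = len(np.unique(y))
        scores = np.zeros((train_data.shape[0], n_classes))
        for c in range(n_classes):
            candidate = train_data.copy()
            candidate[:, -1] = c
            scores[:, c] = log_likelihood(spn_classification, candidate)[:, 0]
        predicted = np.argmax(scores, axis=1)
        print("training accuracy:", np.mean(predicted == y.ravel()))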
Example #6
def learn_PSPN():
    import numpy as np

    np.random.seed(123)

    a = np.random.randint(2, size=1000).reshape(-1, 1)
    b = np.random.randint(3, size=1000).reshape(-1, 1)
    c = np.r_[np.random.normal(10, 5, (300, 1)),
              np.random.normal(20, 10, (700, 1))]
    d = 5 * a + 3 * b + c
    train_data = np.c_[a, b, c, d]

    from spn.structure.Base import Context
    from spn.structure.leaves.parametric.Parametric import Categorical, Gaussian

    ds_context = Context(
        parametric_types=[Categorical, Categorical, Gaussian, Gaussian
                          ]).add_domains(train_data)

    from spn.algorithms.LearningWrappers import learn_parametric

    spn = learn_parametric(train_data, ds_context, min_instances_slice=20)

    from spn.algorithms.Statistics import get_structure_stats

    print(get_structure_stats(spn))
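A hedged follow-up sketch (not in the original function): drawing complete samples from the learned SPN with sample_instances, the same helper referenced in Example #28; np.nan marks the cells to be filled in.

    from numpy.random import RandomState
    from spn.algorithms.Sampling import sample_instances

    # Sample 5 full rows over the 4 variables; every np.nan entry is drawn from the SPN.
    placeholders = np.array([np.nan] * (5 * 4)).reshape(-1, 4)
    print(sample_instances(spn, placeholders, RandomState(123)))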
Example #7
def learn_CLTSPN():
    import numpy as np

    np.random.seed(123)

    train_data = np.random.binomial(
        1, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.1], size=(100, 10))
    print(np.mean(train_data, axis=0))

    from spn.structure.leaves.cltree.CLTree import create_cltree_leaf
    from spn.structure.Base import Context
    from spn.structure.leaves.parametric.Parametric import Bernoulli

    ds_context = Context(parametric_types=[
        Bernoulli,
        Bernoulli,
        Bernoulli,
        Bernoulli,
        Bernoulli,
        Bernoulli,
        Bernoulli,
        Bernoulli,
        Bernoulli,
        Bernoulli,
    ]).add_domains(train_data)

    from spn.algorithms.LearningWrappers import learn_parametric

    spn = learn_parametric(
        train_data,
        ds_context,
        min_instances_slice=20,
        min_features_slice=1,
        multivariate_leaf=True,
        leaves=create_cltree_leaf,
    )

    from spn.algorithms.Statistics import get_structure_stats

    print(get_structure_stats(spn))

    from spn.io.Text import spn_to_str_equation

    print(spn_to_str_equation(spn))

    from spn.algorithms.Inference import log_likelihood

    ll = log_likelihood(spn, train_data)
    print(np.mean(ll))
Example #8
def run_oSLRAU(dataset, update_after_no_min_batches, prune_after):

    data = get_data(dataset)
    data = np.where(np.isnan(data),
                    np.ma.array(data, mask=np.isnan(data)).mean(axis=0), data)

    from sklearn.model_selection import train_test_split
    train_data, test_data = train_test_split(data,
                                             test_size=0.33,
                                             random_state=42)

    # make first mini_batch from data
    mini_batch_size = 50
    first_mini_batch = data[0:mini_batch_size]

    n = first_mini_batch.shape[1]  # num of variables
    print(n)
    context = [Gaussian] * n
    ds_context = Context(
        parametric_types=context).add_domains(first_mini_batch)

    # Learn initial spn
    spn = learn_parametric(first_mini_batch, ds_context)
    plot_spn(spn, 'initial_spn.pdf')
    print(np.mean(log_likelihood(spn, test_data)))

    oSLRAU_params = oSLRAUParams(mergebatch_threshold=128,
                                 corrthresh=0.1,
                                 mvmaxscope=1,
                                 equalweight=True,
                                 currVals=True)
    no_of_minibatches = int(data.shape[0] / mini_batch_size)

    # update using oSLRAU
    for i in range(1, no_of_minibatches):
        mini_batch = data[i * mini_batch_size:(i + 1) * mini_batch_size]

        # enable structure updates once more than update_after_no_min_batches mini-batches have been seen
        update_structure = False
        if i > update_after_no_min_batches:
            print(i)
            update_structure = True
        spn = oSLRAU(spn, mini_batch, oSLRAU_params, update_structure)

        if i == prune_after:
            spn = Prune_oSLRAU(spn)

    print(np.mean(log_likelihood(spn, test_data)))
    plot_spn(spn, 'final_spn.pdf')
Example #9
def learn_whittle_spn_1d(train_data, n_RV, n_min_slice=2000, init_scope=None):
    from spn.structure.leaves.parametric.Parametric import Gaussian

    # learn spn
    ds_context = Context(parametric_types=[Gaussian] * n_RV).add_domains(train_data)

    print('learning WSPN')
    # l_rfft=None --> 1d gaussian node, is_pair does not work
    wspn = learn_parametric(train_data, ds_context, min_instances_slice=n_min_slice, threshold=ARGS.threshold,
                            initial_scope=init_scope, cpus=1, l_rfft=None, is_pair=False)
    save_path = get_save_path(ARGS)
    check_path(save_path)
    with open(save_path + 'wspn_1d.pkl', 'wb') as f:
        pickle.dump(wspn, f)

    return wspn
Example #10
    def _fit(self, var_types=None, **kwargs):
        df = self.data.copy()
        # Exchange all object columns for their codes
        for key, value in self._categorical_variables.items():
            df[key] = value['categorical'].codes

        self._nameToVarType = var_types

        #Check if variable types are given
        if self._nameToVarType is None:
            raise ValueError("missing argument 'var_types'")

        self._initial_names = self.names.copy()
        self._initial_names_count = len(self._initial_names)
        self._initial_names_to_index = {self._initial_names[i]: i for i in range(self._initial_names_count)}

        # Initialize _density_mask with np.nan
        self._density_mask = np.array(
            [np.nan for i in self._initial_names]
        ).reshape(-1, self._initial_names_count).astype(float)

        # Initialize _condition with np.nan
        self._condition = np.repeat(
            np.nan,
            self._initial_names_count
        ).reshape(-1, self._initial_names_count).astype(float)

        self._marginalized = set()
        self._conditioned = set()

        try:
            var_types = [self._nameToVarType[name] for name in self.names]
        except KeyError as err:
            raise ValueError('missing var type information for dimension: {}.'.format(err.args[0]))

        if self._spn_type == 'spn':
            context = Context(parametric_types=var_types).add_domains(df.values)
            self._spn = learn_parametric(df.values, context)

        elif self._spn_type == 'mspn':
            context = Context(meta_types=var_types).add_domains(df.values)
            self._spn = learn_mspn(df.values, context)
        else:
            raise Exception("Type of SPN not known: " + self._spn_type)
        return self._unbound_updater,
Example #11
    def test_eval_gaussian(self):
        np.random.seed(17)
        data = np.random.normal(10, 0.01,
                                size=2000).tolist() + np.random.normal(
                                    30, 10, size=2000).tolist()
        data = np.array(data).reshape((-1, 10))
        data = data.astype(np.float32)

        ds_context = Context(meta_types=[MetaType.REAL] * data.shape[1],
                             parametric_types=[Gaussian] * data.shape[1])

        spn = learn_parametric(data, ds_context)

        ll = log_likelihood(spn, data)

        tf_ll = eval_tf(spn, data)

        self.assertTrue(np.all(np.isclose(ll, tf_ll)))
Example #12
    def test_bernoulli_spn_ll(self):
        train_data = get_binary_data("dna")[3]
        train_data = train_data[:, 0:3]
        ds_context = Context(parametric_types=[Bernoulli] * 3,
                             feature_names=["x0", "x1",
                                            "x2"]).add_domains(train_data)

        from spn.algorithms.LearningWrappers import learn_parametric

        spn = learn_parametric(train_data,
                               ds_context,
                               min_instances_slice=1500)

        print(get_structure_stats(spn))

        sympyecc = spn_to_sympy(spn)

        print(sympyecc)
Example #13
    def test_optimization(self):
        np.random.seed(17)
        data = np.random.normal(10, 0.01,
                                size=2000).tolist() + np.random.normal(
                                    30, 10, size=2000).tolist()
        data = np.array(data).reshape((-1, 10))
        data = data.astype(np.float32)

        ds_context = Context(meta_types=[MetaType.REAL] * data.shape[1],
                             parametric_types=[Gaussian] * data.shape[1])

        spn = learn_parametric(data, ds_context)

        spn.weights = [0.8, 0.2]

        py_ll = log_likelihood(spn, data)

        tf_graph, data_placeholder, variable_dict = spn_to_tf_graph(spn, data)

        loss = likelihood_loss(tf_graph)

        output = tf.train.AdamOptimizer(0.001).minimize(loss)

        with tf.Session() as session:
            session.run(tf.global_variables_initializer())
            for step in range(50):
                session.run(output, feed_dict={data_placeholder: data})
                # print("loss:", step, session.run(-loss, feed_dict={data_placeholder: data}))

            tf_ll_opt = session.run(tf_graph,
                                    feed_dict={
                                        data_placeholder: data
                                    }).reshape(-1, 1)

            tf_graph_to_spn(variable_dict)

        py_ll_opt = log_likelihood(spn, data)

        # print(tf_ll_opt.sum(), py_ll_opt.sum())

        self.assertTrue(np.all(np.isclose(tf_ll_opt, py_ll_opt)))

        self.assertLess(py_ll.sum(), tf_ll_opt.sum())
Example #14
def learn_parametric_spn(data,
                         parametric_types,
                         rdc_threshold=0.3,
                         min_instances_slice=0.05,
                         clustering='kmeans'):

    ds_context = Context(parametric_types=parametric_types).add_domains(data)
    mis = int(len(data) * min_instances_slice)

    t0 = time.time()
    spn = learn_parametric(data,
                           ds_context,
                           threshold=rdc_threshold,
                           min_instances_slice=mis,
                           rows=clustering)
    const_time = time.time() - t0

    return spn, const_time
Example #15
    def test_optimization(self):
        np.random.seed(17)
        data = np.random.normal(10, 0.01,
                                size=2000).tolist() + np.random.normal(
                                    30, 10, size=2000).tolist()
        data = np.array(data).reshape((-1, 10))
        data = data.astype(np.float32)

        ds_context = Context(meta_types=[MetaType.REAL] * data.shape[1],
                             parametric_types=[Gaussian] * data.shape[1])

        spn = learn_parametric(data, ds_context)

        spn.weights = [0.8, 0.2]

        py_ll = log_likelihood(spn, data)

        print(spn.weights)

        EM_optimization(spn, data)

        print(spn.weights)

        py_ll_opt = log_likelihood(spn, data)
Example #16
import numpy as np

from spn.structure.Base import Context
from spn.structure.leaves.cltree.CLTree import create_cltree_leaf
from spn.structure.leaves.parametric.Parametric import Bernoulli
from spn.algorithms.LearningWrappers import learn_parametric
from spn.algorithms.Inference import log_likelihood

train_data = np.random.binomial(
    1, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.1], size=(100, 10))

ds_context = Context(parametric_types=[
    Bernoulli,
    Bernoulli,
    Bernoulli,
    Bernoulli,
    Bernoulli,
    Bernoulli,
    Bernoulli,
    Bernoulli,
    Bernoulli,
    Bernoulli,
]).add_domains(train_data)

spn = learn_parametric(
    train_data,
    ds_context,
    min_instances_slice=20,
    min_features_slice=1,
    multivariate_leaf=True,
    leaves=create_cltree_leaf,
)

ll = log_likelihood(spn, train_data)
print(np.mean(ll))
Example #17
    def fit(self, train_X):
        # One Gaussian leaf per input column; domains are inferred from the training data.
        param_type = [Gaussian for _ in range(train_X.shape[1])]
        self.spnfitter = learn_parametric(
            train_X,
            Context(parametric_types=param_type).add_domains(train_X),
            min_instances_slice=20,
        )
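The fragment above only fits the model; a hypothetical companion method (score_samples is an illustrative name, not from the source) could evaluate new data with SPFlow's log_likelihood, assuming numpy and log_likelihood are imported in the enclosing module:

    def score_samples(self, X):
        # Illustrative helper: mean log-likelihood of X under the fitted SPN.
        return np.mean(log_likelihood(self.spnfitter, X))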
Example #18
if __name__ == '__main__':
    add_parametric_inference_support()
    add_parametric_text_support()

    np.random.seed(42)
    data = np.random.randint(low=0, high=3, size=600).reshape(-1, 3)

    #print(data)

    ds_context = Context(
        meta_types=[MetaType.DISCRETE, MetaType.DISCRETE, MetaType.DISCRETE])
    ds_context.add_domains(data)
    ds_context.parametric_types = [Poisson, Poisson, Categorical]

    spn = Sum()

    for label, count in zip(*np.unique(data[:, 2], return_counts=True)):
        branch = learn_parametric(data[data[:, 2] == label, :],
                                  ds_context,
                                  min_instances_slice=10000)
        spn.children.append(branch)
        spn.weights.append(count / data.shape[0])

    spn.scope.extend(branch.scope)

    print(spn)

    print(spn_to_str_equation(spn))

    print(log_likelihood(spn, data))
Example #19
    def _fit(self, var_types=None, **kwargs):
        if self._spn_type is None:
            raise Exception("No SPN-type provided")

        if var_types is not None:
            self.var_types = var_types
        else:
            var_types = self.var_types

        df = self.data.copy()
        # Exchange all object columns for their codes as SPFLOW cannot deal with Strings
        for key, value in self._categorical_variables.items():
            df[key] = value['categorical'].codes

        self._nameToVarType = var_types

        # Check if variable types are given
        if self._nameToVarType is None:
            raise ValueError("missing argument 'var_types'")

        self._initial_names = self.names.copy()
        self._initial_names_count = len(self._initial_names)
        self._initial_names_to_index = {
            self._initial_names[i]: i
            for i in range(self._initial_names_count)
        }

        # Initialize _state_mask with np.nan
        self._state_mask = np.array([
            np.nan for i in self._initial_names
        ]).reshape(-1, self._initial_names_count).astype(float)

        # Initialize _condition with np.nan
        self._condition = np.repeat(np.nan, self._initial_names_count).reshape(
            -1, self._initial_names_count).astype(float)

        self._marginalized = set()
        self._conditioned = set()

        try:
            var_types = [self._nameToVarType[name] for name in self.names]
        except KeyError as err:
            raise ValueError(
                'missing var type information for dimension: {}.'.format(
                    err.args[0]))

        if self._spn_type == 'spn':
            context = Context(parametric_types=var_types).add_domains(
                df.values)
            self._spn = learn_parametric(df.values, context)

        elif self._spn_type == 'mspn':
            context = Context(meta_types=var_types).add_domains(df.values)
            self._spn = learn_mspn(df.values, context)
        else:
            raise Exception("Type of SPN not known: " + self._spn_type)

        # TODO: DEBUG OUTPUT for NIPS2020
        if self._spn:
            plot_spn(self._spn,
                     fname=Path(
                         f"../../bin/experiments/spn_graphs/{self.name}.pdf"))
            plot_spn_to_svg(
                self._spn,
                fname=Path(
                    f"../../bin/experiments/spn_graphs/{self.name}.svg"))
        return self._unbound_updater,
Example #20
    num_mpes = 1
    num_samples = 10

    cspns = []
    mpe_query_blocks = None
    sample_query_blocks = None
    for i, ((tr_block, block_idx), conditional_blocks) in enumerate(datasets):
        print("learning", i)
        conditional_features_count = (tr_block.shape[1] // len(block_idx)) * conditional_blocks
        if i == 0:
            # spn
            ds_context = Context(meta_types=[MetaType.REAL] * tr_block.shape[1])
            ds_context.add_domains(tr_block)
            ds_context.parametric_types = [Gaussian] * tr_block.shape[1]

            cspn = learn_parametric(tr_block, ds_context, min_instances_slice=20, ohe=False, memory=memory)
        else:
            cspn = learn_conditional(
                tr_block,
                Context(
                    meta_types=[MetaType.REAL] * tr_block.shape[1],
                    parametric_types=[Conditional_Gaussian] * tr_block.shape[1],
                ).add_domains(tr_block),
                scope=list(range(conditional_features_count)),
                min_instances_slice=30,
                memory=memory,
            )
        cspns.append(cspn)
        print("done")

        # for i, ((tr_block, block_idx), conditional_blocks) in enumerate(datasets):
Example #21
    # # spn
    # ds_context = Context(meta_types=[MetaType.REAL] * blocked_images[0].shape[1])
    # ds_context.add_domains(blocked_images[0])
    # ds_context.parametric_type = [Poisson] * blocked_images[0].shape[1]
    #
    # print("data ready", data.shape)
    # #the following two options should be working now.
    # # spn = learn_structure(upperimage, ds_context, get_split_rows_random_partition(np.random.RandomState(17)), get_split_cols_random_partition(np.random.RandomState(17)), create_parametric_leaf)
    # spn = learn_parametric(blocked_images[0], ds_context, min_instances_slice=0.1*len(data), ohe=False)

    # spn
    ds_context = Context(meta_types=[MetaType.DISCRETE] * 10)
    ds_context.add_domains(data_labels)
    ds_context.parametric_types = [Bernoulli] * blocked_images[0].shape[1]
    spn = learn_parametric(data_labels,
                           ds_context,
                           min_instances_slice=0.3 * len(data_labels))

    # first cspn
    dataIn = data_labels
    dataOut = blocked_images[0]
    ds_context = Context(meta_types=[MetaType.DISCRETE] * dataOut.shape[1])
    ds_context.add_domains(dataOut)
    ds_context.parametric_types = [Conditional_Poisson] * dataOut.shape[1]

    scope = list(range(dataOut.shape[1]))
    print(np.shape(dataIn), np.shape(dataOut))
    print(dataIn[0], dataOut[0])
    cspn_1st = learn_conditional(np.concatenate((dataOut, dataIn), axis=1),
                                 ds_context,
                                 scope,
Example #22
    cspns.append(spn)
    print("loaded %s from cache " % i)
        # continue
    # except:
    #     pass

    if i > 40:
        break
    print("learning %s " % i)

    spn = None
    if i == 0:
        ds_context = Context(parametric_types=[CategoricalDictionary])
        ds_context.add_domains(tr_block)

        spn = learn_parametric(tr_block, ds_context, min_instances_slice=min_instances_slice, ohe=False)

    else:
        cspn = CSPNClassifier(parametric_types=[Gaussian] * block_size,
                              cluster_univariate=True, min_instances_slice=min_instances_slice,
                              alpha=alpha,
                              allow_sum_nodes=True
                              )

        y = tr_block[:, 0:block_size]
        X = tr_block[:, block_size:]
        cspn.fit(X, y)
        spn = cspn.cspn

    pickle.dump(spn, open(fname, "wb"))
Example #23
def train_spn(window_size=3,
              min_instances_slice=10000,
              features=None,
              number_of_classes=3):
    if features is None:
        features = [20, 120]

    add_parametric_inference_support()
    add_parametric_text_support()

    data = get_data_in_window(window_size=window_size,
                              features=features,
                              three_classes=number_of_classes == 3)

    sss = sk.model_selection.StratifiedShuffleSplit(test_size=0.2,
                                                    train_size=0.8,
                                                    random_state=42)
    for train_index, test_index in sss.split(
            data[:, 0:window_size * window_size * len(features)],
            data[:, (window_size * window_size * len(features)) +
                 (int(window_size * window_size / 2))]):
        X_train, X_test = data[train_index], data[test_index]

    context_list = list()
    parametric_list = list()
    number_of_features = len(features)
    for _ in range(number_of_features * window_size * window_size):
        context_list.append(MetaType.REAL)
        parametric_list.append(Gaussian)

    for _ in range(window_size * window_size):
        context_list.append(MetaType.DISCRETE)
        parametric_list.append(Categorical)

    ds_context = Context(meta_types=context_list)
    ds_context.add_domains(data)
    ds_context.parametric_types = parametric_list

    spn = load_spn(window_size, features, min_instances_slice,
                   number_of_classes)
    if spn is None:
        spn = Sum()
        for class_pixel in tqdm(range(-window_size * window_size, 0)):
            for label, count in zip(
                    *np.unique(data[:, class_pixel], return_counts=True)):
                train_data = X_train[X_train[:, class_pixel] == label, :]
                branch = learn_parametric(
                    train_data,
                    ds_context,
                    min_instances_slice=min_instances_slice)
                spn.children.append(branch)
                spn.weights.append(train_data.shape[0])

        spn.scope.extend(branch.scope)
        spn.weights = (np.array(spn.weights) / sum(spn.weights)).tolist()

        assign_ids(spn)
        save_spn(spn, window_size, features, min_instances_slice,
                 number_of_classes)

    res = np.ndarray((X_test.shape[0], number_of_classes))

    for i in tqdm(range(number_of_classes)):
        tmp = X_test.copy()
        tmp[:, -int((window_size**2) / 2)] = i
        res[:, i] = log_likelihood(spn, tmp)[:, 0]

    predicted_classes = np.argmax(res, axis=1).reshape((X_test.shape[0], 1))

    correct_predicted = 0
    for x, y in zip(X_test[:, -5], predicted_classes):
        if x == y[0]:
            correct_predicted += 1
    accuracy = correct_predicted / X_test.shape[0]
    return spn, accuracy
Example #24
    print("_______")
    zeros[:, :horizontal_middle, :vertical_middle] = dataIn.reshape(len(data), 4, 4)  #data[:, :horizontal_middle, :vertical_middle]
    zeros[:, :horizontal_middle, vertical_middle:] = dataOut.reshape(len(data), 4, 4) #data[:, :horizontal_middle, vertical_middle:]
    print(zeros[0], np.shape(zeros))    #print(np.concatenate((dataIn, dataOut), axis=1).reshape(len(dataIn), 4, 8)[0])
    """

    # spn
    ds_context = Context(meta_types=[MetaType.REAL] *
                         blocked_images[0].shape[1])
    ds_context.add_domains(blocked_images[0])
    ds_context.parametric_types = [Poisson] * blocked_images[0].shape[1]

    print("data ready", data.shape)
    # the following two options should be working now.
    spn = learn_parametric(blocked_images[0],
                           ds_context,
                           min_instances_slice=0.1 * len(data),
                           ohe=False)

    # cspn
    dataIn = blocked_images[
        0]  # data[:, :horizontal_middle, :vertical_middle].reshape(len(data), -1)
    dataOut = blocked_images[
        1]  # data[:, :horizontal_middle, vertical_middle:].reshape(len(data), -1)

    ds_context = Context(meta_types=[MetaType.REAL] * dataOut.shape[1])
    ds_context.add_domains(dataOut)
    ds_context.parametric_types = [Conditional_Poisson] * dataOut.shape[1]

    scope = list(range(dataOut.shape[1]))
    print(np.shape(dataIn), np.shape(dataOut))
Example #25
    def __learn_spmn_structure(self, remaining_vars_data, remaining_vars_scope,
                               curr_information_set_scope, index):

        logging.info(
            f'start of new recursion in __learn_spmn_structure method of SPMN')
        logging.debug(f'remaining_vars_scope: {remaining_vars_scope}')
        logging.debug(
            f'curr_information_set_scope: {curr_information_set_scope}')

        # rest set is remaining variables excluding the variables in current information set
        rest_set_scope = [
            var_scope for var_scope in remaining_vars_scope
            if var_scope not in curr_information_set_scope
        ]

        logging.debug(f'rest_set_scope: {rest_set_scope}')

        scope_index = sum([len(x) for x in self.params.partial_order[:index]])
        next_scope_index = sum(
            [len(x) for x in self.params.partial_order[:index + 1]])

        if remaining_vars_scope == curr_information_set_scope:
            # this is last information set in partial order. Base case of recursion

            # test if current information set is a decision node
            if self.params.partial_order[index][
                    0] in self.params.decision_nodes:
                raise Exception(
                    f'last information set of partial order either contains random '
                    f'and utility variables or just a utility variable. '
                    f'This contains decision variable: {self.params.partial_order[index][0]}'
                )

            else:
                # contains just the random and utility variables

                logging.info(
                    f'at last information set of this recursive call: {curr_information_set_scope}'
                )
                ds_context_last_information_set = get_ds_context(
                    remaining_vars_data, remaining_vars_scope, self.params)

                if self.params.util_to_bin:

                    last_information_set_spn = learn_parametric(
                        remaining_vars_data,
                        ds_context_last_information_set,
                        min_instances_slice=20,
                        initial_scope=remaining_vars_scope)

                else:

                    last_information_set_spn = learn_mspn_for_spmn(
                        remaining_vars_data,
                        ds_context_last_information_set,
                        min_instances_slice=20,
                        initial_scope=remaining_vars_scope)

            logging.info(f'created spn at last information set')
            return last_information_set_spn

        # test for decision node. test if current information set is a decision node
        elif self.params.partial_order[index][0] in self.params.decision_nodes:

            decision_node = self.params.partial_order[index][0]

            logging.info(f'Encountered Decision Node: {decision_node}')

            # cluster the data from remaining variables w.r.t values of decision node
            clusters_on_next_remaining_vars, dec_vals = split_on_decision_node(
                remaining_vars_data)

            decision_node_children_spns = []
            index += 1

            next_information_set_scope = np.array(
                range(next_scope_index, next_scope_index +
                      len(self.params.partial_order[index]))).tolist()

            next_remaining_vars_scope = rest_set_scope
            self.set_next_operation('Any')

            logging.info(f'split clusters based on decision node values')
            for cluster_on_next_remaining_vars in clusters_on_next_remaining_vars:

                decision_node_children_spns.append(
                    self.__learn_spmn_structure(cluster_on_next_remaining_vars,
                                                next_remaining_vars_scope,
                                                next_information_set_scope,
                                                index))

            decision_node_spn_branch = Max(
                dec_idx=scope_index,
                dec_values=dec_vals,
                children=decision_node_children_spns,
                feature_name=decision_node)

            assign_ids(decision_node_spn_branch)
            rebuild_scopes_bottom_up(decision_node_spn_branch)
            logging.info(f'created decision node')
            return decision_node_spn_branch

        # testing for independence
        else:

            curr_op = self.get_curr_operation()
            logging.debug(
                f'curr_op at prod node (independence test): {curr_op}')

            if curr_op != 'Sum':  # fails if correlated variable set found in previous recursive call.
                # Without this condition code keeps looping at this stage

                ds_context = get_ds_context(remaining_vars_data,
                                            remaining_vars_scope, self.params)

                split_cols = get_split_cols_RDC_py()
                data_slices_prod = split_cols(remaining_vars_data, ds_context,
                                              remaining_vars_scope)

                logging.debug(
                    f'{len(data_slices_prod)} slices found at data_slices_prod: '
                )

                prod_children = []
                next_remaining_vars_scope = []
                independent_vars_scope = []

                for correlated_var_set_cluster, correlated_var_set_scope, weight in data_slices_prod:

                    if any(var_scope in correlated_var_set_scope
                           for var_scope in rest_set_scope):

                        next_remaining_vars_scope.extend(
                            correlated_var_set_scope)

                    else:
                        # this variable set of current information set is
                        # not correlated to any variable in the rest set

                        logging.info(
                            f'independent variable set found: {correlated_var_set_scope}'
                        )

                        ds_context_prod = get_ds_context(
                            correlated_var_set_cluster,
                            correlated_var_set_scope, self.params)

                        if self.params.util_to_bin:

                            independent_var_set_prod_child = learn_parametric(
                                correlated_var_set_cluster,
                                ds_context_prod,
                                min_instances_slice=20,
                                initial_scope=correlated_var_set_scope)

                        else:

                            independent_var_set_prod_child = learn_mspn_for_spmn(
                                correlated_var_set_cluster,
                                ds_context_prod,
                                min_instances_slice=20,
                                initial_scope=correlated_var_set_scope)
                        independent_vars_scope.extend(correlated_var_set_scope)
                        prod_children.append(independent_var_set_prod_child)

                logging.info(
                    f'correlated variables over entire remaining variables '
                    f'at prod, passed for next recursion: '
                    f'{next_remaining_vars_scope}')

                # check if all variables in current information set are consumed
                if all(var_scope in independent_vars_scope
                       for var_scope in curr_information_set_scope):

                    index += 1
                    next_information_set_scope = np.array(
                        range(
                            next_scope_index, next_scope_index +
                            len(self.params.partial_order[index]))).tolist()

                    # since current information set is totally consumed
                    next_remaining_vars_scope = rest_set_scope

                else:
                    # some variables in current information set still remain
                    index = index

                    next_information_set_scope = set(
                        curr_information_set_scope) - set(
                            independent_vars_scope)
                    next_remaining_vars_scope = next_information_set_scope | set(
                        rest_set_scope)

                    # convert unordered sets of scope to sorted lists to keep in sync with partial order
                    next_information_set_scope = sorted(
                        list(next_information_set_scope))
                    next_remaining_vars_scope = sorted(
                        list(next_remaining_vars_scope))

                self.set_next_operation('Sum')

                next_remaining_vars_data = column_slice_data_by_scope(
                    remaining_vars_data, remaining_vars_scope,
                    next_remaining_vars_scope)

                logging.info(
                    f'independence test completed for current information set {curr_information_set_scope} '
                    f'and rest set {rest_set_scope} ')

                remaining_vars_prod_child = self.__learn_spmn_structure(
                    next_remaining_vars_data, next_remaining_vars_scope,
                    next_information_set_scope, index)

                prod_children.append(remaining_vars_prod_child)

                product_node = Product(children=prod_children)
                assign_ids(product_node)
                rebuild_scopes_bottom_up(product_node)

                logging.info(f'created product node')
                return product_node

            # Cluster the data
            else:

                curr_op = self.get_curr_operation()
                logging.debug(f'curr_op at sum node (cluster test): {curr_op}')

                split_rows = get_split_rows_KMeans()  # from SPMNHelper.py

                if self.cluster_by_curr_information_set:

                    curr_information_set_data = column_slice_data_by_scope(
                        remaining_vars_data, remaining_vars_scope,
                        curr_information_set_scope)

                    ds_context_sum = get_ds_context(
                        curr_information_set_data, curr_information_set_scope,
                        self.params)
                    data_slices_sum, km_model = split_rows(
                        curr_information_set_data, ds_context_sum,
                        curr_information_set_scope)

                    logging.info(
                        f'split clusters based on current information set {curr_information_set_scope}'
                    )

                else:
                    # cluster on whole remaining variables
                    ds_context_sum = get_ds_context(remaining_vars_data,
                                                    remaining_vars_scope,
                                                    self.params)
                    data_slices_sum, km_model = split_rows(
                        remaining_vars_data, ds_context_sum,
                        remaining_vars_scope)

                    logging.info(
                        f'split clusters based on whole remaining variables {remaining_vars_scope}'
                    )

                sum_node_children = []
                weights = []
                index = index
                logging.debug(
                    f'{len(data_slices_sum)} clusters found at data_slices_sum'
                )

                cluster_num = 0
                labels_array = km_model.labels_
                logging.debug(
                    f'cluster labels of rows: {labels_array} used to cluster data on '
                    f'total remaining variables {remaining_vars_scope}')

                for cluster, scope, weight in data_slices_sum:

                    self.set_next_operation("Prod")

                    # cluster whole remaining variables based on clusters formed.
                    # below methods are useful if clusters were formed on just the current information set

                    cluster_indices = get_row_indices_of_cluster(
                        labels_array, cluster_num)
                    cluster_on_remaining_vars = row_slice_data_by_indices(
                        remaining_vars_data, cluster_indices)

                    # logging.debug(np.array_equal(cluster_on_remaining_vars, cluster ))

                    sum_node_children.append(
                        self.__learn_spmn_structure(
                            cluster_on_remaining_vars, remaining_vars_scope,
                            curr_information_set_scope, index))

                    weights.append(weight)

                    cluster_num += 1

                sum_node = Sum(weights=weights, children=sum_node_children)

                assign_ids(sum_node)
                rebuild_scopes_bottom_up(sum_node)
                logging.info(f'created sum node')
                return sum_node
Example #26
    right = images2d[:, :, middle:].reshape((images.shape[0], -1))

    # format: R|L
    conditional_training_data = np.concatenate((right.reshape(px, -1), left.reshape(px, -1)), axis=1)

    # In left, OUT right
    file_cache_path = "/tmp/cspn.bin"
    if not os.path.isfile(file_cache_path):
        spn_training_data = left.reshape(px, -1)
        spn_training_data = np.repeat(spn_training_data, 10, axis=0)
        ds_context = Context(parametric_types=[Bernoulli] * left.shape[1]).add_domains(spn_training_data)
        spn = learn_parametric(spn_training_data, ds_context, min_instances_slice=1)

        ds_context = Context(parametric_types=[Conditional_Bernoulli] * right.shape[1]).add_domains(right)
        scope = list(range(right.shape[1]))
        cspn = learn_conditional(conditional_training_data, ds_context, scope, min_instances_slice=60000000)
        with open(file_cache_path, 'wb') as f:
            pickle.dump((cspn, spn), f, pickle.HIGHEST_PROTOCOL)

    with open(file_cache_path, 'rb') as f:
        cspn, spn = pickle.load(f)


    def conditional_input_to_LR(input_images_in_rl):
        # format L|R
        images_to_lr = np.concatenate(
            (input_images_in_rl[:, input_images_in_rl.shape[1] // 2:].reshape(input_images_in_rl.shape[0], px, -1),
Example #27
def learn_spmn_structure(train_data, index, scope_index, params):

    curr_var_set = params.partial_order[index]

    if params.partial_order[index][0] in params.decision_nodes:

        decision_node = params.partial_order[index][0]
        cl, dec_vals = split_on_decision_node(train_data, curr_var_set)
        spn0 = []
        index = index + 1
        set_next_operation("None")

        for c in cl:

            if index < len(params.partial_order):

                spn0.append(learn_spmn_structure(c, index, scope_index, params))
                spn = Max(dec_values=dec_vals, children=spn0, feature_name=decision_node)

            else:
                spn = Max(dec_values=dec_vals, children=None, feature_name=decision_node)

        assign_ids(spn)
        rebuild_scopes_bottom_up(spn)
        return spn



    else:

        curr_train_data_prod, curr_train_data = get_curr_train_data_prod(train_data, curr_var_set)

        split_cols = get_split_cols_RDC_py()
        scope_prod = get_scope_prod(curr_train_data_prod, scope_index, params.feature_names)

        ds_context_prod = get_ds_context_prod(curr_train_data_prod, scope_prod, index, scope_index, params)

        data_slices_prod = split_cols(curr_train_data_prod, ds_context_prod, scope_prod)
        curr_op = get_next_operation()


        if len(data_slices_prod)>1 or curr_op == "Prod" or index == len(params.partial_order) :
            set_next_operation("Sum")

            if params.util_to_bin :

                spn0 = learn_parametric(curr_train_data_prod, ds_context_prod, min_instances_slice=20, initial_scope= scope_prod)

            else:

                spn0 = learn_mspn(curr_train_data_prod, ds_context_prod, min_instances_slice=20,
                                    initial_scope=scope_prod)

            index = index + 1
            scope_index = scope_index +curr_train_data_prod.shape[1]

            if index < len(params.partial_order):

                spn1 = learn_spmn_structure(curr_train_data, index, scope_index, params)
                spn = Product(children=[spn0, spn1])

                assign_ids(spn)
                rebuild_scopes_bottom_up(spn)

            else:
                spn = spn0
                assign_ids(spn)
                rebuild_scopes_bottom_up(spn)

        else:

            split_rows = get_split_rows_KMeans()
            scope_sum = list(range(train_data.shape[1]))

            ds_context_sum = get_ds_context_sum(train_data, scope_sum, index, scope_index, params)
            data_slices_sum = split_rows(train_data, ds_context_sum, scope_sum)

            spn0 = []
            weights = []
            index = index

            if index < len(params.partial_order):

                for cl, scop, weight in data_slices_sum:

                    set_next_operation("Prod")
                    spn0.append(learn_spmn_structure(cl, index, scope_index, params))
                    weights.append(weight)

                spn = Sum(weights=weights, children=spn0)
                assign_ids(spn)
                rebuild_scopes_bottom_up(spn)

        assign_ids(spn)
        rebuild_scopes_bottom_up(spn)
        return spn
Example #28
    features = ["birthyear", "gender", "party"]
    co_keys = [
        "corona", "covid", "pandem", "vaccin", "Corona", "Covid", "Pandem",
        "Vaccin", "impf", "Impf", "Maske", "mask", "Lockdown", "infiz",
        "Infektio"
    ]
    fl_keys = [
        "Migrat", "Asyl", "Flücht", "Schlepper", "Seenot", "Einwanderung",
        "asyl", "flücht", "schlepp", "seenot", "einwander"
    ]
    is_keys = ["Islamis", "islamis", "Terror", "terror"]
    keywords = [co_keys]
    train_data = get_features(memberlist, features, tweet_list, keywords)
    spn = build_spn(train_data)
    print(cross_validate(train_data, 5, label=2))
    #print(sample_instances(spn, np.array([0, np.nan] * 50).reshape(-1, 2), RandomState(123)))
    # tweet_scraping(tweet_list, api)
    ex = np.array([1976., 1., 4., 0.3]).reshape(-1, 4)
    ex2 = np.array([4., 0.2]).reshape(-1, 2)
    ds_context = Context(
        parametric_types=[Gaussian, Categorical, Categorical, Gaussian
                          ]).add_domains(train_data)
    spn2 = learn_parametric(train_data, ds_context, min_instances_slice=20)

    spn_marg = marginalize(spn, [2, 3])
    ll = log_likelihood(spn, ex)
    ll2 = log_likelihood(spn2, ex)
    llm = log_likelihood(spn_marg, ex)
    print(ll, np.exp(ll))
    print(ll2, np.exp(ll2))
    print(llm, np.exp(llm))