Example #1
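The snippets below are usage examples of SPN.LearnStructure from the tfspn package. Unless a snippet shows its own imports, it assumes import numpy and from tfspn.SPN import SPN, Splitting (as in Example #16), plus project helpers such as Stats, Chrono, getArchetypes, and the mlutils.datasets loaders. Example #1 learns an SPN over the diabetes dataset with k-means row splits and RDC column splits, then prints the summed train log-likelihood.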
def test2():
    numpy.random.seed(42)
    dsname, data, labels, classes, families = getDiabetes()

    labels = [l for l in labels]

    print(data.shape)

    print(data)
    # feature types for the nine diabetes columns
    featureTypes = [
        'continuous', 'categorical', 'continuous', 'continuous', 'continuous',
        'continuous', 'continuous', 'continuous', 'continuous'
    ]
    # alternatives: families[0] = 'bernoulli', or passing featureNames=labels,
    # domains=domains, families=families with
    # col_split_method=Splitting.IndependenceTest(alpha=0.00001)
    spn = SPN.LearnStructure(data,
                             featureTypes=featureTypes,
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.RDCTest(threshold=0.3),
                             min_instances_slice=50,
                             cluster_first=False)

    print(spn)
    # print(numpy.unique(data))

    ll = spn.root.eval(data)

    print("Sum LL", numpy.sum(ll))
Example #2
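Example #2 evaluates an SPN on archetype mixture weights with 10-fold cross-validation, timing each fold and recording the mean test log-likelihood via Stats.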
def test1(data, features):
    
    data = data[:, 1:20]
    features = features[0:data.shape[1]]

    arcs, mixt = getArchetypes(data, 3)
    
    nrfolds = 10

    # dsname is assumed to be defined in the enclosing scope; it names the
    # stats file written below
    stats = Stats(name=dsname)
    
    for train, test, i in kfolded(mixt, nrfolds):
        c = Chrono().start()
        spn = SPN.LearnStructure(train,
                                 featureTypes=["continuous"] * train.shape[1],
                                 row_split_method=Splitting.KmeansRows(),
                                 col_split_method=Splitting.RDCTest(threshold=0.3),
                                 min_instances_slice=100)
        c.end()
        
        spn.root.validate()
        ll = numpy.mean(spn.root.eval(test))
        
        print(ll)
        
        stats.add("HSPN", Stats.LOG_LIKELIHOOD, ll)
        stats.add("HSPN", Stats.TIME, c.elapsed())
        
        stats.save("stats_" + dsname + ".json")
    
    print(arcs)
Example #3
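Example #3 fits a univariate SPN with categorical leaves to Poisson-distributed counts, evaluates the learned density at the integers 0 through 12, and prints a histogram of the data for comparison.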
def test1():
    numpy.random.seed(42)
    data = numpy.random.poisson(5, 1000).reshape(1000, 1)

    for i in numpy.unique(data):
        print(i, numpy.sum(data == i))

    featureTypes = ["discrete"]
    featureTypes = ["categorical"]

    spn = SPN.LearnStructure(data,
                             featureTypes=featureTypes,
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.IndependenceTest(),
                             min_instances_slice=100)

    print(spn)
    print(numpy.unique(data))
    evdata = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    print(evdata)

    ll = spn.root.eval(numpy.asarray(evdata).reshape(len(evdata), 1))

    print("Probs", numpy.exp(ll))
    print("Sum LL", numpy.sum(ll))
    print(numpy.histogram(data, bins="auto", density=True))
Example #4
    def learn():
        # data is taken from the enclosing scope
        spn = SPN.LearnStructure(data,
                                 featureTypes=["discrete"] * data.shape[1],
                                 row_split_method=Splitting.Gower(),
                                 col_split_method=Splitting.RDCTest(threshold=0.3,
                                                                    linear=True),
                                 min_instances_slice=200)
        return spn
Example #5
def learn(data):
    # featureTypes and domains are taken from the enclosing scope
    spn = SPN.LearnStructure(data,
                             featureTypes=featureTypes,
                             row_split_method=Splitting.Gower(),
                             col_split_method=Splitting.RDCTest(threshold=0.3),
                             domains=domains,
                             alpha=1,
                             min_instances_slice=50)
    return spn
Example #6
def learn(data, min_instances_slice, feature_names, domains, featureTypes):
    spn = SPN.LearnStructure(data,
                             featureTypes=featureTypes,
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.RDCTest(threshold=0.1,
                                                                linear=True),
                             featureNames=feature_names,
                             domains=domains,
                             min_instances_slice=min_instances_slice)
    return spn
Example #7
def learn(data):
    # featureTypes and domains are taken from the enclosing scope
    spn = SPN.LearnStructure(data,
                             featureTypes=featureTypes,
                             row_split_method=Splitting.Gower(),
                             col_split_method=Splitting.RDCTest(threshold=0.3),
                             domains=domains,
                             alpha=0.1,
                             families=['histogram'] * data.shape[1],
                             # alternatively: int(data.shape[0] * 0.01)
                             min_instances_slice=200)
    return spn
Example #8
def test2(data, features):
    arc, mixt = getArchetypes(data, 3)

    print(mixt)

    spn = SPN.LearnStructure(mixt,
                             featureTypes=["continuous"] * mixt.shape[1],
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.RDCTest(threshold=0.3),
                             min_instances_slice=100)
Example #9
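Example #9 is a thin reusable wrapper: leaf families, domains, the smoothing parameter alpha, and the minimum slice size are passed straight through to SPN.LearnStructure, with KmeansRDCRows row splits and RDC column splits fixed.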
def learn(data,
          featureTypes,
          families,
          domains,
          min_instances_slice,
          alpha=0.1):
    spn = SPN.LearnStructure(data,
                             alpha=alpha,
                             featureTypes=featureTypes,
                             row_split_method=Splitting.KmeansRDCRows(),
                             col_split_method=Splitting.RDCTest(threshold=0.3),
                             domains=domains,
                             families=families,
                             min_instances_slice=min_instances_slice)
    return spn
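
A minimal call sketch for this wrapper; the toy data and parameter values below are illustrative assumptions, not from the original source:

import numpy
from tfspn.SPN import SPN, Splitting

numpy.random.seed(0)
data = numpy.random.poisson(5, (1000, 2))  # two columns of toy count data
domains = [numpy.unique(data[:, i]) for i in range(data.shape[1])]

spn = learn(data,
            featureTypes=["discrete", "discrete"],
            families=["poisson", "poisson"],
            domains=domains,
            min_instances_slice=100)
print("Sum LL", numpy.sum(spn.root.eval(data)))  # total train log-likelihood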
Example #10
    def estimate_density(self, training_data, validation_data=None):
        """Fit a MSPN on the training data. The variable validation_data is
        never used."""
        feature_types = []
        feature_names = []
        families = []
        for feat, str_type in training_data.features:
            feature_types.append(str_type)
            feature_names.append(feat.symbol_name())
            if 'leaf' in self.learner_args:
                families.append(self.learner_args['leaf'])
            else:
                families.append(MSPNLearner.SPN_feat_fams[feat.symbol_type()])

        if 'row_split' in self.learner_args:
            if self.learner_args['row_split'] == 'gower':
                row_split_method = Splitting.Gower(n_clusters=2)
            elif self.learner_args['row_split'] == 'rdc-kmeans':
                row_split_method = Splitting.KmeansRDCRows(n_clusters=2,
                                                           k=20,
                                                           OHE=1)
            else:
                raise NotImplementedError()

        else:
            row_split_method = Splitting.KmeansRDCRows(n_clusters=2,
                                                       k=20,
                                                       OHE=1)

        col_split_method = Splitting.RDCTest(threshold=0.1, OHE=1, linear=1)

        rand_seed = self.learner_args['seed']
        mspnargs = {
            k: v
            for k, v in self.learner_args.items()
            if k not in ['seed', 'leaf', 'row_split']
        }

        # let MSPNs sort this out
        families = None
        self.spn = SPN.LearnStructure(asarray(training_data.data),
                                      feature_types,
                                      families=families,
                                      featureNames=feature_names,
                                      rand_seed=rand_seed,
                                      row_split_method=row_split_method,
                                      col_split_method=col_split_method,
                                      **mspnargs)
Example #11
def learn(data,
          featureTypes,
          families,
          domains,
          feature_names,
          min_instances_slice,
          prior_weight=0.0):
    return SPN.LearnStructure(
        data,
        prior_weight=prior_weight,
        featureTypes=featureTypes,
        row_split_method=Splitting.KmeansRDCRows(),
        col_split_method=Splitting.RDCTest(threshold=0.3),
        domains=domains,
        families=families,
        featureNames=feature_names,
        min_instances_slice=min_instances_slice)
Example #12
def learn(data,
          featureTypes,
          families,
          domains,
          feature_names,
          min_instances_slice,
          row_split_method,
          col_split_method,
          prior_weight=0.0):
    return SPN.LearnStructure(data,
                              prior_weight=prior_weight,
                              featureTypes=featureTypes,
                              row_split_method=row_split_method,
                              col_split_method=col_split_method,
                              domains=domains,
                              families=families,
                              featureNames=feature_names,
                              min_instances_slice=min_instances_slice)
Example #13
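Example #13 turns the SPN into a generative MNIST classifier: pixels are continuous features and the class label is appended as a categorical column; at test time the label column is swept over all classes and the argmax of the joint log-likelihood is the MAP prediction.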
def test8():
    # TF1-era MNIST loader; accuracy_score is sklearn.metrics.accuracy_score
    from tensorflow.examples.tutorials.mnist import input_data
    from sklearn.metrics import accuracy_score
    mnist = input_data.read_data_sets("MNIST_data/", one_hot=False)
    data, target = mnist.train.images, mnist.train.labels

    featureTypes = ["continuous"] * data.shape[1] + ["categorical"]

    featureNames = ["pixel"] * data.shape[1] + ["label"]

    data = numpy.hstack((data, target.reshape(data.shape[0], 1)))
    print(featureTypes)
    print(data.shape)

    spn = SPN.LearnStructure(data,
                             featureTypes=featureTypes,
                             featureNames=featureNames,
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.RDCTest(threshold=0.4),
                             min_instances_slice=500,
                             cluster_first=True)
    # RDCTestOHEpy

    print("learned")

    spn.root.validate()

    data, target = mnist.test.images, mnist.test.labels

    data = numpy.hstack((data, target.reshape(data.shape[0], 1)))

    classes = numpy.unique(target)
    results = numpy.zeros((data.shape[0], len(classes)))

    print("testing")
    # print(spn)
    # sweep the label column over each class; the MAP prediction is the
    # argmax of the joint log-likelihood
    for c in classes:
        data[:, -1] = c
        results[:, c] = spn.root.eval(data)

    print("done")

    predictions = numpy.argmax(results, axis=1)

    print('MAP accuracy : ', accuracy_score(target, predictions))
Example #14
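Example #14 learns an SPN directly from a CSV file: feature types and leaf families are inferred either from an optional type row ('cat', 'con', 'dis') or from the pandas dtypes, categoricals are label-encoded, datetimes become day offsets, and the function returns the SPN together with a data dictionary describing each feature.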
# requires pandas as pd, numpy as np, csv, and sklearn's LabelEncoder in scope
def learn_spn(dataset="data/iris",
              precision=25,
              independence=0.1,
              header=0,
              date=None,
              isotonic=False,
              histogram=True,
              types=False):
    skiprows = [1] if types else []
    df = pd.read_csv(dataset,
                     delimiter=",",
                     header=header,
                     parse_dates=date,
                     skiprows=skiprows)
    df = df.dropna(axis=0, how='any')
    featureNames = df.columns.values.tolist() if header == 0 else [
        "X_{}".format(i) for i in range(len(df.columns))
    ]

    dtypes = df.dtypes

    if types:
        featureTypes = []
        families = []
        with open(dataset, 'r') as csvfile:
            csvreader = csv.reader(csvfile, delimiter=',', quotechar='|')
            next(csvreader)            # skip the header row
            _types = next(csvreader)   # the second row holds the type tags
        for featureType in _types:
            print(featureType)
            if featureType == 'cat':
                featureTypes.append('categorical')
                if histogram:
                    families.append('histogram')
                elif isotonic:
                    families.append('isotonic')
                else:
                    families.append('piecewise')
            elif featureType == 'con':
                featureTypes.append('continuous')
                families.append('piecewise' if not isotonic else 'isotonic')
            elif featureType == 'dis':
                featureTypes.append('discrete')
                families.append('piecewise' if not isotonic else 'isotonic')
            else:
                featureTypes.append('unknown')
                families.append('piecewise' if not isotonic else 'isotonic')

    def to_featureTypes(types):
        featureTypes = []
        families = []
        for featureType in types:
            if featureType.kind == 'O':
                featureTypes.append('categorical')
                if histogram:
                    families.append('histogram')
                elif isotonic:
                    families.append('isotonic')
                else:
                    families.append('piecewise')
            elif featureType.kind == 'f':
                featureTypes.append('continuous')
                families.append('piecewise' if not isotonic else 'isotonic')
            elif featureType.kind == 'i':
                featureTypes.append('discrete')
                families.append('piecewise' if not isotonic else 'isotonic')
            else:
                featureTypes.append('unknown')
                families.append('piecewise' if not isotonic else 'isotonic')
        return featureTypes, families

    if not types:
        featureTypes, families = to_featureTypes(dtypes)

    data_dictionary = {
        'features': [{
            "name": name,
            "family": family,
            "type": typ,
            'pandas_type': dtypes[i]
        } for i, (name, family, typ) in enumerate(
            zip(featureNames, families, featureTypes))],
        'num_entries': len(df)
    }

    # print(df.info())

    idx = df.columns

    for col, name in enumerate(idx):
        if featureTypes[col] == 'categorical':
            lb = LabelEncoder()
            data_dictionary['features'][col]["encoder"] = lb
            df[name] = df[name].astype('category')
            df[name] = lb.fit_transform(df[name])
            data_dictionary['features'][col]["values"] = lb.transform(
                lb.classes_)
        if dtypes[col].kind == 'M':
            # datetimes become days since the earliest entry
            df[name] = (df[name] - df[name].min()) / np.timedelta64(1, 'D')

    # print(df.head())
    data = np.array(df)

    # print(featureTypes)
    spn = SPN.LearnStructure(
        data,
        featureTypes=featureTypes,
        featureNames=featureNames,
        min_instances_slice=precision,
        families=families,
        row_split_method=Splitting.KmeansRDCRows(),
        col_split_method=Splitting.RDCTest(threshold=independence))

    spn.name = dataset
    return spn, data_dictionary
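
A hypothetical invocation, assuming an iris-style CSV at the default path:

spn, data_dictionary = learn_spn(dataset="data/iris", precision=25,
                                 independence=0.1)
print(spn.name, data_dictionary['num_entries'])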
Example #15
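Example #15 assembles an SPN by hand: a SumNode mixes two ProductNodes over Gaussian, Poisson, and Bernoulli leaves, and the model is then compiled into a TensorFlow 1.x graph with an Adam optimizer on the joint-likelihood cost.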
    traindata = numpy.c_[train_x, train_y]
    testdata = numpy.c_[test_x, test_y]
    #testdata = testdata[0:5,:]
    
    gn1 = GaussianNode("gn1", 0, "X0", 1.0, 1.0)
    pn1 = PoissonNode("pn1", 1, "X1", 1.0)
    bn1 = BernoulliNode("bn1", 2, "X2", 0.0)
    p1 = ProductNode("p1", gn1, pn1, bn1)
    
    gn2 = GaussianNode("gn2", 0, "X0", 10.0, 1.0)
    pn2 = PoissonNode("pn2", 1, "X1", 10.0)
    bn2 = BernoulliNode("bn2", 2, "X2", 1.0)
    p2 = ProductNode("p1", gn2, pn2, bn2)

    s1 = SumNode("s1", [0.5, 0.5], p1, p2)
    spn = SPN()
    spn.root = s1
    
    c = Chrono().start()
    
    with tf.device("/cpu:0"):
        tf.reset_default_graph()
                    
        with tf.name_scope('input'):
            X = tf.placeholder(tf.float64, [None, 3], name="x")
        
        with tf.name_scope('SPN') as scope:
            spn.root.initTf(X)
            costf = JointCost(spn.root)
        
        train_op = tf.train.AdamOptimizer().minimize(costf)
Example #16
'''
@author: molina
'''
import numpy

from tfspn.SPN import SPN, Splitting
import tensorflow as tf

if __name__ == '__main__':

    gen = numpy.random.poisson(5, 1000)

    data = numpy.transpose(numpy.vstack((gen, gen)))

    spn = SPN.LearnStructure(data,
                             min_instances_slice=200,
                             families=["poisson", "poisson"],
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.IndependenceTest())

    #    THIS PRODUCES THE FOLLOWING SPN:
    #
    #     SumNode_0 SumNode(0.154*ProductNode_3, 0.188*ProductNode_6, 0.158*ProductNode_10, 0.076*ProductNode_13, 0.13999999999999999*ProductNode_18, 0.176*ProductNode_21, 0.108*ProductNode_24){
    #     ProductNode_3 ProductNode(PoissonNode_4, PoissonNode_5){
    #         PoissonNode_4 P(X_0_|λ=6.0)
    #         PoissonNode_5 P(X_1_|λ=6.0)
    #         }
    #     ProductNode_6 ProductNode(PoissonNode_7, PoissonNode_8){
    #         PoissonNode_7 P(X_0_|λ=5.0)
    #         PoissonNode_8 P(X_1_|λ=5.0)
    #         }
    #     ProductNode_10 ProductNode(PoissonNode_11, PoissonNode_12){
Example #17
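Example #17 is an evaluation script fragment: it loads an MLC-format dataset split, unpickles a previously learned SPN, and logs node-count statistics of the structure.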
#
# load augmented mnist in MLC format
dataset_name = args.dataset
logging.info('Looking for dataset {} ...in dir {}'.format(
    dataset_name, args.data_dir))
(train, valid,
 test), feature_names, feature_types, domains = loadMLC(dataset_name,
                                                        base_path='',
                                                        data_dir=args.data_dir)
logging.info('Loaded\n\ttrain:\t{}\n\tvalid:\t{}\n\ttest:\t{}'.format(
    train.shape, valid.shape, test.shape))

load_start_t = perf_counter()
# spn = SPN.FromFile(args.spn)
spn = SPN.from_pickle(args.spn)
# spn = None
load_end_t = perf_counter()
# print(spn)
logging.info('spn loaded from pickle in {} secs'.format(load_end_t -
                                                        load_start_t))

logging.info('\n\nstructure stats:')
n_nodes = spn.n_nodes()
logging.info('# nodes {}'.format(n_nodes))
n_sum_nodes = spn.n_sum_nodes()
logging.info('\t# sum nodes {}'.format(n_sum_nodes))
n_prod_nodes = spn.n_prod_nodes()
logging.info('\t# prod nodes {}'.format(n_prod_nodes))
n_leaves = spn.n_leaves()
logging.info('\t# leaf nodes {}'.format(n_leaves))
Example #18
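Example #18 samples a two-component mixture (Gaussians with means 100 and 200, chosen by a binary indicator), learns an SPN, and renders both conditional densities as a 3D polygon plot.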
def test5():
    numpy.random.seed(42)

    data = numpy.zeros((5000, 2))

    idx = numpy.random.choice(data.shape[0],
                              int(data.shape[0] / 2),
                              replace=False)

    data[idx, 1] = 1

    idx0 = data[:, 1] == 0
    idx1 = data[:, 1] == 1

    data[idx0, 0] = numpy.random.normal(100, 30, numpy.sum(idx0))

    data[idx1, 0] = numpy.random.normal(200, 30, numpy.sum(idx1))

    print(data)

    featureNames = ["Gaussian", "Categorical"]
    featureTypes = ["continuous", "discrete"]

    # alternative column splitters from the original:
    # Splitting.IndependenceTest(alpha=0.01) or Splitting.RDCTest()
    spn = SPN.LearnStructure(data,
                             featureTypes=featureTypes,
                             featureNames=featureNames,
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.RDCTestOHEpy(),
                             min_instances_slice=500,
                             cluster_first=True)

    spn.root.validate()

    from mpl_toolkits.mplot3d import Axes3D  # registers the 3d projection
    from matplotlib.collections import PolyCollection
    from matplotlib.colors import colorConverter
    import matplotlib.pyplot as plt
    import numpy as np

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    cc = lambda arg: colorConverter.to_rgba(arg, alpha=0.6)

    xs = np.arange(0, 300, 0.5)
    verts = []
    zs = [0, 1]

    maxys = 0
    for z in zs:
        testdata = numpy.zeros((len(xs), len(zs)))
        testdata[:, 0] = xs
        testdata[:, 1] = z

        ys = numpy.zeros_like(xs)

        ys[:] = numpy.exp(spn.root.eval(testdata))

        maxys = max(maxys, numpy.max(ys))

        ys[0], ys[-1] = 0, 0
        verts.append(list(zip(xs, ys)))

    poly = PolyCollection(verts, facecolors=[cc('r'), cc('g')])
    poly.set_alpha(0.7)
    ax.add_collection3d(poly, zs=zs, zdir='y')

    ax.set_xlabel('X')
    ax.set_xlim3d(0, 300)
    ax.set_ylabel('Y')
    ax.set_ylim3d(-1, 1)
    ax.set_zlabel('Z')
    ax.set_zlim3d(0, maxys)

    plt.show()

    ll = spn.root.eval(data)

    print("Sum LL", numpy.sum(ll))
Example #19
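Example #19 extends this to one continuous column and two label columns whose parity selects the Gaussian mean; each learned conditional density is drawn in its own subplot.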
def test6():
    numpy.random.seed(42)

    datablocks = []

    yd = [0, 1, 2, 3]
    xd = [0, 1]

    for x in xd:
        for y in yd:
            block = numpy.zeros((2000, 3))
            block[:, 1] = x
            block[:, 2] = y
            # the Gaussian mean alternates with the parity of x + y
            if (x + y) % 2 == 1:
                block[:, 0] = numpy.random.normal(200, 30, block.shape[0])
            else:
                block[:, 0] = numpy.random.normal(100, 30, block.shape[0])

            datablocks.append(block)

    data = numpy.vstack(datablocks)

    print(data.shape)

    featureNames = ["Gaussian", "Categorical", "Discrete"]
    featureTypes = ["continuous", "categorical", "discrete"]

    # alternative column splitters from the original:
    # Splitting.IndependenceTest(alpha=0.01) or Splitting.RDCTest()
    spn = SPN.LearnStructure(data,
                             featureTypes=featureTypes,
                             featureNames=featureNames,
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.RDCTestOHEpy(),
                             min_instances_slice=50,
                             cluster_first=True)

    spn.root.validate()

    from matplotlib.collections import PolyCollection
    from matplotlib.colors import colorConverter
    import matplotlib.pyplot as plt
    import matplotlib.gridspec as gridspec

    gs = gridspec.GridSpec(len(xd), len(yd))

    fig = plt.figure(figsize=(8, 8))

    xall = numpy.arange(0, 300, 0.5)
    i = 0
    for x in xd:
        for y in yd:
            testdata = numpy.zeros((len(xall), 3))
            testdata[:, 0] = xall
            testdata[:, 1] = x
            testdata[:, 2] = y

            pbs = numpy.zeros_like(xall)

            pbs[:] = numpy.exp(spn.root.eval(testdata))

            ax = plt.Subplot(fig, gs[i])
            i += 1

            ax.set_title('%s %s' % (x, y))
            ax.plot(xall, pbs, 'r--')

            fig.add_subplot(ax)

    plt.show()

    ll = spn.root.eval(data)

    print("Sum LL", numpy.sum(ll))
Example #20
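Example #20 loads the bank marketing CSV, infers per-column types and domains by attempted numeric conversion, log-transforms the heavy-tailed balance column, learns an SPN over the two selected columns (age and balance), saves the structure as a PDF, and plots the leaf densities of each mixture component.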
def test7():
    numpy.random.seed(42)

    # read everything as byte strings first, then strip whitespace
    D = numpy.loadtxt("bank.csv", delimiter=";", skiprows=0, dtype="S")
    D = numpy.char.strip(D)

    featureNames = [f.decode() for f in D[0, :]]  # decode the header bytes
    D = D[1:, :]
    featureTypes = [
        "discrete",
        "categorical",
        "categorical",
        "categorical",
        "continuous",
        "continuous",
        "categorical",
        "categorical",
        "categorical",
        "discrete",
        "categorical",
        "discrete",
        "categorical",
        "continuous",
        "discrete",
        "categorical",
        "categorical",
    ]
    print(len(featureTypes))
    print(len(featureNames))

    def isinteger(x):
        return numpy.all(numpy.equal(numpy.mod(x, 1), 0))

    cols = []
    types = []
    domains = []

    index = [0, 5]

    D = D[:, index]

    for col in range(D.shape[1]):
        b, c = numpy.unique(D[:, col], return_inverse=True)

        try:
            # numeric column: integers become discrete, floats continuous
            if isinteger(b.astype(float)):
                cols.append(D[:, col].astype(int))
                types.append("discrete")
                domains.append(b.astype(int))
                continue

            cols.append(D[:, col].astype(float))
            types.append("continuous")
            domains.append(b.astype(float))
            continue
        except ValueError:
            # non-numeric column: categorical, using the codes from unique()
            cols.append(c)
            types.append("categorical")
            domains.append(b.astype(str))

    data = numpy.column_stack(cols)
    print(featureNames)

    print(domains)
    featureNames = [featureNames[i] for i in index]
    print(featureNames)
    print(types)

    # signed log transform tames the heavy-tailed second column (balance)
    data[:, 1] = numpy.sign(data[:, 1]) * numpy.log(numpy.abs(data[:, 1]) + 1)

    # alternative column splitters from the original:
    # Splitting.IndependenceTest(alpha=0.01) or Splitting.RDCTest()
    spn = SPN.LearnStructure(
        data,
        featureTypes=types,
        featureNames=featureNames,
        row_split_method=Splitting.KmeansRows(),
        col_split_method=Splitting.RDCTest(threshold=0.000001),
        min_instances_slice=1000,
        cluster_first=False)
    # RDCTestOHEpy

    spn.root.validate()

    print(spn)

    spn.save_pdf_graph("bank.pdf")

    ll = spn.root.eval(data)

    import matplotlib.pyplot as plt

    for i in [0, 1]:

        x = numpy.sort(data[:, i]).reshape(data.shape[0], 1)

        fig = plt.figure(figsize=(8, 8))
        x1 = numpy.zeros_like(data)
        x1[:, i] = x[:, 0]

        color_idx = numpy.linspace(0, 1, len(spn.root.children))

        for cidx, c in enumerate(spn.root.children):

            y = numpy.exp(c.children[i].eval(x1))

            plt.plot(x, y, '--', color=plt.cm.cool(color_idx[cidx]))

    plt.show()

    # print("Probs", numpy.exp(ll))
    print("Sum LL", numpy.sum(ll))
Example #21
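Example #21 sanity-checks marginal inference on the Adult dataset: the empirical probabilities pD, pSD, and pS_D = pSD / pD are compared with the same quantities computed from the learned SPN by marginalizing out all other features.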
def test3():
    numpy.random.seed(42)
    dsname, data, featureNames, featureTypes, doms = getAdult()

    doctorateVal = numpy.where(doms[2] == "Doctorate")[0][0]
    stategovVal = numpy.where(doms[1] == "State-gov")[0][0]

    print(featureNames)

    print(len(featureNames))

    print(data[0, :])
    print(data.shape)
    print(doctorateVal, stategovVal)

    pD = numpy.sum(data[:, 2] == doctorateVal) / data.shape[0]
    pSD = numpy.sum(
        numpy.logical_and(data[:, 2] == doctorateVal, data[:, 1]
                          == stategovVal)) / data.shape[0]
    pS = numpy.sum(data[:, 1] == stategovVal) / data.shape[0]

    print("pD", pD)
    print("pSD", pSD)
    pS_D = pSD / pD
    print("pS_D", pS_D)

    # alternative column splitters from the original:
    # Splitting.IndependenceTest(alpha=0.01) or Splitting.RDCTest()
    spn = SPN.LearnStructure(data,
                             featureTypes=featureTypes,
                             featureNames=featureNames,
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.RDCTest(threshold=0.3),
                             min_instances_slice=3,
                             cluster_first=True)

    spn.root.validate()

    print("SPN Learned")
    # marginalize out everything except workclass (1) and education (2)
    margSPN_SD = spn.root.marginalizeOut(
        [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
    margSPN_SD.Prune()

    print(margSPN_SD)

    dataSD = numpy.zeros_like(data[0, :]).reshape(1, data.shape[1])
    dataSD[0, 1] = stategovVal
    dataSD[0, 2] = doctorateVal
    print(dataSD)

    spnSD = (numpy.exp(margSPN_SD.eval(dataSD)))

    # marginalize out everything except education (2)
    margSPN_D = spn.root.marginalizeOut(
        [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
    margSPN_D.Prune()

    print(margSPN_D)

    dataD = numpy.zeros_like(data[0, :]).reshape(1, data.shape[1])
    dataD[0, 2] = doctorateVal
    print(dataD)

    spnD = (numpy.exp(margSPN_D.eval(dataD)))

    print("pD", pD)
    print("pS", pS)
    print("pSD", pSD)
    pS_D = pSD / pD
    print("pS_D", pS_D)

    print("spn pD", spnD)
    print("spn pSD", spnSD)
    spnS_D = spnSD / spnD
    print("spn pS_D", spnS_D)

    print("doctorateVal", doctorateVal)
    print("stategovVal", stategovVal)

    ll = spn.root.eval(data)

    # print("Probs", numpy.exp(ll))
    print("Sum LL", numpy.sum(ll))
Example #22
        spn_json_path = os.path.join(out_path, 'spn.{}.json'.format(f))

        # train = whole_data[train_index]
        # test = whole_data[test_index]

        learn_start_t = perf_counter()
        spn = SPN.LearnStructure(train,
                                 featureNames=feature_names,
                                 families=families,
                                 domains=domains,
                                 featureTypes=feature_types,
                                 min_instances_slice=args.min_inst_slice,
                                 bin_width=args.tail_width,
                                 alpha=args.alpha,
                                 isotonic=args.isotonic,
                                 pw_bootstrap=args.bootstraps,
                                 avg_pw_boostrap=args.average_bootstraps,
                                 row_split_method=row_split_method,
                                 col_split_method=col_split_method,
                                 cluster_first=cluster_first,
                                 kernel_family=args.kernel_family,
                                 kernel_bandwidth=args.kernel_bandwidth,
                                 kernel_metric=args.kernel_metric,
                                 prior_weight=args.prior_weight,
                                 rand_seed=args.seed
                                 )
        learn_end_t = perf_counter()
        learning_time = learn_end_t - learn_start_t
        logging.info('\n\n*****Spn learned in {} secs! *****'.format(learning_time))

        logging.info('Learned spn:\n{}'.format(spn))
        # spn.ToFile(spn_json_path)
Example #23
    x = data[:, 0:2]  # assumed: the first two (Gaussian) columns are features
    y = data[:, 2]    # the Bernoulli target column

    train_x, test_x, train_y, test_y = train_test_split(x,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=1337)

    traindata = numpy.c_[train_x, train_y]
    testdata = numpy.c_[test_x, test_y]

    #print(traindata)
    #print(testdata)

    spn = SPN.LearnStructure(traindata,
                             min_instances_slice=200,
                             families=["gaussian", "gaussian", "bernoulli"],
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.IndependenceTest())

    print(spn)

    predictions = predict(spn, testdata, 2)
    print('MAP accuracy : ', accuracy_score(test_y, predictions))

    c = Chrono().start()

    with tf.device("/cpu:0"):
        tf.reset_default_graph()
Example #24
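Example #24 learns an SPN over grayscale CIFAR-10 pixels plus the label (min_instances_slice is set so high that the structure stays very shallow), then predicts labels via mpe_eval after marking the label column as missing.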
import numpy

from mlutils.datasets import getCIFAR10
from tfspn.SPN import SPN, Splitting

dsname, train, test, labels_train, labels_test = getCIFAR10(grayscale=True)

data = numpy.vstack((train, test))

ds = numpy.hstack((train, labels_train))

domains = [numpy.unique(ds[:, i]) for i in range(ds.shape[1])]

spn = SPN.LearnStructure(ds,
                         prior_weight=0.0,
                         featureTypes=["gaussian"] * train.shape[1] +
                         ["discrete"],
                         row_split_method=Splitting.RandomPartitionRows(),
                         col_split_method=Splitting.RDCTest(threshold=0.3,
                                                            OHE=True),
                         domains=domains,
                         families=["gaussian"] * ds.shape[1],
                         min_instances_slice=5000000)

print("learned")

# mark the label column as missing (NaN) so mpe_eval imputes it
ts = numpy.hstack((test, numpy.zeros_like(labels_test) / 0))

ts = ts[0:10, :]

print(ts[0, :])

predicted_labels = spn.root.mpe_eval(ts)
Example #25
    # plt.hist(test[:,0], bins=100, histtype='stepfilled', normed=True, color='r', alpha=0.5, label='Uniform')
    #
    # plt.show()

    # print(domains)
    print(feature_names)
    print(feature_types)
    print(train.shape)

    # alternative row splitters from the original:
    # RandomPartitionConditioningRows(), DBScanOHE(eps=1.0, min_samples=2),
    # and KmeansOHERows(), each with RDC thresholds around 0.75
    spn = SPN.LearnStructure(train,
                             featureNames=feature_names,
                             domains=domains,
                             featureTypes=feature_types,
                             row_split_method=Splitting.Gower(),
                             col_split_method=Splitting.RDCTest(threshold=0.05),
                             min_instances_slice=20,
                             cluster_first=True)

    print(spn)

    result.append([dsname,
                   numpy.mean(spn.root.eval(train)),
                   numpy.mean(spn.root.eval(valid)),
                   numpy.mean(spn.root.eval(test))])
    print("train", numpy.mean(spn.root.eval(train)))
    print("valid", numpy.mean(spn.root.eval(valid)))
    print("test", numpy.mean(spn.root.eval(test)))

    print("train", numpy.min(spn.root.eval(train)))
    print("valid", numpy.min(spn.root.eval(valid)))
    print("test", numpy.min(spn.root.eval(test)))

print(result)