def test1(data, features):
    """Cross-validate SPN structure learning on archetype mixtures of ``data``.

    Columns 1..19 of ``data`` are kept, ``getArchetypes`` is asked for 3
    archetypes, and the returned mixture matrix is split into 10 folds with
    ``kfolded``.  For every fold an SPN is learned on the training part, the
    mean of ``spn.root.eval`` on the held-out part is printed, and both that
    value and the learning time are recorded in a ``Stats`` object that is
    written to ``stats_<dsname>.json``.

    ``dsname`` is read from the enclosing scope.
    """
    data = data[:, 1:20]
    features = features[0:data.shape[1]]

    arcs, mixt = getArchetypes(data, 3)

    fold_count = 10
    stats = Stats(name=dsname)
    stats_path = "stats_" + dsname + ".json"

    for fold_train, fold_test, _fold in kfolded(mixt, fold_count):
        timer = Chrono().start()
        continuous_types = ["continuous"] * fold_train.shape[1]
        spn = SPN.LearnStructure(
            fold_train,
            featureTypes=continuous_types,
            row_split_method=Splitting.KmeansRows(),
            col_split_method=Splitting.RDCTest(threshold=0.3),
            min_instances_slice=100,
        )
        timer.end()

        spn.root.validate()

        mean_ll = numpy.mean(spn.root.eval(fold_test))
        print(mean_ll)

        stats.add("HSPN", Stats.LOG_LIKELIHOOD, mean_ll)
        stats.add("HSPN", Stats.TIME, timer.elapsed())
        stats.save(stats_path)

    print(arcs)
def test2():
    """Learn an SPN on the diabetes data set and print its summed evaluation.

    The data set is loaded with ``getDiabetes``; column 1 is declared
    categorical and the other eight columns continuous.  The learned SPN is
    printed, then evaluated on the same data it was learned from, and the sum
    of the returned values is printed as "Sum LL".
    """
    numpy.random.seed(42)
    dsname, data, labels, classes, families = getDiabetes()
    labels = list(labels)
    print(data.shape)
    print(data)

    # An earlier typing (column 0 'discrete', column 1 'continuous') used to be
    # assigned here and was overwritten before use; only the effective typing
    # is kept.
    featureTypes = ['continuous', 'categorical'] + ['continuous'] * 7

    spn = SPN.LearnStructure(
        data,
        featureTypes=featureTypes,
        row_split_method=Splitting.KmeansRows(),
        col_split_method=Splitting.RDCTest(threshold=0.3),
        min_instances_slice=50,
        cluster_first=False,
    )
    print(spn)

    ll = spn.root.eval(data)
    print("Sum LL", numpy.sum(ll))
def learn():
    """Learn an SPN on ``data`` from the enclosing scope, all columns discrete.

    Rows are split with ``Splitting.Gower()`` and columns with a linear
    ``Splitting.RDCTest`` at threshold 0.3; ``min_instances_slice`` is 200.

    Returns:
        The object returned by ``SPN.LearnStructure``.
    """
    discrete_types = ["discrete"] * data.shape[1]
    return SPN.LearnStructure(
        data,
        featureTypes=discrete_types,
        row_split_method=Splitting.Gower(),
        col_split_method=Splitting.RDCTest(threshold=0.3, linear=True),
        min_instances_slice=200,
    )
def learn():
    """Learn an SPN on ``traintopics`` using settings from the enclosing scope.

    ``traintopics``, ``featureTypes``, ``featureNames`` and ``domains`` are
    all read from the enclosing scope.  Rows are split with
    ``Splitting.Gower()`` and columns with a linear ``Splitting.RDCTest`` at
    threshold 0.1; ``min_instances_slice`` is 100.

    Returns:
        The object returned by ``SPN.LearnStructure``.
    """
    options = {
        "featureTypes": featureTypes,
        "row_split_method": Splitting.Gower(),
        "col_split_method": Splitting.RDCTest(threshold=0.1, linear=True),
        "featureNames": featureNames,
        "domains": domains,
        "min_instances_slice": 100,
    }
    return SPN.LearnStructure(traintopics, **options)
def learn(data):
    """Learn an SPN on ``data`` with Gower row splits and ``alpha=1``.

    ``featureTypes`` and ``domains`` are read from the enclosing scope.
    Columns are split with ``Splitting.RDCTest`` at threshold 0.3 and
    ``min_instances_slice`` is 50.

    Args:
        data: data set handed to ``SPN.LearnStructure`` unchanged.

    Returns:
        The object returned by ``SPN.LearnStructure``.
    """
    return SPN.LearnStructure(
        data,
        featureTypes=featureTypes,
        row_split_method=Splitting.Gower(),
        col_split_method=Splitting.RDCTest(threshold=0.3),
        domains=domains,
        alpha=1,
        min_instances_slice=50,
    )
def learn(data):
    """Learn an SPN on ``data`` with a histogram family for every column.

    ``featureTypes``, ``domains`` and ``min_instances_slice`` are read from
    the enclosing scope.  Rows are split with ``Splitting.KmeansRDCRows()``,
    columns with ``Splitting.RDCTest`` at threshold 0.3, and ``alpha`` is 0.1.

    Args:
        data: data set handed to ``SPN.LearnStructure``; ``data.shape[1]``
            determines how many ``'histogram'`` entries are passed.

    Returns:
        The object returned by ``SPN.LearnStructure``.
    """
    options = {
        "featureTypes": featureTypes,
        "row_split_method": Splitting.KmeansRDCRows(),
        "col_split_method": Splitting.RDCTest(threshold=0.3),
        "domains": domains,
        "alpha": 0.1,
        "families": ['histogram'] * data.shape[1],
        "min_instances_slice": min_instances_slice,
    }
    return SPN.LearnStructure(data, **options)
def learn(data, featureTypes, families, domains, min_instances_slice,
          alpha=0.1):
    """Learn an SPN with RDC-based row and column splitting.

    All arguments are forwarded to ``SPN.LearnStructure`` under the same
    names.  Rows are split with ``Splitting.KmeansRDCRows()`` and columns
    with ``Splitting.RDCTest`` at threshold 0.3.

    Returns:
        The object returned by ``SPN.LearnStructure``.
    """
    return SPN.LearnStructure(
        data,
        alpha=alpha,
        featureTypes=featureTypes,
        row_split_method=Splitting.KmeansRDCRows(),
        col_split_method=Splitting.RDCTest(threshold=0.3),
        domains=domains,
        families=families,
        min_instances_slice=min_instances_slice,
    )
def estimate_density(self, training_data, validation_data=None):
    """Fit a MSPN on the training data and store it in ``self.spn``.

    Args:
        training_data: object exposing ``features`` (an iterable of
            ``(feature, type_string)`` pairs, where each feature has a
            ``symbol_name()`` method) and ``data`` (converted with
            ``asarray`` before learning).
        validation_data: never used.

    Options read from ``self.learner_args``:
        ``'seed'`` (required) is passed as ``rand_seed``.
        ``'row_split'`` (optional) is ``'gower'`` or ``'rdc-kmeans'``; when
        absent the ``'rdc-kmeans'`` splitter is used.
        ``'leaf'`` is accepted but currently has no effect, because
        ``families=None`` is always passed so that the MSPN code chooses the
        leaf families itself.
        Every other entry is forwarded verbatim to ``SPN.LearnStructure``.

    Raises:
        KeyError: if ``'seed'`` is missing from ``self.learner_args``.
        NotImplementedError: if ``'row_split'`` names an unsupported method.
    """
    feature_types = []
    feature_names = []
    for feat, str_type in training_data.features:
        feature_types.append(str_type)
        feature_names.append(feat.symbol_name())

    row_split = self.learner_args.get('row_split', 'rdc-kmeans')
    if row_split == 'gower':
        row_split_method = Splitting.Gower(n_clusters=2)
    elif row_split == 'rdc-kmeans':
        row_split_method = Splitting.KmeansRDCRows(n_clusters=2, k=20, OHE=1)
    else:
        raise NotImplementedError(
            "unsupported row_split method: {!r}".format(row_split))

    col_split_method = Splitting.RDCTest(threshold=0.1, OHE=1, linear=1)

    rand_seed = self.learner_args['seed']
    # Options consumed above must not be forwarded a second time.
    mspnargs = {
        k: v
        for k, v in self.learner_args.items()
        if k not in ['seed', 'leaf', 'row_split']
    }

    # Let MSPNs sort out the leaf families: a per-feature list used to be
    # built here and then discarded, so None is passed directly instead.
    self.spn = SPN.LearnStructure(asarray(training_data.data),
                                  feature_types,
                                  families=None,
                                  featureNames=feature_names,
                                  rand_seed=rand_seed,
                                  row_split_method=row_split_method,
                                  col_split_method=col_split_method,
                                  **mspnargs)
def test2(data, features):
    """Print the archetype mixture matrix of ``data`` and then stop.

    ``getArchetypes`` is asked for 3 archetypes and the mixture matrix is
    printed.  The ``0 / 0`` that follows raises ``ZeroDivisionError``, so the
    structure-learning call below it is never reached.
    """
    _archetypes, mixt = getArchetypes(data, 3)
    print(mixt)

    # Deliberate hard stop: everything below is currently unreachable.
    0 / 0

    continuous_types = ["continuous"] * mixt.shape[1]
    spn = SPN.LearnStructure(
        mixt,
        featureTypes=continuous_types,
        row_split_method=Splitting.KmeansRows(),
        col_split_method=Splitting.RDCTest(threshold=0.3),
        min_instances_slice=100,
    )
def learn(data, featureTypes, families, domains, feature_names,
          min_instances_slice, prior_weight=0.0):
    """Learn an SPN with RDC-based row and column splitting.

    ``feature_names`` is forwarded as ``featureNames``; every other argument
    is forwarded to ``SPN.LearnStructure`` under its own name.  Rows are
    split with ``Splitting.KmeansRDCRows()`` and columns with
    ``Splitting.RDCTest`` at threshold 0.3.

    Returns:
        The object returned by ``SPN.LearnStructure``.
    """
    spn = SPN.LearnStructure(
        data,
        prior_weight=prior_weight,
        featureTypes=featureTypes,
        row_split_method=Splitting.KmeansRDCRows(),
        col_split_method=Splitting.RDCTest(threshold=0.3),
        domains=domains,
        families=families,
        featureNames=feature_names,
        min_instances_slice=min_instances_slice,
    )
    return spn
def test8():
    """Learn an SPN on MNIST pixels plus label and report MAP accuracy.

    The training images get the label appended as a last, categorical
    column, and an SPN is learned on that matrix.  For the test images the
    last column is overwritten with each candidate class in turn, the SPN is
    evaluated, and the class with the highest value per row is taken as the
    prediction.  The accuracy against the true test labels is printed.
    """
    from tensorflow.examples.tutorials.mnist import input_data

    mnist = input_data.read_data_sets("MNIST_data/", one_hot=False)

    data, target = mnist.train.images, mnist.train.labels
    pixel_count = data.shape[1]
    featureTypes = ["continuous"] * pixel_count + ["categorical"]
    featureNames = ["pixel"] * pixel_count + ["label"]

    label_column = target.reshape(data.shape[0], 1)
    data = numpy.hstack((data, label_column))

    print(featureTypes)
    print(data.shape)

    spn = SPN.LearnStructure(
        data,
        featureTypes=featureTypes,
        featureNames=featureNames,
        row_split_method=Splitting.KmeansRows(),
        col_split_method=Splitting.RDCTest(threshold=0.4),
        min_instances_slice=500,
        cluster_first=True,
    )
    print("learned")

    spn.root.validate()

    data, target = mnist.test.images, mnist.test.labels
    label_column = target.reshape(data.shape[0], 1)
    data = numpy.hstack((data, label_column))

    classes = numpy.unique(target)
    results = numpy.zeros((data.shape[0], len(classes)))

    print("testing")
    for c in classes:
        # Score every test row under the hypothesis "label == c".
        data[:, -1] = c
        results[:, c] = spn.root.eval(data)
    print("done")

    predictions = numpy.argmax(results, axis=1)
    print('MAP accuracy : ', accuracy_score(target, predictions))
# Learn an SPN on `train` and report the mean and minimum of its evaluation
# on the train / valid / test splits.  `train`, `valid`, `test`, `domains`,
# `feature_names`, `feature_types`, `dsname` and `result` come from the
# surrounding code.
print(domains)
print(feature_names)
print(feature_types)
print(train.shape)

spn = SPN.LearnStructure(train,
                         featureNames=feature_names,
                         domains=domains,
                         featureTypes=feature_types,
                         row_split_method=Splitting.Gower(),
                         col_split_method=Splitting.RDCTest(threshold=0.05),
                         min_instances_slice=20,
                         cluster_first=True)

print(spn)

# Evaluate every split exactly once; the values are reused for the result
# row and for both rounds of printing instead of re-running the SPN each time.
split_evals = [
    ("train", spn.root.eval(train)),
    ("valid", spn.root.eval(valid)),
    ("test", spn.root.eval(test)),
]

result.append([dsname] +
              [numpy.mean(split_ll) for _, split_ll in split_evals])

for split_name, split_ll in split_evals:
    print(split_name, numpy.mean(split_ll))

for split_name, split_ll in split_evals:
    print(split_name, numpy.min(split_ll))

print(result)
# Marginalize `pspn` down to features 0-3 and print the result both as an
# equation and in its default printed form.
marg = pspn.marginalize([0, 1, 2, 3])
print(marg.toEquation())
print(marg)

# Deliberate hard stop (raises ZeroDivisionError); nothing below this line
# runs while it is in place.
0 / 0

# Learn a second model (`mspn`) on `train`, all features discrete with
# isotonic families, to compare against `pspn` on `test`.
mspn = learn(train, featureTypes=["discrete"] * data.shape[1], families=["isotonic"] * data.shape[1], domains=domains, feature_names=words, min_instances_slice=200, row_split_method=Splitting.KmeansRDCRows(), col_split_method=Splitting.RDCTest(threshold=0.1, OHE=False))

#print(pspn)
# print(mspn)

# Sum and mean of each model's evaluation on the test split.
print("sum LL pspn", numpy.sum(pspn.root.eval(test)))
print("sum LL mspn", numpy.sum(mspn.root.eval(test)))

print("mean LL pspn", numpy.mean(pspn.root.eval(test)))
print("mean LL mspn", numpy.mean(mspn.root.eval(test)))

# Second deliberate hard stop.
0 / 0

# NOTE(review): only the unpacking of the (spn, i, j) triple is visible here;
# the rest of this function appears to be cut off - confirm against the
# full file.
def getmiforfeature(input):
    spn, i, j = input
    # return i+j
def test3():
    """Compare empirical and SPN estimates of P(State-gov | Doctorate).

    The Adult data set is loaded with ``getAdult``.  Column 1 is compared
    against the index of "State-gov" in ``doms[1]`` and column 2 against the
    index of "Doctorate" in ``doms[2]``.  The empirical P(D), P(S, D), P(S)
    and P(S | D) are printed, an SPN is learned on the full data, and the
    same quantities are obtained from it by marginalizing out all other
    columns, pruning, and evaluating a single query row.  Finally the sum of
    the SPN evaluation over the whole data set is printed.
    """
    numpy.random.seed(42)
    dsname, data, featureNames, featureTypes, doms = getAdult()

    doctorateVal = numpy.where(doms[2] == "Doctorate")[0][0]
    stategovVal = numpy.where(doms[1] == "State-gov")[0][0]

    print(featureNames)
    print(len(featureNames))
    print(data[0, :])
    print(data.shape)
    print(doctorateVal, stategovVal)

    # Empirical probabilities from the raw counts.
    n_rows = data.shape[0]
    is_doctorate = data[:, 2] == doctorateVal
    is_stategov = data[:, 1] == stategovVal
    pD = numpy.sum(is_doctorate) / n_rows
    pSD = numpy.sum(numpy.logical_and(is_doctorate, is_stategov)) / n_rows
    pS = numpy.sum(is_stategov) / n_rows

    print("pD", pD)
    print("pSD", pSD)
    pS_D = pSD / pD
    print("pS_D", pS_D)

    spn = SPN.LearnStructure(
        data,
        featureTypes=featureTypes,
        featureNames=featureNames,
        row_split_method=Splitting.KmeansRows(),
        col_split_method=Splitting.RDCTest(threshold=0.3),
        min_instances_slice=3,
        cluster_first=True,
    )
    spn.root.validate()
    print("SPN Learned")

    def marginal_probability(dropped_columns, evidence):
        """Marginalize out ``dropped_columns`` and evaluate one query row.

        ``evidence`` maps column index to the value placed in an otherwise
        all-zero row.  The pruned marginal SPN and the query row are
        printed, and ``exp`` of the evaluation is returned.
        """
        marginal = spn.root.marginalizeOut(dropped_columns)
        marginal.Prune()
        print(marginal)

        query = numpy.zeros_like(data[0, :]).reshape(1, data.shape[1])
        for column, value in evidence.items():
            query[0, column] = value
        print(query)

        return numpy.exp(marginal.eval(query))

    # Columns 3..13 are dropped in both queries.  The index lists are built
    # from a range so that no column is listed twice (the hand-written lists
    # contained 9 twice).
    trailing_columns = list(range(3, 14))

    spnSD = marginal_probability([0] + trailing_columns,
                                 {1: stategovVal, 2: doctorateVal})
    spnD = marginal_probability([0, 1] + trailing_columns,
                                {2: doctorateVal})

    print("pD", pD)
    print("pS", pS)
    print("pSD", pSD)
    print("pS_D", pS_D)

    print("spn pD", spnD)
    print("spn pSD", spnSD)
    spnS_D = spnSD / spnD
    print("spn pS_D", spnS_D)

    print("doctorateVal", doctorateVal)
    print("stategovVal", stategovVal)

    ll = spn.root.eval(data)
    print("Sum LL", numpy.sum(ll))
def test7():
    """Learn an SPN on two columns of ``bank.csv`` and plot its leaves.

    The file is read as byte strings, the first row is taken as header, and
    only columns 0 and 5 are kept.  Each kept column is typed by trying to
    parse its unique values: all-integer values give "discrete", other
    numeric values give "continuous", and anything that cannot be parsed as
    a number gives "categorical" (encoded by its index among the unique
    values).  Column 1 of the resulting matrix is then replaced by
    ``sign(x) * log(|x| + 1)``.  The learned SPN is validated, printed and
    saved to ``bank.pdf``.  For each of the two columns, one figure is shown
    with ``exp`` of the evaluation of child ``i`` of every child of the root
    over the sorted column values.  Finally the summed evaluation over the
    data is printed.
    """
    numpy.random.seed(42)

    D = numpy.loadtxt("bank.csv", delimiter=";", skiprows=0, dtype="S")
    D = numpy.char.strip(D)
    featureNames = [str(f) for f in D[0, :]]
    D = D[1:, :]

    # Hand-written typing for all 17 columns; only its length is printed,
    # the types used for learning are inferred below.
    featureTypes = [
        "discrete", "categorical", "categorical", "categorical",
        "continuous", "continuous", "categorical", "categorical",
        "categorical", "discrete", "categorical", "discrete",
        "categorical", "continuous", "discrete", "categorical",
        "categorical",
    ]

    print(len(featureTypes))
    print(len(featureNames))

    def isinteger(x):
        return numpy.all(numpy.equal(numpy.mod(x, 1), 0))

    cols = []
    types = []
    domains = []

    index = [0, 5]
    D = D[:, index]

    for col in range(D.shape[1]):
        b, c = numpy.unique(D[:, col], return_inverse=True)

        # Do every conversion before appending anything, so a failure part
        # way through cannot leave cols/types/domains with different lengths.
        try:
            if isinteger(b.astype(float)):
                column = D[:, col].astype(int)
                kind = "discrete"
                domain = b.astype(int)
            else:
                column = D[:, col].astype(float)
                kind = "continuous"
                domain = b.astype(float)
        except (ValueError, OverflowError):
            # Not parseable as a number: use the index into the unique values.
            column = c
            kind = "categorical"
            domain = b.astype(str)

        cols.append(column)
        types.append(kind)
        domains.append(domain)

    data = numpy.column_stack(cols)

    print(featureNames)
    print(domains)
    featureNames = [featureNames[i] for i in index]
    print(featureNames)
    print(types)

    # Signed log transform of the second kept column.
    # NOTE(review): if every kept column was parsed as integer, `data` has an
    # integer dtype and this assignment truncates the result - confirm that
    # is intended.
    data[:, 1] = numpy.sign(data[:, 1]) * numpy.log(numpy.abs(data[:, 1]) + 1)

    spn = SPN.LearnStructure(
        data,
        featureTypes=types,
        featureNames=featureNames,
        row_split_method=Splitting.KmeansRows(),
        col_split_method=Splitting.RDCTest(threshold=0.000001),
        min_instances_slice=1000,
        cluster_first=False,
    )

    spn.root.validate()

    print(spn)
    spn.save_pdf_graph("bank.pdf")

    ll = spn.root.eval(data)

    import matplotlib.pyplot as plt

    for i in [0, 1]:
        x = numpy.sort(data[:, i]).reshape(data.shape[0], 1)

        plt.figure(figsize=(8, 8))

        # Query matrix: sorted values in column i, zeros elsewhere.
        x1 = numpy.zeros_like(data)
        x1[:, i] = x[:, 0]

        children = spn.root.children
        color_idx = numpy.linspace(0, 1, len(children))
        for shade, child in zip(color_idx, children):
            y = numpy.exp(child.children[i].eval(x1))
            plt.plot(x, y, '--', color=plt.cm.cool(shade))

        plt.show()

    print("Sum LL", numpy.sum(ll))
def learn_spn(dataset="data/iris", precision=25, independence=0.1, header=0,
              date=None, isotonic=False, histogram=True, types=False):
    """Learn an SPN from a CSV file and describe its columns.

    Args:
        dataset: path of the comma-separated file; also stored as
            ``spn.name``.
        precision: passed to ``SPN.LearnStructure`` as
            ``min_instances_slice``.
        independence: threshold of the ``Splitting.RDCTest`` column split.
        header: forwarded to ``pandas.read_csv``.  When it is not 0 the
            features are named ``X_0``, ``X_1``, ...
        date: forwarded to ``pandas.read_csv`` as ``parse_dates``.
        isotonic: use the ``'isotonic'`` family instead of ``'piecewise'``.
        histogram: use the ``'histogram'`` family for categorical columns
            (takes precedence over ``isotonic`` for those columns).
        types: when true, the second row of the file holds one of ``cat``,
            ``con`` or ``dis`` per column and is used to type the features;
            otherwise the types are derived from the pandas dtypes.

    Returns:
        ``(spn, data_dictionary)``.  ``data_dictionary`` has the keys
        ``'features'`` (one dict per column with ``name``, ``family``,
        ``type``, ``pandas_type`` and, for categorical columns, ``encoder``
        and ``values``) and ``'num_entries'`` (row count after dropping rows
        with missing values).
    """
    numeric_family = 'isotonic' if isotonic else 'piecewise'
    categorical_family = 'histogram' if histogram else numeric_family

    def family_for(feature_type):
        """Return the leaf family used for a feature of the given type."""
        if feature_type == 'categorical':
            return categorical_family
        return numeric_family

    def type_from_dtype(dtype):
        """Map a pandas/numpy dtype to a feature type via its kind code."""
        # `kind` is a one-character string, so compare it with characters.
        if dtype.kind == 'O':
            return 'categorical'
        if dtype.kind == 'f':
            return 'continuous'
        if dtype.kind == 'i':
            return 'discrete'
        return 'unknown'

    declared_types = {
        'cat': 'categorical',
        'con': 'continuous',
        'dis': 'discrete',
    }

    # With a types row present, keep pandas from reading it as data.
    skiprows = [1] if types else []
    df = pd.read_csv(dataset, delimiter=",", header=header, parse_dates=date,
                     skiprows=skiprows)
    df = df.dropna(axis=0, how='any')

    if header == 0:
        featureNames = df.columns.values.tolist()
    else:
        featureNames = ["X_{}".format(i) for i in range(len(df.columns))]

    dtypes = df.dtypes

    if types:
        featureTypes = []
        with open(dataset, 'r', newline='') as csvfile:
            csvreader = csv.reader(csvfile, delimiter=',', quotechar='|')
            next(csvreader)  # skip the header row
            for declared in next(csvreader):
                print(declared)
                featureTypes.append(declared_types.get(declared, 'unknown'))
    else:
        featureTypes = [type_from_dtype(dtype) for dtype in dtypes]

    families = [family_for(feature_type) for feature_type in featureTypes]

    data_dictionary = {
        'features': [{
            "name": name,
            "family": family,
            "type": typ,
            # Positional access: the dtypes Series is indexed by column label.
            'pandas_type': dtypes.iloc[i],
        } for i, (name, family, typ) in enumerate(
            zip(featureNames, families, featureTypes))],
        'num_entries': len(df),
    }

    for position, column in enumerate(df.columns):
        if featureTypes[position] == 'categorical':
            # Replace the labels by integer codes and keep the encoder so
            # callers can map the codes back.
            lb = LabelEncoder()
            entry = data_dictionary['features'][position]
            entry["encoder"] = lb
            df[column] = df[column].astype('category')
            df[column] = lb.fit_transform(df[column])
            entry["values"] = lb.transform(lb.classes_)
        if dtypes.iloc[position].kind == 'M':
            # Datetime column: express as days since the earliest value.
            df[column] = (df[column] - df[column].min()) / np.timedelta64(
                1, 'D')

    data = np.array(df)

    spn = SPN.LearnStructure(
        data,
        featureTypes=featureTypes,
        featureNames=featureNames,
        min_instances_slice=precision,
        families=families,
        row_split_method=Splitting.KmeansRDCRows(),
        col_split_method=Splitting.RDCTest(threshold=independence))

    spn.name = dataset
    return spn, data_dictionary
from tfspn.SPN import SPN, Splitting

# Learn an SPN on grayscale CIFAR-10 pixels plus label, then ask it for the
# most probable completion of the first ten test rows with the label missing.
dsname, train, test, labels_train, labels_test = getCIFAR10(grayscale=True)

data = numpy.vstack((train, test))

# Training matrix: pixel columns followed by the label column.
ds = numpy.hstack((train, labels_train))

domains = [numpy.unique(ds[:, i]) for i in range(ds.shape[1])]

spn = SPN.LearnStructure(
    ds,
    prior_weight=0.0,
    featureTypes=["gaussian"] * train.shape[1] + ["discrete"],
    row_split_method=Splitting.RandomPartitionRows(),
    col_split_method=Splitting.RDCTest(threshold=0.3, OHE=True),
    domains=domains,
    families=["gaussian"] * ds.shape[1],
    min_instances_slice=5000000)

print("learned")

# Test matrix with NaN in place of every label.  numpy.hstack takes one
# sequence of arrays, and the NaNs are created explicitly rather than by
# dividing zeros by zero.
missing_labels = numpy.full(numpy.shape(labels_test), numpy.nan)
ts = numpy.hstack((test, missing_labels))

ts = ts[0:10, :]
print(ts[0, :])

predicted_labels = spn.root.mpe_eval(ts)

print(predicted_labels[0, :])