def test1(data, features):
    # dsname is assumed to be a module-level global here.
    data = data[:, 1:20]
    features = features[0:data.shape[1]]
    arcs, mixt = getArchetypes(data, 3)

    nrfolds = 10
    stats = Stats(name=dsname)
    for train, test, i in kfolded(mixt, nrfolds):
        c = Chrono().start()
        spn = SPN.LearnStructure(train, featureTypes=["continuous"] * train.shape[1],
                                 row_split_method=Splitting.KmeansRows(),
                                 col_split_method=Splitting.RDCTest(threshold=0.3),
                                 # spn = SPN.LearnStructure(data, featureNames=["X1"], domains=domains,
                                 #                          families=families,
                                 #                          row_split_method=Splitting.KmeansRows(),
                                 #                          col_split_method=Splitting.RDCTest(),
                                 min_instances_slice=100)
        c.end()

        spn.root.validate()
        ll = numpy.mean(spn.root.eval(test))
        print(ll)

        stats.add("HSPN", Stats.LOG_LIKELIHOOD, ll)
        stats.add("HSPN", Stats.TIME, c.elapsed())
        # Save after every fold so partial results survive interruptions.
        stats.save("stats_" + dsname + ".json")
    print(arcs)
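# kfolded above is project-specific. A minimal sketch of the iteration
# protocol it is assumed to follow -- yielding (train, test, fold_index)
# triples -- purely for illustration; the real helper may shuffle or stratify:
def kfolded_sketch(data, nrfolds):
    idx = numpy.arange(data.shape[0])
    for i, testidx in enumerate(numpy.array_split(idx, nrfolds)):
        trainidx = numpy.setdiff1d(idx, testidx)
        yield data[trainidx, :], data[testidx, :], i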
def pdnperplexity(train, test, features, depth, iters):
    from pdn.GBMPDN import GBMPDN

    c = Chrono().start()
    pdn = GBMPDN(train, features, max_depth=depth, iterations=iters)
    c.end()

    pwb, perplexity, words, ll = pdn.perplexity(test)
    print("PDN %s,%s=%s %.3f per-word bound, %.3f perplexity estimate based on a "
          "held-out corpus of %i documents with %i words"
          % (iters, depth, ll, pwb, perplexity, test.shape[0], words))
    return perplexity, ll, c.elapsed()
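# Convention assumed for the perplexity numbers above (it matches the gensim
# usage in ldaperplexity below): the per-word bound pwb is an average log2
# likelihood per word, so perplexity = 2 ** (-pwb); e.g. a per-word bound of
# -8.0 corresponds to a perplexity of numpy.exp2(8.0) = 256.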
def pspnperplexity(train, test, min_slices, ind_test_method, row_cluster_method):
    c1 = Chrono().start()
    spn = LearnSPN(alpha=0.001, min_slices=min_slices, cluster_prep_method="sqrt",
                   ind_test_method=ind_test_method,
                   row_cluster_method=row_cluster_method).fit_structure(train)
    c1.end()
    time = c1.elapsed()

    pwb, perplexity, words, logl = spn.perplexity(test)
    print("SPN ll=%s %.3f per-word bound, %.1f perplexity estimate based on a "
          "held-out corpus of %i documents with %i words"
          % (logl, pwb, perplexity, test.shape[0], words))
    return perplexity, logl, time, spn.size()
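# Hypothetical side-by-side run of the two scorers above; the loader and the
# ind_test_method/row_cluster_method values are assumptions for illustration:
# train, test, features = loadDataset(dsname)  # project-specific, assumed
# pdnperplexity(train, test, features, depth=10, iters=100)
# pspnperplexity(train, test, min_slices=50,
#                ind_test_method="subsample", row_cluster_method="KMeans")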
def ldaperplexity(train, test, topics):
    corpus = gensim.matutils.Dense2Corpus(train.astype(int), documents_columns=False)
    corpusTest = gensim.matutils.Dense2Corpus(test.astype(int), documents_columns=False)
    dictionary = Dictionary.from_corpus(corpus)

    # Suppress library warnings while training the LDA model.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        c = Chrono().start()
        lda = runLda(corpus, dictionary, topics=topics)
        c.end()

    corpus_words = sum(cnt for document in corpusTest for _, cnt in document)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        perwordbound = lda.log_perplexity(corpusTest)
    print("LDA %.3f per-word bound, %.1f perplexity estimate based on a held-out "
          "corpus of %i documents with %i words"
          % (perwordbound, numpy.exp2(-perwordbound), len(corpusTest), corpus_words))
    return numpy.exp2(-perwordbound), c.elapsed()
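# runLda is project-specific; this sketch shows what it presumably wraps,
# using gensim's standard LdaModel API (the _sketch name is ours):
def runLda_sketch(corpus, dictionary, topics):
    from gensim.models.ldamodel import LdaModel
    return LdaModel(corpus=corpus, id2word=dictionary, num_topics=topics)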
def generateInput(dsname, data, family):
    # Drop rows with zero variance.
    data = data[numpy.var(data, 1) > 0, :]

    fullcmd = []
    cscmd = []
    rndcmd = []
    # Note: recent scikit-learn requires shuffle=True for random_state to take
    # effect in KFold, so it is set explicitly here.
    for fold, (train_index, test_index) in enumerate(
            KFold(n_splits=10, shuffle=True, random_state=1).split(data)):
        train, test = data[train_index, :], data[test_index, :]
        prepareExp(mkfolders("data/" + dsname + "/full/"), fold, train, test,
                   None, family, 0.0, fullcmd, train)

        for pct in [5, 10, 20, 30, 40]:
            # Coreset subsample of pct% of the training rows.
            cschrono = Chrono().start()
            sampleIndexes, csweight = coreset(train, pct)
            cstrain = train[sampleIndexes, :]
            cschrono.end()
            prepareExp(mkfolders("data/" + dsname + "/cs/" + str(pct) + "/"), fold,
                       cstrain, test, csweight, family, cschrono.elapsed(), cscmd, train)

            # Random subsample of the same size for comparison.
            rndchrono = Chrono().start()
            sampleIndexes, _ = getSamples(train, None, pct)
            rtrain = train[sampleIndexes, :]
            rndchrono.end()
            prepareExp(mkfolders("data/" + dsname + "/rnd/" + str(pct) + "/"), fold,
                       rtrain, test, None, family, rndchrono.elapsed(), rndcmd, train)

    allcmds = []
    allcmds.extend(fullcmd)
    allcmds.extend(cscmd)
    allcmds.extend(rndcmd)
    with open("cmds_" + dsname + ".txt", "w") as text_file:
        text_file.write("\n".join(allcmds))
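# getSamples is project-specific; a uniform random subsampler with the same
# (indexes, weights) return shape is assumed to look roughly like this:
def getSamples_sketch(data, weights, pct, seed=1):
    rng = numpy.random.RandomState(seed)
    n = max(1, int(data.shape[0] * pct / 100.0))
    return rng.choice(data.shape[0], size=n, replace=False), weights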
gn1 = GaussianNode("gn1", 0, "X0", 1.0, 1.0)
pn1 = PoissonNode("pn1", 1, "X1", 1.0)
bn1 = BernoulliNode("bn1", 2, "X2", 0.0)
p1 = ProductNode("p1", gn1, pn1, bn1)

gn2 = GaussianNode("gn2", 0, "X0", 10.0, 1.0)
pn2 = PoissonNode("pn2", 1, "X1", 10.0)
bn2 = BernoulliNode("bn2", 2, "X2", 1.0)
p2 = ProductNode("p2", gn2, pn2, bn2)

s1 = SumNode("s1", [0.5, 0.5], p1, p2)
spn = SPN()
spn.root = s1

c = Chrono().start()
with tf.device("/cpu:0"):
    tf.reset_default_graph()
    with tf.name_scope('input'):
        X = tf.placeholder(tf.float64, [None, 3], name="x")
    with tf.name_scope('SPN') as scope:
        spn.root.initTf(X)
    costf = JointCost(spn.root)
    train_op = tf.train.AdamOptimizer().minimize(costf)
print(c.end().elapsed())
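# Hypothetical training loop for the graph built above, using the standard
# TF1 Session API; the synthetic batch is an assumption shaped to match the
# node types (Gaussian X0, Poisson X1, Bernoulli X2):
rng = numpy.random.RandomState(1)
batch = numpy.column_stack([rng.normal(5.0, 2.0, 64),
                            rng.poisson(5.0, 64),
                            rng.binomial(1, 0.5, 64)]).astype(numpy.float64)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(100):
        sess.run(train_op, feed_dict={X: batch})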