import numpy
import pandas
import scipy.stats

import dirichlet  # Dirichlet MLE package (dirichlet.mle)
from joblib import Memory  # note: newer joblib renamed `cachedir` to `location`
from matplotlib import pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import LatentDirichletAllocation as SKLModel
from sklearn.model_selection import train_test_split

# Project-internal helpers are assumed to come from the surrounding package:
# SPN, Splitting, ProductNode, Stats, kfolded, getNips, getArchetypes,
# estimate_continuous_domain, cartesian, draw_pdf_contours_func2,
# print_top_words, and the module-level joblib `memory` cache.


class LatentDirichletAllocationImpl():

    def __init__(self, n_components=10, doc_topic_prior=None,
                 topic_word_prior=None, learning_method='batch',
                 learning_decay=0.7, learning_offset=10.0, max_iter=10,
                 batch_size=128, evaluate_every=-1, total_samples=1000000.0,
                 perp_tol=0.1, mean_change_tol=0.001, max_doc_update_iter=100,
                 n_jobs=None, verbose=0, random_state=None, n_topics=None):
        # `n_topics` is the deprecated alias for `n_components` (removed in
        # scikit-learn 0.21); map it over instead of forwarding it, since
        # current sklearn rejects the keyword.
        if n_topics is not None:
            n_components = n_topics
        self._hyperparams = {
            'n_components': n_components,
            'doc_topic_prior': doc_topic_prior,
            'topic_word_prior': topic_word_prior,
            'learning_method': learning_method,
            'learning_decay': learning_decay,
            'learning_offset': learning_offset,
            'max_iter': max_iter,
            'batch_size': batch_size,
            'evaluate_every': evaluate_every,
            'total_samples': total_samples,
            'perp_tol': perp_tol,
            'mean_change_tol': mean_change_tol,
            'max_doc_update_iter': max_doc_update_iter,
            'n_jobs': n_jobs,
            'verbose': verbose,
            'random_state': random_state}

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if y is not None:
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def transform(self, X):
        return self._sklearn_model.transform(X)
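# Usage sketch for the wrapper (illustrative only; the toy documents and the
# CountVectorizer pipeline below are not part of the experiment code):
def _demo_lda_wrapper():
    from sklearn.feature_extraction.text import CountVectorizer
    docs = ["the cat sat on the mat",
            "dogs and cats and dogs",
            "stocks fell on trade news"]
    X = CountVectorizer().fit_transform(docs)  # (n_docs, n_words) counts
    model = LatentDirichletAllocationImpl(n_components=2, random_state=0).fit(X)
    print(model.transform(X))  # per-document topic mixtures; rows sum to ~1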
def computeNIPS(dimensions):
    dsname, data, features = getNips()

    lda = LatentDirichletAllocation(n_components=dimensions, max_iter=50,
                                    learning_method='online',
                                    learning_offset=50., random_state=0)
    lda.fit(data)
    mixt = lda.transform(data)

    def normalize(data):
        # Nudge exact 0/1 coordinates off the simplex boundary (where the
        # Dirichlet log-pdf is undefined), then renormalize the rows.
        mixtd = data
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        mixtnorm = numpy.sum(mixtd, axis=1)
        mixtd = numpy.divide(mixtd, mixtnorm[:, None])
        return mixtd + 0.0

    mixt = normalize(mixt)
    print(data.shape)

    featureTypes = ["continuous"] * mixt.shape[1]
    domains = [[0, 1]] * mixt.shape[1]
    print(domains)

    @memory.cache
    def learn(data):
        # spn = SPN.LearnStructure(data, featureNames=["X1"], domains=domains,
        #                          families=families,
        #                          row_split_method=Splitting.KmeansRows(),
        #                          col_split_method=Splitting.RDCTest())
        spn = SPN.LearnStructure(data, featureTypes=featureTypes,
                                 row_split_method=Splitting.Gower(),
                                 col_split_method=Splitting.RDCTest(threshold=0.3),
                                 domains=domains, alpha=0.1,
                                 families=['histogram'] * data.shape[1],
                                 min_instances_slice=100)
        return spn

    stats = Stats(name=dsname)

    for train, test, i in kfolded(mixt, 10):
        print(i)
        # dirichlet_alphas = getDirichlet(train)
        dirichlet_alphas = dirichlet.mle(train, method='meanprecision',
                                         maxiter=1000000)
        print("dirichlet done")
        ll = scipy.stats.dirichlet.logpdf(numpy.transpose(test),
                                          alpha=dirichlet_alphas)
        stats.add("DIRICHLET", Stats.LOG_LIKELIHOOD, numpy.sum(ll))
        stats.add("DIRICHLET", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll))

        spn = learn(train)
        print("spn done")
        ll = spn.root.eval(test)
        print(stats.add("SPN", Stats.LOG_LIKELIHOOD, numpy.sum(ll)))
        print(stats.add("SPN", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll)))

    stats.save("results/nips/" + dsname + "-" + str(dimensions) + ".json")
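# Why normalize() nudges boundary values: scipy rejects simplex points with
# exact zero coordinates when the matching alpha is below one, and the
# log-density diverges on the boundary anyway. A self-contained illustration:
def _demo_boundary_nudge():
    x = numpy.array([[1.0, 0.0, 0.0], [0.5, 0.3, 0.2]])
    x[x == 1] -= 1e-7
    x[x == 0] += 1e-7
    x = x / x.sum(axis=1, keepdims=True)
    # finite log-densities; the un-nudged first row would raise a ValueError
    print(scipy.stats.dirichlet.logpdf(x.T, alpha=[0.9, 0.9, 0.9]))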
def getLDA(data, dimensions):
    lda = LatentDirichletAllocation(n_components=dimensions, max_iter=50,
                                    learning_method='online',
                                    learning_offset=50., random_state=0)
    lda.fit(data)
    return lda.transform(data)
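# Shape check for getLDA (illustrative; random Poisson counts stand in for a
# real document-term matrix):
def _demo_getLDA():
    rng = numpy.random.RandomState(0)
    counts = rng.poisson(1.0, size=(50, 30))
    mixt = getLDA(counts, dimensions=5)
    assert mixt.shape == (50, 5)
    assert numpy.allclose(mixt.sum(axis=1), 1.0)  # rows are topic mixtures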
def test3(data, features):
    f = numpy.array(features)
    lda = LatentDirichletAllocation(n_components=3, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50., random_state=0)
    lda.fit(data)
    topics = lda.transform(data)

    print("LEARNING SPN")
    memory = Memory(cachedir="/tmp/test3", verbose=0, compress=9)

    @memory.cache
    def learn():
        # spn = SPN.LearnStructure(data, featureNames=["X1"], domains=domains,
        #                          families=families,
        #                          row_split_method=Splitting.KmeansRows(),
        #                          col_split_method=Splitting.RDCTest())
        spn = SPN.LearnStructure(data, featureTypes=["discrete"] * data.shape[1],
                                 row_split_method=Splitting.Gower(),
                                 col_split_method=Splitting.RDCTest(threshold=0.3,
                                                                    linear=True),
                                 min_instances_slice=200)
        return spn

    spn = learn()
    spn.root.validate()

    # Collect, per product node, the words whose (piecewise-linear) leaf
    # density peaks at 1, i.e. words that are "on" in that node's slice.
    prodNodes = spn.get_nodes_by_type(ProductNode)
    for pn in prodNodes:
        leaves = pn.get_leaves()
        words = set()
        for leaf in leaves:
            # assuming pwl node:
            _x = numpy.argmax(leaf.y_range)
            max_x = leaf.x_range[_x]
            if max_x < 1.0:
                continue
            words.add(features[leaf.featureIdx])
        if len(words) < 4:
            continue
        print(words)

    # Deliberate hard stop left in by the author (the original `0 / 0`);
    # everything below is exploratory and currently unreachable.
    raise ZeroDivisionError("debug stop")

    prodNodes = spn.get_nodes_by_type(ProductNode)
    pnll = numpy.zeros((data.shape[0], len(prodNodes)))
    for i, pn in enumerate(prodNodes):
        pnll[:, i] = numpy.exp(pn.eval(data))

    # Match each LDA topic to the product node with the highest BIC-style
    # score k*ln(n) - 2*ln(L), with k = |scope| and L the summed likelihood.
    for i in range(topics.shape[1]):
        tmax = 0
        pnmax = None
        for j in range(len(prodNodes)):
            # rdcval = rdcdcor(topics[:, i], pnll[:, j])
            bic = numpy.log(data.shape[0]) * len(prodNodes[j].scope) \
                - 2.0 * numpy.log(numpy.sum(pnll[:, j]))
            if bic > tmax:
                tmax = bic
                pnmax = prodNodes[j]
        print("spn topic")
        print(pnmax.scope)
        print(f[list(pnmax.scope)])
        print()

    print_top_words(lda, features, 10)
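# A minimal stand-in for print_top_words (the real helper lives elsewhere in
# this repo); it shows how the top-n words per topic are read off the fitted
# LDA's components_ matrix, in the style of the sklearn topic-model examples:
def _print_top_words_sketch(model, feature_names, n_top_words=10):
    for topic_idx, topic in enumerate(model.components_):
        top = topic.argsort()[:-n_top_words - 1:-1]
        print("Topic #%d: %s" % (topic_idx,
                                 " ".join(feature_names[i] for i in top)))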
def test4(data, features):
    n_topics = 3
    featureNames = features + ["%s%s" % x
                               for x in zip(["topic"] * n_topics,
                                            range(n_topics))]
    f = numpy.array(featureNames)

    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50., random_state=0)

    numpy.random.seed(17)
    trainIdx = numpy.random.choice([True, False], size=(data.shape[0],),
                                   p=[2. / 3, 1. / 3])
    testIdx = numpy.logical_not(trainIdx)
    train = data[trainIdx, :]
    test = data[testIdx, :]

    lda.fit(train)
    topicstrain = lda.transform(train)
    topicstest = lda.transform(test)

    # One-hot encode the dominant topic of each document.
    maxtrain = numpy.argmax(topicstrain, axis=1)
    topicstrain = numpy.zeros_like(topicstrain)
    for i, c in enumerate(maxtrain):
        topicstrain[i, c] = 1

    maxtest = numpy.argmax(topicstest, axis=1)
    topicstest = numpy.zeros_like(topicstest)
    for i, c in enumerate(maxtest):
        topicstest[i, c] = 1

    # MPE query matrix: observed word columns filled in, topic columns left
    # as NaN for the SPN to infer. (The original built the NaNs via
    # `testMPE / 0`; numpy.full is the explicit equivalent.)
    testMPE = numpy.full((test.shape[0], len(featureNames)), numpy.nan)
    testMPE[:, numpy.arange(test.shape[1])] = test

    print("LEARNING SPN")
    traintopics = numpy.hstack((train, topicstrain))
    # print(testMPE[:, [99, 100, 101, 102]])
    # print(traintopics[:, [99, 100, 101, 102]])
    print(featureNames)

    featureTypes = ["discrete"] * train.shape[1] + \
        ["continuous"] * topicstrain.shape[1]

    domains = []
    for i, ft in enumerate(featureTypes):
        if ft == "continuous":
            r = (0.0, 1.0)
            fd = estimate_continuous_domain(traintopics, i, range=r,
                                            binning_method=20)
            domain = numpy.array(sorted(fd.keys()))
        else:
            domain = numpy.unique(data[:, i])
        # print(i, ft, domain)
        domains.append(domain)

    memory = Memory(cachedir="/tmp/test4", verbose=0, compress=9)

    @memory.cache
    def learn():
        # spn = SPN.LearnStructure(data, featureNames=["X1"], domains=domains,
        #                          families=families,
        #                          row_split_method=Splitting.KmeansRows(),
        #                          col_split_method=Splitting.RDCTest())
        spn = SPN.LearnStructure(traintopics, featureTypes=featureTypes,
                                 row_split_method=Splitting.Gower(),
                                 col_split_method=Splitting.RDCTest(threshold=0.1,
                                                                    linear=True),
                                 featureNames=featureNames, domains=domains,
                                 min_instances_slice=100)
        return spn

    spn = learn()
    spn.root.validate()

    prodNodes = spn.get_nodes_by_type(ProductNode)
    for pn in prodNodes:
        leaves = pn.get_leaves()
        words = set()
        for leaf in leaves:
            # assuming pwl node:
            _x = numpy.argmax(leaf.y_range)
            max_x = leaf.x_range[_x]
            if max_x < 1.0:
                continue
            words.add(featureNames[leaf.featureIdx])
        if len(words) < 4:
            continue
        print(words)

    logs, topicsmpe = spn.root.mpe_eval(testMPE)
    print(spn.get_leaves())
    print(topicsmpe.shape)

    # Columns 100-102 are the topic indicators (this hard-codes 100 word
    # features); re-binarize the MPE output by its argmax.
    print(topicsmpe[:, [100, 101, 102]])
    maxmpe = numpy.argmax(topicsmpe[:, [100, 101, 102]], axis=1)
    topicsmpe = numpy.zeros_like(topicsmpe[:, [100, 101, 102]])
    for i, c in enumerate(maxmpe):
        topicsmpe[i, c] = 1

    print(topicstest)
    print(topicsmpe)
    print(topicstest - topicsmpe)

    correct = numpy.sum(numpy.abs(topicstest - topicsmpe), axis=1) == 0
    print("correct", numpy.sum(correct))
    print("incorrect", topicsmpe.shape[0] - numpy.sum(correct))

    print_top_words(lda, features, 10)
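# Layout of the MPE query matrix used above (a self-contained sketch):
# observed word counts on the left, NaN in the topic columns signals
# "infer these" to the SPN's MPE pass.
def _demo_mpe_query():
    test = numpy.array([[1., 0., 2.], [0., 3., 1.]])  # 3 word features
    n_topics = 2
    q = numpy.full((test.shape[0], test.shape[1] + n_topics), numpy.nan)
    q[:, :test.shape[1]] = test
    print(q)  # last n_topics columns stay NaN until mpe_eval fills them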
def computeSimplexExperiment(dsname, data, dimensions, mixttype,
                             min_instances_slice=700):
    if mixttype == "Archetype":
        _, mixt = getArchetypes(data, dimensions)
        if mixt is None:
            return ()
    elif mixttype == "LDA":
        lda = LatentDirichletAllocation(n_components=dimensions, max_iter=50,
                                        learning_method='online',
                                        learning_offset=50., random_state=0)
        lda.fit(data)
        mixt = lda.transform(data)
    elif mixttype == "RandomSample":
        mixt = numpy.random.dirichlet((1, 1, 1), 20).transpose()
        print(mixt)
        # Deliberate hard stop left in by the author (the original `0/0`).
        raise ZeroDivisionError("debug stop")

    print(mixt.shape)

    def normalize(data):
        # Nudge exact 0/1 coordinates off the simplex boundary, then
        # renormalize the rows (same trick as in computeNIPS).
        mixtd = data
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        mixtnorm = numpy.sum(mixtd, axis=1)
        mixtd = numpy.divide(mixtd, mixtnorm[:, None])
        return mixtd + 0.0

    mixt = normalize(mixt)
    mixt_train, mixt_test = train_test_split(mixt, test_size=0.30,
                                             random_state=42)
    numpy.savetxt("mixt_train.csv", mixt_train)
    numpy.savetxt("mixt_test.csv", mixt_test)

    featureTypes = ["continuous"] * mixt.shape[1]
    domains = []
    for i, ft in enumerate(featureTypes):
        if ft == "continuous":
            r = (0.0, 1.0)
            fd = estimate_continuous_domain(mixt, i, range=r,
                                            binning_method=400)
            domain = numpy.array(sorted(fd.keys()))
        else:
            domain = numpy.unique(data[:, i])
        domains.append(domain)

    dirichlet_alphas = dirichlet.mle(mixt_train, method='meanprecision',
                                     maxiter=100000)

    # @memory.cache
    def learn(data):
        spn = SPN.LearnStructure(data, featureTypes=featureTypes,
                                 row_split_method=Splitting.Gower(),
                                 col_split_method=Splitting.RDCTest(threshold=0.3),
                                 domains=domains, alpha=0.1,
                                 families=['histogram'] * data.shape[1],
                                 min_instances_slice=min_instances_slice)
        return spn

    # For the good pdf plots this was min_instances_slice=700.
    spn = learn(mixt_train)
    print(spn)

    def spnpdf(data):
        data = data.reshape(-1, mixt.shape[1])
        res = spn.root.eval(normalize(data))[0]
        return res

    print(dirichlet_alphas)

    def plotDirichlet(data):
        data = data.reshape(-1, mixt.shape[1])
        try:
            result = scipy.stats.dirichlet.logpdf(
                numpy.transpose(normalize(data)), alpha=dirichlet_alphas)
        except Exception:
            # Dump the offending points before bailing out (the original
            # ended this branch with a `0/0` crash-stop).
            print(normalize(data))
            raise
        return result

    df_train = pandas.DataFrame()
    df_test = pandas.DataFrame()

    dtrain_fit = scipy.stats.dirichlet.logpdf(numpy.transpose(mixt_train),
                                              alpha=dirichlet_alphas)
    dtest_fit = scipy.stats.dirichlet.logpdf(numpy.transpose(mixt_test),
                                             alpha=dirichlet_alphas)
    df_train["dirichlet_train"] = dtrain_fit
    df_test["dirichlet_test"] = dtest_fit

    spn_train_fit = spn.root.eval(mixt_train)
    spn_test_fit = spn.root.eval(mixt_test)
    df_train["spn_train"] = spn_train_fit
    df_test["spn_test"] = spn_test_fit

    if dimensions == 3:
        # The ternary contour plots only make sense on the 3-dimensional
        # simplex, so the whole plotting block is gated on dimensions == 3.
        # (The original also had commented-out "original points" panels.)
        xy_train = cartesian(mixt_train)
        xy_test = cartesian(mixt_test)

        filename = 'plots/%s_%s.pdf' % (dsname, mixttype)
        try:
            import os
            os.remove(filename)
        except OSError:
            pass

        pp = PdfPages(filename)
        markersize = 1.0

        # One page per (density, point set) combination.
        for title, density, xy in [
                ("Dirichlet, train points", plotDirichlet, xy_train),
                ("Dirichlet, test points", plotDirichlet, xy_test),
                ("SPN, train points", spnpdf, xy_train),
                ("SPN, test points", spnpdf, xy_test)]:
            fig = plt.figure()
            plt.title(title)
            draw_pdf_contours_func2(density, vmin=-2, vmax=12)
            plt.plot(xy[:, 0], xy[:, 1], 'ro', markersize=markersize)
            # plt.colorbar()
            fig.tight_layout()
            pp.savefig(fig)

        pp.close()

    return ("name", dsname, "size", data.shape, "type", mixttype,
            "dims", dimensions,
            "spn_train_LL", numpy.mean(spn_train_fit),
            "dir_train_LL", numpy.mean(dtrain_fit),
            "spn_test_LL", numpy.mean(spn_test_fit),
            "dir_test_LL", numpy.mean(dtest_fit),
            "spn_#_sum_nodes", spn.n_sum_nodes(),
            "spn_#_prod_nodes", spn.n_prod_nodes(),
            "spn_#_layers", spn.n_layers())
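# A common barycentric-to-2D mapping of the kind the repo's `cartesian` helper
# presumably performs (a sketch under that assumption; the real helper may
# differ): each simplex point is a convex combination of triangle corners.
def _demo_cartesian(points3):
    corners = numpy.array([[0.0, 0.0],
                           [1.0, 0.0],
                           [0.5, numpy.sqrt(3) / 2.0]])
    return points3.dot(corners)  # (N, 3) simplex coords -> (N, 2) plot coords


# Hypothetical driver for the experiment (the dataset loader is an assumption):
# dsname, data, features = getNips()
# print(computeSimplexExperiment(dsname, data, dimensions=3, mixttype="LDA"))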