def test1(data, features):
    """Cross-validate an SPN learned on 3-archetype mixture weights.

    Restricts ``data`` to columns 1..19, asks ``getArchetypes`` for three
    archetypes and the matching mixture weights, then runs 10 folds from
    ``kfolded``: learn an SPN on the training part, validate it, and record
    the mean of ``spn.root.eval(test)`` (as ``Stats.LOG_LIKELIHOOD``) and the
    learning time (as ``Stats.TIME``) under the "HSPN" label.

    NOTE(review): ``dsname`` is not a parameter; it is read from the
    enclosing scope -- confirm it is defined at module level.
    NOTE(review): the block structure was rebuilt from source that had lost
    its line breaks -- confirm that ``save`` runs once per fold and that the
    archetypes are printed once, after the loop.
    """
    data = data[:, 1:20]
    features = features[:data.shape[1]]
    archetypes, weights = getArchetypes(data, 3)

    fold_count = 10
    results = Stats(name=dsname)

    for train, test, _fold in kfolded(weights, fold_count):
        timer = Chrono().start()
        spn = SPN.LearnStructure(
            train,
            featureTypes=["continuous"] * train.shape[1],
            row_split_method=Splitting.KmeansRows(),
            col_split_method=Splitting.RDCTest(threshold=0.3),
            min_instances_slice=100,
        )
        timer.end()

        spn.root.validate()
        mean_ll = numpy.mean(spn.root.eval(test))
        print(mean_ll)

        results.add("HSPN", Stats.LOG_LIKELIHOOD, mean_ll)
        results.add("HSPN", Stats.TIME, timer.elapsed())
        results.save("stats_" + dsname + ".json")

    print(archetypes)
def test2(data, features):
    """Print the 3-archetype mixture weights of ``data`` and then halt.

    The ``0 / 0`` statement raises ZeroDivisionError right after the print,
    so the SPN learning call that follows it is never executed.
    ``features`` is accepted but not used.
    """
    _, weights = getArchetypes(data, 3)
    print(weights)

    # Deliberate stop: everything below this line is unreachable.
    0 / 0

    column_count = weights.shape[1]
    spn = SPN.LearnStructure(
        weights,
        featureTypes=["continuous"] * column_count,
        row_split_method=Splitting.KmeansRows(),
        col_split_method=Splitting.RDCTest(threshold=0.3),
        min_instances_slice=100,
    )
def test6(data):
    """Compare a Dirichlet MLE fit with a learned SPN on 3-archetype weights.

    Steps: obtain mixture weights from ``getArchetypes(data, 3)``, normalise
    them, fit Dirichlet parameters with ``dirichlet.mle``, learn an SPN on the
    same weights, draw samples from both models and write four contour pages
    (Dirichlet / SPN, each with the original and the sampled points) to
    ``plots/dirichlet_mle.pdf``.

    Args:
        data: array handed to ``getArchetypes``; its row count also sets how
            many samples are drawn from each model.

    Returns:
        None. The results are the PDF file and what is printed to stdout.
    """
    import os

    n_archetypes = 3
    # ``markersize`` was used by the plot calls but never bound inside this
    # function; bind it locally (1.0 is the value computeSimplexExperiment
    # uses) so plotting does not depend on a module-level name.
    markersize = 1.0

    print(data.shape)
    _, mixt = getArchetypes(data, n_archetypes)

    def normalize(data):
        """Return a row-normalised copy of ``data`` with exact 0/1 nudged inward."""
        # Work on a copy: callers pass reshaped plotting grids, and reshape
        # can return a view, so editing in place would alter their arrays.
        mixtd = data.copy()
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        mixtnorm = numpy.sum(mixtd, axis=1)
        return numpy.divide(mixtd, mixtnorm[:, None])

    mixt = normalize(mixt)

    dirichlet_alphas = dirichlet.mle(mixt, method='meanprecision', maxiter=100000)

    # Every column is treated as continuous on [0, 1].
    featureTypes = ["continuous"] * mixt.shape[1]
    domains = []
    for i in range(mixt.shape[1]):
        fd = estimate_continuous_domain(mixt, i, range=(0.0, 1.0), binning_method=400)
        domains.append(numpy.array(sorted(fd.keys())))
    print(domains)

    # NOTE(review): ``learn`` reads featureTypes/domains from the enclosing
    # scope; presumably the cache key only covers ``data`` -- confirm that a
    # stale cached model cannot be returned when those change.
    @memory.cache
    def learn(data):
        spn = SPN.LearnStructure(
            data,
            featureTypes=featureTypes,
            row_split_method=Splitting.Gower(),
            col_split_method=Splitting.RDCTest(threshold=0.3),
            domains=domains,
            alpha=1,
            min_instances_slice=50,
        )
        return spn

    spn = learn(mixt)
    print(spn)

    # All-NaN buffer handed to the sampler (built directly instead of via a
    # division by zero, which produced the same values plus a RuntimeWarning).
    # NOTE(review): presumably NaN marks the entries to be sampled -- confirm.
    spn_samples = numpy.full((data.shape[0], n_archetypes), numpy.nan)
    _, spn_samples = spn.root.sample(spn_samples)
    spn_samples = normalize(spn_samples)

    def plotDirichlet(data):
        data = data.reshape(-1, mixt.shape[1])
        return scipy.stats.dirichlet.logpdf(numpy.transpose(normalize(data)), alpha=dirichlet_alphas)

    def spnpdf(data):
        data = data.reshape(-1, mixt.shape[1])
        return spn.root.eval(normalize(data))[0]

    def add_page(pdf, density, title, xy):
        """Draw one contour page for ``density`` with the points ``xy`` on top."""
        fig = plt.figure()
        draw_pdf_contours_func(density)
        plt.title(title)
        plt.plot(xy[:, 0], xy[:, 1], 'ro', markersize=markersize)
        plt.colorbar()
        pdf.savefig(fig)

    xy_all = cartesian(mixt)

    filename = 'plots/dirichlet_mle.pdf'
    try:
        os.remove(filename)
    except OSError:
        # Nothing to delete (or not deletable); PdfPages reports real problems.
        pass

    # The context manager closes the PDF even if one of the pages fails.
    with PdfPages(filename) as pp:
        add_page(pp, plotDirichlet, "dirichlet trained on all, original points", xy_all)

        numpy.random.seed(17)
        mixt_samples = numpy.random.dirichlet(dirichlet_alphas, data.shape[0])
        print(dirichlet_alphas)
        xy_samples = cartesian(mixt_samples)
        add_page(pp, plotDirichlet, "dirichlet trained on all, sampled points", xy_samples)

        xy_spn_samples = cartesian(spn_samples)
        add_page(pp, spnpdf, "spn trained on all, original points", xy_all)
        add_page(pp, spnpdf, "spn trained on all, sampled points", xy_spn_samples)
def getAirPollution(dimensions):
    """Run a 10-fold Dirichlet-vs-SPN comparison on the air-quality data.

    Loads the data set through ``getAirQualityUCITimeless``, drops columns
    with 2000 or more ``-200`` entries and then every row that still holds a
    ``-200`` entry, and turns the remainder into ``dimensions``-archetype
    mixture weights. For each fold a Dirichlet (``dirichlet.mle``) and an SPN
    are fitted on the training part; the summed and mean test scores of both
    are added to a ``Stats`` object, which is saved to
    ``results/airpollution/<dsname>-<dimensions>.json``.

    Returns None in every case; when ``getArchetypes`` yields no mixture the
    function prints a message and returns early.

    NOTE(review): the block structure was rebuilt from source that had lost
    its line breaks -- confirm that the save belongs inside the fold loop.
    """
    dsname, data, _features = getAirQualityUCITimeless()

    # -200 is the marker counted as "missing" here: first thin out columns,
    # then keep only rows without any such entry.
    missing_per_column = numpy.sum(data == -200, 0)
    data = data[:, missing_per_column < 2000]
    missing_per_row = numpy.sum(data == -200, 1)
    data = data[missing_per_row == 0, :]
    print(data.shape)

    _, mixt = getArchetypes(data, dimensions)
    if mixt is None:
        print("no archetypes", dimensions)
        return

    def normalize(data):
        # Nudges exact ones and zeros in the argument itself (in place),
        # then returns a new array whose rows are divided by their sums.
        at_one = data == 1
        data[at_one] = data[at_one] - 0.0000001
        at_zero = data == 0
        data[at_zero] = data[at_zero] + 0.0000001
        row_totals = numpy.sum(data, axis=1)
        return numpy.divide(data, row_totals[:, None]) + 0.0

    mixt = normalize(mixt)
    print(data.shape)

    column_count = mixt.shape[1]
    featureTypes = ["continuous"] * column_count
    domains = [[0, 1]] * column_count
    print(domains)

    @memory.cache
    def learn(data):
        return SPN.LearnStructure(
            data,
            featureTypes=featureTypes,
            row_split_method=Splitting.Gower(),
            col_split_method=Splitting.RDCTest(threshold=0.3),
            domains=domains,
            alpha=0.1,
            families=['histogram'] * data.shape[1],
            min_instances_slice=200,
        )

    results = Stats(name=dsname)
    for train, test, fold in kfolded(mixt, 10):
        print(fold)

        alphas = dirichlet.mle(train, method='meanprecision', maxiter=1000000)
        print("dirichlet done")
        dirichlet_ll = scipy.stats.dirichlet.logpdf(numpy.transpose(test), alpha=alphas)
        results.add("DIRICHLET", Stats.LOG_LIKELIHOOD, numpy.sum(dirichlet_ll))
        results.add("DIRICHLET", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(dirichlet_ll))

        model = learn(train)
        print("spn done")
        spn_ll = model.root.eval(test)
        print(results.add("SPN", Stats.LOG_LIKELIHOOD, numpy.sum(spn_ll)))
        print(results.add("SPN", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(spn_ll)))

        results.save("results/airpollution/" + dsname + "-" + str(dimensions) + ".json")
def computeSimplexExperiment(dsname, data, dimensions, mixttype, min_instances_slice=700):
    """Fit a Dirichlet and an SPN to simplex-valued mixtures and score both.

    The mixture representation of ``data`` is split 70/30 into train and test
    parts; a Dirichlet (``dirichlet.mle``) and an SPN are fitted on the
    training part and evaluated on both parts.

    Args:
        dsname: data set label; used in the plot file name and echoed in the
            result tuple.
        data: input array from which the mixtures are derived.
        dimensions: number of archetypes / LDA topics. Contour plots are only
            produced when this equals 3.
        mixttype: "Archetype" (``getArchetypes``) or "LDA"
            (``LatentDirichletAllocation``). "RandomSample" is a debugging
            stub that always raises.
        min_instances_slice: forwarded to ``SPN.LearnStructure``.

    Returns:
        ``()`` when ``getArchetypes`` yields no mixture; otherwise a flat
        tuple of alternating labels and values: name, size, type, dims, the
        mean train/test scores of both models and the SPN's sum-node,
        product-node and layer counts.

    Raises:
        ValueError: ``mixttype`` is not one of the recognised names.
        NotImplementedError: ``mixttype`` is "RandomSample".

    Side effects:
        Writes ``mixt_train.csv`` and ``mixt_test.csv`` to the working
        directory and, for ``dimensions == 3``,
        ``plots/<dsname>_<mixttype>.pdf``.
    """
    import os

    if mixttype == "Archetype":
        _, mixt = getArchetypes(data, dimensions)
        if mixt is None:
            return ()
    elif mixttype == "LDA":
        # NOTE(review): newer scikit-learn releases renamed ``n_topics`` to
        # ``n_components`` -- confirm against the installed version.
        lda = LatentDirichletAllocation(n_topics=dimensions, max_iter=50,
                                        learning_method='online',
                                        learning_offset=50., random_state=0)
        lda.fit(data)
        mixt = lda.transform(data)
    elif mixttype == "RandomSample":
        mixt = numpy.random.dirichlet((1, 1, 1), 20).transpose()
        print(mixt)
        # This branch only ever printed a sample and halted; fail with an
        # explicit error instead of a division by zero.
        raise NotImplementedError(
            "mixttype 'RandomSample' is a debugging stub and cannot be used for experiments")
    else:
        # Without this guard an unknown name surfaced as an unbound ``mixt``.
        raise ValueError("unknown mixttype: %r" % (mixttype,))

    print(mixt.shape)

    def normalize(data):
        """Return a row-normalised copy of ``data`` with exact 0/1 nudged inward."""
        # Work on a copy: callers pass reshaped plotting grids, and reshape
        # can return a view, so editing in place would alter their arrays.
        mixtd = data.copy()
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        mixtnorm = numpy.sum(mixtd, axis=1)
        return numpy.divide(mixtd, mixtnorm[:, None]) + 0.0

    mixt = normalize(mixt)
    mixt_train, mixt_test = train_test_split(mixt, test_size=0.30, random_state=42)

    numpy.savetxt("mixt_train.csv", mixt_train)
    numpy.savetxt("mixt_test.csv", mixt_test)

    # Every column is treated as continuous on [0, 1].
    featureTypes = ["continuous"] * mixt.shape[1]
    domains = []
    for i in range(mixt.shape[1]):
        fd = estimate_continuous_domain(mixt, i, range=(0.0, 1.0), binning_method=400)
        domains.append(numpy.array(sorted(fd.keys())))

    dirichlet_alphas = dirichlet.mle(mixt_train, method='meanprecision', maxiter=100000)

    # @memory.cache
    def learn(data):
        spn = SPN.LearnStructure(
            data,
            featureTypes=featureTypes,
            row_split_method=Splitting.Gower(),
            col_split_method=Splitting.RDCTest(threshold=0.3),
            domains=domains,
            alpha=0.1,
            families=['histogram'] * data.shape[1],
            # for the good pdf it was 700
            min_instances_slice=min_instances_slice,
        )
        return spn

    spn = learn(mixt_train)
    print(spn)

    def spnpdf(data):
        data = data.reshape(-1, mixt.shape[1])
        return spn.root.eval(normalize(data))[0]

    print(dirichlet_alphas)

    def plotDirichlet(data):
        normalized = normalize(data.reshape(-1, mixt.shape[1]))
        try:
            return scipy.stats.dirichlet.logpdf(numpy.transpose(normalized), alpha=dirichlet_alphas)
        except Exception:
            # Show the offending points, then let the real error propagate
            # instead of masking it behind a different exception.
            print(normalized)
            raise

    dtrain_fit = scipy.stats.dirichlet.logpdf(numpy.transpose(mixt_train), alpha=dirichlet_alphas)
    dtest_fit = scipy.stats.dirichlet.logpdf(numpy.transpose(mixt_test), alpha=dirichlet_alphas)

    spn_train_fit = spn.root.eval(mixt_train)
    spn_test_fit = spn.root.eval(mixt_test)

    if dimensions == 3:
        xy_train = cartesian(mixt_train)
        xy_test = cartesian(mixt_test)
        markersize = 1.0

        def add_page(pdf, density, title, xy):
            """Draw one contour page for ``density`` with the points ``xy`` on top."""
            fig = plt.figure()
            plt.title(title)
            draw_pdf_contours_func2(density, vmin=-2, vmax=12)
            plt.plot(xy[:, 0], xy[:, 1], 'ro', markersize=markersize)
            fig.tight_layout()
            pdf.savefig(fig)

        filename = 'plots/%s_%s.pdf' % (dsname, mixttype)
        try:
            os.remove(filename)
        except OSError:
            # Nothing to delete (or not deletable); PdfPages reports real problems.
            pass

        # The context manager closes the PDF even if one of the pages fails.
        with PdfPages(filename) as pp:
            add_page(pp, plotDirichlet, "Dirichlet, train points", xy_train)
            add_page(pp, plotDirichlet, "Dirichlet, test points", xy_test)
            add_page(pp, spnpdf, "SPN, train points", xy_train)
            add_page(pp, spnpdf, "SPN, test points", xy_test)

    return ("name", dsname, "size", data.shape, "type", mixttype, "dims", dimensions,
            "spn_train_LL", numpy.mean(spn_train_fit), "dir_train_LL", numpy.mean(dtrain_fit),
            "spn_test_LL", numpy.mean(spn_test_fit), "dir_test_LL", numpy.mean(dtest_fit),
            "spn_#_sum_nodes", spn.n_sum_nodes(), "spn_#_prod_nodes", spn.n_prod_nodes(),
            "spn_#_layers", spn.n_layers())