def test4(data, features):
    """LDA-topics + SPN experiment.

    Fits a 3-topic LDA on a train split of `data`, appends one-hot topic
    indicator columns to the word features, learns an SPN over the joint
    space, then uses MPE inference on the test split to predict each test
    document's topic.  Correct/incorrect counts are printed, not returned.

    data     : 2-D numpy array, documents x word-count features (discrete).
    features : list of feature (word) names; "topic0".."topic2" get appended.
    """
    n_topics = 3
    # feature names = original word features + one synthetic name per topic
    featureNames = features + ["%s%s" % x for x in zip(["topic"] * n_topics, range(n_topics))]
    f = numpy.array(featureNames)  # NOTE(review): unused local — kept as-is
    # NOTE(review): `n_topics` was renamed `n_components` in scikit-learn >= 0.19;
    # confirm the pinned sklearn version still accepts this keyword.
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    # deterministic ~2/3 train, ~1/3 test row split
    numpy.random.seed(17)
    trainIdx = numpy.random.choice([True, False], size=(data.shape[0],), p=[2. / 3, 1. / 3])
    testIdx = numpy.logical_not(trainIdx)
    train = data[trainIdx, :]
    test = data[testIdx, :]
    lda.fit(train)
    topicstrain = lda.transform(train)
    topicstest = lda.transform(test)
    # harden the soft LDA topic distributions into one-hot (argmax) vectors
    maxtrain = numpy.argmax(topicstrain, axis=1)
    topicstrain = numpy.zeros_like(topicstrain)
    for i, c in enumerate(maxtrain):
        topicstrain[i, c] = 1
    maxtest = numpy.argmax(topicstest, axis=1)
    topicstest = numpy.zeros_like(topicstest)
    for i, c in enumerate(maxtest):
        topicstest[i, c] = 1
    # MPE query matrix: word columns observed, topic columns NaN (0/0 yields
    # NaN = "to be inferred" marker for mpe_eval)
    testMPE = numpy.zeros((test.shape[0], len(featureNames)))
    testMPE = testMPE / 0
    testMPE[:, numpy.arange(test.shape[1])] = test
    print("LEARNING SPN")
    # training matrix = word counts + one-hot topic indicators
    traintopics = numpy.hstack((train, topicstrain))
    # print(testMPE[:,[99, 100,101,102]])
    # print(traintopics[:,[99, 100,101,102]])
    print(featureNames)
    # word features are discrete; the appended topic indicators are continuous
    featureTypes = ["discrete"] * train.shape[1] + ["continuous"] * topicstrain.shape[1]
    domains = []
    for i, ft in enumerate(featureTypes):
        if ft == "continuous":
            r = (0.0, 1.0)
            fd = estimate_continuous_domain(traintopics, i, range=r, binning_method=20)
            domain = numpy.array(sorted(fd.keys()))
        else:
            domain = numpy.unique(data[:, i])
        # print(i, ft, domain)
        domains.append(domain)
    # joblib cache so repeated runs reuse the learned structure
    # NOTE(review): `cachedir`/`compress` kwargs follow the older joblib API —
    # newer joblib uses `location`; confirm the pinned joblib version.
    memory = Memory(cachedir="/tmp/test4", verbose=0, compress=9)

    @memory.cache
    def learn():
        # learn SPN structure over the joint words + one-hot-topics matrix
        spn = SPN.LearnStructure(traintopics, featureTypes=featureTypes, row_split_method=Splitting.Gower(), col_split_method=Splitting.RDCTest(threshold=0.1, linear=True),
                                 featureNames=featureNames, domains=domains,
                                 # spn = SPN.LearnStructure(data, featureNames=["X1"], domains =
                                 # domains, families=families, row_split_method=Splitting.KmeansRows(),
                                 # col_split_method=Splitting.RDCTest(),
                                 min_instances_slice=100)
        return spn
    spn = learn()
    spn.root.validate()
    # inspect product nodes: collect the features whose leaf mass peaks at x >= 1.0
    prodNodes = spn.get_nodes_by_type(ProductNode)
    for pn in prodNodes:
        leaves = pn.get_leaves()
        words = set()
        for leaf in leaves:
            # assuming pwl node:
            _x = numpy.argmax(leaf.y_range)
            max_x = leaf.x_range[_x]
            if max_x < 1.0:
                continue
            words.add(featureNames[leaf.featureIdx])
        # ll = pn.eval()
        # only print product nodes grouping at least 4 such features
        if len(words) < 4:
            continue
        print(words)
    # MPE: infer the most probable values for the NaN (topic) columns
    logs, topicsmpe = spn.root.mpe_eval(testMPE)
    print(spn.get_leaves())
    print(topicsmpe.shape)
    # NOTE(review): topic columns hard-coded as [100, 101, 102] — only valid
    # when len(features) == 100; confirm against the caller's data.
    print(topicsmpe[:, [100, 101, 102]])
    maxmpe = numpy.argmax(topicsmpe[:, [100, 101, 102]], axis=1)
    topicsmpe = numpy.zeros_like(topicsmpe[:, [100, 101, 102]])
    for i, c in enumerate(maxmpe):
        topicsmpe[i, c] = 1
    print(topicstest)
    print(topicsmpe)
    print(topicstest - topicsmpe)
    # a row is correct iff the one-hot MPE topic equals the LDA test topic
    correct = numpy.sum(numpy.abs(topicstest - topicsmpe), axis=1) == 0
    print("correct", numpy.sum(correct))
    print("incorrect", topicsmpe.shape[0] - numpy.sum(correct))
    print_top_words(lda, features, 10)
def test6(data):
    """Archetype-mixture vs. Dirichlet-MLE experiment with SPN density plots.

    Projects `data` onto 3 archetypes, fits a Dirichlet by MLE and an SPN on
    the normalized mixture weights, samples from both models, and writes four
    contour/scatter pages to plots/dirichlet_mle.pdf.  No return value.

    Relies on module-level globals (`memory`, `markersize`, plotting helpers).
    """
    print(data.shape)
    _, mixt = getArchetypes(data, 3)

    def normalize(data):
        # Nudge exact 0/1 entries off the simplex boundary, then renormalize
        # rows to sum to 1.  NOTE(review): mutates its argument in place.
        mixtd = data
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        mixtnorm = numpy.sum(mixtd, axis=1)
        mixtd = numpy.divide(mixtd, mixtnorm[:, None])
        return mixtd
    mixt = normalize(mixt)
    # maximum-likelihood Dirichlet fit on the archetype weights
    dirichlet_alphas = dirichlet.mle(mixt, method='meanprecision', maxiter=100000)
    featureTypes = ["continuous"] * mixt.shape[1]
    domains = []
    for i, ft in enumerate(featureTypes):
        if ft == "continuous":
            r = (0.0, 1.0)
            fd = estimate_continuous_domain(mixt, i, range=r, binning_method=400)
            domain = numpy.array(sorted(fd.keys()))
        else:
            # dead branch here: all features are continuous
            domain = numpy.unique(data[:, i])
        domains.append(domain)
    print(domains)

    # uses the module-level joblib `memory` cache
    @memory.cache
    def learn(data):
        spn = SPN.LearnStructure(data, featureTypes=featureTypes, row_split_method=Splitting.Gower(), col_split_method=Splitting.RDCTest(threshold=0.3), domains=domains, alpha=1,
                                 # spn = SPN.LearnStructure(data, featureNames=["X1"], domains =
                                 # domains, families=families, row_split_method=Splitting.KmeansRows(),
                                 # col_split_method=Splitting.RDCTest(),
                                 min_instances_slice=50)
        return spn
    spn = learn(mixt)
    print(spn)
    # all-NaN (0/0) matrix asks the SPN sampler to fill in every cell
    spn_samples = numpy.zeros((data.shape[0], 3))/0
    a,spn_samples = spn.root.sample(spn_samples)
    spn_samples = normalize(spn_samples)
    #dtrain_fit = scipy.stats.dirichlet.logpdf(numpy.transpose(mixt_train), alpha=dirichlet_alphas)

    def plotDirichlet(data):
        # Dirichlet log-density of (renormalized) points, for contour plotting
        data = data.reshape(-1, mixt.shape[1])
        result = scipy.stats.dirichlet.logpdf(numpy.transpose(normalize(data)), alpha=dirichlet_alphas)
        return result

    def spnpdf(data):
        # SPN log-density of (renormalized) points, for contour plotting
        data = data.reshape(-1, mixt.shape[1])
        res = spn.root.eval(normalize(data))[0]
        return res
    xy_all = cartesian(mixt)
    filename = 'plots/dirichlet_mle.pdf'
    # remove any stale output file before writing
    try:
        import os
        os.remove(filename)
    except OSError:
        pass
    pp = PdfPages(filename)
    # page 1: Dirichlet contours + original archetype points
    fig = plt.figure()
    draw_pdf_contours_func(plotDirichlet)
    plt.title("dirichlet trained on all, original points")
    plt.plot(xy_all[:, 0], xy_all[:, 1], 'ro', markersize=markersize)
    plt.colorbar()
    pp.savefig(fig)
    # page 2: Dirichlet contours + points sampled from the fitted Dirichlet
    numpy.random.seed(17)
    mixt_samples = numpy.random.dirichlet(dirichlet_alphas, data.shape[0])
    print(dirichlet_alphas)
    xy_samples = cartesian(mixt_samples)
    fig = plt.figure()
    draw_pdf_contours_func(plotDirichlet)
    plt.title("dirichlet trained on all, sampled points")
    plt.plot(xy_samples[:, 0], xy_samples[:, 1], 'ro', markersize=markersize)
    plt.colorbar()
    pp.savefig(fig)
    # page 3: SPN contours + original points
    xy_spn_samples = cartesian(spn_samples)
    fig = plt.figure()
    draw_pdf_contours_func(spnpdf)
    plt.title("spn trained on all, original points")
    plt.plot(xy_all[:, 0], xy_all[:, 1], 'ro', markersize=markersize)
    plt.colorbar()
    pp.savefig(fig)
    # page 4: SPN contours + points sampled from the SPN
    xy_spn_samples = cartesian(spn_samples)
    fig = plt.figure()
    draw_pdf_contours_func(spnpdf)
    plt.title("spn trained on all, sampled points")
    plt.plot(xy_spn_samples[:, 0], xy_spn_samples[:, 1], 'ro', markersize=markersize)
    plt.colorbar()
    pp.savefig(fig)
    pp.close()
def computeSimplexExperiment(dsname, data, dimensions, mixttype, min_instances_slice=700):
    """Compare SPN vs. Dirichlet-MLE density fits on simplex mixture weights.

    Obtains mixture weights from `data` via archetypal analysis or LDA
    (per `mixttype`), fits a Dirichlet (MLE) and an SPN on the train split,
    and — for dimensions == 3 — writes contour/scatter pages to
    plots/<dsname>_<mixttype>.pdf.

    Returns a flat tuple of labeled results (mean train/test log-likelihoods
    for both models plus SPN size statistics), or () when archetype
    computation fails.
    """
    if mixttype == "Archetype":
        _, mixt = getArchetypes(data, dimensions)
        if mixt is None:
            return ()
    elif mixttype == "LDA":
        # NOTE(review): `n_topics` was renamed `n_components` in newer
        # scikit-learn — confirm the pinned version accepts this keyword.
        lda = LatentDirichletAllocation(n_topics=dimensions, max_iter=50, learning_method='online', learning_offset=50., random_state=0)
        lda.fit(data)
        mixt = lda.transform(data)
    elif mixttype == "RandomSample":
        mixt = numpy.random.dirichlet((1,1,1), 20).transpose()
        print(mixt)
        # deliberate ZeroDivisionError: this branch is an unfinished debugging stub
        0/0
    print(mixt.shape)

    def normalize(data):
        # Nudge exact 0/1 entries off the simplex boundary and renormalize rows.
        # NOTE(review): mutates its argument in place; `+0.0` returns a copy.
        mixtd = data
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        mixtnorm = numpy.sum(mixtd, axis=1)
        mixtd = numpy.divide(mixtd, mixtnorm[:, None])
        return mixtd+0.0
    mixt = normalize(mixt)
    mixt_train, mixt_test = train_test_split(mixt, test_size=0.30, random_state=42)
    # dump the splits for external inspection/reproduction
    numpy.savetxt("mixt_train.csv", mixt_train)
    numpy.savetxt("mixt_test.csv", mixt_test)
    #0/0
    featureTypes = ["continuous"] * mixt.shape[1]
    domains = []
    for i, ft in enumerate(featureTypes):
        if ft == "continuous":
            r = (0.0, 1.0)
            fd = estimate_continuous_domain(mixt, i, range=r, binning_method=400)
            domain = numpy.array(sorted(fd.keys()))
        else:
            # dead branch here: all features are continuous
            domain = numpy.unique(data[:, i])
        domains.append(domain)
    # maximum-likelihood Dirichlet fit on the training weights
    dirichlet_alphas = dirichlet.mle(mixt_train, method='meanprecision', maxiter=100000)

    #@memory.cache
    def learn(data):
        spn = SPN.LearnStructure(data, featureTypes=featureTypes, row_split_method=Splitting.Gower(), col_split_method=Splitting.RDCTest(threshold=0.3), domains=domains, alpha=0.1,
                                 families = ['histogram'] * data.shape[1],
                                 # spn = SPN.LearnStructure(data, featureNames=["X1"], domains =
                                 # domains, families=families, row_split_method=Splitting.KmeansRows(),
                                 # col_split_method=Splitting.RDCTest(),
                                 min_instances_slice=min_instances_slice)
        return spn
    #for the good pdf it was 700
    spn = learn(mixt_train)
    print(spn)

    def spnpdf(data):
        # SPN log-density of (renormalized) points, for contour plotting
        data = data.reshape(-1, mixt.shape[1])
        res = spn.root.eval(normalize(data))[0]
        return res
    print(dirichlet_alphas)

    def plotDirichlet(data):
        # Dirichlet log-density for contour plotting; on failure dumps the
        # offending input and crashes deliberately (0/0 debugging stub).
        data = data.reshape(-1, mixt.shape[1])
        try:
            result = scipy.stats.dirichlet.logpdf(numpy.transpose(normalize(data)), alpha=dirichlet_alphas)
        except:
            print(normalize(data))
            print(normalize(data)*1.0)
            print(normalize(data)+1)
            print(normalize(data)+0)
            0/0
        return result
    df_train = pandas.DataFrame()
    df_test = pandas.DataFrame()
    # held-out log-likelihoods under both models
    dtrain_fit = scipy.stats.dirichlet.logpdf(numpy.transpose(mixt_train), alpha=dirichlet_alphas)
    dtest_fit = scipy.stats.dirichlet.logpdf(numpy.transpose(mixt_test), alpha=dirichlet_alphas)
    df_train["dirichlet_train"] = dtrain_fit
    df_test["dirichlet_test"] = dtest_fit
    spn_train_fit = spn.root.eval(mixt_train)
    spn_test_fit = spn.root.eval(mixt_test)
    df_train["spn_train"] = spn_train_fit
    df_test["spn_test"] = spn_test_fit
    # plots are only meaningful on the 2-simplex (3 dimensions)
    if dimensions == 3:
        xy_train = cartesian(mixt_train)
        xy_test = cartesian(mixt_test)
        filename = 'plots/%s_%s.pdf' % (dsname, mixttype)
        # remove any stale output file before writing
        try:
            import os
            os.remove(filename)
        except OSError:
            pass
        pp = PdfPages(filename)
        markersize = 1.0
        # all
        # fig = plt.figure()
        # plt.title("dirichlet, original points")
        # draw_pdf_contours_func2(plotDirichlet, vmin=-2, vmax=12)
        # #draw_pdf_contours_func2(xy_all[:, 0], xy_all[:, 1],plotDirichlet)
        # plt.plot(xy_all[:, 0], xy_all[:, 1], 'ro', markersize=markersize)
        # plt.colorbar()
        # pp.savefig(fig)
        # train
        fig = plt.figure()
        plt.title("Dirichlet, train points")
        draw_pdf_contours_func2(plotDirichlet, vmin=-2, vmax=12)
        #draw_pdf_contours_func2(xy_all[:, 0], xy_all[:, 1],plotDirichlet)
        plt.plot(xy_train[:, 0], xy_train[:, 1], 'ro', markersize=markersize)
        #plt.colorbar()
        fig.tight_layout()
        pp.savefig(fig)
        # test
        fig = plt.figure()
        plt.title("Dirichlet, test points")
        draw_pdf_contours_func2(plotDirichlet, vmin=-2, vmax=12)
        #draw_pdf_contours_func2(xy_all[:, 0], xy_all[:, 1],plotDirichlet)
        plt.plot(xy_test[:, 0], xy_test[:, 1], 'ro', markersize=markersize)
        #plt.colorbar()
        fig.tight_layout()
        pp.savefig(fig)
        # all
        # fig = plt.figure()
        # plt.title("spn, original points")
        # draw_pdf_contours_func2(spnpdf, vmin=-2, vmax=12)
        # #draw_pdf_contours_func2(xy_all[:, 0], xy_all[:, 1],spnpdf)
        # # plt.plot(xy_all[:, 0], xy_all[:, 1], 'ro', markersize=markersize)
        # plt.colorbar()
        # pp.savefig(fig)
        # train
        fig = plt.figure()
        plt.title("SPN, train points")
        draw_pdf_contours_func2(spnpdf, vmin=-2, vmax=12)
        #draw_pdf_contours_func2(xy_all[:, 0], xy_all[:, 1],spnpdf)
        plt.plot(xy_train[:, 0], xy_train[:, 1], 'ro', markersize=markersize)
        #plt.colorbar()
        fig.tight_layout()
        pp.savefig(fig)
        # test
        fig = plt.figure()
        plt.title("SPN, test points")
        draw_pdf_contours_func2(spnpdf, vmin=-2, vmax=12)
        #draw_pdf_contours_func2(xy_all[:, 0], xy_all[:, 1],spnpdf)
        plt.plot(xy_test[:, 0], xy_test[:, 1], 'ro', markersize=markersize)
        #plt.colorbar()
        fig.tight_layout()
        pp.savefig(fig)
        pp.close()
    # labeled flat tuple: experiment metadata, mean LLs, and SPN size stats
    return ("name", dsname, "size", data.shape, "type", mixttype, "dims", dimensions,
            "spn_train_LL", numpy.mean(spn_train_fit), "dir_train_LL", numpy.mean(dtrain_fit),
            "spn_test_LL", numpy.mean(spn_test_fit), "dir_test_LL", numpy.mean(dtest_fit)
            , "spn_#_sum_nodes", spn.n_sum_nodes(), "spn_#_prod_nodes", spn.n_prod_nodes(), "spn_#_layers", spn.n_layers()
            )
# Script fragment: re-estimate continuous-feature domains from the (already
# processed) training data and write the feature-info file back out.
# NOTE(review): relies on names defined earlier in the script (proc_path,
# proc_data, feature_formats, fnames, ftypes, cfdomains, train, args, d,
# proc_dir, FEATURE_INFO_EXT) — presumably part of a larger preprocessing
# pipeline; verify against the full file.
numpy.savetxt(proc_path, proc_data, fmt=feature_formats, delimiter=',')
pfdomains = []
for i, (fn, ft, fd) in enumerate(zip(fnames, ftypes, cfdomains)):
    if ft == 'continuous':
        # re-bin the continuous domain over the feature's recorded min/max range
        r = (fd['min'], fd['max'])
        print('range', r)
        #
        # using astropy
        fd = estimate_continuous_domain(train, i, range=r, binning_method=args.bins)
        # fd = estimate_continuous_domain_min_max(train, i, min=fd['min'], max=fd['max'])
        print(fd)
    # non-continuous features keep their existing domain unchanged
    pfdomains.append(fd)
#
# writing back the domain for the missing values
feature_info_name = '{}.{}'.format(d, FEATURE_INFO_EXT)
proc_feature_info_path = os.path.join(proc_dir, feature_info_name)
save_feature_info_dict(fnames, ftypes, pfdomains, proc_feature_info_path, domain_keys=False, range=False)
# Script fragment: load the autism dataset's feature-info file and fill in
# any missing domains by estimating them from the data.
# NOTE(review): relies on names defined earlier in the script (args, data,
# bin_method, FEATURE_INFO_EXT, logging config) — verify against the full file.
feature_info_name = 'autism.{}'.format(FEATURE_INFO_EXT)
feature_info_path = os.path.join(args.datadir, feature_info_name)
fnames, ftypes, fdomains = load_feature_info_preprocess(feature_info_path)
logging.info('Loaded feature info file {}'.format(feature_info_path))
cfdomains = []
for i, (fn, ft, fd) in enumerate(zip(fnames, ftypes, fdomains)):
    # only estimate a domain when the info file left it unspecified (None)
    if ft == 'categorical':
        if fd is None:
            fd = estimate_categorical_domain(data, i)
    elif ft == 'continuous':
        if fd is None:
            # pad the observed range slightly so boundary values fall inside a bin
            fd = estimate_continuous_domain(data, i, range=(data[:, i].min() - 0.01, data[:, i].max() + 0.01), binning_method=bin_method)
    elif ft == 'discrete':
        # discrete features are domain-estimated the same way as categorical ones
        if fd is None:
            fd = estimate_categorical_domain(data, i)
        # try:
        #     _ = [float(k) for k, v in fd.items()]
        # except:
        #     logging.info('Cannot convert discrete domain to float {}'.format(fd))
        #     fd = {str(v): v for _k, v in fd.items()}
    cfdomains.append(fd)
#
# feature info
print('Reading domains by feature info file stub {}'.format( args.stub_feature_info_file)) _fnames, _ftypes, _domains = load_feature_info_preprocess( args.stub_feature_info_file) domains_x = _domains[:n_x] print(domains_x) print(_domains) else: print('Estimating domains by embeddings') # domains_x = estimate_domains_range(full_x, feature_types_x) bin_method = args.bins bin_range_width = 0.001 domains_x = [ estimate_continuous_domain( full_x, i, range=(full_x[:, i].min() - bin_range_width, full_x[:, i].max() + bin_range_width), binning_method=bin_method) for i in range(n_x) ] domains_y = [estimate_categorical_domain(full_y, i) for i in range(n_y)] domains = domains_x + domains_y print('domains', domains) out_feature_info_path = os.path.join(out_path, 'aug.raelk.features') save_feature_info_dict(feature_names, feature_types, domains, out_feature_info_path, range=False) print('Saved feature info file to ', out_feature_info_path)