Example #1
def test4(data, features):
    n_topics = 3
    
    featureNames = features + ["topic%d" % i for i in range(n_topics)]
    
    
    
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    
    numpy.random.seed(17)
    trainIdx = numpy.random.choice([True, False], size=(data.shape[0],), p=[2. / 3, 1. / 3])
    testIdx = numpy.logical_not(trainIdx)
    
    train = data[trainIdx, :]
    test = data[testIdx, :]
    
    lda.fit(train)
    
    topicstrain = lda.transform(train)
    topicstest = lda.transform(test)
    
    
    # hard (one-hot) topic assignment: keep only the most probable topic per document
    maxtrain = numpy.argmax(topicstrain, axis=1)
    topicstrain = numpy.zeros_like(topicstrain)
    for i, c in enumerate(maxtrain):
        topicstrain[i, c] = 1

    maxtest = numpy.argmax(topicstest, axis=1)
    topicstest = numpy.zeros_like(topicstest)
    for i, c in enumerate(maxtest):
        topicstest[i, c] = 1
    
    
    
    # MPE query: observed word columns are filled in, the appended topic columns stay NaN for the SPN to infer
    testMPE = numpy.full((test.shape[0], len(featureNames)), numpy.nan)
    testMPE[:, numpy.arange(test.shape[1])] = test
    
    print("LEARNING SPN")
    
    traintopics = numpy.hstack((train, topicstrain))
    
    # print(testMPE[:,[99, 100,101,102]])

    # print(traintopics[:,[99, 100,101,102]])

    print(featureNames)
    
    featureTypes = ["discrete"] * train.shape[1] + ["continuous"] * topicstrain.shape[1]
    
    domains = []
    for i, ft in enumerate(featureTypes):
        if ft == "continuous":
            r = (0.0, 1.0)
            fd = estimate_continuous_domain(traintopics, i, range=r, binning_method=20)
            domain = numpy.array(sorted(fd.keys()))
        else:
            domain = numpy.unique(data[:, i])

        # print(i, ft, domain)
        domains.append(domain)

        
    memory = Memory(cachedir="/tmp/test4", verbose=0, compress=9)


    @memory.cache
    def learn():
        spn = SPN.LearnStructure(traintopics,
                                 featureTypes=featureTypes,
                                 row_split_method=Splitting.Gower(),
                                 col_split_method=Splitting.RDCTest(threshold=0.1, linear=True),
                                 featureNames=featureNames,
                                 domains=domains,
                                 min_instances_slice=100)
        return spn
    
    spn = learn()

    spn.root.validate()
    
    prodNodes = spn.get_nodes_by_type(ProductNode)

    for pn in prodNodes:
        leaves = pn.get_leaves()
        words = set()
        for leaf in leaves:
            # assuming a piecewise-linear (PWL) leaf: locate the x at which its density peaks
            _x = numpy.argmax(leaf.y_range)
            max_x = leaf.x_range[_x]
            if max_x < 1.0:
                continue
            
            words.add(featureNames[leaf.featureIdx])
        # ll = pn.eval()
        if len(words) < 4:
            continue
        
        print(words)
        

    
    logs, topicsmpe = spn.root.mpe_eval(testMPE)
    
    print(spn.get_leaves())
    
    print(topicsmpe.shape)
    
    # columns of the appended topic indicators (the word features come first)
    topic_cols = list(range(train.shape[1], train.shape[1] + n_topics))
    print(topicsmpe[:, topic_cols])


    maxmpe = numpy.argmax(topicsmpe[:, topic_cols], axis=1)
    topicsmpe = numpy.zeros_like(topicsmpe[:, topic_cols])
    for i, c in enumerate(maxmpe):
        topicsmpe[i, c] = 1
        
        
    print(topicstest)
    print(topicsmpe)

    print(topicstest - topicsmpe)
    correct = numpy.sum(numpy.abs(topicstest - topicsmpe), axis=1) == 0

    print("correct", numpy.sum(correct))
    print("incorrect", topicsmpe.shape[0] - numpy.sum(correct))
    
    print_top_words(lda, features, 10)
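
The reusable pieces of test4 are the soft-to-hard topic assignment and the NaN-masked MPE query; below is a minimal numpy-only sketch of just those two steps, with toy shapes and no LDA or SPN involved.

import numpy

# toy soft topic distributions for 4 documents over 3 topics
soft = numpy.array([[0.7, 0.2, 0.1],
                    [0.1, 0.8, 0.1],
                    [0.3, 0.3, 0.4],
                    [0.5, 0.4, 0.1]])

# hard (one-hot) assignment: set a 1 in the argmax column of each row
hard = numpy.zeros_like(soft)
hard[numpy.arange(soft.shape[0]), numpy.argmax(soft, axis=1)] = 1
print(hard)

# MPE-style query: observed word columns are filled in, the appended topic
# columns stay NaN and are left for the model to infer
n_docs, n_words, n_topics = 4, 5, 3
words = numpy.random.randint(0, 2, size=(n_docs, n_words))
query = numpy.full((n_docs, n_words + n_topics), numpy.nan)
query[:, :n_words] = words
print(query)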
Example #2
def test6(data):
    print(data.shape)
    _, mixt = getArchetypes(data, 3)
    
    def normalize(data):
        # nudge exact 0s and 1s off the simplex boundary, then renormalize each row to sum to 1
        mixtd = data
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        mixtnorm = numpy.sum(mixtd, axis=1)
        mixtd = numpy.divide(mixtd, mixtnorm[:, None])
        return mixtd
    
    mixt = normalize(mixt)
    
    dirichlet_alphas = dirichlet.mle(mixt, method='meanprecision', maxiter=100000)
    
    featureTypes = ["continuous"] * mixt.shape[1]
    
    domains = []
    for i, ft in enumerate(featureTypes):
        if ft == "continuous":
            r = (0.0, 1.0)
            fd = estimate_continuous_domain(mixt, i, range=r, binning_method=400)
            domain = numpy.array(sorted(fd.keys()))
        else:
            domain = numpy.unique(data[:, i])

        domains.append(domain)
    print(domains)

    @memory.cache
    def learn(data):
        spn = SPN.LearnStructure(data,
                                 featureTypes=featureTypes,
                                 row_split_method=Splitting.Gower(),
                                 col_split_method=Splitting.RDCTest(threshold=0.3),
                                 domains=domains,
                                 alpha=1,
                                 min_instances_slice=50)
        return spn
    
    spn = learn(mixt)
    print(spn)
    
    # NaN placeholders: every entry is to be filled in by sampling from the SPN
    spn_samples = numpy.full((data.shape[0], 3), numpy.nan)
    _, spn_samples = spn.root.sample(spn_samples)
    
    spn_samples = normalize(spn_samples)
    
    
    
    #dtrain_fit = scipy.stats.dirichlet.logpdf(numpy.transpose(mixt_train), alpha=dirichlet_alphas)
    def plotDirichlet(data):
        data = data.reshape(-1, mixt.shape[1])
        result = scipy.stats.dirichlet.logpdf(numpy.transpose(normalize(data)), alpha=dirichlet_alphas)
        return result
    
    def spnpdf(data):
        data = data.reshape(-1, mixt.shape[1])
        res = spn.root.eval(normalize(data))[0]
        return res
    
    xy_all = cartesian(mixt)
    
    
    filename = 'plots/dirichlet_mle.pdf'
    try:
        import os
        os.remove(filename)
    except OSError:
        pass
    pp = PdfPages(filename)
    
    # all
    fig = plt.figure()
    draw_pdf_contours_func(plotDirichlet)
    plt.title("dirichlet trained on all, original points")
    plt.plot(xy_all[:, 0], xy_all[:, 1], 'ro', markersize=markersize)
    plt.colorbar()
    pp.savefig(fig)
    
    numpy.random.seed(17)
    mixt_samples = numpy.random.dirichlet(dirichlet_alphas, data.shape[0])
    print(dirichlet_alphas)
    xy_samples = cartesian(mixt_samples)
    
    
    fig = plt.figure()
    draw_pdf_contours_func(plotDirichlet)
    plt.title("dirichlet trained on all, sampled points")
    plt.plot(xy_samples[:, 0], xy_samples[:, 1], 'ro', markersize=markersize)
    plt.colorbar()
    pp.savefig(fig)
    
    fig = plt.figure()
    draw_pdf_contours_func(spnpdf)
    plt.title("spn trained on all, original points")
    plt.plot(xy_all[:, 0], xy_all[:, 1], 'ro', markersize=markersize)
    plt.colorbar()
    pp.savefig(fig)
    
    
    xy_spn_samples = cartesian(spn_samples)
    fig = plt.figure()
    draw_pdf_contours_func(spnpdf)
    plt.title("spn trained on all, sampled points")
    plt.plot(xy_spn_samples[:, 0], xy_spn_samples[:, 1], 'ro', markersize=markersize)
    plt.colorbar()
    pp.savefig(fig)
    
    
    
    pp.close()
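
The Dirichlet baseline in test6 can be exercised without the SPN. The small sketch below assumes the same `dirichlet` package (for `dirichlet.mle`) that the example uses; it fits alphas to simplex data and scores held-out points with scipy.

import numpy
import scipy.stats
import dirichlet  # the package providing dirichlet.mle, as used above

numpy.random.seed(17)
true_alphas = numpy.array([2.0, 5.0, 3.0])
points = numpy.random.dirichlet(true_alphas, 500)

# keep points strictly inside the simplex, mirroring normalize() above
eps = 1e-7
points = numpy.clip(points, eps, 1.0 - eps)
points = points / points.sum(axis=1, keepdims=True)

alphas = dirichlet.mle(points, method='meanprecision', maxiter=100000)
print(alphas)

# score held-out samples under the fitted Dirichlet (components along the first axis)
held_out = numpy.random.dirichlet(true_alphas, 100)
print(numpy.mean(scipy.stats.dirichlet.logpdf(held_out.T, alpha=alphas)))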
Example #3
def computeSimplexExperiment(dsname, data, dimensions, mixttype, min_instances_slice=700):
    if mixttype == "Archetype":
        _, mixt = getArchetypes(data, dimensions)
        if mixt is None:
            return ()
    elif mixttype == "LDA":
        lda = LatentDirichletAllocation(n_topics=dimensions, max_iter=50,
                                        learning_method='online',
                                        learning_offset=50.,
                                        random_state=0)
    
        lda.fit(data)
        mixt = lda.transform(data)
    elif mixttype == "RandomSample":
        mixt = numpy.random.dirichlet((1,1,1), 20).transpose()
        print(mixt)
        0/0
        
    print(mixt.shape)
    
    def normalize(data):
        # nudge exact 0s and 1s off the simplex boundary, then renormalize each row to sum to 1
        mixtd = data
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        mixtnorm = numpy.sum(mixtd, axis=1)
        mixtd = numpy.divide(mixtd, mixtnorm[:, None])
        return mixtd + 0.0
    
    mixt = normalize(mixt)
    mixt_train, mixt_test = train_test_split(mixt, test_size=0.30, random_state=42)


    numpy.savetxt("mixt_train.csv", mixt_train)
    numpy.savetxt("mixt_test.csv", mixt_test)
    #0/0

    featureTypes = ["continuous"] * mixt.shape[1]
    
    domains = []
    for i, ft in enumerate(featureTypes):
        if ft == "continuous":
            r = (0.0, 1.0)
            fd = estimate_continuous_domain(mixt, i, range=r, binning_method=400)
            domain = numpy.array(sorted(fd.keys()))
        else:
            domain = numpy.unique(data[:, i])

        domains.append(domain)
    
    dirichlet_alphas = dirichlet.mle(mixt_train, method='meanprecision', maxiter=100000)

    #@memory.cache
    def learn(data):
        spn = SPN.LearnStructure(data,
                                 featureTypes=featureTypes,
                                 row_split_method=Splitting.Gower(),
                                 col_split_method=Splitting.RDCTest(threshold=0.3),
                                 domains=domains,
                                 alpha=0.1,
                                 families=['histogram'] * data.shape[1],
                                 min_instances_slice=min_instances_slice)
        return spn
    # for the good pdf, min_instances_slice was 700
    
    
    spn = learn(mixt_train)
    print(spn)
    def spnpdf(data):
        data = data.reshape(-1, mixt.shape[1])
        res = spn.root.eval(normalize(data))[0]
        return res
    
    print(dirichlet_alphas)
    
    def plotDirichlet(data):
        data = data.reshape(-1, mixt.shape[1])
        try:
            result = scipy.stats.dirichlet.logpdf(numpy.transpose(normalize(data)), alpha=dirichlet_alphas)
        except ValueError:
            # logpdf rejects points that are not strictly inside the simplex; show the offenders
            print(normalize(data))
            raise
        return result
    
    df_train = pandas.DataFrame()
    df_test = pandas.DataFrame()
    
    dtrain_fit = scipy.stats.dirichlet.logpdf(numpy.transpose(mixt_train), alpha=dirichlet_alphas)
    dtest_fit = scipy.stats.dirichlet.logpdf(numpy.transpose(mixt_test), alpha=dirichlet_alphas)
    df_train["dirichlet_train"] = dtrain_fit
    df_test["dirichlet_test"] = dtest_fit
    
    spn_train_fit = spn.root.eval(mixt_train)
    spn_test_fit = spn.root.eval(mixt_test)
    df_train["spn_train"] = spn_train_fit
    df_test["spn_test"] = spn_test_fit
    

    
    if dimensions == 3:
        xy_train = cartesian(mixt_train)
        xy_test = cartesian(mixt_test)
        
        filename = 'plots/%s_%s.pdf' % (dsname, mixttype)
        try:
            import os
            os.remove(filename)
        except OSError:
            pass
        pp = PdfPages(filename)
        
        markersize = 1.0
        # all
#         fig = plt.figure()
#         plt.title("dirichlet, original points")
#         draw_pdf_contours_func2(plotDirichlet, vmin=-2, vmax=12)
#         #draw_pdf_contours_func2(xy_all[:, 0], xy_all[:, 1],plotDirichlet)
#         plt.plot(xy_all[:, 0], xy_all[:, 1], 'ro', markersize=markersize)
#         plt.colorbar()
#         pp.savefig(fig)
        # train
        fig = plt.figure()
        plt.title("Dirichlet, train points")
        draw_pdf_contours_func2(plotDirichlet, vmin=-2, vmax=12)
        #draw_pdf_contours_func2(xy_all[:, 0], xy_all[:, 1],plotDirichlet)
        plt.plot(xy_train[:, 0], xy_train[:, 1], 'ro', markersize=markersize)
        #plt.colorbar()
        fig.tight_layout()
        pp.savefig(fig)
        
        # test
        fig = plt.figure()
        plt.title("Dirichlet, test points")
        draw_pdf_contours_func2(plotDirichlet, vmin=-2, vmax=12)
        #draw_pdf_contours_func2(xy_all[:, 0], xy_all[:, 1],plotDirichlet)
        plt.plot(xy_test[:, 0], xy_test[:, 1], 'ro', markersize=markersize)
        #plt.colorbar()
        fig.tight_layout()
        pp.savefig(fig)
    
        # all
#         fig = plt.figure()
#         plt.title("spn, original points")
#         draw_pdf_contours_func2(spnpdf, vmin=-2, vmax=12)
#         #draw_pdf_contours_func2(xy_all[:, 0], xy_all[:, 1],spnpdf)
# 
#         plt.plot(xy_all[:, 0], xy_all[:, 1], 'ro', markersize=markersize)
#         plt.colorbar()
#         pp.savefig(fig)
        
        # train
        fig = plt.figure()
        plt.title("SPN, train points")
        draw_pdf_contours_func2(spnpdf, vmin=-2, vmax=12)
        #draw_pdf_contours_func2(xy_all[:, 0], xy_all[:, 1],spnpdf)
        plt.plot(xy_train[:, 0], xy_train[:, 1], 'ro', markersize=markersize)
        #plt.colorbar()
        fig.tight_layout()
        pp.savefig(fig)
        
        # test
        fig = plt.figure()
        plt.title("SPN, test points")
        draw_pdf_contours_func2(spnpdf, vmin=-2, vmax=12)
        #draw_pdf_contours_func2(xy_all[:, 0], xy_all[:, 1],spnpdf)
        plt.plot(xy_test[:, 0], xy_test[:, 1], 'ro', markersize=markersize)
        #plt.colorbar()
        fig.tight_layout()
        pp.savefig(fig)
        pp.close()
    
    return ("name", dsname, "size", data.shape, "type", mixttype, "dims", dimensions,
            "spn_train_LL", numpy.mean(spn_train_fit), "dir_train_LL", numpy.mean(dtrain_fit),
            "spn_test_LL", numpy.mean(spn_test_fit), "dir_test_LL", numpy.mean(dtest_fit) ,
            "spn_#_sum_nodes", spn.n_sum_nodes(), "spn_#_prod_nodes", spn.n_prod_nodes(), "spn_#_layers", spn.n_layers()
            )
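
The train/test bookkeeping at the end of computeSimplexExperiment does not depend on the SPN library. The sketch below repeats the same comparison with scipy's Gaussian KDE standing in for the SPN density (an assumption made only to keep the sketch self-contained).

import numpy
import scipy.stats
import dirichlet  # same package as above, for dirichlet.mle
from sklearn.model_selection import train_test_split

numpy.random.seed(42)
points = numpy.random.dirichlet((2.0, 5.0, 3.0), 1000)
train, test = train_test_split(points, test_size=0.30, random_state=42)

# Dirichlet baseline, as in the example
alphas = dirichlet.mle(train, method='meanprecision', maxiter=100000)
dir_train_ll = scipy.stats.dirichlet.logpdf(train.T, alpha=alphas)
dir_test_ll = scipy.stats.dirichlet.logpdf(test.T, alpha=alphas)

# stand-in density for the SPN: a KDE over the two free simplex coordinates
kde = scipy.stats.gaussian_kde(train[:, :2].T)
kde_train_ll = numpy.log(kde(train[:, :2].T))
kde_test_ll = numpy.log(kde(test[:, :2].T))

print("dir_train_LL", numpy.mean(dir_train_ll), "dir_test_LL", numpy.mean(dir_test_ll))
print("kde_train_LL", numpy.mean(kde_train_ll), "kde_test_LL", numpy.mean(kde_test_ll))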
Example #4
            numpy.savetxt(proc_path,
                          proc_data,
                          fmt=feature_formats,
                          delimiter=',')

        pfdomains = []
        for i, (fn, ft, fd) in enumerate(zip(fnames, ftypes, cfdomains)):

            if ft == 'continuous':
                r = (fd['min'], fd['max'])
                print('range', r)
                #
                # using astropy
                fd = estimate_continuous_domain(train,
                                                i,
                                                range=r,
                                                binning_method=args.bins)
                # fd = estimate_continuous_domain_min_max(train, i, min=fd['min'], max=fd['max'])
                print(fd)
            pfdomains.append(fd)

        #
        # writing back the domain for the missing values
        feature_info_name = '{}.{}'.format(d, FEATURE_INFO_EXT)
        proc_feature_info_path = os.path.join(proc_dir, feature_info_name)
        save_feature_info_dict(fnames,
                               ftypes,
                               pfdomains,
                               proc_feature_info_path,
                               domain_keys=False,
                               range=False)
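
`estimate_continuous_domain` and its astropy-backed binning belong to the preprocessing library; as a rough illustration of the underlying idea (histogram a column over a fixed range and keep the bin edges as the feature's domain), here is a numpy-only stand-in. The helper name is hypothetical.

import numpy

def binned_domain(data, col, feature_range, bins):
    # hypothetical stand-in: the bin edges over a fixed range act as the feature's domain
    _, edges = numpy.histogram(data[:, col], bins=bins, range=feature_range)
    return edges

train = numpy.random.rand(200, 4)
print(binned_domain(train, 0, (0.0, 1.0), bins=20))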
Example #5
    feature_info_name = 'autism.{}'.format(FEATURE_INFO_EXT)
    feature_info_path = os.path.join(args.datadir, feature_info_name)
    fnames, ftypes, fdomains = load_feature_info_preprocess(feature_info_path)
    logging.info('Loaded feature info file {}'.format(feature_info_path))

    cfdomains = []
    for i, (fn, ft, fd) in enumerate(zip(fnames, ftypes, fdomains)):

        if ft == 'categorical':
            if fd is None:
                fd = estimate_categorical_domain(data, i)

        elif ft == 'continuous':
            if fd is None:
                fd = estimate_continuous_domain(data, i,
                                                range=(data[:, i].min() - 0.01,
                                                       data[:, i].max() + 0.01),
                                                binning_method=bin_method)

        elif ft == 'discrete':
            if fd is None:
                fd = estimate_categorical_domain(data, i)

        #     try:
        #         _ = [float(k) for k, v in fd.items()]
        #     except:
        #         logging.info('Cannot convert discrete domain to float {}'.format(fd))
        #         fd = {str(v): v for _k, v in fd.items()}
        cfdomains.append(fd)

    #
    # feature info
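
The fallback logic above (estimate a domain only when the feature-info file left it empty) can be sketched without the library's own estimators. `fill_missing_domains` below is a hypothetical helper using numpy binning for continuous features and value-to-code maps for categorical/discrete ones.

import numpy

def fill_missing_domains(data, ftypes, fdomains, bins=10, eps=0.01):
    # hypothetical helper: only features whose domain is None get an estimate
    out = []
    for i, (ft, fd) in enumerate(zip(ftypes, fdomains)):
        if fd is None:
            col = data[:, i]
            if ft == 'continuous':
                # widen the range slightly, as in the example, then bin it
                _, fd = numpy.histogram(col, bins=bins, range=(col.min() - eps, col.max() + eps))
            else:  # 'categorical' or 'discrete'
                fd = {v: j for j, v in enumerate(numpy.unique(col))}
        out.append(fd)
    return out

X = numpy.column_stack([numpy.random.rand(50), numpy.random.randint(0, 3, 50)])
print(fill_missing_domains(X, ['continuous', 'discrete'], [None, None]))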
Example #6
        print('Reading domains by feature info file stub {}'.format(
            args.stub_feature_info_file))
        _fnames, _ftypes, _domains = load_feature_info_preprocess(
            args.stub_feature_info_file)
        domains_x = _domains[:n_x]
        print(domains_x)
        print(_domains)
    else:
        print('Estimating domains by embeddings')
        # domains_x = estimate_domains_range(full_x, feature_types_x)
        bin_method = args.bins
        bin_range_width = 0.001
        domains_x = [
            estimate_continuous_domain(
                full_x,
                i,
                range=(full_x[:, i].min() - bin_range_width,
                       full_x[:, i].max() + bin_range_width),
                binning_method=bin_method) for i in range(n_x)
        ]
    domains_y = [estimate_categorical_domain(full_y, i) for i in range(n_y)]
    domains = domains_x + domains_y
    print('domains', domains)

    out_feature_info_path = os.path.join(out_path, 'aug.raelk.features')
    save_feature_info_dict(feature_names,
                           feature_types,
                           domains,
                           out_feature_info_path,
                           range=False)
    print('Saved feature info file to ', out_feature_info_path)
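
The on-disk format written by `save_feature_info_dict` is specific to this codebase; purely as an illustration of the kind of record being persisted (name, type, and domain per feature), a hypothetical JSON-based stand-in might look like this. The feature names in the usage line are made up.

import json
import numpy

def save_feature_info_json(names, types, domains, path):
    # hypothetical stand-in: one JSON record per feature
    records = []
    for name, ftype, domain in zip(names, types, domains):
        if isinstance(domain, dict):
            domain = {str(k): int(v) for k, v in domain.items()}
        elif isinstance(domain, numpy.ndarray):
            domain = domain.tolist()
        records.append({'name': name, 'type': ftype, 'domain': domain})
    with open(path, 'w') as fh:
        json.dump(records, fh, indent=2)

# usage with made-up feature names
save_feature_info_json(['emb_0', 'label'],
                       ['continuous', 'categorical'],
                       [numpy.linspace(0.0, 1.0, 5), {'a': 0, 'b': 1}],
                       '/tmp/aug.raelk.features.json')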