Example #1
class LatentDirichletAllocationImpl:

    def __init__(self, n_components=10, doc_topic_prior=None, topic_word_prior=None, learning_method='batch', learning_decay=0.7, learning_offset=10.0, max_iter=10, batch_size=128, evaluate_every=-1, total_samples=1000000.0, perp_tol=0.1, mean_change_tol=0.001, max_doc_update_iter=100, n_jobs=None, verbose=0, random_state=None, n_topics=None):
        self._hyperparams = {
            'n_components': n_components,
            'doc_topic_prior': doc_topic_prior,
            'topic_word_prior': topic_word_prior,
            'learning_method': learning_method,
            'learning_decay': learning_decay,
            'learning_offset': learning_offset,
            'max_iter': max_iter,
            'batch_size': batch_size,
            'evaluate_every': evaluate_every,
            'total_samples': total_samples,
            'perp_tol': perp_tol,
            'mean_change_tol': mean_change_tol,
            'max_doc_update_iter': max_doc_update_iter,
            'n_jobs': n_jobs,
            'verbose': verbose,
            'random_state': random_state,
            'n_topics': n_topics}

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if y is not None:
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def transform(self, X):
        return self._sklearn_model.transform(X)
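
A minimal usage sketch of the wrapper above. It assumes SKLModel is scikit-learn's LatentDirichletAllocation imported under that alias (the import is not shown in the example), and a scikit-learn release old enough (pre-0.21) to still accept the deprecated n_topics keyword that the wrapper forwards:

# Hypothetical usage; the SKLModel binding is an assumption, not part of the example.
from sklearn.decomposition import LatentDirichletAllocation as SKLModel
import numpy

X = numpy.random.randint(0, 5, size=(20, 8))       # toy bag-of-words counts
model = LatentDirichletAllocationImpl(n_components=3, max_iter=5)
doc_topics = model.fit(X).transform(X)             # document-topic matrix, shape (20, 3)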
Example #2
def computeNIPS(dimensions):
    dsname, data, features = getNips()
    
    lda = LatentDirichletAllocation(n_topics=dimensions,  # n_topics was renamed n_components in scikit-learn >= 0.19
                                    max_iter=50,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    
    lda.fit(data)
    mixt = lda.transform(data)

    def normalize(data):
        # dirichlet.logpdf is undefined at exactly 0 or 1, so nudge boundary
        # entries into the open simplex by a small epsilon...
        mixtd = data
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 1e-7
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 1e-7
        # ...then renormalize each row so it sums to 1 again.
        mixtnorm = numpy.sum(mixtd, axis=1)
        mixtd = numpy.divide(mixtd, mixtnorm[:, None])
        return mixtd + 0.0  # the addition forces a fresh array
    
    mixt = normalize(mixt)
    print(data.shape)
    featureTypes = ["continuous"] * mixt.shape[1]
    
    domains = [[0,1]] * mixt.shape[1]
    
    print(domains)
        
    @memory.cache
    def learn(data):
        spn = SPN.LearnStructure(data, featureTypes=featureTypes, row_split_method=Splitting.Gower(), col_split_method=Splitting.RDCTest(threshold=0.3),
                                 domains=domains,
                                 alpha=0.1,
                                 families = ['histogram'] * data.shape[1],
                                 # spn = SPN.LearnStructure(data, featureNames=["X1"], domains =
                                 # domains, families=families, row_split_method=Splitting.KmeansRows(),
                                 # col_split_method=Splitting.RDCTest(),
                                 min_instances_slice=100)
        return spn
    
    
    stats = Stats(name=dsname)
    
    for train, test, i in kfolded(mixt, 10):
        print(i)
        #dirichlet_alphas = getDirichlet(train)
        dirichlet_alphas = dirichlet.mle(train, method='meanprecision', maxiter=1000000)
        print("dirichlet done")
        ll = scipy.stats.dirichlet.logpdf(numpy.transpose(test), alpha=dirichlet_alphas)
        stats.add("DIRICHLET", Stats.LOG_LIKELIHOOD, numpy.sum(ll))
        stats.add("DIRICHLET", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll))
        
        spn = learn(train)
        print("spn done")
        ll = spn.root.eval(test)
        print(stats.add("SPN", Stats.LOG_LIKELIHOOD, numpy.sum(ll)))
        print(stats.add("SPN", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll)))

     
    stats.save("results/nips/" + dsname + "-" + str(dimensions) + ".json")
Example #3
def getLDA(data, dimensions):
    lda = LatentDirichletAllocation(n_topics=dimensions,
                                    max_iter=50,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)

    lda.fit(data)
    return lda.transform(data)
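
For reference, the same fit/transform pattern against the current scikit-learn API, where n_topics has been renamed n_components; the data here is a toy stand-in:

import numpy
from sklearn.decomposition import LatentDirichletAllocation

counts = numpy.random.randint(0, 4, size=(30, 12))   # toy document-term matrix
lda = LatentDirichletAllocation(n_components=3,      # formerly n_topics
                                max_iter=50,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
doc_topic = lda.fit(counts).transform(counts)        # each row sums to 1, shape (30, 3)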
Example #4
    def fit_model(self, data, params, return_data=False):
        data = data.tocsr()

        lda_instance = LatentDirichletAllocation(**params)
        lda_instance.fit(data)

        if return_data:
            return lda_instance, data
        else:
            return lda_instance
Example #5
    def fit_model(self, data, params, return_data=False):
        if issparse(data):
            if data.format != 'csr':
                data = data.tocsr()
        else:
            data = csr_matrix(data)

        lda_instance = LatentDirichletAllocation(**params)
        lda_instance.fit(data)

        if return_data:
            return lda_instance, data
        else:
            return lda_instance
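
Examples #5 and #7 assume the sparse-matrix helpers are already in scope; presumably something like:

from scipy.sparse import issparse, csr_matrix  # assumed imports for fit_model above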
Example #6
 def fit(self, X, y=None):
     self._sklearn_model = SKLModel(**self._hyperparams)
     if y is not None:
         self._sklearn_model.fit(X, y)
     else:
         self._sklearn_model.fit(X)
     return self
Example #7
    def fit_model(self, data, params, return_data=False):
        from sklearn.decomposition.online_lda import LatentDirichletAllocation  # private path, removed in newer scikit-learn; use: from sklearn.decomposition import LatentDirichletAllocation

        if issparse(data):
            if data.format != 'csr':
                data = data.tocsr()
        else:
            data = csr_matrix(data)

        lda_instance = LatentDirichletAllocation(**params)
        lda_instance.fit(data)

        if return_data:
            return lda_instance, data
        else:
            return lda_instance
Example #8
 def __init__(self, n_components=10, doc_topic_prior=None, topic_word_prior=None, learning_method='batch', learning_decay=0.7, learning_offset=10.0, max_iter=10, batch_size=128, evaluate_every=-1, total_samples=1000000.0, perp_tol=0.1, mean_change_tol=0.001, max_doc_update_iter=100, n_jobs=None, verbose=0, random_state=None, n_topics=None):
     self._hyperparams = {
         'n_components': n_components,
         'doc_topic_prior': doc_topic_prior,
         'topic_word_prior': topic_word_prior,
         'learning_method': learning_method,
         'learning_decay': learning_decay,
         'learning_offset': learning_offset,
         'max_iter': max_iter,
         'batch_size': batch_size,
         'evaluate_every': evaluate_every,
         'total_samples': total_samples,
         'perp_tol': perp_tol,
         'mean_change_tol': mean_change_tol,
         'max_doc_update_iter': max_doc_update_iter,
         'n_jobs': n_jobs,
         'verbose': verbose,
         'random_state': random_state,
         'n_topics': n_topics}
     self._wrapped_model = Op(**self._hyperparams)
Example #9
def test():
    n_topics = 10
    
    with open("acceptedoralpaperstext.txt") as f:
        content = []
        ids = []
        
        for line in f.readlines():
            cols = line.split("\t")
            content.append(cols[1].strip())
            ids.append(cols[0].strip())
        
    
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                    max_features=100,
                                    stop_words='english')
    
    
    #print(content)
    bow = tf_vectorizer.fit_transform(content)
    
    
    
    feature_names = tf_vectorizer.get_feature_names()  # get_feature_names_out() in scikit-learn >= 1.0
    
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=2000,
                                    #learning_method='online',
                                    #learning_offset=50.,
                                    random_state=0)
    topics = lda.fit_transform(bow)  # LDA ignores a y argument, so pass only the counts
    
    
    print(print_top_words(lda, feature_names, 10))
    print(topics)
    print(bow.shape)
    
    f = numpy.array(feature_names)
    
    data = numpy.array(bow.todense())
    
    featureTypes = ["discrete"] * data.shape[1]
    
    domains = []
    for i, ft in enumerate(featureTypes):
        domain = numpy.unique(data[:, i])

        # print(i, ft, domain)
        domains.append(domain)

        
    memory = Memory(cachedir="/tmp/spntopics", verbose=0, compress=9)  # joblib >= 0.12 renamed cachedir= to location=


    @memory.cache
    def learn(data, min_instances_slice, feature_names, domains, featureTypes):
        spn = SPN.LearnStructure(data, featureTypes=featureTypes, row_split_method=Splitting.KmeansRows(), col_split_method=Splitting.RDCTest(threshold=0.1, linear=True),
                                featureNames=feature_names,
                                domains=domains,
                                 # spn = SPN.LearnStructure(data, featureNames=["X1"], domains =
                                 # domains, families=families, row_split_method=Splitting.KmeansRows(),
                                 # col_split_method=Splitting.RDCTest(),
                                 min_instances_slice=min_instances_slice)
        return spn
    
    
    
    
    print(data.shape)
    print(type(data))
    #0/0
    spn = learn(data, 5, f, domains, featureTypes)

    spn.root.validate()
    
    prodNodes = spn.get_nodes_by_type(ProductNode)

    for pn in prodNodes:
        leaves = pn.get_leaves()
        words = set()
        for leaf in leaves:
            # assuming pwl node:
            _x = numpy.argmax(leaf.y_range)
            max_x = leaf.x_range[_x]
            if max_x < 1.0:
                continue
            
            words.add(feature_names[leaf.featureIdx])
        # ll = pn.eval()
        if len(words) < 4:
            continue
        
        print(pn.rows, words)
Example #10
def test3(data, features):
    
    
    f = numpy.array(features)
    
    
    
    
    lda = LatentDirichletAllocation(n_topics=3, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    
    
    lda.fit(data)
    
    topics = lda.transform(data)
    
    
    print("LEARNING SPN")
    memory = Memory(cachedir="/tmp/test3", verbose=0, compress=9)

    @memory.cache
    def learn():
        spn = SPN.LearnStructure(data, featureTypes=["discrete"] * data.shape[1], row_split_method=Splitting.Gower(), col_split_method=Splitting.RDCTest(threshold=0.3, linear=True),
                                 # spn = SPN.LearnStructure(data, featureNames=["X1"], domains =
                                 # domains, families=families, row_split_method=Splitting.KmeansRows(),
                                 # col_split_method=Splitting.RDCTest(),
                                 min_instances_slice=200)
    

        return spn
    
    spn = learn()
    spn.root.validate()
    
    prodNodes = spn.get_nodes_by_type(ProductNode)

    for pn in prodNodes:
        leaves = pn.get_leaves()
        words = set()
        for leaf in leaves:
            # assuming pwl node:
            _x = numpy.argmax(leaf.y_range)
            max_x = leaf.x_range[_x]
            if max_x < 1.0:
                continue
            
            words.add(features[leaf.featureIdx])
        # ll = pn.eval()
        if len(words) < 4:
            continue
        
        print(words)
        

    
    0 / 0  # intentional ZeroDivisionError: stop here; the code below is exploratory
    prodNodes = spn.get_nodes_by_type(ProductNode)
    
    
    pnll = numpy.zeros((data.shape[0], len(prodNodes)))
    
    for i, pn in enumerate(prodNodes):
        pnll[:, i] = numpy.exp(pn.eval(data))
        
        
    
    for i in range(topics.shape[1]):
        tmax = 0
        pnmax = None
        
        for j in range(len(prodNodes)):
            # rdcval = rdcdcor(topics[:,i], pnll[:,j])
            bic = numpy.log(data.shape[0]) * len(prodNodes[j].scope) - 2.0 * numpy.log(numpy.sum(pnll[:, j]))
            if bic > tmax:
                tmax = bic
                pnmax = prodNodes[j]
    
    
        print("spn topic")
        print(pnmax.scope)
        print(f[list(pnmax.scope)])
    
    print()
    print_top_words(lda, features, 10)
Example #11
def test4(data, features):
    n_topics = 3
    
    featureNames = features + ["%s%s" % x for x in zip(["topic"] * n_topics, range(n_topics))]
    
    f = numpy.array(featureNames)
    
    
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    
    numpy.random.seed(17)
    trainIdx = numpy.random.choice([True, False], size=(data.shape[0],), p=[2. / 3, 1. / 3])
    testIdx = numpy.logical_not(trainIdx)
    
    train = data[trainIdx, :]
    test = data[testIdx, :]
    
    lda.fit(train)
    
    topicstrain = lda.transform(train)
    topicstest = lda.transform(test)
    
    
    maxtrain = numpy.argmax(topicstrain, axis=1)
    topicstrain = numpy.zeros_like(topicstrain)
    for i, c in enumerate(maxtrain):
        topicstrain[i, c] = 1
        
    maxtest = numpy.argmax(topicstest, axis=1)
    topicstest = numpy.zeros_like(topicstest)
    for i, c in enumerate(maxtest):
        topicstest[i, c] = 1
    
    
    
    # Start from an all-NaN matrix: NaN marks the entries the SPN must infer via MPE.
    testMPE = numpy.full((test.shape[0], len(featureNames)), numpy.nan)
    testMPE[:, numpy.arange(test.shape[1])] = test  # observed features; the topic columns stay NaN
    
    print("LEARNING SPN")
    
    traintopics = numpy.hstack((train, topicstrain))
    
    # print(testMPE[:,[99, 100,101,102]])

    # print(traintopics[:,[99, 100,101,102]])

    print(featureNames)
    
    featureTypes = ["discrete"] * train.shape[1] + ["continuous"] * topicstrain.shape[1]
    
    domains = []
    for i, ft in enumerate(featureTypes):
        if ft == "continuous":
            r = (0.0, 1.0)
            fd = estimate_continuous_domain(traintopics, i, range=r, binning_method=20)
            domain = numpy.array(sorted(fd.keys()))
        else:
            domain = numpy.unique(data[:, i])

        # print(i, ft, domain)
        domains.append(domain)

        
    memory = Memory(cachedir="/tmp/test4", verbose=0, compress=9)


    @memory.cache
    def learn():
        spn = SPN.LearnStructure(traintopics, featureTypes=featureTypes, row_split_method=Splitting.Gower(), col_split_method=Splitting.RDCTest(threshold=0.1, linear=True),
                                featureNames=featureNames,
                                domains=domains,
                                 # spn = SPN.LearnStructure(data, featureNames=["X1"], domains =
                                 # domains, families=families, row_split_method=Splitting.KmeansRows(),
                                 # col_split_method=Splitting.RDCTest(),
                                 min_instances_slice=100)
        return spn
    
    spn = learn()

    spn.root.validate()
    
    prodNodes = spn.get_nodes_by_type(ProductNode)

    for pn in prodNodes:
        leaves = pn.get_leaves()
        words = set()
        for leaf in leaves:
            # assuming pwl node:
            _x = numpy.argmax(leaf.y_range)
            max_x = leaf.x_range[_x]
            if max_x < 1.0:
                continue
            
            words.add(featureNames[leaf.featureIdx])
        # ll = pn.eval()
        if len(words) < 4:
            continue
        
        print(words)
        

    
    logs, topicsmpe = spn.root.mpe_eval(testMPE)
    
    print(spn.get_leaves())
    
    print(topicsmpe.shape)
    
    print(topicsmpe[:, [100, 101, 102]])
    
    
    maxmpe = numpy.argmax(topicsmpe[:, [100, 101, 102]], axis=1)
    topicsmpe = numpy.zeros_like(topicsmpe[:, [100, 101, 102]])
    for i, c in enumerate(maxmpe):
        topicsmpe[i, c] = 1
        
        
    print(topicstest)
    print(topicsmpe)

    print(topicstest - topicsmpe)
    correct = numpy.sum(numpy.abs(topicstest - topicsmpe), axis=1) == 0

    print("correct", numpy.sum(correct))
    print("incorrect", topicsmpe.shape[0] - numpy.sum(correct))
    
    print_top_words(lda, features, 10)
Example #12
def computeSimplexExperiment(dsname, data, dimensions, mixttype, min_instances_slice=700):
    if mixttype == "Archetype":
        _, mixt = getArchetypes(data, dimensions)
        if mixt is None:
            return ()
    elif mixttype == "LDA":
        lda = LatentDirichletAllocation(n_topics=dimensions, max_iter=50,
                                        learning_method='online',
                                        learning_offset=50.,
                                        random_state=0)
    
        lda.fit(data)
        mixt = lda.transform(data)
    elif mixttype == "RandomSample":
        mixt = numpy.random.dirichlet((1, 1, 1), 20).transpose()
        print(mixt)
        0/0  # intentional ZeroDivisionError: this branch only inspects the sample
        
    print(mixt.shape)
    
    def normalize(data):
        mixtd = data
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        mixtnorm = numpy.sum(mixtd, axis=1)
        mixtd = numpy.divide(mixtd, mixtnorm[:, None])
        return mixtd+0.0
    
    mixt = normalize(mixt)
    mixt_train, mixt_test = train_test_split(mixt, test_size=0.30, random_state=42)


    numpy.savetxt("mixt_train.csv", mixt_train)
    numpy.savetxt("mixt_test.csv", mixt_test)
    #0/0

    featureTypes = ["continuous"] * mixt.shape[1]
    
    domains = []
    for i, ft in enumerate(featureTypes):
        if ft == "continuous":
            r = (0.0, 1.0)
            fd = estimate_continuous_domain(mixt, i, range=r, binning_method=400)
            domain = numpy.array(sorted(fd.keys()))
        else:
            domain = numpy.unique(data[:, i])

        domains.append(domain)
    
    dirichlet_alphas = dirichlet.mle(mixt_train, method='meanprecision', maxiter=100000)

    #@memory.cache
    def learn(data):
        spn = SPN.LearnStructure(data, featureTypes=featureTypes, row_split_method=Splitting.Gower(), col_split_method=Splitting.RDCTest(threshold=0.3),
                                 domains=domains,
                                 alpha=0.1,
                                 families = ['histogram'] * data.shape[1],
                                 # spn = SPN.LearnStructure(data, featureNames=["X1"], domains =
                                 # domains, families=families, row_split_method=Splitting.KmeansRows(),
                                 # col_split_method=Splitting.RDCTest(),
                                 min_instances_slice=min_instances_slice)
        return spn
    # for the good PDF plot, min_instances_slice was 700
    
    
    spn = learn(mixt_train)
    print(spn)
    def spnpdf(data):
        data = data.reshape(-1, mixt.shape[1])
        res = spn.root.eval(normalize(data))[0]
        return res
    
    print(dirichlet_alphas)
    
    def plotDirichlet(data):
        data = data.reshape(-1, mixt.shape[1])
        try:
            result = scipy.stats.dirichlet.logpdf(numpy.transpose(normalize(data)), alpha=dirichlet_alphas)
        except:  # debug trap: dump the offending input, then abort
            print(normalize(data))
            print(normalize(data)*1.0)
            print(normalize(data)+1)
            print(normalize(data)+0)
            0/0  # intentional ZeroDivisionError
        return result
    
    df_train = pandas.DataFrame()
    df_test = pandas.DataFrame()
    
    dtrain_fit = scipy.stats.dirichlet.logpdf(numpy.transpose(mixt_train), alpha=dirichlet_alphas)
    dtest_fit = scipy.stats.dirichlet.logpdf(numpy.transpose(mixt_test), alpha=dirichlet_alphas)
    df_train["dirichlet_train"] = dtrain_fit
    df_test["dirichlet_test"] = dtest_fit
    
    spn_train_fit = spn.root.eval(mixt_train)
    spn_test_fit = spn.root.eval(mixt_test)
    df_train["spn_train"] = spn_train_fit
    df_test["spn_test"] = spn_test_fit
    

    
    if dimensions == 3:
        xy_train = cartesian(mixt_train)
        xy_test = cartesian(mixt_test)
        
        filename = 'plots/%s_%s.pdf' % (dsname, mixttype)
        try:
            import os
            os.remove(filename)
        except OSError:
            pass
        pp = PdfPages(filename)
        
        markersize = 1.0
        # all
#         fig = plt.figure()
#         plt.title("dirichlet, original points")
#         draw_pdf_contours_func2(plotDirichlet, vmin=-2, vmax=12)
#         #draw_pdf_contours_func2(xy_all[:, 0], xy_all[:, 1],plotDirichlet)
#         plt.plot(xy_all[:, 0], xy_all[:, 1], 'ro', markersize=markersize)
#         plt.colorbar()
#         pp.savefig(fig)
        # train
        fig = plt.figure()
        plt.title("Dirichlet, train points")
        draw_pdf_contours_func2(plotDirichlet, vmin=-2, vmax=12)
        #draw_pdf_contours_func2(xy_all[:, 0], xy_all[:, 1],plotDirichlet)
        plt.plot(xy_train[:, 0], xy_train[:, 1], 'ro', markersize=markersize)
        #plt.colorbar()
        fig.tight_layout()
        pp.savefig(fig)
        
        # test
        fig = plt.figure()
        plt.title("Dirichlet, test points")
        draw_pdf_contours_func2(plotDirichlet, vmin=-2, vmax=12)
        #draw_pdf_contours_func2(xy_all[:, 0], xy_all[:, 1],plotDirichlet)
        plt.plot(xy_test[:, 0], xy_test[:, 1], 'ro', markersize=markersize)
        #plt.colorbar()
        fig.tight_layout()
        pp.savefig(fig)
    
        # all
#         fig = plt.figure()
#         plt.title("spn, original points")
#         draw_pdf_contours_func2(spnpdf, vmin=-2, vmax=12)
#         #draw_pdf_contours_func2(xy_all[:, 0], xy_all[:, 1],spnpdf)
# 
#         plt.plot(xy_all[:, 0], xy_all[:, 1], 'ro', markersize=markersize)
#         plt.colorbar()
#         pp.savefig(fig)
        
        # train
        fig = plt.figure()
        plt.title("SPN, train points")
        draw_pdf_contours_func2(spnpdf, vmin=-2, vmax=12)
        #draw_pdf_contours_func2(xy_all[:, 0], xy_all[:, 1],spnpdf)
        plt.plot(xy_train[:, 0], xy_train[:, 1], 'ro', markersize=markersize)
        #plt.colorbar()
        fig.tight_layout()
        pp.savefig(fig)
        
        # test
        fig = plt.figure()
        plt.title("SPN, test points")
        draw_pdf_contours_func2(spnpdf, vmin=-2, vmax=12)
        #draw_pdf_contours_func2(xy_all[:, 0], xy_all[:, 1],spnpdf)
        plt.plot(xy_test[:, 0], xy_test[:, 1], 'ro', markersize=markersize)
        #plt.colorbar()
        fig.tight_layout()
        pp.savefig(fig)
        pp.close()
    
    return ("name", dsname, "size", data.shape, "type", mixttype, "dims", dimensions,
            "spn_train_LL", numpy.mean(spn_train_fit), "dir_train_LL", numpy.mean(dtrain_fit),
            "spn_test_LL", numpy.mean(spn_test_fit), "dir_test_LL", numpy.mean(dtest_fit) ,
            "spn_#_sum_nodes", spn.n_sum_nodes(), "spn_#_prod_nodes", spn.n_prod_nodes(), "spn_#_layers", spn.n_layers()
            )
Example #13
			'KNeighborsRegressor':KNeighborsRegressor(),
			'KernelCenterer':KernelCenterer(),
			'KernelDensity':KernelDensity(),
			'KernelPCA':KernelPCA(),
			'KernelRidge':KernelRidge(),
			'LSHForest':LSHForest(),
			'LabelPropagation':LabelPropagation(),
			'LabelSpreading':LabelSpreading(),
			'Lars':Lars(),
			'LarsCV':LarsCV(),
			'Lasso':Lasso(),
			'LassoCV':LassoCV(),
			'LassoLars':LassoLars(),
			'LassoLarsCV':LassoLarsCV(),
			'LassoLarsIC':LassoLarsIC(),
			'LatentDirichletAllocation':LatentDirichletAllocation(),
			'LedoitWolf':LedoitWolf(),
			'LinearDiscriminantAnalysis':LinearDiscriminantAnalysis(),
			'LinearRegression':LinearRegression(),
			'LinearSVC':LinearSVC(),
			'LinearSVR':LinearSVR(),
			'LocallyLinearEmbedding':LocallyLinearEmbedding(),
			'LogisticRegression':LogisticRegression(),
			'LogisticRegressionCV':LogisticRegressionCV(),
			'MDS':MDS(),
			'MLPClassifier':MLPClassifier(),
			'MLPRegressor':MLPRegressor(),
			'MaxAbsScaler':MaxAbsScaler(),
			'MeanShift':MeanShift(),
			'MinCovDet':MinCovDet(),
			'MinMaxScaler':MinMaxScaler(),