Example #1
def _setup_bernoulli_mixture():
    """
    Setup code for the hinton tests.

    This code is from http://www.bayespy.org/examples/bmm.html
    """
    np.random.seed(1)
    p0 = [0.1, 0.9, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9]
    p1 = [0.1, 0.1, 0.1, 0.1, 0.1, 0.9, 0.9, 0.9, 0.9, 0.9]
    p2 = [0.9, 0.9, 0.9, 0.9, 0.9, 0.1, 0.1, 0.1, 0.1, 0.1]
    p = np.array([p0, p1, p2])

    z = random.categorical([1 / 3, 1 / 3, 1 / 3], size=100)
    x = random.bernoulli(p[z])
    N = 100
    D = 10
    K = 10

    R = Dirichlet(K * [1e-5], name='R')
    Z = Categorical(R, plates=(N, 1), name='Z')

    P = Beta([0.5, 0.5], plates=(D, K), name='P')

    X = Mixture(Z, Bernoulli, P)

    Q = VB(Z, R, X, P)
    P.initialize_from_random()
    X.observe(x)
    Q.update(repeat=1000)

    return (R, P, Z)
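# --- Usage sketch (added; not part of the original snippet) ---
# The setup function above omits its imports; this is the minimal set implied
# by the names it uses (the same imports appear in Example #24 below), plus
# the hinton plot that its docstring refers to.
import numpy as np
from bayespy.utils import random
from bayespy.nodes import Dirichlet, Categorical, Beta, Mixture, Bernoulli
from bayespy.inference import VB
import bayespy.plot as bpplt

R, P, Z = _setup_bernoulli_mixture()
bpplt.hinton(P)
bpplt.pyplot.show()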
Example #2
def _setup_linear_regression():
    """
    Setup code for the pdf and contour tests.

    This code is from http://www.bayespy.org/examples/regression.html
    """
    np.random.seed(1)
    k = 2  # slope
    c = 5  # bias
    s = 2  # noise standard deviation

    x = np.arange(10)
    y = k * x + c + s * np.random.randn(10)
    X = np.vstack([x, np.ones(len(x))]).T

    B = GaussianARD(0, 1e-6, shape=(2, ))

    F = SumMultiply('i,i', B, X)

    tau = Gamma(1e-3, 1e-3)
    Y = GaussianARD(F, tau)
    Y.observe(y)

    Q = VB(Y, B, tau)
    Q.update(repeat=1000)
    xh = np.linspace(-5, 15, 100)
    Xh = np.vstack([xh, np.ones(len(xh))]).T
    Fh = SumMultiply('i,i', B, Xh)

    return locals()
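# --- Usage sketch (added; not part of the original snippet) ---
# The function returns locals(), so the pdf and contour tests can pull nodes
# out by name. Assuming the setup's own imports (GaussianARD, SumMultiply,
# Gamma, VB) are in scope, plotting the noise-precision posterior might look
# like this; bpplt.pdf is used the same way in Example #22 below.
import numpy as np
import bayespy.plot as bpplt

env = _setup_linear_regression()
bpplt.pdf(env['tau'], np.linspace(1e-6, 1, num=100), name=r'\tau')
bpplt.pyplot.show()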
Example #3
def test_gaussian_mixture_plot():
    """
    Test the gaussian_mixture plotting function.

    The code is from http://www.bayespy.org/examples/gmm.html
    """
    np.random.seed(1)
    y0 = np.random.multivariate_normal([0, 0], [[1, 0], [0, 0.02]], size=50)
    y1 = np.random.multivariate_normal([0, 0], [[0.02, 0], [0, 1]], size=50)
    y2 = np.random.multivariate_normal([2, 2], [[1, -0.9], [-0.9, 1]], size=50)
    y3 = np.random.multivariate_normal([-2, -2], [[0.1, 0], [0, 0.1]], size=50)
    y = np.vstack([y0, y1, y2, y3])

    bpplt.pyplot.plot(y[:, 0], y[:, 1], 'rx')

    N = 200
    D = 2
    K = 10

    alpha = Dirichlet(1e-5 * np.ones(K), name='alpha')
    Z = Categorical(alpha, plates=(N, ), name='z')

    mu = Gaussian(np.zeros(D), 1e-5 * np.identity(D), plates=(K, ), name='mu')
    Lambda = Wishart(D, 1e-5 * np.identity(D), plates=(K, ), name='Lambda')

    Y = Mixture(Z, Gaussian, mu, Lambda, name='Y')
    Z.initialize_from_random()

    Q = VB(Y, mu, Lambda, Z, alpha)
    Y.observe(y)
    Q.update(repeat=1000)

    bpplt.gaussian_mixture_2d(Y, scale=2)
Example #4
    def fit(self, X, y):
        self.weights = GaussianARD(0, 1e-6, shape=(X.shape[-1], ))
        y_mean = SumMultiply('i,i', self.weights, X)
        precision = Gamma(1, .1)
        y_obs = GaussianARD(y_mean, precision)
        y_obs.observe(y)

        Q = VB(y_obs, self.weights, precision)
        Q.update(repeat=self.n_iter, tol=self.tolerance, verbose=False)
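    # --- Hypothetical companion method (added; not in the original snippet) ---
    # Assuming predictions are the posterior mean of the same dot product used
    # in fit(), and that SumMultiply is imported in this module:
    def predict(self, X):
        F = SumMultiply('i,i', self.weights, X)
        return F.get_moments()[0]  # first moment = posterior mean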
Example #5
    def _run(self, x, K=25, beta=0.5, alpha=0.00001, hinton_plot=False, end=False):
        '''Only to be used when doing parameter optimization.'''

        self.participant_list = x[0]
        
        N = len(x[0])            #number of data points (i.e. WCS participants)
        D = np.shape(x[1])[1]    #number of features
        #K = 20            #number of initial clusters
        
        R = Dirichlet(K*[alpha],
                      name='R')
        Z = Categorical(R,
                        plates=(N,1),
                        name='Z')
        
        P = Beta([beta, beta],
                 plates=(D,K),
                 name='P')
        
        X = Mixture(Z, Bernoulli, P)
        
        Q = VB(Z, R, X, P)
        P.initialize_from_random()
        X.observe(x[1])
        Q.update(repeat=1000)

        log_likelihood = Q.L[Q.iter-1]  # final VB lower bound (ELBO)

        if hinton_plot:
            bpplt.hinton(Z)
            bpplt.pyplot.show()
            
            bpplt.hinton(R)
            bpplt.pyplot.show()

        #Get the weight matrix stored in Z (weights determine which cluster data point belongs to)
        z = Z._message_to_child()[0]
        z = z * np.ones(Z.plates+(1,))
        z = np.squeeze(z)
        self.z = z

        #Get the weights stored in R (proportional to the size of the clusters)
        r = np.exp(R._message_to_child()[0])
        r = r * np.ones(R.plates+(1,))
        r = np.squeeze(r)
        self.r = r

        #Get the cluster assignment of each data point
        self.c_assign = np.argmax(self.z, axis=1)

        return log_likelihood
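    # --- Added note (not in the original snippet) ---
    # The extraction above relies on the private _message_to_child(); for these
    # nodes get_moments()[0] appears to carry the same first moment, so a
    # public-API helper could be sketched as below, with
    # z = self._posterior_weights(Z) and r = self._posterior_weights(R, log_scale=True).
    @staticmethod
    def _posterior_weights(node, log_scale=False):
        """Broadcast a node's first moment over its plates and squeeze it."""
        w = node.get_moments()[0] * np.ones(node.plates + (1,))
        return np.squeeze(np.exp(w) if log_scale else w)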
Example #6
def run():

    a = nodes.GammaShape(name='a')
    b = nodes.Gamma(1e-5, 1e-5, name='b')

    tau = nodes.Gamma(a, b, plates=(1000, ), name='tau')
    tau.observe(nodes.Gamma(10, 20, plates=(1000, )).random())

    Q = VB(tau, a, b)

    Q.update(repeat=1000)

    print("True gamma parameters:", 10.0, 20.0)
    print("Estimated parameters from 1000 samples:", a.u[0], b.u[0])
Example #7
    def fit(self, X, y):
        self._init_weights()
        # self.cost,
        # self.myopic_voc(action, state),
        # self.vpi_action(action, state),
        # self.vpi(state),
        # self.expected_term_reward(state)

        self.tau = Gamma(self.prior_a, self.prior_b)
        F = SumMultiply('i,i', self.weights, X)
        y_obs = GaussianARD(F, self.tau)
        y_obs.observe(y)

        Q = VB(y_obs, self.weights)
        Q.update(repeat=10, tol=1e-4, verbose=False)
Example #8
    def predict(self):
        #print(self.network.graph)
        self.predictions = {
            term: numpy.empty((self.lenValidation, 2), dtype=float)
            for term in self.ontology.ontology
        }

        classifiers = {
            # bind term as a default argument: a bare lambda would close over
            # the loop variable, so every entry would load the last term
            term: lambda term=term: loadClf(self.ontology[term]['name'],
                                            self.fold, self.clfName)
            for term in self.ontology.ontology
        }
        #for term, (clf,X,y,g) in classifiers.items():
        #    print(term, ":", repr(self.clfName), repr(clf.name), self.fold, clf.fold)

        observations = {
            term: clf.decision_function(clf.X_validation)
            if term != self.ontology.root
            else numpy.array([-1.] * len(clf.X_validation))
            for term, clff in classifiers.items()
            for clf in (clff(),)
        }
        #print("observations:")
        #print(observations)
        gt = {term: clf().y_validation for term, clf in classifiers.items()}
        #print("gt:")
        #print(gt)

        for i in range(self.lenValidation):
            observation = {
                term: pred[i]
                for term, pred in observations.items()
            }
            #print("Observation for gene %d" % i)
            #print(observation)
            #print(self.network.forward_backward(observation))
            hidden, observed, extra = self.getCopy()
            #print(hidden)
            #print(observed)
            #print(observation)
            #for term, node in hidden.items():
            #    print(i, term, node.get_moments()[0])
            for k, v in observation.items():
                observed[k].observe((v, ))
            #    print("%s observes %s" % (k, v))
            allv = (*hidden.values(), *observed.values(), *extra)
            #print([(x, [p for p in x.parents if isinstance(p, Stochastic)]) for x in [*hidden.values(), *observed.values()]])
            Q = VB(*allv)
            Q.update(*allv, tol=1e-7, repeat=1000, verbose=True)
            #print("---")

            for term, node in hidden.items():
                #print(i, term, node.get_moments()[0])
                self.predictions[term][i, :] = node.get_moments()[0]

        #print("predictions:")
        #print(self.predictions)
        for term in observations:
            compare = numpy.empty((len(gt[term]), 4), dtype=float)
            compare[:, 0] = gt[term]
            compare[:, 1] = observations[term]
            compare[:, 2] = self.predictions[term][:, 1]
            compare[:, 3] = numpy.round(self.predictions[term][:, 1])
            print(term, self.ontology[term]['name'])
            print(compare)
Example #9
Zi = Categorical(alpha, plates=(N, ), name='zi')

from bayespy.nodes import Gaussian, Wishart

mui = Gaussian(np.zeros(D), 1e-5 * np.identity(D), plates=(K, ), name='mui')
Lambdai = Wishart(D, 1e-5 * np.identity(D), plates=(K, ), name='Lambdai')

from bayespy.nodes import Mixture

Y = Mixture(Zi, Gaussian, mui, Lambdai, name='Y')

Zi.initialize_from_random()

from bayespy.inference import VB

Q = VB(Y, mui, Lambdai, Zi, alpha)

Y.observe(np.reshape(C_mat, (-1, 2)))

Q.update(repeat=10)
#%%
K = 5  #hyperparameter
neta = 1e-6 * np.ones(K)  #hyperparameter
print(neta.shape)
print(neta)
PI = bayespy.nodes.Dirichlet(neta, name='PI')
#%%
Z = bayespy.nodes.Categorical(PI, plates=(m, n, K), name='Z')

mean_vec = np.zeros(d)  # to be initialized according to the image
precision_mat = 1e-5 * np.identity(d)  # line truncated in the source; d assumed from mean_vec
Example #10
    def run(self, K=25, beta=0.5, alpha=0.00001, foci_thresh=0, num_neigh=4, hinton_plot=False, end=False):
        '''Performs one run of the BBDP according to the specified parameters.'''

        print("Transforming WCS participant data into binary vectors...")
        x = u.transform_data_all(self.langs, norm=False, end=end, foci=True, foci_thresh=foci_thresh, num_neigh=num_neigh)
        print("Finished transforming participant data") 
        self.participant_list = x[0]
        
        N = len(x[0])            #number of data points (i.e. WCS participants)
        D = np.shape(x[1])[1]    #number of features
        #K = 20            #number of initial clusters
        
        R = Dirichlet(K*[alpha],
                      name='R')
        Z = Categorical(R,
                        plates=(N,1),
                        name='Z')
        
        P = Beta([beta, beta],
                 plates=(D,K),
                 name='P')
        
        X = Mixture(Z, Bernoulli, P)
        
        Q = VB(Z, R, X, P)
        P.initialize_from_random()
        X.observe(x[1])
        Q.update(repeat=1000)

        if hinton_plot:
            bpplt.hinton(Z)
            bpplt.pyplot.show()
            
            bpplt.hinton(R)
            bpplt.pyplot.show()

        #Get the weight matrix stored in Z (weights determine which cluster data point belongs to)
        z = Z._message_to_child()[0]
        z = z * np.ones(Z.plates+(1,))
        z = np.squeeze(z)
        self.z = z

        #Get the weights stored in R (proportional to the size of the clusters)
        r = np.exp(R._message_to_child()[0])
        r = r * np.ones(R.plates+(1,))
        r = np.squeeze(r)
        self.r = r

        #Get the cluster assignment of each data point
        self.c_assign = np.argmax(self.z, axis=1)

        #Write cluster results to a file
        if self.write_to_file:
            if end:
                save_path = "cluster_results_end_K={}_B={}_a={}_t={}_nn={}".format(K, beta, alpha, foci_thresh, num_neigh)
            else:
                save_path = "cluster_results_K={}_B={}_a={}_t={}_nn={}".format(K, beta, alpha, foci_thresh, num_neigh)
            while path.exists(save_path+".txt"):
                #save_path already exists
                try:
                    old_file_num = int(save_path[save_path.find('(')+1:-1])
                    new_file_num = old_file_num + 1
                    save_path = save_path[0:save_path.find('(')] + '(' + str(new_file_num) + ')'
                except ValueError:
                    save_path = save_path + " (1)"

            self.save_path = save_path       
            file = open(path.abspath(self.save_path+".txt"), 'w')
            
            #Write cluster assignment matrix Z (gives the probability that observation i belongs to cluster j)
            if 'Z' not in self.in_file:
                for i in range(len(self.z)):
                    line = "\t".join([str(x) for x in self.z[i]]) + "\n"
                    file.write(line)
                file.write('---Z\n')
                self.in_file.append('Z')

            #Write cluster weights matrix R (proportional to the size of the resulting clusters)
            if 'R' not in self.in_file:
                line = "\t".join([str(x) for x in self.r]) + "\n"
                file.write(line)
                file.write('---R\n')
                self.in_file.append('R')

            #Write deterministic cluster assignments with the corresponding participant key
            if 'C' not in self.in_file:
                line1 = "\t".join([str(x) for x in self.participant_list]) + "\n"
                line2 = "\t".join([str(x) for x in self.c_assign]) + "\n"              
                file.write(line1)
                file.write(line2)
                file.write('---C\n')
                self.in_file.append('C')
            
            file.close()

        return self.c_assign
Example #11
import numpy
numpy.random.seed(1)
from bayespy.nodes import CategoricalMarkovChain
a0 = [0.6, 0.4] # p(rainy)=0.6, p(sunny)=0.4
A = [[0.7, 0.3], # p(rainy->rainy)=0.7, p(rainy->sunny)=0.3
     [0.4, 0.6]] # p(sunny->rainy)=0.4, p(sunny->sunny)=0.6
N = 100
Z = CategoricalMarkovChain(a0, A, states=N)
from bayespy.nodes import Categorical, Mixture
P = [[0.1, 0.4, 0.5],
     [0.6, 0.3, 0.1]]
Y = Mixture(Z, Categorical, P)
weather = Z.random()
activity = Mixture(weather, Categorical, P).random()
Y.observe(activity)
from bayespy.inference import VB
Q = VB(Y, Z)
Q.update()
import bayespy.plot as bpplt
bpplt.plot(Z)
bpplt.plot(1-weather, color='r', marker='x')
bpplt.pyplot.show()
Example #12
lung = Mixture(smoking, Categorical, [[0.98, 0.02], [0.25, 0.75]])

bronchitis = Mixture(smoking, Categorical, [[0.97, 0.03], [0.08, 0.92]])

xray = Mixture(tuberculosis, Mixture, lung, Categorical,
               _or([0.96, 0.04], [0.115, 0.885]))

dyspnea = Mixture(
    bronchitis, Mixture, tuberculosis, Mixture, lung, Categorical,
    [_or([0.6, 0.4], [0.18, 0.82]),
     _or([0.11, 0.89], [0.04, 0.96])])
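# --- Added note (not in the original snippet) ---
# The _or helper is used above but not shown. Consistent with its use here
# (a 2x2x2 conditional probability table for the logical OR of two binary
# parents), a plausible definition is sketched below; it is kept as a comment
# because defining it after its first use would not execute top-to-bottom:
#
#     def _or(p_false, p_true):
#         """CPT that yields p_false only when both parents are FALSE."""
#         return np.take([p_false, p_true], [[0, 1], [1, 1]], axis=0)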

# Mark observations
tuberculosis.observe(TRUE)
smoking.observe(FALSE)
bronchitis.observe(
    TRUE)  # not a "chance" observation as in the original example

# Run inference
Q = VB(dyspnea, xray, bronchitis, lung, smoking, tuberculosis, asia)
Q.update(repeat=100)

# Show results
print("P(asia):", asia.get_moments()[0][TRUE])
print("P(tuberculosis):", tuberculosis.get_moments()[0][TRUE])
print("P(smoking):", smoking.get_moments()[0][TRUE])
print("P(lung):", lung.get_moments()[0][TRUE])
print("P(bronchitis):", bronchitis.get_moments()[0][TRUE])
print("P(xray):", xray.get_moments()[0][TRUE])
print("P(dyspnea):", dyspnea.get_moments()[0][TRUE])
Example #13
File: utils.py Project: SongFGH/TNE
def get_node_distr_over_comm(g, walks, method=None, params={}):

    if method == "HMM_param":

        seqs = []
        lens = []
        for walk in walks:
            s = [[int(w)] for w in walk]
            seqs.extend(s)
            lens.append(len(s))

        model = hmm.MultinomialHMM(n_components=params['number_of_topics'],
                                   tol=0.001,
                                   n_iter=5000)
        model.fit(seqs, lens)

        #posteriors = model.predict_proba(np.asarray([[i] for i in range(self.g.number_of_nodes())]))
        #comms = np.argmax(posteriors, 1)

        likelihood = model.emissionprob_
        """
        comms = np.argmax(likelihood, 0)

        node2comm = {}
        for id in range(len(comms)):
            node2comm[str(id)] = comms[id]

        return node2comm
        """

    elif method == "Nonparam_HMM":

        seqs = []
        lens = []
        for walk in walks:
            s = [int(w) for w in walk]
            seqs.append(s)
            lens.append(len(s))

        seqs = np.vstack(seqs)

        K = params['number_of_topics']  # the number of hidden states
        O = g.number_of_nodes()  # the size of observation set
        L = len(seqs[0])  # the length of each sequence
        N = len(seqs)  # the number of sequences

        p0 = params['prior_p0']  # a vector of size K
        t0 = params['prior_t0']  # a vector of size K
        e0 = params['prior_e0']  # a vector of size K

        p = bayes.Dirichlet(p0 * np.ones(K), name='p')

        T = bayes.Dirichlet(t0 * np.ones(K), plates=(K, ), name='T')

        E = bayes.Dirichlet(e0 * np.ones(O), plates=(K, ), name='E')

        Z = bayes.CategoricalMarkovChain(p,
                                         T,
                                         states=L,
                                         name='Z',
                                         plates=(N, ))

        # Emission/observation distribution
        X = bayes.Mixture(Z, bayes.Categorical, E, name='X')

        p.initialize_from_random()
        T.initialize_from_random()
        E.initialize_from_random()

        Q = VB(X, Z, p, T, E)

        Q['X'].observe(seqs)
        Q.update(repeat=1000)

        likelihood = Q['E'].random()
        """
        comms = np.argmax(likelihood, 0)

        node2comm = {}
        for id in range(len(comms)):
            node2comm[str(id)] = comms[id]

        return node2comm
        """

        return likelihood

    elif method == "LDA":

        # Run GibbsLDA++
        if not os.path.exists(GIBBSLDA_PATH):
            raise ValueError("Invalid path of GibbsLDA++!")

        temp_lda_folder = os.path.join(TEMP_FOLDER, "lda_temp")
        if not os.path.exists(temp_lda_folder):
            os.makedirs(temp_lda_folder)

        temp_dfile_path = os.path.join(temp_lda_folder, "gibblda_temp.dfile")
        # Save the walks into the dfile
        n = len(walks)
        with open(temp_dfile_path, 'w') as f:
            f.write("{}\n".format(n))
            for walk in walks:
                f.write("{}\n".format(" ".join(str(w) for w in walk)))

        initial_time = time.time()
        cmd = "{} -est ".format(GIBBSLDA_PATH)
        cmd += "-alpha {} ".format(params['lda_alpha'])
        cmd += "-beta {} ".format(params['lda_beta'])
        cmd += "-ntopics {} ".format(params['number_of_topics'])
        cmd += "-niters {} ".format(params['lda_number_of_iters'])
        cmd += "-savestep {} ".format(params['lda_number_of_iters'] + 1)
        cmd += "-dfile {} ".format(temp_dfile_path)
        os.system(cmd)

        print("-> The LDA algorithm ran in {:.2f} secs".format(time.time() -
                                                               initial_time))

        # Read wordmap file
        id2node = {}
        temp_wordmap_path = os.path.join(temp_lda_folder, "wordmap.txt")
        with open(temp_wordmap_path, 'r') as f:
            f.readline()  # skip the first line
            for line in f.readlines():
                tokens = line.strip().split()
                id2node[int(tokens[1])] = tokens[0]

        # Read phi file
        num_of_nodes = len(id2node)
        phi = np.zeros(shape=(params['number_of_topics'], num_of_nodes),
                       dtype=float)  # np.float was removed in NumPy 1.24
        temp_phi_path = os.path.join(temp_lda_folder, "model-final.phi")
        with open(temp_phi_path, 'r') as f:
            for comm, line in enumerate(f.readlines()):
                for id, value in enumerate(line.strip().split()):
                    phi[comm, int(id2node[id])] = value

        # Read the tassign file, generate topic corpus
        temp_tassing_path = os.path.join(temp_lda_folder,
                                         "model-final.tassign")
        comm_corpus = []
        with smart_open(temp_tassing_path, 'r') as f:
            for line in f:
                tokens = line.strip().split()
                comm_corpus.append([token.split(':')[1] for token in tokens])
        """
        max_topics = np.argmax(phi, axis=0)

        node2comm = {}
        for nodeId in id2node:
            node2comm[id2node[nodeId]] = max_topics[int(nodeId)]

        return node2comm
        """

        return phi, comm_corpus
    else:
        raise ValueError("Unknown method: {}".format(method))
Example #14
# -----Performing inference------
# 1: Observe some nodes
c = np.random.randn(10, 2)
x = np.random.randn(2, 100)
data = np.dot(c, x) + 0.1 * np.random.randn(10, 100)
# data:10×100

Y.observe(data)
# (Missing values)
Y.observe(data,
          mask=[[True], [False], [False], [True], [True], [False], [True],
                [True], [True], [False]])

# 2: Choosing the inference method
from bayespy.inference import VB
Q = VB(Y, C, X, alpha, tau)

# 3: Initializing the posterior approximation
X.initialize_from_parameters(np.random.randn(1, 100, D), 10)

# 4: Running the inference algorithm
# Q.update()
# Q.update(C, X)
# Q.update(C, X, C, tau)
# Q.update(repeat=10)
# Q.update(repeat=1000)
Q.update(repeat=10000, tol=1e-5)
# C.update()

# (5: Parameter expansion, for when convergence is slow)
# from bayespy.inference.vmp import transformations
Example #15
r = (1 - q) / (K - 1)
P = q * np.identity(K) + r * (np.ones((K, K)) - np.identity(K))  # the source hardcoded 3 where K is meant
y = np.zeros((N, 2))
z = np.zeros(N)
state = np.random.choice(K, p=p0)
for n in range(N):
    z[n] = state
    y[n, :] = std * np.random.randn(2) + mu[state]
    state = np.random.choice(K, p=P[state])
from bayespy.nodes import Dirichlet

a0 = Dirichlet(1e-3 * np.ones(K))
A = Dirichlet(1e-3 * np.ones((K, K)))
Z = CategoricalMarkovChain(a0, A, states=N)
Lambda = std**(-2) * np.identity(2)
from bayespy.nodes import Gaussian

Y = Mixture(Z, Gaussian, mu, Lambda)
Y.observe(y)
Q = VB(Y, Z, A, a0)
Q.update(repeat=1000)
bpplt.pyplot.figure()
bpplt.pyplot.axis('equal')
colors = Y.parents[0].get_moments()[0]

bpplt.pyplot.plot(y[:, 0], y[:, 1], 'k-', zorder=-10)
bpplt.pyplot.scatter(y[:, 0], y[:, 1], c=colors, s=40)
bpplt.pyplot.show()
print(Y.parents[0].get_moments())
print(Z.random())
print(Y.parents[0].get_moments()[0])
Example #16
def model(n_documents,
          n_topics,
          n_vocabulary,
          corpus,
          word_documents,
          plates_multiplier=1):
    '''
    Construct Latent Dirichlet Allocation model.
    
    Parameters
    ----------
    
    n_documents : int
        The number of documents

    n_topics : int
        The number of topics

    n_vocabulary : int
        The number of words in the vocabulary

    corpus : integer array
        The vocabulary index of each word in the corpus

    word_documents : integer array
        The document index of each word in the corpus

    plates_multiplier : int
        Plate multiplier, for use with stochastic (mini-batch) variational
        inference
    '''

    # Topic distributions for each document
    p_topic = nodes.Dirichlet(np.ones(n_topics),
                              plates=(n_documents, ),
                              name='p_topic')

    # Word distributions for each topic
    p_word = nodes.Dirichlet(np.ones(n_vocabulary),
                             plates=(n_topics, ),
                             name='p_word')

    # Use a simple wrapper node so that the value of this can be changed if one
    # uses stochastic variational inference
    word_documents = Constant(CategoricalMoments(n_documents),
                              word_documents,
                              name='word_documents')

    # Choose a topic for each word in the corpus
    topics = nodes.Categorical(nodes.Gate(word_documents, p_topic),
                               plates=(len(corpus), ),
                               plates_multiplier=(plates_multiplier, ),
                               name='topics')

    # Choose each word in the corpus from the vocabulary
    words = nodes.Categorical(nodes.Gate(topics, p_word), name='words')

    # Observe the corpus
    words.observe(corpus)

    # Break symmetry by random initialization
    p_topic.initialize_from_random()
    p_word.initialize_from_random()

    return VB(words, topics, p_word, p_topic, word_documents)
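# --- Toy usage sketch (added; not part of the original snippet) ---
# Shapes follow the docstring: corpus holds the vocabulary index of each word,
# word_documents the document index of each word. The module's own imports
# (bayespy nodes, Constant, CategoricalMoments, VB) are assumed to be in scope.
import numpy as np

corpus = np.array([0, 1, 2, 1, 0, 3, 3, 2])           # vocabulary index per word
word_documents = np.array([0, 0, 0, 0, 1, 1, 1, 1])   # document index per word

Q = model(n_documents=2, n_topics=2, n_vocabulary=4,
          corpus=corpus, word_documents=word_documents)
Q.update(repeat=100)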
Example #17
    def get_community_assignments_by(self,
                                     method=None,
                                     temp_dfile_file="gibbsldapp.dfile",
                                     params={}):

        if method == "HMM":
            """
            model = hmm.MultinomialHMM(n_components=3)
            model.startprob_ = np.array([0.6, 0.3, 0.1])
            model.transmat_ = np.array([[0.7, 0.2, 0.1],
                                             [0.3, 0.5, 0.2],
                                             [0.3, 0.3, 0.4]])
            model.emissionprob_ = np.array([[0.4, 0.2, 0.1, 0.3],
                                        [0.3, 0.4, 0.1, 0.2],
                                        [0.1, 0.3, 0.5, 0.1]])

            X, Z = model.sample(1000)

            print(np.asarray(X).T)
            print(Z)
            """
            """
            remodel = hmm.MultinomialHMM(n_components=3, n_iter=100)
            remodel.fit(X)
            Z2 = remodel.predict(X)
            print(Z2)
            """
            """
            seqs = []
            lens = []
            for walk in self._walks:
                s = [[int(w)-1] for w in walk]
                seqs.extend(s)
                lens.append(len(s))

            model = hmm.MultinomialHMM(n_components=params['number_of_topics'], tol=0.001, n_iter=5000)
            model.fit(seqs, lens)

            posteriors = model.predict_proba(np.asarray([[i] for i in range(self.g.number_of_nodes())]))
            comms = np.argmax(posteriors, 1)

            node2comm = {}
            for id in range(len(comms)):
                node2comm[str(id+1)] = comms[id]

            return node2comm
            """
            seqs = []
            lens = []
            for walk in self._walks:
                s = [int(w) - 1 for w in walk]
                seqs.append(s)
                lens.append(len(s))

            pipi = np.asarray([0.5, 0.5], dtype=float)
            AA = np.asarray([[0.2, 0.8], [0.5, 0.5]], dtype=float)
            OO = np.asarray([[0.9, 0.05, 0.05], [0.05, 0.05, 0.9]],
                            dtype=float)  # np.float was removed in NumPy 1.24

            seqs = []
            for i in range(31):
                seq = []

                s = np.random.choice(range(2), p=pipi)
                o = np.random.choice(range(3), p=OO[s, :])
                seq.append(o)
                for _ in range(59):
                    s = np.random.choice(range(2), p=AA[s, :])
                    o = np.random.choice(range(3), p=OO[s, :])
                    seq.append(o)

                seqs.append(seq)

            seqs = np.vstack(seqs)

            #print(seqs)

            from bayespy.nodes import Categorical, Mixture
            from bayespy.nodes import CategoricalMarkovChain
            from bayespy.nodes import Dirichlet
            from bayespy.inference import VB
            K = params['number_of_topics']  # the number of hidden states
            N = self.g.number_of_nodes()  # the number of observations

            #p0 = np.ones(K) / K

            D = 31  #len(lens)
            states = 60

            a0 = Dirichlet(1e+1 * np.ones(K), plates=())
            A = Dirichlet(1e+1 * np.ones(K), plates=(2, ), name='A')
            P = Dirichlet(1e+1 * np.ones((K, N)))
            Z = CategoricalMarkovChain(a0, A, states=states, plates=(D, ))
            Y = Mixture(Z, Categorical, P)

            Y.observe(seqs)

            #a0.random()
            #A.random()
            #P.random()

            Ainit = np.random.random((2, 2))
            Ainit = np.divide(Ainit.T, np.sum(Ainit, 1)).T

            #A.initialize_from_value(Ainit)
            #print(Ainit)
            Q = VB(Y, Z, P, A, a0)

            Q.update(repeat=1000, plot=False, verbose=True)

            #print(Z.random())
            print(Q['A'])

            return {}

        if method == "LDA":

            # Run GibbsLDA++

            lda_exe_path = c._GIBBSLDA_PATH

            if not os.path.exists(lda_exe_path):
                raise ValueError("Invalid path of GibbsLDA++!")

            temp_lda_folder = "./temp"
            if not os.path.exists(temp_lda_folder):
                os.makedirs(temp_lda_folder)

            temp_dfile_path = os.path.join(temp_lda_folder, temp_dfile_file)

            if not os.path.exists(temp_dfile_path):
                # Save the walks into the dfile
                n = len(self._walks)
                with open(temp_dfile_path, 'w') as f:
                    f.write("{}\n".format(n))
                    for walk in self._walks:
                        f.write("{}\n".format(" ".join(str(w) for w in walk)))

            initial_time = time.time()

            cmd = "{} -est ".format(lda_exe_path)
            cmd += "-alpha {} ".format(params['lda_alpha'])
            cmd += "-beta {} ".format(params['lda_beta'])
            cmd += "-ntopics {} ".format(params['number_of_topics'])
            cmd += "-niters {} ".format(params['lda_number_of_iters'])
            cmd += "-savestep {} ".format(params['lda_number_of_iters'] + 1)
            cmd += "-dfile {} ".format(temp_dfile_path)
            os.system(cmd)

            print(
                "-> The LDA algorithm ran in {:.2f} secs".format(time.time() -
                                                                 initial_time))

            # Read wordmap file
            id2node = {}
            temp_wordmap_path = os.path.join(temp_lda_folder, "wordmap.txt")
            with open(temp_wordmap_path, 'r') as f:
                f.readline()  # skip the first line
                for line in f.readlines():
                    tokens = line.strip().split()
                    id2node[int(tokens[1])] = tokens[0]

            # Read phi file
            phi = np.zeros(shape=(params['number_of_topics'], len(id2node)),
                           dtype=float)
            temp_phi_path = os.path.join(temp_lda_folder, "model-final.phi")
            with open(temp_phi_path, 'r') as f:
                for topicId, line in enumerate(f.readlines()):
                    phi[topicId, :] = [
                        float(value) for value in line.strip().split()
                    ]

            max_topics = np.argmax(phi, axis=0)

            node2comm = {}
            for nodeId in id2node:
                node2comm[id2node[nodeId]] = max_topics[int(nodeId)]

            return node2comm
Example #18
    def create_model(self, model_type=None):

        #Create a location model for each timezone
        location_model = []

        if ('all' == model_type):
            p_conc = nodes.DirichletConcentration(self.N_LOCATIONS)
            p_conc.initialize_from_value(np.ones(self.N_LOCATIONS))
            p_theta = nodes.Dirichlet(p_conc,
                                      plates = (self.N_TIMEZONES,),
                                      name = 'p_theta')
            for time in np.arange(self.N_TIMEZONES):
                model = nodes.Categorical(p_theta[time],
                                        plates=(self.N_OBSERVATIONS[time],1),
                                        name=str(time))

                #observe data
                timezone_observations = self._observed_locations[self._observed_locations['time'] == time]

                if not timezone_observations.empty:
                    data = timezone_observations['location'].to_numpy().reshape((self.N_OBSERVATIONS[time],1))  # .as_matrix() was removed from pandas
                    model.observe(data)

                location_model.append(model)


            Q = VB(*location_model, p_theta, p_conc)

        elif ('cross' == model_type):
            raise NotImplementedError("'cross' model_type is not implemented")
        elif ('2fold' == model_type):
            p_conc_morning = nodes.DirichletConcentration(self.N_LOCATIONS)
            p_conc_night = nodes.DirichletConcentration(self.N_LOCATIONS)

            p_conc_morning.initialize_from_value(np.ones(self.N_LOCATIONS))
            p_conc_night.initialize_from_value(np.ones(self.N_LOCATIONS))

            morning_time = np.arange(6,19)
            night_time = np.append(np.arange(0,6) , np.arange(19,24))

            p_theta_morning = nodes.Dirichlet(p_conc_morning,
                                      plates = (morning_time.size,),
                                      name = 'p_theta_morning')
            p_theta_night = nodes.Dirichlet(p_conc_night,
                                      plates = (night_time.size,),
                                      name = 'p_theta_night')


            #Combining morning time
            for count, time in enumerate(morning_time):
                model = nodes.Categorical(p_theta_morning[count],
                                        plates=(self.N_OBSERVATIONS[time],1),
                                        name=str(time))

                #observe data
                timezone_observations = self._observed_locations[self._observed_locations['time'] == time]
                #print(timezone_observations)

                if not timezone_observations.empty:
                    data = timezone_observations['location'].to_numpy().reshape((self.N_OBSERVATIONS[time],1))
                    model.observe(data)

                location_model.append(model)

            #Combining night time
            for count, time in enumerate(night_time):
                model = nodes.Categorical(p_theta_night[count],
                                        plates=(self.N_OBSERVATIONS[time],1),
                                        name=str(time))

                #observe data
                timezone_observations = self._observed_locations[self._observed_locations['time'] == time]

                if not timezone_observations.empty:
                    data = timezone_observations['location'].to_numpy().reshape((self.N_OBSERVATIONS[time],1))
                    model.observe(data)

                location_model.append(model)

            Q = VB(*location_model, p_theta_morning, p_theta_night,
                   p_conc_morning, p_conc_night)
        else:
            raise ValueError('no model_type selected')

        print("models created")

        ####################################################################################
        #Learning parameters
        Q.update(repeat=1000)
        print('learned params')
        ####################################################################################
        
        if ('all' == model_type):
            return np.array(p_theta.get_parameters()).reshape((self.N_TIMEZONES,self.N_LOCATIONS))
        elif ('2fold' == model_type):
            learned_night = np.array(p_theta_night.get_parameters()).reshape((night_time.size, self.N_LOCATIONS))
            learned_morn = np.array(p_theta_morning.get_parameters()).reshape((morning_time.size, self.N_LOCATIONS))
            return np.vstack((learned_night[:6,:], learned_morn, learned_night[6:,:]))  # np.row_stack is a deprecated alias of vstack
Example #19
print("++++++++++++++++++++++++++")

N = 10000
y = np.random.choice(3, size=N, p=[0.3, 0.6, 0.1])


a0 = [0.5, 0.1, 0.1]

mu0 = -1
lambda0 = 5



#MU = bayes.Gaussian(mu=mu0, Lambda=0.9)
#X = bayes.Gaussian(mu=0.2, Lambda=0.4, plates=(N, ))
P = bayes.Dirichlet(a0)
X = bayes.Categorical(P, plates=(N, ))

#P.initialize_from_random()

Q = VB(X, P)

X.observe(y)
Q.update(repeat=1000)


print(X.pdf([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))
print(P.random())
#print(np.sum(y==2))
Example #20
from bayespy.nodes import Dirichlet, Categorical
from bayespy.nodes import Gaussian, Wishart
from bayespy.nodes import Mixture
from bayespy.inference import VB

y0 = np.random.multivariate_normal([0, 0], [[2, 0], [0, 0.1]], size=50)
y1 = np.random.multivariate_normal([0, 0], [[0.1, 0], [0, 2]], size=50)
y2 = np.random.multivariate_normal([2, 2], [[2, -1.5], [-1.5, 2]], size=50)
y3 = np.random.multivariate_normal([-2, -2], [[0.5, 0], [0, 0.5]], size=50)
y = np.vstack([y0, y1, y2, y3])

N = 200
D = 2
K = 10

alpha = Dirichlet(1e-5 * np.ones(K), name='alpha')
Z = Categorical(alpha, plates=(N,), name='z')

mu = Gaussian(np.zeros(D), 1e-5 * np.identity(D), plates=(K,), name='mu')
Lambda = Wishart(D, 1e-5 * np.identity(D), plates=(K,), name='Lambda')

Y = Mixture(Z, Gaussian, mu, Lambda, name='Y')
Z.initialize_from_random()
Q = VB(Y, mu, Lambda, Z, alpha)

Y.observe(y)
Q.update(repeat=1000)

bpplt.gaussian_mixture_2d(Y, alpha=alpha, scale=2)

Example #21
from bayespy.inference import VB
from bayespy.nodes import Categorical, Gaussian, Mixture
import copy
import numpy as np
import bayespy.plot as bpplt

hidden2 = Categorical((0.7, 0.3))
hidden1 = Mixture(hidden2, Categorical, ((0.6, 0.4), (0.1, 0.9)))

observed1 = Mixture(hidden1, Gaussian, ([-1.0], [0.9]), ([[1.3]], [[0.8]]))
observed2 = Mixture(hidden2, Gaussian, ([-0.9], [1.1]), ([[1.2]], [[0.7]]))

observed_1, observed_2, hidden_1, hidden_2 = copy.deepcopy(
    (observed1, observed2, hidden1, hidden2))
observed_1.observe((-1.2, ))
observed_2.observe((1.2, ))
Q = VB(hidden_1, hidden_2, observed_1, observed_2, tol=1e-10)
Q.update(repeat=100)
print(hidden_1.get_moments())
print(hidden_2.get_moments())

observed_1, observed_2, hidden_1, hidden_2 = copy.deepcopy(
    (observed1, observed2, hidden1, hidden2))
observed_1.observe((-0.2, ))
observed_2.observe((1.2, ))
Q = VB(hidden_1, hidden_2, observed_1, observed_2)
Q.update(repeat=100)
print(hidden_1.get_moments())
print(hidden_2.get_moments())

observed_1, observed_2, hidden_1, hidden_2 = copy.deepcopy(
    (observed1, observed2, hidden1, hidden2))
Example #22
import numpy as np

np.random.seed(1)
data = np.random.normal(5, 10, size=(10, ))
from bayespy.nodes import GaussianARD, Gamma

mu = GaussianARD(0, 1e-6)
tau = Gamma(1e-6, 1e-6)
y = GaussianARD(mu, tau, plates=(10, ))
y.observe(data)
from bayespy.inference import VB

Q = VB(mu, tau, y)
Q.update(repeat=20)
import bayespy.plot as bpplt

bpplt.pyplot.subplot(2, 1, 1)
bpplt.pdf(mu, np.linspace(-10, 20, num=100), color='k', name=r'\mu')
bpplt.pyplot.subplot(2, 1, 2)
bpplt.pdf(tau, np.linspace(1e-6, 0.08, num=100), color='k', name=r'\tau')
bpplt.pyplot.tight_layout()
bpplt.pyplot.show()
Example #23

A = Categorical([0.5, 0.5])

T = Mixture(A, Categorical, [[0.99, 0.01], [0.8, 0.2]])

S = Categorical([0.5, 0.5])

L = Mixture(S, Categorical, [[0.98, 0.02], [0.75, 0.25]])

B = Mixture(S, Categorical, [[0.97, 0.03], [0.70, 0.30]])

X = Mixture(T, Mixture, L, Categorical, _or([0.96, 0.04], [0.115, 0.885]))

D = Mixture(B, Mixture, X, Categorical, _or([0.115, 0.885], [0.04, 0.96]))

T.observe(TRUE)
S.observe(FALSE)

B.observe(TRUE)

Q = VB(A, T, S, L, B, X, D)
Q.update(repeat=100)

print("P(asia): ", A.get_moments()[0][TRUE])
print("P(tuberculosis): ", T.get_moments()[0][TRUE])
print("P(smoking): ", S.get_moments()[0][TRUE])
print("P(lung): ", L.get_moments()[0][TRUE])
print("P(bronchitis): ", B.get_moments()[0][TRUE])
print("P(xray): ", X.get_moments()[0][TRUE])
print("P(dyspnea): ", D.get_moments()[0][TRUE])
Example #24
import numpy
numpy.random.seed(1)
p0 = [0.1, 0.9, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9]
p1 = [0.1, 0.1, 0.1, 0.1, 0.1, 0.9, 0.9, 0.9, 0.9, 0.9]
p2 = [0.9, 0.9, 0.9, 0.9, 0.9, 0.1, 0.1, 0.1, 0.1, 0.1]
import numpy as np
p = np.array([p0, p1, p2])
from bayespy.utils import random
z = random.categorical([1 / 3, 1 / 3, 1 / 3], size=100)
x = random.bernoulli(p[z])
N = 100
D = 10
K = 10
from bayespy.nodes import Categorical, Dirichlet
R = Dirichlet(K * [1e-5], name='R')
Z = Categorical(R, plates=(N, 1), name='Z')
from bayespy.nodes import Beta
P = Beta([0.5, 0.5], plates=(D, K), name='P')
from bayespy.nodes import Mixture, Bernoulli
X = Mixture(Z, Bernoulli, P)
from bayespy.inference import VB
Q = VB(Z, R, X, P)
P.initialize_from_random()
X.observe(x)
Q.update(repeat=1000)
import bayespy.plot as bpplt
bpplt.hinton(P)
bpplt.pyplot.show()
Example #25
alpha = Gamma(1e-5, 1e-5, plates=(D, ), name='alpha')
A = GaussianARD(0, alpha, shape=(D, ), plates=(D, ), name='A')
X = GaussianMarkovChain(np.zeros(D),
                        1e-3 * np.identity(D),
                        A,
                        np.ones(D),
                        n=N,
                        name='X')
gamma = Gamma(1e-5, 1e-5, plates=(D, ), name='gamma')
C = GaussianARD(0, gamma, shape=(D, ), plates=(M, 1), name='C')
F = Dot(C, X, name='F')
C.initialize_from_random()
tau = Gamma(1e-5, 1e-5, name='tau')
Y = GaussianARD(F, tau, name='Y')
from bayespy.inference import VB
Q = VB(X, C, gamma, A, alpha, tau, Y)
w = 0.3
a = np.array([[np.cos(w), -np.sin(w), 0, 0], [np.sin(w),
                                              np.cos(w), 0, 0], [0, 0, 1, 0],
              [0, 0, 0, 0]])
c = np.random.randn(M, 4)
x = np.empty((N, 4))
f = np.empty((M, N))
y = np.empty((M, N))
x[0] = 10 * np.random.randn(4)
f[:, 0] = np.dot(c, x[0])
y[:, 0] = f[:, 0] + 3 * np.random.randn(M)
for n in range(N - 1):
    x[n + 1] = np.dot(a, x[n]) + [1, 1, 10, 10] * np.random.randn(4)
    f[:, n + 1] = np.dot(c, x[n + 1])
    y[:, n + 1] = f[:, n + 1] + 3 * np.random.randn(M)
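# --- Plausible continuation (added; the snippet ends here in the source) ---
# BayesPy's linear state-space example would now observe the simulated data
# and run the variational updates:
Y.observe(y)
Q.update(repeat=10)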
Example #26
    y = y.reshape(y.shape[0], )

    X = x2.reshape(x2.shape[0], 1)

    from bayespy.nodes import GaussianARD
    B = GaussianARD(0, 1e-6, shape=(X.shape[1], ))
    from bayespy.nodes import SumMultiply
    F = SumMultiply('i,i', B, X)

    from bayespy.nodes import Gamma
    tau = Gamma(1e-3, 1e-3)
    Y = GaussianARD(F, tau)
    Y.observe(y)
    from bayespy.inference import VB
    Q = VB(Y, B, tau)
    #Q.update(repeat=100990)  # left commented out in the source; without an update the moments below come from the prior
    result = []
    distribution = F.get_moments()  # [E[f], E[f**2]]: the first two moments, not (min, max)
    for m1, m2 in zip(distribution[0], distribution[1]):
        mean = (m1 + m2) / 2  # as in the source: averages the first two moments
        result.append(mean)
    x1[buffer_data] = result

print(x1)
Example #27
        def fit(self, X, y):
            """Fit Multivariate Gaussian model per class using Variational Inference.

            Parameters
            ----------
            X : {array-like}, shape = [n_samples, n_features]
                Training data
            y : array-like, shape = [n_samples]
                Target values

            Returns
            -------
            self : returns an instance of self.
            """
            n_samples, n_features = X.shape

            classes_ = np.unique(y)
            n_classes_ = len(classes_)

            n_estimators = n_features // n_classes_  # integer count of estimators per class

            def remove_outliers(X, y):

                classes = np.unique(y)

                n_classes = len(classes)

                n_estimators = int(X.shape[1] / n_classes)

                Xt = X.reshape((X.shape[0], n_estimators, n_classes))

                yt = np.repeat(y, n_estimators).reshape((len(y), n_estimators))

                rate = (yt == classes.take(np.argmax(Xt, axis=2))).sum(1)

                return np.where(rate > 0.0)[0]

            self.models_ = []
            for i, Y in enumerate(classes_):
                features = np.arange(n_estimators,
                                     dtype=int) * (n_classes_) + i
                L = X[y == Y, :]

                N, D = L.shape

                Lambda = nodes.Wishart(D, np.identity(D))
                mu = nodes.Gaussian(np.zeros(D), np.identity(D))

                x = nodes.Gaussian(mu, Lambda, plates=(N, ))
                x.observe(L)

                Q = VB(x, mu, Lambda)
                Q.update(repeat=2000, tol=0, verbose=False)

                cov = np.linalg.inv(Lambda.u[0])
                m = mu.u[0]

                self.models_.append([m, cov, float(L.shape[0]) / n_samples])

            if (self.weight_class):
                self.w = X.shape[0] / (n_classes_ *
                                       np.bincount(np.asarray(y, dtype=int)))
            else:
                self.w = np.ones(n_classes_)

            self.n_classes_ = n_classes_
            self.classes_ = classes_
            self.n_estimators = n_estimators

            return self
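        # --- Hypothetical companion method (added; not in the original) ---
        # fit() stores [posterior mean, covariance, class prior] per class; a
        # predict() could score samples under each fitted Gaussian, weighted by
        # self.w and the class prior (scipy assumed available):
        def predict(self, X):
            from scipy.stats import multivariate_normal
            scores = np.column_stack([
                w * prior * multivariate_normal.pdf(X, mean=m, cov=cov)
                for w, (m, cov, prior) in zip(self.w, self.models_)
            ])
            return self.classes_.take(np.argmax(scores, axis=1))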
Example #28
import numpy
numpy.random.seed(1)
M = 20
N = 100
import numpy as np
x = np.random.randn(N, 2)
w = np.random.randn(M, 2)
f = np.einsum('ik,jk->ij', w, x)
y = f + 0.1 * np.random.randn(M, N)
D = 10
from bayespy.nodes import GaussianARD, Gamma, SumMultiply
X = GaussianARD(0, 1, plates=(1, N), shape=(D, ))
alpha = Gamma(1e-5, 1e-5, plates=(D, ))
C = GaussianARD(0, alpha, plates=(M, 1), shape=(D, ))
F = SumMultiply('d,d->', X, C)
tau = Gamma(1e-5, 1e-5)
Y = GaussianARD(F, tau)
Y.observe(y)
from bayespy.inference import VB
Q = VB(Y, X, C, alpha, tau)
C.initialize_from_random()
from bayespy.inference.vmp.transformations import RotateGaussianARD
rot_X = RotateGaussianARD(X)
rot_C = RotateGaussianARD(C, alpha)
from bayespy.inference.vmp.transformations import RotationOptimizer
R = RotationOptimizer(rot_X, rot_C, D)
Q.set_callback(R.rotate)
Q.update(repeat=1000)
import bayespy.plot as bpplt
bpplt.plot(F)
bpplt.plot(f, color='r', marker='x', linestyle='None')
Example #29
p = bayes.Dirichlet(p_param, name='p')

t_param = t0 * np.ones(K, dtype=float)
T = bayes.Dirichlet(t_param, plates=(K, ), name='T')

e_param = e0 * np.ones(E, dtype=float)
E = bayes.Dirichlet(e_param, plates=(K, ), name='E')  # note: rebinds E from the alphabet size to the node

z = bayes.CategoricalMarkovChain(p, T, states=L, plates=(N, ), name='Z')
x = bayes.Mixture(z, bayes.Categorical, E, plates=(N, L), name='X')

p.initialize_from_random()
T.initialize_from_random()
E.initialize_from_random()

Q = VB(x, z, E, T, p)

x.observe(y)
Q.update(repeat=1000)

print("---------------------")
print(np.array(y[1][:25]))
print(np.argmax(x.parents[0].get_moments()[0][1], axis=1)[:25])
print("---------------------")
for u in z.parents[1].get_moments():
    print(u)
    print("++")
print("zzzzzzz")
print(x.parents[1].get_moments()[0])
#print(x.get_parameters())
print(E)