def _setup_bernoulli_mixture():
    """
    Build and fit the Bernoulli mixture model used by the hinton tests.

    Adapted from http://www.bayespy.org/examples/bmm.html

    Returns
    -------
    tuple
        The fitted nodes ``(R, P, Z)``.
    """
    np.random.seed(1)

    n_samples = 100
    n_features = 10
    n_clusters = 10

    # Three ground-truth Bernoulli parameter vectors, one per true cluster.
    p0 = [0.1, 0.9, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9]
    p1 = [0.1, 0.1, 0.1, 0.1, 0.1, 0.9, 0.9, 0.9, 0.9, 0.9]
    p2 = [0.9, 0.9, 0.9, 0.9, 0.9, 0.1, 0.1, 0.1, 0.1, 0.1]
    true_probs = np.array([p0, p1, p2])

    # Draw a cluster label per sample, then the binary observations.
    labels = random.categorical([1 / 3, 1 / 3, 1 / 3], size=100)
    data = random.bernoulli(true_probs[labels])

    # Model: Dirichlet -> Categorical assignments -> Bernoulli mixture.
    R = Dirichlet([1e-5] * n_clusters, name='R')
    Z = Categorical(R, plates=(n_samples, 1), name='Z')
    P = Beta([0.5, 0.5], plates=(n_features, n_clusters), name='P')
    mixture = Mixture(Z, Bernoulli, P)

    engine = VB(Z, R, mixture, P)
    # Random initialisation of P happens before the data is observed,
    # exactly as in the referenced example.
    P.initialize_from_random()
    mixture.observe(data)
    engine.update(repeat=1000)

    return (R, P, Z)
def _setup_linear_regression():
    """
    Setup code for the pdf and contour tests.

    This code is from http://www.bayespy.org/examples/regression.html

    Generates noisy samples of a line, fits a Bayesian linear regression
    with VB, and builds a prediction node over a wider input range.

    Returns
    -------
    dict
        ``locals()`` of this function, i.e. every local name defined below
        (``k``, ``c``, ``s``, ``x``, ``y``, ``X``, ``B``, ``F``, ``tau``,
        ``Y``, ``Q``, ``xh``, ``Xh``, ``Fh``). Renaming or adding a local
        changes the returned mapping.
    """
    np.random.seed(1)
    k = 2  # slope
    c = 5  # bias
    s = 2  # noise standard deviation
    x = np.arange(10)
    y = k * x + c + s * np.random.randn(10)
    # Design matrix: one row per sample, columns are [x, 1] (the 1 carries the bias).
    X = np.vstack([x, np.ones(len(x))]).T
    B = GaussianARD(0, 1e-6, shape=(2, ))
    F = SumMultiply('i,i', B, X)
    tau = Gamma(1e-3, 1e-3)
    Y = GaussianARD(F, tau)
    Y.observe(y)
    Q = VB(Y, B, tau)
    Q.update(repeat=1000)
    # Prediction inputs on [-5, 15], built the same way as X.
    xh = np.linspace(-5, 15, 100)
    Xh = np.vstack([xh, np.ones(len(xh))]).T
    Fh = SumMultiply('i,i', B, Xh)
    return locals()
def test_gaussian_mixture_plot():
    """
    Exercise ``bpplt.gaussian_mixture_2d`` on a fitted 2-D Gaussian mixture.

    Adapted from http://www.bayespy.org/examples/gmm.html
    """
    np.random.seed(1)

    # (mean, covariance) of the four generating clusters; sampled in this
    # order so the random stream matches the referenced example.
    cluster_specs = (
        ([0, 0], [[1, 0], [0, 0.02]]),
        ([0, 0], [[0.02, 0], [0, 1]]),
        ([2, 2], [[1, -0.9], [-0.9, 1]]),
        ([-2, -2], [[0.1, 0], [0, 0.1]]),
    )
    samples = np.vstack([
        np.random.multivariate_normal(mean, cov, size=50)
        for mean, cov in cluster_specs
    ])
    bpplt.pyplot.plot(samples[:, 0], samples[:, 1], 'rx')

    n_samples = 200
    n_dims = 2
    n_components = 10

    alpha = Dirichlet(1e-5 * np.ones(n_components), name='alpha')
    Z = Categorical(alpha, plates=(n_samples, ), name='z')
    mu = Gaussian(np.zeros(n_dims), 1e-5 * np.identity(n_dims),
                  plates=(n_components, ), name='mu')
    Lambda = Wishart(n_dims, 1e-5 * np.identity(n_dims),
                     plates=(n_components, ), name='Lambda')
    Y = Mixture(Z, Gaussian, mu, Lambda, name='Y')

    Z.initialize_from_random()
    engine = VB(Y, mu, Lambda, Z, alpha)
    Y.observe(samples)
    engine.update(repeat=1000)

    bpplt.gaussian_mixture_2d(Y, scale=2)
def fit(self, X, y):
    """Fit a Bayesian linear regression with variational Bayes.

    Stores the weight node in ``self.weights``. The number of VB iterations
    and the convergence tolerance come from ``self.n_iter`` and
    ``self.tolerance``.

    Parameters
    ----------
    X : array whose last axis indexes the features.
    y : observed targets passed to ``observe``.
    """
    n_features = X.shape[-1]
    self.weights = GaussianARD(0, 1e-6, shape=(n_features, ))

    # Linear predictor: inner product of the weights with each row of X.
    linear_predictor = SumMultiply('i,i', self.weights, X)
    noise_precision = Gamma(1, .1)

    target = GaussianARD(linear_predictor, noise_precision)
    target.observe(y)

    engine = VB(target, self.weights, noise_precision)
    engine.update(repeat=self.n_iter, tol=self.tolerance, verbose=False)
def _run(self, x, K=25, beta=0.5, alpha=0.00001, hinton_plot=False, end=False):
    '''Only to be used when doing parameter optimization.

    Fits a Bernoulli mixture to already-transformed data and stores the
    results on the instance (``participant_list``, ``z``, ``r``,
    ``c_assign``).

    Parameters
    ----------
    x : sequence
        ``x[0]`` is the participant list and ``x[1]`` a 2-D binary feature
        array with one row per participant.
    K : int
        Number of initial clusters.
    beta : float
        Both parameters of the Beta prior on the Bernoulli probabilities.
    alpha : float
        Concentration of the symmetric Dirichlet prior on cluster weights.
    hinton_plot : bool
        If true, show Hinton diagrams of ``Z`` and ``R``.
    end : bool
        Unused here; kept for signature compatibility.

    Returns
    -------
    The entry ``Q.L[Q.iter-1]`` of the VB engine, i.e. the bound recorded
    for the last completed iteration.
    '''
    self.participant_list = x[0]
    N = len(x[0])            # number of data points (i.e. WCS participants)
    D = np.shape(x[1])[1]    # number of features

    R = Dirichlet(K*[alpha], name='R')
    Z = Categorical(R, plates=(N, 1), name='Z')
    P = Beta([beta, beta], plates=(D, K), name='P')
    X = Mixture(Z, Bernoulli, P)

    Q = VB(Z, R, X, P)
    P.initialize_from_random()
    X.observe(x[1])
    Q.update(repeat=1000)

    log_likelihood = Q.L[Q.iter-1]

    if hinton_plot:
        bpplt.hinton(Z)
        bpplt.pyplot.show()
        bpplt.hinton(R)
        bpplt.pyplot.show()

    # Weight matrix stored in Z (weights determine which cluster a data
    # point belongs to). Multiplying by ones expands any broadcast plates.
    z = Z._message_to_child()[0]
    z = z * np.ones(Z.plates+(1,))
    # Z has plates (N, 1): drop only that singleton plate axis. A bare
    # squeeze would also drop the N or K axis when either equals 1 and
    # break the argmax over axis 1 below.
    self.z = np.squeeze(z, axis=1)

    # Weights stored in R (proportional to the size of the clusters).
    r = np.exp(R._message_to_child()[0])
    r = r * np.ones(R.plates+(1,))
    # Keep r one-dimensional even when K == 1.
    self.r = np.atleast_1d(np.squeeze(r))

    # Cluster assignment of each data point.
    self.c_assign = np.argmax(self.z, axis=1)

    return log_likelihood
def run():
    """Estimate Gamma shape and rate from 1000 samples drawn from Gamma(10, 20).

    Prints the true parameters and the first moments of the fitted shape
    and rate nodes.
    """
    n_samples = 1000

    shape_node = nodes.GammaShape(name='a')
    rate_node = nodes.Gamma(1e-5, 1e-5, name='b')
    tau = nodes.Gamma(shape_node, rate_node, plates=(n_samples, ), name='tau')

    # Synthetic data from a Gamma node with known parameters.
    samples = nodes.Gamma(10, 20, plates=(n_samples, )).random()
    tau.observe(samples)

    engine = VB(tau, shape_node, rate_node)
    engine.update(repeat=1000)

    print("True gamma parameters:", 10.0, 20.0)
    print("Estimated parameters from 1000 samples:",
          shape_node.u[0], rate_node.u[0])
def fit(self, X, y):
    """Fit the linear model's weights to ``(X, y)`` with a short VB run.

    Re-initialises the weights via ``self._init_weights()``, creates the
    noise-precision node ``self.tau`` from ``self.prior_a`` and
    ``self.prior_b``, and runs at most 10 VB iterations (tolerance 1e-4).
    """
    self._init_weights()

    # Original author's note listing terms, presumably the columns of X
    # (TODO confirm against the caller):
    #   self.cost,
    #   self.myopic_voc(action, state),
    #   self.vpi_action(action, state),
    #   self.vpi(state),
    #   self.expected_term_reward(state)
    self.tau = Gamma(self.prior_a, self.prior_b)

    linear_predictor = SumMultiply('i,i', self.weights, X)
    target = GaussianARD(linear_predictor, self.tau)
    target.observe(y)

    # NOTE(review): self.tau is not handed to VB, so it looks like only the
    # weights are updated during inference; confirm this is intended.
    engine = VB(target, self.weights)
    engine.update(repeat=10, tol=1e-4, verbose=False)
def predict(self):
    """Run VB inference per validation sample and print a comparison table.

    For every ontology term a classifier is loaded with ``loadClf`` and its
    decision values on ``clf.X_validation`` are used as observations (the
    root term is fixed to ``-1.``). For each validation index a fresh copy
    of the network (``self.getCopy()``) is conditioned on those observations
    and the first moment of every hidden node is stored in
    ``self.predictions[term][i, :]``.

    Finally, for each term a 4-column array is printed: ground truth,
    raw observation, predicted value in column 1, and its rounding.
    """
    self.predictions = {
        term: numpy.empty((self.lenValidation, 2), dtype=float)
        for term in self.ontology.ontology
    }

    # Classifiers are loaded lazily on each call. ``term=term`` binds the
    # current term when the lambda is created; without it every loader
    # would see the comprehension's final ``term`` and return the same
    # (last) classifier.
    classifiers = {
        term: lambda term=term: loadClf(self.ontology[term]['name'],
                                        self.fold, self.clfName)
        for term in self.ontology.ontology
    }

    observations = {}
    for term, load_clf in classifiers.items():
        clf = load_clf()
        if term != self.ontology.root:
            observations[term] = clf.decision_function(clf.X_validation)
        else:
            # The root term gets a constant observation of -1.
            observations[term] = numpy.array([-1.] * len(clf.X_validation))

    gt = {term: load_clf().y_validation
          for term, load_clf in classifiers.items()}

    for i in range(self.lenValidation):
        observation = {term: pred[i] for term, pred in observations.items()}

        # Fresh network copy for every sample.
        hidden, observed, extra = self.getCopy()
        for k, v in observation.items():
            observed[k].observe((v, ))

        allv = (*hidden.values(), *observed.values(), *extra)
        Q = VB(*allv)
        Q.update(*allv, tol=1e-7, repeat=1000, verbose=True)

        for term, node in hidden.items():
            self.predictions[term][i, :] = node.get_moments()[0]

    for term in observations:
        compare = numpy.empty((len(gt[term]), 4), dtype=float)
        compare[:, 0] = gt[term]
        compare[:, 1] = observations[term]
        compare[:, 2] = self.predictions[term][:, 1]
        compare[:, 3] = numpy.round(self.predictions[term][:, 1])
        print(term, self.ontology[term]['name'])
        print(compare)
Zi = Categorical(alpha, plates=(N, ), name='zi') from bayespy.nodes import Gaussian, Wishart mui = Gaussian(np.zeros(D), 1e-5 * np.identity(D), plates=(K, ), name='mui') Lambdai = Wishart(D, 1e-5 * np.identity(D), plates=(K, ), name='Lambdai') from bayespy.nodes import Mixture Y = Mixture(Zi, Gaussian, mui, Lambdai, name='Y') Zi.initialize_from_random() from bayespy.inference import VB Q = VB(Y, mui, Lambdai, Zi, alpha) Y.observe(np.reshape(C_mat, (-1, 2))) Q.update(repeat=10) #%% K = 5 #hyperparameter neta = 1e-6 * np.ones(K) #hyperparameter print(neta.shape) print(neta) PI = bayespy.nodes.Dirichlet(neta, name='PI') #%% Z = bayespy.nodes.Categorical(PI, plates=(m, n, K), name='Z') mean_vec = np.zeros(d) # to be initialized accorinding to image precission_mat = 1e-5 * np.identity(
def run(self, K=25, beta=0.5, alpha=0.00001, foci_thresh=0, num_neigh=4, hinton_plot=False, end=False):
    '''Performs one run of the BBDP according to the specified parameters.

    Transforms the participant data of ``self.langs`` into binary vectors,
    fits a Bernoulli mixture with VB, stores the results on the instance
    (``participant_list``, ``z``, ``r``, ``c_assign``) and, when
    ``self.write_to_file`` is true, writes them to a text file whose base
    name is stored in ``self.save_path``.

    Parameters
    ----------
    K : int
        Number of initial clusters.
    beta : float
        Both parameters of the Beta prior on the Bernoulli probabilities.
    alpha : float
        Concentration of the symmetric Dirichlet prior on cluster weights.
    foci_thresh, num_neigh, end
        Forwarded to ``u.transform_data_all``; ``end`` also selects the
        output file name prefix.
    hinton_plot : bool
        If true, show Hinton diagrams of ``Z`` and ``R``.

    Returns
    -------
    The array of hard cluster assignments (``self.c_assign``).
    '''
    print("Transforming WCS participant data into binary vectors...")
    x = u.transform_data_all(self.langs, norm=False, end=end, foci=True,
                             foci_thresh=foci_thresh, num_neigh=num_neigh)
    print("Finished transforming participant data")

    self.participant_list = x[0]
    N = len(x[0])            # number of data points (i.e. WCS participants)
    D = np.shape(x[1])[1]    # number of features

    R = Dirichlet(K*[alpha], name='R')
    Z = Categorical(R, plates=(N, 1), name='Z')
    P = Beta([beta, beta], plates=(D, K), name='P')
    X = Mixture(Z, Bernoulli, P)

    Q = VB(Z, R, X, P)
    P.initialize_from_random()
    X.observe(x[1])
    Q.update(repeat=1000)

    if hinton_plot:
        bpplt.hinton(Z)
        bpplt.pyplot.show()
        bpplt.hinton(R)
        bpplt.pyplot.show()

    # Weight matrix stored in Z (weights determine which cluster a data
    # point belongs to). Multiplying by ones expands any broadcast plates.
    z = Z._message_to_child()[0]
    z = z * np.ones(Z.plates+(1,))
    # Z has plates (N, 1): drop only that singleton plate axis so the
    # result stays 2-D even when N == 1 or K == 1.
    self.z = np.squeeze(z, axis=1)

    # Weights stored in R (proportional to the size of the clusters).
    r = np.exp(R._message_to_child()[0])
    r = r * np.ones(R.plates+(1,))
    # Keep r iterable (1-D) even when K == 1; it is joined element-wise below.
    self.r = np.atleast_1d(np.squeeze(r))

    # Cluster assignment of each data point.
    self.c_assign = np.argmax(self.z, axis=1)

    # Write cluster results to a file.
    if self.write_to_file:
        if end:
            save_path = "cluster_results_end_K={}_B={}_a={}_t={}_nn={}".format(K, beta, alpha, foci_thresh, num_neigh)
        else:
            save_path = "cluster_results_K={}_B={}_a={}_t={}_nn={}".format(K, beta, alpha, foci_thresh, num_neigh)

        # Avoid overwriting: append " (1)", then bump the number in the
        # parentheses until the name is free.
        while path.exists(save_path+".txt"):
            try:
                old_file_num = int(save_path[save_path.find('(')+1:-1])
                new_file_num = old_file_num + 1
                save_path = save_path[0:save_path.find('(')] + '(' + str(new_file_num) + ')'
            except ValueError:
                save_path = save_path + " (1)"
        self.save_path = save_path

        # The context manager closes the file even if a write fails.
        with open(path.abspath(self.save_path+".txt"), 'w') as file:
            # Cluster assignment matrix Z (probability that observation i
            # belongs to cluster j), one row per line.
            if 'Z' not in self.in_file:
                for row in self.z:
                    file.write("\t".join([str(value) for value in row]) + "\n")
                file.write('---Z\n')
                self.in_file.append('Z')

            # Cluster weights R (proportional to the resulting cluster sizes).
            if 'R' not in self.in_file:
                file.write("\t".join([str(value) for value in self.r]) + "\n")
                file.write('---R\n')
                self.in_file.append('R')

            # Deterministic cluster assignments with the matching participant keys.
            if 'C' not in self.in_file:
                file.write("\t".join([str(key) for key in self.participant_list]) + "\n")
                file.write("\t".join([str(label) for label in self.c_assign]) + "\n")
                file.write('---C\n')
                self.in_file.append('C')

    return self.c_assign
import numpy

import bayespy.plot as bpplt
from bayespy.inference import VB
from bayespy.nodes import Categorical, CategoricalMarkovChain, Mixture

numpy.random.seed(1)

# Hidden weather chain with two states (index 0 = rainy, 1 = sunny).
# Initial state: p(rainy)=0.6, p(sunny)=0.4
a0 = [0.6, 0.4]
# Transitions: row 0 is rainy->(rainy, sunny), row 1 is sunny->(rainy, sunny)
A = [
    [0.7, 0.3],
    [0.4, 0.6],
]
N = 100
Z = CategoricalMarkovChain(a0, A, states=N)

# Emission probabilities of the three activities given the weather state.
P = [
    [0.1, 0.4, 0.5],
    [0.6, 0.3, 0.1],
]
Y = Mixture(Z, Categorical, P)

# Simulate a weather sequence and the activities it generates, then
# condition the model on the simulated activities.
weather = Z.random()
activity = Mixture(weather, Categorical, P).random()
Y.observe(activity)

Q = VB(Y, Z)
Q.update()

# Posterior over the hidden chain against the simulated truth.
bpplt.plot(Z)
bpplt.plot(1-weather, color='r', marker='x')
bpplt.pyplot.show()
# Conditional probability tables of the remaining network nodes.
lung = Mixture(smoking, Categorical, [[0.98, 0.02], [0.25, 0.75]])
bronchitis = Mixture(smoking, Categorical, [[0.97, 0.03], [0.08, 0.92]])
xray = Mixture(
    tuberculosis, Mixture, lung, Categorical,
    _or([0.96, 0.04], [0.115, 0.885]),
)
dyspnea = Mixture(
    bronchitis, Mixture, tuberculosis, Mixture, lung, Categorical,
    [
        _or([0.6, 0.4], [0.18, 0.82]),
        _or([0.11, 0.89], [0.04, 0.96]),
    ],
)

# Mark observations
tuberculosis.observe(TRUE)
smoking.observe(FALSE)
# not a "chance" observation as in the original example
bronchitis.observe(TRUE)

# Run inference
Q = VB(dyspnea, xray, bronchitis, lung, smoking, tuberculosis, asia)
Q.update(repeat=100)

# Show results: probability of the TRUE state of every node.
for _label, _node in (
        ("asia", asia),
        ("tuberculosis", tuberculosis),
        ("smoking", smoking),
        ("lung", lung),
        ("bronchitis", bronchitis),
        ("xray", xray),
        ("dyspnea", dyspnea),
):
    print("P({}):".format(_label), _node.get_moments()[0][TRUE])
def get_node_distr_over_comm(g, walks, method=None, params=None):
    """Estimate node-over-community distributions from random walks.

    Parameters
    ----------
    g : graph object
        Only ``g.number_of_nodes()`` is used (by "Nonparam_HMM").
    walks : iterable of sequences
        Random walks; node ids must be convertible with ``int``.
    method : str
        One of "HMM_param", "Nonparam_HMM" or "LDA".
    params : dict, optional
        Method parameters. All methods read ``number_of_topics``;
        "Nonparam_HMM" also reads ``prior_p0``, ``prior_t0``, ``prior_e0``;
        "LDA" also reads ``lda_alpha``, ``lda_beta``, ``lda_number_of_iters``.

    Returns
    -------
    "HMM_param": the fitted model's ``emissionprob_``.
    "Nonparam_HMM": a random draw from the fitted emission node ``E``.
    "LDA": the tuple ``(phi, comm_corpus)`` read from the GibbsLDA++ output.

    Raises
    ------
    ValueError
        If ``method`` is unknown, or the GibbsLDA++ path does not exist.
    """
    if params is None:
        params = {}

    if method == "HMM_param":
        seqs = []
        lens = []
        for walk in walks:
            # One single-feature sample per visited node; all walks are
            # concatenated and their lengths passed separately.
            s = [[int(w)] for w in walk]
            seqs.extend(s)
            lens.append(len(s))

        model = hmm.MultinomialHMM(n_components=params['number_of_topics'],
                                   tol=0.001, n_iter=5000)
        model.fit(seqs, lens)

        # The emission matrix was previously computed but never returned,
        # so this branch always yielded None.
        return model.emissionprob_

    elif method == "Nonparam_HMM":
        seqs = []
        for walk in walks:
            seqs.append([int(w) for w in walk])
        seqs = np.vstack(seqs)

        K = params['number_of_topics']  # the number of hidden states
        O = g.number_of_nodes()         # the size of observation set
        L = len(seqs[0])                # the length of each sequence
        N = len(seqs)                   # the number of sequences

        # Scalars or vectors broadcast against the ones-vectors below.
        p0 = params['prior_p0']
        t0 = params['prior_t0']
        e0 = params['prior_e0']

        p = bayes.Dirichlet(p0 * np.ones(K), name='p')
        T = bayes.Dirichlet(t0 * np.ones(K), plates=(K, ), name='T')
        E = bayes.Dirichlet(e0 * np.ones(O), plates=(K, ), name='E')

        Z = bayes.CategoricalMarkovChain(p, T, states=L, name='Z', plates=(N, ))

        # Emission/observation distribution
        X = bayes.Mixture(Z, bayes.Categorical, E, name='X')

        p.initialize_from_random()
        T.initialize_from_random()
        E.initialize_from_random()

        Q = VB(X, Z, p, T, E)
        Q['X'].observe(seqs)
        Q.update(repeat=1000)

        # NOTE(review): this is a random sample from E, not its mean;
        # confirm that a stochastic result is intended.
        likelihood = Q['E'].random()
        return likelihood

    elif method == "LDA":
        # Run GibbsLDA++
        if not os.path.exists(GIBBSLDA_PATH):
            raise ValueError("Invalid path of GibbsLDA++!")

        temp_lda_folder = os.path.join(TEMP_FOLDER, "lda_temp")
        if not os.path.exists(temp_lda_folder):
            os.makedirs(temp_lda_folder)

        temp_dfile_path = os.path.join(temp_lda_folder, "gibblda_temp.dfile")

        # Save the walks into the dfile: first the count, then one walk per line.
        n = len(walks)
        with open(temp_dfile_path, 'w') as f:
            f.write("{}\n".format(n))
            for walk in walks:
                f.write("{}\n".format(" ".join(str(w) for w in walk)))

        initial_time = time.time()

        cmd = "{} -est ".format(GIBBSLDA_PATH)
        cmd += "-alpha {} ".format(params['lda_alpha'])
        cmd += "-beta {} ".format(params['lda_beta'])
        cmd += "-ntopics {} ".format(params['number_of_topics'])
        cmd += "-niters {} ".format(params['lda_number_of_iters'])
        cmd += "-savestep {} ".format(params['lda_number_of_iters'] + 1)
        cmd += "-dfile {} ".format(temp_dfile_path)

        # NOTE(review): the command is passed to a shell as one string, so
        # paths containing spaces will break it.
        os.system(cmd)

        print("-> The LDA algorithm run in {:.2f} secs".format(time.time() - initial_time))

        # Read wordmap file: maps GibbsLDA++ word ids back to node labels.
        id2node = {}
        temp_wordmap_path = os.path.join(temp_lda_folder, "wordmap.txt")
        with open(temp_wordmap_path, 'r') as f:
            f.readline()  # skip the first line
            for line in f:
                tokens = line.strip().split()
                id2node[int(tokens[1])] = tokens[0]

        # Read phi file. ``np.float`` no longer exists in current NumPy;
        # the builtin ``float`` is the same dtype.
        num_of_nodes = len(id2node)
        phi = np.zeros(shape=(params['number_of_topics'], num_of_nodes), dtype=float)

        temp_phi_path = os.path.join(temp_lda_folder, "model-final.phi")
        with open(temp_phi_path, 'r') as f:
            for comm, line in enumerate(f):
                for word_id, value in enumerate(line.strip().split()):
                    phi[comm, int(id2node[word_id])] = float(value)

        # Read the tassign file, generate topic corpus
        temp_tassing_path = os.path.join(temp_lda_folder, "model-final.tassign")
        comm_corpus = []
        with smart_open(temp_tassing_path, 'r') as f:
            for line in f:
                tokens = line.strip().split()
                comm_corpus.append([token.split(':')[1] for token in tokens])

        return phi, comm_corpus

    else:
        raise ValueError("Wrong parameter name!")
# -----Performing inference------ # 1: Observe some nodes c = np.random.randn(10, 2) x = np.random.randn(2, 100) data = np.dot(c, x) + 0.1 * np.random.randn(10, 100) # data:10×100 Y.observe(data) #( Missing values) Y.observe(data, mask=[[True], [False], [False], [True], [True], [False], [True], [True], [True], [False]]) # 2: Choosing the inference method from bayespy.inference import VB Q = VB(Y, C, X, alpha, tau) # 3: Initializing the posterior approximation X.initialize_from_parameters(np.random.randn(1, 100, D), 10) # 4: Running the inference algorithm # Q.update() # Q.update(C, X) # Q.update(C, X, C, tau) # Q.update(repeat=10) # Q.update(repeat=1000) Q.update(repeat=10000, tol=1e-5) # C.update() #( 5 : Parameter expansion 収束が遅い時) # from bayespy.inference.vmp import transformations
# Simulate a K-state Markov chain with 2-D Gaussian emissions, then fit an
# HMM to it. q, K, N, std, mu and p0 come from earlier in the script.

# Probability of each individual off-diagonal transition.
r = (1 - q) / (K - 1)
# Transition matrix: q on the diagonal, r elsewhere. Sized by K rather than
# a hard-coded 3 so that it stays consistent with the rest of the script.
P = q * np.identity(K) + r * (np.ones((K, K)) - np.identity(K))

y = np.zeros((N, 2))
z = np.zeros(N)
state = np.random.choice(K, p=p0)
for n in range(N):
    z[n] = state
    y[n, :] = std * np.random.randn(2) + mu[state]
    state = np.random.choice(K, p=P[state])

from bayespy.nodes import Dirichlet
a0 = Dirichlet(1e-3 * np.ones(K))
A = Dirichlet(1e-3 * np.ones((K, K)))
Z = CategoricalMarkovChain(a0, A, states=N)

# Known emission precision: isotropic with standard deviation `std`.
Lambda = std**(-2) * np.identity(2)

from bayespy.nodes import Gaussian
Y = Mixture(Z, Gaussian, mu, Lambda)
Y.observe(y)

Q = VB(Y, Z, A, a0)
Q.update(repeat=1000)

# Plot the trajectory, colouring each point by the first moment of the
# mixture's first parent.
bpplt.pyplot.figure()
bpplt.pyplot.axis('equal')
colors = Y.parents[0].get_moments()[0]
bpplt.pyplot.plot(y[:, 0], y[:, 1], 'k-', zorder=-10)
bpplt.pyplot.scatter(y[:, 0], y[:, 1], c=colors, s=40)
bpplt.pyplot.show()

print(Y.parents[0].get_moments())
print(Z.random())
print(Y.parents[0].get_moments()[0])
def model(n_documents, n_topics, n_vocabulary, corpus, word_documents, plates_multiplier=1):
    '''
    Construct Latent Dirichlet Allocation model.

    Parameters
    ----------

    n_documents : int
        The number of documents

    n_topics : int
        The number of topics

    n_vocabulary : int
        The number of words in the vocabulary

    corpus : integer array
        The vocabulary index of each word in the corpus

    word_documents : integer array
        The document index of each word in the corpus

    plates_multiplier : number, optional
        Forwarded as ``plates_multiplier=(plates_multiplier,)`` to the
        topic-assignment node.

    Returns
    -------
    VB
        Inference engine over the nodes ``words``, ``topics``, ``p_word``,
        ``p_topic`` and ``word_documents``.
    '''
    n_words = len(corpus)

    # Per-document topic distributions and per-topic word distributions.
    p_topic = nodes.Dirichlet(np.ones(n_topics),
                              plates=(n_documents, ),
                              name='p_topic')
    p_word = nodes.Dirichlet(np.ones(n_vocabulary),
                             plates=(n_topics, ),
                             name='p_word')

    # The document indices are wrapped in a constant node so their value
    # can be swapped when using stochastic variational inference.
    document_index = Constant(CategoricalMoments(n_documents),
                              word_documents,
                              name='word_documents')

    # A topic for every word: gate the topic distributions by document.
    topic_distribution = nodes.Gate(document_index, p_topic)
    topics = nodes.Categorical(topic_distribution,
                               plates=(n_words, ),
                               plates_multiplier=(plates_multiplier, ),
                               name='topics')

    # Every word drawn from the vocabulary distribution of its topic.
    word_distribution = nodes.Gate(topics, p_word)
    words = nodes.Categorical(word_distribution, name='words')

    words.observe(corpus)

    # Random initialization breaks the symmetry between topics.
    p_topic.initialize_from_random()
    p_word.initialize_from_random()

    return VB(words, topics, p_word, p_topic, document_index)
def get_community_assignments_by(self, method=None, temp_dfile_file="gibbsldapp.dfile", params=None):
    """Assign nodes to communities with the chosen method.

    Parameters
    ----------
    method : str
        "HMM" or "LDA". Any other value falls through and returns None.
    temp_dfile_file : str
        File name of the GibbsLDA++ input inside ``./temp`` ("LDA" only).
        An existing file with that name is reused rather than rewritten.
    params : dict, optional
        Reads ``number_of_topics`` and, for "LDA", ``lda_alpha``,
        ``lda_beta`` and ``lda_number_of_iters``.

    Returns
    -------
    "HMM": an empty dict (the branch only fits a model on synthetic
    sequences and prints the transition node).
    "LDA": dict mapping node label to its most probable topic index.

    Raises
    ------
    ValueError
        If the GibbsLDA++ executable path does not exist ("LDA").
    """
    if params is None:
        params = {}

    if method == "HMM":
        # NOTE(review): the sequences built from self._walks are discarded;
        # ``seqs`` is rebuilt from synthetic data right below. Kept so that
        # walks are still validated as integers, as before.
        seqs = []
        lens = []
        for walk in self._walks:
            s = [int(w) - 1 for w in walk]
            seqs.append(s)
            lens.append(len(s))

        # Synthetic 2-state / 3-symbol HMM used to generate test sequences.
        # ``np.float`` no longer exists in current NumPy; ``float`` is the
        # same dtype.
        pipi = np.asarray([0.5, 0.5], dtype=float)
        AA = np.asarray([[0.2, 0.8], [0.5, 0.5]], dtype=float)
        OO = np.asarray([[0.9, 0.05, 0.05], [0.05, 0.05, 0.9]], dtype=float)

        seqs = []
        for i in range(31):
            seq = []
            s = np.random.choice(range(2), p=pipi)
            o = np.random.choice(range(3), p=OO[s, :])
            seq.append(o)
            for _ in range(59):
                s = np.random.choice(range(2), p=AA[s, :])
                o = np.random.choice(range(3), p=OO[s, :])
                seq.append(o)
            seqs.append(seq)
        seqs = np.vstack(seqs)

        from bayespy.nodes import Categorical, Mixture
        from bayespy.nodes import CategoricalMarkovChain
        from bayespy.nodes import Dirichlet
        from bayespy.inference import VB

        K = params['number_of_topics']   # the number of hidden states
        N = self.g.number_of_nodes()     # the number of observations

        D = 31       # number of synthetic sequences generated above
        states = 60  # length of each synthetic sequence

        a0 = Dirichlet(1e+1 * np.ones(K), plates=())
        # NOTE(review): plates=(2, ) matches the 2-state synthetic chain,
        # not K; confirm before using K != 2.
        A = Dirichlet(1e+1 * np.ones(K), plates=(2, ), name='A')
        P = Dirichlet(1e+1 * np.ones((K, N)))
        Z = CategoricalMarkovChain(a0, A, states=states, plates=(D, ))
        Y = Mixture(Z, Categorical, P)
        Y.observe(seqs)

        # Unused initial transition matrix; still drawn so the global
        # random stream is consumed exactly as before.
        Ainit = np.random.random((2, 2))
        Ainit = np.divide(Ainit.T, np.sum(Ainit, 1)).T

        Q = VB(Y, Z, P, A, a0)
        Q.update(repeat=1000, plot=False, verbose=True)

        print(Q['A'])

        return {}

    if method == "LDA":
        # Run GibbsLDA++
        lda_exe_path = c._GIBBSLDA_PATH
        if not os.path.exists(lda_exe_path):
            raise ValueError("Invalid path of GibbsLDA++!")

        temp_lda_folder = "./temp"
        if not os.path.exists(temp_lda_folder):
            os.makedirs(temp_lda_folder)

        temp_dfile_path = os.path.join(temp_lda_folder, temp_dfile_file)

        if not os.path.exists(temp_dfile_path):
            # Save the walks into the dfile: the count, then one walk per line.
            n = len(self._walks)
            with open(temp_dfile_path, 'w') as f:
                f.write("{}\n".format(n))
                for walk in self._walks:
                    f.write("{}\n".format(" ".join(str(w) for w in walk)))

        initial_time = time.time()

        cmd = "{} -est ".format(lda_exe_path)
        cmd += "-alpha {} ".format(params['lda_alpha'])
        cmd += "-beta {} ".format(params['lda_beta'])
        cmd += "-ntopics {} ".format(params['number_of_topics'])
        cmd += "-niters {} ".format(params['lda_number_of_iters'])
        cmd += "-savestep {} ".format(params['lda_number_of_iters'] + 1)
        cmd += "-dfile {} ".format(temp_dfile_path)

        os.system(cmd)

        print("-> The LDA algorithm run in {:.2f} secs".format(time.time() - initial_time))

        # Read wordmap file: maps GibbsLDA++ word ids back to node labels.
        id2node = {}
        temp_wordmap_path = os.path.join(temp_lda_folder, "wordmap.txt")
        with open(temp_wordmap_path, 'r') as f:
            f.readline()  # skip the first line
            for line in f:
                tokens = line.strip().split()
                id2node[int(tokens[1])] = tokens[0]

        # Read phi file: one row of word probabilities per topic.
        phi = np.zeros(shape=(params['number_of_topics'], len(id2node)), dtype=float)
        temp_phi_path = os.path.join(temp_lda_folder, "model-final.phi")
        with open(temp_phi_path, 'r') as f:
            for topicId, line in enumerate(f):
                phi[topicId, :] = [float(value) for value in line.strip().split()]

        # Most probable topic per word id, keyed by node label.
        max_topics = np.argmax(phi, axis=0)
        node2comm = {}
        for nodeId in id2node:
            node2comm[id2node[nodeId]] = max_topics[int(nodeId)]

        return node2comm
def create_model(self, model_type=None):
    """Build and fit the per-hour location model.

    One Categorical node is created for every hour. Each node observes the
    rows of ``self._observed_locations`` whose ``'time'`` equals that hour
    (if there are any) and draws its probabilities from a Dirichlet node.

    Parameters
    ----------
    model_type : str
        'all'   - a single Dirichlet with ``self.N_TIMEZONES`` plates.
        '2fold' - separate Dirichlets for hours 6-18 ("morning") and for
                  hours 0-5 plus 19-23 ("night").
        'cross' - not implemented.

    Returns
    -------
    numpy.ndarray
        Learned Dirichlet parameters, one row per hour in chronological
        order and one column per location.

    Raises
    ------
    NotImplementedError
        For ``model_type == 'cross'``.
    ValueError
        For any other unknown ``model_type``.
    """
    # One Categorical node per hour, in creation order.
    location_model = []

    def add_hour_model(theta, hour):
        """Create the node for one hour, observe its data if present, register it."""
        n_obs = self.N_OBSERVATIONS[hour]
        node = nodes.Categorical(theta, plates=(n_obs, 1), name=str(hour))
        hour_rows = self._observed_locations[self._observed_locations['time'] == hour]
        if not hour_rows.empty:
            # ``.values`` replaces Series.as_matrix(), removed in pandas 1.0.
            node.observe(hour_rows['location'].values.reshape((n_obs, 1)))
        location_model.append(node)

    if model_type == 'all':
        p_conc = nodes.DirichletConcentration(self.N_LOCATIONS)
        p_conc.initialize_from_value(np.ones(self.N_LOCATIONS))
        p_theta = nodes.Dirichlet(p_conc, plates=(self.N_TIMEZONES,), name='p_theta')

        for hour in np.arange(self.N_TIMEZONES):
            add_hour_model(p_theta[hour], hour)

        # Pass every hour node instead of 24 hard-coded indices.
        Q = VB(*(location_model + [p_theta, p_conc]))

    elif model_type == 'cross':
        # Raising a plain string is itself a TypeError in Python 3.
        raise NotImplementedError('Not Implemented')

    elif model_type == '2fold':
        p_conc_morning = nodes.DirichletConcentration(self.N_LOCATIONS)
        p_conc_night = nodes.DirichletConcentration(self.N_LOCATIONS)
        p_conc_morning.initialize_from_value(np.ones(self.N_LOCATIONS))
        p_conc_night.initialize_from_value(np.ones(self.N_LOCATIONS))

        morning_time = np.arange(6, 19)
        night_time = np.append(np.arange(0, 6), np.arange(19, 24))

        p_theta_morning = nodes.Dirichlet(p_conc_morning, plates=(morning_time.size,), name='p_theta_morning')
        p_theta_night = nodes.Dirichlet(p_conc_night, plates=(night_time.size,), name='p_theta_night')

        # Morning hours first, then night hours (same order as before).
        for count, hour in enumerate(morning_time):
            add_hour_model(p_theta_morning[count], hour)
        for count, hour in enumerate(night_time):
            add_hour_model(p_theta_night[count], hour)

        Q = VB(*(location_model + [p_theta_morning, p_theta_night,
                                   p_conc_morning, p_conc_night]))

    else:
        raise ValueError('no model_type selected')

    print ("models created")

    # Learning parameters
    Q.update(repeat=1000)
    print ('learned params')

    if model_type == 'all':
        return np.array(p_theta.get_parameters()).reshape((self.N_TIMEZONES, self.N_LOCATIONS))

    # model_type == '2fold': interleave back into chronological order,
    # i.e. night hours 0-5, morning hours 6-18, night hours 19-23.
    learned_night = np.array(p_theta_night.get_parameters()).reshape((night_time.size, self.N_LOCATIONS))
    learned_morn = np.array(p_theta_morning.get_parameters()).reshape((morning_time.size, self.N_LOCATIONS))
    return np.vstack((learned_night[:6, :], learned_morn, learned_night[6:, :]))
# Dirichlet-categorical demo: draw N category labels, fit the category
# probabilities with VB, then print a density evaluation and a posterior draw.
# `np`, `bayes` and `VB` are imported elsewhere in the script.
print("++++++++++++++++++++++++++")
N = 10000
# N labels in {0, 1, 2} drawn with probabilities 0.3 / 0.6 / 0.1.
y = np.random.choice(3, size=N, p=[0.3, 0.6, 0.1])
# Dirichlet prior concentration, one entry per category.
a0 = [0.5, 0.1, 0.1]
# mu0 and lambda0 are only referenced by the commented-out Gaussian
# experiment below.
mu0 = -1
lambda0 = 5
#MU = bayes.Gaussian(mu=mu0, Lambda=0.9)
#X = bayes.Gaussian(mu=0.2, Lambda=0.4, plates=(N, ))
P = bayes.Dirichlet(a0)
X = bayes.Categorical(P, plates=(N, ))
#P.initialize_from_random()
Q = VB(X, P)
X.observe(y)
Q.update(repeat=1000)
# NOTE(review): X has plates (N, ) with N = 10000 but pdf() is given 10
# values; confirm this is the intended call.
print(X.pdf([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))
print(P.random())
#print(np.sum(y==2))
from bayespy.inference import VB
from bayespy.nodes import (
    Categorical,
    Dirichlet,
    Gaussian,
    Mixture,
    Wishart,
)

# Four 2-D Gaussian clusters with 50 samples each, drawn in this order.
y0 = np.random.multivariate_normal([0, 0], [[2, 0], [0, 0.1]], size=50)
y1 = np.random.multivariate_normal([0, 0], [[0.1, 0], [0, 2]], size=50)
y2 = np.random.multivariate_normal([2, 2], [[2, -1.5], [-1.5, 2]], size=50)
y3 = np.random.multivariate_normal([-2, -2], [[0.5, 0], [0, 0.5]], size=50)
y = np.concatenate((y0, y1, y2, y3), axis=0)

N = 200  # number of samples
D = 2    # data dimensionality
K = 10   # number of mixture components in the model

# Mixture weights and per-sample component assignments.
alpha = Dirichlet(1e-5 * np.ones(K), name='alpha')
Z = Categorical(alpha, plates=(N, ), name='z')

# Component means and precision matrices.
mu = Gaussian(np.zeros(D), 1e-5 * np.identity(D), plates=(K, ), name='mu')
Lambda = Wishart(D, 1e-5 * np.identity(D), plates=(K, ), name='Lambda')

Y = Mixture(Z, Gaussian, mu, Lambda, name='Y')

# Random assignments break the symmetry between components.
Z.initialize_from_random()

Q = VB(Y, mu, Lambda, Z, alpha)
Y.observe(y)
Q.update(repeat=1000)

bpplt.gaussian_mixture_2d(Y, alpha=alpha, scale=2)
import copy

import numpy as np

import bayespy.plot as bpplt
from bayespy.inference import VB

# Template network: hidden2 -> hidden1, each with one Gaussian observation.
hidden2 = Categorical((0.7, 0.3))
hidden1 = Mixture(hidden2, Categorical, ((0.6, 0.4), (0.1, 0.9)))
observed1 = Mixture(hidden1, Gaussian, ([-1.0], [0.9]), ([[1.3]], [[0.8]]))
observed2 = Mixture(hidden2, Gaussian, ([-0.9], [1.1]), ([[1.2]], [[0.7]]))

# The templates stay unobserved; every experiment works on a deep copy.
_templates = (observed1, observed2, hidden1, hidden2)

# Experiment 1: observations -1.2 and 1.2, tight tolerance.
observed_1, observed_2, hidden_1, hidden_2 = copy.deepcopy(_templates)
observed_1.observe((-1.2, ))
observed_2.observe((1.2, ))
Q = VB(hidden_1, hidden_2, observed_1, observed_2, tol=1e-10)
Q.update(repeat=100)
print(hidden_1.get_moments())
print(hidden_2.get_moments())

# Experiment 2: observations -0.2 and 1.2, default tolerance.
observed_1, observed_2, hidden_1, hidden_2 = copy.deepcopy(_templates)
observed_1.observe((-0.2, ))
observed_2.observe((1.2, ))
Q = VB(hidden_1, hidden_2, observed_1, observed_2)
Q.update(repeat=100)
print(hidden_1.get_moments())
print(hidden_2.get_moments())

# Fresh, unobserved copies for whatever follows.
observed_1, observed_2, hidden_1, hidden_2 = copy.deepcopy(_templates)
import numpy as np

import bayespy.plot as bpplt
from bayespy.inference import VB
from bayespy.nodes import GaussianARD, Gamma

np.random.seed(1)

# Ten samples from a normal distribution (loc 5, scale 10).
data = np.random.normal(5, 10, size=(10, ))

# Unknown mean and precision with vague priors.
mu = GaussianARD(0, 1e-6)
tau = Gamma(1e-6, 1e-6)
y = GaussianARD(mu, tau, plates=(10, ))
y.observe(data)

Q = VB(mu, tau, y)
Q.update(repeat=20)

# Posterior densities of the mean (top) and the precision (bottom).
mu_grid = np.linspace(-10, 20, num=100)
tau_grid = np.linspace(1e-6, 0.08, num=100)

bpplt.pyplot.subplot(2, 1, 1)
bpplt.pdf(mu, mu_grid, color='k', name=r'\mu')
bpplt.pyplot.subplot(2, 1, 2)
bpplt.pdf(tau, tau_grid, color='k', name=r'\tau')
bpplt.pyplot.tight_layout()
bpplt.pyplot.show()
# Network nodes; the printed labels below give the meaning of each letter.
A = Categorical([0.5, 0.5])
T = Mixture(A, Categorical, [[0.99, 0.01], [0.8, 0.2]])
S = Categorical([0.5, 0.5])
L = Mixture(S, Categorical, [[0.98, 0.02], [0.75, 0.25]])
B = Mixture(S, Categorical, [[0.97, 0.03], [0.70, 0.30]])
X = Mixture(
    T, Mixture, L, Categorical,
    _or([0.96, 0.04], [0.115, 0.885]),
)
D = Mixture(
    B, Mixture, X, Categorical,
    _or([0.115, 0.885], [0.04, 0.96]),
)

# Evidence.
T.observe(TRUE)
S.observe(FALSE)
B.observe(TRUE)

Q = VB(A, T, S, L, B, X, D)
Q.update(repeat=100)

# Probability of the TRUE state of every node.
for _label, _node in (
        ("asia", A),
        ("tuberculosis", T),
        ("smoking", S),
        ("lung", L),
        ("bronchitis", B),
        ("xray", X),
        ("dyspnea", D),
):
    print("P({}): ".format(_label), _node.get_moments()[0][TRUE])
import numpy
import numpy as np

import bayespy.plot as bpplt
from bayespy.inference import VB
from bayespy.nodes import (
    Bernoulli,
    Beta,
    Categorical,
    Dirichlet,
    Mixture,
)
from bayespy.utils import random

numpy.random.seed(1)

# Three ground-truth Bernoulli parameter vectors, one per true cluster.
p0 = [0.1, 0.9, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9]
p1 = [0.1, 0.1, 0.1, 0.1, 0.1, 0.9, 0.9, 0.9, 0.9, 0.9]
p2 = [0.9, 0.9, 0.9, 0.9, 0.9, 0.1, 0.1, 0.1, 0.1, 0.1]
p = np.array([p0, p1, p2])

# A cluster label per sample, then the binary observations.
z = random.categorical([1 / 3, 1 / 3, 1 / 3], size=100)
x = random.bernoulli(p[z])

N = 100  # number of samples
D = 10   # number of binary features
K = 10   # number of clusters in the model

# Cluster weights, assignments and per-cluster Bernoulli probabilities.
R = Dirichlet([1e-5] * K, name='R')
Z = Categorical(R, plates=(N, 1), name='Z')
P = Beta([0.5, 0.5], plates=(D, K), name='P')
X = Mixture(Z, Bernoulli, P)

Q = VB(Z, R, X, P)
# Random initialisation of P breaks the symmetry between clusters.
P.initialize_from_random()
X.observe(x)
Q.update(repeat=1000)

bpplt.hinton(P)
bpplt.pyplot.show()
# Linear state-space model followed by synthetic data generation.
# D, N, M and the node classes (Gamma, GaussianARD, GaussianMarkovChain,
# Dot) are defined or imported earlier in the script.

# ARD prior on the state dynamics matrix A.
alpha = Gamma(1e-5, 1e-5, plates=(D, ), name='alpha')
A = GaussianARD(0, alpha, shape=(D, ), plates=(D, ), name='A')
# Latent Markov chain with n=N time steps.
X = GaussianMarkovChain(np.zeros(D), 1e-3 * np.identity(D), A, np.ones(D), n=N, name='X')
# ARD prior on the loading matrix C.
gamma = Gamma(1e-5, 1e-5, plates=(D, ), name='gamma')
C = GaussianARD(0, gamma, shape=(D, ), plates=(M, 1), name='C')
# Noiseless signal: dot product of loadings and latent states.
F = Dot(C, X, name='F')
C.initialize_from_random()
# Observation noise precision and the observed node.
tau = Gamma(1e-5, 1e-5, name='tau')
Y = GaussianARD(F, tau, name='Y')
from bayespy.inference import VB
Q = VB(X, C, gamma, A, alpha, tau, Y)
# Ground-truth dynamics for the synthetic data: a rotation by angle w in
# the first two dimensions, identity in the third and zero in the fourth.
w = 0.3
a = np.array([[np.cos(w), -np.sin(w), 0, 0], [np.sin(w), np.cos(w), 0, 0], [0, 0, 1, 0], [0, 0, 0, 0]])
# True loadings; the synthetic latent space is fixed at 4 dimensions.
c = np.random.randn(M, 4)
x = np.empty((N, 4))
f = np.empty((M, N))
y = np.empty((M, N))
# Initial state and its noisy observation.
x[0] = 10 * np.random.randn(4)
f[:, 0] = np.dot(c, x[0])
y[:, 0] = f[:, 0] + 3 * np.random.randn(M)
# Roll the dynamics forward; innovation scale is [1, 1, 10, 10] per
# dimension and observation noise has standard deviation 3.
for n in range(N - 1):
    x[n + 1] = np.dot(a, x[n]) + [1, 1, 10, 10] * np.random.randn(4)
    f[:, n + 1] = np.dot(c, x[n + 1])
    y[:, n + 1] = f[:, n + 1] + 3 * np.random.randn(M)
# Bayesian linear regression of y on a single feature column, then the
# node's moments are turned into a column of x1.
# y, x2, x1 and buffer_data are defined earlier in the script.

# Flatten the targets and make the feature a one-column 2-D array.
y = y.reshape(y.shape[0], )
X = x2.reshape(x2.shape[0], 1)
from bayespy.nodes import GaussianARD
B = GaussianARD(0, 1e-6, shape=(X.shape[1], ))
from bayespy.nodes import SumMultiply
F = SumMultiply('i,i', B, X)
from bayespy.nodes import Gamma
tau = Gamma(1e-3, 1e-3)
Y = GaussianARD(F, tau)
Y.observe(y)
from bayespy.inference import VB
Q = VB(Y, B, tau)
# NOTE(review): the update call is commented out, so no VB iterations are
# run before the moments are read below; confirm this is intended.
#Q.update(repeat=100990)
distribution = []
result = []
distribution = F.get_moments()
# NOTE(review): the two entries returned by get_moments() are averaged as
# if they were a min/max pair; verify what they represent for this node.
for min_val, max_val in zip(distribution[0], distribution[1]):
    #mean = []
    mean = (min_val + max_val) / 2
    result.append(mean)
    #result = mean
#x3 = []
#x3 = pd.DataFrame({result:buffer_data})
#x1 = x1.append(x3)
# Store the per-row values under the key buffer_data and show the result.
x1[buffer_data] = result
print(x1)
def fit(self, X, y):
    """Fit Multivariate Gaussian model per class using Variational Inference.

    For every distinct label in ``y``, the rows of ``X`` carrying that label
    are modelled as Gaussian observations with a Gaussian prior on the mean
    and a Wishart prior on the precision. After running VB, the entry
    ``[mean, covariance, class_fraction]`` is appended to ``self.models_``.

    Parameters
    ----------
    X : {array-like}, shape = [n_samples,n_features]
        Training data
    y : array-like, shape = [n_samples]
        Target values

    Returns
    -------
    self : returns an instance of self.
    """
    # Accept any array-like input, as the docstring promises.
    X = np.asarray(X)
    y = np.asarray(y)

    n_samples, n_features = X.shape
    classes_ = np.unique(y)
    n_classes_ = len(classes_)
    # Floor division keeps the estimator count an integer on Python 3
    # (plain ``/`` would store a float in ``self.n_estimators``).
    n_estimators = n_features // n_classes_

    self.models_ = []
    for label in classes_:
        # Rows of X that belong to the current class.
        L = X[y == label, :]
        N, D = L.shape

        Lambda = nodes.Wishart(D, np.identity(D))
        mu = nodes.Gaussian(np.zeros(D), np.identity(D))
        x = nodes.Gaussian(mu, Lambda, plates=(N, ))
        x.observe(L)

        Q = VB(x, mu, Lambda)
        Q.update(repeat=2000, tol=0, verbose=False)

        # ``u[0]`` is the first stored moment of each node; the covariance is
        # taken as the inverse of the precision node's first moment.
        cov = np.linalg.inv(Lambda.u[0])
        m = mu.u[0]
        self.models_.append([m, cov, float(N) / n_samples])

    if self.weight_class:
        # NOTE(review): bincount indexes by label value, so these weights
        # line up with ``classes_`` only when the labels are exactly
        # 0..n_classes_-1 — confirm against callers.
        self.w = n_samples / (n_classes_ * np.bincount(np.asarray(y, dtype=int)))
    else:
        self.w = np.ones(n_classes_)

    self.n_classes_ = n_classes_
    self.classes_ = classes_
    self.n_estimators = n_estimators
    return self
import numpy as np

import bayespy.plot as bpplt
from bayespy.inference import VB
from bayespy.inference.vmp.transformations import (
    RotateGaussianARD,
    RotationOptimizer,
)
from bayespy.nodes import Gamma, GaussianARD, SumMultiply

# Seed NumPy's global random generator.
np.random.seed(1)

# Sizes: M observed dimensions, N samples, D latent dimensions in the model
# (the data below is generated from only 2 latent dimensions).
M = 20
N = 100
D = 10

# Synthetic data: f[i, j] = sum_k w[i, k] * x[j, k], plus noise scaled by 0.1.
x = np.random.randn(N, 2)
w = np.random.randn(M, 2)
f = np.einsum('ik,jk->ij', w, x)
y = f + 0.1 * np.random.randn(M, N)

# Model nodes: latent X, loadings C with Gamma precisions alpha, their
# product F, and the observed Y with Gamma-distributed precision tau.
X = GaussianARD(0, 1, plates=(1, N), shape=(D, ))
alpha = Gamma(1e-5, 1e-5, plates=(D, ))
C = GaussianARD(0, alpha, plates=(M, 1), shape=(D, ))
F = SumMultiply('d,d->', X, C)
tau = Gamma(1e-5, 1e-5)
Y = GaussianARD(F, tau)
Y.observe(y)

# Inference engine; C gets a random initialization.
Q = VB(Y, X, C, alpha, tau)
C.initialize_from_random()

# Register the rotation optimizer as the VB callback, then iterate.
rot_X = RotateGaussianARD(X)
rot_C = RotateGaussianARD(C, alpha)
R = RotationOptimizer(rot_X, rot_C, D)
Q.set_callback(R.rotate)
Q.update(repeat=1000)

# Plot F, then overlay the noiseless signal f as red crosses.
bpplt.plot(F)
bpplt.plot(f, color='r', marker='x', linestyle='None')
# Categorical Markov chain model with categorical emissions, fitted with VB,
# followed by diagnostic prints.
# NOTE(review): bayes, VB, np, p_param, t0, e0, K, E, L, N and y are defined
# outside this fragment.
p = bayes.Dirichlet(p_param, name='p')

# Builtin ``float`` replaces the ``np.float`` alias, which NumPy deprecated in
# 1.20 and removed in 1.24; the resulting dtype is float64 either way.
t_param = t0 * np.ones(K, dtype=float)
T = bayes.Dirichlet(t_param, plates=(K, ), name='T')

# ``E`` is used as an array length here and is rebound to the Dirichlet node
# on the following line.
e_param = e0 * np.ones(E, dtype=float)
E = bayes.Dirichlet(e_param, plates=(K, ), name='E')

z = bayes.CategoricalMarkovChain(p, T, states=L, plates=(N, ), name='Z')
x = bayes.Mixture(z, bayes.Categorical, E, plates=(N, L), name='X')

# Random starting points for all three Dirichlet nodes.
p.initialize_from_random()
T.initialize_from_random()
E.initialize_from_random()

Q = VB(x, z, E, T, p)
x.observe(y)
Q.update(repeat=1000)

# Diagnostics: the first 25 entries of y[1] next to the argmax (over axis 1)
# of the matching slice of the first moment of x's first parent.
print("---------------------")
print(np.array(y[1][:25]))
print(np.argmax(x.parents[0].get_moments()[0][1], axis=1)[:25])
print("---------------------")
# Moments of z's second parent, each followed by a separator.
for u in z.parents[1].get_moments():
    print(u)
    print("++")
print("zzzzzzz")
print(x.parents[1].get_moments()[0])
print(E)