kf = theano.function(...,  # input list elided in the source
                     outputs=[s, P, ll],
                     mode=theano.Mode(optimizer="unsafe"))
s, P, ll = kf(Y, 2 * np.ones(m))

import pymc3 as pm

with pm.Model() as model:
    # Phi, Q, L, c, H, Sv, d, s0, P0, n, m, g
    phi = pm.Normal("phi", shape=(1, 1))
    q = pm.HalfStudentT("q", nu=1.0, sd=2.0, shape=(1, 1))
    # KalmanFilter is a custom PyMC3 distribution (defined elsewhere)
    K = KalmanFilter("kf", phi, q,
                     np.array([[1.]]), np.array([0.]), np.array([[1.]]),
                     np.array([[0.0]]), np.array([0.]), np.array([0.]),
                     np.array([[10.]]), 1, 1, 1, observed=y)

with model:
    approx = pm.fit(n=100, method="advi")  # must run before sampling the approximation
    trace = pm.sample_approx(approx, draws=500)
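# For reference, a minimal NumPy simulation of the linear-Gaussian state-space
# model that the KalmanFilter arguments above conventionally parameterize:
#     x[t] = c + Phi @ x[t-1] + L @ w[t],  w[t] ~ N(0, Q)   (state)
#     y[t] = d + H @ x[t] + v[t],          v[t] ~ N(0, Sv)  (observation)
# This mapping of names to roles is an assumption, not confirmed by the source.
import numpy as np

rng = np.random.default_rng(0)
Phi, Q, L = np.array([[1.]]), np.array([[0.1]]), np.array([[1.]])
c, H, Sv = np.array([0.]), np.array([[1.]]), np.array([[0.5]])
d, s0, P0 = np.array([0.]), np.array([0.]), np.array([[10.]])

x = rng.multivariate_normal(s0, P0)
ys = []
for _ in range(100):
    x = c + Phi @ x + L @ rng.multivariate_normal(np.zeros(1), Q)
    ys.append(d + H @ x + rng.multivariate_normal(np.zeros(1), Sv))
Y_sim = np.concatenate(ys)  # a series a compiled filter like `kf` could consume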
from collections import OrderedDict
from copy import deepcopy

import numpy as np
import pymc3 as pm
import pymc3.math as pmmath
import theano
import theano.tensor as tt
from theano import shared
from theano.sandbox.rng_mrg import MRG_RandomStreams

# `Dirichlet` and `t_stick_breaking` are assumed to be defined elsewhere
# (in the PyMC3 LDA example they alias pm.Dirichlet and a stick-breaking
# transform with a small jitter term).


def train_pymc3(docs_te, docs_tr, n_samples_te, n_samples_tr, n_words,
                n_topics, n_tokens):
    """
    Return: PyMC3 LDA results

    Parameters:
        docs_tr: training documents (processed)
        docs_te: testing documents (processed)
        n_samples_te: number of testing docs
        n_samples_tr: number of training docs
        n_words: size of vocabulary
        n_topics: number of topics to learn
        n_tokens: number of non-zero datapoints in the processed training
            term-frequency matrix
    """

    # Log-likelihood of documents for LDA
    def logp_lda_doc(beta, theta):
        """Returns the log-likelihood function for given documents.

        K : number of topics in the model
        V : number of words (size of vocabulary)
        D : number of documents (in a mini-batch)

        Parameters
        ----------
        beta : tensor (K x V)
            Word distributions.
        theta : tensor (D x K)
            Topic distributions for the documents.
        """
        def ll_docs_f(docs):
            dixs, vixs = docs.nonzero()
            vfreqs = docs[dixs, vixs]
            ll_docs = vfreqs * pmmath.logsumexp(
                tt.log(theta[dixs]) + tt.log(beta.T[vixs]), axis=1).ravel()

            # Per-word log-likelihood times the number of tokens in the
            # whole dataset
            return tt.sum(ll_docs) / (tt.sum(vfreqs) + 1e-9) * n_tokens

        return ll_docs_f

    # Fit the PyMC3 LDA. The dataset is sparse, so it is better to use a
    # dense minibatch in which all words can occur.
    minibatch_size = 128

    # define the minibatch
    doc_t_minibatch = pm.Minibatch(docs_tr.toarray(), minibatch_size)
    doc_t = shared(docs_tr.toarray()[:minibatch_size])

    with pm.Model() as model:
        theta = Dirichlet(
            'theta',
            a=pm.floatX((1.0 / n_topics) * np.ones((minibatch_size, n_topics))),
            shape=(minibatch_size, n_topics),
            transform=t_stick_breaking(1e-9),
            # do not forget the scaling
            total_size=n_samples_tr)
        beta = Dirichlet(
            'beta',
            a=pm.floatX((1.0 / n_topics) * np.ones((n_topics, n_words))),
            shape=(n_topics, n_words),
            transform=t_stick_breaking(1e-9))
        # Note that the likelihood is defined with scaling, so no additional
        # `total_size` kwarg is needed here.
        doc = pm.DensityDist('doc', logp_lda_doc(beta, theta), observed=doc_t)

    # Encoder
    class LDAEncoder:
        """Encode (term-frequency) document vectors to variational means
        and (log-transformed) stds.
""" def __init__(self, n_words, n_hidden, n_topics, p_corruption=0, random_seed=1): rng = np.random.RandomState(random_seed) self.n_words = n_words self.n_hidden = n_hidden self.n_topics = n_topics self.w0 = shared(0.01 * rng.randn(n_words, n_hidden).ravel(), name='w0') self.b0 = shared(0.01 * rng.randn(n_hidden), name='b0') self.w1 = shared(0.01 * rng.randn(n_hidden, 2 * (n_topics - 1)).ravel(), name='w1') self.b1 = shared(0.01 * rng.randn(2 * (n_topics - 1)), name='b1') self.rng = MRG_RandomStreams(seed=random_seed) self.p_corruption = p_corruption def encode(self, xs): if 0 < self.p_corruption: dixs, vixs = xs.nonzero() mask = tt.set_subtensor( tt.zeros_like(xs)[dixs, vixs], self.rng.binomial(size=dixs.shape, n=1, p=1 - self.p_corruption)) xs_ = xs * mask else: xs_ = xs w0 = self.w0.reshape((self.n_words, self.n_hidden)) w1 = self.w1.reshape((self.n_hidden, 2 * (self.n_topics - 1))) hs = tt.tanh(xs_.dot(w0) + self.b0) zs = hs.dot(w1) + self.b1 zs_mean = zs[:, :(self.n_topics - 1)] zs_rho = zs[:, (self.n_topics - 1):] return {'mu': zs_mean, 'rho': zs_rho} def get_params(self): return [self.w0, self.b0, self.w1, self.b1] # call Encoder encoder = LDAEncoder(n_words=n_words, n_hidden=100, n_topics=n_topics, p_corruption=0.0) local_RVs = OrderedDict([(theta, encoder.encode(doc_t))]) # get parameters encoder_params = encoder.get_params() # Train pymc3 Model η = .1 s = shared(η) def reduce_rate(a, h, i): s.set_value(η / ((i / minibatch_size) + 1)**.7) with model: approx = pm.MeanField(local_rv=local_RVs) approx.scale_cost_to_minibatch = False inference = pm.KLqp(approx) inference.fit(10000, callbacks=[reduce_rate], obj_optimizer=pm.sgd(learning_rate=s), more_obj_params=encoder_params, total_grad_norm_constraint=200, more_replacements={doc_t: doc_t_minibatch}) # Extracting characteristic words doc_t.set_value(docs_tr.toarray()) samples = pm.sample_approx(approx, draws=100) beta_pymc3 = samples['beta'].mean(axis=0) # Predictive distribution def calc_pp(ws, thetas, beta, wix): """ Parameters ---------- ws: ndarray (N,) Number of times the held-out word appeared in N documents. thetas: ndarray, shape=(N, K) Topic distributions for N documents. beta: ndarray, shape=(K, V) Word distributions for K topics. wix: int Index of the held-out word Return ------ Log probability of held-out words. """ return ws * np.log(thetas.dot(beta[:, wix])) def eval_lda(transform, beta, docs_te, wixs): """Evaluate LDA model by log predictive probability. Parameters ---------- transform: Python function Transform document vectors to posterior mean of topic proportions. wixs: iterable of int Word indices to be held-out. 
""" lpss = [] docs_ = deepcopy(docs_te) thetass = [] wss = [] total_words = 0 for wix in wixs: ws = docs_te[:, wix].ravel() if 0 < ws.sum(): # Hold-out docs_[:, wix] = 0 # Topic distributions thetas = transform(docs_) # Predictive log probability lpss.append(calc_pp(ws, thetas, beta, wix)) docs_[:, wix] = ws thetass.append(thetas) wss.append(ws) total_words += ws.sum() else: thetass.append(None) wss.append(None) # Log-probability lp = np.sum(np.hstack(lpss)) / total_words return {'lp': lp, 'thetass': thetass, 'beta': beta, 'wss': wss} inp = tt.matrix(dtype='int64') sample_vi_theta = theano.function([inp], approx.sample_node( approx.model.theta, 100, more_replacements={ doc_t: inp }).mean(0)) def transform_pymc3(docs): return sample_vi_theta(docs) result_pymc3 = eval_lda(transform_pymc3, beta_pymc3, docs_te.toarray(), np.arange(100)) print('Predictive log prob (pm3) = {}'.format(result_pymc3['lp'])) return result_pymc3
def variational_inference(X_train, Y_train, X_test, Y_test, m, k):
    import numpy as np
    import pymc3 as pm
    from sklearn.preprocessing import MinMaxScaler
    import theano
    import matplotlib.pyplot as plt
    import random

    # Load the input data and rescale it
    n, p = np.shape(X_train)
    Y_train = np.reshape(Y_train, (len(Y_train), 1))
    Y_test = np.reshape(Y_test, (len(Y_test), 1))
    scaler_x = MinMaxScaler(feature_range=(-1, 1))
    X_train = scaler_x.fit_transform(X_train)
    X_test = scaler_x.transform(X_test)
    scaler_y = MinMaxScaler(feature_range=(0, 1))
    Y_train = scaler_y.fit_transform(Y_train)
    Y_test = scaler_y.transform(Y_test)
    X_train = theano.shared(X_train)

    # Add noise (disabled)
    #sigma = 0.1
    #rd_num = int(sigma * len(Y_train))
    #rd = random.sample(range(len(Y_train)), rd_num)
    #sm = np.random.uniform(-0.1, 0, size=rd_num)
    #Y_train = np.ravel(Y_train)
    #Y_train[rd] = sm

    # Define the model
    basic_model = pm.Model()
    with basic_model:
        b = pm.Normal('b', mu=0, tau=1)
        A = pm.Normal('A', mu=0, tau=1, shape=(p, m))
        gamma_0 = pm.Gamma('gamma_0', alpha=10**(-5), beta=10**(-5))
        gamma_1 = pm.Gamma('gamma_1', alpha=10**(-5), beta=10**(-5))
        beta = pm.Normal('beta', mu=0, tau=gamma_0, shape=(m, 1))
        # `sigmoid_kernel` is defined elsewhere; see the sketch below this
        # function.
        Y_obs = pm.Normal('Y_obs', mu=sigmoid_kernel(X_train, beta, A, b),
                          tau=gamma_1, observed=Y_train)
        start = pm.find_MAP()
        #approx = pm.fit(k, start=start, obj_optimizer=pm.adam(), callbacks=[tracker])
        approx = pm.fit(k, start=start, obj_optimizer=pm.adam())

        # Draw samples of the parameters z = {beta, A, b, gamma_0, gamma_1}
        # from the fitted approximation
        trace = pm.sample_approx(approx=approx, draws=5000)
        #pm.traceplot(trace)
        #pm.summary(trace)

    # Take the mean of 5000 posterior predictive draws as the final prediction
    post_pred = pm.sample_ppc(trace, samples=5000, model=basic_model)
    y_train_pred = np.mean(post_pred['Y_obs'], axis=0)

    # Compare the predictions against the actual values
    mse_train = (((y_train_pred - Y_train)**2).sum()) / np.size(Y_train, 0)

    X_train.set_value(X_test)
    post_pred = pm.sample_ppc(trace, samples=5000, model=basic_model)
    y_test_pred = np.mean(post_pred['Y_obs'], axis=0)
    mse_test = (((y_test_pred - Y_test)**2).sum()) / np.size(Y_test, 0)

    # Coefficient of determination (R^2) on the test set
    Y_mean = np.ones_like(Y_test) * np.mean(Y_test)
    r2 = 1 - (((y_test_pred - Y_test)**2).sum()) / (((Y_test - Y_mean)**2).sum())

    # Explained variance on the test set
    n = len(Y_test)
    err = Y_test - y_test_pred
    err_mean = np.ones_like(err) * np.mean(err)
    err_var = (((err - err_mean)**2).sum()) / (n - 1)
    y_var = (((Y_test - Y_mean)**2).sum()) / (n - 1)
    Evar = 1 - err_var / y_var

    #print('mse_train=', mse_train, '\n mse_test=', mse_test, '\n r2=', r2, '\n Evar=', Evar, '\n m=', m)
    return mse_train, mse_test, r2, Evar, m
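# `sigmoid_kernel` is called above but not defined in this snippet. A plausible
# sketch, assuming it denotes a single-hidden-layer network
# sigmoid(X @ A + b) @ beta (an assumption, not confirmed by the source):
import theano.tensor as tt

def sigmoid_kernel(X, beta, A, b):
    # X: (n, p) inputs; A: (p, m) hidden-layer weights; b: scalar bias;
    # beta: (m, 1) output weights -> returns an (n, 1) mean for Y_obs
    return tt.dot(tt.nnet.sigmoid(tt.dot(X, A) + b), beta)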
def run_lda(args):
    tf_vectorizer, docs_tr, docs_te = prepare_sparse_matrix_nonlabel(
        args.n_tr, args.n_te, args.n_word)
    feature_names = tf_vectorizer.get_feature_names()
    doc_tr_minibatch = pm.Minibatch(docs_tr.toarray(), args.bsz)
    doc_tr = shared(docs_tr.toarray()[:args.bsz])

    def log_prob(beta, theta):
        """Returns the log-likelihood function for given documents.

        K : number of topics in the model
        V : number of words (size of vocabulary)
        D : number of documents (in a mini-batch)

        Parameters
        ----------
        beta : tensor (K x V)
            Word distributions.
        theta : tensor (D x K)
            Topic distributions for documents.
        """
        def ll_docs_f(docs):
            dixs, vixs = docs.nonzero()
            vfreqs = docs[dixs, vixs]
            ll_docs = (vfreqs * pmmath.logsumexp(
                tt.log(theta[dixs]) + tt.log(beta.T[vixs]), axis=1).ravel())
            return tt.sum(ll_docs) / (tt.sum(vfreqs) + 1e-9)

        return ll_docs_f

    with pm.Model() as model:
        beta = Dirichlet(
            "beta",
            a=pm.floatX((1. / args.n_topic) * np.ones((args.n_topic,
                                                       args.n_word))),
            shape=(args.n_topic, args.n_word))
        theta = Dirichlet(
            "theta",
            a=pm.floatX((10. / args.n_topic) * np.ones((args.bsz,
                                                        args.n_topic))),
            shape=(args.bsz, args.n_topic),
            total_size=args.n_tr)
        doc = pm.DensityDist("doc", log_prob(beta, theta), observed=doc_tr)

    encoder = ThetaEncoder(n_words=args.n_word, n_hidden=100,
                           n_topics=args.n_topic)
    local_RVs = OrderedDict([(theta, encoder.encode(doc_tr))])
    encoder_params = encoder.get_params()

    s = shared(args.lr)

    def reduce_rate(a, h, i):
        s.set_value(args.lr / ((i / args.bsz) + 1) ** 0.7)

    with model:
        approx = pm.MeanField(local_rv=local_RVs)
        approx.scale_cost_to_minibatch = False
        inference = pm.KLqp(approx)
        inference.fit(
            args.n_iter,
            callbacks=[reduce_rate,
                       pm.callbacks.CheckParametersConvergence(diff="absolute")],
            obj_optimizer=pm.adam(learning_rate=s),
            more_obj_params=encoder_params,
            total_grad_norm_constraint=200,
            more_replacements={doc_tr: doc_tr_minibatch})

    doc_tr.set_value(docs_tr.toarray())

    inp = tt.matrix(dtype="int64")
    sample_vi_theta = theano.function(
        [inp],
        approx.sample_node(approx.model.theta, args.n_sample,
                           more_replacements={doc_tr: inp}))

    test = docs_te.toarray()
    test_n = test.sum(1)

    beta_pymc3 = pm.sample_approx(approx, draws=args.n_sample)['beta']
    theta_pymc3 = sample_vi_theta(test)
    assert beta_pymc3.shape == (args.n_sample, args.n_topic, args.n_word)
    assert theta_pymc3.shape == (args.n_sample, args.n_te, args.n_topic)

    beta_mean = beta_pymc3.mean(0)
    theta_mean = theta_pymc3.mean(0)
    pred_rate = theta_mean.dot(beta_mean)
    pp_test = (test * np.log(pred_rate)).sum(1) / test_n

    posteriors = {'theta': theta_pymc3, 'beta': beta_pymc3}

    log_top_words(beta_pymc3.mean(0), feature_names,
                  n_top_words=args.n_top_word)
    save_elbo(approx.hist)
    save_pp(pp_test)
    save_draws(posteriors)
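# `pp_test` above is a per-word log predictive probability for each held-out
# document. A common scalar summary is corpus perplexity; this helper is a
# sketch using the standard definition (not part of the original pipeline):
import numpy as np

def perplexity(pp_per_doc, doc_lengths):
    """exp(-(total held-out log-likelihood) / (total token count))."""
    total_ll = (pp_per_doc * doc_lengths).sum()
    return np.exp(-total_ll / doc_lengths.sum())

# e.g. perplexity(pp_test, test_n) after the evaluation step in run_lda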
local_RVs = OrderedDict([(theta, encoder.encode(counts_share))])
encoder_params = encoder.get_params()

with lda_model:
    approx1 = pm.fit(
        6000,
        method='advi',
        local_rv=local_RVs,
        more_obj_params=encoder_params,
        # https://arxiv.org/pdf/1705.08292.pdf
        # SGD (with or without momentum) seems to be a good choice for
        # high-dimensional problems,
        obj_optimizer=pm.sgd,
        # but the gradients will explode here without a norm constraint
        total_grad_norm_constraint=1000)

samples = pm.sample_approx(approx1, draws=100)
beta_pymc3 = samples['beta'].mean(axis=0)
theta_pymc3 = samples['theta'].mean(axis=0)

plt.plot(approx1.hist[10:])
plt.show()

## get a label for each cell
z_pymc3 = theta_pymc3.argmax(axis=1)
pd.DataFrame({
    "celda": z_celda,
    "pymc3": z_pymc3
}).groupby(['celda', 'pymc3']).size()

## ~3:21 minutes to run AEVB with ADVI to get the posterior distributions
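# An equivalent, arguably easier-to-read view of the label agreement computed
# above is a contingency table (assumes z_celda and z_pymc3 are 1-D integer
# label arrays of the same length):
import pandas as pd

agreement = pd.crosstab(pd.Series(z_celda, name="celda"),
                        pd.Series(z_pymc3, name="pymc3"))
print(agreement)  # rows: celda cluster labels, columns: pymc3 topic labels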