from math import log
import random
import sys
import time

import numpy as np
from scipy.special import logsumexp

# NOTE: the project-internal helpers used below (teach_lda_pgis,
# teach_lda_exact, teach_lda_unis, evidence_lda_hm, gen_docs, isrelerr,
# _jitter_docs, _get_err) and the constants MAX_ITR and USE_HARMONIC_MEAN
# are assumed to be imported from the surrounding package.


def _do_chain(args):
    """Generate a synthetic document set from the teaching model.

    Return the documents and the acceptance rate of the sampler.
    """
    docs_r = args[0]
    docs_t = args[1]    # start state for teaching
    topics = args[2]    # target topics
    α = args[3]
    β = args[4]
    ρ = args[5]
    n_samples = args[6]
    n_iter = args[7]
    seed = args[8]
    chain = args[9]

    n_lda_steps = 500
    n_words = len(topics[0])

    docs_t = _jitter_docs(docs_t, 1.0, n_words)

    n_acpt = 0
    acr = 0
    lp = teach_lda_pgis(docs_t, topics, alpha=α, beta=β, n_samples=n_samples)
    # lp = teach_lda_exact(docs_t, topics, alpha=α, beta=β)
    for i in range(n_iter):
        docs_p = _jitter_docs(docs_t, ρ, n_words)
        lp_p = teach_lda_pgis(docs_p, topics, alpha=α, beta=β,
                              n_samples=n_samples)
        # lp_p = teach_lda_exact(docs_p, topics, alpha=α, beta=β)
        # Metropolis accept/reject on the teaching log probability
        if log(random.random()) < lp_p - lp:
            n_acpt += 1
            docs_t = docs_p
            lp = lp_p
        acr = n_acpt / float(i + 1)

    err_t = _get_err(docs_t, topics, n_words, α, β, n_lda_steps, seed)
    err_r = _get_err(docs_r, topics, n_words, α, β, n_lda_steps, seed)

    print(".", end="")
    sys.stdout.flush()

    return docs_t, acr, err_t, err_r, chain
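# Hypothetical driver (not in the original source): _do_chain takes a single
# packed argument tuple, which fits multiprocessing.Pool.map. The tuple layout
# below mirrors the unpacking at the top of _do_chain; the α, β, ρ, n_samples,
# and n_iter values are placeholders.
def _example_run_chains(docs_r, docs_t, topics, seeds):
    from multiprocessing import Pool
    arg_sets = [(docs_r, docs_t, topics, 1.0, 0.1, 0.25, 1000, 500, seed, c)
                for c, seed in enumerate(seeds)]
    with Pool() as pool:
        # each result is (docs_t, acr, err_t, err_r, chain)
        return pool.map(_do_chain, arg_sets)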
def evidence(docs, n_topics, n_words, beta=.1, n_samples=10000, seed=1337):
    """Estimate the evidence using sequential importance sampling."""
    # Generate flat fake topics. Remember, teach_lda_XXX calculates the
    # teaching probability; we're only concerned with the evidence.
    # TODO: Pass m as an argument
    m = 50.
    alpha = m / n_topics
    if USE_HARMONIC_MEAN:
        evidence = evidence_lda_hm(docs, n_topics, n_words, alpha=alpha,
                                   beta=beta, n_samples=n_samples,
                                   stop_itr=1000, seed=None)
    else:
        topics = np.ones((n_topics, n_words)) / n_words
        _, _, evx = teach_lda_pgis(docs, topics, alpha=alpha, beta=beta,
                                   n_samples=n_samples, seed=seed,
                                   return_parts=True)
        # The evidence isn't normalized in pgis because the normalizer
        # cancels out of the ratio. Normalize here.
        evidence = logsumexp(evx) - log(n_samples)
    return evidence
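# Usage sketch (illustrative, not part of the original source): estimate the
# evidence of a freshly generated corpus. Note that `n_words` here is the
# topic-vector length, i.e. the vocabulary size (gen_docs' n_vocab), not the
# words-per-document count. All numeric values are placeholders.
def _example_evidence(seed=1337):
    docs, _ = gen_docs(10, 5, 100, 50, 0.5, 0.1)
    return evidence(docs, n_topics=5, n_words=100, beta=0.1,
                    n_samples=10000, seed=seed)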
def expitr(n_topics, n_vocab, n_words, α, β, n_docs, tol, seed, tasknum):
    """Time repeated single-sample PGIS runs until the running relative
    error of the collected weights drops below tol (or MAX_ITR is hit)."""
    random.seed(seed)
    print('Running %d topics, vocab size %d, %d words to ε=%1.2f.'
          % (n_topics, n_vocab, n_words, tol))
    docs, topics = gen_docs(n_docs, n_topics, n_vocab, n_words, α, β)

    i = 0
    t_pgis = 0
    relerr = float('Inf')
    weights = []
    while relerr > tol and i < MAX_ITR:
        i += 1
        seed = random.randrange(2**31)

        t_start = time.time()
        _, _, lml = teach_lda_pgis(docs, topics, alpha=α, beta=β, n_samples=1,
                                   seed=seed, return_parts=True)
        t_end = time.time()
        t_pgis += t_end - t_start

        weights.append(lml)
        if i > 1:
            relerr = isrelerr(weights)

    if i >= MAX_ITR:
        print('Warning: MAX_ITR reached. Breaking.')

    return t_pgis, i, n_topics, n_vocab, n_words, α, β, n_docs
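# Illustrative sweep (not in the original source): time expitr across a small
# grid of model sizes. The tolerance, hyperparameters, and grid values are
# placeholders; `tasknum` is carried through unused by expitr, so any integer
# will do.
def _example_expitr_sweep(seed=1337):
    results = []
    for n_topics in (2, 5, 10):
        for n_vocab in (50, 100):
            results.append(
                expitr(n_topics, n_vocab, 25, 0.5, 0.1, 10, 0.05, seed, 0))
    return results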
def _calc_docset_logp_inner(docs, topics, topics_list, α, β, n_samples, seed):
    pt = teach_lda_pgis(docs, topics, topics_list=topics_list, alpha=α,
                        beta=β, n_samples=n_samples, seed=seed,
                        return_parts=True)
    prior, numer, denom = pt
    # Teaching log probability; the 1/n_samples normalizers cancel between
    # the numerator and denominator sums.
    lp = prior + logsumexp(numer) - logsumexp(denom)
    # Marginal log likelihood: log of the mean of the numerator weights.
    ll = logsumexp(numer) - np.log(len(numer))

    print(".", end="")
    sys.stdout.flush()

    return lp, ll
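# Usage sketch (hypothetical, not in the original source): score one document
# set against its target topics. `topics_list` is assumed here to be a list
# of candidate topic matrices that the teaching probability normalizes over;
# with only the target itself it degenerates to a single-candidate ratio.
def _example_docset_logp(docs, topics, seed=1337):
    lp, ll = _calc_docset_logp_inner(docs, topics, [topics], 0.5, 0.1,
                                     n_samples=1000, seed=seed)
    return lp, ll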
def _expitr(docs, dtpc, alpha, beta, n_samples, seed):
    logsamples = log(n_samples)
    n_topics = len(dtpc)
    n_words = len(dtpc[0])

    exact = logsumexp(
        teach_lda_exact(docs, dtpc, alpha=alpha, beta=beta,
                        return_parts=True)[-1])
    print("1")  # bare prints below are progress markers

    unis = logsumexp(
        teach_lda_unis(docs, dtpc, alpha=alpha, beta=beta,
                       n_samples=n_samples, seed=seed,
                       return_parts=True)[-1]) - logsamples
    print("2")
    # Importance-weight correction for the uniform proposal over per-word
    # topic assignments.
    unis += sum(len(d['w']) for d in docs) * log(n_topics)

    pgis = logsumexp(
        teach_lda_pgis(docs, dtpc, alpha=alpha, beta=beta,
                       n_samples=n_samples, seed=seed + 1,
                       return_parts=True)[-1]) - logsamples
    print("3")

    hm = evidence_lda_hm(docs, n_topics, n_words, alpha=alpha, beta=beta,
                         n_samples=n_samples, stop_itr=100, seed=seed + 2)
    print("4")

    print(seed)
    return {'Exact': exact, 'Uniform': unis, 'SIS': pgis, 'HM': hm}
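# Illustrative comparison (not in the original source): run all four
# estimators above on one small generated corpus. Small sizes keep the exact
# enumeration in teach_lda_exact tractable; all values are placeholders.
def _example_compare_estimators(seed=1337):
    docs, topics = gen_docs(4, 2, 10, 8, 0.5, 0.1)
    return _expitr(docs, topics, alpha=0.5, beta=0.1,
                   n_samples=1000, seed=seed)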
def _expitr(docs, dtpc, alpha, beta, n_samples, seed):
    n_topics = len(dtpc)
    n_words = len(dtpc[0])

    unis = np.array(
        teach_lda_unis(docs, dtpc, alpha=alpha, beta=beta,
                       n_samples=n_samples, seed=seed,
                       return_parts=True)[-1])
    # Importance-weight correction for the uniform proposal over per-word
    # topic assignments.
    unis += sum(len(d['w']) for d in docs) * log(n_topics)

    pgis = np.array(
        teach_lda_pgis(docs, dtpc, alpha=alpha, beta=beta,
                       n_samples=n_samples, seed=seed + 1,
                       return_parts=True)[-1])

    hm = evidence_lda_hm(docs, n_topics, n_words, alpha=alpha, beta=beta,
                         n_samples=n_samples, stop_itr=200, seed=seed + 2,
                         return_samples=True)

    def lsxaccum(x):
        # Running log-mean-exp estimate from the first len(x) samples.
        return logsumexp(x) - log(len(x))

    def hmaccum(x):
        # Harmonic-mean estimate: negative log-mean-exp of the samples.
        return -(logsumexp(x) - log(len(x)))

    data = []
    # Log-spaced integer sample counts at which to record running estimates.
    steps = np.unique(np.exp(
        np.linspace(log(2), log(n_samples - 1), 250)).astype(int))
    for i in steps:
        n = float(i)  # number of samples used in the slices below
        data.append({'Samples': n,
                     'Type': 'Uniform',
                     'Estimate': lsxaccum(unis[:i]),
                     'Relative error': isrelerr(unis[:i])})
        data.append({'Samples': n,
                     'Type': 'SIS',
                     'Estimate': lsxaccum(pgis[:i]),
                     'Relative error': isrelerr(pgis[:i])})
        data.append({'Samples': n,
                     'Type': 'Harmonic mean',
                     'Estimate': hmaccum(hm[:i]),
                     'Relative error': isrelerr(hm[:i])})
    return data
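# Usage sketch (hypothetical, not part of the original source): the list of
# per-step dicts returned above is shaped for a pandas DataFrame, which makes
# plotting estimate and relative error against sample count straightforward.
# pandas is assumed available; it is not imported by the original module.
def _example_convergence_table(seed=1337):
    import pandas as pd
    docs, topics = gen_docs(4, 2, 10, 8, 0.5, 0.1)
    data = _expitr(docs, topics, alpha=0.5, beta=0.1,
                   n_samples=2000, seed=seed)
    return pd.DataFrame(data)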