Example #1
def _do_chain(args):
    """
    Generate a synthetic documen set from the teaching model.

    Return the documents and the acceptance rate of the sampler
    """
    # docs_r: random baseline docs; docs_t: start state for teaching;
    # topics: target topics
    (docs_r, docs_t, topics, α, β, ρ,
     n_samples, n_iter, seed, chain) = args

    n_lda_steps = 500

    n_words = len(topics[0])
    docs_t = _jitter_docs(docs_t, 1.0, n_words)

    n_acpt = 0
    acr = 0

    lp = teach_lda_pgis(docs_t, topics, alpha=α, beta=β, n_samples=n_samples)
    # lp = teach_lda_exact(docs_t, topics, alpha=α, beta=β)

    for i in range(n_iter):
        docs_p = _jitter_docs(docs_t, ρ, n_words)
        lp_p = teach_lda_pgis(docs_p, topics, alpha=α, beta=β,
                              n_samples=n_samples)

        # lp_p = teach_lda_exact(docs_p, topics, alpha=α, beta=β)
        if log(random.random()) < lp_p - lp:
            n_acpt += 1
            docs_t = docs_p
            lp = lp_p

        acr = n_acpt/float(i+1)

    err_t = _get_err(docs_t, topics, n_words, α, β, n_lda_steps, seed)
    err_r = _get_err(docs_r, topics, n_words, α, β, n_lda_steps, seed)

    print(".", end="")
    sys.stdout.flush()
    return docs_t, acr, err_t, err_r, chain
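A minimal usage sketch for _do_chain, assuming gen_docs, _jitter_docs, _get_err, and teach_lda_pgis are available at module level as in the other examples; every numeric value below is an illustrative placeholder, not a setting from the original experiments.

# illustrative only: target topics plus a random baseline and a start state
docs_t, topics = gen_docs(8, 3, 20, 10, 0.1, 0.1)
docs_r, _ = gen_docs(8, 3, 20, 10, 0.1, 0.1)

args = (docs_r, docs_t, topics,
        0.1,    # α
        0.1,    # β
        0.25,   # ρ, jitter rate for the proposal
        500,    # n_samples passed to teach_lda_pgis
        1000,   # n_iter, proposal/accept iterations of the sampler
        1337,   # seed
        0)      # chain index
docs_out, acr, err_t, err_r, chain = _do_chain(args)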
def evidence(docs, n_topics, n_words, beta=.1, n_samples=10000, seed=1337):
    """ estimate the evidence using sequental importance sampling """

    # generate fake topics. Remember, teach_lda_XXX calcualtes the teaching
    # probability; we're onlt concerned with the evidence
    # TODO: Pass m as an argument
    m = 50.
    alpha = m / n_topics
    if USE_HARMONIC_MEAN:
        evidence = evidence_lda_hm(docs,
                                   n_topics,
                                   n_words,
                                   alpha=alpha,
                                   beta=beta,
                                   n_samples=n_samples,
                                   stop_itr=1000,
                                   seed=None)
    else:
        topics = np.ones((n_topics, n_words)) / n_words
        _, _, evx = teach_lda_pgis(docs,
                                   topics,
                                   alpha=alpha,
                                   beta=beta,
                                   n_samples=n_samples,
                                   seed=seed,
                                   return_parts=True)

        # the evidence isn't normalized in pgis because the normalizer cancels
        # out of the ratio. Normalize here.
        evidence = logsumexp(evx) - log(n_samples)
    return evidence
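A hedged usage sketch for evidence(): docs is assumed to hold documents in the format expected by teach_lda_pgis (as in the other examples), and the topic counts, vocabulary size, and sample count are placeholders.

# illustrative only: compare the estimated log evidence under two topic counts
lml_k3 = evidence(docs, n_topics=3, n_words=20, beta=0.1, n_samples=5000)
lml_k5 = evidence(docs, n_topics=5, n_words=20, beta=0.1, n_samples=5000)
print('log evidence  K=3: %.2f  K=5: %.2f' % (lml_k3, lml_k5))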
Example #3
def expitr(n_topics, n_vocab, n_words, α, β, n_docs, tol, seed, tasknum):
    random.seed(seed)

    print('Running %d topics, %d n_vocab, and %d words to ε=%1.2f.' %
          (n_topics, n_vocab, n_words, tol,))

    docs, topics = gen_docs(n_docs, n_topics, n_vocab, n_words, α, β)

    i = 0
    t_pgis = 0
    relerr = float('Inf')
    weights = []
    while relerr > tol and i < MAX_ITR:
        i += 1
        seed = random.randrange(2**31)

        t_start = time.time()
        _, _, lml = teach_lda_pgis(docs, topics, alpha=α, beta=β, n_samples=1,
                                   seed=seed, return_parts=True)
        t_end = time.time()

        t_pgis += t_end - t_start
        weights.append(lml)

        if i > 1:
            relerr = isrelerr(weights)

    if i >= MAX_ITR:
        print('Warning: MAX_ITR reached before the tolerance was met.')

    return t_pgis, i, n_topics, n_vocab, n_words, α, β, n_docs
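A minimal call sketch for expitr, assuming MAX_ITR, gen_docs, isrelerr, and teach_lda_pgis exist at module level as the function requires; the corpus sizes and tolerance are placeholders.

# illustrative only: time single-sample PGIS calls until the relative error
# of the accumulated weights drops below 1% (or MAX_ITR is hit)
t_pgis, n_calls, *cfg = expitr(n_topics=3, n_vocab=50, n_words=20,
                               α=0.1, β=0.1, n_docs=10, tol=0.01,
                               seed=1337, tasknum=0)
print('%d calls, %.3f s spent in teach_lda_pgis' % (n_calls, t_pgis))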
Example #4
def _calc_docset_logp_inner(docs, topics, topics_list, α, β, n_samples, seed):
    pt = teach_lda_pgis(docs, topics, topics_list=topics_list, alpha=α, beta=β,
                        n_samples=n_samples, seed=seed, return_parts=True)
    prior, numer, denom = pt

    lp = prior + logsumexp(numer) - logsumexp(denom)
    ll = logsumexp(numer) - np.log(len(numer))

    print(".", end="")
    sys.stdout.flush()
    return lp, ll
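A small self-contained sketch of the identity behind ll above: logsumexp(x) - log(len(x)) is a numerically stable log of the mean of exp(x), which is why it serves as the Monte Carlo log-likelihood estimate; scipy's logsumexp stands in here for whatever the surrounding module imports.

import numpy as np
from scipy.special import logsumexp

x = np.array([-1000.0, -1001.0, -1002.5])    # per-sample log weights
naive = np.log(np.exp(x).mean())              # underflows to -inf
stable = logsumexp(x) - np.log(len(x))        # ≈ -1000.73, stays finite
print(naive, stable)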
Example #5
def _calc_docset_logp_inner(docs, topics, topics_list, α, β, n_samples, seed):
    pt = teach_lda_pgis(docs,
                        topics,
                        topics_list=topics_list,
                        alpha=α,
                        beta=β,
                        n_samples=n_samples,
                        seed=seed,
                        return_parts=True)
    prior, numer, denom = pt

    lp = prior + logsumexp(numer) - logsumexp(denom)
    ll = logsumexp(numer) - np.log(len(numer))

    print(".", end="")
    sys.stdout.flush()
    return lp, ll
Example #6
def _expitr(docs, dtpc, alpha, beta, n_samples, seed):
    logsamples = log(n_samples)
    n_topics = len(dtpc)
    n_words = len(dtpc[0])

    exact = logsumexp(
        teach_lda_exact(docs, dtpc, alpha=alpha, beta=beta,
                        return_parts=True)[-1])
    print("1")
    unis = logsumexp(
        teach_lda_unis(docs,
                       dtpc,
                       alpha=alpha,
                       beta=beta,
                       n_samples=n_samples,
                       seed=seed,
                       return_parts=True)[-1]) - logsamples
    print("2")
    unis += sum(len(d['w']) for d in docs) * log(n_topics)
    pgis = logsumexp(
        teach_lda_pgis(docs,
                       dtpc,
                       alpha=alpha,
                       beta=beta,
                       n_samples=n_samples,
                       seed=seed + 1,
                       return_parts=True)[-1]) - logsamples
    print("3")
    hm = evidence_lda_hm(docs,
                         n_topics,
                         n_words,
                         alpha=alpha,
                         beta=beta,
                         n_samples=n_samples,
                         stop_itr=100,
                         seed=seed + 2)
    print("4")
    print(seed)

    return {'Exact': exact, 'Uniform': unis, 'SIS': pgis, 'HM': hm}
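A hedged usage sketch for the comparison above, assuming gen_docs as in the other examples; the corpus is kept tiny so that teach_lda_exact stays tractable, and all values are placeholders.

# illustrative only: compare the four log-evidence estimates on a tiny corpus
docs, dtpc = gen_docs(3, 2, 10, 5, 0.1, 0.1)
ests = _expitr(docs, dtpc, alpha=0.1, beta=0.1, n_samples=2000, seed=1337)
for name in ('Exact', 'Uniform', 'SIS', 'HM'):
    print('%-8s %.3f' % (name, ests[name]))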
Example #7
def expitr(n_topics, n_vocab, n_words, α, β, n_docs, tol, seed, tasknum):
    random.seed(seed)

    print('Running %d topics, %d n_vocab, and %d words to ε=%1.2f.' % (
        n_topics,
        n_vocab,
        n_words,
        tol,
    ))

    docs, topics = gen_docs(n_docs, n_topics, n_vocab, n_words, α, β)

    i = 0
    t_pgis = 0
    relerr = float('Inf')
    weights = []
    while relerr > tol and i < MAX_ITR:
        i += 1
        seed = random.randrange(2**31)

        t_start = time.time()
        _, _, lml = teach_lda_pgis(docs,
                                   topics,
                                   alpha=α,
                                   beta=β,
                                   n_samples=1,
                                   seed=seed,
                                   return_parts=True)
        t_end = time.time()

        t_pgis += t_end - t_start
        weights.append(lml)

        if i > 1:
            relerr = isrelerr(weights)

    if i >= MAX_ITR:
        print('Warning: MAX_ITR reached before the tolerance was met.')

    return t_pgis, i, n_topics, n_vocab, n_words, α, β, n_docs
def evidence(docs, n_topics, n_words, beta=.1, n_samples=10000, seed=1337):
    """ estimate the evidence using sequental importance sampling """

    # generate fake topics. Remember, teach_lda_XXX calcualtes the teaching
    # probability; we're onlt concerned with the evidence
    # TODO: Pass m as an argument
    m = 50.
    alpha = m/n_topics
    if USE_HARMONIC_MEAN:
        evidence = evidence_lda_hm(docs, n_topics, n_words, alpha=alpha,
                                   beta=beta, n_samples=n_samples,
                                   stop_itr=1000, seed=None)
    else:
        topics = np.ones((n_topics, n_words))/n_words
        _, _, evx = teach_lda_pgis(docs, topics, alpha=alpha, beta=beta,
                                   n_samples=n_samples, seed=seed,
                                   return_parts=True)

        # the evidence isn't normalized in pgis because the normalizer cancels
        # out of the ratio. Normalize here.
        evidence = logsumexp(evx) - log(n_samples)
    return evidence
Example #9
def _expitr(docs, dtpc, alpha, beta, n_samples, seed):
    n_topics = len(dtpc)
    n_words = len(dtpc[0])

    unis = np.array(teach_lda_unis(docs, dtpc, alpha=alpha, beta=beta,
                                   n_samples=n_samples, seed=seed,
                                   return_parts=True)[-1])
    unis += sum([len(d['w']) for d in docs])*log(n_topics)

    pgis = np.array(teach_lda_pgis(docs, dtpc, alpha=alpha, beta=beta,
                                   n_samples=n_samples, seed=seed+1,
                                   return_parts=True)[-1])
    hm = evidence_lda_hm(docs, n_topics, n_words, alpha=alpha, beta=beta,
                         n_samples=n_samples, stop_itr=200, seed=seed+2,
                         return_samples=True)

    lsxaccum = lambda x: logsumexp(x) - log(len(x))
    hmaccum = lambda x: -(logsumexp(x) - log(len(x)))

    data = []
    # integer, roughly log-spaced sample counts between 2 and n_samples - 1
    steps = np.unique(np.exp(np.linspace(
        log(2), log(n_samples - 1), 250)).astype(int))
    for i in steps:
        n_samples = float(i + 1)
        data.append({'Samples': n_samples,
                     'Type': 'Uniform',
                     'Estimate': lsxaccum(unis[:i]),
                     'Relative error': isrelerr(unis[:i]), })
        data.append({'Samples': n_samples,
                     'Type': 'SIS',
                     'Estimate': lsxaccum(pgis[:i]),
                     'Relative error': isrelerr(pgis[:i]), })
        data.append({'Samples': n_samples,
                     'Type': 'Harmonic mean',
                     'Estimate': hmaccum(hm[:i]),
                     'Relative error': isrelerr(hm[:i]), })
    return data
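The list of dicts returned above is shaped for tabular analysis; a hedged usage sketch, assuming pandas is installed and gen_docs is available as in the other examples (sizes and hyperparameters are placeholders).

import pandas as pd

docs, dtpc = gen_docs(5, 3, 25, 10, 0.1, 0.1)
df = pd.DataFrame(_expitr(docs, dtpc, alpha=0.1, beta=0.1,
                          n_samples=10000, seed=1337))
# relative error at the largest sample count, per estimator
print(df.groupby('Type')['Relative error'].last())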