Exemplo n.º 1
0
def test_inference(algorithm, V, D, l, alpha, beta, num_itns, s):
    """
    Draws a synthetic data set from the generative process, prints a
    summary of the topics that were drawn, and then runs the given
    inference algorithm on the generated data.

    Arguments:

    algorithm -- object providing inference(N_DV, alpha, beta,
                 initial assignments, num_itns, true assignments)
    V, D, l, alpha, beta -- forwarded unchanged to generate_data;
                 D is also the length of the initial assignment
                 vector, and alpha and beta are forwarded to
                 algorithm.inference as well
    num_itns -- forwarded to algorithm.inference
    s -- value handed to seed() before anything is sampled
    """

    seed(s)

    print('Generating data...')

    phi_TV, z_D, N_DV = generate_data(V, D, l, alpha, beta)

    set_printoptions(precision=4, suppress=True)

    # one line per topic index, most populated first: the number of
    # documents assigned to the topic followed by its row of phi_TV

    topic_sizes = bincount(z_D)

    for t in argsort(topic_sizes)[::-1]:
        print('%s %s' % (topic_sizes[t], phi_TV[t, :]))

    print('Running inference...')

    # initialize every document to the same topic (topic 0)

    initial_z_D = zeros(D, dtype=int)

    # the generated assignments z_D are passed along as the final
    # argument (presumably as ground truth for evaluation -- confirm
    # against algorithm.inference)

    algorithm.inference(N_DV, alpha, beta, initial_z_D, num_itns, z_D)
Exemplo n.º 2
0
def _topic_size_statistics(z_D):
    """
    Returns the test statistics for one vector of topic assignments:
    number of distinct topics, maximum topic size, mean topic size,
    and standard deviation of topic sizes (in that order).
    """

    topic_sizes = array([(z_D[:] == t).sum() for t in unique(z_D)])

    return (len(topic_sizes), topic_sizes.max(), topic_sizes.mean(),
            topic_sizes.std())


def _collect_statistics(samples):
    """
    Computes the test statistics for every (z_D, N_DV) pair in
    samples and returns a list of four arrays, one per statistic, in
    the order produced by _topic_size_statistics.
    """

    columns = ([], [], [], [])

    for z_D, _ in samples:
        for column, value in zip(columns, _topic_size_statistics(z_D)):
            column.append(value)

    return [array(column) for column in columns]


def getting_it_right(algorithm, V, D, l, alpha, beta, num_itns, s):
    """
    Runs Geweke's "getting it right" test.

    Forward samples are num_itns independent draws from
    generate_data. Reverse samples come from a single chain that
    alternates between regenerating the data given the current
    state and applying one iteration of algorithm.inference. The
    same four test statistics are computed for both sets of samples
    and compared with one P-P plot per statistic.

    Arguments:

    algorithm -- object providing __name__ and inference(N_DV,
                 alpha, beta, z_D, num_itns)
    V -- number of columns of the count matrix N_DV
    D -- number of rows of N_DV (one per entry of z_D)
    l -- rate of the Poisson draw that sets how many tokens are
         generated for each row
    alpha, beta -- forwarded to generate_data and
                   algorithm.inference; beta also smooths the
                   predictive distribution in the collapsed branch
    num_itns -- number of forward samples and of reverse iterations
    s -- value handed to seed() before anything is sampled
    """

    seed(s)

    # generate forward samples via the generative process

    print("Generating forward samples...")

    # [1:] drops phi_TV and keeps the (z_D, N_DV) pair

    forward_samples = [generate_data(V, D, l, alpha, beta)[1:]
                       for _ in iterview(xrange(num_itns))]

    # generate reverse samples via the inference algorithm

    print("Generating reverse samples...")

    reverse_samples = []

    phi_TV, z_D, _ = generate_data(V, D, l, alpha, beta)

    # these algorithms return (phi_TV, z_D) from inference, so the
    # data are regenerated from the explicit phi_TV; every other
    # algorithm returns z_D only

    explicit_phi = algorithm.__name__ in ("algorithm_8",
                                          "nonconjugate_split_merge")

    for _ in iterview(xrange(num_itns)):

        N_DV = zeros((D, V), dtype=int)

        if explicit_phi:

            for d in xrange(D):
                for v in sample(phi_TV[z_D[d], :], num_samples=poisson(l)):
                    N_DV[d, v] += 1

            phi_TV, z_D = algorithm.inference(N_DV, alpha, beta, z_D, 1)

        else:

            T = D  # maximum number of topics

            N_TV = zeros((T, V), dtype=int)
            N_T = zeros(T, dtype=int)

            # float(V) keeps the smoothing term from being
            # floor-divided to zero under Python 2 when beta and V
            # are both integers

            smoothing = beta / float(V)

            # tokens are drawn one at a time and the counts updated
            # after each draw, so every draw depends on the tokens
            # already generated for the same topic

            for d in xrange(D):
                t = z_D[d]
                for _ in xrange(poisson(l)):
                    [v] = sample((N_TV[t, :] + smoothing) / (N_T[t] + beta))
                    N_DV[d, v] += 1
                    N_TV[t, v] += 1
                    N_T[t] += 1

            z_D = algorithm.inference(N_DV, alpha, beta, z_D, 1)

        # store a copy of the assignments (presumably because
        # inference updates z_D in place -- confirm); N_DV is
        # allocated afresh on every iteration

        reverse_samples.append((array(z_D), N_DV))

    print("Computing test statistics...")

    # test statistics: number of topics, maximum topic size, mean
    # topic size, standard deviation of topic sizes

    forward_statistics = _collect_statistics(forward_samples)
    reverse_statistics = _collect_statistics(reverse_samples)

    # generate P-P plots, one per test statistic

    for forward, reverse in zip(forward_statistics, reverse_statistics):
        pp_plot(forward, reverse)
Exemplo n.º 3
0
def _topic_size_statistics(z_D):
    """
    Returns the test statistics for one vector of topic assignments:
    number of distinct topics, maximum topic size, mean topic size,
    and standard deviation of topic sizes (in that order).
    """

    topic_sizes = array([(z_D[:] == t).sum() for t in unique(z_D)])

    return (len(topic_sizes), topic_sizes.max(), topic_sizes.mean(),
            topic_sizes.std())


def _collect_statistics(samples):
    """
    Computes the test statistics for every (z_D, N_DV) pair in
    samples and returns a list of four arrays, one per statistic, in
    the order produced by _topic_size_statistics.
    """

    columns = ([], [], [], [])

    for z_D, _ in samples:
        for column, value in zip(columns, _topic_size_statistics(z_D)):
            column.append(value)

    return [array(column) for column in columns]


def getting_it_right(algorithm, V, D, l, alpha, beta, num_itns, s):
    """
    Runs Geweke's "getting it right" test.

    Forward samples are num_itns independent draws from
    generate_data. Reverse samples come from a single chain that
    alternates between regenerating the data given the current
    state and applying one iteration of algorithm.inference. The
    same four test statistics are computed for both sets of samples
    and compared with one P-P plot per statistic.

    Arguments:

    algorithm -- object providing __name__ and inference(N_DV,
                 alpha, beta, z_D, num_itns)
    V -- number of columns of the count matrix N_DV
    D -- number of rows of N_DV (one per entry of z_D)
    l -- rate of the Poisson draw that sets how many tokens are
         generated for each row
    alpha, beta -- forwarded to generate_data and
                   algorithm.inference; beta also smooths the
                   predictive distribution in the collapsed branch
    num_itns -- number of forward samples and of reverse iterations
    s -- value handed to seed() before anything is sampled
    """

    seed(s)

    # generate forward samples via the generative process

    print('Generating forward samples...')

    # [1:] drops phi_TV and keeps the (z_D, N_DV) pair

    forward_samples = [generate_data(V, D, l, alpha, beta)[1:]
                       for _ in iterview(xrange(num_itns))]

    # generate reverse samples via the inference algorithm

    print('Generating reverse samples...')

    reverse_samples = []

    phi_TV, z_D, _ = generate_data(V, D, l, alpha, beta)

    # these algorithms return (phi_TV, z_D) from inference, so the
    # data are regenerated from the explicit phi_TV; every other
    # algorithm returns z_D only

    explicit_phi = algorithm.__name__ in ('algorithm_8',
                                          'nonconjugate_split_merge')

    for _ in iterview(xrange(num_itns)):

        N_DV = zeros((D, V), dtype=int)

        if explicit_phi:

            for d in xrange(D):
                for v in sample(phi_TV[z_D[d], :], num_samples=poisson(l)):
                    N_DV[d, v] += 1

            phi_TV, z_D = algorithm.inference(N_DV, alpha, beta, z_D, 1)

        else:

            T = D  # maximum number of topics

            N_TV = zeros((T, V), dtype=int)
            N_T = zeros(T, dtype=int)

            # float(V) keeps the smoothing term from being
            # floor-divided to zero under Python 2 when beta and V
            # are both integers

            smoothing = beta / float(V)

            # tokens are drawn one at a time and the counts updated
            # after each draw, so every draw depends on the tokens
            # already generated for the same topic

            for d in xrange(D):
                t = z_D[d]
                for _ in xrange(poisson(l)):
                    [v] = sample((N_TV[t, :] + smoothing) / (N_T[t] + beta))
                    N_DV[d, v] += 1
                    N_TV[t, v] += 1
                    N_T[t] += 1

            z_D = algorithm.inference(N_DV, alpha, beta, z_D, 1)

        # store a copy of the assignments (presumably because
        # inference updates z_D in place -- confirm); N_DV is
        # allocated afresh on every iteration

        reverse_samples.append((array(z_D), N_DV))

    print('Computing test statistics...')

    # test statistics: number of topics, maximum topic size, mean
    # topic size, standard deviation of topic sizes

    forward_statistics = _collect_statistics(forward_samples)
    reverse_statistics = _collect_statistics(reverse_samples)

    # generate P-P plots, one per test statistic

    for forward, reverse in zip(forward_statistics, reverse_statistics):
        pp_plot(forward, reverse)