Example #1
0
def plot_wasserstein_vs_lamb_simple():
    """Scatter-plot Wasserstein distance against candidate Zipf exponents.

    Draws one "observed" sample at the true exponent (1.2), simulates
    samples at 2000 uniformly drawn candidate exponents, keeps the half
    with the smallest Wasserstein distance to the observed sample, and
    plots those survivors with the acceptance tolerance as a dashed line.
    Saves the figure to images/ and shows it; returns nothing.
    """
    np.random.seed(1)

    N = 1000
    actual_lamb = 1.2
    n_lambs = 2000
    lambs = np.random.uniform(1.15, 1.25, n_lambs)

    # "Observed" data drawn at the true exponent.
    ns_obs = get_ranked_empirical_counts_from_infinite_power_law(
        actual_lamb, N)

    # Distance of each candidate's simulated sample from the observed one.
    wass_ds = []
    for lamb in lambs:
        ns = get_ranked_empirical_counts_from_infinite_power_law(lamb, N)
        wass_ds.append(scipy.stats.wasserstein_distance(ns_obs, ns))

    # Tolerance chosen so that a 0.5 fraction of candidates survive.
    tolerance = get_new_tolerance(wass_ds, 0.5)

    # Keep (lambda, distance) pairs that beat the tolerance.
    survivors = [(lamb, w) for lamb, w in zip(lambs, wass_ds)
                 if w < tolerance]
    top_lambs = [lamb for lamb, _ in survivors]
    top_ws = [w for _, w in survivors]

    # Keyword x/y: positional data args were removed in seaborn >= 0.12.
    sns.scatterplot(x=top_lambs, y=top_ws, color=PRIMARY_COLOR, s=POINT_SIZE)
    plt.axhline(tolerance,
                color=PRIMARY_COLOR,
                linestyle="--",
                label=r"$\epsilon$",
                linewidth=LINEWIDTH)

    plt.ylim(0.06, tolerance * 1.3)

    # Raw strings: "\l" is an invalid escape in a plain string literal.
    plt.xlabel(r"$\lambda$")
    plt.ylabel(r"$W(n_i, n_{obs})$")

    plt.legend()

    plt.savefig(
        "images/abc-pmc-wasserstein_distance_vs_lamb_N_{}-simple.png".format(
            N),
        dpi=300)

    plt.show()
def test_with_basic_data():
    """Smoke test: fit the ABC-PMC estimator on one simulated Zipf sample.

    Simulates N=10000 observations at exponent 1.2, runs the estimator,
    and prints the true exponent next to the estimate.
    """
    true_exponent = 1.2
    counts = get_ranked_empirical_counts_from_infinite_power_law(1.2, N=10000)
    fitted_exponent = abc_pmc_zipf(counts)
    print("Actual lamb was {}, predicted lamb was {}".format(
        true_exponent, fitted_exponent))
def abc_pmc_zipf(n,
                 theta_min=1.01,
                 theta_max=3,
                 survival_fraction=0.4,
                 n_particles=256,
                 n_generations=10):
    """Estimate a Zipf exponent with ABC population Monte Carlo.

    As described in Pilgrim and Hills, "Bias in Zipf's Law Estimators" (2021).

    Parameters
    ----------
    n : sequence of int
        Ranked empirical counts of the observed sample; sum(n) is taken as
        the number of observations to simulate per particle.
    theta_min, theta_max : float
        Bounds of the uniform prior on the exponent.
    survival_fraction : float
        Fraction of accepted distances used to shrink the tolerance each
        generation.
    n_particles : int
        Number of accepted particles per generation.
    n_generations : int
        Number of ABC-PMC generations.

    Returns
    -------
    float
        The mode of the KDE-smoothed posterior over the exponent.
    """
    prior_dist = scipy.stats.uniform(loc=theta_min,
                                     scale=theta_max - theta_min)
    n_data = sum(n)
    # Infinite tolerance: the first generation accepts every proposal.
    tolerance = math.inf
    proposal_dist = prior_dist
    for g in range(n_generations):
        # g is 0-based; report 1-based progress to the user.
        print("Running ABC PMC generation {} of {}".format(
            g + 1, n_generations))
        thetas = []
        ds = []
        weights = []
        for i in range(n_particles):
            hit = False
            while not hit:
                if g == 0:
                    # First generation - uniform dist is a different type of object in scipy to kde
                    theta = proposal_dist.rvs(1)[0]
                else:
                    theta = proposal_dist.resample(1)[0][0]

                # Reject proposals that fall outside the prior's support.
                if theta_min < theta < theta_max:
                    z = get_ranked_empirical_counts_from_infinite_power_law(
                        theta, N=n_data)
                    d = scipy.stats.wasserstein_distance(z, n)
                    if d <= tolerance:
                        thetas.append(theta)
                        ds.append(d)
                        if g == 0:
                            # First generation - uniform proposal so equal weights
                            weight = 1
                        else:
                            # Uniform dist has equal values at each value of theta
                            # Relative values of weights is needed - not absolute values
                            weight = 1 / proposal_dist.evaluate(theta)[0]
                        weights.append(weight)
                        hit = True

        # Shrink the tolerance for the next generation.
        tolerance = get_tolerance(ds, survival_fraction)
        # bw_method=sqrt(2) scales the kernel std by sqrt(2), i.e. the kernel
        # covariance is 2 * the weighted data variance, which works well.
        proposal_dist = scipy.stats.gaussian_kde(thetas,
                                                 weights=weights,
                                                 bw_method=np.sqrt(2))

    # Posterior from the final generation's particles (default bandwidth).
    posterior = scipy.stats.gaussian_kde(thetas, weights=weights)
    xs = np.linspace(theta_min, theta_max, 100000)
    posterior_values = posterior.evaluate(xs)

    # Posterior mode on a fine grid.
    mle = xs[np.argmax(posterior_values)]
    return mle
Example #4
0
def plot_kde_of_result_simple():
    """Plot a KDE of the surviving candidate exponents (the proposal dist).

    Simulates samples over a grid of 2000 candidate exponents, keeps the
    half closest (in Wasserstein distance) to an "observed" sample drawn
    at the true exponent 1.2, and plots a Gaussian KDE of the survivors.
    Saves the figure to images/ and shows it; returns nothing.
    """
    np.random.seed(1)

    N = 1000
    actual_lamb = 1.2
    n_lambs = 2000
    lambs = np.linspace(1.15, 1.25, n_lambs)

    # "Observed" data drawn at the true exponent.
    ns_obs = get_ranked_empirical_counts_from_infinite_power_law(
        actual_lamb, N)

    wass_ds = []
    for lamb in lambs:
        ns = get_ranked_empirical_counts_from_infinite_power_law(lamb, N)
        wass_ds.append(scipy.stats.wasserstein_distance(ns_obs, ns))

    # Tolerance chosen so that a 0.5 fraction of candidates survive.
    top_tolerance = get_new_tolerance(wass_ds, 0.5)

    # Keep only the candidate exponents that beat the tolerance.
    top_lambs = [lamb for lamb, w in zip(lambs, wass_ds)
                 if w < top_tolerance]

    # Keyword x=: positional data args are deprecated in seaborn >= 0.12.
    # bw_method=sqrt(2) matches the proposal bandwidth used by abc_pmc_zipf.
    sns.kdeplot(x=top_lambs,
                bw_method=np.sqrt(2),
                color=PRIMARY_COLOR,
                linewidth=LINEWIDTH)

    # Raw strings: "\l" is an invalid escape in a plain string literal.
    plt.xlabel(r"$\lambda$")
    plt.ylabel(r"Proposal distribution, $g(\lambda)$")

    plt.savefig("images/abc-pmc-proposal-distribution-simple.png", dpi=300)

    plt.show()
Example #5
0
def run_sims_changing_N_one_seed(
        seed=100,
        results_filename="../data/simulated/zipf_beaumont_results_cow_test.csv"
):
    """Run the ABC-PMC estimator over a range of sample sizes, one seed.

    For each N = 2^6 .. 2^20 (matching the range chosen for the Clauset
    comparison), simulates a Zipf sample at exponent 1.1, fits it with
    abc_pmc_zipf, and appends one CSV row (method, seed, exponent,
    settings, estimate, wall time) to ``results_filename``. On failure the
    exception message is recorded in place of the estimate/time so the
    sweep continues.
    """
    # Experiment variables - match ones chosen for Clauset
    N_exponents = range(6, 21)
    Ns = [2**a for a in N_exponents]
    exponent = 1.1

    # WABC variables
    n_particles = 256
    survival_fraction = 0.4
    n_generations = 10

    print(Ns)

    for N in Ns:
        n_data = N
        print(N)

        # Re-seed per run so each N uses the same random stream.
        np.random.seed(seed)
        print("Seed {} N {}".format(seed, N))

        ns = get_ranked_empirical_counts_from_infinite_power_law(exponent,
                                                                 N=N)

        try:
            start = time.time()
            mle = abc_pmc_zipf(ns,
                               n_particles=n_particles,
                               survival_fraction=survival_fraction,
                               n_generations=n_generations)
            print(mle)
            end = time.time()
            csv_row = [
                "Beaumont 2007 Basic", seed, exponent, n_particles,
                survival_fraction, n_data, n_generations, mle, end - start
            ]
            print(csv_row)
        except Exception as e:
            # Record the failure but keep sweeping the remaining Ns.
            print("EXCEPTION ", str(e))
            csv_row = [
                "Beaumont 2007 Basic", seed, exponent, n_particles,
                survival_fraction, n_data, n_generations,
                str(e)
            ]
        append_to_csv(csv_row, results_filename)
def generate_data_for_clauset_bias():
    """Generate Clauset-style MLE estimates across a grid of exponents.

    For 100 seeds and 100 exponents in [1.01, 2], simulates a Zipf sample
    of N=10000, fits it with mle_bauke_diff, and appends one CSV row per
    (seed, exponent) pair to the output file.
    """
    data_filename = "../data/simulated/clauset_bias_changing_lamb.csv"

    exponent_grid = np.linspace(1.01, 2, 100)
    N = 10000

    for seed in range(100):
        print(seed)
        for lamb in exponent_grid:
            # Re-seed per exponent so runs are reproducible independently.
            np.random.seed(seed)
            counts = get_ranked_empirical_counts_from_infinite_power_law(
                lamb, N)
            estimate = mle_bauke_diff(counts)
            append_to_csv([seed, lamb, N, "clauset", estimate],
                          data_filename)
Example #7
0
def run_sims_changing_lambda_one_seed(
        seed=100,
        results_filename="../data/simulated/zipf_beaumont_results_cow_test.csv"
):
    """Run the ABC-PMC estimator over a grid of exponents, one seed.

    For 100 exponents in [1.01, 2], simulates a Zipf sample of N=10000,
    fits it with abc_pmc_zipf, and appends one CSV row (method, seed,
    exponent, settings, estimate, wall time) to ``results_filename``. On
    failure the exception message is recorded in place of the
    estimate/time so the sweep continues.
    """
    n_data = 10000
    n_particles = 256
    survival_fraction = 0.4
    n_generations = 10

    for exponent in np.linspace(1.01, 2, 100):
        print(exponent)

        # Re-seed per run so each exponent uses the same random stream.
        np.random.seed(seed)
        print("Seed {} exponent {}".format(seed, exponent))

        ns = get_ranked_empirical_counts_from_infinite_power_law(exponent,
                                                                 N=n_data)

        try:
            start = time.time()
            mle = abc_pmc_zipf(ns,
                               n_particles=n_particles,
                               survival_fraction=survival_fraction,
                               n_generations=n_generations)
            print(mle)
            end = time.time()
            csv_row = [
                "Beaumont 2007 Basic", seed, exponent, n_particles,
                survival_fraction, n_data, n_generations, mle, end - start
            ]
            print(csv_row)
        except Exception as e:
            # Record the failure but keep sweeping the remaining exponents.
            print("EXCEPTION ", str(e))
            csv_row = [
                "Beaumont 2007 Basic", seed, exponent, n_particles,
                survival_fraction, n_data, n_generations,
                str(e)
            ]
        append_to_csv(csv_row, results_filename)
Example #8
0
def plot_data_and_sims():
    """Plot the observed Zipf sample next to three simulated candidates.

    Left panel: the "observed" rank/count data at the true exponent 1.2.
    A gap panel carries the proposal-distribution annotation, then three
    panels show samples simulated at candidate exponents 1.15, 1.21 and
    1.23 (both axes log-scaled, ticks hidden). Saves the figure to
    images/ and shows it; returns nothing.
    """
    plt.rcParams["figure.figsize"] = (20, 4)

    np.random.seed(5)

    actual_lamb = 1.2
    N = 1000

    ns_observed = get_ranked_empirical_counts_from_infinite_power_law(
        actual_lamb, N)
    ns_ranks = range(1, len(ns_observed) + 1)

    fig, (ax1, axgap, ax2, ax3, ax4) = plt.subplots(1,
                                                    5,
                                                    sharey=True,
                                                    sharex=True)

    ax1.scatter(ns_ranks, ns_observed, color=PRIMARY_COLOR, s=POINT_SIZE)

    # Axes are shared, so log-scaling applies to every panel.
    plt.xscale("log")
    plt.yscale("log")

    ax1.set_xlabel("$n_{obs}$")

    # Raw strings throughout: "\l" and "\s" are invalid escapes in plain
    # string literals.
    ax1.set_title(r"Observed data, $\lambda={}$".format(actual_lamb))

    axgap.set_title(r'$\lambda_i \sim g^{t-1}(\lambda)$')

    # One simulated panel per candidate exponent; RNG calls happen in the
    # same left-to-right order as the panels.
    sim_panels = [(ax2, 1.15), (ax3, 1.21), (ax4, 1.23)]
    for idx, (ax, lamb) in enumerate(sim_panels, start=1):
        ns_sim = get_ranked_empirical_counts_from_infinite_power_law(lamb, N)
        sim_ranks = range(1, len(ns_sim) + 1)
        ax.scatter(sim_ranks, ns_sim, color=PRIMARY_COLOR, s=POINT_SIZE)
        ax.set_title(r"$\lambda_{} = {}$".format(idx, lamb))
        ax.set_xlabel("$n_{}$".format(idx))

    # Hide all ticks and tick labels on the data panels (not the gap).
    for ax in [ax1, ax2, ax3, ax4]:
        ax.tick_params(
            axis='x',  # changes apply to the x-axis
            which='both',  # both major and minor ticks are affected
            bottom=False,  # ticks along the bottom edge are off
            top=False,  # ticks along the top edge are off
            labelbottom=False)
        ax.tick_params(
            axis='y',  # changes apply to the y-axis
            which='both',  # both major and minor ticks are affected
            left=False,  # ticks along the left edge are off
            right=False,  # ticks along the right edge are off
            labelleft=False)

    plt.savefig("images/abc-pmc-top-part-data-and-sims.png", dpi=300)
    plt.show()