def plot_wasserstein_vs_lamb_simple():
    """Scatter-plot Wasserstein distance vs proposed lambda for one ABC round.

    Simulates ``n_lambs`` Zipf samples with exponents drawn uniformly around
    the true exponent, keeps the fraction accepted under the tolerance, and
    saves the scatter plot (with the tolerance line) to ``images/``.
    """
    np.random.seed(1)
    N = 1000
    actual_lamb = 1.2
    n_lambs = 2000
    lambs = np.random.uniform(1.15, 1.25, n_lambs)
    ns_obs = get_ranked_empirical_counts_from_infinite_power_law(
        actual_lamb, N)
    wass_ds = []
    for lamb in lambs:
        ns = get_ranked_empirical_counts_from_infinite_power_law(lamb, N)
        wass_ds.append(scipy.stats.wasserstein_distance(ns_obs, ns))
    tolerance = get_new_tolerance(wass_ds, 0.5)
    # Keep only accepted particles (distance strictly below tolerance).
    top_lambs = []
    top_ws = []
    for lamb, w_d in zip(lambs, wass_ds):
        if w_d < tolerance:
            top_lambs.append(lamb)
            top_ws.append(w_d)
    # Keyword x/y: positional data args were removed in seaborn >= 0.13.
    sns.scatterplot(x=top_lambs, y=top_ws, color=PRIMARY_COLOR, s=POINT_SIZE)
    plt.axhline(tolerance,
                color=PRIMARY_COLOR,
                linestyle="--",
                label=r"$\epsilon$",
                linewidth=LINEWIDTH)
    plt.ylim(0.06, tolerance * 1.3)
    # Raw strings: "\l" is an invalid escape in a normal string literal.
    plt.xlabel(r"$\lambda$")
    plt.ylabel(r"$W(n_i, n_{obs})$")
    plt.legend()
    plt.savefig(
        "images/abc-pmc-wasserstein_distance_vs_lamb_N_{}-simple.png".format(
            N), dpi=300)
    plt.show()
def test_with_basic_data():
    """Smoke test: estimate lambda from one simulated sample and print both."""
    actual_lamb = 1.2
    counts = get_ranked_empirical_counts_from_infinite_power_law(1.2, N=10000)
    estimate = abc_pmc_zipf(counts)
    print("Actual lamb was {}, predicted lamb was {}".format(
        actual_lamb, estimate))
def abc_pmc_zipf(n,
                 theta_min=1.01,
                 theta_max=3,
                 survival_fraction=0.4,
                 n_particles=256,
                 n_generations=10):
    """Estimate the Zipf exponent of ranked counts ``n`` via ABC-PMC.

    As described in Pilgrim and Hills, "Bias in Zipf's Law Estimators" (2021).

    Parameters
    ----------
    n : sequence of int
        Observed ranked empirical counts; their sum fixes the sample size
        used for every simulated dataset.
    theta_min, theta_max : float
        Bounds of the uniform prior over the exponent.
    survival_fraction : float
        Fraction of accepted particles used to shrink the tolerance each
        generation.
    n_particles, n_generations : int
        Population size and number of PMC generations.

    Returns
    -------
    float
        Mode of the KDE fitted to the final particle population.
    """
    prior_dist = scipy.stats.uniform(loc=theta_min,
                                     scale=theta_max - theta_min)
    n_data = sum(n)
    tolerance = math.inf
    proposal_dist = prior_dist
    for g in range(n_generations):
        # g + 1 so progress reads "1 of 10" ... "10 of 10", not "0 of 10".
        print("Running ABC PMC generation {} of {}".format(
            g + 1, n_generations))
        thetas = []
        ds = []
        weights = []
        for i in range(n_particles):
            hit = False
            while not hit:
                if g == 0:
                    # First generation - uniform dist is a different type of
                    # object in scipy to kde
                    theta = proposal_dist.rvs(1)[0]
                else:
                    theta = proposal_dist.resample(1)[0][0]
                if theta_min < theta < theta_max:
                    z = get_ranked_empirical_counts_from_infinite_power_law(
                        theta, N=n_data)
                    d = scipy.stats.wasserstein_distance(z, n)
                    if d <= tolerance:
                        thetas.append(theta)
                        ds.append(d)
                        if g == 0:
                            # First generation - uniform proposal so equal
                            # weights
                            weight = 1
                        else:
                            # Uniform prior has equal density at each theta,
                            # so only the relative weights 1/g(theta) matter.
                            weight = 1 / proposal_dist.evaluate(theta)[0]
                        weights.append(weight)
                        hit = True
        tolerance = get_tolerance(ds, survival_fraction)
        # bw_method=sqrt(2) scales the KDE standard deviation so its variance
        # is twice the weighted data variance, which works well here.
        proposal_dist = scipy.stats.gaussian_kde(thetas,
                                                 weights=weights,
                                                 bw_method=np.sqrt(2))
    posterior = scipy.stats.gaussian_kde(thetas, weights=weights)
    xs = np.linspace(theta_min, theta_max, 100000)
    posterior_values = posterior.evaluate(xs)
    mle = xs[np.argmax(posterior_values)]
    return mle
def plot_kde_of_result_simple():
    """Plot the KDE proposal distribution built from accepted lambdas.

    Runs one acceptance round over a grid of candidate exponents, fits a
    Gaussian KDE (bandwidth factor sqrt(2)) to the survivors, and saves the
    curve to ``images/``.
    """
    np.random.seed(1)
    N = 1000
    actual_lamb = 1.2
    n_lambs = 2000
    lambs = np.linspace(1.15, 1.25, n_lambs)
    ns_obs = get_ranked_empirical_counts_from_infinite_power_law(
        actual_lamb, N)
    wass_ds = []
    for lamb in lambs:
        ns = get_ranked_empirical_counts_from_infinite_power_law(lamb, N)
        wass_ds.append(scipy.stats.wasserstein_distance(ns_obs, ns))
    top_tolerance = get_new_tolerance(wass_ds, 0.5)
    # Keep only the lambdas accepted under the tolerance.
    top_lambs = []
    top_ws = []
    for lamb, w_d in zip(lambs, wass_ds):
        if w_d < top_tolerance:
            top_lambs.append(lamb)
            top_ws.append(w_d)
    # (Removed unused `var = np.var(top_lambs)` — it was never read.)
    sns.kdeplot(x=top_lambs,
                bw_method=np.sqrt(2),
                color=PRIMARY_COLOR,
                linewidth=LINEWIDTH)
    # Raw strings: "\l" is an invalid escape in a normal string literal.
    plt.xlabel(r"$\lambda$")
    plt.ylabel(r"Proposal distribution, $g(\lambda)$")
    plt.savefig("images/abc-pmc-proposal-distribution-simple.png", dpi=300)
    plt.show()
def run_sims_changing_N_one_seed(
        seed=100,
        results_filename="../data/simulated/zipf_beaumont_results_cow_test.csv"
):
    """Run the ABC-PMC estimator across sample sizes N = 2^6 .. 2^20.

    One run per N at a fixed exponent and seed; each result (or the exception
    message, on failure) is appended as a row to ``results_filename``.
    """
    # Experiment variables - match ones chosen for Clauset
    N_exponents = range(6, 21)
    Ns = [2**a for a in N_exponents]
    exponent = 1.1
    # WABC variables
    n_particles = 256
    survival_fraction = 0.4
    n_generations = 10
    print(Ns)
    # (Removed the redundant `for i in range(1):` wrapper loop.)
    for N in Ns:
        n_data = N
        print(N)
        np.random.seed(seed)
        print("Seed {} N {}".format(seed, N))
        ns = get_ranked_empirical_counts_from_infinite_power_law(exponent,
                                                                 N=N)
        try:
            start = time.time()
            mle = abc_pmc_zipf(ns,
                               n_particles=n_particles,
                               survival_fraction=survival_fraction,
                               n_generations=n_generations)
            print(mle)
            end = time.time()
            csv_row = [
                "Beaumont 2007 Basic", seed, exponent, n_particles,
                survival_fraction, n_data, n_generations, mle, end - start
            ]
            print(csv_row)
        except Exception as e:
            # Best-effort sweep: record the failure and carry on with the
            # next N rather than aborting the whole run.
            print("EXCEPTION ", str(e))
            csv_row = [
                "Beaumont 2007 Basic", seed, exponent, n_particles,
                survival_fraction, n_data, n_generations,
                str(e)
            ]
        append_to_csv(csv_row, results_filename)
def generate_data_for_clauset_bias():
    """Sweep exponent and seed, fit each simulated sample with the Bauke
    MLE, and append one CSV row per fit."""
    data_filename = "../data/simulated/clauset_bias_changing_lamb.csv"
    lambs = np.linspace(1.01, 2, 100)
    N = 10000
    for seed in range(100):
        print(seed)
        for lamb in lambs:
            np.random.seed(seed)
            counts = get_ranked_empirical_counts_from_infinite_power_law(
                lamb, N)
            estimate = mle_bauke_diff(counts)
            append_to_csv([seed, lamb, N, "clauset", estimate],
                          data_filename)
def run_sims_changing_lambda_one_seed(
        seed=100,
        results_filename="../data/simulated/zipf_beaumont_results_cow_test.csv"
):
    """Run the ABC-PMC estimator across exponents 1.01 .. 2 at fixed N.

    One run per exponent at a fixed sample size and seed; each result (or
    the exception message, on failure) is appended to ``results_filename``.
    """
    n_data = 10000
    n_particles = 256
    survival_fraction = 0.4
    n_generations = 10
    # (Removed the redundant `for i in range(1):` wrapper loop.)
    for exponent in np.linspace(1.01, 2, 100):
        print(exponent)
        np.random.seed(seed)
        print("Seed {} exponent {}".format(seed, exponent))
        ns = get_ranked_empirical_counts_from_infinite_power_law(exponent,
                                                                 N=n_data)
        try:
            start = time.time()
            mle = abc_pmc_zipf(ns,
                               n_particles=n_particles,
                               survival_fraction=survival_fraction,
                               n_generations=n_generations)
            print(mle)
            end = time.time()
            csv_row = [
                "Beaumont 2007 Basic", seed, exponent, n_particles,
                survival_fraction, n_data, n_generations, mle, end - start
            ]
            print(csv_row)
        except Exception as e:
            # Best-effort sweep: record the failure and continue with the
            # next exponent rather than aborting the whole run.
            print("EXCEPTION ", str(e))
            csv_row = [
                "Beaumont 2007 Basic", seed, exponent, n_particles,
                survival_fraction, n_data, n_generations,
                str(e)
            ]
        append_to_csv(csv_row, results_filename)
def plot_data_and_sims():
    """Plot the observed sample next to three simulated candidate samples.

    Left panel: the observed ranked counts. A spacer panel carries the
    proposal-distribution annotation, then one panel per candidate lambda,
    all on shared log-log axes with ticks hidden. Saved to ``images/``.
    """
    plt.rcParams["figure.figsize"] = (20, 4)
    np.random.seed(5)
    actual_lamb = 1.2
    N = 1000
    ns_observed = get_ranked_empirical_counts_from_infinite_power_law(
        actual_lamb, N)
    ns_ranks = range(1, len(ns_observed) + 1)
    fig, (ax1, axgap, ax2, ax3, ax4) = plt.subplots(1,
                                                    5,
                                                    sharey=True,
                                                    sharex=True)
    ax1.scatter(ns_ranks, ns_observed, color=PRIMARY_COLOR, s=POINT_SIZE)
    plt.xscale("log")
    plt.yscale("log")
    ax1.set_xlabel("$n_{obs}$")
    # Raw strings: "\l" and "\s" are invalid escapes in normal literals.
    ax1.set_title(r"Observed data, $\lambda={}$".format(actual_lamb))
    axgap.set_title(r'$\lambda_i \sim g^{t-1}(\lambda)$')
    # One panel per candidate exponent, each with a fresh simulated sample.
    sim_lambs = [1.15, 1.21, 1.23]
    for idx, (ax, lamb) in enumerate(zip([ax2, ax3, ax4], sim_lambs),
                                     start=1):
        ns_sim = get_ranked_empirical_counts_from_infinite_power_law(lamb, N)
        ax.scatter(range(1, len(ns_sim) + 1),
                   ns_sim,
                   color=PRIMARY_COLOR,
                   s=POINT_SIZE)
        ax.set_title(r"$\lambda_{} = {}$".format(idx, lamb))
        ax.set_xlabel("$n_{}$".format(idx))
    for ax in [ax1, ax2, ax3, ax4]:
        # Hide all tick marks and tick labels on both axes.
        ax.tick_params(
            axis='x',  # changes apply to the x-axis
            which='both',  # both major and minor ticks are affected
            bottom=False,  # ticks along the bottom edge are off
            top=False,  # ticks along the top edge are off
            labelbottom=False)
        ax.tick_params(
            axis='y',  # changes apply to the y-axis
            which='both',  # both major and minor ticks are affected
            left=False,  # ticks along the left edge are off
            right=False,  # ticks along the right edge are off
            labelleft=False)
    plt.savefig("images/abc-pmc-top-part-data-and-sims.png", dpi=300)
    plt.show()