import csv

import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import lognorm, norm

import analysis
import calculate_parameters


def main():
    # Company counts per employment size band plus total employment;
    # fit log-normal parameters using the mean employment per company.
    data = [1713790, 285060, 142305, 75755, 24015, 13640, 9080]
    bands = ['0-4', '5-9', '10-19', '20-49', '50-99', '100-249', '250+']
    data = {bands[i]: data[i] for i in range(len(data))}
    total_employment = 25265000
    n_ons_companies = np.sum([x for x in data.values()])
    print(calculate_parameters.max_likelihood(
        data, total_employment / n_ons_companies))
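# The `calculate_parameters.max_likelihood(binned_data, mean=None)` calls in
# this file are assumed to recover log-normal parameters (mu, sigma) from
# counts or proportions per size band, optionally constrained by a known
# distribution mean. The function below is only a minimal sketch under that
# assumption, not the project's actual implementation: band probabilities are
# differences of the log-normal CDF over the band edges used elsewhere in
# this file, and a supplied mean pins mu = log(mean) - sigma**2 / 2.
from scipy.optimize import minimize

_BOUNDS = [0, 5, 10, 20, 50, 100, 250, np.inf]
_BANDS = ['0-4', '5-9', '10-19', '20-49', '50-99', '100-249', '250+']


def max_likelihood_sketch(binned_data, mean=None):
    counts = np.array([binned_data[b] for b in _BANDS])

    def neg_log_likelihood(theta):
        # Keep sigma positive for the optimiser; recover mu from the mean
        # constraint when a distribution mean is supplied.
        sigma = abs(theta[-1]) + 1e-9
        mu = theta[0] if mean is None else np.log(mean) - sigma**2 / 2
        cdf = lognorm.cdf(_BOUNDS, sigma, scale=np.exp(mu))
        probs = np.clip(np.diff(cdf), 1e-12, 1.0)
        # Multinomial log-likelihood of the observed band counts/proportions.
        return -np.sum(counts * np.log(probs))

    x0 = [0.0, 1.0] if mean is None else [1.0]
    result = minimize(neg_log_likelihood, x0, method='Nelder-Mead')
    if not result.success:
        return None
    sigma = abs(result.x[-1])
    mu = result.x[0] if mean is None else np.log(mean) - sigma**2 / 2
    return mu, sigma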
def estimate_bias(n, mean, sd, sample_size=100):
    """Estimate the average bias in parameters recovered from binned data.

    Repeatedly draws (or, when n is None, uses the exact band probabilities
    of) a log-normal with the given parameters, refits via maximum
    likelihood, and averages the difference between recovered and true
    parameters.
    """
    print(n, mean, sd)
    mean_total = 0
    sd_total = 0
    fixed_mean_total = 0
    fixed_sd_total = 0
    for _ in range(sample_size):
        if n is not None:
            # Finite sample: draw n sizes, bin them, and fit the proportions.
            sample = lognorm.rvs(sd, scale=np.exp(mean), size=n)
            binned_sample = analysis.sort_sample(sample)
            binned_sample = {s: v / n for s, v in binned_sample.items()}
            #params = calculate_parameters.max_likelihood(binned_sample, sample.mean())
            params = calculate_parameters.max_likelihood(binned_sample)
        else:
            # No sampling: use the exact band probabilities and the exact
            # distribution mean exp(mean + sd**2 / 2).
            max_sizes = [0.00001, 5, 10, 20, 50, 100, 250, 10**10]
            titles = ['0-4', '5-9', '10-19', '20-49', '50-99', '100-249', '250+']
            binned_sample = {
                titles[i]: lognorm.cdf(max_sizes[i + 1], sd, scale=np.exp(mean))
                - lognorm.cdf(max_sizes[i], sd, scale=np.exp(mean))
                for i in range(len(max_sizes) - 1)
            }
            params = calculate_parameters.max_likelihood(
                binned_sample, np.exp(mean + sd**2 / 2))
        #print(params)
        if params is None:
            continue
        recovered_mean, recovered_sd = params
        mean_total += recovered_mean - mean
        sd_total += recovered_sd - sd
        #fixed_mean, fixed_sd = calculate_parameters.remove_bias(recovered_mean, recovered_sd)
        #fixed_mean_total += fixed_mean
        #fixed_sd_total += fixed_sd
    # The fixed_* totals stay at zero while the remove_bias correction above
    # is commented out.
    return (mean_total / sample_size, sd_total / sample_size,
            fixed_mean_total / sample_size, fixed_sd_total / sample_size)
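# `analysis.sort_sample` is assumed to bin a raw array of firm sizes into
# counts per employment size band. A minimal sketch under that assumption,
# using the same band edges as the rest of this file (not necessarily the
# project's actual implementation):
def sort_sample_sketch(sample):
    upper_edges = [5, 10, 20, 50, 100, 250]  # top band '250+' is open-ended
    titles = ['0-4', '5-9', '10-19', '20-49', '50-99', '100-249', '250+']
    indices = np.searchsorted(upper_edges, sample, side='right')
    counts = np.bincount(indices, minlength=len(titles))
    return {titles[i]: int(counts[i]) for i in range(len(titles))}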
def recovery_simulation(n, mean, sd):
    """Repeatedly sample, bin, and refit to compare recovery of the
    log-normal parameters with and without the sample mean supplied."""
    mean_with_dist_mean = []
    mean_without_dist_mean = []
    sd_with_dist_mean = []
    sd_without_dist_mean = []
    for _ in range(1000):
        sizes = lognorm.rvs(sd, scale=np.exp(mean), size=n)
        binned_sizes = analysis.sort_sample(sizes)
        parameters_without_mean = calculate_parameters.max_likelihood(binned_sizes)
        parameters_with_mean = calculate_parameters.max_likelihood(
            binned_sizes, sizes.mean())
        if parameters_without_mean is not None and parameters_with_mean is not None:
            mean_with_dist_mean.append(parameters_with_mean[0])
            sd_with_dist_mean.append(parameters_with_mean[1])
            mean_without_dist_mean.append(parameters_without_mean[0])
            sd_without_dist_mean.append(parameters_without_mean[1])
    return (mean_with_dist_mean, sd_with_dist_mean,
            mean_without_dist_mean, sd_without_dist_mean)
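# Illustrative use of the two simulation helpers above; the sample size and
# parameters here are arbitrary example values, not ones used by the project:
#   m_with, s_with, m_without, s_without = recovery_simulation(10000, 0.5, 1.5)
#   print(np.mean(m_with), np.mean(s_with), np.mean(m_without), np.mean(s_without))
#   print(estimate_bias(10000, 0.5, 1.5, sample_size=20))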
def main(files):
    # TODO: need to make this neater, the script takes two inputs
    # Pair each size-distribution file with its corresponding employment file.
    files = [files[i:i + 2] for i in range(0, len(files), 2)]
    with open('2014_output.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for file in files:
            dist_data = calculate_parameters.read_file(file[0])
            employment = get_employment(file[1])
            ratios = {}
            for key, size_dist in dist_data.items():
                #print(size_dist)
                total = np.sum([n for n in size_dist.values()])
                if key not in employment:
                    print(key + ' missing', total)
                    continue
                params = calculate_parameters.max_likelihood(
                    size_dist, employment[key] / total)
                #params = calculate_parameters.max_likelihood(size_dist)
                if params is None:
                    continue
                mean, sd = params
                expected = expected_bands(mean, sd, list(size_dist))
                writer.writerow([key, mean, sd])
                for size_band, n in size_dist.items():
                    if size_band in ratios:
                        ratios[size_band]['x'].append(n / total)
                        ratios[size_band]['y'].append(expected[size_band])
                    else:
                        ratios[size_band] = {
                            'x': [n / total],
                            'y': [expected[size_band]],
                        }
            # Actual vs predicted band proportions on log-log axes.
            plt.figure(0)
            plt.loglog()
            ax = plt.gca()
            ax.set_xlim([10**-4, 10**0])
            ax.set_ylim([10**-4, 10**0])
            for band, data in ratios.items():
                plt.scatter(data['x'], data['y'], label=band)
            plt.legend()
            plt.plot([0, 1], [0, 1])
            plt.xlabel('Actual proportion')
            plt.ylabel('Predicted proportion')
            plt.savefig('graphs/' + file[0][:-4] + '.png')
            plt.show()
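# `expected_bands(mean, sd, bands)` above is assumed to return the expected
# proportion of companies in each size band under the fitted log-normal, and
# `get_employment` to read total employment per key from the second input
# file. A minimal sketch of the former under that assumption, reusing the
# band edges from elsewhere in this file (not the project's actual code):
def expected_bands_sketch(mean, sd, bands):
    edges = {'0-4': (0, 5), '5-9': (5, 10), '10-19': (10, 20),
             '20-49': (20, 50), '50-99': (50, 100),
             '100-249': (100, 250), '250+': (250, np.inf)}
    return {
        b: lognorm.cdf(edges[b][1], sd, scale=np.exp(mean))
        - lognorm.cdf(edges[b][0], sd, scale=np.exp(mean))
        for b in bands
    }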
def main():
    # Sweep over (mean, sd) pairs, fit each binned distribution using the
    # exact distribution mean and using the sample mean, then compare the
    # reconstructed band proportions against the originals.
    bounds = [0, 5, 10, 20, 50, 100, 250, np.inf]
    titles = ['0-4', '5-9', '10-19', '20-49', '50-99', '100-249', '250+']
    X = {t: [] for t in titles}
    Y = {t: [] for t in titles}
    Z = {t: [] for t in titles}
    for x in np.linspace(-1, 1, num=10):
        for y in np.linspace(0.5, 3, num=10):
            mean = x
            sd = y
            size = int(17000 + norm.rvs(0, 2000))
            sample = lognorm.rvs(sd, scale=np.exp(mean), size=size)
            binned_sample = analysis.sort_sample(sample)
            # Replace the empirical bins with the exact band probabilities.
            binned_sample = {
                titles[i]: lognorm.cdf(bounds[i + 1], sd, scale=np.exp(mean))
                - lognorm.cdf(bounds[i], sd, scale=np.exp(mean))
                for i in range(len(bounds) - 1)
            }
            params_with_mean = calculate_parameters.max_likelihood(
                binned_sample, np.exp(mean + sd**2 / 2))
            params = calculate_parameters.max_likelihood(
                binned_sample, sample.mean())
            if params is None or params_with_mean is None:
                continue
            r_mean, r_sd = params
            r_sample = lognorm.rvs(r_sd, scale=np.exp(r_mean), size=size)
            r_binned_sample = analysis.sort_sample(r_sample)
            r_with_mean_mean, r_with_mean_sd = params_with_mean
            # Resample with the parameters recovered using the distribution mean.
            r_with_mean = lognorm.rvs(r_with_mean_sd,
                                      scale=np.exp(r_with_mean_mean), size=size)
            r_with_mean_binned_sample = analysis.sort_sample(r_with_mean)
            # Exact band probabilities for the parameters recovered using the
            # sample mean.
            r_binned_sample = {
                titles[i]: lognorm.cdf(bounds[i + 1], r_sd, scale=np.exp(r_mean))
                - lognorm.cdf(bounds[i], r_sd, scale=np.exp(r_mean))
                for i in range(len(bounds) - 1)
            }
            for t, p in binned_sample.items():
                X[t].append(p / np.sum([v for v in binned_sample.values()]))
                Y[t].append(r_binned_sample[t]
                            / np.sum([v for v in r_binned_sample.values()]))
                Z[t].append(r_with_mean_binned_sample[t]
                            / np.sum([v for v in r_with_mean_binned_sample.values()]))
    plt.figure(figsize=(16, 6))
    plt.subplot(121)
    plt.loglog()
    ax = plt.gca()
    ax.set_xlim([10**-5, 10**0])
    ax.set_ylim([10**-5, 10**0])
    for t in titles:
        plt.scatter(X[t], Z[t], label=t)
    plt.xlabel('Actual proportion')
    plt.ylabel('Predicted proportion')
    plt.legend()
    plt.plot([0, 1], [0, 1])
    plt.subplot(122)
    plt.loglog()
    ax = plt.gca()
    ax.set_xlim([10**-5, 10**0])
    ax.set_ylim([10**-5, 10**0])
    plt.xlabel('Actual proportion')
    plt.ylabel('Predicted proportion')
    for i in range(len(X['0-4'])):
        plt.plot([X[t][i] for t in titles], [Y[t][i] for t in titles])
    plt.plot([0, 1], [0, 1])
    plt.savefig('loglog_reconstruction.png')
    plt.show()