def main():
    # ONS counts of companies in each employment size band.
    counts = [1713790, 285060, 142305, 75755, 24015, 13640, 9080]
    bands = ['0-4', '5-9', '10-19', '20-49', '50-99', '100-249', '250+']
    data = dict(zip(bands, counts))
    total_employment = 25265000
    n_ons_companies = np.sum(list(data.values()))

    # Fit a lognormal to the binned counts, supplying the average firm size
    # (total employment / number of companies) as the second argument.
    print(calculate_parameters.max_likelihood(data, total_employment / n_ons_companies))
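For reference, a minimal sketch (using only numpy and scipy, with illustrative parameter values) of how a fitted log-space mean and standard deviation can be turned back into expected band proportions via the lognorm CDF differences used in the later examples; expected_band_proportions is a hypothetical helper, not part of the project.

import numpy as np
from scipy.stats import lognorm


def expected_band_proportions(mean, sd):
    # Hypothetical helper: probability mass of each size band under a
    # lognormal with log-space mean `mean` and sd `sd` (scipy scale = exp(mean)).
    bounds = [0, 5, 10, 20, 50, 100, 250, np.inf]
    titles = ['0-4', '5-9', '10-19', '20-49', '50-99', '100-249', '250+']
    return {
        titles[i]: lognorm.cdf(bounds[i + 1], sd, scale=np.exp(mean)) -
        lognorm.cdf(bounds[i], sd, scale=np.exp(mean))
        for i in range(len(bounds) - 1)
    }


print(expected_band_proportions(0.5, 1.5))  # illustrative parameters only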
Example #2
def estimate_bias(n, mean, sd, sample_size=100):
    """Estimate the bias of the recovered lognormal parameters.

    If n is given, fits to binned finite samples of that size; otherwise fits
    directly to the theoretical band probabilities.
    """
    print(n, mean, sd)  # progress output
    mean_total = 0
    sd_total = 0
    fixed_mean_total = 0
    fixed_sd_total = 0
    for _ in range(sample_size):
        if n is not None:
            # Draw a finite sample, bin it and convert counts to proportions.
            sample = lognorm.rvs(sd, scale=np.exp(mean), size=n)
            binned_sample = sort_sample(sample)
            binned_sample = {s: v / n for s, v in binned_sample.items()}

            #params = calculate_parameters.max_likelihood(binned_sample, sample.mean())
            params = calculate_parameters.max_likelihood(binned_sample)

        else:
            # No sample: use the theoretical probability of each size band,
            # with 0.00001 and 10**10 standing in for 0 and infinity.
            max_sizes = [0.00001, 5, 10, 20, 50, 100, 250, 10**10]
            titles = [
                '0-4', '5-9', '10-19', '20-49', '50-99', '100-249', '250+'
            ]

            binned_sample = {
                titles[i]:
                lognorm.cdf(max_sizes[i + 1], sd, scale=np.exp(mean)) -
                lognorm.cdf(max_sizes[i], sd, scale=np.exp(mean))
                for i in range(len(max_sizes) - 1)
            }

            # np.exp(mean + sd**2 / 2) is the mean of the lognormal distribution.
            params = calculate_parameters.max_likelihood(
                binned_sample, np.exp(mean + sd**2 / 2))
            #print(params)

        if params is None:
            continue
        recovered_mean, recovered_sd = params

        # Accumulate the error of the recovered parameters.
        mean_total += recovered_mean - mean
        sd_total += recovered_sd - sd

        #fixed_mean, fixed_sd = calculate_parameters.remove_bias(recovered_mean, recovered_sd)
        #fixed_mean_total += fixed_mean
        #fixed_sd_total += fixed_sd

    return (mean_total / sample_size, sd_total / sample_size,
            fixed_mean_total / sample_size, fixed_sd_total / sample_size)
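As a self-contained analogue (it does not call the project's calculate_parameters or sort_sample), the sketch below estimates the same kind of bias by repeatedly fitting scipy's lognorm.fit to raw, unbinned samples; it only illustrates the "average of (recovered minus true)" bookkeeping used above, not the binned maximum-likelihood fit itself.

import numpy as np
from scipy.stats import lognorm


def estimate_fit_bias(n, mean, sd, sample_size=100, seed=0):
    # Average error of parameters recovered from raw samples with lognorm.fit.
    rng = np.random.default_rng(seed)
    mean_err = 0.0
    sd_err = 0.0
    for _ in range(sample_size):
        sample = lognorm.rvs(sd, scale=np.exp(mean), size=n, random_state=rng)
        r_sd, _, r_scale = lognorm.fit(sample, floc=0)  # fix loc at zero
        mean_err += np.log(r_scale) - mean
        sd_err += r_sd - sd
    return mean_err / sample_size, sd_err / sample_size


print(estimate_fit_bias(500, mean=0.0, sd=1.0))  # illustrative values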
Example #3
def recovery_simulation(n, mean, sd):
    """Fit binned lognormal samples 1000 times, with and without the sample mean."""
    mean_with_dist_mean = []
    mean_without_dist_mean = []
    sd_with_dist_mean = []
    sd_without_dist_mean = []
    for _ in range(1000):
        sizes = lognorm.rvs(sd, scale=np.exp(mean), size=n)
        binned_sizes = analysis.sort_sample(sizes)
        parameters_without_mean = calculate_parameters.max_likelihood(
            binned_sizes)
        parameters_with_mean = calculate_parameters.max_likelihood(
            binned_sizes, sizes.mean())

        # Only keep runs where both fits succeeded.
        if parameters_without_mean is not None and parameters_with_mean is not None:
            mean_with_dist_mean.append(parameters_with_mean[0])
            sd_with_dist_mean.append(parameters_with_mean[1])
            mean_without_dist_mean.append(parameters_without_mean[0])
            sd_without_dist_mean.append(parameters_without_mean[1])

    return mean_with_dist_mean, sd_with_dist_mean, mean_without_dist_mean, sd_without_dist_mean
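A hedged usage sketch, assuming recovery_simulation above is in scope; the argument values are arbitrary test values, and np.std is just one way to compare how much supplying the sample mean tightens the recovered parameters.

m_with, sd_with, m_without, sd_without = recovery_simulation(n=17000, mean=0.5, sd=1.5)
print('spread of recovered means, with / without the sample mean:',
      np.std(m_with), np.std(m_without))
print('spread of recovered sds, with / without the sample mean:',
      np.std(sd_with), np.std(sd_without))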
def main(files):
    # Pair each size-distribution file with its employment file.
    # TODO: need to make this neater, the script takes two inputs
    files = [files[i:i + 2] for i in range(0, len(files), 2)]
    with open('2014_output.csv', 'w') as csvfile:
        writer = csv.writer(csvfile)
        for file in files:
            dist_data = calculate_parameters.read_file(file[0])
            employment = get_employment(file[1])
            ratios = {}
            for key, size_dist in dist_data.items():
                #print(size_dist)
                total = np.sum(list(size_dist.values()))
                if key not in employment:
                    print(key + ' missing', total)
                    continue
                # Fit a lognormal to the binned counts, using the mean firm size.
                params = calculate_parameters.max_likelihood(
                    size_dist, employment[key] / total)
                #params = calculate_parameters.max_likelihood(size_dist)
                if params is None:
                    continue
                mean, sd = params
                expected = expected_bands(mean, sd, list(size_dist))

                writer.writerow([key, mean, sd])
                for size_band, n in size_dist.items():
                    if size_band in ratios:
                        ratios[size_band]['x'].append(n / total)
                        ratios[size_band]['y'].append(expected[size_band])
                    else:
                        ratios[size_band] = {
                            'x': [n / total],
                            'y': [expected[size_band]]
                        }

            # Scatter actual vs predicted band proportions on log-log axes.
            plt.figure(0)
            plt.loglog()
            ax = plt.gca()
            ax.set_xlim([10**-4, 10**0])
            ax.set_ylim([10**-4, 10**0])
            for band, data in ratios.items():
                plt.scatter(data['x'], data['y'], label=band)
            plt.legend()

            plt.plot([0, 1], [0, 1])  # y = x reference line
            plt.xlabel('Actual proportion')
            plt.ylabel('Predicted proportion')
            plt.savefig('graphs/' + file[0][:-4] + '.png')  # strip the file extension
            plt.show()
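A minimal entry-point sketch, assuming the script is run with alternating size-distribution and employment file paths (which main then pairs up); the file names shown are placeholders, not the project's actual data files.

import sys

if __name__ == '__main__':
    # e.g. python script.py sizes_2014.csv employment_2014.csv
    main(sys.argv[1:])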
def main():

    bounds = [0, 5, 10, 20, 50, 100, 250, np.inf]
    titles = ['0-4', '5-9', '10-19', '20-49', '50-99', '100-249', '250+']

    X = {t: [] for t in titles}  # actual band proportions
    Y = {t: [] for t in titles}  # predicted from the fit given the sample mean
    Z = {t: [] for t in titles}  # predicted from the fit given the distribution mean

    # Sweep a grid of true lognormal parameters.
    for mean in np.linspace(-1, 1, num=10):
        for sd in np.linspace(0.5, 3, num=10):

            # Jitter the sample size around 17,000.
            size = int(17000 + norm.rvs(0, 2000))
            sample = lognorm.rvs(sd, scale=np.exp(mean), size=size)
            # Theoretical probability of each size band under the true parameters.
            binned_sample = {
                titles[i]: lognorm.cdf(bounds[i + 1], sd, scale=np.exp(mean)) -
                lognorm.cdf(bounds[i], sd, scale=np.exp(mean))
                for i in range(len(bounds) - 1)
            }
            # Fit using the true distribution mean, and using the sample mean.
            params_with_mean = calculate_parameters.max_likelihood(
                binned_sample, np.exp(mean + sd**2 / 2))
            params = calculate_parameters.max_likelihood(
                binned_sample, sample.mean())
            if params is None or params_with_mean is None:
                continue
            r_mean, r_sd = params
            r_with_mean_mean, r_with_mean_sd = params_with_mean

            # Reconstruct the bands from the recovered parameters: a binned
            # resample for the distribution-mean fit, and theoretical band
            # probabilities for the sample-mean fit.
            r_with_mean = lognorm.rvs(
                r_with_mean_sd, scale=np.exp(r_with_mean_mean), size=size)
            r_with_mean_binned_sample = analysis.sort_sample(r_with_mean)

            r_binned_sample = {
                titles[i]:
                lognorm.cdf(bounds[i + 1], r_sd, scale=np.exp(r_mean)) -
                lognorm.cdf(bounds[i], r_sd, scale=np.exp(r_mean))
                for i in range(len(bounds) - 1)
            }

            for t, p in binned_sample.items():
                X[t].append(p / np.sum(list(binned_sample.values())))
                Y[t].append(r_binned_sample[t] /
                            np.sum(list(r_binned_sample.values())))
                Z[t].append(
                    r_with_mean_binned_sample[t] /
                    np.sum(list(r_with_mean_binned_sample.values())))

    # Left panel: actual vs predicted proportions (distribution-mean fit).
    plt.figure(figsize=(16, 6))
    plt.subplot(121)
    plt.loglog()
    ax = plt.gca()
    ax.set_xlim([10**-5, 10**0])
    ax.set_ylim([10**-5, 10**0])
    for t in titles:
        plt.scatter(X[t], Z[t], label=t)
    plt.xlabel('Actual proportion')
    plt.ylabel('Predicted proportion')
    plt.legend()
    plt.plot([0, 1], [0, 1])  # y = x reference line

    # Right panel: actual vs predicted proportions (sample-mean fit),
    # one line per simulated parameter pair.
    plt.subplot(122)
    plt.loglog()
    ax = plt.gca()
    ax.set_xlim([10**-5, 10**0])
    ax.set_ylim([10**-5, 10**0])
    plt.xlabel('Actual proportion')
    plt.ylabel('Predicted proportion')
    for i in range(len(X['0-4'])):
        plt.plot([X[t][i] for t in titles], [Y[t][i] for t in titles])

    plt.plot([0, 1], [0, 1])  # y = x reference line
    plt.savefig('loglog_reconstruction.png')
    plt.show()
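And a minimal entry point for this example, assuming it takes no arguments, as its signature suggests:

if __name__ == '__main__':
    main()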