def test_uniform_repartition(self):
    """ When the population is uniformly distributed """
    # Under a uniform distribution every class is exactly at its
    # city-wide share, so the representation is 1 everywhere.
    expected = {unit: {cat: 1.0 for cat in (1, 2, 3)}
                for unit in ("A", "B", "C")}

    result = mb.representation(uniform_city())

    for unit in result:
        for cat in result[unit]:
            assert_almost_equal(result[unit][cat][0], expected[unit][cat])
def overrepresented_units(distribution, classes=None):
    r""" Find the areal units in which each class is over-represented

    We say that a class `\alpha` is overrepresented in that tract `t` if the
    representation `r_\alpha(t)` is such that

    .. math::
        r_\alpha(t) > 1 + 2.57 \sigma_\alpha(t)

    Parameters
    ----------

    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    classes: dictionary of lists
        When the original categories need to be aggregated into different
        classes.
        > {class: [categories belonging to this class]}
        This can be arbitrarily imposed, or computed with uncover_classes
        function of this package.

    Returns
    -------

    units: dictionary of lists
        Dictionary of classes, with the list of areal units where this class
        is overrepresented with 99% confidence.
        > {class: [list of areal units]}
    """
    # NOTE: the docstring is a raw string on purpose: sequences such as
    # `\alpha` and `\sigma` would otherwise be mangled by escape
    # processing (`\a` is the BEL character).

    # Regroup into classes if specified. Otherwise return categories indicated
    # in the data
    if not classes:
        classes = return_categories(distribution)

    ## Compute the representation of the different classes in all areal units
    # rep[au][cl] = (representation value, variance under the null model)
    rep = mb.representation(distribution, classes)

    ## Find the tracts where classes are overrepresented
    # 2.57 standard deviations above 1 corresponds to the 99% confidence
    # level of the null model.
    areal_units = {
        cl: [au for au in rep
             if rep[au][cl][0] > 1 + 2.57 * math.sqrt(rep[au][cl][1])]
        for cl in classes
    }

    return areal_units
def overrepresented_units(distribution, classes=None):
    r""" Find the areal units in which each class is over-represented

    We say that a class `\alpha` is overrepresented in that tract `t` if the
    representation `r_\alpha(t)` is such that

    .. math::
        r_\alpha(t) > 1 + 2.57 \sigma_\alpha(t)

    Parameters
    ----------

    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    classes: dictionary of lists
        When the original categories need to be aggregated into different
        classes.
        > {class: [categories belonging to this class]}
        This can be arbitrarily imposed, or computed with uncover_classes
        function of this package.

    Returns
    -------

    units: dictionary of lists
        Dictionary of classes, with the list of areal units where this class
        is overrepresented with 99% confidence.
        > {class: [list of areal units]}
    """
    # NOTE: raw docstring on purpose — `\alpha`, `\sigma` would otherwise
    # be corrupted by string escape processing (`\a` is the BEL character).

    # Regroup into classes if specified. Otherwise return categories indicated
    # in the data
    if not classes:
        classes = return_categories(distribution)

    ## Compute the representation of the different classes in all areal units
    # rep[au][cl] = (representation value, variance under the null model)
    rep = mb.representation(distribution, classes)

    ## Find the tracts where classes are overrepresented
    # A unit qualifies when its representation exceeds 1 by more than
    # 2.57 standard deviations (99% confidence).
    areal_units = {}
    for cl in classes:
        areal_units[cl] = [
            au for au in rep
            if rep[au][cl][0] > 1 + 2.57 * math.sqrt(rep[au][cl][1])
        ]

    return areal_units
def test_fake_city(self):
    """ Test on values computed by hand """
    r = mb.representation(fake_city())

    # Expected representation values, computed by hand
    r_answer = {"A": {1: 0, 2: 1.479, 3: 1.218},
                "B": {1: 1.9525, 2: 0.104, 3: 1.6705},
                "C": {1: 1.183, 2: 1.237, 3: 0.309}}
    # Expected variances, computed by hand
    var_answer = {"A": {1: 0.0979, 2: 0.0575, 3: 0.1151},
                  "B": {1: 0.1275, 2: 0.0750, 3: 0.1500},
                  "C": {1: 0.0814, 2: 0.0479, 3: 0.0958}}

    # Compare value and variance for every (unit, category) cell
    for unit in r:
        for cat in r[unit]:
            assert_almost_equal(r[unit][cat][0], r_answer[unit][cat],
                                places=3)
            assert_almost_equal(r[unit][cat][1], var_answer[unit][cat],
                                places=3)
def exposure(distribution, classes=None):
    r""" Compute the exposure between classes

    The exposure between two categories `\alpha` and `\beta` is defined as

    ..math::
        E_{\alpha \beta} = \frac{1}{N} \sum_{t=1}^{T} n(t) r_\alpha(t)
        r_\beta(t)

    where `r_\alpha(t)` is the representation of the class `\alpha` in the
    areal unit `t`, `n(t)` the total population of `t`, and `N` the total
    population in the considered system.

    The exposure of a class to itself `E_{\alpha \alpha}` measures the
    **isolation** of this class.

    The variance is computed on the null model which corresponds to the
    unsegregated configuration, that is when the spatial repartition of
    people of different income classes is no different from that that would
    be obtained if they scattered at random across the city.

    Parameters
    ----------

    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    classes: dictionary of lists
        When the original categories need to be aggregated into different
        classes.
        {class: [categories belonging to this class]}
        This can be arbitrarily imposed, or computed with uncover_classes
        function of this package.

    Returns
    -------

    exposure: nested dictionaries
        Matrix of exposures between categories.
        > {class_id0: {class_id1: (exposure_01, variance null model)}}
    """
    # NOTE: raw docstring on purpose — `\frac`, `\beta`, `\alpha` would
    # otherwise be corrupted by escape processing (`\f` is a form feed,
    # `\b` a backspace, `\a` the BEL character).

    ## Regroup into classes if specified.
    if classes:
        distribution = regroup_per_class(distribution, classes)
    else:
        classes = return_categories(distribution)

    ## Compute the total numbers per class and per areal unit
    N_unit, N_class, N_tot = compute_totals(distribution, classes)

    ## Compute representation for all areal unit
    representation = mb.representation(distribution)

    ## Compute the exposure matrix
    # Only half of the values are computed (the matrix is symmetric)
    exposure = collections.defaultdict(dict)
    for alpha, beta in itertools.combinations_with_replacement(classes, 2):
        exposure[alpha][beta] = (
            pair_exposure(representation, N_unit, N_tot, alpha, beta),
            pair_variance(representation, N_unit, N_class, N_tot,
                          alpha, beta))

    # Symmetrize the output
    # Iterate over snapshots of the keys: the inner dictionaries gain the
    # missing symmetric entries while we walk the matrix, which would be
    # fragile with live key iterators.
    for c0 in list(exposure):
        for c1 in list(exposure[c0]):
            if c0 not in exposure[c1]:
                exposure[c1][c0] = exposure[c0][c1]

    return exposure
def test_empty_unit(self):
    """ When an areal unit is empty """
    r = mb.representation(empty_unit_city())

    # The representation is undefined (NaN) for every category of the
    # empty unit "A".
    for cat in r["A"]:
        value = r["A"][cat][0]
        assert_true(math.isnan(value))
def test_missing_class(self):
    """ When the zone is empty of a given category """
    r = mb.representation(missing_class_city())

    # Category 3 is absent from the whole city: its representation is
    # undefined (NaN) in every areal unit.
    for unit in r:
        assert_true(math.isnan(r[unit][3][0]))
def test_no_one(self):
    """ When a unit is empty of a given category """
    city = fake_city()
    representation = mb.representation(city)

    # Unit "A" holds nobody from category 1, so its representation is 0
    assert_almost_equal(representation["A"][1][0], 0)
# Extract and save the data # for j, city in enumerate(msa): print "Extract the representation of categories for %s (%s/%s)" % ( city, j + 1, len(msa)) ## Import category composition households = {} with open('data/income/msa/%s/income.csv' % city, 'r') as source: reader = csv.reader(source, delimiter='\t') reader.next() for rows in reader: households[rows[0]] = {c: int(h) for c, h in enumerate(rows[1:])} ## Compute representation and variance rep = mb.representation(households) ## Save the values with open('extr/representation/categories/msa/%s_values.csv' % city, 'w') as output: output.write('BLOCKGROUP FIP\n') for bg in rep: output.write(str(bg)) for cat in sorted(rep[bg].iterkeys()): val, var = rep[bg][cat] output.write('\t%s' % val) output.write('\n') ## Save the variance with open('extr/representation/categories/msa/%s_variance.csv' % city, 'w') as output:
# NOTE(review): script fragment — opens inside a CSV-reading loop whose
# header (`for rows in reader:`) is not shown in this chunk; `income`,
# `distribution`, `households`, `msa`, `classes`, `math` and `mb` come
# from earlier in the file.
categories = range(len(rows[1:]))
income[rows[0]] = {c: int(h) for c, h in enumerate(rows[1:])}

# Aggregate all
# City-wide totals per category, summed over all blockgroups
distribution[city] = {
    c: sum([income[bg][c] for bg in income])
    for c in categories
}
households[city] = sum(distribution[city].values())

#
# Get the over and under representation figures
#

## Country-wide representation
# representation[city][cl] = (value, variance under the null model)
representation = mb.representation(distribution, classes)

## Representation in order of population
# One 0/1 flag per city and class, cities sorted by total households;
# 2.57 sigma is the 99% confidence band around r = 1.
over = {cl: [] for cl in classes}
under = {cl: [] for cl in classes}
normal = {cl: [] for cl in classes}
for city in sorted(msa, key=lambda x: households[x]):
    for cl in classes:
        delta = representation[city][cl][0] - 1
        sigma = math.sqrt(representation[city][cl][1])
        if abs(delta) <= 2.57 * sigma:
            # Within the confidence band: neither over- nor under-represented
            over[cl].append(0)
            under[cl].append(0)
            normal[cl].append(1)
        else:
            # NOTE(review): fragment truncated inside this branch
            if delta < 0:
# NOTE(review): script fragment — opens on the tail of the `income` dict
# literal whose 'low' entry is in the previous chunk; `density`,
# `households`, `categories`, `classes`, `city`, `representation_high`,
# `representation_low` and `mb` come from earlier in the file. Python 2
# (print statement, iteritems).
'high': {c: 0 for c in categories}}

# Walk blockgroups from densest to sparsest, moving each one from the
# 'low' pool to the 'high' pool and recording the class representation
# of both pools at every step.
for j, (bg, rho) in enumerate(
        sorted(density.iteritems(), key=lambda x: density[x[0]],
               reverse=1)):
    print "%s/%s" % (j + 1, len(density))

    # Update the numbers
    for c in categories:
        income['high'][c] += households[bg][c]
        income['low'][c] -= households[bg][c]

    # Compute the representation
    representation = mb.representation(income, classes)
    representation_high.append(representation['high'])
    representation_low.append(representation['low'])

#
# Save the data
#

## High-density neighbourhoods
with open(
        'extr/representation/classes/density_percolation/%s_high-density.csv'
        % city, 'w') as output:
    # Header: one representation/variance column pair per class
    output.write('Density thresholds (/km^2)')
    for cl in sorted(classes):
        output.write('\tRepresentation %s\tVariance %s' % (cl, cl))
## Import blockgroup income distribution households = {} with open('data/income/msa/%s/income.csv' % city, 'r') as source: reader = csv.reader(source, delimiter='\t') reader.next() for rows in reader: categories = range(len(rows[1:])) households[rows[0]] = {c: int(h) for c, h in enumerate(rows[1:])} H_block = {bg: sum(households[bg].values()) for bg in households} ## Compute population density density = {bg: H_block[bg] / area[bg] for bg in H_block} ## Compute representation representation = mb.representation(households, classes) # # Save the data # with open('extr/representation/classes/density/%s_density.csv' % city, 'w') as output: output.write('Blockgroup FIP\tDensity (/km^2)') for cl in sorted(classes): output.write('\tRepresentation %s\tVariance %s' % (cl, cl)) output.write('\n') for bg in sorted(density, key=lambda x: density[x], reverse=1): rho = density[bg] output.write(str(bg) + '\t' + str(rho)) for cl in sorted(classes):
def exposure(distribution, classes=None):
    r""" Compute the exposure between classes

    The exposure between two categories `\alpha` and `\beta` is defined as

    ..math::
        E_{\alpha \beta} = \frac{1}{N} \sum_{t=1}^{T} n(t) r_\alpha(t)
        r_\beta(t)

    where `r_\alpha(t)` is the representation of the class `\alpha` in the
    areal unit `t`, `n(t)` the total population of `t`, and `N` the total
    population in the considered system.

    The exposure of a class to itself `E_{\alpha \alpha}` measures the
    **isolation** of this class.

    The variance is computed on the null model which corresponds to the
    unsegregated configuration, that is when the spatial repartition of
    people of different income classes is no different from that that would
    be obtained if they scattered at random across the city.

    Parameters
    ----------

    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    classes: dictionary of lists
        When the original categories need to be aggregated into different
        classes.
        {class: [categories belonging to this class]}
        This can be arbitrarily imposed, or computed with uncover_classes
        function of this package.

    Returns
    -------

    exposure: nested dictionaries
        Matrix of exposures between categories.
        > {class_id0: {class_id1: (exposure_01, variance null model)}}
    """
    # NOTE(review): docstring made raw so `\frac`, `\beta`, `\alpha` are
    # not corrupted by escape processing. Python 2 code (iterkeys).

    ## Regroup into classes if specified.
    if classes:
        distribution = regroup_per_class(distribution, classes)
    else:
        classes = return_categories(distribution)

    ## Compute the total numbers per class and per areal unit
    N_unit, N_class, N_tot = compute_totals(distribution, classes)

    ## Compute representation for all areal unit
    representation = mb.representation(distribution)

    ## Compute the exposure matrix
    # Only half of the values are computed (the matrix is symmetric)
    exposure = collections.defaultdict(dict)
    for alpha, beta in itertools.combinations_with_replacement(classes, 2):
        exposure[alpha][beta] = (pair_exposure(representation, N_unit,
                                               N_tot, alpha, beta),
                                 pair_variance(representation, N_unit,
                                               N_class, N_tot, alpha, beta))

    # Symmetrize the output
    # Only missing symmetric entries are added; the dict being iterated
    # (exposure[c0]) is never the one written to, because the diagonal
    # entries (c0, c0) already exist from the combinations above.
    for c0 in exposure.iterkeys():
        for c1 in exposure[c0].iterkeys():
            if c0 not in exposure[c1]:
                exposure[c1][c0] = exposure[c0][c1]

    return exposure
# NOTE(review): script fragment — `city`, `area`, `classes`, `csv` and
# `mb` come from earlier in the file. Python 2 (reader.next()).
households = {}
with open('data/income/msa/%s/income.csv' % city, 'r') as source:
    reader = csv.reader(source, delimiter='\t')
    reader.next()  # skip the header row
    for rows in reader:
        categories = range(len(rows[1:]))
        households[rows[0]] = {c: int(h) for c, h in enumerate(rows[1:])}
# Total households per blockgroup
H_block = {bg: sum(households[bg].values()) for bg in households}

## Compute population density
# households per unit area — presumably /km^2, per the output header;
# NOTE(review): Python 2 `/` is integer division if area[bg] is int — verify
density = {bg: H_block[bg] / area[bg] for bg in H_block}

## Compute representation
representation = mb.representation(households, classes)

#
# Save the data
#
with open('extr/representation/classes/density/%s_density.csv' % city,
          'w') as output:
    # Header: one representation/variance column pair per class
    output.write('Blockgroup FIP\tDensity (/km^2)')
    for cl in sorted(classes):
        output.write('\tRepresentation %s\tVariance %s' % (cl, cl))
    output.write('\n')
    # One row per blockgroup, densest first
    for bg in sorted(density, key=lambda x: density[x], reverse=1):
        rho = density[bg]
        # NOTE(review): fragment truncated — the per-class columns follow
        # beyond this chunk.
        output.write(str(bg) + '\t' + str(rho))
# NOTE(review): script fragment — `df_state`, `dict_by_state`, `state`,
# `pd` (pandas) and `mb` come from earlier in the file. Python 2 (print
# statements).
# Index the state dataframe by GISJOIN and store it as a nested dict:
# {gisjoin: {column: value}}
df_state_indexed = df_state.set_index(['GISJOIN'])
dict_state = df_state_indexed.to_dict(orient='index')
dict_by_state[state] = dict_state
#dict_by_state['01']['G0101250011401']['ADKXE008']

print 'Ended filling the dict by state'

############################
#4. Calculate representation values for data based on null model for each state.
# and work out a population weighted z_score over different classes for each gisjoin
############################
print 'Starting caluclation of representation values based on null model for each state'

# One representation dict per state:
# {gisjoin: {class: (value, variance)}} per mb.representation
r_dict_by_state = {}
for state in dict_by_state.keys():
    r_dict_state = mb.representation(dict_by_state[state])
    r_dict_by_state[state] = r_dict_state

print 'Ended calculation of representation values based on null model for each state'
#r_dict_by_state['01']['G0101250011401']['ADKXE008']

#####################
#4.1 Convert r_dicts to pandas dataframe
#####################
def convert_r_dict_to_df(r_dict):
    # Flatten {area: {class: (r, var)}} into (area, class, tuple) rows.
    # NOTE(review): fragment truncated — the pd.DataFrame call continues
    # beyond this chunk.
    r_df = pd.DataFrame([
        (area_id, class_id, tuple_of_r_outcomes)
        for area_id, dict_of_classes_and_rvalues in r_dict.items()
        for class_id, tuple_of_r_outcomes in dict_of_classes_and_rvalues.items()
# NOTE(review): script fragment — `households`, `categories`, `density`,
# `classes`, `city`, `representation_high`, `representation_low` and `mb`
# come from earlier in the file. Python 2 (print statement, iteritems).
# Running split of the population: start with everybody in 'low' density,
# nobody in 'high'.
income = {'low': {c: sum([households[bg][c] for bg in households])
                  for c in categories},
          'high': {c: 0 for c in categories}}

# Walk blockgroups from densest to sparsest, moving each one from the
# 'low' pool to the 'high' pool and recording the class representation
# of both pools at every step.
for j, (bg, rho) in enumerate(sorted(density.iteritems(),
                                     key=lambda x: density[x[0]],
                                     reverse=1)):
    print "%s/%s" % (j + 1, len(density))

    # Update the numbers
    for c in categories:
        income['high'][c] += households[bg][c]
        income['low'][c] -= households[bg][c]

    # Compute the representation
    representation = mb.representation(income, classes)
    representation_high.append(representation['high'])
    representation_low.append(representation['low'])

#
# Save the data
#

## High-density neighbourhoods
with open('extr/representation/classes/density_percolation/%s_high-density.csv'
          % city, 'w') as output:
    # Header: one representation/variance column pair per class
    output.write('Density thresholds (/km^2)')
    for cl in sorted(classes):
        output.write('\tRepresentation %s\tVariance %s' % (cl, cl))
# NOTE(review): script fragment — opens inside a CSV-reading loop whose
# header (`for rows in reader:`) is not shown in this chunk; `income`,
# `distribution`, `households`, `msa`, `classes`, `math` and `mb` come
# from earlier in the file.
categories = range(len(rows[1:]))
income[rows[0]] = {c: int(h) for c, h in enumerate(rows[1:])}

# Aggregate all
# City-wide totals per category, summed over all blockgroups
distribution[city] = {c: sum([income[bg][c] for bg in income])
                      for c in categories}
households[city] = sum(distribution[city].values())

#
# Get the over and under representation figures
#

## Country-wide representation
# representation[city][cl] = (value, variance under the null model)
representation = mb.representation(distribution, classes)

## Representation in order of population
# One 0/1 flag per city and class, cities sorted by total households;
# 2.57 sigma is the 99% confidence band around r = 1.
over = {cl: [] for cl in classes}
under = {cl: [] for cl in classes}
normal = {cl: [] for cl in classes}
for city in sorted(msa, key=lambda x: households[x]):
    for cl in classes:
        delta = representation[city][cl][0] - 1
        sigma = math.sqrt(representation[city][cl][1])
        if abs(delta) <= 2.57 * sigma:
            # Within the confidence band: neither over- nor under-represented
            over[cl].append(0)
            under[cl].append(0)
            normal[cl].append(1)
        else:
            # NOTE(review): fragment truncated inside this branch
            if delta < 0:
# NOTE(review): script fragment — `msa`, `csv` and `mb` are defined
# earlier in the file. Python 2 syntax (print statement, reader.next(),
# iterkeys).
for j, city in enumerate(msa):
    print "Extract the representation of categories for %s (%s/%s)" % (
        city, j + 1, len(msa))

    ## Import category composition
    # One row per blockgroup: first column is the blockgroup FIP, the
    # remaining columns are household counts per income category.
    households = {}
    with open('data/income/msa/%s/income.csv' % city, 'r') as source:
        reader = csv.reader(source, delimiter='\t')
        reader.next()  # skip the header row
        for rows in reader:
            households[rows[0]] = {c: int(h) for c, h in enumerate(rows[1:])}

    ## Compute representation and variance
    # rep[bg][cat] = (value, variance) — as returned by mb.representation
    rep = mb.representation(households)

    ## Save the values
    with open('extr/representation/categories/msa/%s_values.csv' % city,
              'w') as output:
        output.write('BLOCKGROUP FIP\n')
        for bg in rep:
            output.write(str(bg))
            for cat in sorted(rep[bg].iterkeys()):
                val, var = rep[bg][cat]
                output.write('\t%s' % val)
            output.write('\n')

    ## Save the variance
    with open('extr/representation/categories/msa/%s_variance.csv' % city,
              'w') as output:
        output.write('BLOCKGROUP FIP\n')
        # NOTE(review): fragment truncated inside this loop
        for bg in rep: