Пример #1
0
 def test_uniform_repartition(self):
     """ When the population is uniformly distributed """
     city = uniform_city()
     result = mb.representation(city)
     # Uniform distribution: every class is exactly represented everywhere.
     expected = {unit: {cls: 1.0 for cls in (1, 2, 3)}
                 for unit in ("A", "B", "C")}
     for unit in result:
         for cls in result[unit]:
             assert_almost_equal(result[unit][cls][0],
                                 expected[unit][cls])
Пример #2
0
def overrepresented_units(distribution, classes=None):
    r""" Find the areal units in which each class is over-represented

    We say that a class `\alpha` is overrepresented in the tract `t` if the
    representation `r_\alpha(t)` is such that

    .. math::
        r_\alpha(t) > 1 + 2.57 \sigma_\alpha(t)

    Parameters
    ----------

    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    classes: dictionary of lists
        When the original categories need to be aggregated into different
        classes.
        > {class: [categories belonging to this class]}
        This can be arbitrarily imposed, or computed with uncover_classes
        function of this package.

    Returns
    -------

    units: dictionary of lists
        Dictionary of classes, with the list of areal units where this class is
        overrepresented with 99% confidence.
        > {class:[list of areal units]}
    """
    # Regroup into classes if specified. Otherwise use the categories
    # indicated in the data.
    if not classes:
        classes = return_categories(distribution)

    ## Compute the representation of the different classes in all areal units
    # rep[au][cl] is a (representation, variance) pair.
    rep = mb.representation(distribution, classes)

    ## Keep the tracts where the representation exceeds the 99% confidence
    ## threshold of the null model (2.57 standard deviations above 1).
    areal_units = {
        cl: [
            au for au in rep
            if rep[au][cl][0] > 1 + 2.57 * math.sqrt(rep[au][cl][1])
        ]
        for cl in classes
    }

    return areal_units
Пример #3
0
def overrepresented_units(distribution, classes=None):
    r""" Find the areal units in which each class is over-represented

    We say that a class `\alpha` is overrepresented in the tract `t` if the
    representation `r_\alpha(t)` is such that

    .. math::
        r_\alpha(t) > 1 + 2.57 \sigma_\alpha(t)

    Parameters
    ----------

    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    classes: dictionary of lists
        When the original categories need to be aggregated into different
        classes.
        > {class: [categories belonging to this class]}
        This can be arbitrarily imposed, or computed with uncover_classes
        function of this package.

    Returns
    -------

    units: dictionary of lists
        Dictionary of classes, with the list of areal units where this class is
        overrepresented with 99% confidence.
        > {class:[list of areal units]}
    """
    # Regroup into classes if specified. Otherwise use the categories
    # indicated in the data.
    if not classes:
        classes = return_categories(distribution)

    ## Compute the representation of the different classes in all areal units
    # rep[au][cl] is a (representation, variance) pair.
    rep = mb.representation(distribution, classes)

    ## Keep the tracts where the representation exceeds the 99% confidence
    ## threshold of the null model (2.57 standard deviations above 1).
    areal_units = {cl: [au for au in rep
                        if rep[au][cl][0] > 1 + 2.57 * math.sqrt(rep[au][cl][1])]
                   for cl in classes}

    return areal_units
Пример #4
0
    def test_fake_city(self):
        """ Test on values computed by hand """
        city = fake_city()
        result = mb.representation(city)

        # Hand-computed representation values and null-model variances
        expected_rep = {"A": {1: 0, 2: 1.479, 3: 1.218},
                        "B": {1: 1.9525, 2: 0.104, 3: 1.6705},
                        "C": {1: 1.183, 2: 1.237, 3: 0.309}}
        expected_var = {"A": {1: 0.0979, 2: 0.0575, 3: 0.1151},
                        "B": {1: 0.1275, 2: 0.0750, 3: 0.1500},
                        "C": {1: 0.0814, 2: 0.0479, 3: 0.0958}}

        # Check both members of each (representation, variance) pair
        for unit in result:
            for cat in result[unit]:
                rep_value, variance = result[unit][cat]
                assert_almost_equal(rep_value,
                                    expected_rep[unit][cat],
                                    places=3)
                assert_almost_equal(variance,
                                    expected_var[unit][cat],
                                    places=3)
Пример #5
0
def exposure(distribution, classes=None):
    r""" Compute the exposure between classes

    The exposure between two categories `\alpha` and `\beta` is defined as

    ..math::
        E_{\alpha \beta} = \frac{1}{N} \sum_{t=1}^{T} n(t) r_\alpha(t)
        r_\beta(t)

    where `r_\alpha(t)` is the representation of the class `\alpha` in the areal
    unit `t`, `n(t)` the total population of `t`, and `N` the total population
    in the considered system.

    The exposure of a class to itself `E_{\alpha \alpha}` measures the
    **isolation** of this class.

    The variance is computed on the null model which corresponds to the
    unsegregated configuration, that is when the spatial repartition of people
    of different income classes is no different from the one that would be
    obtained if they scattered at random across the city.

    Parameters
    ----------

    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    classes: dictionary of lists
        When the original categories need to be aggregated into different
        classes. {class: [categories belonging to this class]}
        This can be arbitrarily imposed, or computed with uncover_classes
        function of this package.

    Returns
    -------

    exposure: nested dictionaries
        Matrix of exposures between categories.
        > {class_id0: {class_id1: (exposure_01, variance null model)}}
    """
    ## Regroup into classes if specified, otherwise use the categories
    ## found in the data.
    if classes:
        distribution = regroup_per_class(distribution, classes)
    else:
        classes = return_categories(distribution)

    ## Compute the total numbers per class and per areal unit
    N_unit, N_class, N_tot = compute_totals(distribution, classes)

    ## Compute representation for all areal units
    representation = mb.representation(distribution)

    ## Compute the exposure matrix
    # Only half of the values are computed (the matrix is symmetric)
    exposure = collections.defaultdict(dict)
    for alpha, beta in itertools.combinations_with_replacement(classes, 2):
        exposure[alpha][beta] = (
            pair_exposure(representation, N_unit, N_tot, alpha, beta),
            pair_variance(representation, N_unit, N_class, N_tot,
                          alpha, beta))

    # Symmetrize the output. Iterate over snapshots of the keys so the
    # dictionaries can be safely extended while looping.
    for c0 in list(exposure.keys()):
        for c1 in list(exposure[c0].keys()):
            if c0 not in exposure[c1]:
                exposure[c1][c0] = exposure[c0][c1]

    return exposure
Пример #6
0
 def test_empty_unit(self):
     """ When an areal unit is empty """
     city = empty_unit_city()
     representation = mb.representation(city)
     # An empty unit has no defined representation for any category.
     for category in representation["A"]:
         value = representation["A"][category][0]
         assert_true(math.isnan(value))
Пример #7
0
 def test_missing_class(self):
     """ When the zone is empty of a given category """
     city = missing_class_city()
     representation = mb.representation(city)
     # Category 3 is absent city-wide: its representation is undefined
     # in every areal unit.
     for unit in representation:
         assert_true(math.isnan(representation[unit][3][0]))
Пример #8
0
 def test_no_one(self):
     """ When a unit is empty of a given category """
     representation = mb.representation(fake_city())
     # Unit "A" holds nobody of category 1: representation must be zero.
     assert_almost_equal(representation["A"][1][0], 0)
Пример #9
0
# Extract and save the data
#
# NOTE(review): Python 2 code (print statement, reader.next()); the final
# `with` block is truncated in this excerpt.
for j, city in enumerate(msa):
    print "Extract the representation of categories for %s (%s/%s)" % (
        city, j + 1, len(msa))

    ## Import category composition
    # households: {blockgroup_id: {category_index: household_count}}
    households = {}
    with open('data/income/msa/%s/income.csv' % city, 'r') as source:
        reader = csv.reader(source, delimiter='\t')
        reader.next()  # skip the header line
        for rows in reader:
            households[rows[0]] = {c: int(h) for c, h in enumerate(rows[1:])}

    ## Compute representation and variance
    # rep[bg][cat] is a (value, variance) pair
    rep = mb.representation(households)

    ## Save the values
    with open('extr/representation/categories/msa/%s_values.csv' % city,
              'w') as output:
        output.write('BLOCKGROUP FIP\n')
        for bg in rep:
            output.write(str(bg))
            # one tab-separated representation value per category
            for cat in sorted(rep[bg].iterkeys()):
                val, var = rep[bg][cat]
                output.write('\t%s' % val)
            output.write('\n')

    ## Save the variance
    with open('extr/representation/categories/msa/%s_variance.csv' % city,
              'w') as output:
Пример #10
0
            # NOTE(review): truncated excerpt — the enclosing reader loop is
            # not visible, and the fragment cuts off mid-`if` at the end.
            categories = range(len(rows[1:]))
            income[rows[0]] = {c: int(h) for c, h in enumerate(rows[1:])}

    # Aggregate all
    # City-level totals per category, summed over all blockgroups
    distribution[city] = {
        c: sum([income[bg][c] for bg in income])
        for c in categories
    }
    households[city] = sum(distribution[city].values())

#
# Get the over and under representation figures
#

## Country-wide representation
representation = mb.representation(distribution, classes)

## Representation in order of population
# For each class, one 0/1 flag per city, cities sorted by total households.
over = {cl: [] for cl in classes}
under = {cl: [] for cl in classes}
normal = {cl: [] for cl in classes}
for city in sorted(msa, key=lambda x: households[x]):
    for cl in classes:
        # deviation of the representation from 1 and its standard error
        delta = representation[city][cl][0] - 1
        sigma = math.sqrt(representation[city][cl][1])
        if abs(delta) <= 2.57 * sigma:
            # within the 99% confidence band: neither over nor under
            over[cl].append(0)
            under[cl].append(0)
            normal[cl].append(1)
        else:
            if delta < 0:
        # NOTE(review): truncated excerpt — opens mid-dict-literal (the
        # `income` initialisation) and is cut off after the header loop.
        'high': {c: 0
                 for c in categories}
    }

    # Sweep blockgroups from densest to sparsest, moving each one from the
    # 'low' bucket to the 'high' bucket and recording the representation.
    for j, (bg, rho) in enumerate(
            sorted(density.iteritems(), key=lambda x: density[x[0]],
                   reverse=1)):
        print "%s/%s" % (j + 1, len(density))

        # Update the numbers
        for c in categories:
            income['high'][c] += households[bg][c]
            income['low'][c] -= households[bg][c]

        # Compute the representation
        representation = mb.representation(income, classes)
        representation_high.append(representation['high'])
        representation_low.append(representation['low'])

    #
    # Save the data
    #

    ## High-density neighbourhoods
    with open(
            'extr/representation/classes/density_percolation/%s_high-density.csv'
            % city, 'w') as output:
        output.write('Density thresholds (/km^2)')
        # one (representation, variance) column pair per class
        for cl in sorted(classes):
            output.write('\tRepresentation %s\tVariance %s' % (cl, cl))
    ## Import blockgroup income distribution
    # NOTE(review): truncated excerpt — the enclosing per-city loop is not
    # visible, and the final write loop is cut at the end.
    households = {}
    with open('data/income/msa/%s/income.csv' % city, 'r') as source:
        reader = csv.reader(source, delimiter='\t')
        reader.next()  # skip the header line
        for rows in reader:
            categories = range(len(rows[1:]))
            households[rows[0]] = {c: int(h) for c, h in enumerate(rows[1:])}

    # Total number of households per blockgroup
    H_block = {bg: sum(households[bg].values()) for bg in households}

    ## Compute population density
    # assumes area[bg] is in km^2 — TODO confirm against the loader
    density = {bg: H_block[bg] / area[bg] for bg in H_block}

    ## Compute representation
    representation = mb.representation(households, classes)

    #
    # Save the data
    #
    with open('extr/representation/classes/density/%s_density.csv' % city,
              'w') as output:
        output.write('Blockgroup FIP\tDensity (/km^2)')
        for cl in sorted(classes):
            output.write('\tRepresentation %s\tVariance %s' % (cl, cl))

        output.write('\n')
        # Rows sorted from densest to sparsest blockgroup
        for bg in sorted(density, key=lambda x: density[x], reverse=1):
            rho = density[bg]
            output.write(str(bg) + '\t' + str(rho))
            for cl in sorted(classes):
Пример #13
0
def exposure(distribution, classes=None):
    r""" Compute the exposure between classes

    The exposure between two categories `\alpha` and `\beta` is defined as

    ..math::
        E_{\alpha \beta} = \frac{1}{N} \sum_{t=1}^{T} n(t) r_\alpha(t)
        r_\beta(t)

    where `r_\alpha(t)` is the representation of the class `\alpha` in the areal
    unit `t`, `n(t)` the total population of `t`, and `N` the total population
    in the considered system.

    The exposure of a class to itself `E_{\alpha \alpha}` measures the
    **isolation** of this class.

    The variance is computed on the null model which corresponds to the
    unsegregated configuration, that is when the spatial repartition of people
    of different income classes is no different from the one that would be
    obtained if they scattered at random across the city.

    Parameters
    ----------

    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    classes: dictionary of lists
        When the original categories need to be aggregated into different
        classes. {class: [categories belonging to this class]}
        This can be arbitrarily imposed, or computed with uncover_classes
        function of this package.

    Returns
    -------

    exposure: nested dictionaries
        Matrix of exposures between categories.
        > {class_id0: {class_id1: (exposure_01, variance null model)}}
    """
    ## Regroup into classes if specified, otherwise use the categories
    ## found in the data.
    if classes:
        distribution = regroup_per_class(distribution, classes)
    else:
        classes = return_categories(distribution)

    ## Compute the total numbers per class and per areal unit
    N_unit, N_class, N_tot = compute_totals(distribution, classes)

    ## Compute representation for all areal units
    representation = mb.representation(distribution)

    ## Compute the exposure matrix
    # Only half of the values are computed (the matrix is symmetric)
    exposure = collections.defaultdict(dict)
    for alpha, beta in itertools.combinations_with_replacement(classes, 2):
        exposure[alpha][beta] = (
            pair_exposure(representation, N_unit, N_tot, alpha, beta),
            pair_variance(representation, N_unit, N_class, N_tot,
                          alpha, beta))

    # Symmetrize the output. Iterate over snapshots of the keys so the
    # dictionaries can be safely extended while looping (iterkeys() would
    # risk a RuntimeError on mutation).
    for c0 in list(exposure.keys()):
        for c1 in list(exposure[c0].keys()):
            if c0 not in exposure[c1]:
                exposure[c1][c0] = exposure[c0][c1]

    return exposure
    # NOTE(review): truncated excerpt glued after `return exposure` — the
    # enclosing per-city loop is not visible and the write loop is cut off.
    households = {}
    with open('data/income/msa/%s/income.csv'%city, 'r') as source:
        reader = csv.reader(source, delimiter='\t')
        reader.next()  # skip the header line
        for rows in reader:
            categories = range(len(rows[1:]))
            households[rows[0]] = {c:int(h) for c,h in enumerate(rows[1:])}

    # Total number of households per blockgroup
    H_block = {bg:sum(households[bg].values()) for bg in households}


    ## Compute population density
    # assumes area[bg] is in km^2 — TODO confirm against the loader
    density = {bg:H_block[bg] / area[bg] for bg in H_block}

    ## Compute representation
    representation = mb.representation(households, classes)


    #
    # Save the data
    #
    with open('extr/representation/classes/density/%s_density.csv'%city,
            'w') as output:
        output.write('Blockgroup FIP\tDensity (/km^2)')
        for cl in sorted(classes):
            output.write('\tRepresentation %s\tVariance %s'%(cl,cl))

        output.write('\n')
        # Rows sorted from densest to sparsest blockgroup
        for bg in sorted(density, key=lambda x:density[x], reverse=1):
            rho = density[bg]
            output.write(str(bg)+'\t'+str(rho))
    # NOTE(review): truncated excerpt — starts mid-loop over states, and the
    # list comprehension in convert_r_dict_to_df is cut at the end.
    df_state_indexed = df_state.set_index(['GISJOIN'])
    dict_state = df_state_indexed.to_dict(orient='index')
    dict_by_state[state] = dict_state

#dict_by_state['01']['G0101250011401']['ADKXE008']
print 'Ended filling the dict by state'

############################
#4.  Calculate representation values for data based on null model for each state.
# and work out a population weighted z_score over different classes for each gisjoin
############################

print 'Starting caluclation of representation values based on null model for each state'
r_dict_by_state = {}
# One representation dict per state, keyed by state FIPS code
for state in dict_by_state.keys():
    r_dict_state = mb.representation(dict_by_state[state])
    r_dict_by_state[state] = r_dict_state
print 'Ended calculation of representation values based on null model for each state'

#r_dict_by_state['01']['G0101250011401']['ADKXE008']

#####################
#4.1 Convert r_dicts to pandas dataframe
#####################


def convert_r_dict_to_df(r_dict):
    # Flatten {area: {class: (r, var)}} into one row per (area, class) pair
    r_df = pd.DataFrame([
        (area_id, class_id, tuple_of_r_outcomes)
        for area_id, dict_of_classes_and_rvalues in r_dict.items() for
        class_id, tuple_of_r_outcomes in dict_of_classes_and_rvalues.items()
    # NOTE(review): truncated excerpt glued mid-statement onto the previous
    # example; the header-writing `with` block is cut off at the end.
    # Start with everyone in the 'low' bucket, nobody in 'high'.
    income = {'low': {c:sum([households[bg][c] for bg in households])
                for c in categories},
            'high': {c:0 for c in categories}}

    # Move blockgroups, densest first, from 'low' to 'high' and record the
    # representation after each move.
    for j, (bg, rho) in enumerate(sorted(density.iteritems(),
                                    key=lambda x: density[x[0]],
                                    reverse=1)):
        print "%s/%s"%(j+1, len(density)) 

        # Update the numbers
        for c in categories:
            income['high'][c] += households[bg][c]
            income['low'][c] -= households[bg][c]

        # Compute the representation
        representation = mb.representation(income, classes)
        representation_high.append(representation['high'])
        representation_low.append(representation['low'])


    #
    # Save the data
    #

    ## High-density neighbourhoods
    with open('extr/representation/classes/density_percolation/%s_high-density.csv'%city,
            'w') as output:
        output.write('Density thresholds (/km^2)')
        # one (representation, variance) column pair per class
        for cl in sorted(classes):
            output.write('\tRepresentation %s\tVariance %s'%(cl,cl))
            # NOTE(review): truncated excerpt — starts mid-reader-loop and
            # cuts off at the final `if delta < 0:`.
            categories = range(len(rows[1:]))
            income[rows[0]] = {c:int(h) for c,h in enumerate(rows[1:])}
    
    # Aggregate all
    # City-level totals per category, summed over all blockgroups
    distribution[city] = {c:sum([income[bg][c] for bg in income]) for c in categories}
    households[city] = sum(distribution[city].values())




#
# Get the over and under representation figures 
#

## Country-wide representation
representation = mb.representation(distribution, classes)

## Representation in order of population
# For each class, one 0/1 flag per city; cities sorted by total households.
over = {cl:[] for cl in classes}
under = {cl:[] for cl in classes}
normal = {cl:[] for cl in classes}
for city in sorted(msa, key=lambda x: households[x]):
    for cl in classes:
        # deviation of the representation from 1 and its standard error
        delta = representation[city][cl][0] - 1
        sigma = math.sqrt(representation[city][cl][1]) 
        if abs(delta) <= 2.57*sigma:
            # within the 99% confidence band: neither over nor under
            over[cl].append(0)
            under[cl].append(0)
            normal[cl].append(1)
        else:
            if delta < 0:
# NOTE(review): Python 2 excerpt; cut off just after opening the variance
# output file.
for j, city in enumerate(msa):
    print "Extract the representation of categories for %s (%s/%s)"%(city,
                                                                    j+1,
                                                                    len(msa))


    ## Import category composition
    # households: {blockgroup_id: {category_index: household_count}}
    households = {}
    with open('data/income/msa/%s/income.csv'%city, 'r') as source:
        reader = csv.reader(source, delimiter='\t')
        reader.next()  # skip the header line
        for rows in reader:
            households[rows[0]] = {c:int(h) for c,h in enumerate(rows[1:])}

    ## Compute representation and variance
    # rep[bg][cat] is a (value, variance) pair
    rep = mb.representation(households) 

    ## Save the values
    with open('extr/representation/categories/msa/%s_values.csv'%city, 'w') as output:
        output.write('BLOCKGROUP FIP\n')
        for bg in rep:
            output.write(str(bg))
            # one tab-separated representation value per category
            for cat in sorted(rep[bg].iterkeys()):
                val, var = rep[bg][cat]
                output.write('\t%s'%val)
            output.write('\n')

    ## Save the variance
    with open('extr/representation/categories/msa/%s_variance.csv'%city, 'w') as output:
        output.write('BLOCKGROUP FIP\n')
        for bg in rep: