Example #1
def clustering(distribution, areal_units, classes=None):
    """ Return the clustering coefficient for the different classes
    
    Assume that the class `c` is overrepresented in `N_u` areal units, and that
    we obtain `N_c` clusters after aggregating the neighbouring units.

    Parameters
    ----------

    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    areal_units: dictionary
        Dictionary of areal unit ids with shapely polygon objects representing
        the unit's geometry as values.

    classes: dictionary of lists
        When the original categories need to be aggregated into different
        classes. 
        > {class: [categories belonging to this class]}
        This can be arbitrarily imposed, or computed with uncover_classes
        function of this package.

    Returns
    -------

    clustering: dictionary
        Dictionary of class names with clustering values.
    """

    # Regroup into classes if specified. Otherwise return categories indicated
    # in the data
    if not classes:
        classes = return_categories(distribution)

    ## Get the number of neighbourhoods
    neigh = mb.neighbourhoods(distribution, areal_units, classes)
    num_neigh = {cl: len(neigh[cl]) for cl in classes}
    num_units = {
        cl: len([a for ne in neigh[cl] for a in ne])
        for cl in classes
    }

    ## Compute clustering values
    clustering = {}
    for cl in classes:
        if num_units[cl] == 0:
            clustering[cl] = float('nan')
        elif num_units[cl] == 1:
            clustering[cl] = 1
        else:
            clustering[cl] = ((num_neigh[cl] - num_units[cl]) /
                              (1 - num_units[cl]))
    return clustering
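A minimal, hypothetical call sketch for the function above. It assumes `clustering` and the helpers it relies on (`mb.neighbourhoods`, `return_categories`) are importable from the surrounding module; the toy `distribution` and `areal_units` values are invented purely to illustrate the documented input formats.

from shapely.geometry import box   # shapely polygons, as required for areal_units

# Toy data in the documented format {areal_id: {class_id: number}}
distribution = {
    "u1": {"A": 90, "B": 10},
    "u2": {"A": 80, "B": 20},
    "u3": {"A": 5,  "B": 95},
}

# Three unit squares laid side by side, so u1-u2 and u2-u3 share an edge
areal_units = {
    "u1": box(0, 0, 1, 1),
    "u2": box(1, 0, 2, 1),
    "u3": box(2, 0, 3, 1),
}

values = clustering(distribution, areal_units)
# -> one value per class, e.g. {"A": ..., "B": ...}; per the code above it is
#    NaN for a class that is never overrepresented, and 1 when the class is
#    overrepresented in a single areal unit.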
Example #2
def clustering(distribution, areal_units, classes=None):
    """ Return the clustering coefficient for the different classes
    
    Assume that the class `c` is overrepresented in `N_u` areal units, and that
    we obtain `N_c` clusters after aggregating the neighbouring units.

    Parameters
    ----------

    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    areal_units: dictionary
        Dictionary of areal unit ids with shapely polygon objects representing
        the unit's geometry as values.

    classes: dictionary of lists
        When the original categories need to be aggregated into different
        classes. 
        > {class: [categories belonging to this class]}
        This can be arbitrarily imposed, or computed with uncover_classes
        function of this package.

    Returns
    -------

    clustering: dictionary
        Dictionary of class names with clustering values.
    """

    # Regroup into classes if specified. Otherwise return categories indicated
    # in the data
    if not classes:
        classes = return_categories(distribution)
    
    ## Get the number of neighbourhoods
    neigh = mb.neighbourhoods(distribution, areal_units, classes)
    num_neigh = {cl: len(neigh[cl]) for cl in classes}
    num_units = {cl: len([a for ne in neigh[cl] for a in ne])
                    for cl in classes}

    ## Compute clustering values
    clustering = {}
    for cl in classes:
        if num_units[cl] == 0:
            clustering[cl] = float('nan')
        elif num_units[cl] == 1:
            clustering[cl] = 1
        else:
            clustering[cl] = ((num_neigh[cl] - num_units[cl]) /
                              (1 - num_units[cl]))
    return clustering
Example #3
def dissimilarity(distribution, classes=None):
    """ Compute the inter-class dissimilarity index

    The dissimilarity index between two categories `\alpha` and `\beta` is
    defined as 

    .. math::
        D_{\alpha \beta} = \frac{1}{2} \sum_{t=1}^{T} \left|
        \frac{n_\alpha(t)}{N_\alpha} - \frac{n_\beta(t)}{N_\beta} \right|

    Its value ranges from 0 to 1.

    Parameters
    ----------

    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    classes: dictionary of lists
        When the original categories need to be aggregated into different
        classes. {class: [categories belonging to this class]}
        This can be arbitrarily imposed, or computed with uncover_classes
        function of this package.

    Returns
    -------

    dissimilarity: nested dictionary
        Classes matrix with dissimilarity as values
        > {alpha: {beta: D_{\alpha \beta}}}
    """
    ## Regroup into classes if specified
    if classes is not None:
        distribution = regroup_per_class(distribution, classes)
    else:
        classes = return_categories(distribution)

    ## Compute total numbers of individuals per class and areal unit
    N_unit, N_class, N_tot = compute_totals(distribution, classes)

    ## Compute the dissimilarity matrix
    # Only half of the values are computed (the matrix is symmetric)
    dissimilarity = collections.defaultdict(dict)
    for alpha, beta in itertools.combinations_with_replacement(classes, 2):
        dissimilarity[alpha][beta] = _pair_dissimilarity(
            distribution, N_class, alpha, beta)

    # Symmetrize the output
    for c0 in dissimilarity:
        for c1 in dissimilarity[c0]:
            if c0 not in dissimilarity[c1]:
                dissimilarity[c1][c0] = dissimilarity[c0][c1]

    return dissimilarity
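A minimal usage sketch, assuming `dissimilarity` and its helpers are importable from the surrounding module; the two-unit toy distribution is invented, and the expected value is worked out by hand from the formula in the docstring.

# {areal_id: {class_id: number}}: two units, two categories
distribution = {
    "u1": {"A": 80, "B": 20},
    "u2": {"A": 20, "B": 80},
}

D = dissimilarity(distribution)
# N_A = N_B = 100, so by the formula above
# D_AB = 0.5 * (|80/100 - 20/100| + |20/100 - 80/100|) = 0.6,
# and the matrix is symmetric, so D["A"]["B"] == D["B"]["A"].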
Example #4
def neighbourhoods(distribution, areal_units, classes=None):
    """ Return the neighbourhoods where different classes gather

    Parameters
    ----------

    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    areal_units: dictionary
        Dictionary of areal unit ids with shapely polygon objects representing
        the unit's geometry as values.

    classes: dictionary of lists
        When the original categories need to be aggregated into different
        classes. 
        > {class: [categories belonging to this class]}
        This can be arbitrarily imposed, or computed with uncover_classes
        function of this package.

    Returns
    -------

    neighbourhoods: dictionary
        Dictionary of class names with the list of neighbourhoods (each
        represented by a list of areal units)
        > {'class': [ [areal units in cluster i], ...]}
    """

    # Regroup into classes if specified. Otherwise return categories indicated
    # in the data
    if not classes:
        classes = return_categories(distribution)

    ## Find the areal units where classes are overrepresented
    or_units = overrepresented_units(distribution, classes)

    ## Compute the adjacency list
    adjacency = _adjacency(areal_units)

    ## Extract neighbourhoods as connected components
    G = nx.from_dict_of_lists(adjacency)  # Graph from adjacency list
    neighbourhoods = {
        cl: [
            list(component)
            for component in nx.connected_components(G.subgraph(or_units[cl]))
        ]
        for cl in classes
    }

    return neighbourhoods
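A minimal, hypothetical call sketch, assuming `neighbourhoods` and the helpers it uses (`overrepresented_units`, `_adjacency`, networkx imported as `nx`) are available from the surrounding module; the toy geometries below are invented.

from shapely.geometry import box

distribution = {
    "u1": {"A": 95, "B": 5},
    "u2": {"A": 90, "B": 10},
    "u3": {"A": 10, "B": 90},
}
areal_units = {
    "u1": box(0, 0, 1, 1),   # u1 and u2 share an edge,
    "u2": box(1, 0, 2, 1),   # u3 touches u2
    "u3": box(2, 0, 3, 1),
}

clusters = neighbourhoods(distribution, areal_units)
# -> {class: [[units in cluster 1], [units in cluster 2], ...]}: adjacent
#    overrepresented units end up in the same cluster.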
Example #5
def uncover_classes(distribution, exposure, ci_factor=10):
    """ Returns the categories sorted in classes

    The classes are uncovered from the spatial repartition of individuals from
    the different categories, using their relative exposure.

    We only aggregate a pair of categories into the same class if they attract
    each other, that is if the exposure

    .. math::
        E_{\beta, \delta} > 1 + 10 \sigma_{\beta, \delta}

    (99% CI according to the Chebyshev inequality). The aggregation procedure
    may therefore stop before all categories are aggregated into a single
    class, and outputs the partition of the original categories into classes.

    Parameters
    ----------

    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    exposure: nested dictionaries
        Matrix of exposures between categories.
        > {class_id0: {class_id1: (exposure_01, variance null model)}} 

    ci_factor: float
        Number of standard deviations over which we consider to have a 99%
        confidence interval on the exposure value. The default value, 10, is the
        upper bound given by Chebyshev's inequality.

    Returns
    -------

    classes: nested lists
        List of classes, each given as the list of original categories it
        contains.
        > [[categories]]
    """

    ## Get the categories from the distribution
    categories = return_categories(distribution).keys()

    ## Extract the linkage matrix
    linkage = cluster_categories(distribution, exposure) 

    ## Get the classes
    classes = _aggregate_linkage(linkage, categories, ci_factor)

    return classes
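A minimal, hypothetical sketch of the intended pipeline, assuming `exposure` (see the later examples) and `uncover_classes` come from the same module. Note that `cluster_categories` sorts categories with `int(x)`, so the invented category ids below are kept integer-castable.

distribution = {
    "u1": {"0": 50, "1": 45, "2": 5},
    "u2": {"0": 40, "1": 55, "2": 5},
    "u3": {"0": 5,  "1": 5,  "2": 90},
}

exp = exposure(distribution)            # {cat0: {cat1: (value, variance)}}
classes = uncover_classes(distribution, exp)
# -> nested lists such as [["0", "1"], ["2"]]: two categories are merged only
#    if their exposure exceeds 1 + ci_factor * sigma.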
Example #6
def uncover_classes(distribution, exposure, ci_factor=10):
    """ Returns the categories sorted in classes

    The classes are uncovered from the spatial repartition of individuals from
    the different categories, using their relative exposure.

    We only aggregate a pair of categories into the same class if they attract
    each other, that is if the exposure

    .. math::
        E_{\beta, \delta} > 1 + 10 \sigma_{\beta, \delta}

    (99% CI according to the Chebyshev inequality). The aggregation procedure
    may therefore stop before all categories are aggregated into a single
    class, and outputs the partition of the original categories into classes.

    Parameters
    ----------

    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    exposure: nested dictionaries
        Matrix of exposures between categories.
        > {class_id0: {class_id1: (exposure_01, variance null model)}} 

    ci_factor: float
        Number of standard deviations over which we consider to have a 99%
        confidence interval on the exposure value. The default value, 10, is the
        upper bound given by Chebyshev's inequality.

    Returns
    -------

    classes: nested lists
        List of classes, each given as the list of original categories it
        contains.
        > [[categories]]
    """

    ## Get the categories from the distribution
    categories = return_categories(distribution).keys()

    ## Extract the linkage matrix
    linkage = cluster_categories(distribution, exposure)

    ## Get the classes
    classes = _aggregate_linkage(linkage, categories, ci_factor)

    return classes
Example #7
def overrepresented_units(distribution, classes=None):
    """ Find the areal units in which each class is over-represented
   
    We say that a class `\alpha` is overrepresented in a tract `t` if the
    representation `r_\alpha(t)` is such that

    .. math::
        r_\alpha(t) > 1 + 2.57 \sigma_\alpha(t)

    Parameters
    ----------

    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    classes: dictionary of lists
        When the original categories need to be aggregated into different
        classes. 
        > {class: [categories belonging to this class]}
        This can be arbitrarily imposed, or computed with uncover_classes
        function of this package.

    Returns
    -------

    units: dictionary of lists
        Dictionary of classes, with the list of areal units where this class is
        overrepresented with 99% confidence.
        > {class:[list of areal units]}
    """
    # Regroup into classes if specified. Otherwise return categories indicated
    # in the data
    if not classes:
        classes = return_categories(distribution)

    ## Compute the representation of the different classes in all areal units
    rep = mb.representation(distribution, classes)

    ## Find the tracts where classes are overrepresented
    areal_units = {
        cl: [
            au for au in rep
            if rep[au][cl][0] > 1 + 2.57 * math.sqrt(rep[au][cl][1])
        ]
        for cl in classes
    }

    return areal_units
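A minimal call sketch, assuming the function above and `mb.representation` are importable; the toy distribution is invented and the commented output only illustrates the shape of the result.

distribution = {
    "u1": {"A": 95, "B": 5},
    "u2": {"A": 50, "B": 50},
    "u3": {"A": 5,  "B": 95},
}

units = overrepresented_units(distribution)
# -> {"A": [...], "B": [...]}: for each class, the areal units whose
#    representation exceeds 1 + 2.57 * sigma (99% confidence).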
Example #8
def representation(distribution, classes=None):
    """ Compute the representation of the different classes in all areal units

    Parameters
    ----------

    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    classes: dictionary of lists
        When the original categories need to be aggregated into different
        classes. 
        > {class: [categories belonging to this class]}
        This can be arbitrarily imposed, or computed with uncover_classes
        function of this package.

    Returns
    -------

    representation: nested dictionaries
        Representation of each category in each areal unit.
        > {areal_id: {class_id: (representation_values, variance of the null
                                model)}}
    """
    # Regroup into classes if specified. Otherwise return categories indicated
    # in the data
    if classes:
        distribution = regroup_per_class(distribution, classes)
    else:
        classes = return_categories(distribution)


    # Compute the total numbers per class and per individual
    N_unit, N_class, N_tot = compute_totals(distribution, classes) 


    # Compute the representation and standard deviation for all areal units
    representation = {au:{cl:(single_representation(dist_au[cl],
                                                    N_unit[au],
                                                    N_class[cl],
                                                    N_tot), 
                              single_variance(N_unit[au],
                                               N_class[cl],
                                               N_tot) 
                             ) for cl in classes}
                      for au, dist_au in distribution.items()}
    
    return representation
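A minimal call sketch for the function above, assuming it and its helpers (`regroup_per_class`, `return_categories`, `compute_totals`, `single_representation`, `single_variance`) are importable; the toy data and the optional `classes` regrouping are invented for illustration.

distribution = {
    "u1": {"cat1": 30, "cat2": 10, "cat3": 60},
    "u2": {"cat1": 10, "cat2": 70, "cat3": 20},
}

# Optional regrouping of the raw categories into two broader classes
classes = {"low": ["cat1", "cat2"], "high": ["cat3"]}

rep = representation(distribution, classes)
# -> {areal_id: {class_id: (representation value, variance of the null model)}}
value, variance = rep["u1"]["high"]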
Example #9
def neighbourhoods(distribution, areal_units, classes=None):
    """ Return the neighbourhoods where different classes gather

    Parameters
    ----------

    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    areal_units: dictionary
        Dictionary of areal unit ids with shapely polygon objects representing
        the unit's geometry as values.

    classes: dictionary of lists
        When the original categories need to be aggregated into different
        classes. 
        > {class: [categories belonging to this class]}
        This can be arbitrarily imposed, or computed with uncover_classes
        function of this package.

    Returns
    -------

    neighbourhoods: dictionary
        Dictionary of class names with the list of neighbourhoods (each
        represented by a list of areal units)
        > {'class': [ [areal units in cluster i], ...]}
    """

    # Regroup into classes if specified. Otherwise return categories indicated
    # in the data
    if not classes:
        classes = return_categories(distribution)

    ## Find the areal units where classes are overrepresented
    or_units = overrepresented_units(distribution, classes)
    
    ## Compute the adjacency list
    adjacency = _adjacency(areal_units)

    ## Extract neighbourhoods as connected components
    G = nx.from_dict_of_lists(adjacency)  # Graph from adjacency list
    neighbourhoods = {cl: [list(component) for component in
                            nx.connected_components(G.subgraph(or_units[cl]))]
                        for cl in classes}

    return neighbourhoods
Example #10
def representation(distribution, classes=None):
    """ Compute the representation of the different classes in all areal units

    Parameters
    ----------

    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    classes: dictionary of lists
        When the original categories need to be aggregated into different
        classes. 
        > {class: [categories belonging to this class]}
        This can be arbitrarily imposed, or computed with uncover_classes
        function of this package.

    Returns
    -------

    representation: nested dictionaries
        Representation of each category in each areal unit.
        > {areal_id: {class_id: (representation_values, variance of the null
                                model)}}
    """
    # Regroup into classes if specified. Otherwise return categories indicated
    # in the data
    if classes:
        distribution = regroup_per_class(distribution, classes)
    else:
        classes = return_categories(distribution)

    # Compute the total numbers per class and per individual
    N_unit, N_class, N_tot = compute_totals(distribution, classes)

    # Compute the representation and standard deviation for all areal units
    representation = {
        au: {
            cl: (single_representation(dist_au[cl], N_unit[au], N_class[cl],
                                       N_tot),
                 single_variance(N_unit[au], N_class[cl], N_tot))
            for cl in classes
        }
        for au, dist_au in distribution.items()
    }

    return representation
Example #11
def overrepresented_units(distribution, classes=None):
    """ Find the areal units in which each class is over-represented
   
    We say that a class `\alpha` is overrepresented in a tract `t` if the
    representation `r_\alpha(t)` is such that

    .. math::
        r_\alpha(t) > 1 + 2.57 \sigma_\alpha(t)

    Parameters
    ----------

    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    classes: dictionary of lists
        When the original categories need to be aggregated into different
        classes. 
        > {class: [categories belonging to this class]}
        This can be arbitrarily imposed, or computed with uncover_classes
        function of this package.

    Returns
    -------

    units: dictionary of lists
        Dictionary of classes, with the list of areal units where this class is
        overrepresented with 99% confidence.
        > {class:[list of areal units]}
    """
    # Regroup into classes if specified. Otherwise return categories indicated
    # in the data
    if not classes:
        classes = return_categories(distribution)


    ## Compute the representation of the different classes in all areal units
    rep = mb.representation(distribution, classes)

    ## Find the tracts where classes are overrepresented
    areal_units = {cl:[au for au in rep
                          if rep[au][cl][0] > 1 + 2.57*math.sqrt(rep[au][cl][1])] 
                    for cl in classes}

    return areal_units
Example #12
def cluster_categories(distribution, exposure):
    """ Perform hierarhical clustering on the intra-tract exposure values 
    
    At each step of the aggregation, we look for the pair `(\beta, \delta)` of
    categories that has the highest exposure (renormalised by the maximum
    possible value). We aggregate them in a new category `\gamma` whose exposure
    with the other categories `\alpha` is given by

    .. math::
        E_{\alpha, \gamma} = \frac{1}{N_\beta + N_\delta} \left( N_\beta
        E_{\alpha, \beta} + N_\delta E_{\alpha, \delta} \right)


    Parameters
    ----------

    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    exposure: nested dictionaries
        Matrix of exposures between categories.
        > {class_id0: {class_id1: (exposure_01, variance null model)}} 


    Returns
    -------

    linkage: list
        List L that encodes the hierarchical tree. The first N entries are the
        original category names; at the ith aggregation step the tuple
        L[N+i] = (a, b, exposure, variance) records that clusters a and b are
        merged to form the (N+i)th cluster, together with their exposure and
        its variance under the null model.
    """
    #
    # Data preparation
    #

    ## Linkage matrix
    linkage = [cl for cl in sorted(exposure, key=lambda x: int(x))]
    N = len(linkage)

    ## Get total
    categories = return_categories(distribution)
    N_unit, N_class, N_tot = compute_totals(distribution, categories) 

    

    ## Use classes' position in the linkage matrix rather than names
    # Class totals
    for cl in categories:
        N_class[linkage.index(cl)] = N_class.pop(cl)

    #exposure
    E = {linkage.index(cl0):{linkage.index(cl1):exposure[cl0][cl1][0]
                                for cl1 in exposure[cl0]}
            for cl0 in exposure}
    E_var = {linkage.index(cl0):{linkage.index(cl1):exposure[cl0][cl1][1]
                                for cl1 in exposure[cl0]}
            for cl0 in exposure}



    #
    # Clustering
    #
    for i in range(N-1): 
        a, b = _find_friends(E, N_class)
        linkage.append((a, b, E[a][b], E_var[a][b])) 
        E, E_var, N_class = _update_matrix(E, E_var, N_class, a, b) 


    return linkage 
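A minimal, hypothetical sketch, assuming `cluster_categories` and `exposure` come from the same module. Because the function sorts category names with `int(x)`, the invented category ids are integer-castable strings.

distribution = {
    "u1": {"0": 60, "1": 35, "2": 5},
    "u2": {"0": 55, "1": 40, "2": 5},
    "u3": {"0": 5,  "1": 10, "2": 85},
}

exp = exposure(distribution)             # {cat0: {cat1: (value, variance)}}
linkage = cluster_categories(distribution, exp)
# The first len(exp) entries are the category names; each subsequent entry is
# a tuple (a, b, exposure, variance) describing one aggregation step.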
Example #13
def dissimilarity(distribution, classes=None):
    """ Compute the inter-class dissimilarity index

    The dissimilarity index between two categories `\alpha` and `\beta` is
    defined as 

    .. math::
        D_{\alpha \beta} = \frac{1}{2} \sum_{t=1}^{T} \left|
        \frac{n_\alpha(t)}{N_\alpha} - \frac{n_\beta(t)}{N_\beta} \right|

    Its value ranges from 0 to 1.

    Parameters
    ----------

    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    classes: dictionary of lists
        When the original categories need to be aggregated into different
        classes. {class: [categories belonging to this class]}
        This can be arbitrarily imposed, or computed with uncover_classes
        function of this package.

    Returns
    -------

    dissimilarity: nested dictionary
        Classes matrix with dissimilarity as values
        > {alpha: {beta: D_{\alpha \beta}}}
    """
    ## Regroup into classes if specified
    if classes is not None:
        distribution = regroup_per_class(distribution, classes)
    else:
        classes = return_categories(distribution)


    ## Compute total numbers of individuals per class and areal unit
    N_unit, N_class, N_tot = compute_totals(distribution, classes) 


    ## Compute the dissimilarity matrix
    # Only half of the values are computed (the matrix is symmetric)
    dissimilarity = collections.defaultdict(dict)
    for alpha, beta in itertools.combinations_with_replacement(classes, 2):
        dissimilarity[alpha][beta] = _pair_dissimilarity(distribution, 
                                                        N_class, 
                                                        alpha, 
                                                        beta)

    # Symmetrize the output
    for c0 in dissimilarity:
        for c1 in dissimilarity[c0]:
            if c0 not in dissimilarity[c1]:
                dissimilarity[c1][c0] = dissimilarity[c0][c1]


    return dissimilarity
Example #14
def exposure(distribution, classes=None):
    """ Compute the exposure between classes
    
    The exposure between two categories `\alpha` and `\beta` is defined as

    .. math::
        E_{\alpha \beta} = \frac{1}{N} \sum_{t=1}^{T} n(t) r_\alpha(t)
        r_\beta(t)

    where `r_\alpha(t)` is the representation of the class `\alpha` in the areal
    unit `t`, `n(t)` the total population of `t`, and `N` the total population
    in the considered system.

    The exposure of a class to itself `E_{\alpha \alpha}` measures the
    **isolation** of this class.

    The variance is computed on the null model, which corresponds to the
    unsegregated configuration, that is, when the spatial repartition of people
    of different income classes is no different from what would be obtained if
    they were scattered at random across the city.

    Parameters
    ----------

    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    classes: dictionary of lists
        When the original categories need to be aggregated into different
        classes. {class: [categories belonging to this class]}
        This can be arbitrarily imposed, or computed with uncover_classes
        function of this package.

    Returns
    -------

    exposure: nested dictionaries
        Matrix of exposures between categories.
        > {class_id0: {class_id1: (exposure_01, variance null model)}} 
    """
    ## Regroup into classes if specified.
    if classes:
        distribution = regroup_per_class(distribution, classes)
    else:
        classes = return_categories(distribution)


    ## Compute the total numbers per class and per areal unit 
    N_unit, N_class, N_tot = compute_totals(distribution, classes) 


    ## Compute representation for all areal unit
    representation = mb.representation(distribution)


    ## Compute the exposure matrix
    # Only half of the values are computed (the matrix is symmetric)
    exposure = collections.defaultdict(dict)
    for alpha, beta in itertools.combinations_with_replacement(classes, 2):
        exposure[alpha][beta] = (pair_exposure(representation, N_unit, N_tot, alpha, beta),
                                 pair_variance(representation, N_unit, N_class, N_tot, alpha, beta))

    # Symmetrize the output
    for c0 in exposure:
        for c1 in exposure[c0]:
            if c0 not in exposure[c1]:
                exposure[c1][c0] = exposure[c0][c1]

    return exposure 
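A minimal call sketch, assuming `exposure`, `mb.representation` and the pairwise helpers are importable from the surrounding module; the toy distribution is invented. The diagonal terms measure isolation, as noted in the docstring.

distribution = {
    "u1": {"A": 80, "B": 20},
    "u2": {"A": 20, "B": 80},
}

E = exposure(distribution)
value, variance = E["A"]["B"]       # inter-class exposure and its null variance
isolation_A = E["A"]["A"][0]        # exposure of A to itself = isolation of A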
Example #15
def exposure(distribution, classes=None):
    """ Compute the exposure between classes
    
    The exposure between two categories `\alpha` and `\beta` is defined as

    .. math::
        E_{\alpha \beta} = \frac{1}{N} \sum_{t=1}^{T} n(t) r_\alpha(t)
        r_\beta(t)

    where `r_\alpha(t)` is the representation of the class `\alpha` in the areal
    unit `t`, `n(t)` the total population of `t`, and `N` the total population
    in the considered system.

    The exposure of a class to itself `E_{\alpha \alpha}` measures the
    **isolation** of this class.

    The variance is computed on the null model, which corresponds to the
    unsegregated configuration, that is, when the spatial repartition of people
    of different income classes is no different from what would be obtained if
    they were scattered at random across the city.

    Parameters
    ----------

    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    classes: dictionary of lists
        When the original categories need to be aggregated into different
        classes. {class: [categories belonging to this class]}
        This can be arbitrarily imposed, or computed with uncover_classes
        function of this package.

    Returns
    -------

    exposure: nested dictionaries
        Matrix of exposures between categories.
        > {class_id0: {class_id1: (exposure_01, variance null model)}} 
    """
    ## Regroup into classes if specified.
    if classes:
        distribution = regroup_per_class(distribution, classes)
    else:
        classes = return_categories(distribution)

    ## Compute the total numbers per class and per areal unit
    N_unit, N_class, N_tot = compute_totals(distribution, classes)

    ## Compute representation for all areal unit
    representation = mb.representation(distribution)

    ## Compute the exposure matrix
    # Only half of the values are computed (the matrix is symmetric)
    exposure = collections.defaultdict(dict)
    for alpha, beta in itertools.combinations_with_replacement(classes, 2):
        exposure[alpha][beta] = (pair_exposure(representation, N_unit, N_tot,
                                               alpha, beta),
                                 pair_variance(representation, N_unit, N_class,
                                               N_tot, alpha, beta))

    # Symmetrize the output
    for c0 in exposure:
        for c1 in exposure[c0]:
            if c0 not in exposure[c1]:
                exposure[c1][c0] = exposure[c0][c1]

    return exposure
Example #16
def cluster_categories(distribution, exposure):
    """ Perform hierarhical clustering on the intra-tract exposure values 
    
    At each step of the aggregation, we look for the pair `(\beta, \delta)` of
    categories that has the highest exposure (renormalised by the maximum
    possible value). We aggregate them in a new category `\gamma` whose exposure
    with the other categories `\alpha` is given by

    .. math::
        E_{\alpha, \gamma} = \frac{1}{N_\beta + N_\delta} \left( N_\beta
        E_{\alpha, \beta} + N_\delta E_{\alpha, \delta} \right)


    Parameters
    ----------

    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    exposure: nested dictionaries
        Matrix of exposures between categories.
        > {class_id0: {class_id1: (exposure_01, variance null model)}} 


    Returns
    -------

    linkage: list
        List L that encodes the hierarchical tree. The first N entries are the
        original category names; at the ith aggregation step the tuple
        L[N+i] = (a, b, exposure, variance) records that clusters a and b are
        merged to form the (N+i)th cluster, together with their exposure and
        its variance under the null model.
    """
    #
    # Data preparation
    #

    ## Linkage matrix
    linkage = [cl for cl in sorted(exposure, key=lambda x: int(x))]
    N = len(linkage)

    ## Get total
    categories = return_categories(distribution)
    N_unit, N_class, N_tot = compute_totals(distribution, categories)

    ## Use classes' position in the linkage matrix rather than names
    # Class totals
    for cl in categories:
        N_class[linkage.index(cl)] = N_class.pop(cl)

    #exposure
    E = {
        linkage.index(cl0):
        {linkage.index(cl1): exposure[cl0][cl1][0]
         for cl1 in exposure[cl0]}
        for cl0 in exposure
    }
    E_var = {
        linkage.index(cl0):
        {linkage.index(cl1): exposure[cl0][cl1][1]
         for cl1 in exposure[cl0]}
        for cl0 in exposure
    }

    #
    # Clustering
    #
    for i in range(N - 1):
        a, b = _find_friends(E, N_class)
        linkage.append((a, b, E[a][b], E_var[a][b]))
        E, E_var, N_class = _update_matrix(E, E_var, N_class, a, b)

    return linkage