Пример #1
0
def _multi_gini_seg(data, groups):
    """
    Calculation of Multigroup Gini Segregation index

    Parameters
    ----------

    data   : a pandas DataFrame

    groups : list of strings.
             The variables names in data of the groups of interest of the analysis.

    Returns
    -------

    statistic : float
                Multigroup Gini Segregation Index

    core_data : a pandas DataFrame
                A pandas DataFrame that contains the columns used to perform the estimate
                (``data[groups]``).

    groups    : list of strings.
                The ``groups`` argument, returned unchanged.

    Notes
    -----
    Based on Reardon, Sean F., and Glenn Firebaugh. "Measures of multigroup segregation." Sociological methodology 32.1 (2002): 33-67.

    Every pair of rows is compared, so an n-by-n array of pairwise weights
    (n being the number of rows) is held in memory during the computation.

    Reference: :cite:`reardon2002measures`.

    """

    core_data = data[groups]
    # NOTE(review): the value returned by _nan_handle is not used; every
    # array below is built from core_data. Confirm whether the returned
    # frame was meant to feed the computation.
    _nan_handle(core_data)

    counts = np.array(core_data)

    n_groups = counts.shape[1]

    total = counts.sum()

    unit_totals = counts.sum(axis=1)
    unit_shares = counts / unit_totals[:, None]
    group_shares = counts.sum(axis=0) / counts.sum()
    interaction = (group_shares * (1 - group_shares)).sum()

    # The pairwise weights t_i * t_j are the same for every group, so they
    # are built once instead of once per loop iteration.
    pair_weights = np.outer(unit_totals, unit_totals)

    elements_sum = np.empty(n_groups)
    for k in range(n_groups):
        # Presumably sklearn's manhattan_distances: on a single column this
        # yields |p_ik - p_jk| for every pair of rows (i, j) -- TODO confirm
        # against the file's imports.
        share_gaps = manhattan_distances(unit_shares[:, k].reshape(-1, 1))
        elements_sum[k] = np.multiply(pair_weights, share_gaps).sum()

    multi_Gini_Seg = elements_sum.sum() / (2 * (total**2) * interaction)

    return multi_Gini_Seg, core_data, groups
Пример #2
0
def _multi_diversity(data, groups, normalized=False):
    """
    Calculation of Multigroup Diversity index

    Parameters
    ----------

    data       : a pandas DataFrame

    groups     : list of strings.
                 The variables names in data of the groups of interest of the analysis.

    normalized : bool. Default is False.
                 Whether the resulting index will be divided by its maximum (natural log of the number of groups).
                 The number of groups is the number of columns selected by ``groups``,
                 including any column whose total is zero.

    Returns
    -------

    statistic  : float
                 Multigroup Diversity Index

    core_data  : a pandas DataFrame
                 A pandas DataFrame that contains the columns used to perform the estimate
                 (``data[groups]``).

    groups     : list of strings.
                 The ``groups`` argument, returned unchanged.

    Notes
    -----
    Based on Reardon, Sean F., and Glenn Firebaugh. "Measures of multigroup segregation." Sociological methodology 32.1 (2002): 33-67 and Theil, Henry. "Statistical decomposition analysis; with applications in the social and administrative sciences". No. 04; HA33, T4.. 1972.

    This is also known as Theil's Entropy Index (Equation 2 of page 37 of Reardon, Sean F., and Glenn Firebaugh. "Measures of multigroup segregation." Sociological methodology 32.1 (2002): 33-67)

    High diversity means less segregation.

    A group whose overall share is exactly zero contributes nothing to the
    entropy (the usual 0 * log(0) = 0 convention) instead of turning the
    whole statistic into NaN.

    Reference: :cite:`reardon2002measures`.

    """

    core_data = data[groups]
    # NOTE(review): the value returned by _nan_handle is not used; the
    # array below is built from core_data. Confirm whether the returned
    # frame was meant to feed the computation.
    _nan_handle(core_data)

    df = np.array(core_data)

    Pk = df.sum(axis=0) / df.sum()

    # Drop exact-zero shares only: 0 * log(0) evaluates to NaN in floating
    # point. NaN shares (e.g. an all-zero table) compare unequal to 0, so
    # they are kept and still propagate to the result.
    present = Pk[Pk != 0]
    E = -(present * np.log(present)).sum()

    if normalized:
        K = df.shape[1]
        E = E / np.log(K)

    return E, core_data, groups
Пример #3
0
def _multi_dissim(data, groups):
    """
    Compute the Multigroup Dissimilarity index.

    Parameters
    ----------

    data   : a pandas DataFrame

    groups : list of strings.
             Column names in data holding the groups of interest of the analysis.

    Returns
    -------

    statistic : float
                Multigroup Dissimilarity Index

    core_data : a pandas DataFrame
                The columns of data used to perform the estimate (``data[groups]``).

    groups    : list of strings.
                The ``groups`` argument, returned unchanged.

    Notes
    -----
    Based on Sakoda, James M. "A generalized index of dissimilarity." Demography 18.2 (1981): 245-250.

    Reference: :cite:`sakoda1981generalized`.

    """

    core_data = data[groups]
    # NOTE(review): the value returned by _nan_handle is not used; every
    # array below is built from core_data -- confirm this is intended.
    _nan_handle(core_data)

    counts = np.array(core_data)
    n_groups = counts.shape[1]

    grand_total = counts.sum()
    unit_totals = counts.sum(axis=1)

    # Share of each group inside every unit, and over the whole table.
    local_shares = counts / unit_totals[:, None]
    overall_shares = counts.sum(axis=0) / counts.sum()

    interaction = (overall_shares * (1 - overall_shares)).sum()

    # Each unit total repeated across the group columns so that it lines up
    # element-wise with the share deviations.
    weights = np.repeat(unit_totals[:, None], n_groups, axis=1)
    weighted_gaps = np.multiply(abs(local_shares - overall_shares), weights)

    multi_D = 1 / (2 * grand_total * interaction) * weighted_gaps.sum()

    return multi_D, core_data, groups
Пример #4
0
def _multi_information_theory(data, groups):
    """
    Calculation of Multigroup Information Theory index

    Parameters
    ----------

    data   : a pandas DataFrame

    groups : list of strings.
             The variables names in data of the groups of interest of the analysis.

    Returns
    -------

    statistic : float
                Multigroup Information Theory Index

    core_data : a pandas DataFrame
                A pandas DataFrame that contains the columns used to perform the estimate
                (``data[groups]``).

    groups    : list of strings.
                The ``groups`` argument, returned unchanged.

    Notes
    -----
    Based on Reardon, Sean F., and Glenn Firebaugh. "Measures of multigroup segregation." Sociological methodology 32.1 (2002): 33-67.

    A group whose overall share is exactly zero contributes nothing to the
    overall entropy (0 * log(1/0) is taken as 0), matching the way NaN terms
    are already skipped in the numerator.

    Reference: :cite:`reardon2002measures`.

    """

    core_data = data[groups]
    # NOTE(review): the value returned by _nan_handle is not used; every
    # array below is built from core_data. Confirm whether the returned
    # frame was meant to feed the computation.
    _nan_handle(core_data)

    df = np.array(core_data)

    T = df.sum()

    ti = df.sum(axis=1)
    pik = df / ti[:, None]
    Pk = df.sum(axis=0) / df.sum()

    # The natural logarithm is used, but this could be used with any base following Footnote 3 of pg. 37
    # of Reardon, Sean F., and Glenn Firebaugh. "Measures of multigroup segregation." Sociological methodology 32.1 (2002): 33-67.
    # Exact-zero shares are dropped first: 0 * log(1 / 0) evaluates to NaN
    # in floating point and would make the whole index NaN. NaN shares
    # compare unequal to 0, so they are kept and still propagate.
    present = Pk[Pk != 0]
    E = (present * np.log(1 / present)).sum()

    # nansum skips the NaN terms produced where pik is 0 (0 * log(0)) or
    # where a row or column total is zero (0 / 0).
    MIT = np.nansum(ti[:, None] * pik * np.log(pik / Pk)) / (T * E)

    return MIT, core_data, groups
Пример #5
0
def _multi_relative_diversity(data, groups):
    """
    Compute the Multigroup Relative Diversity index.

    Parameters
    ----------

    data   : a pandas DataFrame

    groups : list of strings.
             Column names in data holding the groups of interest of the analysis.

    Returns
    -------

    statistic : float
                Multigroup Relative Diversity Index

    core_data : a pandas DataFrame
                The columns of data used to perform the estimate (``data[groups]``).

    groups    : list of strings.
                The ``groups`` argument, returned unchanged.

    Notes
    -----
    Based on Reardon, Sean F. "Measures of racial diversity and segregation in multigroup and hierarchically structured populations." annual meeting of the Eastern Sociological Society, Philadelphia, PA. 1998.

    High diversity means less segregation.

    Reference: :cite:`reardon1998measures`.

    """

    core_data = data[groups]
    # NOTE(review): the value returned by _nan_handle is not used; every
    # array below is built from core_data -- confirm this is intended.
    _nan_handle(core_data)

    counts = np.array(core_data)

    grand_total = counts.sum()
    unit_totals = counts.sum(axis=1)

    # Share of each group inside every unit, and over the whole table.
    local_shares = counts / unit_totals[:, None]
    overall_shares = counts.sum(axis=0) / counts.sum()
    interaction = (overall_shares * (1 - overall_shares)).sum()

    # Squared share deviations, weighted by the size of each unit.
    squared_gaps = (local_shares - overall_shares)**2
    weighted = unit_totals[:, None] * squared_gaps

    MRD = weighted.sum() / (grand_total * interaction)

    return MRD, core_data, groups
Пример #6
0
def _simpsons_concentration(data, groups):
    """
    Compute Simpson's Concentration index.

    Parameters
    ----------

    data   : a pandas DataFrame

    groups : list of strings.
             Column names in data holding the groups of interest of the analysis.

    Returns
    -------

    statistic  : float
                 Simpson's Concentration Index

    core_data  : a pandas DataFrame
                 The columns of data used to perform the estimate (``data[groups]``).

    groups     : list of strings.
                 The ``groups`` argument, returned unchanged.

    Notes
    -----
    Based on Simpson, Edward H. "Measurement of diversity." nature 163.4148 (1949): 688.

    Simpson's concentration index (Lambda) can be interpreted as the probability that two individuals chosen at random and independently from the population will be found to belong to the same group.

    Higher values mean higher segregation.

    Simpson's Concentration + Simpson's Interaction = 1

    Reference: :cite:`simpson1949measurement`.

    """

    core_data = data[groups]
    # NOTE(review): the value returned by _nan_handle is not used; the
    # array below is built from core_data -- confirm this is intended.
    _nan_handle(core_data)

    counts = np.array(core_data)

    # Overall share of each group: column totals over the grand total.
    group_totals = counts.sum(axis=0)
    shares = group_totals / counts.sum()

    Lambda = np.square(shares).sum()

    return Lambda, core_data, groups
Пример #7
0
def _multi_squared_coefficient_of_variation(data, groups):
    """
    Compute the Multigroup Squared Coefficient of Variation index.

    Parameters
    ----------

    data   : a pandas DataFrame

    groups : list of strings.
             Column names in data holding the groups of interest of the analysis.

    Returns
    -------

    statistic : float
                Multigroup Squared Coefficient of Variation Index

    core_data : a pandas DataFrame
                The columns of data used to perform the estimate (``data[groups]``).

    groups    : list of strings.
                The ``groups`` argument, returned unchanged.

    Notes
    -----
    Based on Reardon, Sean F., and Glenn Firebaugh. "Measures of multigroup segregation." Sociological methodology 32.1 (2002): 33-67.

    Reference: :cite:`reardon2002measures`.

    """

    core_data = data[groups]
    # NOTE(review): the value returned by _nan_handle is not used; every
    # array below is built from core_data -- confirm this is intended.
    _nan_handle(core_data)

    counts = np.array(core_data)
    n_groups = counts.shape[1]

    grand_total = counts.sum()
    unit_totals = counts.sum(axis=1)

    # Share of each group inside every unit, and over the whole table.
    local_shares = counts / unit_totals[:, None]
    overall_shares = counts.sum(axis=0) / counts.sum()

    # Unit-weighted squared deviations, scaled per group by
    # T * (K - 1) * P_k before summing over all units and groups.
    numerator = unit_totals[:, None] * (local_shares - overall_shares)**2
    denominator = grand_total * (n_groups - 1) * overall_shares

    C = (numerator / denominator).sum()

    return C, core_data, groups
Пример #8
0
def _multi_divergence(data, groups):
    """
    Compute the Multigroup Divergence index.

    Parameters
    ----------

    data   : a pandas DataFrame

    groups : list of strings.
             Column names in data holding the groups of interest of the analysis.

    Returns
    -------

    statistic : float
                Multigroup Divergence Index

    core_data : a pandas DataFrame
                The columns of data used to perform the estimate (``data[groups]``).

    groups    : list of strings.
                The ``groups`` argument, returned unchanged.

    Notes
    -----
    Based on Roberto, Elizabeth. "The Divergence Index: A Decomposable Measure of Segregation and Inequality." arXiv preprint arXiv:1508.01167 (2015).

    Reference: :cite:`roberto2015divergence`.

    """

    core_data = data[groups]
    # NOTE(review): the value returned by _nan_handle is not used; every
    # array below is built from core_data -- confirm this is intended.
    _nan_handle(core_data)

    counts = np.array(core_data)

    grand_total = counts.sum()
    unit_totals = counts.sum(axis=1)

    # Share of each group inside every unit, and over the whole table.
    local_shares = counts / unit_totals[:, None]
    overall_shares = counts.sum(axis=0) / counts.sum()

    # Per-unit divergence; nansum skips the NaN terms that arise when a
    # local share is zero (0 * log(0)).
    log_ratio = np.log(local_shares / overall_shares)
    unit_divergence = np.nansum(local_shares * log_ratio, axis=1)

    # Average the per-unit values, weighting each unit by its share of the
    # grand total.
    unit_weights = unit_totals / grand_total
    Divergence_Index = (unit_weights * unit_divergence).sum()

    return Divergence_Index, core_data, groups
Пример #9
0
def _simpsons_interaction(data, groups):
    """
    Compute Simpson's Interaction index.

    Parameters
    ----------

    data   : a pandas DataFrame

    groups : list of strings.
             Column names in data holding the groups of interest of the analysis.

    Returns
    -------

    statistic  : float
                 Simpson's Interaction Index

    core_data  : a pandas DataFrame
                 The columns of data used to perform the estimate (``data[groups]``).

    groups     : list of strings.
                 The ``groups`` argument, returned unchanged.

    Notes
    -----
    Based on Equation 1 of page 37 of Reardon, Sean F., and Glenn Firebaugh. "Measures of multigroup segregation." Sociological methodology 32.1 (2002): 33-67.

    Simpson's interaction index (I) can be interpreted as the probability that two individuals chosen at random and independently from the population will be found to not belong to the same group.

    Higher values mean less segregation.

    Simpson's Concentration + Simpson's Interaction = 1

    Reference: :cite:`reardon2002measures`.

    """

    core_data = data[groups]
    # NOTE(review): the value returned by _nan_handle is not used; the
    # array below is built from core_data -- confirm this is intended.
    _nan_handle(core_data)

    counts = np.array(core_data)

    # Overall share of each group: column totals over the grand total.
    group_totals = counts.sum(axis=0)
    shares = group_totals / counts.sum()

    complement = 1 - shares
    I = (shares * complement).sum()

    return I, core_data, groups
Пример #10
0
def _multi_normalized_exposure(data, groups):
    """
    Compute the Multigroup Normalized Exposure index.

    Parameters
    ----------

    data   : a pandas DataFrame

    groups : list of strings.
             Column names in data holding the groups of interest of the analysis.

    Returns
    -------

    statistic : float
                Multigroup Normalized Exposure Index

    core_data : a pandas DataFrame
                The columns of data used to perform the estimate (``data[groups]``).

    groups    : list of strings.
                The ``groups`` argument, returned unchanged.

    Notes
    -----
    Based on Reardon, Sean F., and Glenn Firebaugh. "Measures of multigroup segregation." Sociological methodology 32.1 (2002): 33-67.

    Reference: :cite:`reardon2002measures`.

    """

    core_data = data[groups]
    # NOTE(review): the value returned by _nan_handle is not used; every
    # array below is built from core_data -- confirm this is intended.
    _nan_handle(core_data)

    counts = np.array(core_data)

    grand_total = counts.sum()
    unit_totals = counts.sum(axis=1)

    # Share of each group inside every unit, and over the whole table.
    local_shares = counts / unit_totals[:, None]
    overall_shares = counts.sum(axis=0) / counts.sum()

    # Unit-weighted squared deviations, scaled per group by (1 - P_k).
    weighted_sq_gaps = unit_totals[:, None] * (local_shares - overall_shares)**2
    scaled = weighted_sq_gaps / (1 - overall_shares)

    MNE = scaled.sum() / grand_total

    return MNE, core_data, groups