import warnings

import numpy as np
import pandas as pd
from tqdm import tqdm

# NOTE: `_generate_counterfactual` is an internal helper assumed to be
# defined elsewhere in the `segregation` package and in scope here.


def _decompose_segregation(index1,
                           index2,
                           counterfactual_approach='composition'):
    """Decompose segregation differences into spatial and attribute components.

    Given two segregation indices of the same type, use Shapley decomposition
    to measure whether the differences between index measures arise from
    differences in spatial structure or population structure.

    Parameters
    ----------
    index1 : segregation.SegIndex class
        First SegIndex class to compare.
    index2 : segregation.SegIndex class
        Second SegIndex class to compare.
    counterfactual_approach : str, one of
                              ["composition", "share", "dual_composition"]
        The technique used to generate the counterfactual population
        distributions.

    Returns
    -------
    tuple
        (shapley spatial component,
         shapley attribute component,
         core data of index1,
         core data of index2,
         data with counterfactual variables for index1,
         data with counterfactual variables for index2,
         counterfactual approach used)

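    Examples
    --------
    A minimal sketch; ``Dissim`` is the aspatial dissimilarity index
    (import path assumed from the 1.x API), and ``gdf_2000`` /
    ``gdf_2010`` are hypothetical GeoDataFrames standing in for the
    user's own data:

    >>> from segregation.aspatial import Dissim  # doctest: +SKIP
    >>> index_2000 = Dissim(gdf_2000, 'group_pop_var', 'total_pop_var')  # doctest: +SKIP
    >>> index_2010 = Dissim(gdf_2010, 'group_pop_var', 'total_pop_var')  # doctest: +SKIP
    >>> C_S, C_A, *rest = _decompose_segregation(index_2000, index_2010)  # doctest: +SKIP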
    """
    df1 = index1.core_data.copy()
    df2 = index2.core_data.copy()

    assert index1._function == index2._function, "Segregation indices must be of the same type"

    counterfac_df1, counterfac_df2 = _generate_counterfactual(
        df1,
        df2,
        'group_pop_var',
        'total_pop_var',
        counterfactual_approach=counterfactual_approach)

    seg_func = index1._function

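    # Shapley decomposition: evaluate the index under all four combinations
    # of spatial structure (S) and population attributes (A), then average
    # each factor's marginal contribution across the two orderings.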
    # index for spatial 1, attribute 1
    G_S1_A1 = index1.statistic

    # index for spatial 2, attribute 2
    G_S2_A2 = index2.statistic

    # index for spatial 1 attribute 2 (counterfactual population for structure 1)
    G_S1_A2 = seg_func(counterfac_df1, 'counterfactual_group_pop',
                       'counterfactual_total_pop')[0]

    # index for spatial 2 attribute 1 (counterfactual population for structure 2)
    G_S2_A1 = seg_func(counterfac_df2, 'counterfactual_group_pop',
                       'counterfactual_total_pop')[0]

    # take the average difference in spatial structure, holding attributes constant
    C_S = 1 / 2 * (G_S1_A1 - G_S2_A1 + G_S1_A2 - G_S2_A2)

    # take the average difference in attributes, holding spatial structure constant
    C_A = 1 / 2 * (G_S1_A1 - G_S1_A2 + G_S2_A1 - G_S2_A2)
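
    # The two components recover the total difference exactly:
    # C_S + C_A == G_S1_A1 - G_S2_A2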

    return C_S, C_A, df1, df2, counterfac_df1, counterfac_df2, counterfactual_approach
def _compare_segregation(seg_class_1,
                         seg_class_2,
                         iterations_under_null=500,
                         null_approach="random_label",
                         **kwargs):
    '''
    Perform comparative inference for two segregation measures

    Parameters
    ----------

    seg_class_1           : a PySAL segregation object to be compared to seg_class_2

    seg_class_2           : a PySAL segregation object to be compared to seg_class_1

    iterations_under_null : number of iterations under the null hypothesis

    null_approach         : argument that specifies which type of null hypothesis the inference will simulate under.

        "random_label"                    : randomly relabels the data in each iteration

        "counterfactual_composition"      : randomizes the minority population according to the cumulative distribution functions of a variable that represents the composition of the minority group in both datasets. The composition is the minority population of unit i divided by the total population of unit i.

        "counterfactual_share"            : randomizes the minority population and the total population according to the cumulative distribution functions of a variable that represents the share of the minority group in both datasets. The share is the minority population of unit i divided by the total minority population.

        "counterfactual_dual_composition" : applies "counterfactual_composition" to both the minority and the complementary group.

    **kwargs : customizable parameters to pass to the segregation measures. Usually they should be the same as those used to build seg_class_1 and seg_class_2.
    
    Attributes
    ----------

    p_value        : float
                     Two-Tailed p-value
    
    est_sim        : numpy array
                     Estimates of the segregation measure differences under the null hypothesis
                  
    est_point_diff : float
                     Point estimation of the difference between the segregation measures
                
    Notes
    -----
    This function performs inference to compare two segregation measures. These can be either two measures of the same location at two different points in time, or measures of two different locations at the same point in time.

    The null hypothesis is H0: Segregation_1 is not different from Segregation_2.
    
    Based on Rey, Sergio J., and Myrna L. Sastré-Gutiérrez. "Interregional inequality dynamics in Mexico." Spatial Economic Analysis 5.3 (2010): 277-298.
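
    Examples
    --------
    A minimal sketch; ``Dissim`` and the GeoDataFrames ``gdf_a`` and
    ``gdf_b`` are hypothetical stand-ins for the user's own index class
    and data:

    >>> index_a = Dissim(gdf_a, 'group_pop_var', 'total_pop_var')  # doctest: +SKIP
    >>> index_b = Dissim(gdf_b, 'group_pop_var', 'total_pop_var')  # doctest: +SKIP
    >>> p_value, est_sim, point_diff, name = _compare_segregation(
    ...     index_a, index_b, iterations_under_null=1000)  # doctest: +SKIP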

    '''

    if null_approach not in [
            'random_label', 'counterfactual_composition',
            'counterfactual_share', 'counterfactual_dual_composition'
    ]:
        raise ValueError(
            'null_approach must be one of \'random_label\', \'counterfactual_composition\', \'counterfactual_share\', \'counterfactual_dual_composition\''
        )

    if type(seg_class_1) is not type(seg_class_2):
        raise TypeError('seg_class_1 and seg_class_2 must be the same type/class.')

    point_estimation = seg_class_1.statistic - seg_class_2.statistic

    aux = str(type(seg_class_1))
    _class_name = aux[1 + aux.rfind('.'):-2]  # 'rfind' finds the last occurrence of a pattern in a string

    data_1 = seg_class_1.core_data.copy()
    data_2 = seg_class_2.core_data.copy()
    
    est_sim = np.empty(iterations_under_null)

    ################
    # RANDOM LABEL #
    ################
    if (null_approach == "random_label"):
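        # Under the random-label null, the two datasets are pooled and the
        # group labels are reshuffled in each iteration before the indices
        # are recomputed on the two relabeled halves.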
        
        data_1['grouping_variable'] = 'Group_1'
        data_2['grouping_variable'] = 'Group_2'
        
        if ('multigroup' not in str(type(seg_class_1))):
            
            # Ensure each frequency column is an integer of the same dtype so the two datasets can be stacked
            data_1['group_pop_var'] = round(data_1['group_pop_var']).astype(int)
            data_1['total_pop_var'] = round(data_1['total_pop_var']).astype(int)
        
            data_2['group_pop_var'] = round(data_2['group_pop_var']).astype(int)
            data_2['total_pop_var'] = round(data_2['total_pop_var']).astype(int)
            
            stacked_data = pd.concat([data_1, data_2], ignore_index=True)
    
            with tqdm(total=iterations_under_null) as pbar:
                for i in range(iterations_under_null):
                    
                    stacked_data['grouping_variable'] = np.random.permutation(stacked_data['grouping_variable'])
    
                    stacked_data_1 = stacked_data.loc[stacked_data['grouping_variable'] == 'Group_1']
                    stacked_data_2 = stacked_data.loc[stacked_data['grouping_variable'] == 'Group_2']
    
                    simulations_1 = seg_class_1._function(stacked_data_1,'group_pop_var','total_pop_var',**kwargs)[0]
                    simulations_2 = seg_class_2._function(stacked_data_2,'group_pop_var','total_pop_var',**kwargs)[0]
    
                    est_sim[i] = simulations_1 - simulations_2
                    pbar.set_description('Processed {} iterations out of {}'.format(i + 1, iterations_under_null))
                    pbar.update(1)
                    
        if ('multigroup' in str(type(seg_class_1))):
            
            if (seg_class_1._groups != seg_class_2._groups):
                raise ValueError('MultiGroup groups should be the same')

            groups_list = seg_class_1._groups

            # Ensure each group column is an integer of the same dtype so the two datasets can be stacked
            for group in groups_list:
                data_1[group] = round(data_1[group]).astype(int)
                data_2[group] = round(data_2[group]).astype(int)

            stacked_data = pd.concat([data_1, data_2], ignore_index=True)
            
            with tqdm(total=iterations_under_null) as pbar:
                for i in range(iterations_under_null):
                    
                    stacked_data['grouping_variable'] = np.random.permutation(stacked_data['grouping_variable'])
    
                    stacked_data_1 = stacked_data.loc[stacked_data['grouping_variable'] == 'Group_1']
                    stacked_data_2 = stacked_data.loc[stacked_data['grouping_variable'] == 'Group_2']
    
                    simulations_1 = seg_class_1._function(stacked_data_1, groups_list, **kwargs)[0]
                    simulations_2 = seg_class_2._function(stacked_data_2, groups_list, **kwargs)[0]
    
                    est_sim[i] = simulations_1 - simulations_2
                    pbar.set_description('Processed {} iterations out of {}'.format(i + 1, iterations_under_null))
                    pbar.update(1)
    
    
    ##############################
    # COUNTERFACTUAL COMPOSITION #
    ##############################
    if (null_approach in ['counterfactual_composition', 'counterfactual_share', 'counterfactual_dual_composition']):
        
        if ('multigroup' in str(type(seg_class_1))):
            raise ValueError('Not implemented for MultiGroup indexes.')

        internal_arg = null_approach[15:]  # Remove 'counterfactual_' from the beginning of the string

        counterfac_df1, counterfac_df2 = _generate_counterfactual(
            data_1,
            data_2,
            'group_pop_var',
            'total_pop_var',
            counterfactual_approach=internal_arg)

        if (null_approach in [
                'counterfactual_share', 'counterfactual_dual_composition'
        ]):
            data_1['total_pop_var'] = counterfac_df1[
                'counterfactual_total_pop']
            data_2['total_pop_var'] = counterfac_df2[
                'counterfactual_total_pop']
        with tqdm(total=iterations_under_null) as pbar:
            for i in range(iterations_under_null):

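                # Fair-coin mixing: each unit keeps its observed group population
                # with probability 1/2 and receives its counterfactual value otherwise.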
                data_1['fair_coin'] = np.random.uniform(size=len(data_1))
                data_1['test_group_pop_var'] = np.where(
                    data_1['fair_coin'] > 0.5, data_1['group_pop_var'],
                    counterfac_df1['counterfactual_group_pop'])

                # Dropping to avoid confusion in the internal function
                data_1_test = data_1.drop(['group_pop_var'], axis=1)

                simulations_1 = seg_class_1._function(data_1_test,
                                                      'test_group_pop_var',
                                                      'total_pop_var',
                                                      **kwargs)[0]

                # Dropping to avoid confusion in the next iteration
                data_1 = data_1.drop(['fair_coin', 'test_group_pop_var'],
                                     axis=1)

                data_2['fair_coin'] = np.random.uniform(size=len(data_2))
                data_2['test_group_pop_var'] = np.where(
                    data_2['fair_coin'] > 0.5, data_2['group_pop_var'],
                    counterfac_df2['counterfactual_group_pop'])

                # Dropping to avoid confusion in the internal function
                data_2_test = data_2.drop(['group_pop_var'], axis=1)

                simulations_2 = seg_class_2._function(data_2_test,
                                                      'test_group_pop_var',
                                                      'total_pop_var',
                                                      **kwargs)[0]

                # Dropping to avoid confusion in the next iteration
                data_2 = data_2.drop(['fair_coin', 'test_group_pop_var'],
                                     axis=1)

                est_sim[i] = simulations_1 - simulations_2

                pbar.set_description(
                    'Processed {} iterations out of {}'.format(
                        i + 1, iterations_under_null))
                pbar.update(1)

    # Check for, and if present remove, iterations under the null that resulted in NaN or infinite values
    if any((np.isinf(est_sim) | np.isnan(est_sim))):
        warnings.warn(
            'Some estimates under the null hypothesis resulted in NaN or infinite values. These values were removed from the final results.'
        )
        est_sim = est_sim[~(np.isinf(est_sim) | np.isnan(est_sim))]

    # Two-Tailed p-value
    # Obs.: the null distribution can be located far from zero. Therefore, this is the appropriate way to calculate the two-tailed p-value.
    aux1 = (point_estimation < est_sim).sum()
    aux2 = (point_estimation > est_sim).sum()
    p_value = 2 * np.array([aux1, aux2]).min() / len(est_sim)

    return p_value, est_sim, point_estimation, _class_name