def explore_categorical_imputation(self, variable):
        """
        Compare imputation strategies for the missing values of one categorical variable.

        Prints the number and percentage of missing values in ``variable`` and then
        calls ``plot_categories_with_target`` once per strategy so that the outcomes
        can be compared and the best suited one chosen:

        # 1st chart => the column as it is (existing categories)
        # 2nd chart => missing value replaced by the most frequent category; preceded by
        #              a KDE of the target for rows in the most frequent category (blue)
        #              versus rows where the value is missing (red)
        # 3rd chart => missing value replaced by the 'Missing' label
        # 4th chart => column ``<variable>_random`` of the frame returned by
        #              ``self.__random_category_imputation__(variable)``

        If the variable has no missing values a message is printed and nothing is plotted.

        Side effects:
            adds the columns ``<variable>_freq`` and ``<variable>_miss`` to ``self.__df__``.

        Parameters:
        -----------
            variable : name of the categorical column of ``self.__df__`` to explore

        Returns:
        --------
            None; every result is printed or plotted.
        """
        df = self.__df__
        c = variable
        target = self.__target__

        is_missing = df[c].isnull()
        n_miss = is_missing.sum()

        printmd ('**<u>Missing Values :</u>**')

        print ('  Number :', n_miss)
        print ('  Percentage :', is_missing.mean()*100, '%')
        print ()

        if n_miss == 0:
            # Nothing to impute, and the NA-only KDE further down would have no rows to draw.
            print ('No Missing values... ')
            print ('\n Stopping the process')
            return

        printmd(f'**<u>We have following options for Imputing the Missing Value for Categorical Variable, {c} :</u>**')
        print ('  1. Imputing missing values by Frequent Category' )
        print ('  2. Imputing missing values by Missing Label' )
        print ('  3. Imputing missing values by Randomly selected value' )

        print ()
        print ("Let's visualize the impact of each imputation and compare it with original distribution")
        print ()

        printmd ('**<u>1. Original Distribution of all Categories :</u>**')
        plot_categories_with_target(df, c, target = target)

        printmd ('**<u>2. All Categories after Frequent Category Imputation :</u>**')

        # Frequent value
        print ('Look at the Distibution of Frequent Category and Missing Data. Are there some major differences')
        fig = plt.figure(figsize = (8,4))
        ax = fig.add_subplot(111)

        # mode() returns every tied category and .item() raises unless there is exactly
        # one, so take the first mode instead.
        value = df[c].mode().iloc[0]
        print ('\n\nMost Frequent Category: ', value)

        # Target distribution of the rows that already hold the most frequent category
        df[df[c] == value][target].plot(kind = 'kde', ax = ax, color = 'blue')

        # Target distribution of the rows where the category is missing
        df[is_missing][target].plot(kind = 'kde', ax = ax, color = 'red')

        # Add the legend (same order as the two curves above)
        labels = ['Most Frequent category', 'with NA']
        ax.legend(labels, loc = 'best')
        plt.show()

        df[c+'_freq'] = df[c].fillna(value)

        plot_categories_with_target(df, c+'_freq', target = target)

        print ("3. All Categories after Missing Label Imputation")
        df[c+'_miss'] = df[c].fillna('Missing')

        plot_categories_with_target(df, c+'_miss', target = target)

        print ("4. All Categories after Randomly Selected Value Imputation")
        # The helper is expected to return a frame holding a '<variable>_random' column.
        temp = self.__random_category_imputation__(c)
        plot_categories_with_target(temp, c+'_random', target = target)
Пример #2
0
    def eda_numerical_variable(self, variable, distance=1.5):
        '''
        Print and plot an exploratory summary of one numerical variable.

        Parameter:
            variable: pass the variable (column of self.__df__) for which EDA is required
            distance: multiplier applied to the IQR when computing the outlier
                      boundaries, defaults to 1.5

        Reports, in order: number of observations and datatype, 5 point summary,
        missing values, distribution plot with skewness and kurtosis, Q-Q plot,
        box plot, outlier counts using the IQR rule, and the output of
        normality_diagnostic for the logarithmic, exponential, square, square-root,
        Box-Cox and Yeo Johnson transformations.

        Missing values are left out of the plots, the quantiles and the
        transformations. The Box-Cox transformation is skipped when the variable
        holds a zero or negative value.

        Returns None; every result is printed or plotted.
        '''
        c = variable
        s = self.__df__[variable]
        # Non-missing observations, shared by the plots and the transformations below
        s_valid = s.dropna()

        # 1. Basic Statistics

        print ('Total Number of observations : ', len(s))
        print ()

        print ('Datatype :', (s.dtype))
        print ()

        printmd ('**<u>5 Point Summary :</u>**')

        print ('  Minimum  :\t\t', s.min(), '\n  25th Percentile :\t', s.quantile(0.25), 
               '\n  Median :\t\t', s.median(), '\n  75th Percentile :\t', s.quantile(0.75), 
               '\n  Maximum  :\t\t', s.max())

        print ()

        # 2. Missing values

        printmd ('**<u>Missing Values :</u>**')

        print ('  Number :', s.isnull().sum())
        print ('  Percentage :', s.isnull().mean()*100, '%')

        # 3. Histogram

        printmd ('**<u>Variable distribution and Spread statistics :</u>**')

        sns.distplot(s_valid, hist = True, fit = norm, kde = True)
        plt.show()

        # 4. Spread Statistics

        print ('Skewness :' , s.skew())
        print ('Kurtosis :', s.kurt())
        print ()

        # 5. Q-Q plot
        printmd ('**<u>Normality Check :</u>**')
        stats.probplot(s_valid, dist = 'norm', plot = plt)
        plt.show()

        # 6. Box plot to check the spread outliers
        print ()
        printmd ('**<u>Box Plot and Visual check for Outlier  :</u>**')
        sns.boxplot(s_valid, orient = 'v')
        plt.show()

        # 7. Get outliers, `distance` IQRs beyond the quartiles

        print ()
        printmd ('**<u>Outliers (using IQR):</u>**')

        # Series.quantile ignores missing values; np.quantile would return NaN as soon
        # as one value is missing, which made both outlier counts 0.
        q1 = s.quantile(0.25)
        q3 = s.quantile(0.75)
        IQR = q3 - q1
        upper_boundary = q3 + distance * IQR
        lower_boundary = q1 - distance * IQR

        print ('  Right end outliers :', np.sum(s>upper_boundary))
        print ('  Left end outliers :', np.sum(s < lower_boundary))

        # 8. Various Variable Transformations (on the non-missing observations)

        print ()
        printmd (f'**<u>Explore various transformations for {c}</u>**')
        print ()

        print ('1. Logarithmic Transformation')
        s_log = np.log(s_valid)
        normality_diagnostic(s_log)

        print ('2. Exponential Transformation')
        s_exp = np.exp(s_valid)
        normality_diagnostic(s_exp)

        print ('3. Square Transformation')
        s_sqr = np.square(s_valid)
        normality_diagnostic(s_sqr)

        print ('4. Square-root Transformation')
        s_sqrt = np.sqrt(s_valid)
        normality_diagnostic(s_sqrt)

        print ('5. Box-Cox Transformation')
        if (s_valid > 0).all():
            s_boxcox, lambda_param = stats.boxcox(s_valid)
            normality_diagnostic(s_boxcox)
            print ('Optimal Lambda for Box-Cox transformation is :', lambda_param )
        else:
            # stats.boxcox rejects non-positive data; skip it rather than abort
            # before the Yeo Johnson transformation is shown.
            print ('Skipped : Box-Cox transformation requires strictly positive values')
        print ()

        print ('6. Yeo Johnson Transformation')
        s_yeojohnson, lambda_param = stats.yeojohnson(s_valid.astype('float'))
        normality_diagnostic(s_yeojohnson)
        print ('Optimal Lambda for Yeo Johnson transformation is :', lambda_param )
        print ()
Пример #3
0
    def eda_categorical_variable(self, variable, add_missing=False, add_rare=False, tol=0.05):
        """
        Print and plot an exploratory summary of one categorical variable.

        Reports the number of observations, the cardinality with the distinct
        values, and the missing value count and percentage. It then calls the
        plotting helpers:
            - plot_categories for the variable as it is
            - plot_categories with add_missing=True, only when values are missing
            - plot_categories with add_missing=True and add_rare=True
            - plot_categories_with_target and plot_target_with_categories,
              only when self.__target__ is set

        Parameters :
        -----------
            variable : name of the column of self.__df__ for which EDA is required
            add_missing : currently not read by this method
            add_rare : currently not read by this method
            tol : threshold quoted in the rare label message, default 0.05

        Return :
        -------
            None; the summary is printed and the plots are drawn.
        """
        c = variable
        df = self.__df__
        s = df[variable]
        target = self.__target__

        # 1. Basic Statistics
        printmd ('**<u>Basic Info :</u>**')
        print ('Total Number of observations : ', len(s))
        print ()

        # 2. Cardinality
        # unique() keeps NaN, so a missing value counts as one distinct category here.
        distinct = s.unique()
        printmd ('**<u>Cardinality of the variable :</u>**')
        print ('Number of Distinct Categories (Cardinality): ', len(distinct))
        print ('Distinct Values : ', distinct)
        print ()

        # 3. Missing Values

        printmd ('**<u>Missing Values :</u>**')

        nmiss = s.isnull().sum()
        print ('  Number :', nmiss)
        print ('  Percentage :', s.isnull().mean()*100, '%')

        # 4. Plot Categories

        printmd ('**<u>Category Plots :</u>**')
        plot_categories(df, c)

        # 5. Plot Categories by including Missing Values

        if nmiss:
            # The heading previously left its <u> tag open.
            printmd ('**<u>Category plot by including Missing Values</u>**')
            plot_categories(df, c, add_missing = True)

        # 6. Plot categories by combining Rare label

        printmd ('**<u>Category plot by including missing (if any) and Rare labels</u>**')
        print (f'Categories less than {tol} value are clubbed in Rare label')
        # NOTE(review): tol is only printed, it is not forwarded to plot_categories;
        # confirm the helper's own threshold matches the value quoted above.
        plot_categories(df, c, add_missing = True, add_rare = True)

        if target:
            # 7. Plot categories with target
            printmd ('**<u>Category Plot and Mean Target value:</u>**')
            plot_categories_with_target(df, c, target)

            # 8. Plot distribution of target variable for each categories
            printmd ('**<u>Distribution of Target variable for all categories:</u>**')
            plot_target_with_categories(df, c, target)
    def explore_categorical_imputation(self, variable):
        '''
        Compares the results from various imputation methods for one categorical
        variable so that you can choose the best suited one.

        Parameters:
        -----------
            variable : name of the categorical column of self.__df__ to explore.
                       The target is read from self.__target__.

        Returns:
        --------
            None. The missing value count and percentage are printed and the
            following charts are drawn with plot_categories_with_target:

            # 1st chart => existing categories of the variable
            # 2nd chart => missing value replaced by frequent category; before it, a KDE
            #              of the target for the most frequent category (blue) and for
            #              the rows where the value is missing (red)
            # 3rd chart => missing value replaced by 'Missing' category
            # 4th chart => column '<variable>_random' of the frame returned by
            #              self.__random_category_imputation__(variable)

            When the variable has no missing values a message is printed and no
            chart is drawn.

        Side effects:
        -------------
            The columns '<variable>_freq' and '<variable>_miss' are added to self.__df__.
        '''
        df = self.__df__
        c = variable
        target = self.__target__

        missing_mask = df[c].isnull()
        n_miss = missing_mask.sum()

        printmd('**<u>Missing Values :</u>**')

        print('  Number :', n_miss)
        print('  Percentage :', missing_mask.mean() * 100, '%')
        print()

        if n_miss == 0:
            # Without missing rows there is nothing to impute and the NA-only KDE
            # below would have no data to draw.
            print('No Missing values... ')
            print('\n Stopping the process')
            return

        printmd(
            f'**<u>We have following options for Imputing the Missing Value for Categorical Variable, {c} :</u>**'
        )

        print('  1. Imputing missing values by Frequent Category')
        print('  2. Imputing missing values by Missing Label')
        print('  3. Imputing missing values by Randomly selected value')

        print()
        print(
            "Let's visualize the impact of each imputation and compare it with original distribution"
        )
        print()

        printmd('**<u>1. Original Distribution of all Categories :</u>**')
        plot_categories_with_target(df, c, target=target)

        printmd(
            '**<u>2. All Categories after Frequent Category Imputation :</u>**'
        )

        # Frequent value
        print(
            'Look at the Distibution of Frequent Category and Missing Data. Are there some major differences'
        )
        fig = plt.figure(figsize=(8, 4))
        ax = fig.add_subplot(111)

        # Several categories can tie for the mode; .item() raises in that case,
        # so the first mode returned is used.
        value = df[c].mode().iloc[0]
        print('\n\nMost Frequent Category: ', value)

        frequent_rows = df[df[c] == value]
        frequent_rows[target].plot(kind='kde', ax=ax, color='blue')

        # NA Value
        missing_rows = df[missing_mask]
        missing_rows[target].plot(kind='kde', ax=ax, color='red')

        # Add the legend, in the order the two curves were drawn
        labels = ['Most Frequent category', 'with NA']
        ax.legend(labels, loc='best')
        plt.show()

        df[c + '_freq'] = df[c].fillna(value)

        plot_categories_with_target(df, c + '_freq', target=target)

        print("3. All Categories after Missing Label Imputation")
        df[c + '_miss'] = df[c].fillna('Missing')

        plot_categories_with_target(df, c + '_miss', target=target)

        print("4. All Categories after Randomly Selected Value Imputation")
        temp = self.__random_category_imputation__(c)
        plot_categories_with_target(temp, c + '_random', target=target)
Пример #5
0
    def eda_categorical_variable(self, variable, add_missing=False, add_rare=False, tol=0.05):
        """
        This function provides EDA for a Categorical variable, this includes
            - Counts
            - Cardinality, number of distinct values in the variable
            - Missing values counts and percentages

        Also Category wise basic plots will be generated for the given variable
            - Plot Categories
            - Plot Categories by including Missing Values (only when values are missing)
            - Plot categories by combining Rare label
            - Plot categories with target (only when self.__target__ is set)
            - Plot distribution of target variable for each categories
              (only when self.__target__ is set)

        Parameters :
        -----------
            variable : Pass the variable (column of self.__df__) for which EDA is required
            add_missing : currently not read by this method
            add_rare : currently not read by this method
            tol : Threshold quoted in the rare label message, default 0.05

        The dataset and the target are not arguments: they are read from
        self.__df__ and self.__target__.

        Return :
        -------
            None; the summary is printed and the plots are drawn.
        """
        df = self.__df__
        c = variable
        s = df[c]
        target = self.__target__

        # 1. Basic Statistics
        printmd ('**<u>Basic Info :</u>**')
        print ('Total Number of observations : ', len(s))
        print ()

        # 2. Cardinality
        # unique() keeps NaN, so a missing value is counted as one distinct category.
        categories = s.unique()
        printmd ('**<u>Cardinality of the variable :</u>**')
        print ('Number of Distinct Categories (Cardinality): ', len(categories))
        print ('Distinct Values : ', categories)
        print ()

        # 3. Missing Values

        printmd ('**<u>Missing Values :</u>**')

        missing = s.isnull()
        nmiss = missing.sum()
        print ('  Number :', nmiss)
        print ('  Percentage :', missing.mean()*100, '%')

        # 4. Plot Categories

        printmd ('**<u>Category Plots :</u>**')
        plot_categories(df, c)

        # 5. Plot Categories by including Missing Values

        if nmiss:
            # Closing </u> added: the heading used to leave the underline tag open.
            printmd ('**<u>Category plot by including Missing Values</u>**')
            plot_categories(df, c, add_missing = True)

        # 6. Plot categories by combining Rare label

        printmd ('**<u>Category plot by including missing (if any) and Rare labels</u>**')
        print (f'Categories less than {tol} value are clubbed in Rare label')
        # NOTE(review): tol is quoted in the message but not passed to plot_categories;
        # verify that the helper applies the same threshold.
        plot_categories(df, c, add_missing = True, add_rare = True)

        if not target:
            # The remaining plots need a target variable.
            return

        # 7. Plot categories with target
        printmd ('**<u>Category Plot and Mean Target value:</u>**')
        plot_categories_with_target(df, c, target)

        # 8. Plot distribution of target variable for each categories
        printmd ('**<u>Distribution of Target variable for all categories:</u>**')
        plot_target_with_categories(df, c, target)
Пример #6
0
    def explore_categorical_imputation(self, variable):
        """
        In this function you just pass each variable one-by-one to explore the various missing value treatments

        Parameters:
        -----------
            variable : single categorical variable (column of self.__miss_df__).
                       The target is read from self.__target__.

        Returns:
        --------
            None. The missing value count and percentage are printed; when nothing
            is missing the method stops there. Otherwise the results of the
            imputation methods are drawn with plot_categories_with_target so that
            you can choose the best suited one:

            # 1st chart => existing categories of the variable
            # 2nd chart => missing value replaced by frequent category; preceded by a KDE
            #              of the target for the most frequent category (blue) and for
            #              the missing rows (red), drawn only when more than 10 values
            #              are missing
            # 3rd chart => missing value replaced by 'Missing' category
            # 4th chart => column '<variable>_random' of the frame returned by
            #              self.__random_category_imputation__(variable), drawn only
            #              when less than 50% of the values are missing

        Side effects:
        -------------
            The columns '<variable>_freq' and '<variable>_miss' are added to
            self.__miss_df__.
        """

        miss_df = self.__miss_df__
        c = variable
        target = self.__target__

        printmd('**<u>Missing Values :</u>**')

        missing_mask = miss_df[c].isnull()
        n_miss = missing_mask.sum()
        n_miss_perc = missing_mask.mean() * 100
        print('  Number :', n_miss)
        print('  Percentage :', n_miss_perc, '%')
        print()

        if n_miss == 0:
            print('No Missing values... ')
            print('\n Stopping the process')
            return

        printmd(
            f'**<u>We have following options for Imputing the Missing Value for Categorical Variable, {c} :</u>**'
        )

        print('  1. Imputing missing values by Frequent Category')
        print('  2. Imputing missing values by Missing Label')
        print('  3. Imputing missing values by Randomly selected value')

        print()
        print(
            "Let's visualize the impact of each imputation and compare it with original distribution"
        )
        print()

        printmd('**<u>1. Original Distribution of all Categories :</u>**')
        plot_categories_with_target(miss_df, c, target=target)

        printmd(
            '**<u>2. All Categories after Frequent Category Imputation :</u>**'
        )

        # Frequent value
        print(
            'Look at the Distibution of Frequent Category and Missing Data. Are there some major differences'
        )

        # Determined before the size check: the imputation below needs it even when
        # the KDE plot is skipped (it used to be assigned only inside the branch,
        # leaving it unbound for 1-10 missing values). The first mode is taken
        # because .item() raises when several categories tie.
        value = miss_df[c].mode().iloc[0]

        if n_miss > 10:
            fig = plt.figure(figsize=(8, 4))
            ax = fig.add_subplot(111)

            print('\n\nMost Frequent Category: ', value)

            miss_df[miss_df[c] == value][target].plot(
                kind='kde', ax=ax, color='blue')

            # NA Value
            miss_df[missing_mask][target].plot(kind='kde',
                                               ax=ax,
                                               color='red')

            # Add the legend, in the order the two curves were drawn
            labels = ['Most Frequent category', 'with NA']
            ax.legend(labels, loc='best')
            plt.show()
        else:
            print(
                'Not plotting the KDE plot because number of missing values is less than 10'
            )

        miss_df[c + '_freq'] = miss_df[c].fillna(value)

        plot_categories_with_target(miss_df,
                                    c + '_freq',
                                    target=target)

        print("3. All Categories after Missing Label Imputation")
        miss_df[c + '_miss'] = miss_df[c].fillna('Missing')

        plot_categories_with_target(miss_df,
                                    c + '_miss',
                                    target=target)

        print("4. All Categories after Randomly Selected Value Imputation")
        if n_miss_perc < 50:
            temp = self.__random_category_imputation__(c)
            plot_categories_with_target(temp,
                                        c + '_random',
                                        target=target)
        else:
            print(
                'Since more than 50% Missing Value... Random value transformation is not advisable '
            )