Example #1
def create_or_load(train_path):
    """Reads dataset from a CSV file or previously saved binary formats."""

    temp_dir = train_path.parent / 'tmp'
    temp_dir.mkdir(parents=True, exist_ok=True)

    temp_data = temp_dir / 'data.feather'
    if temp_data.exists():
        print(f'Loading previously saved training data: {temp_data}')
        data = feather.read_dataframe(temp_data)
    else:
        print(f'Reading the CSV file with training data: {train_path}')
        data = pd.read_csv(train_path, low_memory=False)
        print('Saving data frame into feather file...')
        data.to_feather(temp_data)

    temp_summary = temp_dir / 'summary.pickle'
    if temp_summary.exists():
        print(f'Loading previously saved summary: {temp_summary}')
        state = pickle.load(temp_summary.open('rb'))
        summary = DataFrameSummary(pd.DataFrame())
        summary.__dict__.update(state)
    else:
        print('Generating summary statistics')
        summary = DataFrameSummary(data)
        print('Saving summary into pickle file...')
        with temp_summary.open('wb') as file:
            state = {'length': summary.length,
                     'columns_stats': summary.columns_stats,
                     'corr': summary.corr}
            pickle.dump(state, file)

    return data, summary
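A minimal usage sketch for the loader above (hedged: the imports and the data/train.csv path are illustrative assumptions, not part of the original snippet). The first call reads the CSV and caches data.feather and summary.pickle under ./data/tmp; later calls load the cached binaries instead.

from pathlib import Path
import pickle

import feather  # feather-format package, provides read_dataframe
import pandas as pd
from pandas_summary import DataFrameSummary

data, summary = create_or_load(Path('data/train.csv'))
print(summary.columns_stats)  # counts / uniques / missing / missing_perc / types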
Example #2
    def test_get_list_of_type(self):
        xdfs = DataFrameSummary(self.xdf)
        the_type = "numeric"
        columns = xdfs._get_list_of_type(the_type)
        print(columns)
        # xdfs.get_numeric_summary()
        cols = [x.encode('ascii') for x in columns]
        print(cols)
Example #3
    def test_clean_column_on_excel(self):
        xdfs = DataFrameSummary(self.xdf)
        xdf_columns = self.xdf.columns.tolist()

        print(xdfs._clean_column(xdf_columns[0]))

        for x in xdf_columns:
            # print(xdfs._clean_column(x))
            self.assertTrue(xdfs._clean_column(x))
Example #4
    def test_numer_format_works_as_expected(self):
        float_nums = [(123.123, '123.12'), (123.1243453, '123.12'),
                      (213213213.123, '213,213,213.12')]
        int_nums = [(213214, '213,214'), (123213.00, '123,213')]

        for num, expected in float_nums:
            self.assertEqual(DataFrameSummary._number_format(num), expected)

        for num, expected in int_nums:
            self.assertEqual(DataFrameSummary._number_format(num), expected)
Example #5
    def test_get_perc_works_as_expected(self):
        float_nums = [(0.123, '12.30%'), (3.1243453, '312.43%'),
                      (213.12312, '21,312.31%')]

        int_nums = [(0.14, '14%'), (1.300, '130%')]

        for num, expected in float_nums:
            self.assertEqual(DataFrameSummary._percent(num), expected)

        for num, expected in int_nums:
            self.assertEqual(DataFrameSummary._percent(num), expected)

    def test_numer_format_works_as_expected(self):
        float_nums = [(123.123, '123.12'),
                      (123.1243453, '123.12'),
                      (213213213.123, '213,213,213.12')]
        int_nums = [(213214, '213,214'),
                    (123213.00, '123,213')]

        for num, expected in float_nums:
            self.assertEqual(DataFrameSummary._number_format(num), expected)

        for num, expected in int_nums:
            self.assertEqual(DataFrameSummary._number_format(num), expected)

    def test_get_perc_works_as_expected(self):
        float_nums = [(0.123, '12.30%'),
                      (3.1243453, '312.43%'),
                      (213.12312, '21,312.31%')]

        int_nums = [(0.14, '14%'),
                    (1.300, '130%')]

        for num, expected in float_nums:
            self.assertEqual(DataFrameSummary._percent(num), expected)

        for num, expected in int_nums:
            self.assertEqual(DataFrameSummary._percent(num), expected)

    def test_bool1_summary(self):
        count_values = self.df['dbool1'].value_counts()
        total_count = self.df['dbool1'].count()
        count0 = count_values[0]
        count1 = count_values[1]
        perc0 = DataFrameSummary._percent(count0 / total_count)
        perc1 = DataFrameSummary._percent(count1 / total_count)
        expected = pd.Series(index=['"0" count', '"0" perc', '"1" count', '"1" perc',
                                    'counts', 'uniques', 'missing', 'missing_perc', 'types'],
                             data=[str(count0), perc0, str(count1), perc1,
                                   self.size, 2, 0, '0%', DataFrameSummary.TYPE_BOOL],
                             name='dbool1',
                             dtype=object).sort_index()

        assert_series_equal(self.dfs['dbool1'].sort_index(), expected)
Example #9
def check_data_completeness(df):
    """
    Automated test to ensure data is complete and has no missing values.
    """
    df_summary = DataFrameSummary(df).summary()
    for col in df_summary.columns:
        assert df_summary.loc['missing', col] == 0, f'{col} has missing values'
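A hedged usage sketch (the two toy frames are assumptions): a frame without NaNs passes silently, while a column containing NaN trips the assertion and names the column.

import numpy as np
import pandas as pd
from pandas_summary import DataFrameSummary

check_data_completeness(pd.DataFrame({'a': [1, 2, 3]}))       # passes
check_data_completeness(pd.DataFrame({'a': [1, np.nan, 3]}))  # AssertionError: a has missing values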
Example #10
def cleaningFeatures(pData, pRelMaxMissing=0.75):
    vDataSummary = DataFrameSummary(pData)  # gather feature statistics
    vCleanedData = pData.copy()

    vColumns = vDataSummary.columns_stats  # stats frame, one column per feature

    for fea in vColumns:
        # stats for one feature: [counts, uniques, missing, missing_perc, types]
        vTemp = vDataSummary.columns_stats[fea]

        # print(fea[0:5])

        if fea == 'NewID':  # drop the ID column
            del vCleanedData[fea]
        elif fea[0:2] == 'MC':  # drop MC truths prefixed with MC
            del vCleanedData[fea]
        elif fea[0:2] == 'I3':  # drop MC truths prefixed with I3
            del vCleanedData[fea]
        elif fea[0:6] == 'Weight':  # drop MC truths prefixed with Weight
            del vCleanedData[fea]
        elif fea[0:7] == 'Corsika':  # drop MC truths prefixed with Corsika
            del vCleanedData[fea]
        elif (vTemp[2] / (vTemp[0] + vTemp[2])) >= pRelMaxMissing:
            del vCleanedData[fea]  # too incomplete (missing-value ratio)
        elif vTemp[1] <= 1:  # constant feature
            del vCleanedData[fea]

    return vCleanedData
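A hedged call sketch for the cleaner above (the toy frame is an assumption; the real inputs are IceCube-style feature tables). 'NewID' and the 'MC' prefix are dropped by name, 'mostly_nan' by the missing-value ratio, and 'const' as a constant; note that vTemp[2] etc. rely on positional Series indexing, which newer pandas versions warn about.

import numpy as np
import pandas as pd
from pandas_summary import DataFrameSummary

raw = pd.DataFrame({
    'NewID': range(4),
    'MCEnergy': [1.0, 2.0, 3.0, 4.0],             # 'MC' prefix -> dropped
    'mostly_nan': [0.1, np.nan, np.nan, np.nan],  # 75% missing -> dropped
    'const': [1.0, 1.0, 1.0, 1.0],                # one unique value -> dropped
    'good': [0.3, 0.1, 0.4, 0.2],                 # kept
})
print(cleaningFeatures(raw, pRelMaxMissing=0.75).columns.tolist())  # ['good']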
Example #11
    def test_numerics_summary(self):
        num1 = self.df['dnumerics1']
        dm, dmp = self.dfs._get_deviation_of_mean(num1)
        dam, damp = self.dfs._get_median_absolute_deviation(num1)
        expected = pd.Series(index=['mean', 'std', 'variance', 'min', 'max', '5%', '25%', '50%',
                                    '75%', '95%', 'iqr', 'kurtosis', 'skewness', 'sum', 'mad', 'cv',
                                    'zeros_num', 'zeros_perc', 'deviating_of_mean',
                                    'deviating_of_mean_perc', 'deviating_of_median',
                                    'deviating_of_median_perc', 'top_correlations', 'counts',
                                    'uniques', 'missing', 'missing_perc', 'types'],
                             data=[num1.mean(), num1.std(), num1.var(), num1.min(), num1.max(),
                                   num1.quantile(0.05), num1.quantile(
                                       0.25), num1.quantile(0.5),
                                   num1.quantile(0.75), num1.quantile(0.95),
                                   num1.quantile(0.75) - num1.quantile(0.25),
                                   num1.kurt(), num1.skew(), num1.sum(), num1.mad(),
                                   num1.std() / num1.mean() if num1.mean() else np.nan,
                                   self.size - np.count_nonzero(num1),
                                   DataFrameSummary._percent(
                                       (self.size - np.count_nonzero(num1))/self.size),
                                   dm, dmp, dam, damp, 'dnumerics2: 100%', self.size, self.size,
                                   0, '0%', DataFrameSummary.TYPE_NUMERIC],
                             name='dnumerics1',
                             dtype=object)

        assert_series_equal(self.dfs['dnumerics1'], expected)
Example #12
    def setUp(self):
        self.size = 1000
        missing = ([np.nan] * (self.size // 10) + list(range(10)) *
                   ((self.size - self.size // 10) // 10))
        shuffle(missing)

        self.types = [DataFrameSummary.TYPE_NUMERIC, DataFrameSummary.TYPE_BOOL,
                      DataFrameSummary.TYPE_CATEGORICAL, DataFrameSummary.TYPE_CONSTANT,
                      DataFrameSummary.TYPE_UNIQUE, DataFrameSummary.TYPE_DATE]

        self.columns = ['dbool1', 'dbool2', 'duniques', 'dcategoricals', 'dnumerics1', 'dnumerics2',
                        'dnumerics3', 'dmissing', 'dconstant', 'ddates']

        self.df = pd.DataFrame(dict(
            dbool1=np.random.choice([0, 1], size=self.size),
            dbool2=np.random.choice(['a', 'b'], size=self.size),
            duniques=['x{}'.format(i) for i in range(self.size)],
            dcategoricals=['a'.format(i) if i % 2 == 0 else
                           'b'.format(i) if i % 3 == 0 else
                           'c'.format(i) for i in range(self.size)],
            dnumerics1=range(self.size),
            dnumerics2=range(self.size,  2 * self.size),
            dnumerics3=list(range(self.size - self.size // 10)) + list(range(-self.size // 10, 0)),
            dmissing=missing,
            dconstant=['a'] * self.size,
            ddates=pd.date_range('2010-01-01', periods=self.size, freq='1M')))

        self.dfs = DataFrameSummary(self.df)
Example #13
def loadtypes(data_df):
    summary_df = DataFrameSummary(data_df).summary()
    # auto evaluate datatype
    contin_vars = [
        col for col in summary_df.columns
        if summary_df.loc["types"][col] == 'numeric'
    ]
    bool_vars = [
        col for col in summary_df.columns
        if summary_df.loc["types"][col] == 'bool'
    ]
    cat_vars = [
        col for col in summary_df.columns
        if summary_df.loc["types"][col] == 'categorical'
    ]
    dt_vars = [
        col for col in summary_df.columns
        if summary_df.loc["types"][col] == 'date'
    ]
    const_vars = [
        col for col in summary_df.columns
        if summary_df.loc["types"][col] == 'constant'
    ]
    text_vars = []
    for var in ['DEPT_CODE']:
        contin_vars.remove(var), cat_vars.append(var)
    for var in ['OP_NAME', 'INHOS_DIAG_NAME']:
        cat_vars.remove(var), text_vars.append(var)
    cat_vars.extend(bool_vars)
    return contin_vars, cat_vars, dt_vars, text_vars, const_vars
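A usage sketch, assuming the input frame actually contains the hardcoded DEPT_CODE, OP_NAME and INHOS_DIAG_NAME columns (otherwise the remove() calls raise ValueError); the toy values are chosen so the inferred types work out.

import pandas as pd
from pandas_summary import DataFrameSummary

df = pd.DataFrame({
    'DEPT_CODE': [101, 102, 101, 103],        # numeric dtype, recoded as categorical
    'OP_NAME': ['a', 'a', 'b', 'c'],          # categorical here, moved to text_vars
    'INHOS_DIAG_NAME': ['x', 'x', 'y', 'z'],  # categorical here, moved to text_vars
    'AGE': [30, 41, 52, 63],                  # stays continuous
})
contin_vars, cat_vars, dt_vars, text_vars, const_vars = loadtypes(df)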
Example #14
def datacompleteness(df):
    """
    Takes a dataframe df and returns summary statistics for each column including 
    missing values.
    """
    df_summary = DataFrameSummary(df).summary()
    return df_summary
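Usage is a one-liner (the toy frame is an assumption); the returned frame has one column per input column and rows such as counts, uniques, missing and missing_perc.

import pandas as pd
from pandas_summary import DataFrameSummary

print(datacompleteness(pd.DataFrame({'a': [1, None, 3], 'b': ['x', 'y', 'z']})))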
Example #15
    def setUp(self):
        self.size = 1000
        missing = [np.nan] * (self.size // 10) + list(range(10)) * (
            (self.size - self.size // 10) // 10)
        shuffle(missing)

        self.types = [
            DataFrameSummary.TYPE_NUMERIC, DataFrameSummary.TYPE_BOOL,
            DataFrameSummary.TYPE_CATEGORICAL, DataFrameSummary.TYPE_CONSTANT,
            DataFrameSummary.TYPE_UNIQUE, DataFrameSummary.TYPE_DATE
        ]

        self.columns = [
            'dbool1', 'dbool2', 'duniques1', 'duniques2', 'dcategoricals1',
            'dcategoricals2', 'dnumerics1', 'dnumerics2', 'dnumerics3',
            'dmissing', 'dconstant', 'ddates1', 'ddates2'
        ]

        self.df = pd.DataFrame(
            dict(
                dbool1=np.random.choice([0, 1], size=self.size),
                dbool2=np.random.choice(['a', 'b'], size=self.size),
                duniques1=['x{}'.format(i) for i in range(self.size)],
                duniques2=['y{}'.format(i) for i in range(self.size)],
                dcategoricals1=[
                    'a'.format(i) if i % 2 == 0 else 'b'.format(i) if i %
                    3 == 0 else 'c'.format(i) for i in range(self.size)
                ],
                dcategoricals2=[
                    'x'.format(i) if i % 2 == 0 else 'y'.format(i) if i %
                    3 == 0 else 'z'.format(i) for i in range(self.size)
                ],
                dnumerics1=range(self.size),
                dnumerics2=range(self.size, 2 * self.size),
                dnumerics3=list(range(self.size - self.size // 10)) +
                list(range(-self.size // 10, 0)),
                dmissing=missing,
                dconstant=['a'] * self.size,
                ddates1=pd.date_range('2010-01-01',
                                      periods=self.size,
                                      freq='1M'),
                ddates2=pd.date_range('2000-01-01',
                                      periods=self.size,
                                      freq='1W'),
            ))

        self.dfs = DataFrameSummary(self.df)
Example #16
def test_datacompleteness():
    """
    Takes a dataframe and checks for missing values.
    """
    df = pd.read_csv('exploration/data/titanic.csv',
                     usecols=['Name', 'Sex', 'Age', 'Survived'])
    df_summary = DataFrameSummary(df).summary()
    for col in df_summary.columns:
        assert df_summary.loc['missing', col] == 0, f'{col} has missing values'
Example #17
    def test_bool1_summary(self):
        count_values = self.df['dbool1'].value_counts()
        total_count = self.df['dbool1'].count()
        count0 = count_values[0]
        count1 = count_values[1]
        perc0 = DataFrameSummary._percent(count0 / total_count)
        perc1 = DataFrameSummary._percent(count1 / total_count)
        expected = pd.Series(index=[
            '"0" count', '"0" perc', '"1" count', '"1" perc', 'counts',
            'uniques', 'missing', 'missing_perc', 'types'
        ],
                             data=[
                                 str(count0), perc0,
                                 str(count1), perc1, self.size, 2, 0, '0%',
                                 DataFrameSummary.TYPE_BOOL
                             ],
                             name='dbool1',
                             dtype=object)

        assert_series_equal(self.dfs['dbool1'], expected)
Example #18
    def column_info(self, dataset="train"):
        """
        Describes your columns using the DataFrameSummary library with basic descriptive info.

        Credits go to @mouradmourafiq for his pandas-summary library.

        Info
        ----
        counts
        uniques
        missing
        missing_perc
        types
        
        Parameters
        ----------
        dataset : str, optional
            Type of dataset to describe. Can either be `train` or `test`.
            If you are using the full dataset it will automatically describe
            your full dataset no matter the input, 
            by default 'train'
        
        Returns
        -------
        DataFrame
            Dataframe describing your columns with basic descriptive info

        Examples
        ---------
        >>> data.column_info()
        """

        if dataset == "train":
            x_train_summary = DataFrameSummary(self.x_train)

            return x_train_summary.columns_stats
        else:
            x_test_summary = DataFrameSummary(self.x_test)

            return x_test_summary.columns_stats
Example #19
    def describe(self, dataset="train"):
        """
        Describes your dataset using the DataFrameSummary library with basic descriptive info.
        Extends the DataFrame.describe() method to give more info.

        Credits go to @mouradmourafiq for his pandas-summary library.
        
        Parameters
        ----------
        dataset : str, optional
            Type of dataset to describe. Can either be `train` or `test`.
            If you are using the full dataset it will automatically describe
            your full dataset no matter the input, 
            by default 'train'
        
        Returns
        -------
        DataFrame
            Dataframe describing your dataset with basic descriptive info

        Examples
        ---------
        >>> data.describe()
        """

        if dataset == "train":
            x_train_summary = DataFrameSummary(self.x_train)

            return x_train_summary.summary()
        else:
            x_test_summary = DataFrameSummary(self.x_test)

            return x_test_summary.summary()
Example #20
def ka_display_muti_tables_summary(tables, table_names):
    '''Display a summary for each of multiple tables.

        Parameters
        ----------
        tables: list_like
                Pandas dataframes
        table_names: list_like
                     names of each dataframe
    '''
    for t, t_name in zip(tables, table_names):
        print(t_name + ":")
        display(DataFrameSummary(t).summary())
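display here is IPython's display function; a hedged usage sketch for a notebook session (the toy frames are assumptions).

import pandas as pd
from IPython.display import display
from pandas_summary import DataFrameSummary

ka_display_muti_tables_summary(
    [pd.DataFrame({'a': [1, 2, 2]}), pd.DataFrame({'b': ['x', 'y', 'y']})],
    ['left', 'right'])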
Example #21
def ka_display_muti_tables_summary(tables, table_names, n=5):
    '''Display a summary for each of multiple tables.

        Parameters
        ----------
        tables: list_like
                Pandas dataframes
        table_names: list_like
                     names of each dataframe

        Displays
        --------
        1. head of each table
        2. column types of each table
        3. summary of each table
    '''
    for t, t_name in zip(tables, table_names):
        print(t_name + ":", t.shape)
        ka_display_side_by_side(t.head(n=n), _ka_display_col_type(t),
                                DataFrameSummary(t).summary())
Example #22
    def test_numerics_summary(self):
        num1 = self.df['dnumerics1']
        dm, dmp = self.dfs._get_deviation_of_mean(num1)
        dam, damp = self.dfs._get_median_absolute_deviation(num1)
        expected = pd.Series(
            index=[
                'mean', 'std', 'variance', 'min', 'max', 'mode', '5%', '25%',
                '50%', '75%', '95%', 'iqr', 'kurtosis', 'skewness', 'sum',
                'mad', 'cv', 'zeros_num', 'zeros_perc', 'deviating_of_mean',
                'deviating_of_mean_perc', 'deviating_of_median',
                'deviating_of_median_perc', 'top_correlations', 'counts',
                'uniques', 'missing', 'missing_perc', 'types'
            ],
            data=[
                num1.mean(),
                num1.std(),
                num1.var(),
                num1.min(),
                num1.max(),
                num1.mode()[0],
                num1.quantile(0.05),
                num1.quantile(0.25),
                num1.quantile(0.5),
                num1.quantile(0.75),
                num1.quantile(0.95),
                num1.quantile(0.75) - num1.quantile(0.25),
                num1.kurt(),
                num1.skew(),
                num1.sum(),
                num1.mad(),
                num1.std() / num1.mean() if num1.mean() else np.nan,
                self.size - np.count_nonzero(num1),
                DataFrameSummary._percent(
                    (self.size - np.count_nonzero(num1)) / self.size), dm, dmp,
                dam, damp, 'dnumerics2: 100%', self.size, self.size, 0, '0%',
                DataFrameSummary.TYPE_NUMERIC
            ],
            name='dnumerics1',
            dtype=object)

        assert_series_equal(self.dfs['dnumerics1'], expected)
Example #23
def replaceValues(pData, pTransform=False):
    vScaler = StandardScaler()
    vDataSummary = DataFrameSummary(pData)

    for col in pData.columns.values:
        # columns_stats row 4 is 'types'
        if vDataSummary.columns_stats[col][4] != 'numeric':
            # replace all non-numeric values with numeric codes and
            # fill bad entries (NaN, etc.) with 0
            vEleList = pData[col].value_counts().index.values
            vReplacements = np.linspace(1, len(vEleList), len(vEleList))

            pData[col] = pData[col].replace(to_replace=vEleList,
                                            value=vReplacements)
            pData[col] = pData[col].fillna(0)
        elif pData[col].dtype == 'int64':
            # fill bad entries in integer columns with 0
            pData[col] = pData[col].fillna(0)
            # pData[col] = vScaler.fit_transform(pData[col])
        else:
            # fill continuous columns with the column mean
            pData[col] = pData[col].fillna(pData[col].mean())
            # pData[col] = vScaler.fit_transform(pData[col])

    return pData
Example #24
    # Extract the patient's covariates as a numpy array
    x_df = df.drop([event_col, time_col], axis=1)
    x = x_df.values.astype(np.float32)

    # Return the deep surv dataframe
    return x, {'e': e, 't': t}


# Please fill in the path to the SQL-exported CSV file below
temp_features = pd.read_csv(r'', sep=',')

# =============================================================================
# #Get summary for features and for each individual class labels
# =============================================================================
from pandas_summary import DataFrameSummary
df_summary = DataFrameSummary(temp_features)
temp_features.describe().transpose().join(
    df_summary.columns_stats.transpose()).to_csv(r'T:\tbase\feature_stats.csv')

# =============================================================================
# #Remove constant features
# =============================================================================
for index, row in df_summary.columns_stats.transpose()[
        df_summary.columns_stats.transpose()['types'].str.lower().str.contains(
            'constant')].iterrows():
    print('Removed column ' + index + ' (constant)')
    temp_features.drop([index], axis=1, inplace=True)

print('The shape of our features is:', temp_features.shape)
temp_features.describe()
feature_list = list(temp_features.columns)
Example #25
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(rc={'figure.figsize': (15, 12)})

# np.int / np.float were removed from NumPy; select by dtype name instead
df1 = temp_features.select_dtypes(include=['int', 'float']).fillna(-5)
for i, col in enumerate(df1.columns):
    plt.figure(i)
    sns_plot = sns.distplot(df1[col])
    fig = sns_plot.get_figure()
    #    Please change the path according to your need
    fig.savefig(r'T:\\tbase\\plots\\' + col + '_PreImputation.png')

# =============================================================================
# #Get summary for features and for each individual class labels
# =============================================================================
from pandas_summary import DataFrameSummary
df_summary = DataFrameSummary(temp_features)
#Saving the summary in a CSV file
temp_features.describe(
).transpose().join(df_summary.columns_stats.transpose()).to_csv(
    r'T:\tbase\short\feature_stats.csv')  #Please change the path accordingly

#Failure class
feature_distribution_failure = temp_features.loc[
    temp_features['Shortterm_TransplantOutcome'] == 1]
df_summary_distribution_failure = DataFrameSummary(
    feature_distribution_failure)
feature_distribution_failure.describe().transpose().join(
    df_summary_distribution_failure.columns_stats.transpose()).to_csv(
        r'T:\tbase\short\feature_stats_failure.csv'
    )  #Please change the path accordingly
#Success class
Example #26
    def test_pandas_summary_on_csv_df(self):
        #: this works great!
        cdfs = DataFrameSummary(self.cdf)
        print(cdfs.get_numeric_summary())
Example #27
    def test_pandas_summary_on_excel_df(self):
        #: FIXME: this returns an error
        xdfs = DataFrameSummary(self.xdf)
        print(xdfs.get_numeric_summary())
Example #28
df = df.loc[df['Outlier'] == False]
print("df after outlier removal", df.shape)

"""# End of check"""

cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen', 'Promo2Weeks', 
            'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear', 'State', 
            'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_bool_fw', 'StateHoliday_bool_bw', 'SchoolHoliday_fw', 'SchoolHoliday_bw','Open']

#embeddings_model.hdf5'
cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day','StateHoliday','State','StoreType','Assortment']

#cat_vars = ['Store', 'DayOfWeek','StoreType','Year', 'Month', 'Day', 'State']
len(cat_vars)

uniques = DataFrameSummary(df[cat_vars]).summary().loc[['uniques']]

contin_vars = ['CompetitionDistance', 
   'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC', 'Precipitationmm',
   'Max_Humidity', 'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h', 
   'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE',
   'AfterStateHoliday_bool', 'BeforeStateHoliday_bool', 'Promo', 'SchoolHoliday', 'StateHoliday_bool']
# contin_vars = []

#embeddings_model.hdf5'
contin_vars = ['CompetitionDistance','Promo','Max_TemperatureC','BeforeStateHoliday_bool']
len(contin_vars)

y_out_columns = ['Sales']

df_train = df[df.Date < datetime.datetime(2015, 7, 1)]  
Example #29
def check_data_completeness(df):
    df_summary = DataFrameSummary(df).summary()
    for col in df_summary.columns:
        assert df_summary.loc['missing', col] == 0, f'{col} has missing values'
Example #30
    'id': pd.read_csv('{}/store_id_relation.csv'.format(DATA_DIR)),
    'tes': pd.read_csv('{}/sample_submission.csv'.format(DATA_DIR)),
    'hol': pd.read_csv('{}/date_info.csv'.format(DATA_DIR))
}
#%%
for tbl, df in data.items():
    print(tbl)
    display(df.head())

#%%

#%%
test_df = utils.tes2trn(data['tes'])
test_df.visit_date = pd.to_datetime(test_df.visit_date)
display(test_df.head())
display(DataFrameSummary(test_df).summary())

#

#%%
""" 
test dataset:
2017-04-23 to 2017-05-31
39 unique days in test day
821 air_store_id

trn dataset:
478 unique days
829 stores

store not in test:
Example #31
# 

# In[17]:


for t in tables: display(t.head())


# This is very representative of a typical industry dataset.

# The following returns summarized aggregate information for each table, across each field.

# In[41]:


for t in tables: display(DataFrameSummary(t).summary())


# ## Data Cleaning / Feature Engineering

# As a structured data problem, we necessarily have to go through all the cleaning and feature engineering, even though we're using a neural network.

# In[44]:


train, store, store_states, state_names, googletrend, weather, test = tables


# In[45]:

Example #32
def get_summary(df):
    return DataFrameSummary(df)
Example #33
# coding: utf-8
__title__ = 'data_eda'
__author__ = 'JieYuan'
__mtime__ = '2018/2/13'

from pandas_summary import DataFrameSummary

# 描述性统计
summary = lambda x: DataFrameSummary(x).summary().transpose()
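A quick sketch of the one-liner in use (the toy frame is an assumption); transpose() yields one row per original column, with the summary statistics as columns.

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': ['x', 'x', 'y', 'z']})
print(summary(df))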
Example #34
    def describe_column(self, column, dataset="train"):
        """
        Analyzes a column and reports descriptive statistics about the columns.

        Credits go to @mouradmourafiq for his pandas-summary library.

        Statistics
        ----------
        std                                      
        max                                      
        min                                      
        variance                                 
        mean
        mode                                     
        5%                                       
        25%                                      
        50%                                      
        75%                                      
        95%                                      
        iqr                                      
        kurtosis                                 
        skewness                                 
        sum                                      
        mad                                      
        cv                                       
        zeros_num                                
        zeros_perc                               
        deviating_of_mean                        
        deviating_of_mean_perc                   
        deviating_of_median                      
        deviating_of_median_perc                 
        top_correlations                         
        counts                                   
        uniques                                  
        missing                                  
        missing_perc                             
        types                            
        
        Parameters
        ----------
        column : str
            Column in your dataset you want to analyze.

        dataset : str, optional
            Type of dataset to describe. Can either be `train` or `test`.
            If you are using the full dataset it will automatically describe
            your full dataset no matter the input, 
            by default 'train'
        
        Returns
        -------
        dict
            Dictionary mapping a statistic and its value for a specific column
            
        Examples
        --------
        >>> data.describe_column('col1')
        """

        if dataset == "train":
            x_train_summary = DataFrameSummary(self.x_train)

            return x_train_summary[column]
        else:
            x_test_summary = DataFrameSummary(self.x_test)

            return x_test_summary[column]
Example #35
class DataFrameSummaryTest(unittest.TestCase):

    def setUp(self):
        self.size = 1000
        missing = ([np.nan] * (self.size // 10) + list(range(10)) *
                   ((self.size - self.size // 10) // 10))
        shuffle(missing)

        self.types = [DataFrameSummary.TYPE_NUMERIC, DataFrameSummary.TYPE_BOOL,
                      DataFrameSummary.TYPE_CATEGORICAL, DataFrameSummary.TYPE_CONSTANT,
                      DataFrameSummary.TYPE_UNIQUE, DataFrameSummary.TYPE_DATE]

        self.columns = ['dbool1', 'dbool2', 'duniques', 'dcategoricals', 'dnumerics1', 'dnumerics2',
                        'dnumerics3', 'dmissing', 'dconstant', 'ddates']

        self.df = pd.DataFrame(dict(
            dbool1=np.random.choice([0, 1], size=self.size),
            dbool2=np.random.choice(['a', 'b'], size=self.size),
            duniques=['x{}'.format(i) for i in range(self.size)],
            dcategoricals=['a'.format(i) if i % 2 == 0 else
                           'b'.format(i) if i % 3 == 0 else
                           'c'.format(i) for i in range(self.size)],
            dnumerics1=range(self.size),
            dnumerics2=range(self.size,  2 * self.size),
            dnumerics3=list(range(self.size - self.size // 10)) + list(range(-self.size // 10, 0)),
            dmissing=missing,
            dconstant=['a'] * self.size,
            ddates=pd.date_range('2010-01-01', periods=self.size, freq='1M')))

        self.dfs = DataFrameSummary(self.df)

    def test_get_columns_works_as_expected(self):
        assert len(self.dfs.get_columns(self.df, DataFrameSummary.ALL)) == 10

        assert len(self.dfs.get_columns(self.df,
                                        DataFrameSummary.INCLUDE,
                                        ['dnumerics1', 'dnumerics2', 'dnumerics3'])) == 3

        assert len(self.dfs.get_columns(self.df,
                                        DataFrameSummary.EXCLUDE,
                                        ['dnumerics1', 'dnumerics2', 'dnumerics3'])) == 7

    def test_column_types_works_as_expected(self):
        expected = pd.Series(index=self.types, data=[
                             4, 2, 1, 1, 1, 1], name='types')
        assert_series_equal(
            self.dfs.columns_types[self.types], expected[self.types])

    def test_column_stats_works_as_expected(self):
        column_stats = self.dfs.columns_stats
        self.assertTupleEqual(column_stats.shape, (5, 10))

        # counts
        expected = pd.Series(index=self.columns,
                             data=self.size,
                             name='counts',
                             dtype='object')
        expected['dmissing'] -= 100
        assert_series_equal(column_stats[self.columns].loc['counts'],
                            expected[self.columns])

        # uniques
        expected = pd.Series(index=self.columns,
                             data=self.size,
                             name='uniques',
                             dtype='object')
        expected[['dbool1', 'dbool2']] = 2
        expected[['dcategoricals']] = 3
        expected[['dconstant']] = 1
        expected[['dmissing']] = 10
        assert_series_equal(column_stats[self.columns].loc['uniques'].sort_index(),
                            expected[self.columns].sort_index(), check_dtype=False)

        # missing
        expected = pd.Series(index=self.columns,
                             data=0,
                             name='missing',
                             dtype='object')
        expected[['dmissing']] = 100
        assert_series_equal(column_stats[self.columns].loc['missing'],
                            expected[self.columns],
                            check_dtype=False)

        # missing_perc
        expected = pd.Series(index=self.columns,
                             data=['0%'] * 10,
                             name='missing_perc',
                             dtype='object')

        expected[['dmissing']] = '10%'
        assert_series_equal(column_stats[self.columns].loc['missing_perc'],
                            expected[self.columns])

        # types
        expected = pd.Series(index=self.columns,
                             data=[np.nan] * 10,
                             name='types',
                             dtype='object')

        expected[['dbool1', 'dbool2']] = DataFrameSummary.TYPE_BOOL
        expected[['dcategoricals']] = DataFrameSummary.TYPE_CATEGORICAL
        expected[['dconstant']] = DataFrameSummary.TYPE_CONSTANT
        expected[['ddates']] = DataFrameSummary.TYPE_DATE
        expected[['duniques']] = DataFrameSummary.TYPE_UNIQUE
        expected[['dnumerics1', 'dnumerics2',
                  'dnumerics3', 'dmissing']] = DataFrameSummary.TYPE_NUMERIC
        assert_series_equal(column_stats[self.columns].loc['types'],
                            expected[self.columns])

    def test_numer_format_works_as_expected(self):
        float_nums = [(123.123, '123.12'),
                      (123.1243453, '123.12'),
                      (213213213.123, '213,213,213.12')]
        int_nums = [(213214, '213,214'),
                    (123213.00, '123,213')]

        for num, expected in float_nums:
            self.assertEqual(DataFrameSummary._number_format(num), expected)

        for num, expected in int_nums:
            self.assertEqual(DataFrameSummary._number_format(num), expected)

    def test_get_perc_works_as_expected(self):
        float_nums = [(0.123, '12.30%'),
                      (3.1243453, '312.43%'),
                      (213.12312, '21,312.31%')]

        int_nums = [(0.14, '14%'),
                    (1.300, '130%')]

        for num, expected in float_nums:
            self.assertEqual(DataFrameSummary._percent(num), expected)

        for num, expected in int_nums:
            self.assertEqual(DataFrameSummary._percent(num), expected)

    def test_uniques_summary(self):
        expected = pd.Series(index=['counts', 'uniques', 'missing', 'missing_perc', 'types'],
                             data=[self.size, self.size, 0, '0%',
                                   DataFrameSummary.TYPE_UNIQUE],
                             name='duniques',
                             dtype=object)
        assert_series_equal(self.dfs['duniques'], expected)

    def test_constant_summary(self):
        self.assertEqual(self.dfs['dconstant'], 'This is a constant value: a')

    def test_bool1_summary(self):
        count_values = self.df['dbool1'].value_counts()
        total_count = self.df['dbool1'].count()
        count0 = count_values[0]
        count1 = count_values[1]
        perc0 = DataFrameSummary._percent(count0 / total_count)
        perc1 = DataFrameSummary._percent(count1 / total_count)
        expected = pd.Series(index=['"0" count', '"0" perc', '"1" count', '"1" perc',
                                    'counts', 'uniques', 'missing', 'missing_perc', 'types'],
                             data=[str(count0), perc0, str(count1), perc1,
                                   self.size, 2, 0, '0%', DataFrameSummary.TYPE_BOOL],
                             name='dbool1',
                             dtype=object).sort_index()

        assert_series_equal(self.dfs['dbool1'].sort_index(), expected)

    def test_bool2_summary(self):
        count_values = self.df['dbool2'].value_counts()
        total_count = self.df['dbool2'].count()
        count0 = count_values['a']
        count1 = count_values['b']
        perc0 = DataFrameSummary._percent(count0 / total_count)
        perc1 = DataFrameSummary._percent(count1 / total_count)
        expected = pd.Series(index=['"a" count', '"a" perc', '"b" count', '"b" perc',
                                    'counts', 'uniques', 'missing', 'missing_perc', 'types'],
                             data=[str(count0), perc0, str(count1), perc1,
                                   self.size, 2, 0, '0%', DataFrameSummary.TYPE_BOOL],
                             name='dbool2',
                             dtype=object)

        assert_series_equal(self.dfs['dbool2'],
                            expected)

    def test_categorical_summary(self):
        expected = pd.Series(index=['top',
                                    'counts', 'uniques', 'missing', 'missing_perc', 'types'],
                             data=['a: 500',
                                   self.size, 3, 0, '0%', DataFrameSummary.TYPE_CATEGORICAL],
                             name='dcategoricals',
                             dtype=object)

        assert_series_equal(self.dfs['dcategoricals'], expected)

    def test_dates_summary(self):
        dmin = self.df['ddates'].min()
        dmax = self.df['ddates'].max()
        expected = pd.Series(index=['max', 'min', 'range',
                                    'counts', 'uniques', 'missing', 'missing_perc', 'types'],
                             data=[dmax, dmin, dmax - dmin,
                                   self.size, self.size, 0, '0%', DataFrameSummary.TYPE_DATE],
                             name='ddates',
                             dtype=object).sort_index()

        tmp = self.dfs['ddates'].sort_index()
        assert_series_equal(tmp, expected)

    def test_numerics_summary(self):
        num1 = self.df['dnumerics1']
        dm, dmp = self.dfs._get_deviation_of_mean(num1)
        dam, damp = self.dfs._get_median_absolute_deviation(num1)
        expected = pd.Series(index=['mean', 'std', 'variance', 'min', 'max', '5%', '25%', '50%',
                                    '75%', '95%', 'iqr', 'kurtosis', 'skewness', 'sum', 'mad', 'cv',
                                    'zeros_num', 'zeros_perc', 'deviating_of_mean',
                                    'deviating_of_mean_perc', 'deviating_of_median',
                                    'deviating_of_median_perc', 'top_correlations', 'counts',
                                    'uniques', 'missing', 'missing_perc', 'types'],
                             data=[num1.mean(), num1.std(), num1.var(), num1.min(), num1.max(),
                                   num1.quantile(0.05), num1.quantile(
                                       0.25), num1.quantile(0.5),
                                   num1.quantile(0.75), num1.quantile(0.95),
                                   num1.quantile(0.75) - num1.quantile(0.25),
                                   num1.kurt(), num1.skew(), num1.sum(), num1.mad(),
                                   num1.std() / num1.mean() if num1.mean() else np.nan,
                                   self.size - np.count_nonzero(num1),
                                   DataFrameSummary._percent(
                                       (self.size - np.count_nonzero(num1))/self.size),
                                   dm, dmp, dam, damp, 'dnumerics2: 100%', self.size, self.size,
                                   0, '0%', DataFrameSummary.TYPE_NUMERIC],
                             name='dnumerics1',
                             dtype=object)

        assert_series_equal(self.dfs['dnumerics1'], expected)
Example #36
class DataFrameSummaryTest(unittest.TestCase):
    """
    Test the new methods added by Alfonso R. Reyes.
    Dataframe has been expanded to show more columns of the same type.
    Needed for the summary.
    """
    def setUp(self):
        self.size = 1000
        missing = [np.nan] * (self.size // 10) + list(range(10)) * (
            (self.size - self.size // 10) // 10)
        shuffle(missing)

        self.types = [
            DataFrameSummary.TYPE_NUMERIC, DataFrameSummary.TYPE_BOOL,
            DataFrameSummary.TYPE_CATEGORICAL, DataFrameSummary.TYPE_CONSTANT,
            DataFrameSummary.TYPE_UNIQUE, DataFrameSummary.TYPE_DATE
        ]

        self.columns = [
            'dbool1', 'dbool2', 'duniques1', 'duniques2', 'dcategoricals1',
            'dcategoricals2', 'dnumerics1', 'dnumerics2', 'dnumerics3',
            'dmissing', 'dconstant', 'ddates1', 'ddates2'
        ]

        self.df = pd.DataFrame(
            dict(
                dbool1=np.random.choice([0, 1], size=self.size),
                dbool2=np.random.choice(['a', 'b'], size=self.size),
                duniques1=['x{}'.format(i) for i in range(self.size)],
                duniques2=['y{}'.format(i) for i in range(self.size)],
                dcategoricals1=[
                    'a'.format(i) if i % 2 == 0 else 'b'.format(i) if i %
                    3 == 0 else 'c'.format(i) for i in range(self.size)
                ],
                dcategoricals2=[
                    'x'.format(i) if i % 2 == 0 else 'y'.format(i) if i %
                    3 == 0 else 'z'.format(i) for i in range(self.size)
                ],
                dnumerics1=range(self.size),
                dnumerics2=range(self.size, 2 * self.size),
                dnumerics3=list(range(self.size - self.size // 10)) +
                list(range(-self.size // 10, 0)),
                dmissing=missing,
                dconstant=['a'] * self.size,
                ddates1=pd.date_range('2010-01-01',
                                      periods=self.size,
                                      freq='1M'),
                ddates2=pd.date_range('2000-01-01',
                                      periods=self.size,
                                      freq='1W'),
            ))

        self.dfs = DataFrameSummary(self.df)

    def test_columns_stats(self):
        """
        Test the columns_stats instance variable and the columns of the test dataframe.
        :return:
        """
        columns_stats = self.dfs.columns_stats
        print(type(columns_stats))
        self.assertIsInstance(columns_stats, pd.core.frame.DataFrame)
        expected = [
            'dbool1', 'dbool2', 'dcategoricals1', 'dcategoricals2',
            'dconstant', 'ddates1', 'ddates2', 'dmissing', 'dnumerics1',
            'dnumerics2', 'dnumerics3', 'duniques1', 'duniques2'
        ]
        result = columns_stats.columns.tolist()
        print(result)
        self.assertEqual(expected, result)

    def test__is_all_numeric_false(self):
        """
        Test that not all the columns provided in the list are "numeric".
        It must return "False"
        :return:
        """
        columns = [
            'dbool1', 'dbool2', 'dcategoricals', 'dconstant', 'ddates',
            'dmissing', 'dnumerics1', 'dnumerics2', 'dnumerics3', 'duniques'
        ]
        result = self.dfs._is_all_numeric(columns)
        print(result)
        self.assertFalse(result)

    def test__is_all_numeric_true(self):
        """
        Test that all columns passed are "numeric".
        It must be "True"
        :return:
        """
        columns = ['dnumerics1', 'dnumerics2', 'dnumerics3']
        result = self.dfs._is_all_numeric(columns)
        print(result)
        self.assertTrue(result)

    def test__is_all_numeric_true_missing(self):
        """
        The numeric columns provided this time include NaNs.
        It must be "True"
        :return:
        """
        #: includes missing nan column, which is numeric as well
        columns = ['dnumerics1', 'dnumerics2', 'dnumerics3', 'dmissing']
        result = self.dfs._is_all_numeric(columns)
        print(result)
        self.assertTrue(result)

    def test__get_list_of_type_numeric(self):
        """
        Test that a list of numeric columns matches the test dataframe
        :return:
        """
        expected = ['dmissing', 'dnumerics1', 'dnumerics2', 'dnumerics3']
        result = self.dfs._get_list_of_type("numeric")
        print(result)
        print(self.dfs[result])
        self.assertTrue(expected == result)

    def test__get_list_of_type_numeric_generic(self):
        """
        Test that all the columns returning are all of the same `numeric` type
        :return:
        """
        the_type = "numeric"
        columns = self.dfs._get_list_of_type(the_type)
        frame = self.dfs[columns]
        print(frame)
        types = frame.loc['types']  # .ix was removed from pandas
        set_of_types = set(types.tolist())
        result = the_type in set_of_types
        print(result)
        self.assertTrue(result)

    def test_get_numeric_summary(self):
        """
        Test that the column types reduce to a single `numeric` value.
        :return:
        """
        frame = self.dfs.get_numeric_summary()
        print(frame)
        result = self.dfs.TYPE_NUMERIC in set(frame.loc['types'])
        print(result)
        self.assertTrue(result)

    def test__get_list_of_type_boolean(self):
        """
        Test that boolean columns match the type `bool`
        :return:
        """
        expected = ['dbool1', 'dbool2']
        result = self.dfs._get_list_of_type("bool")
        print(result)
        self.assertTrue(expected == result)

    def test_show_dataframe_per_type(self):
        """
        Shows a column, one by one grouping by column type
        :return:
        """
        for column in self.types:
            print(column)
            columns = self.dfs._get_list_of_type(column)
            # print(self.dfs[columns])
            list_of = columns
            for col in list_of:
                print(self.dfs[col])

    def test__get_list_of_type_bool_generic(self):
        """
        This documents OLD behavior, now corrected.
        There was a problem when the list of columns specified was not numeric:
        dfs[columns] could return a list of the columns' values, which is not
        what we are looking for.
        """
        the_type = "bool"
        columns = self.dfs._get_list_of_type(the_type)
        print(columns)
        df = self.dfs[['dbool1', 'dbool2']]
        print(df)
        self.assertTrue(df.shape[1] == 2)

    def test_get_all_series_bool(self):
        """
        Test that boolean summaries return the same number of rows.
        With the new behavior the number of rows must be 9 for booleans.
        :return:
        """
        list_of = ['dbool1', 'dbool2']
        for col in list_of:
            ser = self.dfs[col]
            print(ser)
            print(ser.shape[0])
            self.assertTrue(ser.shape[0] == 9)

    def test_show_columns_types(self):
        """
        Test that the columns in the test dataframe is a subset of the class variable "types"
        :return:
        """
        self.assertTrue(
            set(self.dfs.columns_types.index).issubset(self.dfs.types))

    def test__is_type_the_same_bool(self):
        """
        Test that the columns passed are of the same type
        :return:
        """
        columns = ['dbool1', 'dbool2']
        list_of_types = self.dfs._is_type_the_same(columns)
        self.assertTrue(list_of_types)

    def test__is_type_the_same_many_false(self):
        """
        Tests that the columns passed are NOT all of the same type
        :return:
        """
        columns = ['dbool1', 'dbool2', 'dnumerics1']
        list_of_types = self.dfs._is_type_the_same(columns)
        self.assertFalse(list_of_types)

    def test__is_type_the_same_numeric(self):
        """
        Test that the columns passed are all of the same type.
        :return:
        """
        columns = ['dnumerics1', 'dnumerics2', 'dnumerics3', 'dmissing']
        list_of_types = self.dfs._is_type_the_same(columns)
        self.assertTrue(list_of_types)

    def test_get_all_the_same_unique(self):
        """
        Test that the unique columns passed are all unique
        :return:
        """
        columns = ['duniques1', 'duniques2']
        self.assertTrue(
            set(self.dfs[columns].loc['types'].tolist()) == {'unique'})

    def test_get_all_the_same_numeric(self):
        """
        Test that all the numeric columns are all numeric
        """
        columns = ['dnumerics1', 'dnumerics2', 'dnumerics3', 'dmissing']
        self.assertTrue(
            set(self.dfs[columns].loc['types'].tolist()) == {'numeric'})

    def test_get_all_the_same_categorical(self):
        """
        Tests that all categorical columns reduce to `categorical`
        :return:
        """
        columns = ['dcategoricals1', 'dcategoricals2']
        self.assertTrue(
            set(self.dfs[columns].loc['types'].tolist()) == {'categorical'})

    def test_get_all_the_same_dates(self):
        """
        Test that all the date columns reduce to the unique type `date`.
        :return:
        """
        columns = ['ddates1', 'ddates2']
        self.assertTrue(
            set(self.dfs[columns].loc['types'].tolist()) == {'date'})