def create_or_load(train_path):
    """Reads dataset from a CSV file or previously saved binary formats."""
    temp_dir = train_path.parent / 'tmp'
    temp_dir.mkdir(parents=True, exist_ok=True)

    temp_data = temp_dir / 'data.feather'
    if temp_data.exists():
        print(f'Loading previously saved training data: {temp_data}')
        data = feather.read_dataframe(temp_data)
    else:
        print(f'Reading the CSV file with training data: {train_path}')
        data = pd.read_csv(train_path, low_memory=False)
        print('Saving data frame into feather file...')
        data.to_feather(temp_data)

    temp_summary = temp_dir / 'summary.pickle'
    if temp_summary.exists():
        print(f'Loading previously saved summary: {temp_summary}')
        state = pickle.load(temp_summary.open('rb'))
        summary = DataFrameSummary(pd.DataFrame())
        summary.__dict__.update(state)
    else:
        print('Generating summary statistics')
        summary = DataFrameSummary(data)
        print('Saving summary into pickle file...')
        with temp_summary.open('wb') as file:
            state = {'length': summary.length,
                     'columns_stats': summary.columns_stats,
                     'corr': summary.corr}
            pickle.dump(state, file)

    return data, summary
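# A minimal usage sketch for create_or_load, assuming the module-level imports
# below and a hypothetical CSV at data/train.csv; the first call parses the
# CSV and writes the Feather/pickle cache, later calls just reload it.
import pickle
from pathlib import Path

import feather  # provided by the feather-format package
import pandas as pd
from pandas_summary import DataFrameSummary

data, summary = create_or_load(Path('data/train.csv'))
print(summary.columns_stats)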
def test_get_list_of_type(self):
    xdfs = DataFrameSummary(self.xdf)
    the_type = "numeric"
    columns = xdfs._get_list_of_type(the_type)
    print(columns)
    # xdfs.get_numeric_summary()
    cols = [x.encode('ascii') for x in columns]
    print(cols)
def test_clean_column_on_excel(self):
    xdfs = DataFrameSummary(self.xdf)
    xdf_columns = self.xdf.columns.tolist()
    print(xdfs._clean_column(xdf_columns[0]))
    for x in xdf_columns:
        # print(xdfs._clean_column(x))
        self.assertTrue(xdfs._clean_column(x))
def test_numer_format_works_as_expected(self):
    float_nums = [(123.123, '123.12'), (123.1243453, '123.12'),
                  (213213213.123, '213,213,213.12')]
    int_nums = [(213214, '213,214'), (123213.00, '123,213')]
    for num, expected in float_nums:
        self.assertEqual(DataFrameSummary._number_format(num), expected)
    for num, expected in int_nums:
        self.assertEqual(DataFrameSummary._number_format(num), expected)
def test_get_perc_works_as_expected(self):
    float_nums = [(0.123, '12.30%'), (3.1243453, '312.43%'),
                  (213.12312, '21,312.31%')]
    int_nums = [(0.14, '14%'), (1.300, '130%')]
    for num, expected in float_nums:
        self.assertEqual(DataFrameSummary._percent(num), expected)
    for num, expected in int_nums:
        self.assertEqual(DataFrameSummary._percent(num), expected)
def test_bool1_summary(self):
    count_values = self.df['dbool1'].value_counts()
    total_count = self.df['dbool1'].count()
    count0 = count_values[0]
    count1 = count_values[1]
    perc0 = DataFrameSummary._percent(count0 / total_count)
    perc1 = DataFrameSummary._percent(count1 / total_count)
    expected = pd.Series(index=['"0" count', '"0" perc', '"1" count',
                                '"1" perc', 'counts', 'uniques', 'missing',
                                'missing_perc', 'types'],
                         data=[str(count0), perc0, str(count1), perc1,
                               self.size, 2, 0, '0%',
                               DataFrameSummary.TYPE_BOOL],
                         name='dbool1',
                         dtype=object).sort_index()
    assert_series_equal(self.dfs['dbool1'].sort_index(), expected)
def check_data_completeness(df):
    """Automated test to ensure data is complete and has no missing values."""
    df_summary = DataFrameSummary(df).summary()
    for col in df_summary.columns:
        assert df_summary.loc['missing', col] == 0, f'{col} has missing values'
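# A hedged usage sketch for check_data_completeness; both dataframes below are
# illustrative, not from the original project.
import numpy as np
import pandas as pd
from pandas_summary import DataFrameSummary

complete = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'y']})
check_data_completeness(complete)  # passes: no column has missing values

incomplete = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': ['x', 'y', 'y']})
try:
    check_data_completeness(incomplete)
except AssertionError as err:
    print(err)  # -> a has missing values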
def cleaningFeatures(pData, pRelMaxMissing=0.75):
    vDataSummary = DataFrameSummary(pData)  # collect feature statistics
    vCleanedData = pData.copy()
    vColumns = vDataSummary.columns_stats  # list of columns
    for fea in vColumns:
        vTemp = vDataSummary.columns_stats[fea]  # stats of a single feature
        # print(fea[0:5])
        if fea == 'NewID':  # drop the ID column
            del vCleanedData[fea]
        elif fea[0] == 'M' and fea[1] == 'C':
            # drop MC truth columns starting with 'MC'
            del vCleanedData[fea]
        elif fea[0] == 'I' and fea[1] == '3':
            # drop MC truth columns starting with 'I3'
            del vCleanedData[fea]
        elif fea[0:6] == 'Weight':
            # drop MC truth columns starting with 'Weight'
            del vCleanedData[fea]
        elif fea[0:7] == 'Corsika':
            # drop MC truth columns starting with 'Corsika'
            del vCleanedData[fea]
        elif (vTemp[2] / (vTemp[0] + vTemp[2])) >= pRelMaxMissing:
            # drop columns with too many missing entries
            del vCleanedData[fea]
        elif vTemp[1] <= 1:  # drop constant columns
            del vCleanedData[fea]
    return vCleanedData
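# A hedged sketch of calling cleaningFeatures; 'sim_df' is a hypothetical
# dataframe. columns_stats rows are ordered counts (0), uniques (1),
# missing (2), missing_perc (3), types (4), so vTemp[2] / (vTemp[0] + vTemp[2])
# above is the fraction of missing entries in a column.
import numpy as np
import pandas as pd
from pandas_summary import DataFrameSummary

sim_df = pd.DataFrame({
    'NewID': range(5),                               # dropped: ID column
    'MCEnergy': [1.0, 2.0, 3.0, 4.0, 5.0],           # dropped: 'MC' truth column
    'feat1': [1.0, np.nan, np.nan, np.nan, np.nan],  # dropped: 80% missing
    'feat2': [0.1, 0.2, 0.3, 0.4, 0.5],              # kept
})
print(cleaningFeatures(sim_df).columns.tolist())  # expected: ['feat2']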
def test_numerics_summary(self):
    num1 = self.df['dnumerics1']
    dm, dmp = self.dfs._get_deviation_of_mean(num1)
    dam, damp = self.dfs._get_median_absolute_deviation(num1)
    expected = pd.Series(
        index=['mean', 'std', 'variance', 'min', 'max', '5%', '25%', '50%',
               '75%', '95%', 'iqr', 'kurtosis', 'skewness', 'sum', 'mad',
               'cv', 'zeros_num', 'zeros_perc', 'deviating_of_mean',
               'deviating_of_mean_perc', 'deviating_of_median',
               'deviating_of_median_perc', 'top_correlations', 'counts',
               'uniques', 'missing', 'missing_perc', 'types'],
        data=[num1.mean(), num1.std(), num1.var(), num1.min(), num1.max(),
              num1.quantile(0.05), num1.quantile(0.25), num1.quantile(0.5),
              num1.quantile(0.75), num1.quantile(0.95),
              num1.quantile(0.75) - num1.quantile(0.25),
              num1.kurt(), num1.skew(), num1.sum(), num1.mad(),
              num1.std() / num1.mean() if num1.mean() else np.nan,
              self.size - np.count_nonzero(num1),
              DataFrameSummary._percent(
                  (self.size - np.count_nonzero(num1)) / self.size),
              dm, dmp, dam, damp, 'dnumerics2: 100%', self.size, self.size,
              0, '0%', DataFrameSummary.TYPE_NUMERIC],
        name='dnumerics1',
        dtype=object)
    assert_series_equal(self.dfs['dnumerics1'], expected)
def setUp(self):
    self.size = 1000
    missing = ([np.nan] * (self.size // 10) +
               list(range(10)) * ((self.size - self.size // 10) // 10))
    shuffle(missing)
    self.types = [DataFrameSummary.TYPE_NUMERIC, DataFrameSummary.TYPE_BOOL,
                  DataFrameSummary.TYPE_CATEGORICAL,
                  DataFrameSummary.TYPE_CONSTANT,
                  DataFrameSummary.TYPE_UNIQUE, DataFrameSummary.TYPE_DATE]
    self.columns = ['dbool1', 'dbool2', 'duniques', 'dcategoricals',
                    'dnumerics1', 'dnumerics2', 'dnumerics3', 'dmissing',
                    'dconstant', 'ddates']
    self.df = pd.DataFrame(dict(
        dbool1=np.random.choice([0, 1], size=self.size),
        dbool2=np.random.choice(['a', 'b'], size=self.size),
        duniques=['x{}'.format(i) for i in range(self.size)],
        dcategoricals=['a' if i % 2 == 0
                       else 'b' if i % 3 == 0
                       else 'c'
                       for i in range(self.size)],
        dnumerics1=range(self.size),
        dnumerics2=range(self.size, 2 * self.size),
        dnumerics3=(list(range(self.size - self.size // 10)) +
                    list(range(-self.size // 10, 0))),
        dmissing=missing,
        dconstant=['a'] * self.size,
        ddates=pd.date_range('2010-01-01', periods=self.size, freq='1M')))
    self.dfs = DataFrameSummary(self.df)
def loadtypes(data_df):
    summary_df = DataFrameSummary(data_df).summary()  # auto evaluate datatype
    contin_vars = [col for col in summary_df.columns
                   if summary_df.loc["types"][col] == 'numeric']
    bool_vars = [col for col in summary_df.columns
                 if summary_df.loc["types"][col] == 'bool']
    cat_vars = [col for col in summary_df.columns
                if summary_df.loc["types"][col] == 'categorical']
    dt_vars = [col for col in summary_df.columns
               if summary_df.loc["types"][col] == 'date']
    const_vars = [col for col in summary_df.columns
                  if summary_df.loc["types"][col] == 'constant']
    text_vars = []
    for var in ['DEPT_CODE']:
        contin_vars.remove(var)
        cat_vars.append(var)
    for var in ['OP_NAME', 'INHOS_DIAG_NAME']:
        cat_vars.remove(var)
        text_vars.append(var)
    cat_vars.extend(bool_vars)
    return contin_vars, cat_vars, dt_vars, text_vars, const_vars
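# A hedged usage sketch for loadtypes; this demo frame is hypothetical, and it
# must contain DEPT_CODE, OP_NAME and INHOS_DIAG_NAME for the manual
# reassignments above to succeed.
import pandas as pd
from pandas_summary import DataFrameSummary

demo = pd.DataFrame({
    'DEPT_CODE': [101, 102, 101, 103, 102, 101],              # numeric -> recoded as categorical
    'OP_NAME': ['opA', 'opB', 'opA', 'opC', 'opB', 'opA'],    # categorical -> text
    'INHOS_DIAG_NAME': ['d1', 'd2', 'd1', 'd3', 'd2', 'd1'],  # categorical -> text
    'AGE': [34.5, 61.0, 47.2, 55.1, 29.9, 41.3],              # stays continuous
})
contin_vars, cat_vars, dt_vars, text_vars, const_vars = loadtypes(demo)
print(contin_vars)  # ['AGE']
print(cat_vars)     # ['DEPT_CODE']
print(text_vars)    # ['OP_NAME', 'INHOS_DIAG_NAME']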
def datacompleteness(df):
    """
    Takes a dataframe df and returns summary statistics for each column,
    including missing values.
    """
    df_summary = DataFrameSummary(df).summary()
    return df_summary
def setUp(self):
    self.size = 1000
    missing = [np.nan] * (self.size // 10) + list(range(10)) * (
        (self.size - self.size // 10) // 10)
    shuffle(missing)
    self.types = [
        DataFrameSummary.TYPE_NUMERIC, DataFrameSummary.TYPE_BOOL,
        DataFrameSummary.TYPE_CATEGORICAL, DataFrameSummary.TYPE_CONSTANT,
        DataFrameSummary.TYPE_UNIQUE, DataFrameSummary.TYPE_DATE
    ]
    self.columns = [
        'dbool1', 'dbool2', 'duniques1', 'duniques2', 'dcategoricals1',
        'dcategoricals2', 'dnumerics1', 'dnumerics2', 'dnumerics3',
        'dmissing', 'dconstant', 'ddates1', 'ddates2'
    ]
    self.df = pd.DataFrame(
        dict(
            dbool1=np.random.choice([0, 1], size=self.size),
            dbool2=np.random.choice(['a', 'b'], size=self.size),
            duniques1=['x{}'.format(i) for i in range(self.size)],
            duniques2=['y{}'.format(i) for i in range(self.size)],
            dcategoricals1=[
                'a' if i % 2 == 0 else 'b' if i % 3 == 0 else 'c'
                for i in range(self.size)
            ],
            dcategoricals2=[
                'x' if i % 2 == 0 else 'y' if i % 3 == 0 else 'z'
                for i in range(self.size)
            ],
            dnumerics1=range(self.size),
            dnumerics2=range(self.size, 2 * self.size),
            dnumerics3=list(range(self.size - self.size // 10)) +
            list(range(-self.size // 10, 0)),
            dmissing=missing,
            dconstant=['a'] * self.size,
            ddates1=pd.date_range('2010-01-01', periods=self.size,
                                  freq='1M'),
            ddates2=pd.date_range('2000-01-01', periods=self.size,
                                  freq='1W'),
        ))
    self.dfs = DataFrameSummary(self.df)
def test_datacompleteness():
    """Takes a dataframe and checks for missing values."""
    df = pd.read_csv('exploration/data/titanic.csv',
                     usecols=['Name', 'Sex', 'Age', 'Survived'])
    df_summary = DataFrameSummary(df).summary()
    for col in df_summary.columns:
        assert df_summary.loc['missing', col] == 0, f'{col} has missing values'
def test_bool1_summary(self):
    count_values = self.df['dbool1'].value_counts()
    total_count = self.df['dbool1'].count()
    count0 = count_values[0]
    count1 = count_values[1]
    perc0 = DataFrameSummary._percent(count0 / total_count)
    perc1 = DataFrameSummary._percent(count1 / total_count)
    expected = pd.Series(index=[
        '"0" count', '"0" perc', '"1" count', '"1" perc', 'counts',
        'uniques', 'missing', 'missing_perc', 'types'
    ],
                         data=[
                             str(count0), perc0, str(count1), perc1,
                             self.size, 2, 0, '0%',
                             DataFrameSummary.TYPE_BOOL
                         ],
                         name='dbool1',
                         dtype=object)
    assert_series_equal(self.dfs['dbool1'], expected)
def column_info(self, dataset="train"): """ Describes your columns using the DataFrameSummary library with basic descriptive info. Credits go to @mouradmourafiq for his pandas-summary library. Info ---- counts uniques missing missing_perc types Parameters ---------- dataset : str, optional Type of dataset to describe. Can either be `train` or `test`. If you are using the full dataset it will automatically describe your full dataset no matter the input, by default 'train' Returns ------- DataFrame Dataframe describing your columns with basic descriptive info Examples --------- >>> data.column_info() """ if dataset == "train": x_train_summary = DataFrameSummary(self.x_train) return x_train_summary.columns_stats else: x_test_summary = DataFrameSummary(self.x_test) return x_test_summary.columns_stats
def describe(self, dataset="train"): """ Describes your dataset using the DataFrameSummary library with basic descriptive info. Extends the DataFrame.describe() method to give more info. Credits go to @mouradmourafiq for his pandas-summary library. Parameters ---------- dataset : str, optional Type of dataset to describe. Can either be `train` or `test`. If you are using the full dataset it will automatically describe your full dataset no matter the input, by default 'train' Returns ------- DataFrame Dataframe describing your dataset with basic descriptive info Examples --------- >>> data.describe() """ if dataset == "train": x_train_summary = DataFrameSummary(self.x_train) return x_train_summary.summary() else: x_test_summary = DataFrameSummary(self.x_test) return x_test_summary.summary()
def ka_display_muti_tables_summary(tables, table_names):
    '''display multi tables' summary

    Parameters
    ----------
    tables: list_like
        Pandas dataframes
    table_names: list_like
        names of each dataframe
    '''
    for t, t_name in zip(tables, table_names):
        print(t_name + ":")
        display(DataFrameSummary(t).summary())
def ka_display_muti_tables_summary(tables, table_names, n=5):
    '''display multi tables' summary

    Parameters
    ----------
    tables: list_like
        Pandas dataframes
    table_names: list_like
        names of each dataframe

    Return
    ------
    1. show head of data
    2. show column types of data
    3. show summary of data
    '''
    for t, t_name in zip(tables, table_names):
        print(t_name + ":", t.shape)
        ka_display_side_by_side(t.head(n=n),
                                _ka_display_col_type(t),
                                DataFrameSummary(t).summary())
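# A hedged usage sketch, assuming a Jupyter/IPython session where display()
# and the ka_display_side_by_side / _ka_display_col_type helpers above are
# defined; the two frames are illustrative.
import pandas as pd

train_df = pd.DataFrame({'Store': [1, 2, 3], 'Sales': [100.0, 250.0, 175.0]})
store_df = pd.DataFrame({'Store': [1, 2, 3], 'StoreType': ['a', 'b', 'a']})
ka_display_muti_tables_summary([train_df, store_df], ['train', 'store'], n=3)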
def test_numerics_summary(self):
    num1 = self.df['dnumerics1']
    dm, dmp = self.dfs._get_deviation_of_mean(num1)
    dam, damp = self.dfs._get_median_absolute_deviation(num1)
    expected = pd.Series(
        index=[
            'mean', 'std', 'variance', 'min', 'max', 'mode', '5%', '25%',
            '50%', '75%', '95%', 'iqr', 'kurtosis', 'skewness', 'sum',
            'mad', 'cv', 'zeros_num', 'zeros_perc', 'deviating_of_mean',
            'deviating_of_mean_perc', 'deviating_of_median',
            'deviating_of_median_perc', 'top_correlations', 'counts',
            'uniques', 'missing', 'missing_perc', 'types'
        ],
        data=[
            num1.mean(), num1.std(), num1.var(), num1.min(), num1.max(),
            num1.mode()[0], num1.quantile(0.05), num1.quantile(0.25),
            num1.quantile(0.5), num1.quantile(0.75), num1.quantile(0.95),
            num1.quantile(0.75) - num1.quantile(0.25),
            num1.kurt(), num1.skew(), num1.sum(), num1.mad(),
            num1.std() / num1.mean() if num1.mean() else np.nan,
            self.size - np.count_nonzero(num1),
            DataFrameSummary._percent(
                (self.size - np.count_nonzero(num1)) / self.size),
            dm, dmp, dam, damp, 'dnumerics2: 100%', self.size, self.size,
            0, '0%', DataFrameSummary.TYPE_NUMERIC
        ],
        name='dnumerics1',
        dtype=object)
    assert_series_equal(self.dfs['dnumerics1'], expected)
def replaceValues(pData, pTransform=False):
    vScaler = StandardScaler()
    vDataSummary = DataFrameSummary(pData)
    for col in pData.columns.values:
        if vDataSummary.columns_stats[col][4] != 'numeric':
            # Replace all non-numeric values with numeric codes and fill
            # invalid entries (NaN, etc.) with 0.
            vEleList = pData[col].value_counts().index.values
            vReplacements = np.linspace(1, len(vEleList), len(vEleList))
            pData[col] = pData[col].replace(to_replace=vEleList,
                                            value=vReplacements)
            pData[col] = pData[col].fillna(0)
        elif pData[col].dtype == 'int64':
            # Fill invalid entries in integer columns with 0.
            pData[col] = pData[col].fillna(0)
            # pData[col] = vScaler.fit_transform(pData[col])
        else:
            # Fill continuous entries with the column mean.
            pData[col] = pData[col].fillna(pData[col].mean())
            # pData[col] = vScaler.fit_transform(pData[col])
    return pData
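# A hedged sketch of replaceValues on a small frame ('demo' is hypothetical;
# numpy, pandas, DataFrameSummary and StandardScaler are assumed to be
# imported at module level). columns_stats[col][4] is the 'types' row, so
# string columns take the first branch and are recoded 1..N by frequency,
# while numeric columns only get their NaNs filled.
import numpy as np
import pandas as pd

demo = pd.DataFrame({
    'color': ['red', 'blue', 'red', None, 'green', 'red'],  # -> codes 1., 2., 3.; NaN -> 0
    'height': [1.70, np.nan, 1.60, 1.90, 1.80, 1.75],       # -> NaN replaced by column mean
})
print(replaceValues(demo))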
    # Extract the patient's covariates as a numpy array
    x_df = df.drop([event_col, time_col], axis=1)
    x = x_df.values.astype(np.float32)

    # Return the deep surv dataframe
    return x, {'e': e, 't': t}


# Please set the path of the SQL export CSV file below
temp_features = pd.read_csv(r'', sep=',')

# =============================================================================
# Get summary for features and for each individual class label
# =============================================================================
from pandas_summary import DataFrameSummary

df_summary = DataFrameSummary(temp_features)
temp_features.describe().transpose().join(
    df_summary.columns_stats.transpose()).to_csv(r'T:\tbase\feature_stats.csv')

# =============================================================================
# Remove constant features
# =============================================================================
for index, row in df_summary.columns_stats.transpose()[
        df_summary.columns_stats.transpose()['types'].str.lower().str.contains(
            'constant')].iterrows():
    print('Removed column ' + index + ' (constant)')
    temp_features.drop([index], axis=1, inplace=True)

print('The shape of our features is:', temp_features.shape)
temp_features.describe()
feature_list = list(temp_features.columns)
import matplotlib.pyplot as plt

sns.set(rc={'figure.figsize': (15, 12)})
df1 = temp_features.select_dtypes([np.int64, np.float64]).fillna(-5)
for i, col in enumerate(df1.columns):
    plt.figure(i)
    sns_plot = sns.distplot(df1[col])
    fig = sns_plot.get_figure()
    # Please change the path according to your need
    fig.savefig(r'T:\\tbase\\plots\\' + col + '_PreImputation.png')

# =============================================================================
# Get summary for features and for each individual class label
# =============================================================================
from pandas_summary import DataFrameSummary

df_summary = DataFrameSummary(temp_features)
# Saving the summary in a CSV file
temp_features.describe().transpose().join(
    df_summary.columns_stats.transpose()).to_csv(
        r'T:\tbase\short\feature_stats.csv')  # Please change the path accordingly

# Failure class
feature_distribution_failure = temp_features.loc[
    temp_features['Shortterm_TransplantOutcome'] == 1]
df_summary_distribution_failure = DataFrameSummary(
    feature_distribution_failure)
feature_distribution_failure.describe().transpose().join(
    df_summary_distribution_failure.columns_stats.transpose()).to_csv(
        r'T:\tbase\short\feature_stats_failure.csv')  # Please change the path accordingly

# Success class
def test_pandas_summary_on_csv_df(self):
    #: this works great!
    cdfs = DataFrameSummary(self.cdf)
    print(cdfs.get_numeric_summary())
def test_pandas_summary_on_excel_df(self):
    #: fixme: this returns error
    xdfs = DataFrameSummary(self.xdf)
    print(xdfs.get_numeric_summary())
df = df.loc[df['Outlier'] == False]
print("df after outlier removal:", df.shape)

"""# End of outlier check"""

cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday',
            'CompetitionMonthsOpen', 'Promo2Weeks', 'StoreType', 'Assortment',
            'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',
            'State', 'Week', 'Events', 'Promo_fw', 'Promo_bw',
            'StateHoliday_bool_fw', 'StateHoliday_bool_bw',
            'SchoolHoliday_fw', 'SchoolHoliday_bw', 'Open']

# embeddings_model.hdf5
cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday',
            'State', 'StoreType', 'Assortment']
# cat_vars = ['Store', 'DayOfWeek', 'StoreType', 'Year', 'Month', 'Day', 'State']
len(cat_vars)

uniques = DataFrameSummary(df[cat_vars]).summary().loc[['uniques']]

contin_vars = ['CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC',
               'Min_TemperatureC', 'Precipitationmm', 'Max_Humidity',
               'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h',
               'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE',
               'AfterStateHoliday_bool', 'BeforeStateHoliday_bool', 'Promo',
               'SchoolHoliday', 'StateHoliday_bool']
# contin_vars = []

# embeddings_model.hdf5
contin_vars = ['CompetitionDistance', 'Promo', 'Max_TemperatureC',
               'BeforeStateHoliday_bool']
len(contin_vars)

y_out_columns = ['Sales']
df_train = df[df.Date < datetime.datetime(2015, 7, 1)]
def check_data_completeness(df):
    df_summary = DataFrameSummary(df).summary()
    for col in df_summary.columns:
        assert df_summary.loc['missing', col] == 0, f'{col} has missing values'
    'id': pd.read_csv('{}/store_id_relation.csv'.format(DATA_DIR)),
    'tes': pd.read_csv('{}/sample_submission.csv'.format(DATA_DIR)),
    'hol': pd.read_csv('{}/date_info.csv'.format(DATA_DIR))
}

#%%
for tbl, df in data.items():
    print(tbl)
    display(df.head())

#%%
test_df = utils.tes2trn(data['tes'])
test_df.visit_date = pd.to_datetime(test_df.visit_date)
display(test_df.head())
display(DataFrameSummary(test_df).summary())

#%%
"""
test dataset: 2017-04-23 to 2017-05-31
39 unique days in test
821 air_store_id

trn dataset:
478 unique days
829 stores

store not in test:
# In[17]:

for t in tables:
    display(t.head())

# This is very representative of a typical industry dataset.

# The following returns summarized aggregate information for each table
# across each field.

# In[41]:

for t in tables:
    display(DataFrameSummary(t).summary())

# ## Data Cleaning / Feature Engineering

# As a structured data problem, we necessarily have to go through all the
# cleaning and feature engineering, even though we're using a neural network.

# In[44]:

train, store, store_states, state_names, googletrend, weather, test = tables

# In[45]:
def get_summary(df):
    return DataFrameSummary(df)
# coding: utf-8
__title__ = 'data_eda'
__author__ = 'JieYuan'
__mtime__ = '2018/2/13'

from pandas_summary import DataFrameSummary

# Descriptive statistics
summary = lambda x: DataFrameSummary(x).summary().transpose()
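# A hedged usage sketch of the summary helper above; 'df' is illustrative.
# The transpose puts one column per row, with its statistics as the row values.
import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': ['x', 'y', 'y', 'x']})
print(summary(df))  # rows: a, b; columns: counts, uniques, missing, types, ...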
def describe_column(self, column, dataset="train"): """ Analyzes a column and reports descriptive statistics about the columns. Credits go to @mouradmourafiq for his pandas-summary library. Statistics ---------- std max min variance mean mode 5% 25% 50% 75% 95% iqr kurtosis skewness sum mad cv zeros_num zeros_perc deviating_of_mean deviating_of_mean_perc deviating_of_median deviating_of_median_perc top_correlations counts uniques missing missing_perc types Parameters ---------- column : str Column in your dataset you want to analze. dataset : str, optional Type of dataset to describe. Can either be `train` or `test`. If you are using the full dataset it will automatically describe your full dataset no matter the input, by default 'train' Returns ------- dict Dictionary mapping a statistic and its value for a specific column Examples -------- >>> data.describe_column('col1') """ if dataset == "train": x_train_summary = DataFrameSummary(self.x_train) return x_train_summary[column] else: x_test_summary = DataFrameSummary(self.x_test) return x_test_summary[column]
class DataFrameSummaryTest(unittest.TestCase):

    def setUp(self):
        self.size = 1000
        missing = ([np.nan] * (self.size // 10) +
                   list(range(10)) * ((self.size - self.size // 10) // 10))
        shuffle(missing)
        self.types = [DataFrameSummary.TYPE_NUMERIC,
                      DataFrameSummary.TYPE_BOOL,
                      DataFrameSummary.TYPE_CATEGORICAL,
                      DataFrameSummary.TYPE_CONSTANT,
                      DataFrameSummary.TYPE_UNIQUE,
                      DataFrameSummary.TYPE_DATE]
        self.columns = ['dbool1', 'dbool2', 'duniques', 'dcategoricals',
                        'dnumerics1', 'dnumerics2', 'dnumerics3', 'dmissing',
                        'dconstant', 'ddates']
        self.df = pd.DataFrame(dict(
            dbool1=np.random.choice([0, 1], size=self.size),
            dbool2=np.random.choice(['a', 'b'], size=self.size),
            duniques=['x{}'.format(i) for i in range(self.size)],
            dcategoricals=['a' if i % 2 == 0
                           else 'b' if i % 3 == 0
                           else 'c'
                           for i in range(self.size)],
            dnumerics1=range(self.size),
            dnumerics2=range(self.size, 2 * self.size),
            dnumerics3=(list(range(self.size - self.size // 10)) +
                        list(range(-self.size // 10, 0))),
            dmissing=missing,
            dconstant=['a'] * self.size,
            ddates=pd.date_range('2010-01-01', periods=self.size, freq='1M')))
        self.dfs = DataFrameSummary(self.df)

    def test_get_columns_works_as_expected(self):
        assert len(self.dfs.get_columns(self.df, DataFrameSummary.ALL)) == 10
        assert len(self.dfs.get_columns(
            self.df, DataFrameSummary.INCLUDE,
            ['dnumerics1', 'dnumerics2', 'dnumerics3'])) == 3
        assert len(self.dfs.get_columns(
            self.df, DataFrameSummary.EXCLUDE,
            ['dnumerics1', 'dnumerics2', 'dnumerics3'])) == 7

    def test_column_types_works_as_expected(self):
        expected = pd.Series(index=self.types, data=[4, 2, 1, 1, 1, 1],
                             name='types')
        assert_series_equal(self.dfs.columns_types[self.types],
                            expected[self.types])

    def test_column_stats_works_as_expected(self):
        column_stats = self.dfs.columns_stats
        self.assertTupleEqual(column_stats.shape, (5, 10))

        # counts
        expected = pd.Series(index=self.columns, data=self.size,
                             name='counts', dtype='object')
        expected['dmissing'] -= 100
        assert_series_equal(column_stats[self.columns].loc['counts'],
                            expected[self.columns])

        # uniques
        expected = pd.Series(index=self.columns, data=self.size,
                             name='uniques', dtype='object')
        expected[['dbool1', 'dbool2']] = 2
        expected[['dcategoricals']] = 3
        expected[['dconstant']] = 1
        expected[['dmissing']] = 10
        assert_series_equal(
            column_stats[self.columns].loc['uniques'].sort_index(),
            expected[self.columns].sort_index(),
            check_dtype=False)

        # missing
        expected = pd.Series(index=self.columns, data=0,
                             name='missing', dtype='object')
        expected[['dmissing']] = 100
        assert_series_equal(column_stats[self.columns].loc['missing'],
                            expected[self.columns],
                            check_dtype=False)

        # missing_perc
        expected = pd.Series(index=self.columns, data=['0%'] * 10,
                             name='missing_perc', dtype='object')
        expected[['dmissing']] = '10%'
        assert_series_equal(column_stats[self.columns].loc['missing_perc'],
                            expected[self.columns])

        # types
        expected = pd.Series(index=self.columns, data=[np.nan] * 10,
                             name='types', dtype='object')
        expected[['dbool1', 'dbool2']] = DataFrameSummary.TYPE_BOOL
        expected[['dcategoricals']] = DataFrameSummary.TYPE_CATEGORICAL
        expected[['dconstant']] = DataFrameSummary.TYPE_CONSTANT
        expected[['ddates']] = DataFrameSummary.TYPE_DATE
        expected[['duniques']] = DataFrameSummary.TYPE_UNIQUE
        expected[['dnumerics1', 'dnumerics2',
                  'dnumerics3', 'dmissing']] = DataFrameSummary.TYPE_NUMERIC
        assert_series_equal(column_stats[self.columns].loc['types'],
                            expected[self.columns])

    def test_numer_format_works_as_expected(self):
        float_nums = [(123.123, '123.12'), (123.1243453, '123.12'),
                      (213213213.123, '213,213,213.12')]
        int_nums = [(213214, '213,214'), (123213.00, '123,213')]
        for num, expected in float_nums:
            self.assertEqual(DataFrameSummary._number_format(num), expected)
        for num, expected in int_nums:
            self.assertEqual(DataFrameSummary._number_format(num), expected)

    def test_get_perc_works_as_expected(self):
        float_nums = [(0.123, '12.30%'), (3.1243453, '312.43%'),
                      (213.12312, '21,312.31%')]
        int_nums = [(0.14, '14%'), (1.300, '130%')]
        for num, expected in float_nums:
            self.assertEqual(DataFrameSummary._percent(num), expected)
        for num, expected in int_nums:
            self.assertEqual(DataFrameSummary._percent(num), expected)

    def test_uniques_summary(self):
        expected = pd.Series(index=['counts', 'uniques', 'missing',
                                    'missing_perc', 'types'],
                             data=[self.size, self.size, 0, '0%',
                                   DataFrameSummary.TYPE_UNIQUE],
                             name='duniques',
                             dtype=object)
        assert_series_equal(self.dfs['duniques'], expected)

    def test_constant_summary(self):
        self.assertEqual(self.dfs['dconstant'], 'This is a constant value: a')

    def test_bool1_summary(self):
        count_values = self.df['dbool1'].value_counts()
        total_count = self.df['dbool1'].count()
        count0 = count_values[0]
        count1 = count_values[1]
        perc0 = DataFrameSummary._percent(count0 / total_count)
        perc1 = DataFrameSummary._percent(count1 / total_count)
        expected = pd.Series(index=['"0" count', '"0" perc', '"1" count',
                                    '"1" perc', 'counts', 'uniques',
                                    'missing', 'missing_perc', 'types'],
                             data=[str(count0), perc0, str(count1), perc1,
                                   self.size, 2, 0, '0%',
                                   DataFrameSummary.TYPE_BOOL],
                             name='dbool1',
                             dtype=object).sort_index()
        assert_series_equal(self.dfs['dbool1'].sort_index(), expected)

    def test_bool2_summary(self):
        count_values = self.df['dbool2'].value_counts()
        total_count = self.df['dbool2'].count()
        count0 = count_values['a']
        count1 = count_values['b']
        perc0 = DataFrameSummary._percent(count0 / total_count)
        perc1 = DataFrameSummary._percent(count1 / total_count)
        expected = pd.Series(index=['"a" count', '"a" perc', '"b" count',
                                    '"b" perc', 'counts', 'uniques',
                                    'missing', 'missing_perc', 'types'],
                             data=[str(count0), perc0, str(count1), perc1,
                                   self.size, 2, 0, '0%',
                                   DataFrameSummary.TYPE_BOOL],
                             name='dbool2',
                             dtype=object)
        assert_series_equal(self.dfs['dbool2'], expected)

    def test_categorical_summary(self):
        expected = pd.Series(index=['top', 'counts', 'uniques', 'missing',
                                    'missing_perc', 'types'],
                             data=['a: 500', self.size, 3, 0, '0%',
                                   DataFrameSummary.TYPE_CATEGORICAL],
                             name='dcategoricals',
                             dtype=object)
        assert_series_equal(self.dfs['dcategoricals'], expected)

    def test_dates_summary(self):
        dmin = self.df['ddates'].min()
        dmax = self.df['ddates'].max()
        expected = pd.Series(index=['max', 'min', 'range', 'counts',
                                    'uniques', 'missing', 'missing_perc',
                                    'types'],
                             data=[dmax, dmin, dmax - dmin, self.size,
                                   self.size, 0, '0%',
                                   DataFrameSummary.TYPE_DATE],
                             name='ddates',
                             dtype=object).sort_index()
        tmp = self.dfs['ddates'].sort_index()
        assert_series_equal(tmp, expected)

    def test_numerics_summary(self):
        num1 = self.df['dnumerics1']
        dm, dmp = self.dfs._get_deviation_of_mean(num1)
        dam, damp = self.dfs._get_median_absolute_deviation(num1)
        expected = pd.Series(
            index=['mean', 'std', 'variance', 'min', 'max', '5%', '25%',
                   '50%', '75%', '95%', 'iqr', 'kurtosis', 'skewness', 'sum',
                   'mad', 'cv', 'zeros_num', 'zeros_perc',
                   'deviating_of_mean', 'deviating_of_mean_perc',
                   'deviating_of_median', 'deviating_of_median_perc',
                   'top_correlations', 'counts', 'uniques', 'missing',
                   'missing_perc', 'types'],
            data=[num1.mean(), num1.std(), num1.var(), num1.min(),
                  num1.max(), num1.quantile(0.05), num1.quantile(0.25),
                  num1.quantile(0.5), num1.quantile(0.75),
                  num1.quantile(0.95),
                  num1.quantile(0.75) - num1.quantile(0.25),
                  num1.kurt(), num1.skew(), num1.sum(), num1.mad(),
                  num1.std() / num1.mean() if num1.mean() else np.nan,
                  self.size - np.count_nonzero(num1),
                  DataFrameSummary._percent(
                      (self.size - np.count_nonzero(num1)) / self.size),
                  dm, dmp, dam, damp, 'dnumerics2: 100%', self.size,
                  self.size, 0, '0%', DataFrameSummary.TYPE_NUMERIC],
            name='dnumerics1',
            dtype=object)
        assert_series_equal(self.dfs['dnumerics1'], expected)
class DataFrameSummaryTest(unittest.TestCase):
    """
    Test the new methods added by Alfonso R. Reyes.
    The dataframe has been expanded to show more columns of the same type,
    as needed for the summary.
    """

    def setUp(self):
        self.size = 1000
        missing = [np.nan] * (self.size // 10) + list(range(10)) * (
            (self.size - self.size // 10) // 10)
        shuffle(missing)
        self.types = [
            DataFrameSummary.TYPE_NUMERIC, DataFrameSummary.TYPE_BOOL,
            DataFrameSummary.TYPE_CATEGORICAL, DataFrameSummary.TYPE_CONSTANT,
            DataFrameSummary.TYPE_UNIQUE, DataFrameSummary.TYPE_DATE
        ]
        self.columns = [
            'dbool1', 'dbool2', 'duniques1', 'duniques2', 'dcategoricals1',
            'dcategoricals2', 'dnumerics1', 'dnumerics2', 'dnumerics3',
            'dmissing', 'dconstant', 'ddates1', 'ddates2'
        ]
        self.df = pd.DataFrame(
            dict(
                dbool1=np.random.choice([0, 1], size=self.size),
                dbool2=np.random.choice(['a', 'b'], size=self.size),
                duniques1=['x{}'.format(i) for i in range(self.size)],
                duniques2=['y{}'.format(i) for i in range(self.size)],
                dcategoricals1=[
                    'a' if i % 2 == 0 else 'b' if i % 3 == 0 else 'c'
                    for i in range(self.size)
                ],
                dcategoricals2=[
                    'x' if i % 2 == 0 else 'y' if i % 3 == 0 else 'z'
                    for i in range(self.size)
                ],
                dnumerics1=range(self.size),
                dnumerics2=range(self.size, 2 * self.size),
                dnumerics3=list(range(self.size - self.size // 10)) +
                list(range(-self.size // 10, 0)),
                dmissing=missing,
                dconstant=['a'] * self.size,
                ddates1=pd.date_range('2010-01-01', periods=self.size,
                                      freq='1M'),
                ddates2=pd.date_range('2000-01-01', periods=self.size,
                                      freq='1W'),
            ))
        self.dfs = DataFrameSummary(self.df)

    def test_columns_stats(self):
        """
        Test the columns_stats instance variable and the columns of the
        test dataframe.
        """
        columns_stats = self.dfs.columns_stats
        print(type(columns_stats))
        self.assertIsInstance(columns_stats, pd.core.frame.DataFrame)
        expected = [
            'dbool1', 'dbool2', 'dcategoricals1', 'dcategoricals2',
            'dconstant', 'ddates1', 'ddates2', 'dmissing', 'dnumerics1',
            'dnumerics2', 'dnumerics3', 'duniques1', 'duniques2'
        ]
        result = columns_stats.columns.tolist()
        print(result)
        self.assertEqual(expected, result)

    def test__is_all_numeric_false(self):
        """
        Test that not all the columns provided in the list are "numeric".
        It must return "False".
        """
        columns = [
            'dbool1', 'dbool2', 'dcategoricals', 'dconstant', 'ddates',
            'dmissing', 'dnumerics1', 'dnumerics2', 'dnumerics3', 'duniques'
        ]
        result = self.dfs._is_all_numeric(columns)
        print(result)
        self.assertFalse(result)

    def test__is_all_numeric_true(self):
        """
        Test that all columns passed are "numeric". It must be "True".
        """
        columns = ['dnumerics1', 'dnumerics2', 'dnumerics3']
        result = self.dfs._is_all_numeric(columns)
        print(result)
        self.assertTrue(result)

    def test__is_all_numeric_true_missing(self):
        """
        Numeric columns provided this time include NaNs.
        It must be "True".
        """
        #: includes missing nan column, which is numeric as well
        columns = ['dnumerics1', 'dnumerics2', 'dnumerics3', 'dmissing']
        result = self.dfs._is_all_numeric(columns)
        print(result)
        self.assertTrue(result)

    def test__get_list_of_type_numeric(self):
        """
        Test that the list of numeric columns matches the test dataframe.
        """
        expected = ['dmissing', 'dnumerics1', 'dnumerics2', 'dnumerics3']
        result = self.dfs._get_list_of_type("numeric")
        print(result)
        print(self.dfs[result])
        self.assertTrue(expected == result)

    def test__get_list_of_type_numeric_generic(self):
        """
        Test that all the columns returned are of the same `numeric` type.
        """
        the_type = "numeric"
        columns = self.dfs._get_list_of_type(the_type)
        frame = self.dfs[columns]
        print(frame)
        types = frame.loc['types']
        set_of_types = set(types.tolist())
        result = the_type in set_of_types
        print(result)
        self.assertTrue(result)

    def test_get_numeric_summary(self):
        """
        Test that the column types reduce to a unique numeric value and match.
        """
        frame = self.dfs.get_numeric_summary()
        print(frame)
        result = self.dfs.TYPE_NUMERIC in set(frame.loc['types'])
        print(result)
        self.assertTrue(result)

    def test__get_list_of_type_boolean(self):
        """
        Test that boolean columns match the type `bool`.
        """
        expected = ['dbool1', 'dbool2']
        result = self.dfs._get_list_of_type("bool")
        print(result)
        self.assertTrue(expected == result)

    def test_show_dataframe_per_type(self):
        """
        Shows the columns one by one, grouped by column type.
        """
        for column in self.types:
            print(column)
            columns = self.dfs._get_list_of_type(column)
            # print(self.dfs[columns])
            list_of = columns
            for col in list_of:
                print(self.dfs[col])

    def test__get_list_of_type_bool_generic(self):
        """
        This is an OLD behavior, now corrected. There was a problem when the
        list of columns specified is not numeric: what dfs[columns] returned
        could be a list of the column values, which is not what we are
        looking for.
        """
        the_type = "bool"
        columns = self.dfs._get_list_of_type(the_type)
        print(columns)
        df = self.dfs[['dbool1', 'dbool2']]
        print(df)
        self.assertTrue(df.shape[1] == 2)

    def test_get_all_series_bool(self):
        """
        Test that the boolean summaries return the same number of rows.
        With the new behavior the number of rows must be 9 for booleans.
        """
        list_of = ['dbool1', 'dbool2']
        for col in list_of:
            ser = self.dfs[col]
            print(ser)
            print(ser.shape[0])
            self.assertTrue(ser.shape[0] == 9)

    def test_show_columns_types(self):
        """
        Test that the column types in the test dataframe are a subset of the
        class variable "types".
        """
        self.assertTrue(
            set(self.dfs.columns_types.index).issubset(self.dfs.types))

    def test__is_type_the_same_bool(self):
        """
        Test that the columns passed are of the same type.
        """
        columns = ['dbool1', 'dbool2']
        list_of_types = self.dfs._is_type_the_same(columns)
        self.assertTrue(list_of_types)

    def test__is_type_the_same_many_false(self):
        """
        Test that the columns passed are NOT all of the same type.
        """
        columns = ['dbool1', 'dbool2', 'dnumerics1']
        list_of_types = self.dfs._is_type_the_same(columns)
        self.assertFalse(list_of_types)

    def test__is_type_the_same_numeric(self):
        """
        Test that the columns passed are all of the same type.
        """
        columns = ['dnumerics1', 'dnumerics2', 'dnumerics3', 'dmissing']
        list_of_types = self.dfs._is_type_the_same(columns)
        self.assertTrue(list_of_types)

    def test_get_all_the_same_unique(self):
        """
        Test that the unique columns passed are all unique.
        """
        columns = ['duniques1', 'duniques2']
        self.assertTrue(
            set(self.dfs[columns].loc['types'].tolist()) == {'unique'})

    def test_get_all_the_same_numeric(self):
        """
        Test that all the numeric columns are all numeric.
        """
        columns = ['dnumerics1', 'dnumerics2', 'dnumerics3', 'dmissing']
        self.assertTrue(
            set(self.dfs[columns].loc['types'].tolist()) == {'numeric'})

    def test_get_all_the_same_categorical(self):
        """
        Test that all categorical columns reduce to `categorical`.
        """
        columns = ['dcategoricals1', 'dcategoricals2']
        self.assertTrue(
            set(self.dfs[columns].loc['types'].tolist()) == {'categorical'})

    def test_get_all_the_same_dates(self):
        """
        Test that all the date columns reduce to a unique type `date`.
        """
        columns = ['ddates1', 'ddates2']
        self.assertTrue(
            set(self.dfs[columns].loc['types'].tolist()) == {'date'})