'''Collection of useful functions for data analysis
and machine learning modelling
'''
from typing import List
import re
import pandas as pd
import numpy as np
from scipy import stats
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype
# display() is a Jupyter builtin; import it explicitly so print_full also
# works when this module is used outside a notebook (requires IPython).
from IPython.display import display


def summarize_cats(df: pd.DataFrame) -> pd.DataFrame:
    '''Create table summarizing categorical variables'''
    print(f"Dataset Shape: {df.shape}")
    summary = pd.DataFrame(df.dtypes, columns=['dtypes'])
    summary = summary.reset_index()
    summary['Column Name'] = summary['index']
    summary = summary[['Column Name', 'dtypes']]
    summary['Missing'] = df.isnull().sum().values
    summary['Uniques'] = df.nunique().values
    for name in summary['Column Name'].value_counts().index:
        # List unique values
        list_uniques = [str(v) for v in df[name].unique()]
        summary.loc[summary['Column Name'] == name,
                    'Values'] = ' '.join(list_uniques)
        # Calculate entropy
        shares = df[name].value_counts(normalize=True)
        summary.loc[summary['Column Name'] == name, 'Entropy'] = round(
            stats.entropy(shares, base=2), 2)
    return summary
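

# Example usage of summarize_cats -- an illustrative sketch, not part of the
# original module; the toy DataFrame below is made up for demonstration:
#
#     cats = pd.DataFrame({"color": ["red", "blue", "red", None],
#                          "size": ["S", "M", "M", "L"]})
#     summarize_cats(cats)
#
# The returned table has one row per column with its dtype, missing count,
# unique count, the unique values joined into a string, and base-2 entropy.

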
def reduce_mem_usage(
    df: pd.DataFrame, cols_exclude: List[str] = []
) -> pd.DataFrame:
    '''Iterate through all the columns of a dataframe and modify
    the data type to reduce memory usage.
    Original code from
    https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin
    '''
    start_mem = df.memory_usage().sum() / 1024**2
    cols = [c for c in df.columns if c not in cols_exclude]
    print(
        "Reducing memory for the following columns: ",
        cols,
        sep='\n'
    )
    for col in cols:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        print(f"Reducing memory for {col}")
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min \
                        and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min \
                        and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min \
                        and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min \
                        and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                df[col] = df[col].astype(np.float32)
        else:
            df[col] = df[col].astype("category")
    end_mem = df.memory_usage().sum() / 1024**2
    print(
        f"Memory usage before: {start_mem:.2f} MB",
        f"Memory usage after: {end_mem:.2f} MB "
        f"({100 * (start_mem - end_mem) / start_mem:.1f}% decrease)",
        sep='\n'
    )
    return df
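

# Example usage of reduce_mem_usage -- an illustrative sketch, not part of the
# original module; the column name below is made up:
#
#     df = reduce_mem_usage(df, cols_exclude=["id"])
#
# Integer columns are downcast to the smallest fitting int type, other numeric
# columns are cast to float32, and object columns are converted to the pandas
# "category" dtype. Columns are modified in place on the passed DataFrame,
# which is also returned.

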
def print_full(df: pd.DataFrame, num_rows: int = 100) -> None:
    '''Print the first num_rows rows of dataframe in full
    Resets display options back to default after printing
    '''
    pd.set_option('display.max_rows', len(df))
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 2000)
    pd.set_option('display.float_format', '{:20,.2f}'.format)
    # None (rather than the deprecated -1) means no column width limit
    pd.set_option('display.max_colwidth', None)
    display(df.iloc[0:num_rows])
    pd.reset_option('display.max_rows')
    pd.reset_option('display.max_columns')
    pd.reset_option('display.width')
    pd.reset_option('display.float_format')
    pd.reset_option('display.max_colwidth')
    return None
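

# Minimal smoke test -- an illustrative sketch only, not part of the original
# module. The synthetic DataFrame below is made up for demonstration, and the
# print_full call relies on the IPython display import above.
if __name__ == "__main__":
    demo = pd.DataFrame({
        "color": ["red", "blue", "red", None],
        "count": np.array([1, 2, 3, 4], dtype=np.int64),
        "price": np.array([1.5, 2.5, 3.5, 4.5], dtype=np.float64),
    })
    # Summarize the categorical column, downcast dtypes, then pretty-print.
    print(summarize_cats(demo[["color"]]))
    demo = reduce_mem_usage(demo)
    print_full(demo, num_rows=5)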