/
utils.py
201 lines (181 loc) · 7.83 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
from openpyxl import load_workbook
import pandas as pd
import numpy as np
import os
import os.path as osp
from IPython.display import display_html
from sklearn.metrics import confusion_matrix
from urllib.request import urlretrieve
import matplotlib.pyplot as plt
import seaborn as sns
def read_excel_in_range(ch_path, fname, sheet_name, start_cell, end_cell):
file_path = osp.join(ch_path, fname)
wb = load_workbook(file_path, read_only=True)
ws = wb[sheet_name]
data_rows = []
for row in ws[start_cell:end_cell]:
data_cols = []
for cell in row:
data_cols.append(cell.value)
data_rows.append(data_cols)
return pd.DataFrame(data_rows[1:], columns=data_rows[0])
def draw_corr_heatmap(cov_matrix, cols):
plt.figure(figsize=(6,6), facecolor='w')
sns.set(font_scale=1.2)
hm = sns.heatmap(cov_matrix,
cmap='coolwarm',
annot=True,
square=True,
fmt='.2f',
yticklabels=cols,
xticklabels=cols)
plt.tight_layout()
plt.show()
def display_side_by_side(dfs: list, names=None, descriptions=None):
if names is None:
names = []
html_str = ''
if names is not None:
html_str += ('<tr>' +
''.join(f'<td style="text-align:center">{name}</td>' for name in names) +
'</tr>')
html_str += ('<tr>' +
''.join(f'<td style="vertical-align:top"> {df.to_html()}</td>'
for df in dfs) +
'</tr>')
if descriptions is not None:
html_str += ('<tr>' +
''.join(f'<td>{description}</td>' for description in descriptions) +
'</tr>')
html_str = f'<table>{html_str}</table>'
html_str = html_str.replace('table', 'table style="display:inline"')
display_html((html_str, ), raw=True)
def classifier_measurement_under_threshs(y_true, y_score, threshs: list):
ratios = pd.DataFrame(columns=threshs, index=['Accuracy', 'TPR', 'TNR', 'FPR', 'Precision', 'F-Score',
'F_0.5-Score', 'F_2-Score',
'tp', 'fn', 'fp', 'tn'])
for threshold in threshs:
is_greater = (pd.DataFrame(y_score)[1] > threshold).astype(int)
tp, fn, fp, tn = np.flip(confusion_matrix(y_true, is_greater, normalize='all')).ravel() * 100
accuracy = (tp+tn)/(tp+fn+fp+tn) * 100
TPR = tp/(tp+fn) * 100
TNR = tn/(fp+tn) * 100
FPR = fp/(fp+tn) * 100
precision = tp/(tp+fp) * 100
fscore = 2*precision*TPR/(precision+TPR)
f05score = 1.25*precision*TPR/(0.25*precision+TPR)
f2score = 5*precision*TPR/(4*precision+TPR)
ratios.loc[:, threshold] = [accuracy, TPR, TNR, FPR, precision, fscore, f05score, f2score, tp, fn, fp, tn]
return ratios.round(2)
class DataLoader:
data = {
"iowa_reduced_scaled": {
"url": "http://www-2.rotman.utoronto.ca/~hull/Second%20edition%20Online%20Files/Iowa%20House%20Price"
"%20Regression%20(Ch3)/Houseprice_data_scaled.csv",
"description": "iowa house price data with features selected & scaled."
},
"iowa_original": {
"url": "http://www-2.rotman.utoronto.ca/~hull/Second%20edition%20Online%20Files/Iowa%20House%20Price"
"%20Regression%20(Ch3)/Original_Data.xlsx",
"description": "iowa house price data with all features & not scaled.",
"excel_kwargs": {
"sheet_name": "Sheet1",
"start_cell": "A4",
"end_cell": "CC2912"
}
},
"age_salary_url": {
"url": "http://www-2.rotman.utoronto.ca/~hull/Second%20edition%20Online%20Files/Salary%20vs%20Age"
"%20Example%20(Chs%201,%203,%206)/Salary%20vs.%20Age%20Example.xlsx",
"description": "age salary data with train/val/test scattered on different sheet.",
"excel_kwargs": {
"train": {
"sheet_name": "Analysis of Training Set (Ch1)",
"start_cell": "B3",
"end_cell": "C13"
},
"val": {
"sheet_name": "Fit to Validation Set (Ch1)",
"start_cell": "B3",
"end_cell": "C13"
},
"test": {
"sheet_name": "Test Set Errors (Ch1)",
"start_cell": "B2",
"end_cell": "C12"
}
}
},
"lending_club_reduced_train_url": {
"url": "http://www-2.rotman.utoronto.ca/~hull/Second%20edition%20Online%20Files/Lending%20Club%20Logistic"
"%20Regression%20(Ch%203)/lendingclub_traindata.xlsx",
"description": "lending club train data with features selected.",
"excel_kwargs": {
"sheet_name": "Sheet1",
"start_cell": "A1",
"end_cell": "E8696"
}
},
"lending_club_reduced_test_url": {
"url": "http://www-2.rotman.utoronto.ca/~hull/Second%20edition%20Online%20Files/Lending%20Club%20Logistic"
"%20Regression%20(Ch%203)/lendingclub_testdata.xlsx",
"description": "lending club test data with features selected.",
"excel_kwargs": {
"sheet_name": "Sheet1",
"start_cell": "A1",
"end_cell": "E8696"
}
},
"lending_club_full_url": {
"url": "http://www-2.rotman.utoronto.ca/~hull/Second%20edition%20Online%20Files/Lending%20Club%20Logistic"
"%20Regression%20(Ch%203)/lending_clubFull_Data_Set.xlsx",
"description": "lending club all features full data",
"excel_kwargs": {
"sheet_name": "Sheet1",
"start_cell": "A1",
"end_cell": "EE25001"
}
},
"country_risk_url": {
"url": "http://www-2.rotman.utoronto.ca/~hull/Second%20edition%20Online%20Files/Country%20risk%20Example"
"%20(Ch2)/countryriskdata.csv",
"description": "country risk data",
},
"new_2019_country_risk_url": {
"url": "http://www-2.rotman.utoronto.ca/~hull/Second%20edition%20Online%20Files/Country%20risk%20Example"
"%20(Ch2)/Country%20Risk%202019%20Data.xlsx",
"description": "new 2019 country risk data",
"excel_kwargs": {
"sheet_name": "Sheet1",
"start_cell": "A2",
"end_cell": "E124"
}
},
}
@classmethod
def list_available_data(cls):
for name in cls.data:
print("\t", name)
@classmethod
def show_description(cls, name):
print(cls.data[name]['description'])
@classmethod
def load_data(cls, name: str, sheet_name=''):
if name not in cls.data:
print("{} is not registered!".format(name))
return None
if not os.path.exists('data'):
os.makedirs('data')
type_of_the_file = cls.data[name]['url'].split('.')[-1]
file_name = '{}.{}'.format(name, type_of_the_file)
data_path = osp.join('data', file_name)
if not os.path.exists(data_path):
_ = urlretrieve(cls.data[name]['url'], data_path)
if type_of_the_file == 'csv':
return pd.read_csv(data_path)
elif type_of_the_file == 'xlsx' and sheet_name:
return read_excel_in_range('data', file_name, **cls.data[name]['excel_kwargs'][sheet_name])
elif type_of_the_file == 'xlsx':
return read_excel_in_range('data', file_name, **cls.data[name]['excel_kwargs'])
else:
return None