def _dump_df_excel(obj, file, **kwargs):
    '''dump one or more 2D array-likes to an excel file, one sheet each

    obj : 2D array-like, or list of 2D array-likes
    file : str or file object
    '''
    writer = pd.ExcelWriter(file)
    obj = get_flat_list(obj)
    # default sheet names: sheet1, sheet2, ...
    sheet_name = kwargs.get('sheet_name')
    if sheet_name is None:
        sheet_name = ['sheet' + str(i + 1) for i in range(len(obj))]
    else:
        sheet_name = get_flat_list(sheet_name)
    check_consistent_length(obj, sheet_name)
    for data, name in zip(obj, sheet_name):
        try:
            data = pd.DataFrame(data)
            # keep only kwargs accepted by DataFrame.to_excel
            kw = get_kwargs(data.to_excel, **kwargs)
            kw.update({
                'sheet_name': name,
                'index': kwargs.get('index', False)
            })
            data.to_excel(writer, **kw)
        except Exception as e:
            print(repr(e))
            continue
    writer.save()
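# A minimal usage sketch for _dump_df_excel (not part of the original module;
# wrapped in a function so nothing runs on import). It assumes pandas is
# imported as pd at module level and that get_flat_list keeps DataFrames as
# single items.
def _example_dump_df_excel():
    import numpy as np
    df1 = pd.DataFrame(np.arange(6).reshape(3, 2), columns=['a', 'b'])
    df2 = pd.DataFrame(np.arange(8).reshape(4, 2), columns=['x', 'y'])
    # one sheet per frame; obj and sheet_name must have the same length
    _dump_df_excel([df1, df2], 'report.xlsx', sheet_name=['first', 'second'])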
def _get_scorer(self, scoring):
    '''return sklearn scorer dict, including custom scorers'''
    scorer = {}
    sk_scoring = []
    custom_scorer = get_custom_scorer()
    for i in get_flat_list(scoring):
        if i in custom_scorer:
            scorer.update({i: custom_scorer[i]})
        else:
            sk_scoring.append(i)
    if len(sk_scoring) > 0:
        s, _ = _validation._check_multimetric_scoring(self.estimator,
                                                      scoring=sk_scoring)
        scorer.update(s)
    return scorer
def _get_files(dirpath, suffix=None, subfolder=False):
    '''return file dict {filename : filepath}

    dirpath : str
        directory to traverse
    suffix : extension name, or list of extension names, e.g. ['.xlsx', '.csv']
        file extensions to include; default None includes all extensions
    subfolder : bool
        True to traverse subfolders, False to search only the given dirpath
    '''
    if subfolder:
        get_dirs = traverse_all_dirs
    else:
        get_dirs = traverse_dir
    rst = {
        k: v
        for k, v in get_dirs(dirpath).items()
        if os.path.splitext(v)[1] in get_flat_list(suffix) or not suffix
    }
    return rst
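# A minimal usage sketch for _get_files (hypothetical directory 'data_dir',
# not part of the original module). Suffixes are compared against
# os.path.splitext, so they must include the leading dot.
def _example_get_files():
    files = _get_files('data_dir', suffix=['.xlsx', '.csv'], subfolder=True)
    for name, path in files.items():
        print(name, '->', path)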
def plotter_score_path(df_score, title=None, cm=None, style='-.o'):
    '''plot the path of each numeric score column in df_score

    df_score : DataFrame of scores, one column per metric
    '''
    # plot
    data = df_score.select_dtypes(include='number')
    n = len(data.columns)
    i, j = plt.rcParams['figure.figsize']
    fig, ax = plt.subplots(n, 1, figsize=(i, j + 2.5 * (n // 2)))
    ax = get_flat_list(ax) if n == 1 else ax
    if cm is None:
        cm = plt.get_cmap('tab10')
    cmlist = [cm(i) for i in np.linspace(0, 1, n)]

    i = 0
    for ax0, col in zip(ax, data.columns):
        s = data[col]
        if api.is_numeric_dtype(s):
            s.plot(ax=ax0, color=cmlist[i], style=style)
            # shade +/- 1 std around the series, label with mean +/- std
            ax0.fill_between(s.index, s - s.std(), s + s.std(),
                             color='grey', alpha=.3,
                             label=r'{} = {}$\pm$ {}'.format(
                                 col, round(s.mean(), 4), round(s.std(), 4)))
            plt.setp(ax0, ylabel=col)
            h, l = ax0.get_legend_handles_labels()
            ax0.legend([h[-1]], [l[-1]])
        i += 1

    ax[0].set_title(title)
    ax[-1].set_xlabel('index')
    plt.tight_layout(rect=(0, 0, 0.98, 0.96))
    return fig
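# A minimal usage sketch for plotter_score_path on synthetic scores (the
# column names are arbitrary and only illustrate the expected layout of one
# metric per numeric column; not part of the original module).
def _example_plotter_score_path():
    rng = np.random.RandomState(0)
    df_score = pd.DataFrame({'roc_auc': 0.80 + 0.05 * rng.rand(10),
                             'KS': 0.40 + 0.05 * rng.rand(10)})
    fig = plotter_score_path(df_score, title='test scores')
    fig.savefig('score_path.pdf')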
def plotter_cv_results_(results, train_style='mo-', test_style='go-.',
                        title=None):
    '''plot cross-validated results of a univariate parameter grid search

    return
    -----
    ax, or tuple of ax
    '''
    scoring = results.filter(like='mean_train_').columns
    scoring = [i.replace('mean_train_', '') for i in scoring]
    df_param = results.filter(like='param_')
    param_array = df_param.columns
    if len(param_array) > 1:
        print('multi-parameter is encountered ... ')
        print(df_param.apply(lambda x: pd.Series(pd.unique(x))))

    # plot
    n = len(scoring)
    i, j = plt.rcParams['figure.figsize']
    fig, ax = plt.subplots(n, 1, figsize=(i, j + 2.5 * (n // 2)))
    ax = get_flat_list(ax) if n == 1 else ax
    for s, ax0 in zip(scoring, ax):
        df = results[['mean_train_' + s, 'mean_test_' + s,
                      'std_test_' + s]].copy()
        if len(param_array) == 1:
            df.index = results[param_array[0]]
            xlabel = param_array[0]
            num_param = api.is_numeric_dtype(df.index)
            if not num_param:
                df.index = np.arange(len(df.index))
        else:
            xlabel = ' + '.join([i.split('__')[-1] for i in param_array])
        df.sort_index(inplace=True)
        # plot
        mean = df['mean_test_' + s].values
        std = df.pop('std_test_' + s)
        x = df.index.values
        df.plot.line(style=[train_style, test_style], ax=ax0)
        ax0.fill_between(x, mean - std, mean + std,
                         color='grey', alpha=.2,
                         label=r'$\pm$ 1 std. dev.')
        # annotate
        x_max = df.index[np.argmax(mean)]
        best_score = np.max(mean)
        std = np.mean(std)
        h, l = ax0.get_legend_handles_labels()
        ax0.legend([h[-1]],
                   [r'score_max= %0.4f $\pm$ %0.2f' % (best_score, std)])
        ax0.axvline(x_max, linestyle='--', marker='x', color='y')
        ax0.annotate("%0.4f" % best_score, (x_max, best_score))
        ax0.set_xlim(x.min() - 0.5, x.max() + 0.5)
        plt.setp(ax0, ylabel=s)

    # set title
    ax[0].set_title(title, fontsize=13)
    # use fig legend
    fig.legend(h, ('train', 'test', r'$\pm$ 1 std. dev.'),
               loc='upper right', ncol=3, bbox_to_anchor=(0.98, 1))
    ax[-1].set_xlabel(xlabel)
    plt.tight_layout(rect=(0, 0, 1, 0.95))
    return ax
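# A minimal usage sketch for plotter_cv_results_ on a single-parameter grid
# search (synthetic data, not part of the original module). Note that
# return_train_score=True is needed so the 'mean_train_*' columns exist in
# cv_results_.
def _example_plotter_cv_results():
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV
    X, y = make_classification(n_samples=200, random_state=0)
    gs = GridSearchCV(LogisticRegression(solver='liblinear'),
                      param_grid={'C': [0.01, 0.1, 1, 10]},
                      scoring=['roc_auc', 'average_precision'],
                      refit='roc_auc', cv=3, return_train_score=True)
    gs.fit(X, y)
    plotter_cv_results_(pd.DataFrame(gs.cv_results_), title='C sensitivity')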
def plotter_auc(fpr, tpr, ax=None, alpha=0.95, lw=1.2, curve_label=None,
                title=None, cm=None):
    '''plot ROC curve(s) given fpr, tpr, or lists of fpr, tpr

    cm : color map, default 'tab20'

    return
    ----
    ax
    '''
    fpr, tpr = get_flat_list(fpr), get_flat_list(tpr)
    if len(fpr) != len(tpr):
        raise ValueError("length of fpr and tpr doesn't match")
    n = len(fpr)
    names = range(n) if curve_label is None else get_flat_list(curve_label)
    if len(names) != n:
        print('n_curve label not match with n_fpr or n_tpr')
        names = range(n)

    # -- plot each curve
    if ax is None:
        fig, ax = plt.subplots(1, 1)
    aucs = []
    kss = []
    if cm is None:
        cm = plt.get_cmap('tab20')
    cmlist = [cm(i) for i in np.linspace(0, 1, n)]
    for i in range(n):
        if len(fpr[i]) != len(tpr[i]):
            print("length of {}th fpr and tpr doesn't match".format(i))
            continue
        else:
            auc_score = auc(fpr[i], tpr[i])
            ks_score = max(np.array(tpr[i]) - np.array(fpr[i]))
            aucs.append(auc_score)
            kss.append(ks_score)
            ax.plot(fpr[i], tpr[i], color=cmlist[i], alpha=alpha, lw=lw,
                    label='ROC %r (AUC=%0.2f;KS=%0.2f)' %
                          (names[i], auc_score, ks_score))

    # plot mean tpr line
    if n > 1:
        mean_fpr = np.linspace(0, 1, 100)
        tprs = [np.interp(mean_fpr, x, y) for x, y in zip(fpr, tpr)]
        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[0] = 0.0
        mean_tpr[-1] = 1.0
        mean_auc = np.mean(aucs)
        std_auc = np.std(aucs)
        ax.plot(mean_fpr, mean_tpr, 'b-.', alpha=1, lw=1.5,
                label=r'Mean ROC(AUC=%0.2f $\pm$ %0.2f)' % (mean_auc, std_auc))
        # plot variance
        std_tpr = np.std(tprs, axis=0)
        tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
        tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
        ax.fill_between(mean_fpr, tprs_lower, tprs_upper,
                        color='grey', alpha=.3,
                        label=r'$\pm$ 1 std. dev.')

    # plot chance line
    ax.plot([0, 1], [0, 1], 'k--', lw=1.5, label='Chance (AUC=0.5)')

    # set properties
    if title is None:
        title = 'Receiver operating characteristic'
    plt.setp(ax, xlabel='False Positive Rate', ylabel='True Positive Rate',
             xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title=title)
    plt.legend(loc="lower right", fontsize='medium', bbox_to_anchor=(1, 0))
    plt.tight_layout()
    return ax
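# A minimal usage sketch for plotter_auc using fpr/tpr pairs from
# sklearn.metrics.roc_curve (synthetic data, not part of the original
# module); lists of curves are passed the same way plot_auc_test does.
def _example_plotter_auc():
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import roc_curve
    from sklearn.model_selection import StratifiedKFold
    X, y = make_classification(n_samples=400, random_state=0)
    fprs, tprs = [], []
    for tr, te in StratifiedKFold(n_splits=3).split(X, y):
        clf = LogisticRegression(solver='liblinear').fit(X[tr], y[tr])
        fpr, tpr, _ = roc_curve(y[te], clf.predict_proba(X[te])[:, 1])
        fprs.append(fpr)
        tprs.append(tpr)
    plotter_auc(fprs, tprs, curve_label=['cv0', 'cv1', 'cv2'],
                title='3-fold test ROC')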
def run_sensitivity(self, train_set=None, title=None, param_grid=-1,
                    refit='roc_auc', scoring=['roc_auc', 'KS'],
                    fit_params={}, n_jobs=2, save_fig=True, **kwargs):
    '''
    - run sensitivity analysis over param_grid (if param_grid=-1, use the
      pre-defined grid);
    - update self.estimator as the best estimator & update self.gridcv_results;
    - dump plots/spreadsheets

    parameters
    ----
    train_set: 2-element tuple, (X, y) of train data
    param_grid: parameter grid space; if -1, use pipe_grid() to return the
        pre-defined param_grid
    **kwargs: GridSearchCV keywords
    '''
    L = locals().copy()
    L.pop('self')
    L.pop('param_grid')
    folder = self.folder
    # --
    if train_set is None:
        train_set = self._get_dataset('.traindata')[0]
    else:
        folder.write(train_set, 'data/0.traindata')

    if param_grid == -1:
        # collect the pre-defined grid for each step of the pipeline
        param_grid = []
        for k, v in self.estimator.named_steps.items():
            grid = pipe_grid(k)
            if grid is not None:
                param_grid.extend(grid)
    if len(param_grid) == 0:
        print('no param_grid found, skip grid search')
        return

    # memory cache
    if hasattr(self.estimator, 'memory'):
        self.estimator.memory = os.path.relpath(
            os.path.join(self.folder.path_, 'tempfolder'))

    X, y = train_set
    cv_results = []
    for i, grid in enumerate(get_flat_list(param_grid)):
        self.grid_searchcv(X, y=y, param_grid=grid,
                           **get_kwargs(self.grid_searchcv, **L), **kwargs)
        self.plot_gridcv(save_fig=save_fig, title=str(i))
        cv_results.append(self.gridcv_results)

    print('sensitivity results are being saved... ')
    title = 0 if title is None else str(title)
    folder.write(cv_results,
                 'spreadsheet/GridcvResults{}.xlsx'.format(title))
    self.save()
    self._shut_temp_folder()
def run_test(self, test_set=None, title=None, use_self_bins=True, cv=3,
             scoring=['roc_auc', 'KS', 'average_precision'],
             save_fig=True, **kwargs):
    '''
    - run test performance of an estimator;
    - dump lift curve and ROC curve for test data under self.folder.path_;
    - optionally dump spreadsheets of calculated data

    test_set: 2-element tuple (X_test, y_test), or list of them
    title: title (or list of titles) used to label each test_set

    return
    ----
    series: averaged score for each of the scoring metrics
    '''
    L = locals().copy()
    L.pop('self')
    L.pop('title')
    folder = self.folder
    # --
    r = 0
    if test_set is None:
        test_set, title = self._get_dataset('.testdata')[0]
        r -= 1

    test_set_list = get_flat_list(test_set)
    if title is not None:
        title_list = get_flat_list(title)
    else:
        title_list = [str(i) for i in range(len(test_set_list))]
    check_consistent_length(test_set_list, title_list)

    if r == 0:
        folder.write([test_set_list, title_list],
                     'data/{}.testdata'.format(len(title_list)))

    testscore = []
    for i, j in zip(test_set_list, title_list):
        # test performance
        X_test = i[0]
        y_test = i[1]
        # plot test auc
        testcv = self.plot_auc_test(X_test, y_test, title=j,
                                    **get_kwargs(self.plot_auc_test, **L,
                                                 **kwargs))
        # plot lift curve
        test_lift = self.plot_lift(X_test, y_test, title=j,
                                   **get_kwargs(self.plot_lift, **L),
                                   **kwargs)
        # test scores
        scores = self.test_score(X_test, y_test, cv=cv, scoring=scoring)
        scores['group'] = str(j)
        testscore.append(scores)

        if self.verbose > 0:
            print('test cv_score & cv_splits test data are being saved... ')
            folder.write(testcv[-1],
                         file='spreadsheet/TestSplits{}.xlsx'.format(j))
            if test_lift is None:
                lift = pd.DataFrame()
            else:
                lift = test_lift[-1]
            folder.write([lift, scores],
                         sheet_name=['lift_curve', 'test_score'],
                         file='spreadsheet/TestPerformance{}.xlsx'.format(j))

    testscore_all = pd.concat(testscore, axis=0, ignore_index=True)
    fig = plotter_score_path(testscore_all, title='score_path')
    if save_fig is True:
        folder.write(fig, 'plots/TestScore_path.pdf')
        plt.close()
    if self.verbose > 0 and len(testscore) > 1:
        folder.write(testscore_all, 'spreadsheet/TestPerformanceAll.xlsx')

    return testscore_all[scoring].mean()
def plot_auc_test(self, X, y, cv=1, groups=None, title=None, ax=None,
                  save_fig=False):
    '''plot the ROC curve of a fitted estimator on test data; the estimator
    must produce continuous predictions (decision_function or predict_proba)
    so that it can be evaluated by roc_auc metrics. Iterables of X, y can be
    passed, or X, y can be split using cv > 1, to assess model fit performance.

    X - 2D array, or list of 2D arrays
    y - binary class labels, or list of them
    cv - int, cross-validation generator or an iterable;
         if cv > 1, splits are generated by the StratifiedKFold method
    title - title added to the plot header to indicate (X, y)

    return
    --------
    ax, mean_auc, std_auc, data_splits: list of test data sets as DataFrames
    (combined X & y)
    '''
    L = locals().copy()
    L.pop('self')
    estimator = self.estimator
    # split test set by cv
    if cv > 1:
        xs = []
        ys = []
        data_splits = tuple(
            _split_cv(X, y=y, cv=cv, groups=groups, random_state=self.seed))
        for x_set, y_set in data_splits:
            xs.append(x_set[1])
            ys.append(y_set[1])
        L.update({'X': xs, 'y': ys, 'cv': 1})
        return self.plot_auc_test(**L)

    self._check_fitted(estimator)
    X = get_flat_list(X)
    y = get_flat_list(y)
    validation.check_consistent_length(X, y)

    fprs = []
    tprs = []
    aucs = []
    n_sample = 0
    for i in range(len(X)):
        x0 = X[i]
        y0 = y[i]
        y_pre = self._pre_continueous(estimator, x0)
        fpr, tpr, thresholds = roc_curve(y0, y_pre, drop_intermediate=True)
        fprs.append(fpr)
        tprs.append(tpr)
        aucs.append(auc(fpr, tpr))
        n_sample += len(x0)

    # -- plot
    if ax is None:
        fig, ax = plt.subplots(1, 1)
    ax = plotter_auc(fprs, tprs, ax=ax)

    header = '-'.join([
        _get_estimator_name(estimator), 'testCV',
        '{} samples'.format(n_sample)
    ])
    if isinstance(title, str):
        header = '-'.join([title, header])
    ax.set_title(header)

    data_splits = [
        pd.concat((pd.DataFrame(i) for i in item), axis=1)
        for item in zip(X, y)
    ]

    if save_fig is True:
        if isinstance(title, str):
            plot_name = 'plots/roc_test_' + title + '.pdf'
        else:
            plot_name = 'plots/roc_test.pdf'
        self.folder.write(plt.gcf(), plot_name)
        plt.close()

    return ax, np.mean(aucs), np.std(aucs), data_splits