def plotter_auc_y(y_pre, y_true, **kwargs):
    """Plot a ROC-AUC curve from predicted scores and true labels.

    Computes (fpr, tpr) with ``roc_curve`` and forwards them to
    ``plotter_auc``; keyword arguments are routed to whichever of the two
    callables accepts them via ``get_kwargs``.

    Returns the matplotlib axes produced by ``plotter_auc``.
    """
    fpr, tpr, _ = roc_curve(y_true, y_pre, **get_kwargs(roc_curve, **kwargs))
    return plotter_auc(fpr, tpr, **get_kwargs(plotter_auc, **kwargs))
def grid_searchcv(self, X, y, param_grid, scoring='roc_auc', cv=3,
                  refit='roc_auc', return_train_score=True, n_jobs=2,
                  fit_params=None, **kwargs):
    '''tune hyper parameters of estimator by searching param_grid;
    update self.estimator & self.gridcv_results

    parameters
    ----------
    X, y : training data
    param_grid : grid of parameter settings passed to GridSearchCV
    scoring : scorer name resolved through self._get_scorer
    fit_params : dict or None - extra kwargs forwarded to grid.fit
                 (None avoids the shared mutable-default pitfall)

    return
    -----
    cv_results as DataFrame
    '''
    # capture the call arguments so get_kwargs can forward the ones
    # GridSearchCV accepts; 'fit_params' goes to fit() and 'scoring' is
    # replaced by a scorer object, so both are removed from L
    L = locals().copy()
    L.pop('self')
    L.pop('fit_params')
    L.pop('scoring')
    fit_params = {} if fit_params is None else fit_params
    scorer = self._get_scorer(scoring)
    # --
    estimator = self.estimator
    grid = GridSearchCV(estimator, scoring=scorer,
                        **get_kwargs(GridSearchCV, **L), **kwargs)
    grid.fit(X, y, **fit_params)
    cv_results = pd.DataFrame(grid.cv_results_)
    # keep the refitted best estimator and the full cv table on self
    self.estimator = grid.best_estimator_
    self.gridcv_results = cv_results
    return cv_results
def rand_searchcv(self, X, y, param_distributions, scoring='roc_auc', cv=3,
                  refit=None, return_train_score=True, fit_params=None,
                  njobs=2, **kwargs):
    '''tune hyper parameters of estimator by randomized search over
    param_distributions; update self estimator with the refitted best
    estimator when refit is enabled

    parameters
    ----------
    X, y : training data
    param_distributions : distributions passed to RandomizedSearchCV
    scoring : scorer name resolved through self._get_scorer
    fit_params : dict or None - extra kwargs forwarded to grid.fit
                 (None avoids the shared mutable-default pitfall)

    return
    -----
    cv_results as DataFrame
    '''
    L = locals().copy()
    L.pop('self')
    # 'fit_params' belongs to grid.fit, not the constructor; 'scoring' is
    # replaced by a scorer object below (consistent with grid_searchcv)
    L.pop('fit_params')
    L.pop('scoring')
    fit_params = {} if fit_params is None else fit_params
    scorer = self._get_scorer(scoring)
    # --
    estimator = self.estimator
    grid = RandomizedSearchCV(estimator, scoring=scorer,
                              **get_kwargs(RandomizedSearchCV, **L),
                              **kwargs)
    grid.fit(X, y, **fit_params)
    cv_results = pd.DataFrame(grid.cv_results_)
    if refit:
        # best_estimator_ only exists when refit is enabled; with the
        # default refit=None the old code raised AttributeError here
        self.set_params(estimator=grid.best_estimator_)
    return cv_results
def cv_validate(self, X, y, scoring='roc_auc', cv=5, return_estimator=False,
                return_train_score=False, **kwargs):
    '''
    return cross_validate results of estimator (see cross_validate)
    -----
    cv_results: (as DataFrame, allowing for multi-metrics) in the form of
    'test_xxx', 'train_xxx' where 'xxx' is scorer name
    '''
    # bind the estimator BEFORE locals() is copied so it becomes part of
    # L and is forwarded to cross_validate by get_kwargs
    estimator = self.estimator
    L = locals().copy()
    L.pop('self')
    # 'scoring' (a name string) is replaced below by a scorer object
    L.pop('scoring')
    scorer = self._get_scorer(scoring)
    # --
    # NOTE(review): a key present in both L and kwargs would raise
    # "got multiple values" here - callers appear to avoid that
    cv_results = cross_validate(scoring=scorer,
                                **get_kwargs(cross_validate, **L, **kwargs))
    return pd.DataFrame(cv_results)
def _dump_df_excel(obj, file, **kwargs):
    """Write one or more 2-d array-likes to an Excel workbook, one sheet
    per object.

    obj : a 2d array-like, or a list of them
    file : str or file object - destination workbook
    kwargs : may carry 'sheet_name' (str or list), 'index', plus any
             keyword accepted by DataFrame.to_excel

    Sheets that fail to write are reported and skipped (best effort).
    """
    writer = pd.ExcelWriter(file)
    frames = get_flat_list(obj)
    names = kwargs.get('sheet_name')
    if names is None:
        # default sheet names: sheet1, sheet2, ...
        names = ['sheet{}'.format(n + 1) for n in range(len(frames))]
    else:
        names = get_flat_list(names)
    check_consistent_length(frames, names)
    for frame, name in zip(frames, names):
        try:
            frame = pd.DataFrame(frame)
            excel_kw = get_kwargs(frame.to_excel, **kwargs)
            excel_kw['sheet_name'] = name
            excel_kw['index'] = kwargs.get('index', False)
            frame.to_excel(writer, **excel_kw)
        except Exception as e:
            # best effort: report and move on to the next sheet
            print(repr(e))
            continue
    writer.save()
def _dump_df_csv(obj, file, index=False, **kwargs):
    """Write *obj* to *file* as CSV via DataFrame.to_csv.

    Failures are reported (printed) rather than raised - best effort.
    """
    try:
        frame = pd.DataFrame(obj)
        frame.to_csv(file, index=index, **get_kwargs(frame.to_csv, **kwargs))
    except Exception as e:
        print(repr(e))
def build_stack(
        # add parameters
        estimators, preprocessing, proba, propagate_features,
        # initiation parameters
        folds=3, shuffle=True, scorer=None, random_state=0,
        raise_on_exception=True, n_jobs=-1, model_selection=False,
        sample_size=20, meta_estimator='LogisticRegression', partitions=3,
        partition_estimator=None, test_size=0.5, ens_type='stack'):
    '''return stack/blend/subsemble ensemble model instance

    params
    -------
    see __doc__ of SuperLearner & SuperLearner.add, already wrapped in
    ens_type : one of 'stack', 'subsemble', 'blend' (KeyError otherwise)
    '''
    if scorer is not None:
        scorer = get_score_fn(scorer)  # return scoring functions
    meta_estimator = get_sk_estimators(meta_estimator)
    # NOTE: locals() is captured here on purpose, AFTER scorer and
    # meta_estimator have been converted, so the converted objects are
    # what get_kwargs forwards to the ensemble constructor / .add
    L = locals().copy()
    # --
    ens_class = {
        'stack': SuperLearner,
        'subsemble': Subsemble,
        'blend': BlendEnsemble
    }[ens_type]
    ens = ens_class(**get_kwargs(ens_class, **L))
    ens.add(**get_kwargs(ens.add, **L))
    if meta_estimator is not None:
        ens.add_meta(meta_estimator, proba=True)
    return ens
def cv_score(self, X, y, scoring='roc_auc', cv=5, **kwargs):
    """Return cross-validated scores of self.estimator
    (see cross_val_score).
    """
    scorer = self._get_scorer(scoring)
    extra_kw = get_kwargs(cross_val_score, **kwargs)
    return cross_val_score(self.estimator, X=X, y=y, scoring=scorer,
                           cv=cv, **extra_kw)
def read(self, file, **kwargs):
    """Load an object from *file* and return it.

    Supported file suffixes are
    ['.xlsx', '.csv', '.pkl', '.txt', '.sql'].

    file : str or file object to read from.
    Returns None (after printing a failure notice) when reading fails.
    """
    self.file_ = file
    read_api = _rd_apis(self.file_)
    try:
        api_kw = get_kwargs(read_api, **kwargs)
        result = read_api(self.file_, **api_kw)
        print("<obj>: '{}' read from '{}\n".format(
            result.__class__.__name__, self.file_))
        return result
    except Exception as e:
        # best effort: report the failure and fall through, returning None
        print("<failure>: file '{}' read failed".format(self.file_))
        print(repr(e), '\n')
def plotter_catplot(data, kind='violin', swarm=False, hline=None,
                    subset=None, **kwargs):
    '''make a distr plot through catplot function ::

    parameter
    ---------
    data (DataFrame): data to generate violin plot through seaborn
    kind (str): 'violin' default
        one of ['violin', 'swarm', 'box', 'bar', 'count', 'point',
        'factor']; a ValueError is raised for anything else
    swarm (bool): whether to combine a swarmplot, default False
    hline (int): add a horizontal base line
    subset (dict): fitler subset of data by column's categorical values
    kwargs: other keywords to customize ax and to pass to plot functions

    return
    --------
    g : FacetGrid
        Returns the FacetGrid object with the plot on it for further
        tweaking.
    '''
    if subset is not None:
        data = filter_subset(data, subset)
    # map each supported kind to its underlying seaborn function; kwargs
    # are extracted lazily (only for the requested kind) so a missing or
    # deprecated seaborn api for an UNUSED kind can no longer crash us
    kind_fns = {
        'violin': sns.violinplot,
        'box': sns.boxplot,
        'swarm': sns.swarmplot,
        'bar': sns.barplot,
        'count': sns.countplot,
        'point': sns.pointplot,
        # factorplot was removed in seaborn >= 0.9; resolve defensively
        'factor': getattr(sns, 'factorplot', None),
    }
    plot_fn = kind_fns.get(kind)
    if plot_fn is None:
        # previously an unsupported kind surfaced as an opaque
        # AttributeError on NoneType; fail loudly and clearly instead
        raise ValueError('unsupported kind {!r}; expected one of {}'.format(
            kind, sorted(k for k, v in kind_fns.items() if v is not None)))
    plot_fn_kws = get_kwargs(plot_fn, **kwargs)
    plot_fn_kws.update(get_kwargs(sns.catplot, **kwargs))
    if hline is not None:
        # keep the legend inside so the baseline label can be merged in
        plot_fn_kws.update(legend_out=False)
    # plot categorical data
    g = sns.catplot(data=data, kind=kind, **plot_fn_kws)
    if swarm:
        g.map(sns.swarmplot, data=data, ax=g.ax, x=kwargs.get('x'),
              y=kwargs.get('y'), size=2.5, color='k', alpha=0.3)
    if hline is not None:
        g.map(plt.axhline, y=hline, color='red', linestyle='--',
              label='baseline%s' % hline)
        g._legend_out = True
        g.add_legend()
    # keywords not consumed by the plot functions are applied to the axes
    ax_kws = dict_diff(kwargs, plot_fn_kws.keys())
    ax_kws.pop('savefig', None)
    if len(ax_kws) > 0:
        g.set(**ax_kws)
    # save fig to savefig path
    if kwargs.get('savefig') is not None:
        _save_fig(g, kwargs['savefig'])
    return g
def plotter_facet(data, plot_args, subset=None, kind='distplot',
                  savefig=None, **kwargs):
    '''plot grids of plots using seaborn FacetGrid ::

    parameter
    -----
    data : DataFrame
        Tidy ("long-form") dataframe where each column is a variable and
        each row is an observation.
    plot_args (tuple): (colname1 as x, colname2 as y) indexed by DataFrame
    subset (dict): fitler subset of data by column's categorical values
        eg: {col1 : [str1, str2, ...], ...}
    kind : callable plot fn or str to call plot api in _get_plot_fn
    savefig : path to dump the figure to, or None
    kwargs : split three ways - FacetGrid keywords (row, col, hue,
        col_wrap, sharex/sharey, height, aspect, palette, *_order, ...),
        keywords of the mapped plot function, and anything left over is
        applied to the axes via g.set

    return
    -----
    g : FacetGrid with the mapped plots
    '''
    if subset is not None:
        data = filter_subset(data, subset)
    plot_fn = _get_snsplot(kind)
    # split kwargs: grid construction vs mapped-plot vs axes settings
    facet_kws = get_kwargs(sns.FacetGrid, **kwargs)
    plot_fn_kws = get_kwargs(plot_fn, **kwargs)
    ax_kws = dict_diff(kwargs, facet_kws.keys() | plot_fn_kws.keys())
    # build the grid and map the plot function over its facets
    g = sns.FacetGrid(data, **facet_kws)
    g.map(plot_fn, *plot_args, **plot_fn_kws)
    if len(ax_kws) > 0:
        g.set(**ax_kws)
    g.add_legend()
    if savefig:
        _save_fig(g, savefig)
    return g
def run_sensitivity(self, train_set=None, title=None, param_grid=-1,
                    refit='roc_auc', scoring=['roc_auc', 'KS'],
                    fit_params=None, n_jobs=2, save_fig=True, **kwargs):
    '''
    - run sensitivity of param_grid (if param_grid=-1, use pre-defined);
    - update self estimator as best estimator, & update self
      gridcv_results;
    - dump plots/spreadsheets

    parmameters
    ----
    train_set: 2 element tuple, (X, y) of train data
    param_grid: parameter grid space, if -1, use pipe_grid() to return
        predifined param_grid
    fit_params: dict or None - extra fit kwargs (None avoids the shared
        mutable-default pitfall)
    **kwargs: GridSearchCV keywords
    '''
    # normalize BEFORE locals() is captured so L carries a real dict
    fit_params = {} if fit_params is None else fit_params
    L = locals().copy()
    L.pop('self')
    L.pop('param_grid')
    folder = self.folder
    # --
    if train_set is None:
        train_set = self._get_dataset('.traindata')[0]
    else:
        folder.write(train_set, 'data/0.traindata')
    # '== -1' replaces the old 'is -1' identity check, which relied on
    # CPython's small-int cache and raises SyntaxWarning on py>=3.8
    if param_grid == -1:
        # build the grid from the predefined per-step grids of the pipe
        param_grid = []
        for step_name in self.estimator.named_steps:
            grid = pipe_grid(step_name)
            if grid is not None:
                param_grid.extend(grid)
    if len(param_grid) == 0:
        print('no param_grid found, skip grid search')
        return
    # memory cache for pipeline transformers during the search
    if hasattr(self.estimator, 'memory'):
        self.estimator.memory = os.path.relpath(
            os.path.join(self.folder.path_, 'tempfolder'))
    X, y = train_set
    cv_results = []
    for i, grid in enumerate(get_flat_list(param_grid)):
        self.grid_searchcv(X, y=y, param_grid=grid,
                           **get_kwargs(self.grid_searchcv, **L), **kwargs)
        self.plot_gridcv(save_fig=save_fig, title=str(i))
        cv_results.append(self.gridcv_results)
    print('sensitivity results are being saved... ')
    title = 0 if title is None else str(title)
    folder.write(cv_results,
                 'spreadsheet/GridcvResults{}.xlsx'.format(title))
    self.save()
    self._shut_temp_folder()
def run_test(self, test_set=None, title=None, use_self_bins=True, cv=3,
             scoring=['roc_auc', 'KS', 'average_precision'], save_fig=True,
             **kwargs):
    '''
    - run test performance of an estimator;
    - dump lift curve and ROC curve for test data under self.folder.path_;
    - optionally dump spreadsheets of calculated data

    test_set: 2 element tuple (X_test, y_test) or list of them
    title: title for test_set indicator

    return
    ----
    series: averaged scoring for each of scoring metrics
    '''
    # capture call arguments so they can be dispatched to the helper
    # plotting methods via get_kwargs; 'title' varies per test set
    L = locals().copy()
    L.pop('self')
    L.pop('title')
    folder = self.folder
    # --
    # r tracks the data origin: -1 when loaded from disk, 0 when supplied
    # by the caller; only caller-supplied data is written back to disk
    r = 0
    if test_set is None:
        test_set, title = self._get_dataset('.testdata')[0]
        r -= 1
    test_set_list = get_flat_list(test_set)
    if title is not None:
        title_list = get_flat_list(title)
    else:
        # default titles are just positional indices as strings
        title_list = [str(i) for i in range(len(test_set_list))]
    check_consistent_length(test_set_list, title_list)
    if r == 0:
        folder.write([test_set_list, title_list],
                     'data/{}.testdata'.format(len(title_list)))
    testscore = []
    for i, j in zip(test_set_list, title_list):
        # test performance
        X_test = i[0]
        y_test = i[1]
        # plot test auc
        testcv = self.plot_auc_test(X_test, y_test, title=j,
                                    **get_kwargs(self.plot_auc_test, **L,
                                                 **kwargs))
        # plot lift curve
        test_lift = self.plot_lift(X_test, y_test, title=j,
                                   **get_kwargs(self.plot_lift, **L),
                                   **kwargs)
        # test scores
        scores = self.test_score(X_test, y_test, cv=cv, scoring=scoring)
        scores['group'] = str(j)
        testscore.append(scores)
        # NOTE(review): spreadsheet dumps appear gated on verbosity
        # ("optionally dump spreadsheets" in the docstring) - confirm
        if self.verbose > 0:
            print('test cv_score & cv_splits test data are being '
                  'saved... \n')
            folder.write(testcv[-1],
                         file='spreadsheet/TestSplits{}.xlsx'.format(j))
            if test_lift is None:
                lift = pd.DataFrame()
            else:
                lift = test_lift[-1]
            folder.write(
                [lift, scores],
                sheet_name=['lift_curve', 'test_score'],
                file='spreadsheet/TestPerfomance{}.xlsx'.format(j))
    testscore_all = pd.concat(testscore, axis=0, ignore_index=True)
    fig = plotter_score_path(testscore_all, title='score_path')
    if save_fig is True:
        folder.write(fig, 'plots/TestScore_path.pdf')
    plt.close()
    if self.verbose > 0 and len(testscore) > 1:
        folder.write(testscore_all, 'spreadsheet/TestPerformanceAll.xlsx')
    return testscore_all[scoring].mean()
def run_train(self, train_set=None, title='Train', scoring=['roc_auc', 'KS'],
              q=None, bins=None, max_leaf_nodes=None, fit_params=None,
              cv=3, save_fig=True, **kwargs):
    '''
    - run train performance of an estimator;
    - dump lift curve and ROC curve for train data under self.folder.path_;
    - optionally dump spreadsheets of calculated data

    train_set: 2 element tuple, (X, y) of train data
    cv: n of cross validation folder, if cv==1, no cross validation
    fit_params: dict or None - other fit parameters of estimator
        (None avoids the shared mutable-default pitfall)

    return
    ----
    series: averaged train score for each scoring metrics
    '''
    # normalize BEFORE locals() is captured so L carries a real dict
    fit_params = {} if fit_params is None else fit_params
    L = locals().copy()
    L.pop('self')
    folder = self.folder
    # --
    title = title if title is not None else 0
    if train_set is None:
        train_set = self._get_dataset('.traindata')[0]
    else:
        folder.write(train_set, 'data/0.traindata')
    # trainning
    X = train_set[0]
    y = train_set[1]
    traincv = self.plot_auc_traincv(
        X, y, **get_kwargs(self.plot_auc_traincv, **L), **fit_params)
    self.fit(X, y, **fit_params)
    # lift curve only makes sense when a binning spec was supplied
    if any([max_leaf_nodes, q, bins]):
        lift_data = self.plot_lift(X, y, **get_kwargs(self.plot_lift, **L),
                                   **kwargs)
        lift = lift_data[-1]
    else:
        lift = pd.DataFrame()
    cv_score = self.cv_validate(X, y, **get_kwargs(self.cv_validate, **L),
                                **kwargs)
    if self.verbose > 0:
        print('train data & cv_score & cv_splits data are being saved...')
        folder.write([lift, cv_score],
                     'spreadsheet/TrainPerfomance{}.xlsx'.format(title),
                     sheet_name=['liftcurve', 'train_score'])
        folder.write(traincv[-1],
                     'spreadsheet/TrainSplits{}.xlsx'.format(title))
    fig = plotter_score_path(cv_score, title='TrainScore_path')
    if save_fig is True:
        folder.write(fig, 'plots/TrainScore_path.pdf')
    plt.close()
    return cv_score.mean()