Пример #1
0
def plotter_auc_y(y_pre, y_true, **kwargs):
    '''Plot a ROC curve computed from predictions and true labels.

    parameter
    ---------
    y_pre:
        predicted scores, handed to roc_curve as its second positional
        argument
    y_true:
        true labels, handed to roc_curve as its first positional argument
    kwargs:
        extra keywords; they go through get_kwargs once for roc_curve and
        once for plotter_auc (presumably get_kwargs keeps only the keywords
        each target accepts -- confirm against its definition)

    return
    --------
        whatever plotter_auc returns
    '''
    roc_kws = get_kwargs(roc_curve, **kwargs)
    # the third value returned by roc_curve is not needed for plotting
    fpr, tpr, _ = roc_curve(y_true, y_pre, **roc_kws)
    plot_kws = get_kwargs(plotter_auc, **kwargs)
    return plotter_auc(fpr, tpr, **plot_kws)
Пример #2
0
    def grid_searchcv(self,
                      X,
                      y,
                      param_grid,
                      scoring='roc_auc',
                      cv=3,
                      refit='roc_auc',
                      return_train_score=True,
                      n_jobs=2,
                      fit_params={},
                      **kwargs):
        '''Search param_grid with GridSearchCV and keep the best estimator.

        Side effects: self.estimator is replaced by the search's
        best_estimator_ and self.gridcv_results is set to the results frame.

        parameter
        ---------
        X, y:
            data passed to the search's fit()
        param_grid, cv, refit, return_train_score, n_jobs:
            offered to GridSearchCV through get_kwargs
        scoring:
            resolved by self._get_scorer before being given to GridSearchCV
        fit_params (dict):
            keywords expanded into the search's fit() call
        kwargs:
            passed to GridSearchCV unchanged

        return
        -----
        cv_results as DataFrame
        '''
        # Snapshot the call arguments first, while the only locals are the
        # parameters themselves.
        call_args = dict(locals())
        for dropped in ('self', 'fit_params', 'scoring'):
            del call_args[dropped]

        scorer = self._get_scorer(scoring)
        base_estimator = self.estimator
        search_kws = get_kwargs(GridSearchCV, **call_args)
        search = GridSearchCV(base_estimator,
                              scoring=scorer,
                              **search_kws,
                              **kwargs)
        search.fit(X, y, **fit_params)

        results = pd.DataFrame(search.cv_results_)
        self.estimator = search.best_estimator_
        self.gridcv_results = results
        return results
Пример #3
0
 def rand_searchcv(self,
                   X,
                   y,
                   param_distributions,
                   scoring='roc_auc',
                   cv=3,
                   refit=None,
                   return_train_score=True,
                   fit_params={},
                   njobs=2,
                   **kwargs):
     '''Tune hyper parameters of the estimator by randomized search.

     parameter
     ---------
     X, y:
         data passed to the search's fit()
     param_distributions, scoring, cv, refit, return_train_score:
         offered to RandomizedSearchCV through get_kwargs
     fit_params (dict):
         keywords expanded into the search's fit() call
     njobs (int):
         number of parallel jobs; forwarded to RandomizedSearchCV as
         ``n_jobs`` unless ``n_jobs`` is given explicitly in kwargs
     kwargs:
         passed to RandomizedSearchCV unchanged

     return
     -----
     cv_results as DataFrame

     note
     -----
     The estimator is replaced (via self.set_params) only when the search
     produced a ``best_estimator_``, i.e. when refit is enabled.
     '''
     L = locals().copy()
     L.pop('self')
     # RandomizedSearchCV spells the keyword ``n_jobs``; under the name
     # ``njobs`` the value never reached it. An explicit ``n_jobs`` in
     # kwargs still wins, which also avoids a duplicate-keyword error.
     n_jobs = L.pop('njobs')
     if 'n_jobs' not in kwargs:
         L['n_jobs'] = n_jobs
     # None and False both mean "do not refit", but recent scikit-learn
     # releases reject None during parameter validation.
     if L['refit'] is None:
         L['refit'] = False
     # --
     estimator = self.estimator
     grid = RandomizedSearchCV(estimator,
                               **get_kwargs(RandomizedSearchCV, **L),
                               **kwargs)
     grid.fit(X, y, **fit_params)
     cv_results = pd.DataFrame(grid.cv_results_)
     # best_estimator_ only exists when the search refitted a model; without
     # this guard a non-refit run crashed after the whole search had finished.
     best_estimator = getattr(grid, 'best_estimator_', None)
     if best_estimator is not None:
         self.set_params(estimator=best_estimator)
     return cv_results
Пример #4
0
 def cv_validate(self,
                 X,
                 y,
                 scoring='roc_auc',
                 cv=5,
                 return_estimator=False,
                 return_train_score=False,
                 **kwargs):
     '''Run cross_validate on self.estimator and return the results.

     parameter
     ---------
     X, y:
         data offered to cross_validate
     scoring:
         resolved by self._get_scorer before being passed on
     cv, return_estimator, return_train_score:
         offered to cross_validate through get_kwargs
     kwargs:
         additional keywords, also routed through get_kwargs

     return
     -----
     cv_results:
         (as DataFrame, allowing for multi-metrics) in the form of
         'test_xxx', 'train_xxx' where 'xxx' is scorer name
     '''
     # ``estimator`` has to be a local before the snapshot so that it is
     # among the arguments offered to cross_validate.
     estimator = self.estimator
     call_args = dict(locals())
     del call_args['self'], call_args['scoring']

     scorer = self._get_scorer(scoring)
     validate_kws = get_kwargs(cross_validate, **call_args, **kwargs)
     return pd.DataFrame(cross_validate(scoring=scorer, **validate_kws))
Пример #5
0
def _dump_df_excel(obj, file, **kwargs):
    '''Dump one or more tables to an excel file, one sheet per table.

    obj:
        2d array like data, or a collection of them; it is passed through
        get_flat_list and every resulting element is converted with
        pd.DataFrame and written to its own sheet
    file:
        str or file obj handed to pd.ExcelWriter
    kwargs:
        sheet_name - name(s) of the sheets; when omitted the sheets are
            called 'sheet1', 'sheet2', ...; when given, the number of names
            is checked against the number of tables
        index - whether to write the index, default False
        other keywords are routed through get_kwargs to DataFrame.to_excel

    A table that fails to convert or write is reported with print() and
    skipped; the remaining tables are still written.
    '''
    obj = get_flat_list(obj)

    # Resolve and validate the sheet names before the file is opened, so a
    # length mismatch does not leave a half-initialised workbook behind.
    sheet_name = kwargs.get('sheet_name')
    if sheet_name is None:
        sheet_name = ['sheet' + str(i) for i in range(1, len(obj) + 1)]
    else:
        sheet_name = get_flat_list(sheet_name)
        check_consistent_length(obj, sheet_name)

    # The context manager saves and closes the workbook on exit; the
    # explicit ExcelWriter.save() call no longer exists in pandas 2.x.
    with pd.ExcelWriter(file) as writer:
        for data, name in zip(obj, sheet_name):
            try:
                data = pd.DataFrame(data)
                kw = get_kwargs(data.to_excel, **kwargs)
                kw.update({
                    'sheet_name': name,
                    'index': kwargs.get('index', False)
                })
                data.to_excel(writer, **kw)
            except Exception as e:
                # best effort: report the failing sheet and keep going
                print(repr(e))
                continue
Пример #6
0
def _dump_df_csv(obj, file, index=False, **kwargs):
    '''Write obj to a csv file as a DataFrame.

    obj:
        data accepted by pd.DataFrame
    file:
        target handed to DataFrame.to_csv
    index:
        whether to write the index, default False
    kwargs:
        extra keywords routed through get_kwargs to DataFrame.to_csv

    Any exception is reported with print() and swallowed.
    '''
    try:
        frame = pd.DataFrame(obj)
        csv_kws = get_kwargs(frame.to_csv, **kwargs)
        frame.to_csv(file, index=index, **csv_kws)
    except Exception as err:
        print(repr(err))
def build_stack(
        # add parameters
        estimators,
        preprocessing,
        proba,
        propagate_features,
        # initiation parameters
        folds=3,
        shuffle=True,
        scorer=None,
        random_state=0,
        raise_on_exception=True,
        n_jobs=-1,
        model_selection=False,
        sample_size=20,
        meta_estimator='LogisticRegression',
        partitions=3,
        partition_estimator=None,
        test_size=0.5,
        ens_type='stack'):
    '''Build and return a stack / blend / subsemble ensemble instance.

    params
    -------
    ens_type:
        'stack' -> SuperLearner, 'subsemble' -> Subsemble,
        'blend' -> BlendEnsemble; any other value raises KeyError
    scorer:
        when not None it is converted with get_score_fn
    meta_estimator:
        converted with get_sk_estimators; when the converted value is not
        None it is attached with add_meta(..., proba=True)
    others:
        see __doc__ of SuperLearner & SuperLearner.add; the arguments are
        offered to the ensemble constructor and to its add() method
        through get_kwargs

    return
    -------
        the configured ensemble instance
    '''
    if scorer is not None:
        scorer = get_score_fn(scorer)  # return scoring functions
    meta_estimator = get_sk_estimators(meta_estimator)

    # Snapshot after the two rebinds above so the converted scorer and
    # meta estimator are what get forwarded.
    settings = locals().copy()

    ensemble_types = {
        'stack': SuperLearner,
        'subsemble': Subsemble,
        'blend': BlendEnsemble
    }
    ensemble_cls = ensemble_types[ens_type]

    init_kws = get_kwargs(ensemble_cls, **settings)
    ensemble = ensemble_cls(**init_kws)

    add_kws = get_kwargs(ensemble.add, **settings)
    ensemble.add(**add_kws)

    if meta_estimator is None:
        return ensemble
    ensemble.add_meta(meta_estimator, proba=True)
    return ensemble
Пример #8
0
 def cv_score(self, X, y, scoring='roc_auc', cv=5, **kwargs):
     '''Return the cross validated score of self.estimator.

     parameter
     ---------
     X, y:
         data passed to cross_val_score
     scoring:
         resolved by self._get_scorer before being passed on
     cv:
         passed to cross_val_score as ``cv``
     kwargs:
         extra keywords routed through get_kwargs to cross_val_score

     return
     ---------
         whatever cross_val_score returns (see cross_val_score)
     '''
     scorer = self._get_scorer(scoring)
     extra_kws = get_kwargs(cross_val_score, **kwargs)
     return cross_val_score(self.estimator,
                            X=X,
                            y=y,
                            scoring=scorer,
                            cv=cv,
                            **extra_kws)
Пример #9
0
 def read(self, file, **kwargs):
     '''Read an object from file and return it.

     supported suffix of file are
     - ['.xlsx', '.csv', '.pkl', '.txt', '.sql']
     file - str or file object
         - file to read; also stored on the instance as self.file_
     kwargs
         - extra keywords routed through get_kwargs to the reader that
           _rd_apis selects for the file

     return
         - the object produced by the reader; when reading raises, the
           failure is printed and None is returned implicitly
     '''
     self.file_ = file
     loader = _rd_apis(self.file_)
     try:
         loader_kws = get_kwargs(loader, **kwargs)
         loaded = loader(self.file_, **loader_kws)
         type_name = loaded.__class__.__name__
         print("<obj>: '{}' read from '{}\n".format(type_name, self.file_))
         return loaded
     except Exception as err:
         print("<failure>: file '{}' read failed".format(self.file_))
         print(repr(err), '\n')
Пример #10
0
def plotter_catplot(data,
                    kind='violin',
                    swarm=False,
                    hline=None,
                    subset=None,
                    **kwargs):
    '''Make a distribution plot through seaborn's catplot function ::

    parameter
    ---------
    data (DataFrame):
        data to plot through seaborn
    kind (str): 'violin' default
        passed to sns.catplot as ``kind``, e.g.
        ['violin', 'swarm', 'box', 'bar', 'count', 'point']; keywords for
        the matching ``sns.<kind>plot`` function are picked out of kwargs
    swarm (bool):
        whether to combine a swarmplot, default False
    hline (int):
        add a horizontal base line at this y value
    subset (dict):
        filter subset of data by column's categorical values
    kwargs:
        other keywords to customize ax and to pass to plot functions;
        ``savefig`` (path) saves the figure through _save_fig, keywords
        not consumed by the plot functions are applied with g.set()

    return
    --------
        g : FacetGrid
            Returns the FacetGrid object with the plot on it for further
            tweaking.
    '''
    if subset is not None:
        data = filter_subset(data, subset)

    # Resolve only the seaborn function that matches ``kind``. Building the
    # keyword sets for every plot type up front touched attributes such as
    # sns.factorplot that newer seaborn releases no longer provide, and an
    # unlisted kind (e.g. 'strip') ended in ``None.update``.
    kind_fn = None
    if isinstance(kind, str):
        kind_fn = getattr(sns, kind + 'plot', None)
    plot_fn_kws = {}
    if kind_fn is not None:
        plot_fn_kws.update(get_kwargs(kind_fn, **kwargs))
    plot_fn_kws.update(get_kwargs(sns.catplot, **kwargs))

    if hline is not None:
        # keep the legend inside the figure while drawing; it is re-added
        # together with the baseline entry below
        plot_fn_kws.update(legend_out=False)
    # plot categorical data
    g = sns.catplot(data=data, kind=kind, **plot_fn_kws)

    if swarm:
        g.map(sns.swarmplot,
              data=data,
              ax=g.ax,
              x=kwargs.get('x'),
              y=kwargs.get('y'),
              size=2.5,
              color='k',
              alpha=0.3)
    if hline is not None:
        g.map(plt.axhline,
              y=hline,
              color='red',
              linestyle='--',
              label='baseline%s' % hline)
        g._legend_out = True
        g.add_legend()

    # whatever the plot functions did not consume is treated as axes settings
    ax_kws = dict_diff(kwargs, plot_fn_kws.keys())
    ax_kws.pop('savefig', None)
    if len(ax_kws) > 0:
        g.set(**ax_kws)
    # save fig to savefig path
    if kwargs.get('savefig') is not None:
        _save_fig(g, kwargs['savefig'])
    return g
Пример #11
0
def plotter_facet(data,
                  plot_args,
                  subset=None,
                  kind='distplot',
                  savefig=None,
                  **kwargs):
    '''Draw a grid of plots with seaborn's FacetGrid ::

    parameter
    -----
    data : DataFrame
        Tidy ("long-form") dataframe where each column is a variable and
        each row is an observation.

    plot_args (tuple):
        column names of data, passed positionally to the plot function
        through FacetGrid.map (e.g. the x column and the y column)

    subset (dict):
        filter subset of data by column's categorical values
        eg: {col1 : [str1, str2, ...], ...}

    kind:
        callable plot fn or str, resolved to a plot function by
        _get_snsplot

    savefig:
        when truthy, the grid is saved through _save_fig(g, savefig)

    kwargs:
        split three ways: keywords for sns.FacetGrid, keywords for the
        plot function, and the remainder which is applied with g.set().
        FacetGrid keywords described by the original author:

        row, col, hue : strings
            Variables that define subsets of the data, which will be drawn
            on separate facets in the grid.
        col_wrap : int, optional
            "Wrap" the column variable at this width, so that the column
            facets span multiple rows. Incompatible with a row facet.
        share{x,y} : bool, 'col', or 'row' optional
            If true, the facets will share y axes across columns and/or x
            axes across rows.
        height : scalar, optional
            Height (in inches) of each facet. See also: aspect.
        aspect : scalar, optional
            Aspect ratio of each facet, so that aspect * height gives the
            width of each facet in inches.
        palette : palette name, list, or dict, optional
            Colors to use for the different levels of the hue variable.
        {row,col,hue}_order : lists, optional
            Order for the levels of the faceting variables.

    return
    -----
        g : the FacetGrid with the plots drawn on it
    '''
    if subset is not None:
        data = filter_subset(data, subset)

    draw = _get_snsplot(kind)
    # sort the keywords into: grid construction / plot function / the rest
    grid_kws = get_kwargs(sns.FacetGrid, **kwargs)
    draw_kws = get_kwargs(draw, **kwargs)
    consumed = grid_kws.keys() | draw_kws.keys()
    leftover = dict_diff(kwargs, consumed)

    grid = sns.FacetGrid(data, **grid_kws)
    grid.map(draw, *plot_args, **draw_kws)

    if len(leftover) > 0:
        grid.set(**leftover)

    grid.add_legend()

    if savefig:
        _save_fig(grid, savefig)
    return grid
Пример #12
0
    def run_sensitivity(self,
                        train_set=None,
                        title=None,
                        param_grid=-1,
                        refit='roc_auc',
                        scoring=['roc_auc', 'KS'],
                        fit_params={},
                        n_jobs=2,
                        save_fig=True,
                        **kwargs):
        '''
        - run sensitivity of param_grid (if param_grid=-1, use pre-defined);
        - update self estimator as best estimator, & update self
          gridcv_results (both done by self.grid_searchcv);
        - dump plots/spreadsheets

        parameters
        ----
        train_set:
            2 element tuple, (X, y) of train data; when None the stored
            '.traindata' set is loaded, otherwise the given set is written
            to 'data/0.traindata'
        title:
            suffix of the results spreadsheet name, 0 when None
        param_grid:
            parameter grid space, if -1, use pipe_grid() on every named
            step of the estimator to collect the predefined param_grid
        refit, scoring, fit_params, n_jobs:
            offered to self.grid_searchcv through get_kwargs
        save_fig:
            passed to self.plot_gridcv
        **kwargs:
            GridSearchCV keywords

        return
        ----
            None; returns early (without saving) when no param_grid is
            found
        '''

        L = locals().copy()
        L.pop('self')
        L.pop('param_grid')
        folder = self.folder
        #--
        if train_set is None:
            train_set = self._get_dataset('.traindata')[0]
        else:
            folder.write(train_set, 'data/0.traindata')

        # Compare by value: ``is -1`` depends on the interpreter caching
        # small ints and triggers a SyntaxWarning on Python 3.8+. The
        # isinstance guard keeps grids (lists/dicts) out of the comparison.
        if isinstance(param_grid, int) and param_grid == -1:
            param_grid = []
            for step_name in self.estimator.named_steps:
                grid = pipe_grid(step_name)
                if grid is not None:
                    param_grid.extend(grid)

        if len(param_grid) == 0:
            print('no param_grid found, skip grid search')
            return

        # memory cache
        if hasattr(self.estimator, 'memory'):
            self.estimator.memory = os.path.relpath(
                os.path.join(self.folder.path_, 'tempfolder'))

        X, y = train_set
        cv_results = []
        for i, grid in enumerate(get_flat_list(param_grid)):
            # each search replaces self.estimator / self.gridcv_results,
            # so the next grid starts from the previous best estimator
            self.grid_searchcv(X,
                               y=y,
                               param_grid=grid,
                               **get_kwargs(self.grid_searchcv, **L),
                               **kwargs)
            self.plot_gridcv(save_fig=save_fig, title=str(i))
            cv_results.append(self.gridcv_results)

        print('sensitivity results are being saved... ')
        title = 0 if title is None else str(title)
        folder.write(cv_results,
                     'spreadsheet/GridcvResults{}.xlsx'.format(title))
        self.save()
        self._shut_temp_folder()
Пример #13
0
    def run_test(self,
                 test_set=None,
                 title=None,
                 use_self_bins=True,
                 cv=3,
                 scoring=['roc_auc', 'KS', 'average_precision'],
                 save_fig=True,
                 **kwargs):
        '''
        - run test performance of an estimator;
        - dump lift curve and ROC curve for test data through self.folder;
        - optionally (self.verbose > 0) dump spreadsheets of calculated data

        test_set:
            2 element tuple (X_test, y_test) or list of them; when None,
            both the test sets and their titles are loaded from the stored
            '.testdata' set, otherwise the given sets are written to
            'data/<n>.testdata'
        title:
            title for test_set indicator (one per test set); defaults to
            '0', '1', ... when None
        use_self_bins, cv, scoring, save_fig:
            offered to self.plot_auc_test / self.plot_lift through
            get_kwargs; cv and scoring are also passed to self.test_score
        kwargs:
            extra keywords for self.plot_auc_test and self.plot_lift

        return
        ----
            series: averaged scoring for each of scoring metrics
        '''
        # snapshot of the call arguments, taken before any other local exists
        call_args = locals().copy()
        for dropped in ('self', 'title'):
            call_args.pop(dropped)
        folder = self.folder

        loaded_from_store = test_set is None
        if loaded_from_store:
            test_set, title = self._get_dataset('.testdata')[0]

        test_set_list = get_flat_list(test_set)
        if title is None:
            title_list = [str(idx) for idx in range(len(test_set_list))]
        else:
            title_list = get_flat_list(title)
        check_consistent_length(test_set_list, title_list)
        # only data supplied by the caller needs to be persisted
        if not loaded_from_store:
            folder.write([test_set_list, title_list],
                         'data/{}.testdata'.format(len(title_list)))

        testscore = []
        for pair, label in zip(test_set_list, title_list):
            X_test, y_test = pair[0], pair[1]

            # ROC curve on the test data
            auc_kws = get_kwargs(self.plot_auc_test, **call_args, **kwargs)
            testcv = self.plot_auc_test(X_test,
                                        y_test,
                                        title=label,
                                        **auc_kws)

            # lift curve on the test data
            lift_kws = get_kwargs(self.plot_lift, **call_args)
            test_lift = self.plot_lift(X_test,
                                       y_test,
                                       title=label,
                                       **lift_kws,
                                       **kwargs)

            # scores, tagged with the title of their test set
            scores = self.test_score(X_test, y_test, cv=cv, scoring=scoring)
            scores['group'] = str(label)
            testscore.append(scores)

            if self.verbose > 0:
                print(
                    'test cv_score & cv_splits test data are being saved... ')
                folder.write(
                    testcv[-1],
                    file='spreadsheet/TestSplits{}.xlsx'.format(label))
                lift = pd.DataFrame() if test_lift is None else test_lift[-1]
                folder.write(
                    [lift, scores],
                    sheet_name=['lift_curve', 'test_score'],
                    file='spreadsheet/TestPerfomance{}.xlsx'.format(label))

        testscore_all = pd.concat(testscore, axis=0, ignore_index=True)
        fig = plotter_score_path(testscore_all, title='score_path')
        if save_fig is True:
            folder.write(fig, 'plots/TestScore_path.pdf')
            plt.close()
        if self.verbose > 0 and len(testscore) > 1:
            folder.write(testscore_all, 'spreadsheet/TestPerformanceAll.xlsx')

        return testscore_all[scoring].mean()
Пример #14
0
    def run_train(self,
                  train_set=None,
                  title='Train',
                  scoring=['roc_auc', 'KS'],
                  q=None,
                  bins=None,
                  max_leaf_nodes=None,
                  fit_params={},
                  cv=3,
                  save_fig=True,
                  **kwargs):
        '''
        - run train performance of an estimator;
        - dump lift curve and ROC curve for train data through self.folder;
        - optionally (self.verbose > 0) dump spreadsheets of calculated data

        train_set:
            2 element tuple, (X, y) of train data; when None the stored
            '.traindata' set is loaded, otherwise the given set is written
            to 'data/0.traindata'
        title:
            suffix used in the spreadsheet file names, 0 when None
        q, bins, max_leaf_nodes:
            the lift curve is only plotted when at least one of them is
            truthy
        cv:
           n of cross validation folds, if cv==1, no cross validation
        fit_params
            -other fit parameters of estimator; also expanded into the
            self.plot_auc_traincv call
        kwargs:
            extra keywords for self.plot_lift and self.cv_validate

        return
        ----
        series: averaged train score for each scoring metrics

        '''
        # snapshot of the call arguments, taken before any other local exists
        call_args = locals().copy()
        call_args.pop('self')
        folder = self.folder

        if title is None:
            title = 0
        if train_set is None:
            train_set = self._get_dataset('.traindata')[0]
        else:
            folder.write(train_set, 'data/0.traindata')

        # training
        X, y = train_set[0], train_set[1]
        auc_kws = get_kwargs(self.plot_auc_traincv, **call_args)
        traincv = self.plot_auc_traincv(X, y, **auc_kws, **fit_params)

        self.fit(X, y, **fit_params)
        if max_leaf_nodes or q or bins:
            lift_kws = get_kwargs(self.plot_lift, **call_args)
            lift = self.plot_lift(X, y, **lift_kws, **kwargs)[-1]
        else:
            lift = pd.DataFrame()

        validate_kws = get_kwargs(self.cv_validate, **call_args)
        cv_score = self.cv_validate(X, y, **validate_kws, **kwargs)

        if self.verbose > 0:
            print('train data & cv_score & cv_splits data are being saved...')
            folder.write([lift, cv_score],
                         'spreadsheet/TrainPerfomance{}.xlsx'.format(title),
                         sheet_name=['liftcurve', 'train_score'])
            folder.write(traincv[-1],
                         'spreadsheet/TrainSplits{}.xlsx'.format(title))

        fig = plotter_score_path(cv_score, title='TrainScore_path')
        if save_fig is True:
            folder.write(fig, 'plots/TrainScore_path.pdf')
            plt.close()
        return cv_score.mean()