# Example 1
def _dfinfo(dataframe):
    '''Returns a dataframe summarizing the floating-point columns of the
    given `dataframe`: min, median, max, number of NAs (non-finite values)
    and number of values below the 1st / above the 99th percentile.
    '''
    stat_names = ['Min', 'Median', 'Max', '#NAs', '#<1Perc.', '#>99Perc.']
    stats = odict()
    for col in floatingcols(dataframe):
        if dataframe.empty:
            # no rows: numeric stats are undefined, counts are zero
            row = [np.nan, np.nan, np.nan, 0, 0, 0]
        else:
            series = dataframe[col]
            low = np.nanquantile(series, 0.01)
            high = np.nanquantile(series, 0.99)
            row = [
                np.nanmin(series),                # Min
                np.nanquantile(series, 0.5),      # Median
                np.nanmax(series),                # Max
                (~np.isfinite(series)).sum(),     # #NAs
                len(dataframe[series < low]),     # #<1Perc.
                len(dataframe[series > high])     # #>99Perc.
            ]
        stats[col] = dict(zip(stat_names, row))

    return pd.DataFrame(data=list(stats.values()),
                        columns=stat_names,
                        index=list(stats.keys()))
# Example 2
def dfinfo(dataframe, asstring=True):
    '''Returns a dataframe with info about the given `dataframe` representing
    a given dataset (if asstring=False) or, if asstring=True (the default),
    a string representation of that dataframe.
    '''
    summaries = odict()
    for name, selector in zip(CLASSNAMES, CLASS_SELECTORS):
        subset = dataframe[selector(dataframe)]
        header = "Class '%s': %d of %d instances" % (name, len(subset),
                                                     len(dataframe))
        summaries[header] = _dfinfo(subset)

    if not asstring:
        # stack the per-class tables under a MultiIndex keyed by header
        return pd.concat(summaries.values(),
                         axis=0,
                         keys=summaries.keys(),
                         sort=False)

    chunks = []
    for header, table in summaries.items():
        chunks += ['', header, '-' * len(header), table.to_string()]
    return '\n'.join(chunks)
# Example 3
 def model_params(model_filename):
     '''Converts the given model_filename (or absolute path) into a dict
     of key -> tuple of **strings** (single values parameters will be mapped
     to a 1-element tuple)

     :param model_filename: a file name (or path) of the form
         "key1=v1,v2&key2=v3[.sklmodel]" (see `model_filename`)
     '''
     pth = basename(model_filename)
     pth_, ext = splitext(pth)
     if ext == '.sklmodel':
         pth = pth_
     ret = odict()
     for par in pth.split('&'):
         # split on the first '=' only, so a value containing a literal
         # '=' does not make the 2-element unpacking raise ValueError:
         key, val = par.split('=', 1)
         ret[uq(key)] = tuple(uq(_) for _ in val.split(','))
     return ret
# Example 4
 def set_classifiers_paths(self, classifiers_paths):
     '''Builds from this object's parameters a dict of N arguments to be
     passed to `_create_save_classifier`, and stores it in
     `self.classifiers_paths` (mapping each classifier path to a tuple
     `(destpath, feats)`).
     Skips classifiers whose prediction dataframe already exists.
     See `run_evaluation` for details
     '''
     ret = odict()
     for clfpath in classifiers_paths:
         outdir = splitext(clfpath)[0]
         destpath = join(outdir, basename(self.testset_filepath))
         if isfile(destpath):
             # prediction dataframe already computed: skip this classifier
             continue
         destdir = dirname(destpath)
         if not isdir(destdir):
             # makedirs raises on failure, so (unlike the original code)
             # no redundant existence re-check is needed afterwards
             makedirs(destdir)
         feats = self.model_params(clfpath)['feats']
         ret[clfpath] = (destpath, feats)
     self.classifiers_paths = ret
# Example 5
    def model_filename(clf_class, tr_set, *features, **clf_params):
        '''converts the given argument to a model filename, with extension
        .sklmodel'''
        def aslist(value):
            # normalize a scalar to a 1-element list; keep lists/tuples as-is
            return value if isinstance(value, (list, tuple)) else [value]

        pars = odict()
        pars['clf'] = [str(clf_class.__name__)]
        pars['tr_set'] = [str(item) for item in aslist(tr_set)]
        pars['feats'] = [str(item) for item in sorted(features)]
        for key in sorted(clf_params):
            pars[q(key, safe=_safe)] = [str(item)
                                        for item in aslist(clf_params[key])]

        chunks = []
        for key, values in pars.items():
            chunks.append("%s=%s" % (key,
                                     ','.join(q(item, safe=_safe)
                                              for item in values)))
        return '&'.join(chunks) + '.sklmodel'
# Example 6
def _get_summary_evaluationmetrics_mp(clfdir_and_testname):
    '''Computes the summary evaluation metrics of a single prediction
    dataframe (worker function, presumably run in a process pool — note
    that exceptions are returned, not raised, so callers can inspect
    per-item failures).

    :param clfdir_and_testname: tuple `(clfdir, testname)`: the classifier
        directory and the prediction HDF file name inside it

    :return: the tuple `(clfdir, testname, result)` where `result` is an
        ordered dict of classifier parameters and metrics, or the caught
        Exception if any step failed
    '''
    clfdir, testname = clfdir_and_testname
    predicted_df = pd.read_hdf(join(clfdir, testname),
                               columns=[OUTLIER_COL, PREDICT_COL])
    metric_names = tuple(_METRIC_COLUMNS)
    try:
        # parse the clf file name (which is the directory name of the
        # prediction dataframe we want to calculate metrics from), and
        # try to guess if the values can be floats or ints:
        ret = odict()
        for key, value in TestParam.model_params(clfdir).items():
            # `value` is a tuple: store it in the DataFrame as a
            # comma-separated string without brackets, e.g.
            # ['a', 'b'] -> 'a,b' and ['abc'] -> 'abc':
            stored = ",".join(str(_) for _ in value)
            if len(value) == 1:
                # single value: prefer int, then float, else keep the string
                for cast in (int, float):
                    try:
                        stored = cast(value[0])
                        break
                    except (ValueError, TypeError):
                        pass
            ret[key] = stored

        ret[metric_names[0]] = log_loss(predicted_df)
        ret[metric_names[1]] = roc_auc_score(predicted_df)
        ret[metric_names[2]] = average_precision_score(predicted_df)
        ret[metric_names[3]] = roc_curve(predicted_df)[-1]
        ret[metric_names[4]] = precision_recall_curve(predicted_df)[-1]

        return clfdir, testname, ret

    except Exception as exc:  # noqa: deliberate broad catch (see docstring)
        return clfdir, testname, exc