示例#1
0
文件: dataset.py 项目: rizac/sod
def dfinfo(dataframe, asstring=True):
    '''Returns info about the given `dataframe` representing a dataset:
    for each class defined by `dataset_info(dataframe)`, per-column
    statistics (min, median, max, number of NAs, counts below the 1st and
    above the 99th percentile) of all floating-point columns.

    :param dataframe: the pandas DataFrame representing a given dataset
    :param asstring: if True (the default) return a human-readable string,
        otherwise return a DataFrame concatenating one statistics frame per
        non-empty class
    '''
    dinfo = dataset_info(dataframe)
    classes = {c: dinfo.class_selector[c] for c in dinfo.classnames}
    sum_dfs = odict()
    infocols = ['Min', 'Median', 'Max', '#NAs', '#<1Perc.', '#>99Perc.']
    for classname, class_selector in classes.items():
        _dfr = dataframe[class_selector(dataframe)]
        if _dfr.empty:
            # nothing to summarize for this class: skip it
            continue
        sum_df = odict()
        for col in floatingcols(dataframe):
            q01 = np.nanquantile(_dfr[col], 0.01)
            q99 = np.nanquantile(_dfr[col], 0.99)
            df1 = _dfr[_dfr[col] < q01]
            df99 = _dfr[_dfr[col] > q99]
            sum_df[col] = {
                infocols[0]: np.nanmin(_dfr[col]),
                infocols[1]: np.nanquantile(_dfr[col], 0.5),
                infocols[2]: np.nanmax(_dfr[col]),
                # counts NaNs and +/-inf alike:
                infocols[3]: (~np.isfinite(_dfr[col])).sum(),
                infocols[4]: len(df1),
                infocols[5]: len(df99)
            }
        sum_dfs[classname + " (%d instances)" % len(_dfr)] = \
            pd.DataFrame(data=list(sum_df.values()),
                         columns=infocols,
                         index=list(sum_df.keys()))

    if not asstring:
        return pd.concat(sum_dfs.values(), axis=0, keys=sum_dfs.keys(),
                         sort=False)

    allstrs = []
    for (key, val) in sum_dfs.items():
        allstrs.extend(['', key, val.to_string()])
    return '\n'.join(allstrs)
示例#2
0
文件: dataset.py 项目: rizac/sod
def _dfinfo(dataframe, asstring=True):
    '''Returns a dataframe with statistical info about the given `dataframe`:
    for each floating-point column, its min, median, max, number of NAs and
    the number of values below the 1st / above the 99th percentile.

    :param dataframe: the source pandas DataFrame
    :param asstring: unused, kept for backward compatibility
    '''
    infocols = ['Min', 'Median', 'Max', '#NAs', '#<1Perc.', '#>99Perc.']
    # default row used for an empty dataframe, where quantiles/min/max
    # would otherwise raise or warn on the empty underlying arrays:
    defaultvalues = [np.nan, np.nan, np.nan, 0, 0, 0]
    sum_df = odict()
    for col in floatingcols(dataframe):
        colvalues = defaultvalues
        if not dataframe.empty:
            q01 = np.nanquantile(dataframe[col], 0.01)
            q99 = np.nanquantile(dataframe[col], 0.99)
            df1 = dataframe[dataframe[col] < q01]
            df99 = dataframe[dataframe[col] > q99]
            colvalues = [
                np.nanmin(dataframe[col]),
                np.nanquantile(dataframe[col], 0.5),
                np.nanmax(dataframe[col]),
                # counts NaNs and +/-inf alike:
                (~np.isfinite(dataframe[col])).sum(),
                len(df1),
                len(df99)
            ]
        sum_df[col] = dict(zip(infocols, colvalues))
    return pd.DataFrame(data=list(sum_df.values()),
                        columns=infocols,
                        index=list(sum_df.keys()))
示例#3
0
文件: dataset.py 项目: rizac/sod
def _dfinfo(dataframe):
    '''Returns a dataframe with statistical info about the given `dataframe`:
    for each floating-point column, its min, median, max, number of NAs and
    the number of values below the 1st / above the 99th percentile.
    '''
    infocols = ['Min', 'Median', 'Max', '#NAs', '#<1Perc.', '#>99Perc.']
    rows = odict()
    for col in floatingcols(dataframe):
        if dataframe.empty:
            # no data: NaN statistics and zero counts
            colvalues = [np.nan, np.nan, np.nan, 0, 0, 0]
        else:
            q01 = np.nanquantile(dataframe[col], 0.01)
            q99 = np.nanquantile(dataframe[col], 0.99)
            below = dataframe[(dataframe[col] < q01)]
            above = dataframe[(dataframe[col] > q99)]
            colvalues = [
                np.nanmin(dataframe[col]),  # Min
                np.nanquantile(dataframe[col], 0.5),  # Median
                np.nanmax(dataframe[col]),  # Max
                (~np.isfinite(dataframe[col])).sum(),  # #NAs (NaN or inf)
                len(below),  # #<1Perc.
                len(above)  # #>99Perc.
            ]
        rows[col] = dict(zip(infocols, colvalues))

    return pd.DataFrame(data=list(rows.values()),
                        columns=infocols,
                        index=list(rows.keys()))
示例#4
0
文件: dataset.py 项目: rizac/sod
def dfinfo(dataframe, asstring=True):
    '''Returns info about the given `dataframe` representing a given
    dataset: one statistics sub-dataframe per class (see `_dfinfo`),
    either as a MultiIndex DataFrame (asstring=False) or as a printable
    string (asstring=True, the default).
    '''
    sum_dfs = odict()
    for classname, class_selector in zip(CLASSNAMES, CLASS_SELECTORS):
        class_df = dataframe[class_selector(dataframe)]
        title = "Class '%s': %d of %d instances" % (classname, len(class_df),
                                                    len(dataframe))
        sum_dfs[title] = _dfinfo(class_df)

    if not asstring:
        # MultiIndex DataFrame (first level: class title):
        return pd.concat(sum_dfs.values(),
                         axis=0,
                         keys=sum_dfs.keys(),
                         sort=False)

    chunks = []
    for title, stats_df in sum_dfs.items():
        chunks.extend(['', title, '-' * len(title), stats_df.to_string()])
    return '\n'.join(chunks)
示例#5
0
 def read_all_pred_dfs():
     '''Returns a dict mapping each prediction dataframe path (files named
     'allset_test.hdf_' inside sub-directories of `tmpdir`) to its last
     modification time, with keys in sorted path order.
     '''
     candidates = (join(tmpdir, name, 'allset_test.hdf_')
                   for name in listdir(tmpdir))
     existing = [path for path in candidates if isfile(path)]
     result = odict()
     for path in sorted(existing):
         result[path] = stat(path).st_mtime
     return result
示例#6
0
文件: evaluation.py 项目: rizac/sod
 def model_params(model_filename):
     '''Converts the given model_filename (or absolute path) into a dict
     of key -> tuple of **strings** (a single-valued parameter is mapped
     to a 1-element tuple). The file name is expected to be a '&'-joined
     sequence of percent-encoded 'key=val1,val2,...' chunks, optionally
     with a '.sklmodel' extension.
     '''
     name = basename(model_filename)
     root, ext = splitext(name)
     if ext == '.sklmodel':
         # strip the model extension before parsing:
         name = root
     params = odict()
     for chunk in name.split('&'):
         key, val = chunk.split('=')
         params[uq(key)] = tuple(uq(item) for item in val.split(','))
     return params
示例#7
0
文件: evaluation.py 项目: rizac/sod
 def set_classifiers_paths(self, classifiers_paths):
     '''Computes from this object's parameters the arguments to be passed
     to `_create_save_classifier` and stores them in
     `self.classifiers_paths` as a dict of
     classifier path -> (destination path, features).
     Classifiers whose prediction dataframe already exists are skipped.
     See `run_evaluation` for details
     '''
     mapping = odict()
     for clf_path in classifiers_paths:
         dest_path = join(splitext(clf_path)[0],
                          basename(self.testset_filepath))
         if isfile(dest_path):
             # prediction dataframe already computed: skip
             continue
         if not isdir(dirname(dest_path)):
             makedirs(dirname(dest_path))
         if not isdir(dirname(dest_path)):
             # destination directory could not be created: skip
             continue
         mapping[clf_path] = (dest_path,
                              self.model_params(clf_path)['feats'])
     self.classifiers_paths = mapping
示例#8
0
文件: evaluation.py 项目: rizac/sod
    def model_filename(clf_class, tr_set, *features, **clf_params):
        '''converts the given argument to a model filename, with extension
        .sklmodel. The name is a '&'-joined sequence of
        'key=val1,val2,...' chunks with keys and values percent-encoded.
        '''
        def aslist(value):
            # normalize a scalar or list/tuple into a list of strings:
            if isinstance(value, (list, tuple)):
                return [str(item) for item in value]
            return [str(value)]

        pars = odict()
        pars['clf'] = [str(clf_class.__name__)]
        pars['tr_set'] = aslist(tr_set)
        pars['feats'] = [str(feat) for feat in sorted(features)]
        for key in sorted(clf_params):
            pars[q(key, safe=_safe)] = aslist(clf_params[key])

        chunks = ("%s=%s" % (key, ','.join(q(val, safe=_safe)
                                           for val in vals))
                  for key, vals in pars.items())
        return '&'.join(chunks) + '.sklmodel'
示例#9
0
文件: evaluation.py 项目: rizac/sod
def _get_summary_evaluationmetrics_mp(clfdir_and_testname):
    '''Computes the summary evaluation metrics of a single prediction
    dataframe (presumably a worker function for a process pool, given the
    `_mp` suffix — errors are returned, not raised).

    :param clfdir_and_testname: tuple (classifier directory, test file name)
    :return: tuple (clfdir, testname, x) where x is either an ordered dict
        of model parameters + computed metrics, or the Exception raised
        while computing them
    '''
    clfdir, testname = clfdir_and_testname
    predicted_df = pd.read_hdf(join(clfdir, testname),
                               columns=[OUTLIER_COL, PREDICT_COL])
    cols = tuple(_METRIC_COLUMNS)

    def storable(value):
        # `value` is a tuple of strings. A 1-element tuple is coerced to
        # int, then float, if possible; otherwise (and for multi-element
        # tuples) elements are joined without brackets, so that e.g.
        # ['a', 'b'] is stored as 'a,b' and ['abc'] as 'abc':
        if len(value) == 1:
            for cast in (int, float):
                try:
                    return cast(value[0])
                except (ValueError, TypeError):
                    pass
        return ",".join(str(item) for item in value)

    try:
        # parse the clf file name (which is the directory name of the
        # prediction dataframe we want to calculate metrics from):
        ret = odict()
        for key, value in TestParam.model_params(clfdir).items():
            ret[key] = storable(value)

        ret[cols[0]] = log_loss(predicted_df)
        ret[cols[1]] = roc_auc_score(predicted_df)
        ret[cols[2]] = average_precision_score(predicted_df)
        ret[cols[3]] = roc_curve(predicted_df)[-1]
        ret[cols[4]] = precision_recall_curve(predicted_df)[-1]

        return clfdir, testname, ret

    except Exception as exc:
        return clfdir, testname, exc