def dfinfo(dataframe, asstring=True):
    '''Returns a dataframe with info about the given `dataframe` representing
    a given dataset
    '''
    dinfo = dataset_info(dataframe)
    classes = {c: dinfo.class_selector[c] for c in dinfo.classnames}
    sum_dfs = odict()
    empty_classes = set()
    infocols = ['Min', 'Median', 'Max', '#NAs', '#<1Perc.', '#>99Perc.']
    for classname, class_selector in classes.items():
        sum_df = odict()
        _dfr = dataframe[class_selector(dataframe)]
        if _dfr.empty:
            empty_classes.add(classname)
            continue
        # if _dfr.empty
        for col in floatingcols(dataframe):
            q01 = np.nanquantile(_dfr[col], 0.01)
            q99 = np.nanquantile(_dfr[col], 0.99)
            df1, df99 = _dfr[(_dfr[col] < q01)], _dfr[(_dfr[col] > q99)]
            # segs1 = len(pd.unique(df1[ID_COL]))
            # segs99 = len(pd.unique(df99[ID_COL]))
            # stas1 = len(pd.unique(df1['station_id']))
            # stas99 = len(pd.unique(df99['station_id']))
            sum_df[col] = {
                infocols[0]: np.nanmin(_dfr[col]),
                infocols[1]: np.nanquantile(_dfr[col], 0.5),
                infocols[2]: np.nanmax(_dfr[col]),
                infocols[3]: (~np.isfinite(_dfr[col])).sum(),
                infocols[4]: len(df1),
                infocols[5]: len(df99)
                # columns[5]: stas1 + stas99,
            }
        sum_dfs[classname + " (%d instances)" % len(_dfr)] = \
            pd.DataFrame(data=list(sum_df.values()), columns=infocols,
                         index=list(sum_df.keys()))
    # return df2str(pd.DataFrame(data, columns=columns, index=index))
    if not asstring:
        return pd.concat(sum_dfs.values(), axis=0, keys=sum_dfs.keys(),
                         sort=False)
    allstrs = []
    for (key, val) in sum_dfs.items():
        allstrs.extend(['', key, val.to_string()])
    return '\n'.join(allstrs)

def _dfinfo(dataframe, asstring=True):
    '''Returns a dataframe with info about the given `dataframe`
    '''
    infocols = ['Min', 'Median', 'Max', '#NAs', '#<1Perc.', '#>99Perc.']
    sum_df = odict()
    for col in floatingcols(dataframe):
        q01 = np.nanquantile(dataframe[col], 0.01)
        q99 = np.nanquantile(dataframe[col], 0.99)
        df1, df99 = dataframe[(dataframe[col] < q01)], \
            dataframe[(dataframe[col] > q99)]
        # segs1 = len(pd.unique(df1[ID_COL]))
        # segs99 = len(pd.unique(df99[ID_COL]))
        # stas1 = len(pd.unique(df1['station_id']))
        # stas99 = len(pd.unique(df99['station_id']))
        sum_df[col] = {
            infocols[0]: np.nanmin(dataframe[col]),
            infocols[1]: np.nanquantile(dataframe[col], 0.5),
            infocols[2]: np.nanmax(dataframe[col]),
            infocols[3]: (~np.isfinite(dataframe[col])).sum(),
            infocols[4]: len(df1),
            infocols[5]: len(df99)
            # columns[5]: stas1 + stas99,
        }
    return pd.DataFrame(data=list(sum_df.values()), columns=infocols,
                        index=list(sum_df.keys()))

def _dfinfo(dataframe):
    '''Returns a dataframe with statistical info about the given `dataframe`
    '''
    infocols = ['Min', 'Median', 'Max', '#NAs', '#<1Perc.', '#>99Perc.']
    defaultcolvalues = [np.nan, np.nan, np.nan, 0, 0, 0]
    sum_df = odict()
    for col in floatingcols(dataframe):
        colvalues = defaultcolvalues
        if not dataframe.empty:
            q01 = np.nanquantile(dataframe[col], 0.01)
            q99 = np.nanquantile(dataframe[col], 0.99)
            df1, df99 = dataframe[(dataframe[col] < q01)], \
                dataframe[(dataframe[col] > q99)]
            colvalues = [
                np.nanmin(dataframe[col]),             # Min
                np.nanquantile(dataframe[col], 0.5),   # Median
                np.nanmax(dataframe[col]),             # Max
                (~np.isfinite(dataframe[col])).sum(),  # #NAs
                len(df1),                              # #<1Perc.
                len(df99)                              # #>99Perc.
            ]
        sum_df[col] = {i: v for i, v in zip(infocols, colvalues)}
    return pd.DataFrame(data=list(sum_df.values()), columns=infocols,
                        index=list(sum_df.keys()))

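# Illustrative sketch (not part of the original module): assuming `floatingcols`
# yields the floating-point columns of `dataframe` and `odict` is
# collections.OrderedDict, `_dfinfo` returns one row per float column and one
# column per statistic. The column names 'delta_pga' / 'delta_psd' below are
# hypothetical:
#
#     >>> _dfinfo(df)
#                Min  Median   Max  #NAs  #<1Perc.  #>99Perc.
#     delta_pga  ...     ...   ...   ...       ...        ...
#     delta_psd  ...     ...   ...   ...       ...        ...
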
def dfinfo(dataframe, asstring=True):
    '''Returns a dataframe with info about the given `dataframe` representing
    a given dataset (if asstring=False) or, if asstring=True, a string
    representation of that info.
    '''
    classnames = CLASSNAMES
    class_selectors = CLASS_SELECTORS
    sum_dfs = odict()
    for classname, class_selector in zip(classnames, class_selectors):
        _dfr = dataframe[class_selector(dataframe)]
        title = "Class '%s': %d of %d instances" % (classname, len(_dfr),
                                                    len(dataframe))
        sum_dfs[title] = _dfinfo(_dfr)
    # return a MultiIndex DataFrame:
    if not asstring:
        return pd.concat(sum_dfs.values(), axis=0, keys=sum_dfs.keys(),
                         sort=False)
    allstrs = []
    for (key, val) in sum_dfs.items():
        allstrs.extend(['', key, '-' * len(key), val.to_string()])
    return '\n'.join(allstrs)

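# Illustrative usage sketch (assumption: `CLASSNAMES` / `CLASS_SELECTORS` are
# module-level globals pairing each class name with a boolean selector
# function; the dataset path below is hypothetical):
#
#     dataframe = pd.read_hdf('dataset.hdf_')
#     print(dfinfo(dataframe))                   # one text table per class
#     stats = dfinfo(dataframe, asstring=False)  # MultiIndex (class, column) rows
#     medians = stats['Median']                  # e.g. per-class medians
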
def read_all_pred_dfs():
    '''dict of prediction dataframe paths -> modification time (st_mtime)'''
    ret = []
    for fle in listdir(tmpdir):
        _ = join(tmpdir, fle, 'allset_test.hdf_')
        if isfile(_):
            ret.append(_)
    ret2 = odict()
    for _ in sorted(ret):
        ret2[_] = stat(_).st_mtime
    return ret2

def model_params(model_filename):
    '''Converts the given model_filename (or absolute path) into a dict of
    key -> tuple of **strings** (single-value parameters will be mapped to a
    1-element tuple)
    '''
    pth = basename(model_filename)
    pth_, ext = splitext(pth)
    if ext == '.sklmodel':
        pth = pth_
    pars = pth.split('&')
    ret = odict()
    for par in pars:
        key, val = par.split('=')
        ret[uq(key)] = tuple(uq(_) for _ in val.split(','))
    return ret

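# Illustrative example (assumption: `uq` is a URL-unquoting function such as
# urllib.parse.unquote; the classifier and feature names below are
# hypothetical):
#
#     >>> model_params('clf=OneClassSVM&tr_set=uniform&feats=delta_pga,delta_psd.sklmodel')
#     OrderedDict([('clf', ('OneClassSVM',)),
#                  ('tr_set', ('uniform',)),
#                  ('feats', ('delta_pga', 'delta_psd'))])
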
def set_classifiers_paths(self, classifiers_paths):
    '''Builds, from this object's parameters, the arguments to be passed to
    `_create_save_classifier` and stores them in `self.classifiers_paths`.
    Skips classifiers whose prediction dataframe already exists.
    See `run_evaluation` for details
    '''
    ret = odict()
    for clfpath in classifiers_paths:
        outdir = splitext(clfpath)[0]
        destpath = join(outdir, basename(self.testset_filepath))
        if isfile(destpath):
            continue
        elif not isdir(dirname(destpath)):
            makedirs(dirname(destpath))
            if not isdir(dirname(destpath)):
                continue
        feats = self.model_params(clfpath)['feats']
        ret[clfpath] = (destpath, feats)
    self.classifiers_paths = ret

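# Illustrative example (hypothetical paths): with
# self.testset_filepath = '/data/allset_test.hdf_' and
# clfpath = '/models/clf=OneClassSVM&feats=delta_psd.sklmodel', the prediction
# dataframe is expected at
# '/models/clf=OneClassSVM&feats=delta_psd/allset_test.hdf_';
# if that file already exists, the classifier is skipped.
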
def model_filename(clf_class, tr_set, *features, **clf_params):
    '''Converts the given arguments to a model filename, with extension
    .sklmodel'''
    pars = odict()
    pars['clf'] = [str(clf_class.__name__)]
    pars['tr_set'] = [
        str(_) for _ in (tr_set if isinstance(tr_set, (list, tuple))
                         else [tr_set])
    ]
    pars['feats'] = [str(_) for _ in sorted(features)]
    for key in sorted(clf_params):
        val = clf_params[key]
        pars[q(key, safe=_safe)] = [
            str(_) for _ in (val if isinstance(val, (list, tuple)) else [val])
        ]
    return '&'.join("%s=%s" % (k, ','.join(q(_, safe=_safe) for _ in v))
                    for k, v in pars.items()) + '.sklmodel'

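# Illustrative round-trip sketch (assumptions: `q` / `uq` wrap URL quoting and
# unquoting, and `_safe` is the set of characters left unescaped; classifier,
# training-set label and feature names are hypothetical):
#
#     >>> from sklearn.svm import OneClassSVM
#     >>> name = model_filename(OneClassSVM, 'uniform', 'delta_psd', 'delta_pga',
#     ...                       nu=[0.1, 0.5])
#     >>> name
#     'clf=OneClassSVM&tr_set=uniform&feats=delta_pga,delta_psd&nu=0.1,0.5.sklmodel'
#     >>> model_params(name)['nu']
#     ('0.1', '0.5')
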
def _get_summary_evaluationmetrics_mp(clfdir_and_testname):
    '''Computes evaluation metrics for the prediction dataframe stored under
    the given (classifier directory, test file name) pair. Returns the tuple
    `(clfdir, testname, result)`, where `result` is a dict of model parameters
    and metric values, or the Exception raised while computing them
    '''
    clfdir, testname = clfdir_and_testname
    filepath = join(clfdir, testname)
    predicted_df = pd.read_hdf(filepath, columns=[OUTLIER_COL, PREDICT_COL])
    cols = tuple(_METRIC_COLUMNS)
    try:
        # parse the clf file name (which is the directory name of the
        # prediction dataframe we want to calculate metrics from), and
        # try to guess whether its values can be floats or ints:
        ret = odict()
        for key, value in TestParam.model_params(clfdir).items():
            # value is a tuple. First thing is to define how to store it
            # in a pandas DataFrame. Use its string method without brackets
            # (so that e.g., ['a', 'b'] will be stored as 'a,b' and
            # ['abc'] will be stored as 'abc'):
            stored_value = ",".join(str(_) for _ in value)
            if len(value) == 1:
                try:
                    stored_value = int(value[0])
                except (ValueError, TypeError):
                    try:
                        stored_value = float(value[0])
                    except (ValueError, TypeError):
                        pass
            ret[key] = stored_value
        ret[cols[0]] = log_loss(predicted_df)
        ret[cols[1]] = roc_auc_score(predicted_df)
        ret[cols[2]] = average_precision_score(predicted_df)
        ret[cols[3]] = roc_curve(predicted_df)[-1]
        ret[cols[4]] = precision_recall_curve(predicted_df)[-1]
        return clfdir, testname, ret
    except Exception as exc:
        return clfdir, testname, exc

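# Illustrative usage sketch (assumption, suggested by the `_mp` suffix: this
# function is the worker of a multiprocessing Pool; `clf_directories` below is
# a hypothetical list of classifier directories):
#
#     from multiprocessing import Pool
#
#     args = [(clfdir, 'allset_test.hdf_') for clfdir in clf_directories]
#     with Pool() as pool:
#         for clfdir, testname, result in \
#                 pool.imap_unordered(_get_summary_evaluationmetrics_mp, args):
#             if isinstance(result, Exception):
#                 pass  # e.g. log the error and skip this classifier
#             else:
#                 pass  # `result` maps model params and metric names to values
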