def _dfinfo(dataframe):
    '''Returns a dataframe of summary statistics (min, median, max, number
    of NAs, and counts below the 1st / above the 99th percentile) for each
    floating-point column of the given `dataframe`
    '''
    stat_names = ['Min', 'Median', 'Max', '#NAs', '#<1Perc.', '#>99Perc.']
    empty_stats = [np.nan, np.nan, np.nan, 0, 0, 0]
    stats = odict()
    for col in floatingcols(dataframe):
        if dataframe.empty:
            stat_values = empty_stats
        else:
            series = dataframe[col]
            low_q = np.nanquantile(series, 0.01)
            high_q = np.nanquantile(series, 0.99)
            stat_values = [
                np.nanmin(series),                # Min
                np.nanquantile(series, 0.5),      # Median
                np.nanmax(series),                # Max
                (~np.isfinite(series)).sum(),     # #NAs (NaN or +-inf)
                int((series < low_q).sum()),      # #<1Perc.
                int((series > high_q).sum())      # #>99Perc.
            ]
        stats[col] = dict(zip(stat_names, stat_values))
    return pd.DataFrame(data=list(stats.values()), columns=stat_names,
                        index=list(stats.keys()))
def dfinfo(dataframe, asstring=True):
    '''Returns a a dataframe with info about the given `dataframe`
    representing a given dataset (if asstring=False) or, if asstring=True,
    a string representing the dataframe.
    '''
    summaries = odict()
    for classname, selector in zip(CLASSNAMES, CLASS_SELECTORS):
        class_df = dataframe[selector(dataframe)]
        header = "Class '%s': %d of %d instances" % (classname, len(class_df),
                                                    len(dataframe))
        summaries[header] = _dfinfo(class_df)
    if not asstring:
        # return a MultiIndex DataFrame:
        return pd.concat(summaries.values(), axis=0, keys=summaries.keys(),
                         sort=False)
    chunks = []
    for header, stats_df in summaries.items():
        chunks += ['', header, '-' * len(header), stats_df.to_string()]
    return '\n'.join(chunks)
def model_params(model_filename):
    '''Converts the given model_filename (or absolute path) into a dict of
    key -> tuple of **strings** (single values parameters will be mapped
    to a 1-element tuple)
    '''
    name = basename(model_filename)
    stem, ext = splitext(name)
    if ext == '.sklmodel':
        # drop the model extension, keep only the encoded parameters:
        name = stem
    params = odict()
    for chunk in name.split('&'):
        key, value = chunk.split('=')
        # values are comma-separated, URL-quoted strings:
        params[uq(key)] = tuple(uq(item) for item in value.split(','))
    return params
def set_classifiers_paths(self, classifiers_paths):
    '''Scans `classifiers_paths` and stores in `self.classifiers_paths`
    an odict mapping each classifier file path to the 2-tuple
    `(destpath, feats)` to be passed to `_create_save_classifier`, where
    `destpath` is the path of the prediction dataframe for this object's
    test set and `feats` is the classifier's features (parsed from the
    classifier file name). Classifiers whose prediction dataframe already
    exists are skipped. See `run_evaluation` for details.

    (Note: this method returns None; previous doc wrongly stated it
    returned a list.)

    :param classifiers_paths: iterable of classifier file paths
        (``.sklmodel`` files)
    '''
    ret = odict()
    for clfpath in classifiers_paths:
        # predictions live in a directory named after the classifier
        # file without its extension:
        outdir = splitext(clfpath)[0]
        destpath = join(outdir, basename(self.testset_filepath))
        if isfile(destpath):
            # prediction dataframe already computed: skip classifier
            continue
        if not isdir(dirname(destpath)):
            makedirs(dirname(destpath))
        if not isdir(dirname(destpath)):
            # defensive: directory still missing, skip classifier
            continue
        feats = self.model_params(clfpath)['feats']
        ret[clfpath] = (destpath, feats)
    self.classifiers_paths = ret
def model_filename(clf_class, tr_set, *features, **clf_params):
    '''converts the given argument to a model filename, with extension
    .sklmodel'''
    def _aslist(value):
        # normalize scalars to a 1-element list and stringify each item:
        return [str(item) for item in
                (value if isinstance(value, (list, tuple)) else [value])]

    pars = odict()
    pars['clf'] = [str(clf_class.__name__)]
    pars['tr_set'] = _aslist(tr_set)
    pars['feats'] = [str(item) for item in sorted(features)]
    for key in sorted(clf_params):
        # classifier-parameter keys are URL-quoted; values are quoted
        # below when joining:
        pars[q(key, safe=_safe)] = _aslist(clf_params[key])
    chunks = ("%s=%s" % (key, ','.join(q(item, safe=_safe) for item in values))
              for key, values in pars.items())
    return '&'.join(chunks) + '.sklmodel'
def _get_summary_evaluationmetrics_mp(clfdir_and_testname):
    '''Worker function computing evaluation metrics for a prediction
    dataframe. Takes a single argument (multiprocessing-pool friendly)
    and never raises: failures are returned as the Exception object so
    the caller can handle per-task errors.

    :param clfdir_and_testname: tuple ``(clfdir, testname)`` where
        `clfdir` is the classifier directory (its name encodes the model
        parameters) and `testname` the prediction HDF file name inside it

    :return: the tuple ``(clfdir, testname, result)`` where `result` is
        an odict of model parameters and evaluation metrics, or the
        caught Exception if any step failed
    '''
    clfdir, testname = clfdir_and_testname
    try:
        # read the dataframe inside the try, so that I/O or parsing
        # errors are returned to the caller like any other failure
        # instead of propagating out of the worker (fixes previous code
        # where read_hdf was executed before the try block):
        filepath = join(clfdir, testname)
        predicted_df = pd.read_hdf(filepath, columns=[OUTLIER_COL, PREDICT_COL])
        cols = tuple(_METRIC_COLUMNS)
        # parse the clf file name (which is the directory name of the
        # prediction dataframe we want to calculate metrics from), and
        # try to guess if they can be floats or ints:
        ret = odict()
        for key, value in TestParam.model_params(clfdir).items():
            # value is a tuple. First thing is to define how to store it
            # in a pandas DataFrame. Use its string method without brackets
            # (so that e.g., ['a', 'b'] will be stored as 'a,b' and
            # ['abc'] will be stored as 'abc':
            stored_value = ",".join(str(_) for _ in value)
            if len(value) == 1:
                try:
                    stored_value = int(value[0])
                except (ValueError, TypeError):
                    try:
                        stored_value = float(value[0])
                    except (ValueError, TypeError):
                        pass
            ret[key] = stored_value
        ret[cols[0]] = log_loss(predicted_df)
        ret[cols[1]] = roc_auc_score(predicted_df)
        ret[cols[2]] = average_precision_score(predicted_df)
        ret[cols[3]] = roc_curve(predicted_df)[-1]
        ret[cols[4]] = precision_recall_curve(predicted_df)[-1]
        return clfdir, testname, ret
    except Exception as exc:
        return clfdir, testname, exc