def cl_stats(axis, u, name1, name2):
    """Tabulate summary statistics of sample counts per item

    NOTE: the counts are taken from ``spcl``, which is not a parameter --
    it has to be provided by the enclosing (or module) scope.

    Parameters
    ----------
    axis : int
      Axis of ``spcl`` along which the statistics are reduced
    u : sequence
      Items (one per table row) the statistics are reported for
    name1 : str
      Name of what the items in `u` are; used for the first column
    name2 : str
      Name of what is aggregated over; used for the last ('#...') column

    Returns
    -------
    Title line followed by the table rendered with ``table2string``
    """
    label_col = ' ' + name1
    count_col = '#%s' % name2

    reducers = (('mean', np.mean), ('std', np.std),
                ('min', np.min), ('max', np.max))
    columns = dict((key, fx(spcl, axis=axis)) for key, fx in reducers)
    # number of non-zero counts along the axis
    columns[count_col] = np.sum(spcl > 0, axis=axis)

    header = [label_col] + [key for key, _ in reducers] + [count_col]

    def fmt(value):
        # strings are printed verbatim, anything else with 3 significant
        # digits
        template = '%s' if isinstance(value, basestring) else '%.3g'
        return template % value

    rows = [header]
    for idx, label in enumerate(u):
        cells = [label] + [columns[key][idx] for key in header[1:]]
        rows.append([fmt(cell) for cell in cells])

    title = '\nSummary for %s across %s\n' % (name1, name2)
    return title + table2string(rows)
def _compute(self):
    """Compute stats and string representation

    Reads ``self.order`` and ``self._seq``.  Following entries get
    stored via ``self.update()``:

    utargets
      sorted unique entries of the sequence
    cbcounts
      integer array of shape (order, ntargets, ntargets), where
      ``cbcounts[cb, i, j]`` is the number of times target ``i`` was
      followed by target ``j`` exactly ``cb + 1`` positions later
    corrcoef
      array of correlation coefficients between the index-mapped
      sequence and each of its circular shifts by 1..nsamples-1
    sumabscorr
      sum of absolute values of ``corrcoef``

    Textual summary is stored in ``self._str_stats``.
    """
    # Do actual computation
    order = self.order
    seq = list(self._seq)               # assure list
    nsamples = len(seq)                 # # of samples/targets
    utargets = sorted(set(seq))         # unique targets
    ntargets = len(utargets)            # # of targets

    # map every target onto its index among sorted unique targets
    targets_map = dict((l, i) for i, l in enumerate(utargets))
    seqm = [targets_map[l] for l in seq]

    res = dict(utargets=utargets)

    # Estimate counter-balance
    cbcounts = np.zeros((order, ntargets, ntargets), dtype=int)
    for cb in range(order):
        for i, j in zip(seqm[:-(cb+1)], seqm[cb+1:]):
            cbcounts[cb, i, j] += 1
    res['cbcounts'] = cbcounts

    # TODO: compute relative counter-balancing.  Ideally,
    #       npertarget[i]/ntargets should precede each target

    # Autocorrelation for all possible (circular) shifts
    # ??? Use pearsonr with p may be?
    # ??? report high (anti)correlations?
    corr = np.array([np.corrcoef(seqm, seqm[shift:] + seqm[:shift])[0, 1]
                     for shift in range(1, nsamples)])
    res['corrcoef'] = corr
    res['sumabscorr'] = sumabscorr = np.sum(np.abs(corr))
    self.update(res)

    # Assign textual summary
    # XXX move into a helper function and do on demand
    width = ntargets + 1        # columns per order: counts + separator
    t = [[""] * (1 + order * width) for _ in range(ntargets + 1)]
    t[0][0] = "Targets/Order"
    for i, l in enumerate(utargets):
        t[i+1][0] = '%s:' % l
    for cb in range(order):
        first = 1 + cb * width
        t[0][first] = "O%d" % (cb+1)
        for i in range(ntargets + 1):
            t[i][(cb+1) * width] = " | "
        m = cbcounts[cb]
        # counts are integers (never NaN), so every cell gets printed
        for i in range(ntargets):
            for j in range(ntargets):
                t[1+i][first+j] = '%d' % m[i, j]

    sout = "Sequence statistics for %d entries" \
           " from set %s\n" % (nsamples, utargets) + \
           "Counter-balance table for orders up to %d:\n" % order \
           + table2string(t)
    if len(corr):
        sout += "Correlations: min=%.2g max=%.2g mean=%.2g sum(abs)=%.2g" \
                % (min(corr), max(corr), np.mean(corr), sumabscorr)
    else:
        # with fewer than 2 entries there is no shift to correlate with,
        # and min()/max() of an empty sequence would raise ValueError
        sout += "Correlations: n/a (sequence has fewer than 2 entries)"
    self._str_stats = sout
def as_string(self, short=False, header=True, summary=True,
              description=False):
    """'Pretty print' the matrix

    Parameters
    ----------
    short : bool
      if True, ignores the rest of the parameters and provides concise
      1 line summary
    header : bool
      print header of the table
    summary : bool
      print summary (accuracy)
    description : bool
      print verbose description of presented statistics

    Returns
    -------
    str
      "Empty" if there are no sets, the 1 line summary if `short`,
      otherwise the rendered table (with optional description).

    Raises
    ------
    ValueError
      If shape of the matrix does not match the number of labels.
    """
    if len(self.sets) == 0:
        return "Empty"

    self.compute()

    # some shortcuts
    labels = self.__labels
    labels_map_rev = self.__labels_map_rev
    matrix = self.__matrix

    labels_rev = []
    if labels_map_rev is not None:
        labels_rev = [','.join([str(x) for x in labels_map_rev[l]])
                      for l in labels]

    # numbers of different entries
    Nlabels = len(labels)
    Nsamples = self.__Nsamples.astype(int)

    stats = self._stats
    if short:
        return "%(# of sets)d sets %(# of labels)d labels " \
               " ACC:%(ACC).2f" \
               % stats

    Ndigitsmax = int(ceil(log10(max(Nsamples))))
    Nlabelsmax = max([len(str(x)) for x in labels])

    # length of a single label/value
    L = max(Ndigitsmax+2, Nlabelsmax)

    stats_perpredict = ["P'", "N'", 'FP', 'FN', 'PPV', 'NPV', 'TPR',
                        'SPC', 'FDR', 'MCC']
    # print AUC only if ROC was computed
    if self.ROC is not None:
        stats_perpredict += ['AUC']
    stats_pertarget = ['P', 'N', 'TP', 'TN']
    stats_summary = ['ACC', 'ACC%', '# of sets']

    if matrix.shape != (Nlabels, Nlabels):
        # Both literals must be joined *before* '%' is applied: '%' binds
        # tighter than '+', so formatting only the second literal raised
        # TypeError instead of the intended ValueError
        raise ValueError(
            "Number of labels %d doesn't correspond the size"
            " of a confusion matrix %s" % (Nlabels, matrix.shape))

    # list of lists of what is printed
    printed = []
    underscores = [" %s" % ("-" * L)] * Nlabels
    if header:
        # labels
        printed.append(['@l----------. '] + labels_rev)
        printed.append(['@lpredictions\\targets'] + labels)
        # underscores
        printed.append(['@l `------']
                       + underscores + stats_perpredict)

    # matrix itself
    for i, line in enumerate(matrix):
        l = labels[i]
        if labels_rev != []:
            l = '@r%10s / %s' % (labels_rev[i], l)
        printed.append(
            [l] +
            [str(x) for x in line] +
            [_p2(stats[x][i]) for x in stats_perpredict])

    if summary:
        printed.append(['@lPer target:'] + underscores)
        for stat in stats_pertarget:
            printed.append([stat] + [
                _p2(stats[stat][i]) for i in range(Nlabels)])

        # means are taken from precomputed 'mean(<stat>)' entries
        printed.append(['@lSummary \\ Means:'] + underscores
                       + [_p2(stats['mean(%s)' % x])
                          for x in stats_perpredict])

        if 'CHI^2' in self.stats:
            chi2t = stats['CHI^2']
            printed.append(['CHI^2'] + [_p2(chi2t[0])]
                           + ['p:'] + ['%.2g' % chi2t[1]])

        for stat in stats_summary:
            printed.append([stat] + [_p2(stats[stat])])

    # buffer is created only once we know that a table gets rendered
    out = StringIO()
    table2string(printed, out)

    if description:
        out.write("\nStatistics computed in 1-vs-rest fashion per each "
                  "target.\n")
        out.write("Abbreviations (for details see "
                  "http://en.wikipedia.org/wiki/ROC_curve):\n")
        for d, val, eq in self._STATS_DESCRIPTION:
            out.write(" %-3s: %s\n" % (d, val))
            if eq is not None:
                out.write(" " + eq + "\n")

    result = out.getvalue()
    out.close()
    return result
def summary_targets(dataset, targets_attr='targets', chunks_attr='chunks',
                    maxc=30, maxt=20):
    """Describe how samples are distributed over targets and chunks

    Parameters
    ----------
    dataset : `Dataset`
      Dataset to operate on
    targets_attr : str, optional
      Name of the sample attribute holding targets
    chunks_attr : str, optional
      Name of the sample attribute holding chunks -- independent groups
      of samples
    maxc : int
      Details involving chunks are given only if the number of unique
      chunks is below this value
    maxt : int
      Details involving targets are given only if the number of unique
      targets is below this value

    Returns
    -------
    Textual summary: a table of counts (or a note that it was skipped),
    followed by per-target and per-chunk statistics where permitted by
    `maxt` and `maxc`.
    """
    # A plain function is used on purpose: a bound method would be
    # missing if people imported Dataset without miscfx
    counts_table = get_samples_per_chunk_target(
        dataset, targets_attr=targets_attr, chunks_attr=chunks_attr)

    # XXX couldn't they be unordered?
    targets = dataset.sa[targets_attr].unique.tolist()
    chunks = dataset.sa[chunks_attr].unique.tolist()

    few_targets = len(targets) < maxt
    few_chunks = len(chunks) < maxc

    def describe(axis, items, name1, name2):
        """Tabulate mean/std/min/max/non-zero count along `axis`"""
        label_col = ' ' + name1
        count_col = '#%s' % name2
        columns = {
            'mean': np.mean(counts_table, axis=axis),
            'std': np.std(counts_table, axis=axis),
            'min': np.min(counts_table, axis=axis),
            'max': np.max(counts_table, axis=axis),
            count_col: np.sum(counts_table > 0, axis=axis),
        }
        header = [label_col, 'mean', 'std', 'min', 'max', count_col]
        rows = [header]
        for idx, item in enumerate(items):
            cells = [item] + [columns[key][idx] for key in header[1:]]
            # strings verbatim, numbers with 3 significant digits
            rows.append([
                ('%s' if isinstance(cell, basestring) else '%.3g') % cell
                for cell in cells])
        return '\nSummary for %s across %s\n' % (name1, name2) \
               + table2string(rows)

    pieces = []
    if few_targets and few_chunks:
        # only in a reasonable case do printing
        pieces.append("\nCounts of targets in each chunk:")
        rows = [[' %s\\%s' % (chunks_attr, targets_attr)] + targets,
                [''] + ['---'] * len(targets)]
        for chunk, counts in zip(chunks, counts_table):
            rows.append([str(chunk)] + counts.tolist())
        pieces.append('\n' + table2string(rows))
    else:
        pieces.append("No details due to large number of targets or chunks. "
                      "Increase maxc and maxt if desired")

    if few_targets:
        pieces.append(describe(0, targets, targets_attr, chunks_attr))
    if few_chunks:
        pieces.append(describe(1, chunks, chunks_attr, targets_attr))

    return ''.join(pieces)
def as_string(self, short=False, header=True, summary=True,
              description=False):
    """'Pretty print' the statistics

    Parameters
    ----------
    short : bool or 'very'
      if evaluates to True, the rest of the parameters is ignored and
      a 1 line summary is returned; value 'very' selects an alternative
      1 line format
    header : bool
      start the table with a header
    summary : bool
      append summary rows ('# of sets')
    description : bool
      append description of the statistics taken from
      ``self._STATS_DESCRIPTION``

    Returns
    -------
    str
    """
    if len(self.sets) == 0:
        return "Empty"

    self.compute()
    stats = self.stats

    if short == 'very':
        # " RMSE/RMP_t:%(RMSE/RMP_t).2f" \
        template = ("%(# of sets)d sets CCe=%(CCe).2f p=%(CCp).2g"
                    " RMSE:%(RMSE).2f"
                    " Summary (stacked data): "
                    "CCe=%(Summary CCe).2f p=%(Summary CCp).2g")
        return template % stats
    if short:
        template = ("%(# of sets)d sets CCe=%(CCe).2f+-%(CCe_std).3f"
                    " RMSE=%(RMSE).2f+-%(RMSE_std).3f"
                    " RMSE/RMP_t=%(RMSE/RMP_t).2f+-%(RMSE/RMP_t_std).3f")
        return template % stats

    def stat_rows(names):
        """One row per statistic: name, mean, std, min, max"""
        return [[name] + [_p2(stats[name + suffix], 3)
                          for suffix in ('', '_std', '_min', '_max')]
                for name in names]

    rows = []
    if header:
        rows.append(['Statistics', 'Mean', 'Std', 'Min', 'Max'])
        rows.append(['----------', '-----', '-----', '-----', '-----'])

    rows.append(["Data: "])
    rows.extend(stat_rows(['RMP_t', 'STD_t', 'RMP_p', 'STD_p']))

    rows.append(["Results: "])
    # CCp needs tune up of format so excluded
    rows.extend(stat_rows(['CCe', 'RMSE', 'RMSE/RMP_t']))

    rows.append(["Summary: "])
    rows.append(["CCe", _p2(stats['Summary CCe']), "", "p=",
                 '%g' % stats['Summary CCp']])
    rows.append(["RMSE", _p2(stats['Summary RMSE'])])
    rows.append(["RMSE/RMP_t", _p2(stats['Summary RMSE/RMP_t'])])

    if summary:
        for name in ['# of sets']:
            rows.append([name, _p2(stats[name])])

    buf = StringIO()
    table2string(rows, buf)

    if description:
        buf.write("\nDescription of printed statistics.\n"
                  " Suffixes: _t - targets, _p - predictions\n")
        for abbrev, meaning, equation in self._STATS_DESCRIPTION:
            buf.write(" %-3s: %s\n" % (abbrev, meaning))
            if equation is not None:
                buf.write(" " + equation + "\n")

    text = buf.getvalue()
    buf.close()
    return text