def summary_targets(dataset, targets_attr="targets", chunks_attr="chunks", maxc=30, maxt=20):
    """Provide summary statistics over the targets and chunks

    Parameters
    ----------
    dataset : `Dataset`
      Dataset to operate on
    targets_attr : str, optional
      Name of sample attributes of targets
    chunks_attr : str, optional
      Name of sample attributes of chunks -- independent groups of samples
    maxc : int
      Maximal number of chunks when provide details
    maxt : int
      Maximal number of targets when provide details

    Returns
    -------
    str
      Textual summary.  The chunk-by-target table of counts is included
      only if there are fewer than `maxt` unique targets and fewer than
      `maxc` unique chunks; the per-target and per-chunk statistics
      tables are included independently under the same respective limits.
    """
    # We better avoid bound function since if people only
    # imported Dataset without miscfx it would fail
    spcl = get_samples_per_chunk_target(
        dataset, targets_attr=targets_attr, chunks_attr=chunks_attr)
    # XXX couldn't they be unordered?
    ul = dataset.sa[targets_attr].unique.tolist()
    uc = dataset.sa[chunks_attr].unique.tolist()
    s = ""
    if len(ul) < maxt and len(uc) < maxc:
        s += "\nCounts of targets in each chunk:"
        # only in a reasonable case do printing
        # rows of spcl are paired with chunks, columns fall under the targets
        table = [[" %s\\%s" % (chunks_attr, targets_attr)] + ul]
        table += [[""] + ["---"] * len(ul)]
        for c, counts in zip(uc, spcl):
            table.append([str(c)] + counts.tolist())
        s += "\n" + table2string(table)
    else:
        s += ("No details due to large number of targets or chunks. "
              "Increase maxc and maxt if desired")

    def cl_stats(axis, u, name1, name2):
        """Compute statistics per target (axis=0) or per chunk (axis=1)

        `u` are the labels of the items being summarized, `name1` names
        those items and `name2` names what `spcl` is reduced across.
        """
        stats = {
            "min": np.min(spcl, axis=axis),
            "max": np.max(spcl, axis=axis),
            "mean": np.mean(spcl, axis=axis),
            "std": np.std(spcl, axis=axis),
            # in how many entries along `axis` the item occurs at all
            "#%s" % name2: np.sum(spcl > 0, axis=axis),
        }
        entries = [" " + name1, "mean", "std", "min", "max", "#%s" % name2]

        def fmt(value):
            # Labels may be strings or None; '%.3g' % None would raise
            # TypeError, so both are rendered with %s instead
            if value is None or isinstance(value, basestring):
                return "%s" % value
            return "%.3g" % value

        table = [entries]
        for i, l in enumerate(u):
            d = {" " + name1: l}
            d.update(dict((k, stats[k][i]) for k in stats))
            table.append([fmt(d[e]) for e in entries])
        return "\nSummary for %s across %s\n" % (name1, name2) + table2string(table)

    if len(ul) < maxt:
        s += cl_stats(0, ul, targets_attr, chunks_attr)
    if len(uc) < maxc:
        s += cl_stats(1, uc, chunks_attr, targets_attr)
    return s
def cl_stats(axis, u, name1, name2):
    """Compute statistics per target

    NOTE: `spcl` is not a parameter; it is read from the enclosing scope
    (presumably a 2-D array of sample counts -- confirm against the caller).

    Parameters
    ----------
    axis : int
      Axis of `spcl` which the statistics are computed along
    u : sequence
      Labels of the summarized items; item ``i`` is paired with element
      ``i`` of every computed statistic
    name1 : str
      Name of the summarized items, used as header of the label column
    name2 : str
      Name of what is reduced across, used in the ``#<name2>`` column

    Returns
    -------
    str
      Caption line followed by the table rendered with `table2string`
    """
    stats = {
        "min": np.min(spcl, axis=axis),
        "max": np.max(spcl, axis=axis),
        "mean": np.mean(spcl, axis=axis),
        "std": np.std(spcl, axis=axis),
        # number of non-zero entries along `axis`
        "#%s" % name2: np.sum(spcl > 0, axis=axis),
    }
    entries = [" " + name1, "mean", "std", "min", "max", "#%s" % name2]

    def fmt(value):
        # Labels may be strings or None; '%.3g' % None would raise
        # TypeError, so both are rendered with %s instead
        if value is None or isinstance(value, basestring):
            return "%s" % value
        return "%.3g" % value

    table = [entries]
    for i, l in enumerate(u):
        d = {" " + name1: l}
        d.update(dict((k, stats[k][i]) for k in stats))
        table.append([fmt(d[e]) for e in entries])
    return "\nSummary for %s across %s\n" % (name1, name2) + table2string(table)
def cl_stats(axis, u, name1, name2):
    """Compute statistics per target

    NOTE: `spcl` is not a parameter; it is read from the enclosing scope
    (presumably a 2-D array of sample counts -- confirm against the caller).

    Parameters
    ----------
    axis : int
      Axis of `spcl` which the statistics are computed along
    u : sequence
      Labels of the summarized items; item ``i`` is paired with element
      ``i`` of every computed statistic
    name1 : str
      Name of the summarized items, used as header of the label column
    name2 : str
      Name of what is reduced across, used in the ``#<name2>`` column

    Returns
    -------
    str
      Caption line followed by the table rendered with `table2string`
    """
    stats = {'min': np.min(spcl, axis=axis),
             'max': np.max(spcl, axis=axis),
             'mean': np.mean(spcl, axis=axis),
             'std': np.std(spcl, axis=axis),
             # number of non-zero entries along `axis`
             '#%s' % name2: np.sum(spcl > 0, axis=axis)}
    entries = [' ' + name1, 'mean', 'std', 'min', 'max', '#%s' % name2]
    table = [entries]
    for i, l in enumerate(u):
        d = {' ' + name1: l}
        d.update(dict((k, stats[k][i]) for k in stats))
        row = []
        for e in entries:
            value = d[e]
            # Labels may be strings or None; '%.3g' % None would raise
            # TypeError, so both are rendered with %s instead
            if isinstance(value, basestring) or value is None:
                row.append('%s' % value)
            else:
                row.append('%.3g' % value)
        table.append(row)
    return '\nSummary for %s across %s\n' % (name1, name2) \
           + table2string(table)
def _compute(self):
    """Derive sequence statistics and their textual report

    Reads ``self.order`` and ``self._seq``, stores the entries
    'utargets', 'cbcounts', 'corrcoef' and 'sumabscorr' via
    ``self.update`` and assigns the report to ``self._str_stats``.
    """
    order = self.order
    seq = list(self._seq)                 # assure list
    nsamples = len(seq)                   # # of samples/targets
    utargets = sorted(list(set(seq)))     # unique targets
    ntargets = len(utargets)              # # of targets

    # replace every target by its position among the sorted unique targets
    index_of = dict((label, pos) for pos, label in enumerate(utargets))
    seqm = [index_of[item] for item in seq]

    res = dict(utargets=utargets)

    # Estimate counter-balance: cbcounts[lag - 1, a, b] counts how often
    # target a is followed by target b exactly `lag` positions later
    cbcounts = np.zeros((order, ntargets, ntargets), dtype=int)
    for lag in xrange(1, order + 1):
        for before, after in zip(seqm[:-lag], seqm[lag:]):
            cbcounts[lag - 1, before, after] += 1
    res['cbcounts'] = cbcounts

    # Lets compute relative counter-balancing
    # Ideally, npertarget[i]/ntargets should precede each target

    # Autocorrelation of the mapped sequence against each of its
    # circular shifts
    # ??? Use pearsonr with p may be?
    # ??? report high (anti)correlations?
    corr = np.array([np.corrcoef(seqm, seqm[shift:] + seqm[:shift])[0, 1]
                     for shift in xrange(1, nsamples)])
    res['corrcoef'] = corr
    sumabscorr = np.sum(np.abs(corr))
    res['sumabscorr'] = sumabscorr
    self.update(res)

    # Assign textual summary
    # XXX move into a helper function and do on demand
    block = ntargets + 1    # columns per order: one per target + separator
    t = [[""] * (1 + order * block) for _ in xrange(ntargets + 1)]
    t[0][0] = "Targets/Order"
    for row, label in enumerate(utargets, 1):
        t[row][0] = '%s:' % label
    for cb in xrange(order):
        first = 1 + cb * block            # first column of this order
        t[0][first] = "O%d" % (cb + 1)
        for line in t:
            line[first + ntargets] = " | "
        m = cbcounts[cb]
        # ??? there should be better way to get indexes
        rows, cols = np.where(~np.isnan(m))
        for i, j in zip(rows, cols):
            t[1 + i][first + j] = '%d' % m[i, j]

    sout = "Sequence statistics for %d entries" \
           " from set %s\n" % (len(seq), utargets)
    sout += "Counter-balance table for orders up to %d:\n" % order
    sout += table2string(t)
    if len(corr) > 0:
        sout += "Correlations: min=%.2g max=%.2g mean=%.2g sum(abs)=%.2g" \
                % (min(corr), max(corr), np.mean(corr), sumabscorr)
    self._str_stats = sout
def summary_targets(dataset, targets_attr='targets', chunks_attr='chunks',
                    maxc=30, maxt=20):
    """Describe how samples are distributed over targets and chunks

    Parameters
    ----------
    dataset : `Dataset`
      Dataset to operate on
    targets_attr : str, optional
      Name of the sample attribute holding targets
    chunks_attr : str, optional
      Name of the sample attribute holding chunks -- independent groups
      of samples
    maxc : int
      Details involving chunks are given only for fewer than `maxc`
      unique chunks
    maxt : int
      Details involving targets are given only for fewer than `maxt`
      unique targets

    Returns
    -------
    str
      Textual summary
    """
    # We better avoid bound function since if people only
    # imported Dataset without miscfx it would fail
    counts_matrix = get_samples_per_chunk_target(
        dataset, targets_attr=targets_attr, chunks_attr=chunks_attr)
    # XXX couldn't they be unordered?
    unique_targets = dataset.sa[targets_attr].unique.tolist()
    unique_chunks = dataset.sa[chunks_attr].unique.tolist()

    few_targets = len(unique_targets) < maxt
    few_chunks = len(unique_chunks) < maxc

    def render(value):
        """Format a table cell: %s for strings and None, %.3g otherwise"""
        if isinstance(value, basestring) or value is None:
            return '%s' % value
        return '%.3g' % value

    def axis_summary(axis, labels, name1, name2):
        """Tabulate min/max/mean/std/non-zero count along `axis`"""
        nonzero_key = '#%s' % name2
        label_key = ' ' + name1
        stats = {
            'min': np.min(counts_matrix, axis=axis),
            'max': np.max(counts_matrix, axis=axis),
            'mean': np.mean(counts_matrix, axis=axis),
            'std': np.std(counts_matrix, axis=axis),
            nonzero_key: np.sum(counts_matrix > 0, axis=axis),
        }
        columns = [label_key, 'mean', 'std', 'min', 'max', nonzero_key]
        rows = [columns]
        for idx, label in enumerate(labels):
            cells = {label_key: label}
            for key in stats:
                cells[key] = stats[key][idx]
            rows.append([render(cells[col]) for col in columns])
        caption = '\nSummary for %s across %s\n' % (name1, name2)
        return caption + table2string(rows)

    pieces = []
    if few_targets and few_chunks:
        # only in a reasonable case do printing
        pieces.append("\nCounts of targets in each chunk:")
        header = [' %s\\%s' % (chunks_attr, targets_attr)] + unique_targets
        ruler = [''] + ['---'] * len(unique_targets)
        body = [[str(chunk)] + counts.tolist()
                for chunk, counts in zip(unique_chunks, counts_matrix)]
        pieces.append('\n' + table2string([header, ruler] + body))
    else:
        pieces.append("No details due to large number of targets or chunks. "
                      "Increase maxc and maxt if desired")

    if few_targets:
        pieces.append(axis_summary(0, unique_targets,
                                   targets_attr, chunks_attr))
    if few_chunks:
        pieces.append(axis_summary(1, unique_chunks,
                                   chunks_attr, targets_attr))
    return "".join(pieces)
def _compute(self):
    """Compute the statistics of the sequence and render them as text

    Uses ``self.order`` and ``self._seq``.  The computed values are
    stored under the keys "utargets", "cbcounts", "corrcoef" and
    "sumabscorr" through ``self.update``; the rendered report is kept
    in ``self._str_stats``.
    """
    # Do actual computation
    order = self.order
    seq = list(self._seq)  # assure list
    nsamples = len(seq)
    utargets = sorted(list(set(seq)))  # unique targets
    ntargets = len(utargets)

    # map sequence onto indices of the sorted unique targets
    targets_map = {}
    for idx, target in enumerate(utargets):
        targets_map[target] = idx
    seqm = [targets_map[entry] for entry in seq]

    res = dict(utargets=utargets)

    # Estimate counter-balance: for every order, count the pairs
    # (current target, target `order` steps ahead)
    cbcounts = np.zeros((order, ntargets, ntargets), dtype=int)
    for cb in xrange(order):
        step = cb + 1
        leading = seqm[:-step]
        trailing = seqm[step:]
        for src, dst in zip(leading, trailing):
            cbcounts[cb, src, dst] += 1
    res["cbcounts"] = cbcounts

    # Lets compute relative counter-balancing
    # Ideally, npertarget[i]/ntargets should precede each target

    # Autocorrelation for all possible (circular) shifts
    corr = []
    for shift in xrange(1, nsamples):
        rotated = seqm[shift:] + seqm[:shift]
        # ??? Use pearsonr with p may be?
        corr.append(np.corrcoef(seqm, rotated)[0, 1])
    # ??? report high (anti)correlations?
    corr = np.array(corr)
    sumabscorr = np.sum(np.abs(corr))
    res["corrcoef"] = corr
    res["sumabscorr"] = sumabscorr
    self.update(res)

    # Assign textual summary
    # XXX move into a helper function and do on demand
    nrows = ntargets + 1
    stride = ntargets + 1  # table columns occupied by each order
    t = [[""] * (1 + order * stride) for _ in xrange(nrows)]
    t[0][0] = "Targets/Order"
    for idx, target in enumerate(utargets):
        t[idx + 1][0] = "%s:" % target
    for cb in xrange(order):
        offset = 1 + cb * stride
        t[0][offset] = "O%d" % (cb + 1)
        separator_col = (cb + 1) * stride
        for r in xrange(nrows):
            t[r][separator_col] = " | "
        m = cbcounts[cb]
        # ??? there should be better way to get indexes
        ind = np.where(np.logical_not(np.isnan(m)))
        for i, j in zip(*ind):
            t[1 + i][offset + j] = "%d" % m[i, j]

    header = "Sequence statistics for %d entries" " from set %s\n" % (
        len(seq),
        utargets,
    )
    caption = "Counter-balance table for orders up to %d:\n" % order
    sout = header + caption + table2string(t)
    if len(corr):
        corr_stats = (min(corr), max(corr), np.mean(corr), sumabscorr)
        sout += "Correlations: min=%.2g max=%.2g mean=%.2g sum(abs)=%.2g" % corr_stats
    self._str_stats = sout