Пример #1
0
 def cl_stats(axis, u, name1, name2):
     """Compute statistics per target
     """
     stats = {'min': np.min(spcl, axis=axis),
              'max': np.max(spcl, axis=axis),
              'mean': np.mean(spcl, axis=axis),
              'std': np.std(spcl, axis=axis),
              '#%s' % name2: np.sum(spcl>0, axis=axis)}
     entries = ['  ' + name1, 'mean', 'std', 'min', 'max', '#%s' % name2]
     table = [ entries ]
     for i, l in enumerate(u):
         d = {'  ' + name1 : l}
         d.update(dict([ (k, stats[k][i]) for k in stats.keys()]))
         table.append( [ ('%.3g', '%s')[isinstance(d[e], basestring)]
                         % d[e] for e in entries] )
     return '\nSummary for %s across %s\n' % (name1, name2) \
            + table2string(table)
Пример #2
0
    def _compute(self):
        """Compute stats and string representation
        """
        # Do actual computation
        order = self.order
        seq = list(self._seq)               # assure list
        nsamples = len(seq)                 # # of samples/targets
        utargets = sorted(list(set(seq)))    # unique targets
        ntargets = len(utargets)              # # of targets

        # mapping for targets
        targets_map = dict([(l, i) for i, l in enumerate(utargets)])

        # map sequence first
        seqm = [targets_map[i] for i in seq]
        # npertarget = np.bincount(seqm)

        res = dict(utargets=utargets)
        # Estimate counter-balance
        cbcounts = np.zeros((order, ntargets, ntargets), dtype=int)
        for cb in xrange(order):
            for i, j in zip(seqm[:-(cb+1)], seqm[cb+1:]):
                cbcounts[cb, i, j] += 1
        res['cbcounts'] = cbcounts

        """
        Lets compute relative counter-balancing
        Ideally, npertarget[i]/ntargets should precede each target
        """
        # Autocorrelation
        corr = []
        # for all possible shifts:
        for shift in xrange(1, nsamples):
            shifted = seqm[shift:] + seqm[:shift]
            # ??? User pearsonsr with p may be?
            corr += [np.corrcoef(seqm, shifted)[0, 1]]
            # ??? report high (anti)correlations?
        res['corrcoef'] = corr = np.array(corr)
        res['sumabscorr'] = sumabscorr = np.sum(np.abs(corr))
        self.update(res)

        # Assign textual summary
        # XXX move into a helper function and do on demand
        t = [ [""] * (1 + self.order*(ntargets+1)) for i in xrange(ntargets+1) ]
        t[0][0] = "Targets/Order"
        for i, l  in enumerate(utargets):
            t[i+1][0] = '%s:' % l
        for cb in xrange(order):
            t[0][1+cb*(ntargets+1)] = "O%d" % (cb+1)
            for i  in xrange(ntargets+1):
                t[i][(cb+1)*(ntargets+1)] = " | "
            m = cbcounts[cb]
            # ??? there should be better way to get indexes
            ind = np.where(~np.isnan(m))
            for i, j in zip(*ind):
                t[1+i][1+cb*(ntargets+1)+j] = '%d' % m[i, j]

        sout = "Sequence statistics for %d entries" \
               " from set %s\n" % (len(seq), utargets) + \
               "Counter-balance table for orders up to %d:\n" % order \
               + table2string(t)
        sout += "Correlations: min=%.2g max=%.2g mean=%.2g sum(abs)=%.2g" \
                % (min(corr), max(corr), np.mean(corr), sumabscorr)
        self._str_stats = sout
Пример #3
0
    def as_string(self, short=False, header=True, summary=True,
                 description=False):
        """'Pretty print' the matrix

        Parameters
        ----------
        short : bool
          if True, ignores the rest of the parameters and provides consise
          1 line summary
        header : bool
          print header of the table
        summary : bool
          print summary (accuracy)
        description : bool
          print verbose description of presented statistics
        """
        if len(self.sets) == 0:
            return "Empty"

        self.compute()

        # some shortcuts
        labels = self.__labels
        labels_map_rev = self.__labels_map_rev
        matrix = self.__matrix

        labels_rev = []
        if labels_map_rev is not None:
            labels_rev = [','.join([str(x) for x in labels_map_rev[l]])
                                   for l in labels]

        out = StringIO()
        # numbers of different entries
        Nlabels = len(labels)
        Nsamples = self.__Nsamples.astype(int)

        stats = self._stats
        if short:
            return "%(# of sets)d sets %(# of labels)d labels " \
                   " ACC:%(ACC).2f" \
                   % stats

        Ndigitsmax = int(ceil(log10(max(Nsamples))))
        Nlabelsmax = max( [len(str(x)) for x in labels] )

        # length of a single label/value
        L = max(Ndigitsmax+2, Nlabelsmax) #, len("100.00%"))
        res = ""

        stats_perpredict = ["P'", "N'", 'FP', 'FN', 'PPV', 'NPV', 'TPR',
                            'SPC', 'FDR', 'MCC']
        # print AUC only if ROC was computed
        if self.ROC is not None: stats_perpredict += [ 'AUC' ]
        stats_pertarget = ['P', 'N', 'TP', 'TN']
        stats_summary = ['ACC', 'ACC%', '# of sets']


        #prefixlen = Nlabelsmax + 2 + Ndigitsmax + 1
        prefixlen = Nlabelsmax + 1
        pref = ' '*(prefixlen) # empty prefix

        if matrix.shape != (Nlabels, Nlabels):
            raise ValueError, \
                  "Number of labels %d doesn't correspond the size" + \
                  " of a confusion matrix %s" % (Nlabels, matrix.shape)

        # list of lists of what is printed
        printed = []
        underscores = [" %s" % ("-" * L)] * Nlabels
        if header:
            # labels
            printed.append(['@l----------.        '] + labels_rev)
            printed.append(['@lpredictions\\targets'] + labels)
            # underscores
            printed.append(['@l            `------'] \
                           + underscores + stats_perpredict)

        # matrix itself
        for i, line in enumerate(matrix):
            l = labels[i]
            if labels_rev != []:
                l = '@r%10s / %s' % (labels_rev[i], l)
            printed.append(
                [l] +
                [ str(x) for x in line ] +
                [ _p2(stats[x][i]) for x in stats_perpredict])

        if summary:
            ## Various alternative schemes ;-)
            # printed.append([''] + underscores)
            # printed.append(['@lPer target \ Means:'] + underscores + \
            #               [_p2(x) for x in mean_stats])
            # printed.append(['Means:'] + [''] * len(labels)
            #                + [_p2(x) for x in mean_stats])
            printed.append(['@lPer target:'] + underscores)
            for stat in stats_pertarget:
                printed.append([stat] + [
                    _p2(stats[stat][i]) for i in xrange(Nlabels)])

            # compute mean stats
            # XXX refactor to expose them in stats as well, as
            #     mean(FCC)
            mean_stats = np.mean(np.array([stats[k] for k in stats_perpredict]),
                                axis=1)
            printed.append(['@lSummary \ Means:'] + underscores
                           + [_p2(stats['mean(%s)' % x])
                              for x in stats_perpredict])

            if 'CHI^2' in self.stats:
                chi2t = stats['CHI^2']
                printed.append(['CHI^2'] + [_p2(chi2t[0])]
                               + ['p:'] + ['%.2g' % chi2t[1]])

            for stat in stats_summary:
                printed.append([stat] + [_p2(stats[stat])])

        table2string(printed, out)

        if description:
            out.write("\nStatistics computed in 1-vs-rest fashion per each " \
                      "target.\n")
            out.write("Abbreviations (for details see " \
                      "http://en.wikipedia.org/wiki/ROC_curve):\n")
            for d, val, eq in self._STATS_DESCRIPTION:
                out.write(" %-3s: %s\n" % (d, val))
                if eq is not None:
                    out.write("      " + eq + "\n")

        #out.write("%s" % printed)
        result = out.getvalue()
        out.close()
        return result
Пример #4
0
def summary_targets(dataset, targets_attr='targets', chunks_attr='chunks',
                    maxc=30, maxt=20):
    """Provide summary statistics over the targets and chunks

    Parameters
    ----------
    dataset : `Dataset`
      Dataset to operate on
    targets_attr : str, optional
      Name of sample attributes of targets
    chunks_attr : str, optional
      Name of sample attributes of chunks -- independent groups of samples
    maxc : int
      Maximal number of chunks when provide details
    maxt : int
      Maximal number of targets when provide details
    """
    # We better avoid bound function since if people only
    # imported Dataset without miscfx it would fail
    spcl = get_samples_per_chunk_target(
        dataset, targets_attr=targets_attr, chunks_attr=chunks_attr)
    # XXX couldn't they be unordered?
    ul = dataset.sa[targets_attr].unique.tolist()
    uc = dataset.sa[chunks_attr].unique.tolist()
    s = ""
    if len(ul) < maxt and len(uc) < maxc:
        s += "\nCounts of targets in each chunk:"
        # only in a reasonable case do printing
        table = [['  %s\\%s' % (chunks_attr, targets_attr)] + ul]
        table += [[''] + ['---'] * len(ul)]
        for c, counts in zip(uc, spcl):
            table.append([ str(c) ] + counts.tolist())
        s += '\n' + table2string(table)
    else:
        s += "No details due to large number of targets or chunks. " \
             "Increase maxc and maxt if desired"


    def cl_stats(axis, u, name1, name2):
        """Compute statistics per target
        """
        stats = {'min': np.min(spcl, axis=axis),
                 'max': np.max(spcl, axis=axis),
                 'mean': np.mean(spcl, axis=axis),
                 'std': np.std(spcl, axis=axis),
                 '#%s' % name2: np.sum(spcl>0, axis=axis)}
        entries = ['  ' + name1, 'mean', 'std', 'min', 'max', '#%s' % name2]
        table = [ entries ]
        for i, l in enumerate(u):
            d = {'  ' + name1 : l}
            d.update(dict([ (k, stats[k][i]) for k in stats.keys()]))
            table.append( [ ('%.3g', '%s')[isinstance(d[e], basestring)]
                            % d[e] for e in entries] )
        return '\nSummary for %s across %s\n' % (name1, name2) \
               + table2string(table)

    if len(ul) < maxt:
        s += cl_stats(0, ul, targets_attr, chunks_attr)
    if len(uc) < maxc:
        s += cl_stats(1, uc, chunks_attr, targets_attr)
    return s
Пример #5
0
    def as_string(self, short=False, header=True,  summary=True,
                 description=False):
        """'Pretty print' the statistics"""

        if len(self.sets) == 0:
            return "Empty"

        self.compute()

        stats = self.stats

        if short:
            if short == 'very':
                # " RMSE/RMP_t:%(RMSE/RMP_t).2f" \
                return "%(# of sets)d sets CCe=%(CCe).2f p=%(CCp).2g" \
                       " RMSE:%(RMSE).2f" \
                       " Summary (stacked data): " \
                       "CCe=%(Summary CCe).2f p=%(Summary CCp).2g" \
                       % stats
            else:
                return "%(# of sets)d sets CCe=%(CCe).2f+-%(CCe_std).3f" \
                       " RMSE=%(RMSE).2f+-%(RMSE_std).3f" \
                       " RMSE/RMP_t=%(RMSE/RMP_t).2f+-%(RMSE/RMP_t_std).3f" \
                       % stats

        stats_data = ['RMP_t', 'STD_t', 'RMP_p', 'STD_p']
        # CCp needs tune up of format so excluded
        stats_ = ['CCe', 'RMSE', 'RMSE/RMP_t']
        stats_summary = ['# of sets']

        out = StringIO()

        printed = []
        if header:
            # labels
            printed.append(['Statistics', 'Mean', 'Std', 'Min', 'Max'])
            # underscores
            printed.append(['----------', '-----', '-----', '-----', '-----'])

        def print_stats(printed, stats_):
            # Statistics itself
            for stat in stats_:
                s = [stat]
                for suffix in ['', '_std', '_min', '_max']:
                    s += [ _p2(stats[stat+suffix], 3) ]
                printed.append(s)

        printed.append(["Data:     "])
        print_stats(printed, stats_data)
        printed.append(["Results:  "])
        print_stats(printed, stats_)
        printed.append(["Summary:  "])
        printed.append(["CCe", _p2(stats['Summary CCe']), "", "p=", '%g' % stats['Summary CCp']])
        printed.append(["RMSE", _p2(stats['Summary RMSE'])])
        printed.append(["RMSE/RMP_t", _p2(stats['Summary RMSE/RMP_t'])])

        if summary:
            for stat in stats_summary:
                printed.append([stat] + [_p2(stats[stat])])

        table2string(printed, out)

        if description:
            out.write("\nDescription of printed statistics.\n"
                      " Suffixes: _t - targets, _p - predictions\n")

            for d, val, eq in self._STATS_DESCRIPTION:
                out.write(" %-3s: %s\n" % (d, val))
                if eq is not None:
                    out.write("      " + eq + "\n")

        result = out.getvalue()
        out.close()
        return result