示例#1
0
    def read_mute_freqs(self, mute_freq_dir):
        """
        Read the mean mutation frequency histograms in <mute_freq_dir> into <self.branch_lengths>.

        For 'all' and for each entry in utils.regions, reads '<mute_freq_dir>/<mtype>-mean-mute-freqs.csv'
        and sets self.branch_lengths[mtype] to a dict with keys:
          'mean'    : the histogram's mean, taken before the histogram is renormalized
          'lengths' : each bin center (under/overflow excluded) passed through convert_observed_changes_to_branch_length()
          'probs'   : the normalized content of each of those bins

        Raises Exception if the summed bin contents don't pass utils.is_normed().
        """
        # NOTE these are mute freqs, not branch lengths, but it's ok for now
        mtypes = ['all'] + utils.regions
        for mtype in mtypes:
            infname = mute_freq_dir + '/' + mtype + '-mean-mute-freqs.csv'
            lengths, probs = [], []
            self.branch_lengths[mtype] = {'lengths': lengths, 'probs': probs}
            mutehist = Hist(fname=infname)
            self.branch_lengths[mtype]['mean'] = mutehist.get_mean()

            mutehist.normalize(include_overflows=False, overflow_eps_to_ignore=1e-2)  # if it was written with overflows included, it'll need to be renormalized
            bin_centers = mutehist.get_bin_centers()  # loop-invariant, so work it out once per hist rather than once per bin
            check_sum = 0.0
            for ibin in range(1, mutehist.n_bins + 1):  # ignore under/overflow bins
                branch_length = self.convert_observed_changes_to_branch_length(float(bin_centers[ibin]))
                prob = mutehist.bin_contents[ibin]
                lengths.append(branch_length)
                probs.append(prob)
                check_sum += prob
            if not utils.is_normed(check_sum):
                raise Exception('not normalized %f' % check_sum)

        if self.args.debug:
            print('  mean branch lengths')
            all_mean = self.branch_lengths['all']['mean']
            for mtype in mtypes:
                mean = self.branch_lengths[mtype]['mean']
                print('     %4s %7.3f (ratio %7.3f)' % (mtype, mean, mean / all_mean))
def make_mean_hist(hists, debug=False):
    """ return the hist with bin contents the mean over <hists> of each bin """
    binvals = {}
    for hist in hists:
        if debug:
            print '    sub',
        for ib in range(0, hist.n_bins + 2):
            low_edge = hist.low_edges[ib]
            if low_edge not in binvals:
                binvals[low_edge] = 0.
            binvals[low_edge] += hist.bin_contents[ib]
            if debug:
                print '   ', low_edge, hist.bin_contents[ib],
        if debug:
            print ''
    binlist = sorted(binvals.keys())
    meanhist = Hist(len(binlist) - 2, binlist[1], binlist[-1], binlist[1:-1])
    if debug:
        print '   mean',
    for ib in range(len(binlist)):
        meanhist.set_ibin(ib, binvals[binlist[ib]])
        if debug:
            print '   ', meanhist.low_edges[ib], meanhist.bin_contents[ib],
    if debug:
        print ''

    meanhist.normalize()
    return meanhist
示例#3
0
    def array2data2D(self,
                     data,
                     weights=None,
                     normed=False,
                     binning=1,
                     reBin=None):
        """
        Convert array of data to internal format
        - Designed for arrays of raw, un-binned data.
        - If you pass values here from an existing histogram ('weights' is not None
          and the 'data' param is just bin centers), it is possible to re-bin
          this histogram using the 'reBin' keyword

        @param data     x/y values, either keyed by 'x' and 'y' or as a
                        two-element sequence (x first, y second)
        @param weights  optional per-entry weights, passed to np.histogram2d
        @param normed   if True, histogram the data as a density
        @param binning  passed to np.histogram2d as 'bins'
        @param reBin    if not None, passed to Rebin2D() after histogramming

        @return Hist with flattened content/error and per-axis bins/center/width
        """
        try:
            x = data['x']
            y = data['y']
        except (TypeError, IndexError):
            # lists/tuples raise TypeError on a string index, plain (non-structured)
            # numpy arrays raise IndexError; in both cases fall back to positions
            x = data[0]
            y = data[1]

        # 'normed' was removed from np.histogram2d (NumPy 1.24); 'density' is its replacement
        counts, bins_x, bins_y = np.histogram2d(x,
                                                y,
                                                bins=binning,
                                                density=normed,
                                                weights=weights)

        results = Hist()
        results.content = counts.flatten()  # counts is a ndarray (nxbins,nybins)
        results.bins = {'x': bins_x, 'y': bins_y}

        xcenter, ycenter = tools.dummy_bins2D(tools.midpoints(bins_x),
                                              tools.midpoints(bins_y))
        xwidth, ywidth = tools.dummy_bins2D(tools.widths(bins_x),
                                            tools.widths(bins_y))

        results.center = {'x': xcenter, 'y': ycenter}
        results.width = {'x': xwidth, 'y': ywidth}

        if weights is None:
            # keep the error the same (flattened) shape as the content
            results.error = np.sqrt(results.content)
        else:
            # scipy.stats to get sumw2 (x,y,weights should have the same shape)
            results.error = results.sumw2_2D(xdata=x, ydata=y, values=weights)

        if reBin is not None:
            # re-binning after making data from array, likely that the user
            # passed in binned data and wants to re-bin.
            results.Rebin2D(reBin)
            if normed:
                results.normalize()  # normalize after re-binning

        # read these back from 'results' so they reflect any re-binning
        results.xbins = results.bins['x']
        results.ybins = results.bins['y']
        results.xcenter = results.center['x']
        results.ycenter = results.center['y']
        results.xwidth = results.width['x']
        results.ywidth = results.width['y']

        return results


## THE END ##
示例#4
0
    def array2data(self,
                   data,
                   weights=None,
                   normed=False,
                   binning=1,
                   reBin=None):
        """
        Convert array of data to internal format
        - Designed for arrays of raw, un-binned data.
        - If you pass values here from an existing histogram ('weights' is not None
          and the 'data' param is just bin centers), it is possible to re-bin
          this histogram using the 'reBin' keyword

        @param data     values to histogram
        @param weights  optional per-entry weights, passed to np.histogram
        @param normed   if True, histogram the data as a density
        @param binning  passed to np.histogram as 'bins'
        @param reBin    if not None, passed to Rebin() after histogramming

        @return Hist with content, bins, center, width and error set
        """
        # Keep the histogram counts in their own name: 'data' must still hold the
        # raw values below, where they are needed for the sum of weights.
        # 'normed' was removed from np.histogram (NumPy 1.24); 'density' is its replacement.
        counts, bins = np.histogram(data,
                                    bins=binning,
                                    weights=weights,
                                    density=normed)

        results = Hist()
        results.content = counts
        results.bins = bins
        results.center = tools.midpoints(bins)
        results.width = tools.widths(bins)

        if weights is None:
            results.error = np.sqrt(counts)
        else:
            # numpy digitize to get sumw2 (needs the raw values, not the bin counts)
            results.error = results.sumw2_1D(xdata=data, values=weights)

        if reBin is not None:
            results.Rebin(reBin)
            if normed:
                results.normalize()  # normalize after re-binning

        return results
示例#5
0
    def get_mute_hist(self, mtype):
        """
        Return the Hist of mutation frequencies to use for <mtype>.

        If self.args.mutate_from_scratch is set, the hist is made up from self.args.scratch_mute_freq:
          - with self.args.same_mute_freq_for_all_seqs: a single bin of width 2 * utils.eps around that value, filled once
          - otherwise: 500 draws from an exponential with that mean, capped at 0.8, in 30 bins from 0 to 0.8, then normalized
        Otherwise the hist is read from '<self.parameter_dir>/<mtype>-mean-mute-freqs.csv'.
        """
        if not self.args.mutate_from_scratch:
            return Hist(fname=self.parameter_dir + '/' + mtype + '-mean-mute-freqs.csv')

        mean_mute_val = self.args.scratch_mute_freq
        if self.args.same_mute_freq_for_all_seqs:
            hist = Hist(1, mean_mute_val - utils.eps, mean_mute_val + utils.eps)
            hist.fill(mean_mute_val)
            return hist

        n_entries = 500
        max_val = 0.8  # this is arbitrary, but you shouldn't be calling this with anything that gets a significant number anywhere near there, anyway
        # cap the values *before* looking for entries at <max_val>: an uncapped draw essentially never equals it exactly, so counting first can't find anything
        length_vals = [min(v, max_val) for v in numpy.random.exponential(mean_mute_val, n_entries)]  # a list, since count doesn't work on numpy.ndarray objects
        if length_vals.count(max_val):
            print('%s lots of really high mutation rates treegenerator::get_mute_hist()' % utils.color('yellow', 'warning'))
        hist = Hist(30, 0., max_val)
        for val in length_vals:
            hist.fill(val)
        hist.normalize()

        return hist
示例#6
0
def make_mean_hist(hists, debug=False):
    """ return the hist with bin contents the mean over <hists> of each bin """
    binvals = {}
    for hist in hists:
        if debug:
            print '    sub',
        for ib in range(0, hist.n_bins + 2):
            low_edge = hist.low_edges[ib]
            if low_edge not in binvals:
                binvals[low_edge] = 0.
            binvals[low_edge] += hist.bin_contents[ib]
            if debug:
                print '   ', low_edge, hist.bin_contents[ib],
        if debug:
            print ''
    binlist = sorted(binvals.keys())
    meanhist = Hist(len(binlist) - 2, binlist[1], binlist[-1], binlist[1 : -1])
    if debug:
        print '   mean',
    for ib in range(len(binlist)):
        meanhist.set_ibin(ib, binvals[binlist[ib]])
        if debug:
            print '   ', meanhist.low_edges[ib], meanhist.bin_contents[ib],
    if debug:
        print ''

    meanhist.normalize()
    return meanhist
示例#7
0
def make_hist_from_dict_of_counts(values, var_type, hist_label, log='', xmin_force=0.0, xmax_force=0.0, normalize=False, sort=False):
    """
    Fill a histogram with values from a dictionary (each key will correspond to one bin), and return it
    converted with make_hist_from_my_hist_class().

    <values>: dict from bin label to count
    <var_type>: 'int' or 'string' (floats should be handled by Hist class in hist.py)
    <hist_label>: used in messages and passed on when converting the hist
    <log>: if it contains 'x', that's passed to set_bins() for string-labeled bins
    <xmin_force>, <xmax_force>: if they differ, they're used as the hist bounds
    <normalize>: if set, normalize the hist and title the y axis 'freq' rather than 'counts'
    <sort>: for strings, order the bins by sorted label instead of by decreasing count

    Raises Exception if anything ended up in the under- or overflow bin.
    """
    assert var_type in ('int', 'string')  # floats should be handled by Hist class in hist.py
    is_string = var_type == 'string'

    if not values:
        print('WARNING no values for %s in make_hist' % hist_label)
        return TH1D(hist_label, '', 1, 0, 1)

    if is_string and not sort:  # for strings, sort so most common value is to left side
        bin_labels = sorted(values, key=values.get, reverse=True)
    else:
        bin_labels = sorted(values)

    # one bin per label for strings, one bin per integer between the first and last label otherwise
    n_bins = len(values) if is_string else bin_labels[-1] - bin_labels[0] + 1

    xbins = [0. for _ in range(n_bins + 1)]  # NOTE the +1 is 'cause you need the lower edge of the overflow bin
    if xmin_force != xmax_force:  # boundaries were set explicitly
        hist = Hist(n_bins, xmin_force, xmax_force)
    elif is_string:
        set_bins(bin_labels, n_bins, 'x' in log, xbins, var_type)
        hist = Hist(n_bins, xbins[0], xbins[-1], xbins=xbins)
    else:
        hist = Hist(n_bins, bin_labels[0] - 0.5, bin_labels[-1] + 0.5)  # for integers, just go from the first to the last bin label (they're sorted)

    for ival, key in enumerate(bin_labels):
        count = values[key]
        if is_string:
            label, ibin = key, ival + 1
        else:
            label, ibin = '', hist.find_bin(key)
        hist.set_ibin(ibin, count, error=math.sqrt(count), label=label)

    # make sure there's no overflows
    if hist.bin_contents[0] != 0.0 or hist.bin_contents[-1] != 0.0:
        for ibin in range(hist.n_bins + 2):
            print('%d %f %f' % (ibin, hist.low_edges[ibin], hist.bin_contents[ibin]))
        raise Exception('overflows in ' + hist_label)

    if normalize:
        hist.normalize()
    hist.ytitle = 'freq' if normalize else 'counts'

    return make_hist_from_my_hist_class(hist, hist_label)
示例#8
0
def make_hist_from_dict_of_counts(values, var_type, hist_label, log='', xmin_force=0.0, xmax_force=0.0, normalize=False, sort=False):
    """
    Fill a histogram with values from a dictionary (each key will correspond to one bin), and return the Hist.

    <values>: dict from bin label to count
    <var_type>: 'int' or 'string' (floats should be handled by Hist class in hist.py)
    <hist_label>: only used in the warning and exception messages
    <log>: if it contains 'x', that's passed to set_bins() for string-labeled bins
    <xmin_force>, <xmax_force>: if they differ, they're used as the hist bounds
    <normalize>: if set, normalize the hist and title the y axis 'freq' rather than 'counts'
    <sort>: for strings, order the bins by sorted label instead of by decreasing count

    Returns a one-bin Hist if <values> is empty, and raises Exception if anything ended up in the
    under- or overflow bin.
    """
    assert var_type in ('int', 'string')  # floats should be handled by Hist class in hist.py
    string_bins = var_type == 'string'

    if not values:
        print('WARNING no values for %s in make_hist' % hist_label)
        return Hist(1, 0, 1)

    if string_bins and not sort:  # for strings, sort so most common value is to left side
        bin_labels = sorted(values, key=values.get, reverse=True)
    else:
        bin_labels = sorted(values)

    if string_bins:
        n_bins = len(values)
    else:
        first_label, last_label = bin_labels[0], bin_labels[-1]
        n_bins = last_label - first_label + 1

    xbins = [0. for _ in range(n_bins + 1)]  # NOTE the +1 is 'cause you need the lower edge of the overflow bin
    bounds_forced = xmin_force != xmax_force
    if bounds_forced:
        hist = Hist(n_bins, xmin_force, xmax_force)
    elif string_bins:  # if boundaries aren't set explicitly, work out what they should be
        set_bins(bin_labels, n_bins, 'x' in log, xbins, var_type)
        hist = Hist(n_bins, xbins[0], xbins[-1], xbins=xbins)
    else:
        hist = Hist(n_bins, bin_labels[0] - 0.5, bin_labels[-1] + 0.5)  # for integers, just go from the first to the last bin label (they're sorted)

    for ilabel, bin_label in enumerate(bin_labels):
        n_counts = values[bin_label]
        if string_bins:
            ibin = ilabel + 1  # bin zero is the underflow
            label = bin_label
        else:
            ibin = hist.find_bin(bin_label)
            label = ''
        hist.set_ibin(ibin, n_counts, error=math.sqrt(n_counts), label=label)

    # make sure there's no overflows
    if hist.bin_contents[0] != 0.0 or hist.bin_contents[-1] != 0.0:
        for ibin in range(hist.n_bins + 2):
            print('%d %f %f' % (ibin, hist.low_edges[ibin], hist.bin_contents[ibin]))
        raise Exception('overflows in ' + hist_label)

    if normalize:
        hist.normalize()
    hist.ytitle = 'freq' if normalize else 'counts'

    return hist
示例#9
0
    def hist2data(self, histo, reBin=None, normed=False):
        """
        Convert a histogram object into the internal Hist format.

        @param histo   object providing numpy() -> (contents, edges) and a
                       'variances' attribute (presumably an uproot histogram
                       read from a ROOT file -- confirm with the caller)
        @param reBin   if not None, passed to Rebin()
        @param normed  if True, normalize() is called after any re-binning

        @return Hist with content, bins, center, width and error set
        """
        contents, edges = histo.numpy()

        results = Hist()
        results.content = contents
        results.bins = edges
        results.center = tools.midpoints(edges)
        results.width = tools.widths(edges)

        # Prefer the uncertainties stored with the histogram; without them fall
        # back to sqrt of the bin contents.
        # NOTE(review): the first branch stores variances while the fallback
        # stores a square root -- confirm which of the two 'error' should hold.
        has_variances = len(histo.variances) > 0
        results.error = histo.variances if has_variances else np.sqrt(contents)

        if reBin is not None:
            results.Rebin(reBin)
        if normed:
            results.normalize()

        return results
示例#10
0
    def hist2data2D(self, histo, reBin=None, normed=False):
        """
        Convert a 2D histogram object into the internal Hist format.

        @param histo   object providing numpy() -> (contents, (xedges, yedges))
                       and an 'allvariances' attribute (presumably an uproot
                       histogram read from a ROOT file -- confirm with the caller)
        @param reBin   if not None, passed to Rebin2D()
        @param normed  if True, normalize() is called after any re-binning

        @return Hist with flattened content/error and per-axis bins/center/width
        """
        contents, (x_edges, y_edges) = histo.numpy()
        contents = contents.T

        # NOTE(review): the contents are transposed above but the variances are
        # not -- confirm that the two end up with the same bin ordering.
        if len(histo.allvariances) > 0:
            # variances() doesn't produce correct values in 2D right now, so
            # take allvariances and drop its first and last row and column
            errors = histo.allvariances[1:-1, 1:-1]
        else:
            errors = np.sqrt(contents)

        x_mid = tools.midpoints(x_edges)
        y_mid = tools.midpoints(y_edges)
        x_centers, y_centers = tools.dummy_bins2D(x_mid, y_mid)
        x_wid = tools.widths(x_edges)
        y_wid = tools.widths(y_edges)
        x_widths, y_widths = tools.dummy_bins2D(x_wid, y_wid)

        results = Hist()
        results.content = contents.flatten()
        results.error = errors.flatten()
        results.bins = {'x': x_edges, 'y': y_edges}
        results.center = {'x': x_centers, 'y': y_centers}
        results.width = {'x': x_widths, 'y': y_widths}

        if reBin is not None:
            results.Rebin2D(reBin)
        if normed:
            results.normalize()

        # Set extra attributes (placeholders for the moment); read back from
        # 'results' so they reflect any re-binning
        results.xbins, results.ybins = results.bins['x'], results.bins['y']
        results.xcenter, results.ycenter = results.center['x'], results.center['y']
        results.xwidth, results.ywidth = results.width['x'], results.width['y']

        return results


## THE END ##
示例#11
0
def make_mean_hist(hists, debug=False):
    """ return the hist with bin contents the mean over <hists> of each bin """
    binvals = {}
    all_data = None
    for hist in hists:
        if debug:
            print '    sub',
        for ib in range(0, hist.n_bins + 2):
            low_edge = hist.low_edges[ib]
            if low_edge not in binvals:
                binvals[low_edge] = 0.
            binvals[low_edge] += hist.bin_contents[ib]
            if debug:
                print '   ', low_edge, hist.bin_contents[ib],
        if all_data is not None and hist.all_data is None:
            raise Exception('tried to average hists with and without all_data set')
        if hist.all_data is not None:
            if all_data is None:
                all_data = []
            all_data += hist.all_data
        if debug:
            print ''
    binlist = sorted(binvals.keys())
    meanhist = Hist(len(binlist) - 2, binlist[1], binlist[-1], binlist[1 : -1])
    meanhist.all_data = all_data
    if debug:
        print '   mean',
    for ib in range(len(binlist)):
        meanhist.set_ibin(ib, binvals[binlist[ib]])
        if debug:
            print '   ', meanhist.low_edges[ib], meanhist.bin_contents[ib],
    if debug:
        print ''

    meanhist.normalize()
    return meanhist
示例#12
0
    plt.scatter(xvals, yvals, alpha=0.4)
    chfcns.mpl_finish(ax,
                      args.plotdir,
                      'imax-vs-max-abs-diff',
                      xlabel='break point',
                      ylabel='abs mfreq diff')
    plt.close()

    fig, ax = chfcns.mpl_init()
    xmin, xmax = 0., 0.65
    for sample, chfo in chfos.items():
        hmaxval = Hist(45,
                       xmin,
                       xmax,
                       value_list=[chfo[u]['max_abs_diff'] for u in chfo])
        hmaxval.normalize()
        hmaxval.mpl_plot(ax, color=colors[sample], label=sample)
    chfcns.mpl_finish(ax,
                      args.plotdir,
                      'mfreq-diff',
                      xlabel='abs mfreq diff',
                      ylabel='freq',
                      xbounds=(xmin - 0.02, xmax),
                      leg_loc=(0.5, 0.6))

    fig, ax = chfcns.mpl_init()
    xmin, xmax = 0., 300
    for sample, chfo in chfos.items():
        himax = Hist(75,
                     xmin,
                     xmax,