def read_mute_freqs(self, mute_freq_dir):
    """
    Fill self.branch_lengths from the mean mutation frequency histograms in <mute_freq_dir>.

    For 'all' and each entry of utils.regions, reads <mute_freq_dir>/<mtype>-mean-mute-freqs.csv into a Hist and stores:
      self.branch_lengths[mtype]['mean']     the hist's mean (taken before renormalizing)
      self.branch_lengths[mtype]['lengths']  each bin center passed through self.convert_observed_changes_to_branch_length()
      self.branch_lengths[mtype]['probs']    the corresponding (renormalized) bin contents

    Raises Exception if the stored probs for any mtype fail utils.is_normed().
    NOTE these are mute freqs, not branch lengths, but it's ok for now
    """
    mtypes = ['all',] + utils.regions
    for mtype in mtypes:
        infname = mute_freq_dir + '/' + mtype + '-mean-mute-freqs.csv'
        info = {'lengths': [], 'probs': []}
        self.branch_lengths[mtype] = info
        mutehist = Hist(fname=infname)
        info['mean'] = mutehist.get_mean()

        mutehist.normalize(include_overflows=False, overflow_eps_to_ignore=1e-2)  # if it was written with overflows included, it'll need to be renormalized
        bin_centers = mutehist.get_bin_centers()  # fetched once, rather than once per bin
        check_sum = 0.0
        for ibin in range(1, mutehist.n_bins + 1):  # ignore under/overflow bins
            branch_length = self.convert_observed_changes_to_branch_length(float(bin_centers[ibin]))
            prob = mutehist.bin_contents[ibin]
            info['lengths'].append(branch_length)
            info['probs'].append(prob)
            check_sum += prob
        if not utils.is_normed(check_sum):
            raise Exception('not normalized %f' % check_sum)

    if self.args.debug:
        # this has to come after the loop, since every ratio is taken with respect to the 'all' mean
        print(' mean branch lengths')
        for mtype in mtypes:
            mean = self.branch_lengths[mtype]['mean']
            print(' %4s %7.3f (ratio %7.3f)' % (mtype, mean, mean / self.branch_lengths['all']['mean']))
def make_mean_hist(hists, debug=False): """ return the hist with bin contents the mean over <hists> of each bin """ binvals = {} for hist in hists: if debug: print ' sub', for ib in range(0, hist.n_bins + 2): low_edge = hist.low_edges[ib] if low_edge not in binvals: binvals[low_edge] = 0. binvals[low_edge] += hist.bin_contents[ib] if debug: print ' ', low_edge, hist.bin_contents[ib], if debug: print '' binlist = sorted(binvals.keys()) meanhist = Hist(len(binlist) - 2, binlist[1], binlist[-1], binlist[1:-1]) if debug: print ' mean', for ib in range(len(binlist)): meanhist.set_ibin(ib, binvals[binlist[ib]]) if debug: print ' ', meanhist.low_edges[ib], meanhist.bin_contents[ib], if debug: print '' meanhist.normalize() return meanhist
def array2data2D(self, data, weights=None, normed=False, binning=1, reBin=None):
    """
    Convert arrays of (x, y) data to the internal 2D histogram format.

    - Designed for arrays of raw, un-binned data.
    - If you pass values here from an existing histogram ('weights' is not
      None and the 'data' param is just bin centers), it is possible
      to re-bin this histogram using the 'reBin' keyword.

    Args:
        data:    either indexable by the keys 'x' and 'y', or positionally
                 (data[0] is x, data[1] is y).
        weights: optional per-entry weights, forwarded to np.histogram2d and
                 to sumw2_2D (x, y, weights should have the same shape).
        normed:  forwarded to np.histogram2d as 'density'; if true the result
                 is also passed through normalize() after any re-binning.
        binning: forwarded to np.histogram2d as 'bins'.
        reBin:   if not None, forwarded to Rebin2D.

    Returns:
        A Hist with flattened content/error and dict-valued bins/center/width
        (keys 'x' and 'y'), plus the x*/y* convenience attributes.
    """
    try:
        x = data['x']
        y = data['y']
    except (TypeError, IndexError):
        # lists/tuples raise TypeError for a string index, plain ndarrays raise IndexError
        x = data[0]
        y = data[1]

    # 'normed' was removed from np.histogram2d in NumPy 1.24; 'density' is the supported spelling
    counts, bins_x, bins_y = np.histogram2d(x, y, bins=binning, density=normed, weights=weights)

    results = Hist()
    results.content = counts.flatten()  # counts is a ndarray (nxbins,nybins)
    results.bins = {'x': bins_x, 'y': bins_y}

    xcenter, ycenter = tools.dummy_bins2D(tools.midpoints(bins_x), tools.midpoints(bins_y))
    xwidth, ywidth = tools.dummy_bins2D(tools.widths(bins_x), tools.widths(bins_y))
    results.center = {'x': xcenter, 'y': ycenter}
    results.width = {'x': xwidth, 'y': ywidth}

    # flattened so that the error lines up element-by-element with the flattened content
    results.error = np.sqrt(counts).flatten()
    if weights is not None:
        # scipy.stats to get sumw2 (x,y,weights should have the same shape)
        results.error = results.sumw2_2D(xdata=x, ydata=y, values=weights)

    if reBin is not None:
        # re-binning after making data from array, likely that the user
        # passed in binned data and wants to re-bin.
        results.Rebin2D(reBin)
    if normed:
        results.normalize()  # normalize after re-binning

    results.xbins = results.bins['x']
    results.ybins = results.bins['y']
    results.xcenter = results.center['x']
    results.ycenter = results.center['y']
    results.xwidth = results.width['x']
    results.ywidth = results.width['y']

    return results

## THE END ##
def array2data(self, data, weights=None, normed=False, binning=1, reBin=None):
    """
    Convert an array of data to the internal 1D histogram format.

    - Designed for arrays of raw, un-binned data.
    - If you pass values here from an existing histogram ('weights' is not
      None and the 'data' param is just bin centers), it is possible
      to re-bin this histogram using the 'reBin' keyword.

    Args:
        data:    the values to histogram.
        weights: optional per-entry weights, forwarded to np.histogram and
                 to sumw2_1D.
        normed:  forwarded to np.histogram as 'density'; if true the result
                 is also passed through normalize() after any re-binning.
        binning: forwarded to np.histogram as 'bins'.
        reBin:   if not None, forwarded to Rebin.

    Returns:
        A Hist with content, bins, center, width and error set.
    """
    # Keep the histogram counts under their own name: the weighted error below needs the
    # original per-entry values, not the binned counts.
    # 'normed' was removed from np.histogram in NumPy 1.24; 'density' is the supported spelling.
    counts, bins = np.histogram(data, bins=binning, weights=weights, density=normed)

    results = Hist()
    results.content = counts
    results.bins = bins
    results.center = tools.midpoints(bins)
    results.width = tools.widths(bins)

    results.error = np.sqrt(counts)
    if weights is not None:
        # numpy digitize to get sumw2 (data and weights are both per-entry)
        results.error = results.sumw2_1D(xdata=data, values=weights)

    if reBin is not None:
        results.Rebin(reBin)
    if normed:
        results.normalize()  # normalize after re-binning

    return results
def get_mute_hist(self, mtype):
    """
    Return the mutation frequency Hist to use for <mtype>.

    If self.args.mutate_from_scratch is not set, the hist is read from <self.parameter_dir>/<mtype>-mean-mute-freqs.csv.
    Otherwise it's built around self.args.scratch_mute_freq: either a single bin filled once at that value
    (self.args.same_mute_freq_for_all_seqs), or a normalized 30-bin hist of 500 exponentially distributed
    values with that mean, each capped at 0.8.
    """
    if not self.args.mutate_from_scratch:
        return Hist(fname=self.parameter_dir + '/' + mtype + '-mean-mute-freqs.csv')

    mean_mute_val = self.args.scratch_mute_freq
    if self.args.same_mute_freq_for_all_seqs:
        hist = Hist(1, mean_mute_val - utils.eps, mean_mute_val + utils.eps)
        hist.fill(mean_mute_val)
    else:
        n_entries = 500
        max_val = 0.8  # this is arbitrary, but you shouldn't be calling this with anything that gets a significant number anywhere near there, anyway
        length_vals = [min(v, max_val) for v in numpy.random.exponential(mean_mute_val, n_entries)]  # plain list, since count doesn't work on numpy.ndarray objects
        # count *after* capping: before capping, essentially no sampled value is exactly equal to max_val, so the warning could never fire
        if length_vals.count(max_val):
            print('%s lots of really high mutation rates treegenerator::get_mute_hist()' % utils.color('yellow', 'warning'))
        hist = Hist(30, 0., max_val)
        for val in length_vals:
            hist.fill(val)
        hist.normalize()

    return hist
def make_mean_hist(hists, debug=False): """ return the hist with bin contents the mean over <hists> of each bin """ binvals = {} for hist in hists: if debug: print ' sub', for ib in range(0, hist.n_bins + 2): low_edge = hist.low_edges[ib] if low_edge not in binvals: binvals[low_edge] = 0. binvals[low_edge] += hist.bin_contents[ib] if debug: print ' ', low_edge, hist.bin_contents[ib], if debug: print '' binlist = sorted(binvals.keys()) meanhist = Hist(len(binlist) - 2, binlist[1], binlist[-1], binlist[1 : -1]) if debug: print ' mean', for ib in range(len(binlist)): meanhist.set_ibin(ib, binvals[binlist[ib]]) if debug: print ' ', meanhist.low_edges[ib], meanhist.bin_contents[ib], if debug: print '' meanhist.normalize() return meanhist
def make_hist_from_dict_of_counts(values, var_type, hist_label, log='', xmin_force=0.0, xmax_force=0.0, normalize=False, sort=False):
    """
    Build a histogram from the dictionary <values> (one bin per key, bin content is the key's value),
    and return it after conversion with make_hist_from_my_hist_class().

    <var_type> must be 'int' or 'string'. String keys are ordered by decreasing count unless <sort> is set,
    in which case (as for ints) they're ordered by key. If <xmin_force> and <xmax_force> differ they're used
    as the hist bounds. Raises Exception if anything ends up in the first or last entry of bin_contents.
    """
    assert var_type in ('int', 'string')  # floats should be handled by Hist class in hist.py
    if len(values) == 0:
        print('WARNING no values for %s in make_hist' % hist_label)
        return TH1D(hist_label, '', 1, 0, 1)

    is_string = var_type == 'string'
    if is_string and not sort:  # for strings, sort so most common value is to left side
        bin_labels = sorted(values, key=values.get, reverse=True)
    else:
        bin_labels = sorted(values)

    n_bins = len(values) if is_string else bin_labels[-1] - bin_labels[0] + 1
    xbins = [0. for _ in range(n_bins + 1)]  # NOTE the +1 is 'cause you need the lower edge of the overflow bin

    if xmin_force != xmax_force:  # boundaries were set explicitly
        hist = Hist(n_bins, xmin_force, xmax_force)
    elif is_string:
        set_bins(bin_labels, n_bins, 'x' in log, xbins, var_type)
        hist = Hist(n_bins, xbins[0], xbins[-1], xbins=xbins)
    else:
        hist = Hist(n_bins, bin_labels[0] - 0.5, bin_labels[-1] + 0.5)  # for integers, just go from the first to the last bin label (they're sorted)

    for ival, key in enumerate(bin_labels):
        if is_string:
            label, ibin = key, ival + 1
        else:
            label, ibin = '', hist.find_bin(key)
        count = values[key]
        hist.set_ibin(ibin, count, error=math.sqrt(count), label=label)

    # make sure there's no overflows
    if hist.bin_contents[0] != 0.0 or hist.bin_contents[-1] != 0.0:
        for ibin in range(hist.n_bins + 2):
            print('%d %f %f' % (ibin, hist.low_edges[ibin], hist.bin_contents[ibin]))
        raise Exception('overflows in ' + hist_label)

    if normalize:
        hist.normalize()
        hist.ytitle = 'freq'
    else:
        hist.ytitle = 'counts'

    return make_hist_from_my_hist_class(hist, hist_label)
def make_hist_from_dict_of_counts(values, var_type, hist_label, log='', xmin_force=0.0, xmax_force=0.0, normalize=False, sort=False):
    """
    Build and return a Hist from the dictionary <values> (one bin per key, bin content is the key's value).

    <var_type> must be 'int' or 'string'. String keys are ordered by decreasing count unless <sort> is set,
    in which case (as for ints) they're ordered by key. If <xmin_force> and <xmax_force> differ they're used
    as the hist bounds. Raises Exception if anything ends up in the first or last entry of bin_contents.
    """
    assert var_type in ('int', 'string')  # floats should be handled by Hist class in hist.py
    if len(values) == 0:
        print('WARNING no values for %s in make_hist' % hist_label)
        return Hist(1, 0, 1)

    string_labels = var_type == 'string'
    if string_labels and not sort:  # for strings, sort so most common value is to left side
        bin_labels = sorted(values, key=values.get, reverse=True)
    else:
        bin_labels = sorted(values)

    if string_labels:
        n_bins = len(values)
    else:
        n_bins = bin_labels[-1] - bin_labels[0] + 1
    xbins = [0. for _ in range(n_bins + 1)]  # NOTE the +1 is 'cause you need the lower edge of the overflow bin

    if xmin_force != xmax_force:  # boundaries were set explicitly
        hist = Hist(n_bins, xmin_force, xmax_force)
    elif string_labels:
        set_bins(bin_labels, n_bins, 'x' in log, xbins, var_type)
        hist = Hist(n_bins, xbins[0], xbins[-1], xbins=xbins)
    else:
        hist = Hist(n_bins, bin_labels[0] - 0.5, bin_labels[-1] + 0.5)  # for integers, just go from the first to the last bin label (they're sorted)

    for ival, key in enumerate(bin_labels):
        count = values[key]
        if string_labels:
            hist.set_ibin(ival + 1, count, error=math.sqrt(count), label=key)
        else:
            hist.set_ibin(hist.find_bin(key), count, error=math.sqrt(count), label='')

    # make sure there's no overflows
    if hist.bin_contents[0] != 0.0 or hist.bin_contents[-1] != 0.0:
        for ibin in range(hist.n_bins + 2):
            print('%d %f %f' % (ibin, hist.low_edges[ibin], hist.bin_contents[ibin]))
        raise Exception('overflows in ' + hist_label)

    if normalize:
        hist.normalize()
    hist.ytitle = 'freq' if normalize else 'counts'

    return hist
def hist2data(self, histo, reBin=None, normed=False):
    """
    Convert a 1D ROOT histogram for internal use.

    Args:
        histo:  histogram object providing numpy() -> (contents, edges)
                and a 'variances' sequence.
        reBin:  if not None, forwarded to Rebin.
        normed: if true, normalize() is called (after any re-binning).

    Returns:
        A Hist with content, bins, center, width and error set.
    """
    contents, edges = histo.numpy()

    results = Hist()
    results.content = contents
    results.bins = edges
    results.center = tools.midpoints(edges)
    results.width = tools.widths(edges)

    # NOTE(review): the stored variances are used as the error as-is, while the
    # fallback takes a square root -- confirm which of the two is intended.
    has_variances = len(histo.variances) > 0
    results.error = histo.variances if has_variances else np.sqrt(contents)

    if reBin is not None:
        results.Rebin(reBin)
    if normed:
        results.normalize()

    return results
def hist2data2D(self, histo, reBin=None, normed=False):
    """
    Convert a 2D ROOT histogram for internal use.

    Args:
        histo:  histogram object providing numpy() -> (contents, (xedges, yedges))
                and an 'allvariances' 2D array.
        reBin:  if not None, forwarded to Rebin2D.
        normed: if true, normalize() is called (after any re-binning).

    Returns:
        A Hist with flattened content/error and dict-valued bins/center/width
        (keys 'x' and 'y'), plus the x*/y* convenience attributes.
    """
    raw_contents, edges = histo.numpy()
    xbin_edges, ybin_edges = edges
    bin_contents = raw_contents.T

    # NOTE(review): the contents are transposed but the stored variances are not, and
    # the variances are used as errors without a square root -- confirm both are intended.
    if len(histo.allvariances) > 0:
        # variances() doesn't produce correct values in 2D right now;
        # the [1:-1] slices drop the outermost row/column on each side
        bin_errors = histo.allvariances[1:-1, 1:-1]
    else:
        bin_errors = np.sqrt(bin_contents)

    xbin_centers, ybin_centers = tools.dummy_bins2D(tools.midpoints(xbin_edges),
                                                    tools.midpoints(ybin_edges))
    xbin_widths, ybin_widths = tools.dummy_bins2D(tools.widths(xbin_edges),
                                                  tools.widths(ybin_edges))

    results = Hist()
    results.content = bin_contents.flatten()
    results.error = bin_errors.flatten()
    results.bins = {'x': xbin_edges, 'y': ybin_edges}
    results.center = {'x': xbin_centers, 'y': ybin_centers}
    results.width = {'x': xbin_widths, 'y': ybin_widths}

    if reBin is not None:
        results.Rebin2D(reBin)
    if normed:
        results.normalize()

    # Set extra attributes (placeholders for the moment)
    results.xbins, results.ybins = results.bins['x'], results.bins['y']
    results.xcenter, results.ycenter = results.center['x'], results.center['y']
    results.xwidth, results.ywidth = results.width['x'], results.width['y']

    return results

## THE END ##
def make_mean_hist(hists, debug=False): """ return the hist with bin contents the mean over <hists> of each bin """ binvals = {} all_data = None for hist in hists: if debug: print ' sub', for ib in range(0, hist.n_bins + 2): low_edge = hist.low_edges[ib] if low_edge not in binvals: binvals[low_edge] = 0. binvals[low_edge] += hist.bin_contents[ib] if debug: print ' ', low_edge, hist.bin_contents[ib], if all_data is not None and hist.all_data is None: raise Exception('tried to average hists with and without all_data set') if hist.all_data is not None: if all_data is None: all_data = [] all_data += hist.all_data if debug: print '' binlist = sorted(binvals.keys()) meanhist = Hist(len(binlist) - 2, binlist[1], binlist[-1], binlist[1 : -1]) meanhist.all_data = all_data if debug: print ' mean', for ib in range(len(binlist)): meanhist.set_ibin(ib, binvals[binlist[ib]]) if debug: print ' ', meanhist.low_edges[ib], meanhist.bin_contents[ib], if debug: print '' meanhist.normalize() return meanhist
plt.scatter(xvals, yvals, alpha=0.4) chfcns.mpl_finish(ax, args.plotdir, 'imax-vs-max-abs-diff', xlabel='break point', ylabel='abs mfreq diff') plt.close() fig, ax = chfcns.mpl_init() xmin, xmax = 0., 0.65 for sample, chfo in chfos.items(): hmaxval = Hist(45, xmin, xmax, value_list=[chfo[u]['max_abs_diff'] for u in chfo]) hmaxval.normalize() hmaxval.mpl_plot(ax, color=colors[sample], label=sample) chfcns.mpl_finish(ax, args.plotdir, 'mfreq-diff', xlabel='abs mfreq diff', ylabel='freq', xbounds=(xmin - 0.02, xmax), leg_loc=(0.5, 0.6)) fig, ax = chfcns.mpl_init() xmin, xmax = 0., 300 for sample, chfo in chfos.items(): himax = Hist(75, xmin, xmax,