def make_mean_hist(hists, debug=False): """ return the hist with bin contents the mean over <hists> of each bin """ binvals = {} for hist in hists: if debug: print ' sub', for ib in range(0, hist.n_bins + 2): low_edge = hist.low_edges[ib] if low_edge not in binvals: binvals[low_edge] = 0. binvals[low_edge] += hist.bin_contents[ib] if debug: print ' ', low_edge, hist.bin_contents[ib], if debug: print '' binlist = sorted(binvals.keys()) meanhist = Hist(len(binlist) - 2, binlist[1], binlist[-1], binlist[1:-1]) if debug: print ' mean', for ib in range(len(binlist)): meanhist.set_ibin(ib, binvals[binlist[ib]]) if debug: print ' ', meanhist.low_edges[ib], meanhist.bin_contents[ib], if debug: print '' meanhist.normalize() return meanhist
def make_mean_hist(hists, debug=False): """ return the hist with bin contents the mean over <hists> of each bin """ binvals = {} for hist in hists: if debug: print ' sub', for ib in range(0, hist.n_bins + 2): low_edge = hist.low_edges[ib] if low_edge not in binvals: binvals[low_edge] = 0. binvals[low_edge] += hist.bin_contents[ib] if debug: print ' ', low_edge, hist.bin_contents[ib], if debug: print '' binlist = sorted(binvals.keys()) meanhist = Hist(len(binlist) - 2, binlist[1], binlist[-1], binlist[1 : -1]) if debug: print ' mean', for ib in range(len(binlist)): meanhist.set_ibin(ib, binvals[binlist[ib]]) if debug: print ' ', meanhist.low_edges[ib], meanhist.bin_contents[ib], if debug: print '' meanhist.normalize() return meanhist
def make_hist_from_dict_of_counts(values, var_type, hist_label, log='', xmin_force=0.0, xmax_force=0.0, normalize=False, sort=False):
    """
    Fill a histogram from the dict <values> (key : count), one bin per key, and return it
    after conversion with make_hist_from_my_hist_class().

    <var_type> must be 'int' or 'string'. String keys become bin labels and, unless <sort>
    is set, are ordered by decreasing count; otherwise keys are ordered by value.
    Integer keys get unit-width bins from the smallest to the largest key, unless explicit
    bounds are given (<xmin_force> != <xmax_force>).
    'x' in <log> is passed on to set_bins() for string binning.
    If <normalize> is set the hist is normalized and its y title set to 'freq' (else 'counts').
    Returns an empty one-bin TH1D (after printing a warning) if <values> is empty, and raises
    an Exception if anything ended up in the first or last (overflow) entries.
    """
    assert var_type in ('int', 'string')  # floats should be handled by Hist class in hist.py
    if len(values) == 0:
        print('WARNING no values for %s in make_hist' % hist_label)
        return TH1D(hist_label, '', 1, 0, 1)

    is_string = var_type == 'string'
    if is_string and not sort:  # for strings, sort so most common value is to left side
        bin_labels = sorted(values, key=values.get, reverse=True)
    else:
        bin_labels = sorted(values)
    n_bins = len(values) if is_string else bin_labels[-1] - bin_labels[0] + 1

    xbins = [0. for _ in range(n_bins + 1)]  # NOTE the +1 is 'cause you need the lower edge of the overflow bin
    if xmin_force != xmax_force:  # boundaries were set explicitly
        hist = Hist(n_bins, xmin_force, xmax_force)
    elif is_string:
        set_bins(bin_labels, n_bins, 'x' in log, xbins, var_type)
        hist = Hist(n_bins, xbins[0], xbins[-1], xbins=xbins)
    else:  # for integers, just go from the first to the last bin label (they're sorted)
        hist = Hist(n_bins, bin_labels[0] - 0.5, bin_labels[-1] + 0.5)

    for ival, key in enumerate(bin_labels):
        count = values[key]
        if is_string:
            ibin, label = ival + 1, key
        else:
            ibin, label = hist.find_bin(key), ''
        hist.set_ibin(ibin, count, error=math.sqrt(count), label=label)

    # make sure there's no overflows
    if not (hist.bin_contents[0] == 0.0 and hist.bin_contents[-1] == 0.0):
        for ibin in range(hist.n_bins + 2):
            print('%d %f %f' % (ibin, hist.low_edges[ibin], hist.bin_contents[ibin]))
        raise Exception('overflows in ' + hist_label)

    if normalize:
        hist.normalize()
    hist.ytitle = 'freq' if normalize else 'counts'

    return make_hist_from_my_hist_class(hist, hist_label)
def make_hist_from_dict_of_counts(values, var_type, hist_label, log='', xmin_force=0.0, xmax_force=0.0, normalize=False, sort=False):
    """
    Build and return a Hist from the dict <values> (key : count), with one bin per key.

    <var_type> is either 'string' (keys are used as bin labels, ordered by decreasing count
    unless <sort> is set) or 'int' (unit-width bins spanning the smallest to largest key).
    Setting <xmin_force> != <xmax_force> overrides the automatic bounds.
    'x' in <log> is forwarded to set_bins() when binning strings.
    <normalize> normalizes the result and labels the y axis 'freq' rather than 'counts'.
    An empty <values> gives a warning and an empty one-bin Hist; contents in the first or
    last (overflow) entries raise an Exception.
    """
    assert var_type in ('int', 'string')  # floats should be handled by Hist class in hist.py
    if len(values) == 0:
        print('WARNING no values for %s in make_hist' % hist_label)
        return Hist(1, 0, 1)

    string_bins = var_type == 'string'
    bin_labels = sorted(values)
    if string_bins and not sort:  # for strings, sort so most common value is to left side
        bin_labels = sorted(values, key=values.get, reverse=True)

    if string_bins:
        n_bins = len(values)
    else:
        n_bins = bin_labels[-1] - bin_labels[0] + 1

    xbins = [0. for _ in range(n_bins + 1)]  # NOTE the +1 is 'cause you need the lower edge of the overflow bin
    if xmin_force != xmax_force:
        hist = Hist(n_bins, xmin_force, xmax_force)
    elif string_bins:  # if boundaries aren't set explicitly, work out what they should be
        set_bins(bin_labels, n_bins, 'x' in log, xbins, var_type)
        hist = Hist(n_bins, xbins[0], xbins[-1], xbins=xbins)
    else:  # for integers, just go from the first to the last bin label (they're sorted)
        hist = Hist(n_bins, bin_labels[0] - 0.5, bin_labels[-1] + 0.5)

    for ival, key in enumerate(bin_labels):
        n_counts = values[key]
        ibin = ival + 1 if string_bins else hist.find_bin(key)
        hist.set_ibin(ibin, n_counts, error=math.sqrt(n_counts), label=key if string_bins else '')

    # make sure there's no overflows
    if hist.bin_contents[0] != 0.0 or hist.bin_contents[-1] != 0.0:
        for ibin in range(hist.n_bins + 2):
            print('%d %f %f' % (ibin, hist.low_edges[ibin], hist.bin_contents[ibin]))
        raise Exception('overflows in ' + hist_label)

    if normalize:
        hist.normalize()
        hist.ytitle = 'freq'
    else:
        hist.ytitle = 'counts'
    return hist
def plot(self, plotdir, only_csv=False, only_overall=False):
    """
    Draw the mutation frequency plots under <plotdir>.

    For each gene in self.freqs (skipped entirely if <only_overall> is set) this draws the
    per-gene mean rate hist and a per-position hist whose errors are the average of the
    distances from 'freq' to 'freq_hi_err' and 'freq_lo_err'. Then, for 'all', 'cdr3' and each
    region, it draws the mean frequency and mean number-of-mutations hists in <plotdir>/overall.
    <only_csv> is passed through to the drawing function, and suppresses the html files.
    Calls self.finalize() first if that hasn't happened yet.
    """
    if not self.finalized:
        self.finalize()

    overall_plotdir = plotdir + '/overall'

    genes_to_plot = [] if only_overall else self.freqs
    for gene in genes_to_plot:
        gene_freqs = self.freqs[gene]
        if len(gene_freqs) == 0:
            if gene not in glutils.dummy_d_genes.values():
                print(' %s no mutefreqer obs for %s' % (utils.color('red', 'warning'), utils.color_gene(gene)))
            continue

        positions = sorted(gene_freqs)
        first_pos, last_pos = positions[0], positions[-1]
        genehist = Hist(last_pos - first_pos + 1, first_pos - 0.5, last_pos + 0.5, xtitle='position', ytitle='mut freq', title=gene)
        for pos in positions:
            info = gene_freqs[pos]
            hi_diff = abs(info['freq'] - info['freq_hi_err'])
            lo_diff = abs(info['freq'] - info['freq_lo_err'])
            genehist.set_ibin(genehist.find_bin(pos), info['freq'], error=0.5*(hi_diff + lo_diff))

        region = utils.get_region(gene)
        plotname = utils.sanitize_name(gene)

        # vertical line at the conserved codon position, for regions that have one
        xline = None
        conserved = utils.conserved_codons[self.glfo['chain']]
        if region in conserved:
            xline = self.glfo[conserved[region] + '-positions'][gene]

        # wider figures for v and j
        figsize = [7, 4]
        if region == 'v':
            figsize[0] *= 3.5
        elif region == 'j':
            figsize[0] *= 2

        plotting.draw_no_root(self.per_gene_mean_rates[gene], plotdir=plotdir + '/per-gene/' + region, plotname=plotname, errors=True, write_csv=True, only_csv=only_csv, shift_overflows=True)
        # per-position plots:
        plotting.draw_no_root(genehist, plotdir=plotdir + '/per-gene-per-position/' + region, plotname=plotname, errors=True, write_csv=True, xline=xline, figsize=figsize, only_csv=only_csv, shift_overflows=True)
        # # per-position, per-base plots:
        # paramutils.make_mutefreq_plot(plotdir + '/' + utils.get_region(gene) + '-per-base', utils.sanitize_name(gene), plotting_info)  # needs translation to mpl UPDATE fcn is fixed, but I can't be bothered uncommenting this at the moment

    # make mean mute freq hists
    for rstr in ['all', 'cdr3'] + utils.regions:
        bounds = (0.0, 0.6 if rstr == 'd' else 0.4)  # only d gets the larger upper bound
        plotting.draw_no_root(self.mean_rates[rstr], plotname=rstr+'_mean-freq', plotdir=overall_plotdir, stats='mean', bounds=bounds, write_csv=True, only_csv=only_csv, shift_overflows=True)
        plotting.draw_no_root(self.mean_n_muted[rstr], plotname=rstr+'_mean-n-muted', plotdir=overall_plotdir, stats='mean', write_csv=True, only_csv=only_csv, shift_overflows=True)

    if only_csv:
        return
    # write html files
    for substr in self.subplotdirs:
        plotting.make_html(plotdir + '/' + substr)
def make_mean_hist(hists):
    """
    Return a hist whose content in each bin is the mean over <hists> of that bin's contents,
    with error the standard error on that mean (sample std dev / sqrt(N)).
    Bins are matched on their low edge rather than their index, which handles different bin
    bounds among <hists> (each bin is averaged only over the hists that have that low edge).
    The result is not normalized.
    """
    samples_by_edge = {}  # low edge : list of contents, one entry per hist that has a bin with that low edge
    for subhist in hists:
        # NOTE loops over n_bins + 2 entries (presumably the regular bins plus under/overflow)
        for ibin in range(subhist.n_bins + 2):
            samples_by_edge.setdefault(subhist.low_edges[ibin], []).append(subhist.bin_contents[ibin])

    edges = sorted(samples_by_edge)
    meanhist = Hist(len(edges) - 2, edges[1], edges[-1], xbins=edges[1:])
    for ibin, edge in enumerate(edges):
        samples = samples_by_edge[edge]
        mean_val = numpy.mean(samples)
        # NOTE(review): with ddof=1 a bin with only one sample has an undefined (nan) std dev -- confirm callers are ok with that
        std_err = numpy.std(samples, ddof=1) / math.sqrt(len(samples))
        meanhist.set_ibin(ibin, mean_val, error=std_err)
    # meanhist.normalize()
    return meanhist
def plot(self, base_plotdir, cyst_positions=None, tryp_positions=None, only_csv=False):
    """
    Draw the mutation frequency plots under <base_plotdir>/mute-freqs.

    Prepares the output directories, draws one per-position hist for each gene in self.freqs
    (error is the average of the distances from 'freq' to 'freq_hi_err' and 'freq_lo_err'),
    then draws the mean rate hists for 'all' and for each region into the 'overall' subdir.
    If <cyst_positions>/<tryp_positions> are given, they supply the position of the vertical
    line on v/j plots (which are also made wider).
    If self.tigger is set, the tigger plots are made as well.
    <only_csv> is passed through to the drawing functions, and suppresses the html files.
    Calls self.finalize() first if that hasn't happened yet.
    """
    if not self.finalized:
        self.finalize()

    plotdir = base_plotdir + '/mute-freqs'
    overall_plotdir = plotdir + '/overall'
    file_globs = ('*.csv', '*.svg')
    utils.prep_dir(overall_plotdir, multilings=file_globs)
    for region in utils.regions:
        utils.prep_dir(plotdir + '/' + region, multilings=file_globs)
        # utils.prep_dir(plotdir + '/' + region + '-per-base/plots', multilings=('*.csv', '*.png'))
    if self.tigger:
        utils.prep_dir(plotdir + '/tigger', multilings=file_globs)

    for gene in self.freqs:
        gene_freqs = self.freqs[gene]
        positions = sorted(gene_freqs)
        first_pos, last_pos = positions[0], positions[-1]
        genehist = Hist(last_pos - first_pos + 1, first_pos - 0.5, last_pos + 0.5, xtitle='fixme', ytitle='fixme')  #, title=utils.sanitize_name(gene))
        for pos in positions:
            info = gene_freqs[pos]
            hi_diff = abs(info['freq'] - info['freq_hi_err'])
            lo_diff = abs(info['freq'] - info['freq_lo_err'])
            genehist.set_ibin(genehist.find_bin(pos), info['freq'], error=0.5*(hi_diff + lo_diff))

        gene_region = utils.get_region(gene)
        xline = None
        figsize = [3, 3]
        if gene_region == 'v' and cyst_positions is not None:
            xline = cyst_positions[gene]
            figsize[0] *= 3.5
        elif gene_region == 'j' and tryp_positions is not None:
            xline = tryp_positions[gene]
            figsize[0] *= 2
        plotting.draw_no_root(genehist, plotdir=plotdir + '/' + gene_region, plotname=utils.sanitize_name(gene), errors=True, write_csv=True, xline=xline, figsize=figsize, only_csv=only_csv)
        # paramutils.make_mutefreq_plot(plotdir + '/' + utils.get_region(gene) + '-per-base', utils.sanitize_name(gene), plotting_info)  # needs translation to mpl

    # make mean mute freq hists ('all' first, then one per region)
    for rstr in ['all'] + list(utils.regions):
        plotting.draw_no_root(self.mean_rates[rstr], plotname=rstr+'-mean-freq', plotdir=overall_plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True, only_csv=only_csv)

    if self.tigger:
        self.tigger_plot(only_csv)

    if only_csv:
        return
    # write html files
    plotting.make_html(overall_plotdir)
    for region in utils.regions:
        plotting.make_html(plotdir + '/' + region, n_columns=1)
def make_fraction_plot(hright, hwrong, plotdir, plotname, xlabel, ylabel, xbounds, only_csv=False, write_csv=False):
    """
    Plot, and/or write to csv, the per-bin fraction right / (right + wrong).

    hright, hwrong: hists with the "right" and "wrong" counts; bin_contents is read from both, and
        the binning (get_bin_centers(), n_bins, xmin, xmax) is taken from <hright>. Neither hist is
        modified (this function used to pop entries directly out of both hists' bin_contents).
    plotdir, plotname: output directory and base name (the csv is <plotdir>/<plotname>.csv)
    xlabel, ylabel, xbounds: passed to mpl_finish(); if <xlabel> is 'support' a y = x line is added
    only_csv: if set, don't make the plot
    write_csv: if set, write the fractions and their uncertainties to csv

    Bins whose fraction is exactly zero are dropped from both the plot and the csv.
    The uncertainty on each fraction is the width of the interval from fraction_uncertainty.err().
    """
    if 'fraction_uncertainty' not in sys.modules:
        import fraction_uncertainty
    frac_err = sys.modules['fraction_uncertainty'].err

    # NOTE should really merge this with draw_no_root()
    bin_centers = hright.get_bin_centers()  #ignore_overflows=True)

    # Build new lists containing only the bins we keep, rather than popping from hright.bin_contents and
    # hwrong.bin_contents in place (which corrupted the caller's hists, and was quadratic in the number of bins).
    # NOTE(review): this drops every bin with fraction 0., i.e. also bins with wrong > 0 but right == 0, not
    # just bins with no entries -- kept as it was, but confirm that's what's wanted.
    xvals, right, wrong, yvals = [], [], [], []
    for ibin, (n_right, n_wrong) in enumerate(zip(hright.bin_contents, hwrong.bin_contents)):
        frac = float(n_right) / (n_right + n_wrong) if n_right + n_wrong > 0. else 0.
        if frac == 0.:
            continue
        xvals.append(bin_centers[ibin])
        right.append(n_right)
        wrong.append(n_wrong)
        yvals.append(frac)

    tmphilos = [frac_err(r, r + w) for r, w in zip(right, wrong)]
    yerrs = [err[1] - err[0] for err in tmphilos]

    # print '%s' % region
    # for iv in range(len(xvals)):
    #     print '   %5.2f     %5.0f / %5.0f  =  %5.2f   +/-  %.3f' % (xvals[iv], right[iv], right[iv] + wrong[iv], yvals[iv], yerrs[iv])

    if write_csv:
        kept_index = {}  # bin center : index in xvals/yvals/yerrs (first occurrence wins, like list.index())
        for iy, xval in enumerate(xvals):
            kept_index.setdefault(xval, iy)
        hist_for_csv = Hist(hright.n_bins, hright.xmin, hright.xmax)
        bincenters = hright.get_bin_centers()
        for ibin in range(hright.n_bins):
            iy = kept_index.get(bincenters[ibin])
            if iy is not None:  # if we didn't remove it
                hist_for_csv.set_ibin(ibin, yvals[iy], error=yerrs[iy])
        hist_for_csv.write(plotdir + '/' + plotname + '.csv')

    if not only_csv:
        _fig, ax = mpl_init()
        ax.errorbar(xvals, yvals, yerr=yerrs, markersize=10, linewidth=1, marker='.')
        if xlabel == 'support':
            ax.plot((0, 1), (0, 1), color='black', linestyle='--', linewidth=3)  # line with slope 1 and intercept 0
        mpl_finish(ax, plotdir, plotname, xlabel=xlabel, ylabel=ylabel, title=plotconfig.plot_titles.get(plotname, plotname), xbounds=xbounds, ybounds=(-0.1, 1.1))
        plt.close()
def plot(self, base_plotdir, cyst_positions=None, tryp_positions=None, only_csv=False):
    """
    Draw the mutation frequency plots under <base_plotdir>/mute-freqs.

    Prepares the output directories, draws one per-position hist for each gene in self.counts
    (error is the average of the distances from 'freq' to 'freq_hi_err' and 'freq_lo_err'),
    then draws the mean rate hists for 'all' and for each region.
    If <cyst_positions>/<tryp_positions> are given, they supply the position of the vertical
    line on v/j plots (which are also made wider).
    Unless <only_csv> is set, finishes by running ./bin/makeHtml on each plot dir and
    ./bin/permissify-www on the top one.
    Calls self.finalize() first if that hasn't happened yet.
    """
    if not self.finalized:
        self.finalize()

    plotdir = base_plotdir + '/mute-freqs'
    file_globs = ('*.csv', '*.svg')
    utils.prep_dir(plotdir + '/plots', multilings=file_globs)
    for region in utils.regions:
        utils.prep_dir(plotdir + '/' + region + '/plots', multilings=file_globs)
        # utils.prep_dir(plotdir + '/' + region + '-per-base/plots', multilings=('*.csv', '*.png'))

    for gene in self.counts:
        gene_counts = self.counts[gene]
        plotting_info = self.plotting_info[gene]  # only used by the commented-out per-base plot below
        positions = sorted(gene_counts)
        first_pos, last_pos = positions[0], positions[-1]
        genehist = Hist(last_pos - first_pos + 1, first_pos - 0.5, last_pos + 0.5, xtitle='fixme', ytitle='fixme')  #, title=utils.sanitize_name(gene))
        for pos in positions:
            info = gene_counts[pos]
            hi_diff = abs(info['freq'] - info['freq_hi_err'])
            lo_diff = abs(info['freq'] - info['freq_lo_err'])
            genehist.set_ibin(genehist.find_bin(pos), info['freq'], error=0.5*(hi_diff + lo_diff))

        gene_region = utils.get_region(gene)
        xline = None
        figsize = [3, 3]
        if gene_region == 'v' and cyst_positions is not None:
            xline = cyst_positions[gene]['cysteine-position']
            figsize[0] *= 3.5
        elif gene_region == 'j' and tryp_positions is not None:
            xline = int(tryp_positions[gene])
            figsize[0] *= 2
        plotting.draw_no_root(genehist, plotdir=plotdir + '/' + gene_region, plotname=utils.sanitize_name(gene), errors=True, write_csv=True, xline=xline, figsize=figsize, only_csv=only_csv)
        # paramutils.make_mutefreq_plot(plotdir + '/' + utils.get_region(gene) + '-per-base', utils.sanitize_name(gene), plotting_info)  # needs translation to mpl

    # make mean mute freq hists ('all' first, then one per region)
    for rstr in ['all'] + list(utils.regions):
        plotting.draw_no_root(self.mean_rates[rstr], plotname=rstr+'-mean-freq', plotdir=plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True, only_csv=only_csv)

    if only_csv:
        return
    # write html file and fix permissions
    check_call(['./bin/makeHtml', plotdir, '3', 'null', 'svg'])
    for region in utils.regions:
        check_call(['./bin/makeHtml', plotdir + '/' + region, '1', 'null', 'svg'])
        # check_call(['./bin/makeHtml', plotdir + '/' + region + '-per-base', '1', 'null', 'png'])
    check_call(['./bin/permissify-www', plotdir])  # NOTE this should really permissify starting a few directories higher up
def make_mean_hist(hists, debug=False): """ return the hist with bin contents the mean over <hists> of each bin """ binvals = {} all_data = None for hist in hists: if debug: print ' sub', for ib in range(0, hist.n_bins + 2): low_edge = hist.low_edges[ib] if low_edge not in binvals: binvals[low_edge] = 0. binvals[low_edge] += hist.bin_contents[ib] if debug: print ' ', low_edge, hist.bin_contents[ib], if all_data is not None and hist.all_data is None: raise Exception('tried to average hists with and without all_data set') if hist.all_data is not None: if all_data is None: all_data = [] all_data += hist.all_data if debug: print '' binlist = sorted(binvals.keys()) meanhist = Hist(len(binlist) - 2, binlist[1], binlist[-1], binlist[1 : -1]) meanhist.all_data = all_data if debug: print ' mean', for ib in range(len(binlist)): meanhist.set_ibin(ib, binvals[binlist[ib]]) if debug: print ' ', meanhist.low_edges[ib], meanhist.bin_contents[ib], if debug: print '' meanhist.normalize() return meanhist
def plot(self, plotdir, only_csv=False, only_overall=False):
    """
    Draw the mutation frequency plots under <plotdir>.

    For each gene in self.freqs (skipped entirely if <only_overall> is set) this draws the
    per-gene mean rate hist and a per-position hist whose errors are the average of the
    distances from 'freq' to 'freq_hi_err' and 'freq_lo_err'. Then, for 'all', 'cdr3' and each
    region, it draws the mean frequency and mean number-of-mutations hists in <plotdir>/overall.
    <only_csv> is passed through to the drawing function, and suppresses the html files.
    Calls self.finalize() first if that hasn't happened yet.
    """
    import plotting

    if not self.finalized:
        self.finalize()

    overall_plotdir = plotdir + '/overall'

    genes_to_plot = [] if only_overall else self.freqs
    for gene in genes_to_plot:
        gene_freqs = self.freqs[gene]
        if len(gene_freqs) == 0:
            if gene not in glutils.dummy_d_genes.values():
                print(' %s no mutefreqer obs for %s' % (utils.color('red', 'warning'), utils.color_gene(gene)))
            continue

        positions = sorted(gene_freqs)
        first_pos, last_pos = positions[0], positions[-1]
        genehist = Hist(last_pos - first_pos + 1, first_pos - 0.5, last_pos + 0.5, xtitle='position', ytitle='mut freq', title=gene)
        for pos in positions:
            info = gene_freqs[pos]
            hi_diff = abs(info['freq'] - info['freq_hi_err'])
            lo_diff = abs(info['freq'] - info['freq_lo_err'])
            genehist.set_ibin(genehist.find_bin(pos), info['freq'], error=0.5 * (hi_diff + lo_diff))

        region = utils.get_region(gene)
        plotname = utils.sanitize_name(gene)

        # vertical line at the conserved codon position, for regions that have one
        xline = None
        if region in utils.conserved_codons[self.glfo['locus']]:
            xline = utils.cdn_pos(self.glfo, region, gene)

        # wider figures for v and j
        figsize = [7, 4]
        if region == 'v':
            figsize[0] *= 3.5
        elif region == 'j':
            figsize[0] *= 2

        plotting.draw_no_root(self.per_gene_mean_rates[gene], plotdir=plotdir + '/per-gene/' + region, plotname=plotname, errors=True, write_csv=True, only_csv=only_csv, shift_overflows=True)
        # per-position plots:
        plotting.draw_no_root(genehist, plotdir=plotdir + '/per-gene-per-position/' + region, plotname=plotname, errors=True, write_csv=True, xline=xline, figsize=figsize, only_csv=only_csv, shift_overflows=True)
        # # per-position, per-base plots:
        # paramutils.make_mutefreq_plot(plotdir + '/' + utils.get_region(gene) + '-per-base', utils.sanitize_name(gene), plotting_info)  # needs translation to mpl UPDATE fcn is fixed, but I can't be bothered uncommenting this at the moment

    # make mean mute freq hists
    for rstr in ['all', 'cdr3'] + utils.regions:
        bounds = (0.0, 0.6 if rstr == 'd' else 0.4)  # only d gets the larger upper bound
        plotting.draw_no_root(self.mean_rates[rstr], plotname=rstr + '_mean-freq', plotdir=overall_plotdir, stats='mean', bounds=bounds, write_csv=True, only_csv=only_csv, shift_overflows=True)
        plotting.draw_no_root(self.mean_n_muted[rstr], plotname=rstr + '_mean-n-muted', plotdir=overall_plotdir, stats='mean', write_csv=True, only_csv=only_csv, shift_overflows=True)

    if only_csv:
        return
    # write html files
    for substr in self.subplotdirs:
        plotting.make_html(plotdir + '/' + substr)
def make_hist_from_dict_of_counts(values, var_type, hist_label, is_log_x=False, xmin_force=0.0, xmax_force=0.0, sort_by_counts=False, default_n_bins=30):
    """
    Fill a histogram with values from a dictionary (each key will correspond to one bin), and return it.

    values: dict of key : count
    var_type: 'int' or 'string' (floats should be handled by Hist class in hist.py)
    hist_label: only used in the warning and exception messages
    is_log_x: passed to set_bins(); for integers it also switches to automatic log-spaced bins
    xmin_force, xmax_force: if they differ, use them as the hist bounds rather than working them out
    sort_by_counts: fill bins in order of decreasing count rather than in key order (for strings
        this sets the left-to-right bin order)
    default_n_bins: number of bins, only used for integers when <is_log_x> is set

    Returns an empty one-bin Hist (after printing a warning) if <values> is empty.
    Raises an Exception if anything ended up in the first or last (overflow) entries.
    """
    assert var_type == 'int' or var_type == 'string'  # floats should be handled by Hist class in hist.py
    if len(values) == 0:
        print('WARNING no values for %s in make_hist' % hist_label)
        return Hist(1, 0, 1)

    is_string = var_type == 'string'
    sorted_keys = sorted(values)  # by default sort by keys in dict (i.e. these aren't usually actually string "labels")
    if sort_by_counts:  # instead sort by counts
        bin_labels = sorted(values, key=values.get, reverse=True)
    else:
        bin_labels = sorted_keys

    # For integers the axis range has to come from the smallest and largest *keys*. This used to take the first
    # and last entries of <bin_labels>, which are the most and least common keys if <sort_by_counts> is set,
    # giving a nonsensical (even negative) number of bins and bounds.
    if is_string:
        n_bins = len(values)
    elif is_log_x:
        n_bins = default_n_bins
    else:
        n_bins = sorted_keys[-1] - sorted_keys[0] + 1

    xbins = [0. for _ in range(n_bins + 1)]  # NOTE the +1 is 'cause you need the lower edge of the overflow bin
    if xmin_force != xmax_force:  # boundaries were set explicitly
        hist = Hist(n_bins, xmin_force, xmax_force)
    elif is_string:
        set_bins(bin_labels, n_bins, is_log_x, xbins, var_type)
        hist = Hist(n_bins, xbins[0], xbins[-1], xbins=xbins)
    elif is_log_x:  # get automatic log-spaced bins (from the key-sorted labels, same as when not sorting by counts)
        set_bins(sorted_keys, n_bins, is_log_x, xbins, var_type)
        hist = Hist(n_bins, xbins[0], xbins[-1], xbins=xbins)
    else:  # for integers, just go from the smallest to the largest key
        hist = Hist(n_bins, sorted_keys[0] - 0.5, sorted_keys[-1] + 0.5)

    for ival, key in enumerate(bin_labels):
        count = values[key]
        if is_string:
            label = key
            ibin = ival + 1
        else:
            label = ''
            ibin = hist.find_bin(key)
        hist.set_ibin(ibin, count, error=math.sqrt(count), label=label)

    # make sure there's no overflows
    if hist.bin_contents[0] != 0.0 or hist.bin_contents[-1] != 0.0:
        for ibin in range(hist.n_bins + 2):
            print('%d %f %f' % (ibin, hist.low_edges[ibin], hist.bin_contents[ibin]))
        raise Exception('overflows in ' + hist_label)

    return hist