def make_bool_hist(n_true, n_false, hist_label):
    """
    Build a two-bin histogram describing a boolean outcome.

    Bin 1 (labelled 'right') holds the fraction true, i.e. n_true / (n_true + n_false),
    and bin 2 (labelled 'wrong') holds the fraction false. The error on each bin is
    the larger of the distances from the fraction to the lower and to the upper
    bound returned by fraction_uncertainty.err().

    Raises ZeroDivisionError if n_true + n_false is zero.
    """
    n_total = n_true + n_false
    hist = TH1D(hist_label, '', 2, -0.5, 1.5)
    hist.Sumw2()

    def fill_bin(ibin, numer):
        """ set content and (symmetrized) error of bin <ibin> from <numer> / <n_total> """
        frac = float(numer) / n_total
        hist.SetBinContent(ibin, frac)
        bounds = fraction_uncertainty.err(numer, n_total, use_beta=True)
        # compare the fraction to *both* bounds: subtracting the upper bound from
        # itself would always give zero and so silently ignore the upper deviation
        hist.SetBinError(ibin, max(abs(frac - bounds[0]), abs(bounds[1] - frac)))

    fill_bin(1, n_true)
    fill_bin(2, n_false)

    xaxis = hist.GetXaxis()
    xaxis.SetNdivisions(0)
    xaxis.SetBinLabel(1, 'right')
    xaxis.SetBinLabel(2, 'wrong')
    xaxis.SetLabelSize(0.1)
    return hist
def finalize(self, calculate_uncertainty=True):
    """
    Convert the accumulated counts into mutation frequencies.

    For each gene and (sorted) position: store the per-base frequencies in
    <self.freqs>, append a plotting entry to <self.plotting_info>, and write the
    overall mutated fraction back into <self.counts>. If <calculate_uncertainty>
    is set, the lower/upper bounds from fraction_uncertainty.err() are stored as
    well (this is the slow part); otherwise the overall bounds are set to zero.
    Afterwards the mean-rate histograms are normalized. May only be called once.
    """
    assert not self.finalized

    self.n_cached, self.n_not_cached = 0, 0
    for gene in self.counts:
        self.freqs[gene], self.plotting_info[gene] = {}, []
        # NOTE <self.counts> holds the overall (not per-base) frequencies, while <self.freqs> holds the per-base ones
        gene_counts = self.counts[gene]
        gene_freqs = self.freqs[gene]
        gene_info = self.plotting_info[gene]
        for position in sorted(gene_counts):
            pcounts = gene_counts[position]
            pfreqs = {}
            gene_freqs[position] = pfreqs
            pinfo = {}
            gene_info.append(pinfo)
            pinfo['name'] = utils.sanitize_name(gene) + '_' + str(position)
            pinfo['nuke_freqs'] = {}

            n_conserved, n_mutated = 0, 0
            for nuke in utils.nukes:
                nuke_freq = float(pcounts[nuke]) / pcounts['total']
                pfreqs[nuke] = nuke_freq
                pinfo['nuke_freqs'][nuke] = nuke_freq
                if calculate_uncertainty:
                    errs = fraction_uncertainty.err(pcounts[nuke], pcounts['total'])
                    # third element says whether the result came from the cache
                    if errs[2]:
                        self.n_cached += 1
                    else:
                        self.n_not_cached += 1
                    # sanity checks on the bounds: they look redundant, but they have caught real bugs
                    assert errs[0] <= nuke_freq
                    assert nuke_freq <= errs[1]
                    pfreqs[nuke + '_lo_err'] = errs[0]
                    pfreqs[nuke + '_hi_err'] = errs[1]

                if nuke == pcounts['gl_nuke']:
                    n_conserved += pcounts[nuke]
                else:
                    n_mutated += pcounts[nuke]  # sum over the non-germline bases

            pcounts['freq'] = float(n_mutated) / pcounts['total']
            if calculate_uncertainty:
                mutated_fraction_err = fraction_uncertainty.err(n_mutated, pcounts['total'])
                if mutated_fraction_err[2]:
                    self.n_cached += 1
                else:
                    self.n_not_cached += 1
            else:
                mutated_fraction_err = (0.0, 0.0)
            pcounts['freq_lo_err'] = mutated_fraction_err[0]
            pcounts['freq_hi_err'] = mutated_fraction_err[1]

    self.mean_rates['all'].normalize()
    for region in utils.regions:
        self.mean_rates[region].normalize()

    self.finalized = True
def finalize(self, calculate_uncertainty=True):
    """
    Convert the accumulated counts into mutation frequencies.

    Per gene and position this fills <self.freqs> (per-base frequencies, plus
    lower/upper bounds when <calculate_uncertainty> is set), appends an entry to
    <self.plotting_info>, and stores the overall mutated fraction and its bounds
    in <self.counts>. The mean-rate histograms are then normalized with overflow
    warnings switched off. May only be called once.
    """
    assert not self.finalized

    self.n_cached, self.n_not_cached = 0, 0

    def tally(was_cached):
        """ keep track of how often the uncertainty calculation hit its cache """
        if was_cached:
            self.n_cached += 1
        else:
            self.n_not_cached += 1

    for gene in self.counts:
        self.freqs[gene], self.plotting_info[gene] = {}, []
        # NOTE <counts> holds the overall (not per-base) frequencies, while <freqs> holds the per-base frequencies
        counts, freqs, plotting_info = self.counts[gene], self.freqs[gene], self.plotting_info[gene]
        for position in sorted(counts):
            here = counts[position]
            freqs[position] = {}
            plotting_info.append({})
            entry = plotting_info[-1]
            entry['name'] = utils.sanitize_name(gene) + '_' + str(position)
            entry['nuke_freqs'] = {}

            n_conserved, n_mutated = 0, 0
            for nuke in utils.nukes:
                nuke_freq = float(here[nuke]) / here['total']
                freqs[position][nuke] = nuke_freq
                entry['nuke_freqs'][nuke] = nuke_freq
                if calculate_uncertainty:  # it's kinda slow
                    errs = fraction_uncertainty.err(here[nuke], here['total'])
                    tally(errs[2])
                    # the frequency has to lie within its own bounds (these checks have caught real bugs before)
                    assert errs[0] <= nuke_freq
                    assert nuke_freq <= errs[1]
                    freqs[position][nuke + '_lo_err'] = errs[0]
                    freqs[position][nuke + '_hi_err'] = errs[1]

                if nuke == here['gl_nuke']:
                    n_conserved += here[nuke]
                else:
                    n_mutated += here[nuke]  # sum over everything that isn't the germline base

            here['freq'] = float(n_mutated) / here['total']
            mutated_fraction_err = (0.0, 0.0)
            if calculate_uncertainty:  # it's kinda slow
                mutated_fraction_err = fraction_uncertainty.err(n_mutated, here['total'])
                tally(mutated_fraction_err[2])
            here['freq_lo_err'] = mutated_fraction_err[0]
            here['freq_hi_err'] = mutated_fraction_err[1]

    # we expect overflows in mute freq hists, so no need to warn us
    for key in ['all'] + list(utils.regions):
        self.mean_rates[key].normalize(overflow_warn=False)

    self.finalized = True
def plot(self, plotdir): utils.prep_dir(plotdir + '/plots', wildling=None, multilings=['*.csv', '*.svg', '*.root']) for column in self.values: if self.only_correct_gene_fractions and column not in bool_columns: continue if column in bool_columns: right = self.values[column]['right'] wrong = self.values[column]['wrong'] errs = fraction_uncertainty.err(right, right+wrong) print ' %s\n correct up to allele: %4d / %-4d = %4.4f (-%.3f, +%.3f)' % (column, right, right+wrong, float(right) / (right + wrong), errs[0], errs[1]) hist = plotting.make_bool_hist(right, wrong, self.name + '-' + column) plotting.draw(hist, 'bool', plotname=column, plotdir=plotdir, write_csv=True) else: # TODO this is dumb... I should make the integer-valued ones histograms as well hist = plotting.make_hist_from_dict_of_counts(self.values[column], 'int', self.name + '-' + column, normalize=True) log = '' if column.find('hamming_to_true_naive') >= 0: hist.GetXaxis().SetTitle('hamming distance') else: hist.GetXaxis().SetTitle('inferred - true') plotting.draw(hist, 'int', plotname=column, plotdir=plotdir, write_csv=True, log=log) for column in self.hists: hist = plotting.make_hist_from_my_hist_class(self.hists[column], column) plotting.draw(hist, 'float', plotname=column, plotdir=plotdir, write_csv=True, log=log) check_call(['./bin/makeHtml', plotdir, '3', 'null', 'svg']) check_call(['./bin/permissify-www', plotdir]) # NOTE this should really permissify starting a few directories higher up
def plot(self, plotdir, only_csv=False): utils.prep_dir(plotdir, wildling=None, multilings=['*.csv', '*.svg', '*.root']) for column in self.values: if self.only_correct_gene_fractions and column not in bool_columns: continue if column in bool_columns: right = self.values[column]['right'] wrong = self.values[column]['wrong'] errs = fraction_uncertainty.err(right, right+wrong) print ' %s\n correct up to allele: %4d / %-4d = %4.4f (-%.3f, +%.3f)' % (column, right, right+wrong, float(right) / (right + wrong), errs[0], errs[1]) hist = plotting.make_bool_hist(right, wrong, self.name + '-' + column) plotting.draw_no_root(hist, plotname=column, plotdir=plotdir, write_csv=True, stats='0-bin', only_csv=only_csv) else: # TODO this is dumb... I should make the integer-valued ones histograms as well hist = plotting.make_hist_from_dict_of_counts(self.values[column], 'int', self.name + '-' + column, normalize=True) log = '' if column.find('hamming_to_true_naive') >= 0: # TODO why doesn't this just use the config dicts in plotheaders or wherever? hist.title = 'hamming distance' else: hist.title = 'inferred - true' plotting.draw_no_root(hist, plotname=column, plotdir=plotdir, write_csv=True, log=log, only_csv=only_csv) for column in self.hists: plotting.draw_no_root(self.hists[column], plotname=column, plotdir=plotdir, write_csv=True, log=log, only_csv=only_csv) if not only_csv: plotting.make_html(plotdir)
def get_single_vals(pv): yvals = [float(c) / t for c, t in zip(pv['ycounts'], pv['ytotals'])] # total shouldn't be able to be zero tmphilos = [fraction_uncertainty.err(c, t) for c, t in zip(pv['ycounts'], pv['ytotals'])] yerrs = [err[1] - err[0] for err in tmphilos] print ' %s %s' % (xlabel, ylabel) for iv in range(len(pv['xvals'])): print ' %8.0f %5.0f / %-5.0f = %5.2f +/- %.3f' % (pv['xvals'][iv], pv['ycounts'][iv], pv['ytotals'][iv], yvals[iv], yerrs[iv]) return pv['xvals'], yvals, yerrs
def make_bool_hist(n_true, n_false, hist_label):
    """
    Fill a two-bin histogram with the fraction true in the first bin (labelled
    'right') and the fraction false in the second (labelled 'wrong').

    The error assigned to each bin is the larger of the two distances between
    the fraction and the lower/upper bounds from fraction_uncertainty.err().

    Raises ZeroDivisionError if n_true + n_false is zero.
    """
    hist = TH1D(hist_label, '', 2, -0.5, 1.5)
    hist.Sumw2()

    total = n_true + n_false
    for ibin, count in enumerate((n_true, n_false), 1):
        frac = float(count) / total
        hist.SetBinContent(ibin, frac)
        lower, upper = fraction_uncertainty.err(count, total, use_beta=True)[:2]
        # the upper deviation has to be measured from the fraction (measuring
        # the upper bound against itself always yields zero)
        hist.SetBinError(ibin, max(abs(frac - lower), abs(upper - frac)))

    hist.GetXaxis().SetNdivisions(0)
    hist.GetXaxis().SetBinLabel(1, 'right')
    hist.GetXaxis().SetBinLabel(2, 'wrong')
    hist.GetXaxis().SetLabelSize(0.1)
    return hist
def get_uncertainty(self, obs, total):
    """
    Return the (lower, upper) bounds on the fraction <obs> / <total>.

    If <self.calculate_uncertainty> is not set (the calculation is kinda slow)
    the trivial bounds (0., 1.) are returned. Otherwise the bounds come from
    fraction_uncertainty.err(), and <self.n_cached> or <self.n_not_cached> is
    incremented according to the third element of its return value.
    """
    if not self.calculate_uncertainty:
        return 0., 1.

    bounds = fraction_uncertainty.err(obs, total)
    if bounds[2]:
        self.n_cached += 1
    else:
        self.n_not_cached += 1
    return bounds[0], bounds[1]
def plot(self, plotdir, only_csv=False): print ' plotting performance', import fraction_uncertainty import plotting start = time.time() for substr in self.subplotdirs: utils.prep_dir(plotdir + '/' + substr, wildlings=('*.csv', '*.svg')) for column in self.values: if column in plotconfig.gene_usage_columns: right = self.values[column]['right'] wrong = self.values[column]['wrong'] lo, hi = fraction_uncertainty.err(right, right + wrong) hist = plotting.make_bool_hist(right, wrong, self.name + '-' + column) plotting.draw_no_root(hist, plotname=column, plotdir=plotdir + '/gene-call', write_csv=True, stats='0-bin', only_csv=only_csv) else: hist = plotting.make_hist_from_dict_of_counts(self.values[column], 'int', self.name + '-' + column, normalize=False) if 'hamming_to_true_naive' in column: xtitle = 'hamming distance' tmpplotdir = plotdir + '/mutation' else: xtitle = 'inferred - true' if 'muted' in column: tmpplotdir = plotdir + '/mutation' else: tmpplotdir = plotdir + '/boundaries' plotting.draw_no_root(hist, plotname=column, plotdir=tmpplotdir, write_csv=True, only_csv=only_csv, xtitle=xtitle, shift_overflows=True) for column in self.hists: if '_vs_mute_freq' in column or '_vs_per_gene_support' in column: # only really care about the fraction, which we plot below continue plotting.draw_no_root(self.hists[column], plotname=column, plotdir=plotdir + '/mutation', write_csv=True, only_csv=only_csv, ytitle='counts', xtitle='inferred - true', shift_overflows=True) # fraction correct vs mute freq for region in utils.regions: hright = self.hists[region + '_gene_right_vs_mute_freq'] hwrong = self.hists[region + '_gene_wrong_vs_mute_freq'] if hright.integral(include_overflows=True) == 0: continue plotting.make_fraction_plot(hright, hwrong, plotdir + '/gene-call', region + '_fraction_correct_vs_mute_freq', xlabel='mut freq', ylabel='fraction correct up to allele', xbounds=(0., 0.5), only_csv=only_csv, write_csv=True) # per-gene support stuff for region in utils.regions: if 
self.hists[region + '_allele_right_vs_per_gene_support'].integral(include_overflows=True) == 0: continue hright = self.hists[region + '_allele_right_vs_per_gene_support'] hwrong = self.hists[region + '_allele_wrong_vs_per_gene_support'] plotting.make_fraction_plot(hright, hwrong, plotdir + '/gene-call', region + '_allele_fraction_correct_vs_per_gene_support', xlabel='support', ylabel='fraction with correct allele', xbounds=(-0.1, 1.1), only_csv=only_csv, write_csv=True) if not only_csv: # write html file and fix permissiions for substr in self.subplotdirs: plotting.make_html(plotdir + '/' + substr, n_columns=4) print '(%.1f sec)' % (time.time()-start)
def get_uncertainty(self, obs, total):
    """
    Return the (lower, upper) bounds on the fraction <obs> / <total>.

    The bounds come from fraction_uncertainty.err() if
    <self.calculate_uncertainty> is set (it's kinda slow); otherwise the trivial
    bounds (0., 1.) are returned.
    """
    import fraction_uncertainty

    if not self.calculate_uncertainty:
        return 0., 1.

    bounds = fraction_uncertainty.err(obs, total)
    return bounds[0], bounds[1]
def plot(self, plotdir, only_csv=False): print ' plotting performance', start = time.time() for substr in self.subplotdirs: utils.prep_dir(plotdir + '/' + substr, wildlings=('*.csv', '*.svg')) for column in self.values: if column in bool_columns: right = self.values[column]['right'] wrong = self.values[column]['wrong'] lo, hi, _ = fraction_uncertainty.err(right, right + wrong) hist = plotting.make_bool_hist(right, wrong, self.name + '-' + column) plotting.draw_no_root(hist, plotname=column, plotdir=plotdir + '/gene-call', write_csv=True, stats='0-bin', only_csv=only_csv) else: hist = plotting.make_hist_from_dict_of_counts(self.values[column], 'int', self.name + '-' + column, normalize=False) if 'hamming_to_true_naive' in column: xtitle = 'hamming distance' tmpplotdir = plotdir + '/mutation' else: xtitle = 'inferred - true' if 'muted' in column: tmpplotdir = plotdir + '/mutation' else: tmpplotdir = plotdir + '/boundaries' plotting.draw_no_root(hist, plotname=column, plotdir=tmpplotdir, write_csv=True, only_csv=only_csv, xtitle=xtitle, shift_overflows=True) for column in self.hists: if '_vs_mute_freq' in column or '_vs_per_gene_support' in column: # only really care about the fraction, which we plot below continue plotting.draw_no_root(self.hists[column], plotname=column, plotdir=plotdir + '/mutation', write_csv=True, only_csv=only_csv, ytitle='counts', xtitle='inferred - true', shift_overflows=True) # fraction correct vs mute freq for region in utils.regions: hright = self.hists[region + '_gene_right_vs_mute_freq'] hwrong = self.hists[region + '_gene_wrong_vs_mute_freq'] if hright.integral(include_overflows=True) == 0: continue plotting.make_fraction_plot(hright, hwrong, plotdir + '/gene-call', region + '_fraction_correct_vs_mute_freq', xlabel='mut freq', ylabel='fraction correct up to allele', xbounds=(0., 0.5), only_csv=only_csv, write_csv=True) # per-gene support stuff for region in utils.regions: if self.hists[region + 
'_allele_right_vs_per_gene_support'].integral(include_overflows=True) == 0: continue hright = self.hists[region + '_allele_right_vs_per_gene_support'] hwrong = self.hists[region + '_allele_wrong_vs_per_gene_support'] plotting.make_fraction_plot(hright, hwrong, plotdir + '/gene-call', region + '_allele_fraction_correct_vs_per_gene_support', xlabel='support', ylabel='fraction with correct allele', xbounds=(-0.1, 1.1), only_csv=only_csv, write_csv=True) if not only_csv: # write html file and fix permissiions for substr in self.subplotdirs: plotting.make_html(plotdir + '/' + substr, n_columns=4) print '(%.1f sec)' % (time.time()-start)
def make_hist_from_bin_entry_file(fname, hist_label='', log=''): hist = Hist(fname=fname) if '_gene' in fname and '_vs_' not in fname: import re tot = 30000 correct = int(tot * hist.bin_contents[1]) incorrect = int(tot * hist.bin_contents[2]) frac = float(correct) / (correct + incorrect) # print correct, incorrect, tot bounds = fraction_uncertainty.err(correct, correct + incorrect, for_paper=True) print ' ', re.findall('[vdj]_gene', fname)[0], frac, 0.5*(bounds[1] - bounds[0]) # sys.exit() roothist = make_hist_from_my_hist_class(hist, hist_label) return roothist
def get_allele_finding_xyvals(self, gene, position):
    """
    Collect the values needed to fit mutation frequency vs number of mutations.

    Only entries of self.counts[gene][position] whose key (the number of
    mutations) is below <self.n_max_mutations_per_segment> are used. Returns a
    dict of parallel lists: 'obs' (muted counts), 'total', 'n_mutelist' (the
    keys), 'freqs' (muted / total, or 0. if total isn't positive), 'errs' (half
    the width of the interval from fraction_uncertainty.err()) and 'weights'
    (1 / err**2).
    """
    gpcounts = self.counts[gene][position]
    n_max = self.n_max_mutations_per_segment
    kept = [(n_mutes, info) for n_mutes, info in gpcounts.items() if n_mutes < n_max]

    obs = [info['muted'] for _, info in kept]
    intervals = [fraction_uncertainty.err(info['muted'], info['total'], use_beta=True) for _, info in kept]
    errs = [(upper - lower) / 2 for lower, upper, _ in intervals]
    weights = [1./(err*err) for err in errs]
    freqs = []
    for _, info in kept:
        if info['total'] > 0:
            freqs.append(float(info['muted']) / info['total'])
        else:
            freqs.append(0.)
    total = [info['total'] for _, info in kept]
    n_mutelist = [n_mutes for n_mutes in gpcounts.keys() if n_mutes < n_max]

    return {'obs' : obs, 'total' : total, 'n_mutelist' : n_mutelist, 'freqs' : freqs, 'errs' : errs, 'weights' : weights}
def add_gene_calls_vs_mute_freq_plots(args, hists, rebin=1.):
    """
    Add a fraction-correct-vs-mutation-frequency histogram for each region.

    For every index into <args.names> and every region, divides the
    '<region>_gene_right_vs_mute_freq' histogram by the sum of the right and
    wrong ones, scales the result by 1 / <rebin>, sets each bin's error
    (under/overflow included) to half the width of the interval from
    fraction_uncertainty.err(), and stores it in <hists> under
    '<region>_gene_fraction_vs_mute_freq'.
    """
    for idir, _ in enumerate(args.names):
        dir_hists = hists[idir]
        for region in utils.regions:
            hright = dir_hists[region + '_gene_right_vs_mute_freq']
            hwrong = dir_hists[region + '_gene_wrong_vs_mute_freq']

            # denominator is right + wrong
            hdenom = TH1D(hright)
            hdenom.Add(hwrong)

            hfrac = TH1D(hright)
            hfrac.Divide(hdenom)
            hfrac.Scale(1. / rebin)

            n_bins_with_overflows = hfrac.GetNbinsX() + 2
            for ib in range(n_bins_with_overflows):
                n_right = hright.GetBinContent(ib)
                n_total = hdenom.GetBinContent(ib)
                lo, hi, cached = fraction_uncertainty.err(n_right, n_total, for_paper=True)
                hfrac.SetBinError(ib, (hi - lo) / 2.)

            dir_hists[region + '_gene_fraction_vs_mute_freq'] = hfrac
def make_hist_from_bin_entry_file(fname, hist_label='', log=''): hist = Hist(fname=fname) if '_gene' in fname and '_vs_' not in fname: import re tot = 30000 correct = int(tot * hist.bin_contents[1]) incorrect = int(tot * hist.bin_contents[2]) frac = float(correct) / (correct + incorrect) # print correct, incorrect, tot bounds = fraction_uncertainty.err(correct, correct + incorrect, for_paper=True) print ' ', re.findall('[vdj]_gene', fname)[0], frac, 0.5 * (bounds[1] - bounds[0]) # sys.exit() roothist = make_hist_from_my_hist_class(hist, hist_label) return roothist
def add_gene_calls_vs_mute_freq_plots(args, hists, rebin=1., debug=False): print 'TODO what\'s up with rebin rescaling below?' for idir in range(len(args.names)): name = args.names[idir] for region in utils.regions: hright = hists[idir][region + '_gene_right_vs_mute_freq'] hwrong = hists[idir][region + '_gene_wrong_vs_mute_freq'] hdenom = copy.deepcopy(hright) hdenom.add(hwrong) hfrac = copy.deepcopy(hright) hfrac.divide_by(hdenom) # hfrac.Scale(1. / rebin) if debug: print name, region for ib in range(hfrac.n_bins + 2): lo, hi, cached = fraction_uncertainty.err(hright.bin_contents[ib], hdenom.bin_contents[ib], for_paper=True) hfrac.errors[ib] = (hi - lo) / 2. if debug: print '%5d %5d %.2f %.3f' % (hright.bin_contents[ib], hdenom.bin_contents[ib], hfrac.bin_contents[ib], (hi - lo) / 2.) hists[idir][region + '_gene_fraction_vs_mute_freq'] = hfrac
def add_gene_calls_vs_mute_freq_plots(args, hists, rebin=1.):
    """
    For each entry in <args.names> and each region, make the histogram of the
    fraction of correct gene calls vs mutation frequency.

    The fraction is right / (right + wrong), scaled by 1 / <rebin>. The error on
    every bin, including under- and overflow, is half the distance between the
    bounds returned by fraction_uncertainty.err(). The new histogram is added to
    <hists> with the key '<region>_gene_fraction_vs_mute_freq'.
    """
    suffix_right = '_gene_right_vs_mute_freq'
    suffix_wrong = '_gene_wrong_vs_mute_freq'
    suffix_frac = '_gene_fraction_vs_mute_freq'

    for idir in range(len(args.names)):
        for region in utils.regions:
            hright = hists[idir][region + suffix_right]
            hwrong = hists[idir][region + suffix_wrong]

            hdenom = TH1D(hright)
            hdenom.Add(hwrong)
            hfrac = TH1D(hright)
            hfrac.Divide(hdenom)
            hfrac.Scale(1. / rebin)

            ib = 0
            last_bin = hfrac.GetNbinsX() + 1  # i.e. the overflow bin
            while ib <= last_bin:
                bounds = fraction_uncertainty.err(hright.GetBinContent(ib), hdenom.GetBinContent(ib), for_paper=True)
                lo, hi, cached = bounds
                hfrac.SetBinError(ib, (hi - lo) / 2.)
                ib += 1

            hists[idir][region + suffix_frac] = hfrac
def tigger_calcs(self, position, gcounts, mean_x_icpt): iterinfo = gcounts['tigger'].items() obs = [d['muted'] for nm, d in iterinfo if nm < self.n_max_mutes] if sum(obs) < self.n_obs_min: # ignore positions with only a few observed mutations return lohis = [fraction_uncertainty.err(d['muted'], d['total'], use_beta=True) for nm, d in iterinfo if nm < self.n_max_mutes] errs = [(hi - lo) / 2 for lo, hi, _ in lohis] weights = [1./(e*e) for e in errs] freqs = [float(d['muted']) / d['total'] for nm, d in iterinfo if nm < self.n_max_mutes] total = [d['total'] for nm, d in iterinfo if nm < self.n_max_mutes] # for i in range(len(freqs)): # print ' %3d / %3d = %6.2f %6.2f %6.2f' % (obs[i], total[i], freqs[i], errs[i], weights[i]) n_mutelist = [nm for nm in gcounts['tigger'].keys() if nm < self.n_max_mutes] params, cov = numpy.polyfit(n_mutelist, freqs, 1, w=weights, cov=True) slope, slope_err = params[0], math.sqrt(cov[0][0]) y_icpt, y_icpt_err = params[1], math.sqrt(cov[1][1]) interesting = False if y_icpt + y_icpt_err < 1./8: x_icpt, x_icpt_err = -y_icpt / slope, abs(y_icpt / slope) * math.sqrt((y_icpt_err/y_icpt)**2 + (slope_err/slope)**2) mean_x_icpt['sum'] += x_icpt / x_icpt_err mean_x_icpt['total'] += 1. / x_icpt_err else: x_icpt, x_icpt_err = 0, 0 interesting = True print_str = ' %3d %9.3f +/- %-9.3f %9.3f +/- %-9.3f %7.4f +/- %7.4f %3d / %3d' % (position, x_icpt, x_icpt_err, y_icpt, y_icpt_err, slope, slope_err, sum(obs), sum(total)) if interesting: print_str = utils.color('red', print_str) print print_str # for testing it's easier to make plots here: # plotinfo = {'n_muted' : n_mutelist, 'freqs' : freqs, 'errs' : errs, 'slope' : slope, 'intercept' : y_icpt} # plotting.make_tigger_plot('IGHVX', position, plotinfo) return plotinfo
def frac_err(obs, total):
    """
    Return half the width of the uncertainty interval on <obs> / <total>.

    The bounds are read by index rather than unpacked into two names, so this
    works whether fraction_uncertainty.err() returns (lo, hi) or, as other
    callers expect, (lo, hi, cached).
    """
    bounds = fraction_uncertainty.err(obs, total)
    return 0.5 * (bounds[1] - bounds[0])
def set_bin(numer, denom, ibin, label):
    """
    Set bin <ibin> of <hist> to the fraction <numer> / <denom>, labelled <label>.

    The bin's error is the larger of the distances from the fraction to the
    lower and upper bounds from fraction_uncertainty.err().
    """
    frac = float(numer) / denom
    bounds = fraction_uncertainty.err(numer, denom, use_beta=True)
    lower, upper = bounds[0], bounds[1]
    deviations = [abs(frac - edge) for edge in (lower, upper)]
    hist.set_ibin(ibin, frac, error=max(deviations), label=label)
def plot(self, plotdir, only_csv=False): utils.prep_dir(plotdir, wildling=None, multilings=['*.csv', '*.svg', '*.root']) for column in self.values: if self.only_correct_gene_fractions and column not in bool_columns: continue if column in bool_columns: right = self.values[column]['right'] wrong = self.values[column]['wrong'] errs = fraction_uncertainty.err(right, right+wrong) print ' %s\n correct up to allele: %4d / %-4d = %4.4f (-%.3f, +%.3f)' % (column, right, right+wrong, float(right) / (right + wrong), errs[0], errs[1]) hist = plotting.make_bool_hist(right, wrong, self.name + '-' + column) plotting.draw_no_root(hist, plotname=column, plotdir=plotdir, write_csv=True, stats='0-bin', only_csv=only_csv) else: # TODO this is dumb... I should make the integer-valued ones histograms as well hist = plotting.make_hist_from_dict_of_counts(self.values[column], 'int', self.name + '-' + column, normalize=True) log = '' if column.find('hamming_to_true_naive') >= 0: # TODO why doesn't this just use the config dicts in plotheaders or wherever? hist.title = 'hamming distance' else: hist.title = 'inferred - true' plotting.draw_no_root(hist, plotname=column, plotdir=plotdir, write_csv=True, log=log, only_csv=only_csv) for column in self.hists: plotting.draw_no_root(self.hists[column], plotname=column, plotdir=plotdir, write_csv=True, log=log, only_csv=only_csv) # per-gene support crap for region in utils.regions: if self.hists[region + '_allele_right_vs_per_gene_support'].integral(include_overflows=True) == 0: continue xvals = self.hists[region + '_allele_right_vs_per_gene_support'].get_bin_centers() #ignore_overflows=True) right = self.hists[region + '_allele_right_vs_per_gene_support'].bin_contents wrong = self.hists[region + '_allele_wrong_vs_per_gene_support'].bin_contents yvals = [float(r) / (r + w) if r + w > 0. else 0. for r, w in zip(right, wrong)] # remove values corresponding to bins with no entries while yvals.count(0.) > 0: iv = yvals.index(0.) 
xvals.pop(iv) right.pop(iv) wrong.pop(iv) yvals.pop(iv) tmphilos = [fraction_uncertainty.err(r, r + w) for r, w in zip(right, wrong)] yerrs = [err[1] - err[0] for err in tmphilos] # fitting a line isn't particularly informative, actually # params, cov = numpy.polyfit(xvals, yvals, 1, w=[1./(e*e) if e > 0. else 0. for e in yerrs], cov=True) # slope, slope_err = params[0], math.sqrt(cov[0][0]) # y_icpt, y_icpt_err = params[1], math.sqrt(cov[1][1]) # print '%s slope: %5.2f +/- %5.2f y-intercept: %5.2f +/- %5.2f' % (region, slope, slope_err, y_icpt, y_icpt_err) # print '%s' % region # for iv in range(len(xvals)): # print ' %5.2f %5.0f / %5.0f = %5.2f +/- %.3f' % (xvals[iv], right[iv], right[iv] + wrong[iv], yvals[iv], yerrs[iv]) fig, ax = plotting.mpl_init() ax.errorbar(xvals, yvals, yerr=yerrs, markersize=10, linewidth=1, marker='.') ax.plot((0, 1), (0, 1), color='black', linestyle='--', linewidth=3) # line with slope 1 and intercept 0 # linevals = [slope*x + y_icpt for x in [0] + xvals] # fitted line # ax.plot([0] + xvals, linevals) plotting.mpl_finish(ax, plotdir, region + '_allele_fraction_correct_vs_per_gene_support', xlabel='support', ylabel='fraction correct', xbounds=(-0.1, 1.1), ybounds=(-0.1, 1.1)) if not only_csv: plotting.make_html(plotdir)