Пример #1
0
def make_bool_hist(n_true, n_false, hist_label):
    """
    Fill a two-bin histogram with the fraction true in the first bin and the fraction false in the second.

    Bin 1 (labelled 'right') is set to n_true / (n_true + n_false) and bin 2 (labelled 'wrong') to
    n_false / (n_true + n_false). Each bin's error is the larger of the two distances from the fraction to the
    first two values returned by fraction_uncertainty.err(..., use_beta=True).

    Raises ZeroDivisionError if n_true + n_false is zero.
    """
    hist = TH1D(hist_label, '', 2, -0.5, 1.5)
    hist.Sumw2()

    n_total = n_true + n_false
    for ibin, count in ((1, n_true), (2, n_false)):
        frac = float(count) / n_total
        hist.SetBinContent(ibin, frac)
        bounds = fraction_uncertainty.err(count, n_total, use_beta=True)
        # NOTE the second term used to be bounds[1] - bounds[1] (always zero), so the upper bound was silently ignored
        hist.SetBinError(ibin, max(abs(frac - bounds[0]), abs(bounds[1] - frac)))

    hist.GetXaxis().SetNdivisions(0)
    hist.GetXaxis().SetBinLabel(1, 'right')
    hist.GetXaxis().SetBinLabel(2, 'wrong')
    hist.GetXaxis().SetLabelSize(0.1)

    return hist
Пример #2
0
    def finalize(self, calculate_uncertainty=True):
        """ turn the accumulated counts into mut freqs (per-base in <freqs>, overall in <counts>); may only be run once """
        assert not self.finalized

        self.n_cached = 0
        self.n_not_cached = 0
        for gene in self.counts:
            self.freqs[gene] = {}
            self.plotting_info[gene] = []
            # NOTE <counts> hold the overall (not per-base) frequencies, while <freqs> holds the per-base frequencies
            gene_counts = self.counts[gene]
            gene_freqs = self.freqs[gene]
            gene_plot_info = self.plotting_info[gene]
            for position in sorted(gene_counts):
                pos_freqs = gene_freqs[position] = {}
                pos_plot = {}
                gene_plot_info.append(pos_plot)
                pos_plot['name'] = utils.sanitize_name(gene) + '_' + str(position)
                pos_plot['nuke_freqs'] = {}
                pos_counts = gene_counts[position]
                n_conserved = 0
                n_mutated = 0
                for nuke in utils.nukes:
                    nuke_count = pos_counts[nuke]
                    nuke_freq = float(nuke_count) / pos_counts['total']
                    pos_freqs[nuke] = nuke_freq
                    pos_plot['nuke_freqs'][nuke] = nuke_freq
                    if calculate_uncertainty:  # it's kinda slow
                        errs = fraction_uncertainty.err(nuke_count, pos_counts['total'])
                        if not errs[2]:
                            self.n_not_cached += 1
                        else:
                            self.n_cached += 1
                        # sanity checks: the frequency has to sit between the two returned bounds
                        assert errs[0] <= nuke_freq
                        assert nuke_freq <= errs[1]
                        pos_freqs[nuke + '_lo_err'] = errs[0]
                        pos_freqs[nuke + '_hi_err'] = errs[1]

                    if nuke != pos_counts['gl_nuke']:
                        n_mutated += nuke_count  # sum over every base that isn't the germline one
                    else:
                        n_conserved += nuke_count

                pos_counts['freq'] = float(n_mutated) / pos_counts['total']
                if calculate_uncertainty:  # it's kinda slow
                    mutated_fraction_err = fraction_uncertainty.err(n_mutated, pos_counts['total'])
                    if not mutated_fraction_err[2]:
                        self.n_not_cached += 1
                    else:
                        self.n_cached += 1
                else:
                    mutated_fraction_err = (0.0, 0.0)
                pos_counts['freq_lo_err'] = mutated_fraction_err[0]
                pos_counts['freq_hi_err'] = mutated_fraction_err[1]

        self.mean_rates['all'].normalize()
        for region in utils.regions:
            self.mean_rates[region].normalize()

        self.finalized = True
Пример #3
0
    def finalize(self, calculate_uncertainty=True):
        """ turn the accumulated counts into mut freqs (per-base in <freqs>, overall in <counts>); may only be run once """
        assert not self.finalized

        self.n_cached = 0
        self.n_not_cached = 0
        for gene in self.counts:
            self.freqs[gene] = {}
            self.plotting_info[gene] = []
            # NOTE <counts> hold the overall (not per-base) frequencies, while <freqs> holds the per-base frequencies
            gene_counts = self.counts[gene]
            gene_freqs = self.freqs[gene]
            gene_plot_info = self.plotting_info[gene]
            for position in sorted(gene_counts):
                pos_freqs = gene_freqs[position] = {}
                pos_plot = {}
                gene_plot_info.append(pos_plot)
                pos_plot['name'] = utils.sanitize_name(gene) + '_' + str(position)
                pos_plot['nuke_freqs'] = {}
                pos_counts = gene_counts[position]
                n_conserved = 0
                n_mutated = 0
                for nuke in utils.nukes:
                    nuke_count = pos_counts[nuke]
                    nuke_freq = float(nuke_count) / pos_counts['total']
                    pos_freqs[nuke] = nuke_freq
                    pos_plot['nuke_freqs'][nuke] = nuke_freq
                    if calculate_uncertainty:  # it's kinda slow
                        errs = fraction_uncertainty.err(nuke_count, pos_counts['total'])
                        if not errs[2]:
                            self.n_not_cached += 1
                        else:
                            self.n_cached += 1
                        # sanity checks: the frequency has to sit between the two returned bounds
                        assert errs[0] <= nuke_freq
                        assert nuke_freq <= errs[1]
                        pos_freqs[nuke + '_lo_err'] = errs[0]
                        pos_freqs[nuke + '_hi_err'] = errs[1]

                    if nuke != pos_counts['gl_nuke']:
                        n_mutated += nuke_count  # sum over every base that isn't the germline one
                    else:
                        n_conserved += nuke_count

                pos_counts['freq'] = float(n_mutated) / pos_counts['total']
                if calculate_uncertainty:  # it's kinda slow
                    mutated_fraction_err = fraction_uncertainty.err(n_mutated, pos_counts['total'])
                    if not mutated_fraction_err[2]:
                        self.n_not_cached += 1
                    else:
                        self.n_cached += 1
                else:
                    mutated_fraction_err = (0.0, 0.0)
                pos_counts['freq_lo_err'] = mutated_fraction_err[0]
                pos_counts['freq_hi_err'] = mutated_fraction_err[1]

        # we expect overflows in mute freq hists, so no need to warn us
        self.mean_rates['all'].normalize(overflow_warn=False)
        for region in utils.regions:
            self.mean_rates[region].normalize(overflow_warn=False)

        # for gene in self.tmpcounts:
        #     for position in self.tmpcounts[gene]:
        #         self.tmpcounts[gene][position]['muted'].divide_by(self.tmpcounts[gene][position]['total'], debug=False)

        self.finalized = True
 def plot(self, plotdir):
     utils.prep_dir(plotdir + '/plots', wildling=None, multilings=['*.csv', '*.svg', '*.root'])
     for column in self.values:
         if self.only_correct_gene_fractions and column not in bool_columns:
             continue
         if column in bool_columns:
             right = self.values[column]['right']
             wrong = self.values[column]['wrong']
             errs = fraction_uncertainty.err(right, right+wrong)
             print '  %s\n    correct up to allele: %4d / %-4d = %4.4f (-%.3f, +%.3f)' % (column, right, right+wrong, float(right) / (right + wrong), errs[0], errs[1])
             hist = plotting.make_bool_hist(right, wrong, self.name + '-' + column)
             plotting.draw(hist, 'bool', plotname=column, plotdir=plotdir, write_csv=True)
         else:
             # TODO this is dumb... I should make the integer-valued ones histograms as well
             hist = plotting.make_hist_from_dict_of_counts(self.values[column], 'int', self.name + '-' + column, normalize=True)
             log = ''
             if column.find('hamming_to_true_naive') >= 0:
                 hist.GetXaxis().SetTitle('hamming distance')
             else:
                 hist.GetXaxis().SetTitle('inferred - true')
             plotting.draw(hist, 'int', plotname=column, plotdir=plotdir, write_csv=True, log=log)
     for column in self.hists:
         hist = plotting.make_hist_from_my_hist_class(self.hists[column], column)
         plotting.draw(hist, 'float', plotname=column, plotdir=plotdir, write_csv=True, log=log)
     
     check_call(['./bin/makeHtml', plotdir, '3', 'null', 'svg'])
     check_call(['./bin/permissify-www', plotdir])  # NOTE this should really permissify starting a few directories higher up
Пример #5
0
    def plot(self, plotdir, only_csv=False):
        utils.prep_dir(plotdir, wildling=None, multilings=['*.csv', '*.svg', '*.root'])
        for column in self.values:
            if self.only_correct_gene_fractions and column not in bool_columns:
                continue
            if column in bool_columns:
                right = self.values[column]['right']
                wrong = self.values[column]['wrong']
                errs = fraction_uncertainty.err(right, right+wrong)
                print '  %s\n    correct up to allele: %4d / %-4d = %4.4f (-%.3f, +%.3f)' % (column, right, right+wrong, float(right) / (right + wrong), errs[0], errs[1])
                hist = plotting.make_bool_hist(right, wrong, self.name + '-' + column)
                plotting.draw_no_root(hist, plotname=column, plotdir=plotdir, write_csv=True, stats='0-bin', only_csv=only_csv)
            else:
                # TODO this is dumb... I should make the integer-valued ones histograms as well
                hist = plotting.make_hist_from_dict_of_counts(self.values[column], 'int', self.name + '-' + column, normalize=True)
                log = ''
                if column.find('hamming_to_true_naive') >= 0:  # TODO why doesn't this just use the config dicts in plotheaders or wherever?
                    hist.title = 'hamming distance'
                else:
                    hist.title = 'inferred - true'
                plotting.draw_no_root(hist, plotname=column, plotdir=plotdir, write_csv=True, log=log, only_csv=only_csv)
        for column in self.hists:
            plotting.draw_no_root(self.hists[column], plotname=column, plotdir=plotdir, write_csv=True, log=log, only_csv=only_csv)

        if not only_csv:
            plotting.make_html(plotdir)
Пример #6
0
 def get_single_vals(pv):
     yvals = [float(c) / t for c, t in zip(pv['ycounts'], pv['ytotals'])]  # total shouldn't be able to be zero
     tmphilos = [fraction_uncertainty.err(c, t) for c, t in zip(pv['ycounts'], pv['ytotals'])]
     yerrs = [err[1] - err[0] for err in tmphilos]
     print '  %s                    %s' % (xlabel, ylabel)
     for iv in range(len(pv['xvals'])):
         print '   %8.0f     %5.0f / %-5.0f  =  %5.2f   +/-  %.3f' % (pv['xvals'][iv], pv['ycounts'][iv], pv['ytotals'][iv], yvals[iv], yerrs[iv])
     return pv['xvals'], yvals, yerrs
Пример #7
0
def make_bool_hist(n_true, n_false, hist_label):
    """
    Fill a two-bin histogram with the fraction true in the first bin and the fraction false in the second.

    Bin 1 (labelled 'right') is set to n_true / (n_true + n_false) and bin 2 (labelled 'wrong') to
    n_false / (n_true + n_false). Each bin's error is the larger of the two distances from the fraction to the
    first two values returned by fraction_uncertainty.err(..., use_beta=True).

    Raises ZeroDivisionError if n_true + n_false is zero.
    """
    hist = TH1D(hist_label, '', 2, -0.5, 1.5)
    hist.Sumw2()

    n_total = n_true + n_false
    for ibin, count in ((1, n_true), (2, n_false)):
        frac = float(count) / n_total
        hist.SetBinContent(ibin, frac)
        bounds = fraction_uncertainty.err(count, n_total, use_beta=True)
        # NOTE the second term used to be bounds[1] - bounds[1] (always zero), so the upper bound was silently ignored
        hist.SetBinError(ibin, max(abs(frac - bounds[0]), abs(bounds[1] - frac)))

    hist.GetXaxis().SetNdivisions(0)
    hist.GetXaxis().SetBinLabel(1, 'right')
    hist.GetXaxis().SetBinLabel(2, 'wrong')
    hist.GetXaxis().SetLabelSize(0.1)

    return hist
Пример #8
0
    def get_uncertainty(self, obs, total):
        """ Return the first two values from fraction_uncertainty.err(obs, total), or (0., 1.) if uncertainties are switched off. """
        if not self.calculate_uncertainty:
            return 0., 1.

        bounds = fraction_uncertainty.err(obs, total)  # it's kinda slow
        # the third element is tallied as a cache hit when truthy, as a miss otherwise
        if not bounds[2]:
            self.n_not_cached += 1
        else:
            self.n_cached += 1
        return bounds[0], bounds[1]
Пример #9
0
    def plot(self, plotdir, only_csv=False):
        print '  plotting performance',
        import fraction_uncertainty
        import plotting
        start = time.time()
        for substr in self.subplotdirs:
            utils.prep_dir(plotdir + '/' + substr, wildlings=('*.csv', '*.svg'))

        for column in self.values:
            if column in plotconfig.gene_usage_columns:
                right = self.values[column]['right']
                wrong = self.values[column]['wrong']
                lo, hi = fraction_uncertainty.err(right, right + wrong)
                hist = plotting.make_bool_hist(right, wrong, self.name + '-' + column)
                plotting.draw_no_root(hist, plotname=column, plotdir=plotdir + '/gene-call', write_csv=True, stats='0-bin', only_csv=only_csv)
            else:
                hist = plotting.make_hist_from_dict_of_counts(self.values[column], 'int', self.name + '-' + column, normalize=False)
                if 'hamming_to_true_naive' in column:
                    xtitle = 'hamming distance'
                    tmpplotdir = plotdir + '/mutation'
                else:
                    xtitle = 'inferred - true'
                    if 'muted' in column:
                        tmpplotdir = plotdir + '/mutation'
                    else:
                        tmpplotdir = plotdir + '/boundaries'
                plotting.draw_no_root(hist, plotname=column, plotdir=tmpplotdir, write_csv=True, only_csv=only_csv, xtitle=xtitle, shift_overflows=True)

        for column in self.hists:
            if '_vs_mute_freq' in column or '_vs_per_gene_support' in column:  # only really care about the fraction, which we plot below
                continue
            plotting.draw_no_root(self.hists[column], plotname=column, plotdir=plotdir + '/mutation', write_csv=True, only_csv=only_csv, ytitle='counts', xtitle='inferred - true', shift_overflows=True)

        # fraction correct vs mute freq
        for region in utils.regions:
            hright = self.hists[region + '_gene_right_vs_mute_freq']
            hwrong = self.hists[region + '_gene_wrong_vs_mute_freq']
            if hright.integral(include_overflows=True) == 0:
                continue
            plotting.make_fraction_plot(hright, hwrong, plotdir + '/gene-call', region + '_fraction_correct_vs_mute_freq', xlabel='mut freq', ylabel='fraction correct up to allele', xbounds=(0., 0.5), only_csv=only_csv, write_csv=True)

        # per-gene support stuff
        for region in utils.regions:
            if self.hists[region + '_allele_right_vs_per_gene_support'].integral(include_overflows=True) == 0:
                continue
            hright = self.hists[region + '_allele_right_vs_per_gene_support']
            hwrong = self.hists[region + '_allele_wrong_vs_per_gene_support']
            plotting.make_fraction_plot(hright, hwrong, plotdir + '/gene-call', region + '_allele_fraction_correct_vs_per_gene_support', xlabel='support', ylabel='fraction with correct allele', xbounds=(-0.1, 1.1), only_csv=only_csv, write_csv=True)

        if not only_csv:  # write html file and fix permissiions
            for substr in self.subplotdirs:
                plotting.make_html(plotdir + '/' + substr, n_columns=4)

        print '(%.1f sec)' % (time.time()-start)
Пример #10
0
    def get_uncertainty(self, obs, total):
        """ Return the first two values from fraction_uncertainty.err(obs, total), or (0., 1.) if uncertainties are switched off. """
        import fraction_uncertainty
        if not self.calculate_uncertainty:
            return 0., 1.

        bounds = fraction_uncertainty.err(obs, total)  # it's kinda slow
        # (cache hit/miss counting via bounds[2] is disabled here)
        return bounds[0], bounds[1]
Пример #11
0
    def plot(self, plotdir, only_csv=False):
        print '  plotting performance',
        start = time.time()
        for substr in self.subplotdirs:
            utils.prep_dir(plotdir + '/' + substr, wildlings=('*.csv', '*.svg'))

        for column in self.values:
            if column in bool_columns:
                right = self.values[column]['right']
                wrong = self.values[column]['wrong']
                lo, hi, _ = fraction_uncertainty.err(right, right + wrong)
                hist = plotting.make_bool_hist(right, wrong, self.name + '-' + column)
                plotting.draw_no_root(hist, plotname=column, plotdir=plotdir + '/gene-call', write_csv=True, stats='0-bin', only_csv=only_csv)
            else:
                hist = plotting.make_hist_from_dict_of_counts(self.values[column], 'int', self.name + '-' + column, normalize=False)
                if 'hamming_to_true_naive' in column:
                    xtitle = 'hamming distance'
                    tmpplotdir = plotdir + '/mutation'
                else:
                    xtitle = 'inferred - true'
                    if 'muted' in column:
                        tmpplotdir = plotdir + '/mutation'
                    else:
                        tmpplotdir = plotdir + '/boundaries'
                plotting.draw_no_root(hist, plotname=column, plotdir=tmpplotdir, write_csv=True, only_csv=only_csv, xtitle=xtitle, shift_overflows=True)

        for column in self.hists:
            if '_vs_mute_freq' in column or '_vs_per_gene_support' in column:  # only really care about the fraction, which we plot below
                continue
            plotting.draw_no_root(self.hists[column], plotname=column, plotdir=plotdir + '/mutation', write_csv=True, only_csv=only_csv, ytitle='counts', xtitle='inferred - true', shift_overflows=True)

        # fraction correct vs mute freq
        for region in utils.regions:
            hright = self.hists[region + '_gene_right_vs_mute_freq']
            hwrong = self.hists[region + '_gene_wrong_vs_mute_freq']
            if hright.integral(include_overflows=True) == 0:
                continue
            plotting.make_fraction_plot(hright, hwrong, plotdir + '/gene-call', region + '_fraction_correct_vs_mute_freq', xlabel='mut freq', ylabel='fraction correct up to allele', xbounds=(0., 0.5), only_csv=only_csv, write_csv=True)

        # per-gene support stuff
        for region in utils.regions:
            if self.hists[region + '_allele_right_vs_per_gene_support'].integral(include_overflows=True) == 0:
                continue
            hright = self.hists[region + '_allele_right_vs_per_gene_support']
            hwrong = self.hists[region + '_allele_wrong_vs_per_gene_support']
            plotting.make_fraction_plot(hright, hwrong, plotdir + '/gene-call', region + '_allele_fraction_correct_vs_per_gene_support', xlabel='support', ylabel='fraction with correct allele', xbounds=(-0.1, 1.1), only_csv=only_csv, write_csv=True)

        if not only_csv:  # write html file and fix permissiions
            for substr in self.subplotdirs:
                plotting.make_html(plotdir + '/' + substr, n_columns=4)

        print '(%.1f sec)' % (time.time()-start)
Пример #12
0
def make_hist_from_bin_entry_file(fname, hist_label='', log=''):
    hist = Hist(fname=fname)
    if '_gene' in fname and '_vs_' not in fname:
        import re
        tot = 30000
        correct = int(tot * hist.bin_contents[1])
        incorrect = int(tot * hist.bin_contents[2])
        frac = float(correct) / (correct + incorrect)
        # print correct, incorrect, tot
        bounds = fraction_uncertainty.err(correct, correct + incorrect, for_paper=True)
        print '  ', re.findall('[vdj]_gene', fname)[0], frac, 0.5*(bounds[1] - bounds[0])
        # sys.exit()
    roothist = make_hist_from_my_hist_class(hist, hist_label)
    return roothist
Пример #13
0
    def get_allele_finding_xyvals(self, gene, position):
        """ Collect the per-n-mutations values at <position> in <gene> (only keys below self.n_max_mutations_per_segment) into a dict of parallel lists. """
        gpcounts = self.counts[gene][position]
        selected = [(nm, d) for nm, d in gpcounts.items() if nm < self.n_max_mutations_per_segment]

        obs = [d['muted'] for _, d in selected]

        # half the width of each uncertainty interval, and the corresponding inverse-variance weight
        errs = []
        for lo, hi, _ in [fraction_uncertainty.err(d['muted'], d['total'], use_beta=True) for _, d in selected]:
            errs.append((hi - lo) / 2)
        weights = [1./(e*e) for e in errs]

        freqs = []
        for _, d in selected:
            if d['total'] > 0:
                freqs.append(float(d['muted']) / d['total'])
            else:
                freqs.append(0.)
        total = [d['total'] for _, d in selected]

        n_mutelist = [nm for nm, _ in selected]

        return {'obs' : obs, 'total' : total, 'n_mutelist' : n_mutelist, 'freqs' : freqs, 'errs' : errs, 'weights' : weights}
Пример #14
0
def add_gene_calls_vs_mute_freq_plots(args, hists, rebin=1.):
    """ For each entry in args.names and each region, add a right / (right + wrong) hist to <hists> under '<region>_gene_fraction_vs_mute_freq'. """
    for idir, _name in enumerate(args.names):
        for region in utils.regions:
            dir_hists = hists[idir]
            hright = dir_hists[region + '_gene_right_vs_mute_freq']
            hwrong = dir_hists[region + '_gene_wrong_vs_mute_freq']

            # denominator is right + wrong
            hdenom = TH1D(hright)
            hdenom.Add(hwrong)

            hfrac = TH1D(hright)
            hfrac.Divide(hdenom)
            hfrac.Scale(1. / rebin)

            # overwrite the bin errors (bins 0 and n+1 included) with half the width of the uncertainty interval
            n_bins_total = hfrac.GetNbinsX() + 2
            for ib in range(n_bins_total):
                lo, hi, cached = fraction_uncertainty.err(hright.GetBinContent(ib), hdenom.GetBinContent(ib), for_paper=True)
                half_width = (hi - lo) / 2.
                hfrac.SetBinError(ib, half_width)
            dir_hists[region + '_gene_fraction_vs_mute_freq'] = hfrac
Пример #15
0
def make_hist_from_bin_entry_file(fname, hist_label='', log=''):
    hist = Hist(fname=fname)
    if '_gene' in fname and '_vs_' not in fname:
        import re
        tot = 30000
        correct = int(tot * hist.bin_contents[1])
        incorrect = int(tot * hist.bin_contents[2])
        frac = float(correct) / (correct + incorrect)
        # print correct, incorrect, tot
        bounds = fraction_uncertainty.err(correct,
                                          correct + incorrect,
                                          for_paper=True)
        print '  ', re.findall('[vdj]_gene',
                               fname)[0], frac, 0.5 * (bounds[1] - bounds[0])
        # sys.exit()
    roothist = make_hist_from_my_hist_class(hist, hist_label)
    return roothist
Пример #16
0
def add_gene_calls_vs_mute_freq_plots(args, hists, rebin=1., debug=False):
    print 'TODO what\'s up with rebin rescaling below?'
    for idir in range(len(args.names)):
        name = args.names[idir]
        for region in utils.regions:
            hright = hists[idir][region + '_gene_right_vs_mute_freq']
            hwrong = hists[idir][region + '_gene_wrong_vs_mute_freq']
            hdenom = copy.deepcopy(hright)
            hdenom.add(hwrong)
            hfrac = copy.deepcopy(hright)
            hfrac.divide_by(hdenom)
            # hfrac.Scale(1. / rebin)  
            if debug:
                print name, region
            for ib in range(hfrac.n_bins + 2):
                lo, hi, cached = fraction_uncertainty.err(hright.bin_contents[ib], hdenom.bin_contents[ib], for_paper=True)
                hfrac.errors[ib] = (hi - lo) / 2.
                if debug:
                    print '%5d %5d   %.2f   %.3f' % (hright.bin_contents[ib], hdenom.bin_contents[ib], hfrac.bin_contents[ib], (hi - lo) / 2.)
            hists[idir][region + '_gene_fraction_vs_mute_freq'] = hfrac
Пример #17
0
def add_gene_calls_vs_mute_freq_plots(args, hists, rebin=1.):
    """ For each entry in args.names and each region, add a right / (right + wrong) hist to <hists> under '<region>_gene_fraction_vs_mute_freq'. """
    for idir, _name in enumerate(args.names):
        for region in utils.regions:
            dir_hists = hists[idir]
            hright = dir_hists[region + '_gene_right_vs_mute_freq']
            hwrong = dir_hists[region + '_gene_wrong_vs_mute_freq']

            # denominator is right + wrong
            hdenom = TH1D(hright)
            hdenom.Add(hwrong)

            hfrac = TH1D(hright)
            hfrac.Divide(hdenom)
            hfrac.Scale(1. / rebin)

            # overwrite the bin errors (bins 0 and n+1 included) with half the width of the uncertainty interval
            n_bins_total = hfrac.GetNbinsX() + 2
            for ib in range(n_bins_total):
                lo, hi, cached = fraction_uncertainty.err(hright.GetBinContent(ib), hdenom.GetBinContent(ib), for_paper=True)
                half_width = (hi - lo) / 2.
                hfrac.SetBinError(ib, half_width)
            dir_hists[region + '_gene_fraction_vs_mute_freq'] = hfrac
Пример #18
0
    def tigger_calcs(self, position, gcounts, mean_x_icpt):
        iterinfo = gcounts['tigger'].items()

        obs = [d['muted'] for nm, d in iterinfo if nm < self.n_max_mutes]
        if sum(obs) < self.n_obs_min:  # ignore positions with only a few observed mutations
            return

        lohis = [fraction_uncertainty.err(d['muted'], d['total'], use_beta=True) for nm, d in iterinfo if nm < self.n_max_mutes]
        errs = [(hi - lo) / 2 for lo, hi, _ in lohis]
        weights = [1./(e*e) for e in errs]

        freqs = [float(d['muted']) / d['total'] for nm, d in iterinfo if nm < self.n_max_mutes]
        total = [d['total'] for nm, d in iterinfo if nm < self.n_max_mutes]

        # for i in range(len(freqs)):
        #     print '  %3d / %3d = %6.2f    %6.2f    %6.2f' % (obs[i], total[i], freqs[i], errs[i], weights[i])
    
        n_mutelist = [nm for nm in gcounts['tigger'].keys() if nm < self.n_max_mutes]

        params, cov = numpy.polyfit(n_mutelist, freqs, 1, w=weights, cov=True)
        slope, slope_err = params[0], math.sqrt(cov[0][0])
        y_icpt, y_icpt_err = params[1], math.sqrt(cov[1][1])
    
        interesting = False
        if y_icpt + y_icpt_err < 1./8:
            x_icpt, x_icpt_err = -y_icpt / slope, abs(y_icpt / slope) * math.sqrt((y_icpt_err/y_icpt)**2 + (slope_err/slope)**2)
            mean_x_icpt['sum'] += x_icpt / x_icpt_err
            mean_x_icpt['total'] += 1. / x_icpt_err
        else:
            x_icpt, x_icpt_err = 0, 0
            interesting = True
        print_str = '   %3d   %9.3f +/- %-9.3f   %9.3f +/- %-9.3f   %7.4f +/- %7.4f      %3d / %3d' % (position, x_icpt, x_icpt_err, y_icpt, y_icpt_err, slope, slope_err, sum(obs), sum(total))
        if interesting:
            print_str = utils.color('red', print_str)
        print print_str

        # for testing it's easier to make plots here:
        # plotinfo = {'n_muted' : n_mutelist, 'freqs' : freqs, 'errs' : errs, 'slope' : slope, 'intercept' : y_icpt}
        # plotting.make_tigger_plot('IGHVX', position, plotinfo)
        return plotinfo
Пример #19
0
def frac_err(obs, total):
    """ Return half the distance between the two values given by fraction_uncertainty.err(obs, total). """
    lower, upper = fraction_uncertainty.err(obs, total)
    half_width = 0.5 * (upper - lower)
    return half_width
Пример #20
0
 def set_bin(numer, denom, ibin, label):
     """ Set bin <ibin> of the enclosing <hist> to numer / denom, with the larger of the two distances to the uncertainty bounds as its error. """
     central = float(numer) / denom
     lo_hi = fraction_uncertainty.err(numer, denom, use_beta=True)
     distances = [abs(central - lo_hi[ib]) for ib in (0, 1)]
     hist.set_ibin(ibin, central, error=max(distances), label=label)
Пример #21
0
 def set_bin(numer, denom, ibin, label):
     """ Set bin <ibin> of the enclosing <hist> to numer / denom, with the larger of the two distances to the uncertainty bounds as its error. """
     central = float(numer) / denom
     lo_hi = fraction_uncertainty.err(numer, denom, use_beta=True)
     distances = [abs(central - lo_hi[ib]) for ib in (0, 1)]
     hist.set_ibin(ibin, central, error=max(distances), label=label)
Пример #22
0
    def plot(self, plotdir, only_csv=False):
        utils.prep_dir(plotdir, wildling=None, multilings=['*.csv', '*.svg', '*.root'])
        for column in self.values:
            if self.only_correct_gene_fractions and column not in bool_columns:
                continue
            if column in bool_columns:
                right = self.values[column]['right']
                wrong = self.values[column]['wrong']
                errs = fraction_uncertainty.err(right, right+wrong)
                print '  %s\n    correct up to allele: %4d / %-4d = %4.4f (-%.3f, +%.3f)' % (column, right, right+wrong, float(right) / (right + wrong), errs[0], errs[1])
                hist = plotting.make_bool_hist(right, wrong, self.name + '-' + column)
                plotting.draw_no_root(hist, plotname=column, plotdir=plotdir, write_csv=True, stats='0-bin', only_csv=only_csv)
            else:
                # TODO this is dumb... I should make the integer-valued ones histograms as well
                hist = plotting.make_hist_from_dict_of_counts(self.values[column], 'int', self.name + '-' + column, normalize=True)
                log = ''
                if column.find('hamming_to_true_naive') >= 0:  # TODO why doesn't this just use the config dicts in plotheaders or wherever?
                    hist.title = 'hamming distance'
                else:
                    hist.title = 'inferred - true'
                plotting.draw_no_root(hist, plotname=column, plotdir=plotdir, write_csv=True, log=log, only_csv=only_csv)
        for column in self.hists:
            plotting.draw_no_root(self.hists[column], plotname=column, plotdir=plotdir, write_csv=True, log=log, only_csv=only_csv)

        # per-gene support crap
        for region in utils.regions:
            if self.hists[region + '_allele_right_vs_per_gene_support'].integral(include_overflows=True) == 0:
                continue
            xvals = self.hists[region + '_allele_right_vs_per_gene_support'].get_bin_centers() #ignore_overflows=True)
            right = self.hists[region + '_allele_right_vs_per_gene_support'].bin_contents
            wrong = self.hists[region + '_allele_wrong_vs_per_gene_support'].bin_contents
            yvals = [float(r) / (r + w) if r + w > 0. else 0. for r, w in zip(right, wrong)]

            # remove values corresponding to bins with no entries
            while yvals.count(0.) > 0:
                iv = yvals.index(0.)
                xvals.pop(iv)
                right.pop(iv)
                wrong.pop(iv)
                yvals.pop(iv)

            tmphilos = [fraction_uncertainty.err(r, r + w) for r, w in zip(right, wrong)]
            yerrs = [err[1] - err[0] for err in tmphilos]

            # fitting a line isn't particularly informative, actually
            # params, cov = numpy.polyfit(xvals, yvals, 1, w=[1./(e*e) if e > 0. else 0. for e in yerrs], cov=True)
            # slope, slope_err = params[0], math.sqrt(cov[0][0])
            # y_icpt, y_icpt_err = params[1], math.sqrt(cov[1][1])
            # print '%s  slope: %5.2f +/- %5.2f  y-intercept: %5.2f +/- %5.2f' % (region, slope, slope_err, y_icpt, y_icpt_err)

            # print '%s' % region
            # for iv in range(len(xvals)):
            #     print '   %5.2f     %5.0f / %5.0f  =  %5.2f   +/-  %.3f' % (xvals[iv], right[iv], right[iv] + wrong[iv], yvals[iv], yerrs[iv])

            fig, ax = plotting.mpl_init()

            ax.errorbar(xvals, yvals, yerr=yerrs, markersize=10, linewidth=1, marker='.')
            ax.plot((0, 1), (0, 1), color='black', linestyle='--', linewidth=3)  # line with slope 1 and intercept 0
            # linevals = [slope*x + y_icpt for x in [0] + xvals]  # fitted line
            # ax.plot([0] + xvals, linevals)

            plotting.mpl_finish(ax, plotdir, region + '_allele_fraction_correct_vs_per_gene_support', xlabel='support', ylabel='fraction correct', xbounds=(-0.1, 1.1), ybounds=(-0.1, 1.1))

        if not only_csv:
            plotting.make_html(plotdir)