def test_basic_usage():
    """Check that a Hist can be filled and indexed like a boost-histogram.

    Bins of a one-axis histogram are read back both by plain integer and
    by an {axis index: bin index} mapping; a three-axis histogram is then
    filled positionally.
    """
    # One named regular axis: two entries go to bin 3 and one to bin 4.
    single = Hist(axis.Regular(10, 0, 1, name='x'))
    single.fill([0.35, 0.35, 0.45])

    expected_counts = ((2, 0), (3, 2), (4, 1), (5, 0))
    for bin_index, count in expected_counts:
        assert single[bin_index] == count
    # Same bins again, this time addressed through a dict.
    for bin_index, count in expected_counts:
        assert single[{0: bin_index}] == count

    # Two regular axes and one integer axis, one input array per axis.
    x_axis = axis.Regular(10, 0, 1, name="x")
    y_axis = axis.Regular(10, 0, 1, name="y")
    z_axis = axis.Integer(0, 2, name="z")
    multi = Hist(x_axis, y_axis, z_axis)

    x_values = [0.35, 0.35, 0.35, 0.45, 0.55, 0.55, 0.55]
    y_values = [0.35, 0.35, 0.45, 0.45, 0.45, 0.45, 0.45]
    z_values = [0, 0, 1, 1, 1, 1, 1]
    multi.fill(x_values, y_values, z_values)
def get_mute_hist(self, mtype):
    """ Return the histogram of mean mutation frequencies for <mtype>.

    If self.args.mutate_from_scratch is not set, the histogram is read from
    '<self.parameter_dir>/<mtype>-mean-mute-freqs.csv'.

    Otherwise it is made here from self.args.scratch_mute_freq:
      - self.args.same_mute_freq_for_all_seqs: one bin of half-width utils.eps around that value, filled once
      - else: 500 draws from an exponential with that mean, clipped at 0.8, in 30 bins on [0, 0.8], normalized
    """
    if not self.args.mutate_from_scratch:
        return Hist(fname=self.parameter_dir + '/' + mtype + '-mean-mute-freqs.csv')

    mean_mute_val = self.args.scratch_mute_freq
    if self.args.same_mute_freq_for_all_seqs:
        hist = Hist(1, mean_mute_val - utils.eps, mean_mute_val + utils.eps)
        hist.fill(mean_mute_val)
        return hist

    n_entries = 500
    max_val = 0.8  # this is arbitrary, but you shouldn't be calling this with anything that gets a significant number anywhere near there, anyway
    length_vals = numpy.random.exponential(mean_mute_val, n_entries)
    # Look for draws at or beyond the cap *before* clipping. The previous check counted values exactly equal to
    # <max_val> in the unclipped draws, which essentially never happens for continuous random values, so the warning was dead.
    if any(v >= max_val for v in length_vals):
        print('%s lots of really high mutation rates treegenerator::get_mute_hist()' % utils.color('yellow', 'warning'))

    hist = Hist(30, 0., max_val)
    for val in length_vals:
        hist.fill(min(val, max_val))  # clip, so that everything lands inside the histogram's range
    hist.normalize()
    return hist
def addplot(oindexlist, ofracslist, n_seqs, fname, title):
    """ Histogram the values in <ofracslist>, annotate the plot with <n_seqs> and the fay/wu h value, write it, and register <fname>.

    Reads <self>, <restrict_to_region>, <plotdir> and <fnames> from the enclosing scope.
    """
    frac_hist = Hist(30, 0., 1.)
    for frac in ofracslist:
        frac_hist.fill(frac)

    fig, ax = self.plotting.mpl_init()
    frac_hist.mpl_plot(ax, remove_empty_bins=True)

    # both annotations share a font, and are placed relative to the current top of the y axis
    label_style = {'fontsize': 20, 'fontweight': 'bold'}
    ax.text(0.65, 0.8 * ax.get_ylim()[1], 'size: %d' % n_seqs, **label_style)
    h_value = utils.fay_wu_h(line=None, restrict_to_region=restrict_to_region, occurence_indices=oindexlist, n_seqs=n_seqs)
    ax.text(0.65, 0.7 * ax.get_ylim()[1], 'h: %.2f' % h_value, **label_style)

    # prefix the axis labels with the region, if we're restricted to one
    if restrict_to_region is None:
        regionstr = ''
    else:
        regionstr = restrict_to_region + ' '

    self.plotting.mpl_finish(
        ax,
        plotdir,
        fname,
        title=title,
        xlabel=regionstr + 'mutation frequency',
        ylabel=regionstr + 'density of mutations',
        xticks=[0, 1],  # alternative: [min(occurence_fractions), max(occurence_fractions)]
        log='',
    )
    self.addfname(fnames, fname)
def test_basic_usage():
    """Fill a one-axis Hist and check the per-bin counts by integer index."""
    hist_1d = Hist(axis.Regular(10, 0, 1))
    hist_1d.fill([0.35, 0.35, 0.45])

    # (bin index, expected count): only bins 3 and 4 receive entries.
    for bin_index, count in ((2, 0), (3, 2), (4, 1), (5, 0)):
        assert hist_1d[bin_index] == count
def get_cluster_size_hist(partition, rebin=None):
    """ Return a Hist of the cluster sizes in <partition>.

    The histogram runs from 0.5 to (largest size + 0.5). It has one bin per integer size,
    unless <rebin> is set, in which case the number of bins is divided by <rebin> (truncated to an int).
    """
    cluster_sizes = list(map(len, partition))
    largest = max(cluster_sizes)
    n_bins = largest if rebin is None else int(float(largest) / rebin)

    size_hist = Hist(n_bins, 0.5, largest + 0.5)
    for size in cluster_sizes:
        size_hist.fill(size)
    return size_hist
def test_basic_usage():
    """Check filling and indexing of Hist with one axis, two axes, a bool axis, and loc()."""
    # One regular axis, read back bin by bin.
    one_axis = Hist(axis.Regular(10, 0, 1))
    one_axis.fill([0.35, 0.35, 0.45])
    for bin_index, count in ((2, 0), (3, 2), (4, 1), (5, 0)):
        assert one_axis[bin_index] == count

    # Two regular axes, read back with (x bin, y bin) tuples.
    two_axes = Hist(axis.Regular(10, 0, 1), axis.Regular(10, 0, 1))
    two_axes.fill([0.35, 0.35, 0.45], [0.65, 0.65, 0.85])
    for bin_pair, count in (((3, 6), 2), ((4, 8), 1), ((3, 5), 0)):
        assert two_axes[bin_pair] == count

    # Boolean axis.
    flags = Hist(axis.bool())
    flags.fill([0, 1, 1])
    assert flags[0] == 1
    assert flags[1] == 2
    # There are exactly two bins, so a third index is out of range.
    with pytest.raises(IndexError):
        assert flags[2] == 0
    # Flow bins are disabled: the view is the same with and without flow.
    assert (flags.view() == flags.view(flow=True)).all()

    # loc() indexing must give the same result positionally and via a dict
    # (positional form was broken in 0.6.2, fixed now).
    located = Hist(axis.Regular(10, 0, 1), axis.Regular(10, 0, 1))
    located.fill([0.35, 0.35, 0.45], [0.65, 0.65, 0.85])
    by_position = located[loc(0.35), slice(None)]
    by_dict = located[{0: loc(0.35)}]
    assert (by_position.view() == by_dict.view()).all()
def make_hist():
    """Build a one-axis Hist from the fixture parameter and fill it with random data.

    ``request.param`` (from the enclosing scope) is unpacked as
    ``(axis, fill)``; ``fill`` selects the kind of sample:

    * ``int``  -- 10 draws from ``np.random.randn``
    * ``bool`` -- 10 random booleans
    * ``str``  -- 10 draws from ``("T", "F")``

    Any other value of ``fill`` leaves the histogram empty.

    Returns:
        The (possibly filled) Hist.
    """
    ax, fill = request.param
    h = Hist(ax)
    if fill is int:
        h.fill(np.random.randn(10))
    elif fill is bool:
        # randint's upper bound is exclusive: randint(0, 1) only ever
        # returns 0, which made every filled value False. Draw from {0, 1}.
        h.fill(np.random.randint(0, 2, size=10) == 1)
    elif fill is str:
        h.fill(np.random.choice(("T", "F"), size=10))
    return h
def test_plot1d_auto_handling():
    """
    Test plot() on 2D (regular x category) histograms, with and without
    axis names, by comparing against a reference image generated via
    `pytest --mpl-generate-path=tests/baseline`
    """
    np.random.seed(42)

    named = Hist(
        axis.Regular(10, 0, 10, name="variable", label="variable"),
        axis.StrCategory("", name="dataset", growth=True),
    )
    nameless = Hist(
        axis.Regular(10, 0, 10),
        axis.StrCategory("", growth=True),
    )

    # (category, mean of the normal sample); draws happen in this order so
    # the seeded random stream is consumed deterministically.
    samples = (("A", 3), ("B", 5), ("C", 7))
    for dataset, mean in samples:
        named.fill(dataset=dataset, variable=np.random.normal(mean, 2, 100))
    for dataset, mean in samples:
        nameless.fill(np.random.normal(mean, 2, 1000), dataset)

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    top_row, bottom_row = axes

    assert named.plot(ax=top_row[0])
    assert nameless.plot(ax=bottom_row[0])

    # Discrete axis plotting is not yet implemented, so the second column
    # (overlay='variable' / overlay=1) is left empty for now.

    return fig
def get_cluster_size_hist(partition):
    """ Return a Hist of the cluster sizes in <partition>, with one bin per integer size from 1 up to the largest cluster. """
    cluster_sizes = list(map(len, partition))
    largest = max(cluster_sizes)

    size_hist = Hist(largest, 0.5, largest + 0.5)  # bin edges at half-integers, so each integer size sits mid-bin
    for size in cluster_sizes:
        size_hist.fill(size)
    return size_hist
def test_basic_usage():
    """Check NamedHist construction, keyword-only filling and name-based indexing against a plain Hist."""

    def views_match(left, right):
        # True when both histograms hold identical bin contents.
        return (left.view() == right.view()).all()

    # An axis without a name must be rejected.
    with pytest.raises(KeyError):
        NamedHist(axis.Regular(10, 0, 1, name="x"), axis.Regular(10, 0, 1))

    # NamedHist requires every axis to have a name set.
    named = NamedHist(
        axis.Regular(10, 0, 1, name="x"), axis.Regular(10, 0, 1, name="y")
    )

    # Filling without keywords must be rejected: fill is keyword only,
    # using the axis names.
    with pytest.raises(ValueError):
        named.fill([0.35, 0.35, 0.45], y=[0.65, 0.75, 0.85])
    named.fill(x=[0.35, 0.35, 0.45], y=[5, 10, 14])

    plain = Hist(
        axis.Regular(10, 0, 1, name="x"), axis.Regular(10, 5, 15, name="y")
    )
    plain.fill([0.35, 0.35, 0.45], [0.65, 0.75, 0.85])
    assert views_match(named, plain)

    # Single bins of a one-axis NamedHist, addressed by axis name.
    named_1d = NamedHist(axis.Regular(10, 0, 1, name='x'))
    named_1d.fill(x=[0.35, 0.35, 0.45])
    for bin_index, count in ((2, 0), (3, 2), (4, 1), (5, 0)):
        assert named_1d[{'x': bin_index}] == count

    # Slices: axis position on Hist versus axis name on NamedHist.
    by_position = plain[{0: slice(1, 5, None), 1: slice(None, 5, None)}]
    by_name = named[{'y': slice(None, 5, None), 'x': slice(1, 5, None)}]
    assert views_match(by_position, by_name)

    # A single integer bin.
    by_position = plain[{0: 3}]
    by_name = named[{'x': 3}]
    assert views_match(by_position, by_name)

    # loc(): dict form, positional form and named form must all agree.
    loc_by_dict = plain[{0: loc(0.35)}]
    loc_by_position = plain[loc(0.35), slice(None)]
    loc_by_name = named[{'x': loc(0.35)}]
    assert views_match(loc_by_dict, loc_by_position)
    assert views_match(loc_by_position, loc_by_name)

    # Projecting an axis out with sum.
    by_position = plain[{1: slice(None, None, sum)}]
    by_name = named[{'y': slice(None, None, sum)}]
    assert views_match(by_position, by_name)
biggest_adiffs = sorted(chfo, key=lambda q: chfo[q]['max_abs_diff'], reverse=True) for uid in biggest_adiffs[:5]: print '%-3d %6.3f' % (chfo[uid]['imax'], chfo[uid]['max_abs_diff']) utils.print_reco_event(annotations[uid]) n_above_cutoff = len( [_ for cfo in chfo.values() if cfo['max_abs_diff'] > args.cutoff]) chimeric_fraction = n_above_cutoff / float(len(chfo)) print ' %d / %d = %.3f above chimeric cutoff' % (n_above_cutoff, len(chfo), chimeric_fraction) hmaxval = Hist(45, 0., 0.65) for uid in annotations: hmaxval.fill(chfo[uid]['max_abs_diff']) himax = Hist(75, 0., 400) for uid in annotations: himax.fill(chfo[uid]['imax']) utils.prep_dir(args.plotdir, wildlings=['*.svg', '*.csv']) import matplotlib from matplotlib import pyplot as plt fig, ax = plotting.mpl_init() xvals, yvals = zip(*[(v['imax'], v['max_abs_diff']) for v in chfo.values()]) plt.scatter(xvals, yvals, alpha=0.4) print 'writing to %s' % args.plotdir plotting.mpl_finish(ax, args.plotdir,
def make_single_joyplot(self, sorted_clusters, annotations, repertoire_size, plotdir, plotname, plot_high_mutation=False, cluster_indices=None, title=None, debug=False):
    """ Draw one horizontal strip per cluster, where the line width in each x bin tracks how many of the cluster's sequences have that number of mutations.

    sorted_clusters: list of clusters (each a list of uids). Consecutive clusters of equal size are grouped with itertools.groupby(), so this is presumably sorted by size -- confirm against caller.
    annotations: dict keyed by ':'.join(cluster); each value has an 'n_mutations' list, indexed in parallel with the cluster's uids
    repertoire_size: passed to self.get_repfracstr() to make the y tick labels
    plotdir, plotname, title: passed through to self.plotting.mpl_finish()
    plot_high_mutation: if False, clusters whose median N mutations is above self.n_max_mutations are skipped (and returned); if True they're drawn, with x bounds [self.n_max_mutations, <biggest N mutations seen>]
    cluster_indices: if set (dict keyed like <annotations>), write each cluster's index and size next to its row
    debug: print a summary line for each cluster that gets drawn

    Returns the list of clusters that were skipped for having too many mutations.
    """
    def gety(minval, maxval, xmax, x):  # linear map from [0, xmax] onto [minval, maxval]
        slope = (maxval - minval) / xmax
        return slope * x + minval
    def getnmutelist(cluster):
        return annotations[':'.join(cluster)]['n_mutations']

    colors = ['#006600', '#3399ff', '#ffa500']  # others that have been tried: goldenrod '#daa520', red '#cc0000', dark red '#990012', purple '#a821c7', grey '#808080'

    dpi = 80
    xpixels = 450
    ypixels = max(400, 10 * len(sorted_clusters))
    fig, ax = self.plotting.mpl_init(figsize=(xpixels / dpi, ypixels / dpi))

    min_linewidth = 0.3
    max_linewidth = 12
    base_alpha = 0.55  # for bins with entries, and for the text labels
    empty_bin_alpha = 0.4
    ymin, ymax = 9999, 0
    iclust_global = 0  # index within this plot
    yticks, yticklabels = [], []

    high_mutation_clusters = []
    biggest_n_mutations = None

    if debug:
        print(' %s %d x %d %s' % (plotname, xpixels, ypixels, utils.color('red', 'high mutation') if plot_high_mutation else ''))
        print(' size frac yval median mean')

    for csize, cluster_group in itertools.groupby(sorted_clusters, key=lambda c: len(c)):
        cluster_group = sorted(list(cluster_group), key=lambda c: numpy.median(getnmutelist(c)))
        repfracstr = self.get_repfracstr(csize, repertoire_size)
        for iclust, cluster in enumerate(cluster_group):  # <iclust> is the index within the clusters of this size
            nmutelist = sorted(getnmutelist(cluster))
            nmedian = numpy.median(nmutelist)
            nmean = numpy.mean(nmutelist)  # maybe should use this instead of the median?
            if biggest_n_mutations is None or nmutelist[-1] > biggest_n_mutations:
                biggest_n_mutations = nmutelist[-1]

            if nmedian > self.n_max_mutations and not plot_high_mutation:  # set it aside for the caller (note that this doesn't use up a row, i.e. <iclust_global> isn't incremented)
                high_mutation_clusters.append(cluster)
                continue

            yval = len(sorted_clusters) - iclust_global
            if yval < ymin:
                ymin = yval
            if yval > ymax:
                ymax = yval
            yticks.append(yval)
            yticklabels.append(repfracstr)  # alternative: '%d' % csize

            base_color = colors[iclust_global % len(colors)]
            qti_n_muted = {}
            if self.args.queries_to_include is not None:
                queries_to_include_in_this_cluster = set(cluster) & set(self.args.queries_to_include)
                if len(queries_to_include_in_this_cluster) > 0:
                    unsorted_nmutelist = getnmutelist(cluster)
                    qti_n_muted = {uid : unsorted_nmutelist[cluster.index(uid)] for uid in queries_to_include_in_this_cluster}  # add a red line for each of 'em (i.e. color that hist bin red)
                    # pick where (as a fraction of n_max_mutations) to write the uids, based on where the median sits
                    if plot_high_mutation:
                        xtext = 1.1
                    elif float(nmedian) / self.n_max_mutations < 0.5:
                        xtext = 0.75
                    else:
                        xtext = 0.1
                    ax.text(xtext * self.n_max_mutations, yval, ' '.join(sorted(queries_to_include_in_this_cluster, key=lambda q: qti_n_muted[q])), color='red', fontsize=8)

            if debug:
                print(' %5s %-10s %4.1f %6.1f %6.1f' % ('%d' % csize if iclust == 0 else '', repfracstr if iclust == 0 else '', yval, nmedian, nmean))

            # one bin for each integer number of mutations between the smallest and largest in the cluster
            nbins = nmutelist[-1] - nmutelist[0] + 1
            hist = Hist(nbins, nmutelist[0] - 0.5, nmutelist[-1] + 0.5)
            for nm in nmutelist:
                hist.fill(nm)
            assert hist.overflow_contents() == 0.  # includes underflows
            xmax = max(hist.bin_contents)  # NOTE no relation to <ymax> above
            qti_bins = set(hist.find_bin(nmuted) for nmuted in qti_n_muted.values())  # bins that get colored red
            for ibin in range(1, hist.n_bins + 1):
                linewidth = gety(min_linewidth, max_linewidth, xmax, hist.bin_contents[ibin])
                color = base_color
                alpha = base_alpha  # set for every bin: this used to be set once before all the loops, so the first empty bin's lower alpha stuck for everything drawn after it
                if ibin in qti_bins:
                    color = 'red'
                if hist.bin_contents[ibin] == 0.:
                    color = 'grey'
                    linewidth = min_linewidth
                    alpha = empty_bin_alpha
                ax.plot([hist.low_edges[ibin], hist.low_edges[ibin + 1]], [yval, yval], color=color, linewidth=linewidth, alpha=alpha, solid_capstyle='butt')

            if cluster_indices is not None:
                xtext = nmutelist[-1] if plot_high_mutation else self.n_max_mutations  # NOTE reuse of <xtext> from above
                xwidth = ax.get_xlim()[1] - ax.get_xlim()[0] if plot_high_mutation else self.n_max_mutations
                ax.text(0.05 * xwidth + xtext, yval, str(cluster_indices[':'.join(cluster)]), color=base_color, fontsize=6, alpha=base_alpha, fontdict={'weight' : 'bold'})
                ax.text(0.12 * xwidth + xtext, yval, str(csize), color=base_color, fontsize=6, alpha=base_alpha, fontdict={'weight' : 'bold'})

            iclust_global += 1

    xbounds = [-0.2, self.n_max_mutations] if not plot_high_mutation else [self.n_max_mutations, biggest_n_mutations]
    ybounds = [0.95 * ymin, 1.05 * ymax]
    n_ticks = 5
    if len(yticks) > n_ticks:  # thin out the ticks (the two lists are always the same length, so they share a step)
        tick_step = int(len(yticks) / float(n_ticks - 1))
        yticks = yticks[::tick_step]
        yticklabels = yticklabels[::tick_step]
    self.plotting.mpl_finish(ax, plotdir, plotname, xlabel='N mutations', ylabel='fraction of repertoire', title=title,  # alternative ylabel: 'clonal family size'
                             xbounds=xbounds, ybounds=ybounds, yticks=yticks, yticklabels=yticklabels, adjust={'left' : 0.25})

    return high_mutation_clusters
label=label_gen, name=voi_gen, underflow=True, overflow=True), hist.axis.Variable(voi_v.bins, label=label_rec, name=voi_rec, underflow=True, overflow=True), storage=hist.storage.Weight(), # like ROOT's Sumw2() ) gen_array = arrays[voi_gen] rec_array = arrays[voi_rec] hist2d.fill(gen_array, rec_array, weight=arrays[branch_weight]) outputDir = os.path.join( os.environ.get('CMSSW_BASE'), 'src/UHH2/HighPtSingleTop/output/Analysis/mainsel', year, channel, 'unfolding/migration_matrices/') os.system('mkdir -p ' + outputDir) outfilePath = os.path.join( outputDir, 'migration_matrix__' + args.drds + '__' + voi_k + '__' + gen_level + '__region_' + region + '__' + year + '_' + channel + '.root') with uproot.recreate(outfilePath) as outfile: outfile['migration_matrix'] = hist2d outfile['axis_label_gen'] = label_gen outfile['axis_label_rec'] = label_rec
def gk(uids):
    """ Return the key under which a list of uids is stored in <annotations>: the uids joined by colons. """
    return ':'.join(uids)

glfo = glutils.read_glfo(args.infile.replace('.csv', '-glfo'), locus='igh')

# read the annotations, skipping the ones that failed
annotations = {}
with open(args.infile) as csvfile:
    reader = csv.DictReader(csvfile)
    for line in reader:
        if line['v_gene'] != '':  # an empty v gene means it failed (i.e. couldn't find an annotation)
            utils.process_input_line(line)  # converts strings in the csv file to floats/ints/dicts/etc.
            utils.add_implicit_info(glfo, line)  # add stuff to <line> that's useful, isn't written to the csv since it's redundant
            annotations[gk(line['unique_ids'])] = line

chfo = {uid : utils.get_chimera_max_abs_diff(info, iseq=0) for uid, info in annotations.items()}

def _abs_diff(key):
    # the quantity we sort on and histogram is element 1 of each <chfo> entry
    return chfo[key][1]

# print the ten entries with the largest values
biggest_adiffs = sorted(chfo, key=_abs_diff, reverse=True)
for uid in biggest_adiffs[:10]:
    print(chfo[uid])
    utils.print_reco_event(annotations[uid])

htmp = Hist(45, 0., 0.65)
for uid in annotations:
    htmp.fill(_abs_diff(uid))

utils.prep_dir(args.plotdir, wildlings=['*.svg', '*.csv'])
plotname = 'mfreq-diff'
# same plot twice: once with the default y axis, once with a log y axis
for _suffix, _extra_kwargs in (('', {}), ('-log', {'log' : 'y'})):
    plotting.draw_no_root(htmp, plotdir=args.plotdir, plotname=plotname + _suffix, shift_overflows=True, xtitle='abs mfreq diff', ytitle='seqs', **_extra_kwargs)
print('writing to %s' % args.plotdir)
htmp.write('%s/%s.csv' % (args.plotdir, plotname))
def make_single_size_vs_shm_plot(self, sorted_clusters, annotations, repertoire_size, base_plotdir, plotname, n_max_mutations=100, plot_high_mutation=False, title=None, debug=False):
    """ Draw one horizontal strip per cluster, where the line width in each x bin tracks how many of the cluster's sequences have that number of mutations; y ticks are labelled with cluster size.

    sorted_clusters: list of clusters (each a list of uids). Consecutive clusters of equal size are grouped with itertools.groupby(), so this is presumably sorted by size -- confirm against caller.
    annotations: dict keyed by ':'.join(cluster); each value has an 'n_mutations' list
    repertoire_size: passed to self.get_repfracstr() (only used in the debug printout)
    base_plotdir: the plot is written to <base_plotdir>/overall
    plotname, title: passed through to plotting.mpl_finish()
    n_max_mutations: upper x bound; if <plot_high_mutation> is False, clusters whose median N mutations is above this are skipped (and returned)
    plot_high_mutation: if True, draw the high mutation clusters as well, with x bounds [n_max_mutations, <biggest N mutations seen>]
    debug: print a summary line for each cluster

    Returns the list of clusters that were skipped for having too many mutations.
    """
    import plotting

    def gety(minval, maxval, xmax, x):  # linear map from [0, xmax] onto [minval, maxval]
        slope = (maxval - minval) / xmax
        return slope * x + minval
    def getnmutelist(cluster):
        return annotations[':'.join(cluster)]['n_mutations']

    colors = ['#006600', '#3399ff', '#ffa500']  # others that have been tried: goldenrod '#daa520', red '#cc0000', dark red '#990012', purple '#a821c7', grey '#808080'

    dpi = 80
    xpixels = 450
    ypixels = max(400, 10 * len(sorted_clusters))
    fig, ax = plotting.mpl_init(figsize=(xpixels / dpi, ypixels / dpi))

    min_linewidth = 0.3
    max_linewidth = 12
    base_alpha = 0.55  # for bins with entries
    empty_bin_alpha = 0.4
    ymin, ymax = 9999, 0
    iclust_global = 0  # index within this plot
    yticks, yticklabels = [], []

    high_mutation_clusters = []
    biggest_n_mutations = None

    if debug:
        print(' %s %d x %d' % (plotname, xpixels, ypixels))
        print(' size frac yval median mean')

    for csize, cluster_group in itertools.groupby(sorted_clusters, key=lambda c: len(c)):
        cluster_group = sorted(list(cluster_group), key=lambda c: numpy.median(getnmutelist(c)))
        repfracstr = self.get_repfracstr(csize, repertoire_size)
        for iclust, cluster in enumerate(cluster_group):  # <iclust> is the index within the clusters of this size
            nmutelist = sorted(getnmutelist(cluster))
            nmedian = numpy.median(nmutelist)
            nmean = numpy.mean(nmutelist)  # maybe should use this instead of the median?
            if biggest_n_mutations is None or nmutelist[-1] > biggest_n_mutations:
                biggest_n_mutations = nmutelist[-1]

            yval = len(sorted_clusters) - iclust_global
            if yval < ymin:
                ymin = yval
            if yval > ymax:
                ymax = yval
            yticks.append(yval)
            yticklabels.append('%d' % csize)  # alternative: repfracstr

            base_color = colors[iclust_global % len(colors)]
            if self.args.queries_to_include is not None:
                queries_to_include_in_this_cluster = set(cluster) & set(self.args.queries_to_include)
                if len(queries_to_include_in_this_cluster) > 0:
                    base_color = 'red'
                    # pick where (as a fraction of n_max_mutations) to write the uids, based on where the median sits
                    if plot_high_mutation:
                        xtext = 1.1
                    elif float(nmedian) / n_max_mutations < 0.5:
                        xtext = 0.75
                    else:
                        xtext = 0.1
                    ax.text(xtext * n_max_mutations, yval, ' '.join(queries_to_include_in_this_cluster), color='red', fontsize=8)

            if debug:
                dbgstr = ' %5s %-10s %4.1f %6.1f %6.1f' % ('%d' % csize if iclust == 0 else '', repfracstr if iclust == 0 else '', yval, nmedian, nmean)

            if nmedian > n_max_mutations and not plot_high_mutation:  # set it aside for the caller (note that its tick has already been added above, but <iclust_global> isn't incremented)
                if debug:
                    print('%s %s' % (dbgstr, utils.color('red', 'high mutation')))
                high_mutation_clusters.append(cluster)
                continue

            if debug:
                print('%s ' % dbgstr)  # trailing space is so the output's the same as from the old two-part print

            # one bin for each integer number of mutations between the smallest and largest in the cluster
            nbins = nmutelist[-1] - nmutelist[0] + 1
            hist = Hist(nbins, nmutelist[0] - 0.5, nmutelist[-1] + 0.5)
            for nm in nmutelist:
                hist.fill(nm)
            assert hist.overflow_contents() == 0.  # includes underflows
            xmax = max(hist.bin_contents)  # alternative: float(csize)
            for ibin in range(1, hist.n_bins + 1):
                linewidth = gety(min_linewidth, max_linewidth, xmax, hist.bin_contents[ibin])
                color = base_color
                alpha = base_alpha  # set for every bin: this used to be set once before all the loops, so the first empty bin's lower alpha stuck for everything drawn after it
                if hist.bin_contents[ibin] == 0.:
                    color = 'grey'
                    linewidth = min_linewidth
                    alpha = empty_bin_alpha
                ax.plot([hist.low_edges[ibin], hist.low_edges[ibin + 1]], [yval, yval], color=color, linewidth=linewidth, alpha=alpha, solid_capstyle='butt')

            iclust_global += 1

    xbounds = [-0.2, n_max_mutations] if not plot_high_mutation else [n_max_mutations, biggest_n_mutations]
    ybounds = [0.95 * ymin, 1.05 * ymax]
    n_ticks = 5
    if len(yticks) > n_ticks:  # thin out the ticks (the two lists are always the same length, so they share a step)
        tick_step = int(len(yticks) / float(n_ticks - 1))
        yticks = yticks[::tick_step]
        yticklabels = yticklabels[::tick_step]
    plotting.mpl_finish(ax, base_plotdir + '/overall', plotname, xlabel='N mutations', ylabel='clonal family size', title=title,
                        xbounds=xbounds, ybounds=ybounds, yticks=yticks, yticklabels=yticklabels, adjust={'left' : 0.18})

    return high_mutation_clusters