def make_hist_from_dict_of_counts(values, var_type, hist_label, log='', xmin_force=0.0, xmax_force=0.0, normalize=False, sort=False):
    """
    Fill a histogram with one bin per key of <values>, then return it converted by make_hist_from_my_hist_class().

    values: dict from bin key to count (each bin's error is set to the square root of its count)
    var_type: 'int' or 'string' (anything else trips the assertion)
    hist_label: used in the warning/exception messages, and passed to the conversion function
    log: "'x' in log" is passed as the third argument to set_bins() (only used for strings without forced bounds)
    xmin_force, xmax_force: if these differ they're used as the histogram bounds, otherwise bounds are worked out from the keys
    normalize: if set, call hist.normalize() and use 'freq' rather than 'counts' for the y title
    sort: if set, string bins are in sorted-key order; otherwise string bins are in order of decreasing count

    Returns a one-bin TH1D if <values> is empty. Raises Exception if the first or last entry of hist.bin_contents is nonzero.
    """
    assert var_type in ('int', 'string')  # floats should be handled by Hist class in hist.py

    if len(values) == 0:
        print('WARNING no values for %s in make_hist' % hist_label)
        return TH1D(hist_label, '', 1, 0, 1)

    is_string = var_type == 'string'
    if is_string and not sort:  # for strings, sort so most common value is to left side
        bin_labels = sorted(values, key=values.get, reverse=True)
    else:
        bin_labels = sorted(values)

    # strings get one bin per key, whereas integers get one bin per integer between the smallest and largest key
    n_bins = len(values) if is_string else bin_labels[-1] - bin_labels[0] + 1
    xbins = [0. for _ in range(n_bins + 1)]  # NOTE the +1 is 'cause you need the lower edge of the overflow bin

    if xmin_force != xmax_force:  # boundaries were set explicitly
        hist = Hist(n_bins, xmin_force, xmax_force)
    elif is_string:  # otherwise work out what they should be
        set_bins(bin_labels, n_bins, 'x' in log, xbins, var_type)  # return value isn't used, so this is expected to fill <xbins> in place
        hist = Hist(n_bins, xbins[0], xbins[-1], xbins=xbins)
    else:  # for integers, just go from the first to the last bin label (they're sorted)
        hist = Hist(n_bins, bin_labels[0] - 0.5, bin_labels[-1] + 0.5)

    for ilabel, key in enumerate(bin_labels):
        count = values[key]
        if is_string:  # string bins are filled in label order (index zero is the underflow bin) and labeled with their key
            ibin, label = ilabel + 1, key
        else:  # integer bins are looked up by value and left unlabeled
            ibin, label = hist.find_bin(key), ''
        hist.set_ibin(ibin, count, error=math.sqrt(count), label=label)

    # make sure there's no overflows
    underflow, overflow = hist.bin_contents[0], hist.bin_contents[-1]
    if underflow != 0.0 or overflow != 0.0:
        for ibin in range(hist.n_bins + 2):
            print('%d %f %f' % (ibin, hist.low_edges[ibin], hist.bin_contents[ibin]))
        raise Exception('overflows in ' + hist_label)

    if normalize:
        hist.normalize()
    hist.ytitle = 'freq' if normalize else 'counts'

    return make_hist_from_my_hist_class(hist, hist_label)
def make_hist_from_dict_of_counts(values, var_type, hist_label, log='', xmin_force=0.0, xmax_force=0.0, normalize=False, sort=False):
    """
    Fill (and return) a Hist with one bin per key of <values>.

    values: dict from bin key to count (each bin's error is set to the square root of its count)
    var_type: 'int' or 'string' (anything else trips the assertion)
    hist_label: only used in the warning/exception messages
    log: "'x' in log" is passed as the third argument to set_bins() (only used for strings without forced bounds)
    xmin_force, xmax_force: if these differ they're used as the histogram bounds, otherwise bounds are worked out from the keys
    normalize: if set, call hist.normalize() and use 'freq' rather than 'counts' for the y title
    sort: if set, string bins are in sorted-key order; otherwise string bins are in order of decreasing count

    Returns Hist(1, 0, 1) if <values> is empty. Raises Exception if the first or last entry of hist.bin_contents is nonzero.
    """
    assert var_type in ('int', 'string')  # floats should be handled by Hist class in hist.py

    if len(values) == 0:
        print('WARNING no values for %s in make_hist' % hist_label)
        return Hist(1, 0, 1)

    strings = var_type == 'string'
    order_by_count = strings and not sort  # for strings, sort so most common value is to left side
    if order_by_count:
        bin_labels = sorted(values, key=values.get, reverse=True)
    else:
        bin_labels = sorted(values)

    if strings:  # one bin per key
        n_bins = len(values)
    else:  # one bin per integer from the smallest to the largest key
        n_bins = bin_labels[-1] - bin_labels[0] + 1
    xbins = [0. for _ in range(n_bins + 1)]  # NOTE the +1 is 'cause you need the lower edge of the overflow bin

    bounds_forced = xmin_force != xmax_force
    if bounds_forced:
        hist = Hist(n_bins, xmin_force, xmax_force)
    elif strings:  # if boundaries aren't set explicitly, work out what they should be
        set_bins(bin_labels, n_bins, 'x' in log, xbins, var_type)  # return value isn't used, so this is expected to fill <xbins> in place
        hist = Hist(n_bins, xbins[0], xbins[-1], xbins=xbins)
    else:  # for integers, just go from the first to the last bin label (they're sorted)
        hist = Hist(n_bins, bin_labels[0] - 0.5, bin_labels[-1] + 0.5)

    for position, key in enumerate(bin_labels):
        content = values[key]
        if strings:  # bin zero is the underflow, so the first label goes in bin one
            hist.set_ibin(position + 1, content, error=math.sqrt(content), label=key)
        else:
            hist.set_ibin(hist.find_bin(key), content, error=math.sqrt(content), label='')

    # make sure there's no overflows
    if hist.bin_contents[0] != 0.0 or hist.bin_contents[-1] != 0.0:
        for ibin in range(hist.n_bins + 2):
            print('%d %f %f' % (ibin, hist.low_edges[ibin], hist.bin_contents[ibin]))
        raise Exception('overflows in ' + hist_label)

    if normalize:
        hist.normalize()
    hist.ytitle = 'freq' if normalize else 'counts'

    return hist
def plot(self, plotdir, only_csv=False, only_overall=False):
    """
    Draw the per-gene and overall mutation frequency plots under <plotdir> (finalizing first if that hasn't happened yet).

    plotdir: base directory; per-gene plots go in per-gene/<region> and per-gene-per-position/<region>, overall ones in overall/
    only_csv: passed on to plotting.draw_no_root(); if set, the html-writing step at the end is skipped
    only_overall: if set, skip all the per-gene plots
    """
    if not self.finalized:
        self.finalize()

    overall_plotdir = plotdir + '/overall'
    width_factors = {'v': 3.5, 'j': 2}  # per-position plots for these regions are drawn wider

    genes_to_plot = [] if only_overall else self.freqs
    for gene in genes_to_plot:
        freqs = self.freqs[gene]
        if len(freqs) == 0:  # nothing observed for this gene (don't bother warning about dummy d genes)
            if gene not in glutils.dummy_d_genes.values():
                print(' %s no mutefreqer obs for %s' % (utils.color('red', 'warning'), utils.color_gene(gene)))
            continue

        region = utils.get_region(gene)
        safe_name = utils.sanitize_name(gene)

        # one bin per position, centered on integer values
        positions = sorted(freqs)
        first_pos, last_pos = positions[0], positions[-1]
        genehist = Hist(last_pos - first_pos + 1, first_pos - 0.5, last_pos + 0.5, xtitle='position', ytitle='mut freq', title=gene)
        for pos in positions:
            info = freqs[pos]
            hi_diff = abs(info['freq'] - info['freq_hi_err'])
            lo_diff = abs(info['freq'] - info['freq_lo_err'])
            genehist.set_ibin(genehist.find_bin(pos), info['freq'], error=0.5*(hi_diff + lo_diff))  # symmetrize the error, i.e. use the mean of the upper and lower differences

        # vertical line at the conserved codon position, if this region has one
        xline = None
        codons = utils.conserved_codons[self.glfo['chain']]
        if region in codons:
            xline = self.glfo[codons[region] + '-positions'][gene]
        figsize = [7 * width_factors.get(region, 1), 4]

        plotting.draw_no_root(self.per_gene_mean_rates[gene], plotdir=plotdir + '/per-gene/' + region, plotname=safe_name, errors=True, write_csv=True, only_csv=only_csv, shift_overflows=True)
        # per-position plots:
        plotting.draw_no_root(genehist, plotdir=plotdir + '/per-gene-per-position/' + region, plotname=safe_name, errors=True, write_csv=True, xline=xline, figsize=figsize, only_csv=only_csv, shift_overflows=True)
        # # per-position, per-base plots:
        # paramutils.make_mutefreq_plot(plotdir + '/' + utils.get_region(gene) + '-per-base', utils.sanitize_name(gene), plotting_info)  # needs translation to mpl UPDATE fcn is fixed, but I can't be bothered uncommenting this at the moment

    # make mean mute freq hists
    for rstr in ['all', 'cdr3'] + utils.regions:
        bounds = (0.0, 0.6) if rstr == 'd' else (0.0, 0.4)  # only d gets the wider range
        plotting.draw_no_root(self.mean_rates[rstr], plotname=rstr+'_mean-freq', plotdir=overall_plotdir, stats='mean', bounds=bounds, write_csv=True, only_csv=only_csv, shift_overflows=True)
        plotting.draw_no_root(self.mean_n_muted[rstr], plotname=rstr+'_mean-n-muted', plotdir=overall_plotdir, stats='mean', write_csv=True, only_csv=only_csv, shift_overflows=True)

    if only_csv:
        return

    # write html file and fix permissions
    for substr in self.subplotdirs:
        plotting.make_html(plotdir + '/' + substr)
def plot(self, base_plotdir, cyst_positions=None, tryp_positions=None, only_csv=False):
    """
    Draw per-gene, per-position mutation frequency plots and the mean frequency plots under <base_plotdir>/mute-freqs (finalizing first if that hasn't happened yet).

    base_plotdir: parent directory; everything is written below its mute-freqs/ subdirectory
    cyst_positions: if set, cyst_positions[gene] is drawn as a vertical line on v gene plots
    tryp_positions: if set, tryp_positions[gene] is drawn as a vertical line on j gene plots
    only_csv: passed on to plotting.draw_no_root() and self.tigger_plot(); if set, the html-writing step at the end is skipped

    Genes with no entries in self.freqs are skipped with a warning.
    """
    if not self.finalized:
        self.finalize()

    plotdir = base_plotdir + '/mute-freqs'
    overall_plotdir = plotdir + '/overall'
    utils.prep_dir(overall_plotdir, multilings=('*.csv', '*.svg'))
    for region in utils.regions:
        utils.prep_dir(plotdir + '/' + region, multilings=('*.csv', '*.svg'))
        # utils.prep_dir(plotdir + '/' + region + '-per-base/plots', multilings=('*.csv', '*.png'))
    if self.tigger:
        utils.prep_dir(plotdir + '/tigger', multilings=('*.csv', '*.svg'))

    for gene in self.freqs:
        freqs = self.freqs[gene]
        if len(freqs) == 0:  # nothing to plot, and taking the first/last position below would raise IndexError
            print('  warning: no mutation freqs for %s, so not making its per-position plot' % gene)
            continue

        region = utils.get_region(gene)

        # one bin per position, centered on integer values
        sorted_positions = sorted(freqs)
        first_pos, last_pos = sorted_positions[0], sorted_positions[-1]
        genehist = Hist(last_pos - first_pos + 1, first_pos - 0.5, last_pos + 0.5, xtitle='fixme', ytitle='fixme')  #, title=utils.sanitize_name(gene))
        for position in sorted_positions:
            info = freqs[position]
            hi_diff = abs(info['freq'] - info['freq_hi_err'])
            lo_diff = abs(info['freq'] - info['freq_lo_err'])
            err = 0.5*(hi_diff + lo_diff)  # symmetrize the error, i.e. use the mean of the upper and lower differences
            genehist.set_ibin(genehist.find_bin(position), info['freq'], error=err)

        # v and j plots get a vertical line and a wider figure, but only if the corresponding positions were passed in
        xline = None
        figsize = [3, 3]
        if region == 'v' and cyst_positions is not None:
            xline = cyst_positions[gene]
            figsize[0] *= 3.5
        elif region == 'j' and tryp_positions is not None:
            xline = tryp_positions[gene]
            figsize[0] *= 2
        plotting.draw_no_root(genehist, plotdir=plotdir + '/' + region, plotname=utils.sanitize_name(gene), errors=True, write_csv=True, xline=xline, figsize=figsize, only_csv=only_csv)
        # paramutils.make_mutefreq_plot(plotdir + '/' + utils.get_region(gene) + '-per-base', utils.sanitize_name(gene), plotting_info)  # needs translation to mpl

    # make mean mute freq hists
    plotting.draw_no_root(self.mean_rates['all'], plotname='all-mean-freq', plotdir=overall_plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True, only_csv=only_csv)
    for region in utils.regions:
        plotting.draw_no_root(self.mean_rates[region], plotname=region+'-mean-freq', plotdir=overall_plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True, only_csv=only_csv)

    if self.tigger:
        self.tigger_plot(only_csv)

    if not only_csv:  # write html file and fix permissions
        plotting.make_html(overall_plotdir)
        for region in utils.regions:
            plotting.make_html(plotdir + '/' + region, n_columns=1)
def plot(self, base_plotdir, cyst_positions=None, tryp_positions=None, only_csv=False):
    """
    Draw per-gene, per-position mutation frequency plots and the mean frequency plots under <base_plotdir>/mute-freqs (finalizing first if that hasn't happened yet).

    base_plotdir: parent directory; everything is written below its mute-freqs/ subdirectory
    cyst_positions: if set, cyst_positions[gene]['cysteine-position'] is drawn as a vertical line on v gene plots
    tryp_positions: if set, int(tryp_positions[gene]) is drawn as a vertical line on j gene plots
    only_csv: passed on to plotting.draw_no_root(); if set, the html and permission steps at the end are skipped

    Genes with no entries in self.counts are skipped with a warning.
    NOTE the html and permission scripts are called with relative paths (./bin/...), so they're resolved against the current working directory.
    """
    if not self.finalized:
        self.finalize()

    plotdir = base_plotdir + '/mute-freqs'
    utils.prep_dir(plotdir + '/plots', multilings=('*.csv', '*.svg'))
    for region in utils.regions:
        utils.prep_dir(plotdir + '/' + region + '/plots', multilings=('*.csv', '*.svg'))
        # utils.prep_dir(plotdir + '/' + region + '-per-base/plots', multilings=('*.csv', '*.png'))

    for gene in self.counts:
        counts = self.counts[gene]  # NOTE self.plotting_info[gene] is only needed by the commented per-base plot below, so it isn't looked up here
        if len(counts) == 0:  # nothing to plot, and taking the first/last position below would raise IndexError
            print('  warning: no mutation freqs for %s, so not making its per-position plot' % gene)
            continue

        region = utils.get_region(gene)

        # one bin per position, centered on integer values
        sorted_positions = sorted(counts)
        first_pos, last_pos = sorted_positions[0], sorted_positions[-1]
        genehist = Hist(last_pos - first_pos + 1, first_pos - 0.5, last_pos + 0.5, xtitle='fixme', ytitle='fixme')  #, title=utils.sanitize_name(gene))
        for position in sorted_positions:
            info = counts[position]
            hi_diff = abs(info['freq'] - info['freq_hi_err'])
            lo_diff = abs(info['freq'] - info['freq_lo_err'])
            err = 0.5*(hi_diff + lo_diff)  # symmetrize the error, i.e. use the mean of the upper and lower differences
            genehist.set_ibin(genehist.find_bin(position), info['freq'], error=err)

        # v and j plots get a vertical line and a wider figure, but only if the corresponding positions were passed in
        xline = None
        figsize = [3, 3]
        if region == 'v' and cyst_positions is not None:
            xline = cyst_positions[gene]['cysteine-position']
            figsize[0] *= 3.5
        elif region == 'j' and tryp_positions is not None:
            xline = int(tryp_positions[gene])
            figsize[0] *= 2
        plotting.draw_no_root(genehist, plotdir=plotdir + '/' + region, plotname=utils.sanitize_name(gene), errors=True, write_csv=True, xline=xline, figsize=figsize, only_csv=only_csv)
        # paramutils.make_mutefreq_plot(plotdir + '/' + utils.get_region(gene) + '-per-base', utils.sanitize_name(gene), plotting_info)  # needs translation to mpl

    # make mean mute freq hists
    plotting.draw_no_root(self.mean_rates['all'], plotname='all-mean-freq', plotdir=plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True, only_csv=only_csv)
    for region in utils.regions:
        plotting.draw_no_root(self.mean_rates[region], plotname=region+'-mean-freq', plotdir=plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True, only_csv=only_csv)

    if not only_csv:  # write html file and fix permissions
        check_call(['./bin/makeHtml', plotdir, '3', 'null', 'svg'])
        for region in utils.regions:
            check_call(['./bin/makeHtml', plotdir + '/' + region, '1', 'null', 'svg'])
            # check_call(['./bin/makeHtml', plotdir + '/' + region + '-per-base', '1', 'null', 'png'])
        check_call(['./bin/permissify-www', plotdir])  # NOTE this should really permissify starting a few directories higher up
def plot(self, plotdir, only_csv=False, only_overall=False):
    """
    Draw the per-gene and overall mutation frequency plots under <plotdir> (finalizing first if that hasn't happened yet).

    plotdir: base directory; per-gene plots go in per-gene/<region> and per-gene-per-position/<region>, overall ones in overall/
    only_csv: passed on to plotting.draw_no_root(); if set, the html-writing step at the end is skipped
    only_overall: if set, skip all the per-gene plots
    """
    import plotting  # imported here rather than at the top of the file

    if not self.finalized:
        self.finalize()

    overall_plotdir = plotdir + '/overall'
    width_factors = {'v': 3.5, 'j': 2}  # per-position plots for these regions are drawn wider

    genes_to_plot = [] if only_overall else self.freqs
    for gene in genes_to_plot:
        freqs = self.freqs[gene]
        if len(freqs) == 0:  # nothing observed for this gene (don't bother warning about dummy d genes)
            if gene not in glutils.dummy_d_genes.values():
                print(' %s no mutefreqer obs for %s' % (utils.color('red', 'warning'), utils.color_gene(gene)))
            continue

        region = utils.get_region(gene)
        safe_name = utils.sanitize_name(gene)

        # one bin per position, centered on integer values
        positions = sorted(freqs)
        first_pos, last_pos = positions[0], positions[-1]
        genehist = Hist(last_pos - first_pos + 1, first_pos - 0.5, last_pos + 0.5, xtitle='position', ytitle='mut freq', title=gene)
        for pos in positions:
            info = freqs[pos]
            hi_diff = abs(info['freq'] - info['freq_hi_err'])
            lo_diff = abs(info['freq'] - info['freq_lo_err'])
            genehist.set_ibin(genehist.find_bin(pos), info['freq'], error=0.5 * (hi_diff + lo_diff))  # symmetrize the error, i.e. use the mean of the upper and lower differences

        # vertical line at the conserved codon position, if this region has one
        xline = None
        if region in utils.conserved_codons[self.glfo['locus']]:
            xline = utils.cdn_pos(self.glfo, region, gene)
        figsize = [7 * width_factors.get(region, 1), 4]

        plotting.draw_no_root(self.per_gene_mean_rates[gene], plotdir=plotdir + '/per-gene/' + region, plotname=safe_name, errors=True, write_csv=True, only_csv=only_csv, shift_overflows=True)
        # per-position plots:
        plotting.draw_no_root(genehist, plotdir=plotdir + '/per-gene-per-position/' + region, plotname=safe_name, errors=True, write_csv=True, xline=xline, figsize=figsize, only_csv=only_csv, shift_overflows=True)
        # # per-position, per-base plots:
        # paramutils.make_mutefreq_plot(plotdir + '/' + utils.get_region(gene) + '-per-base', utils.sanitize_name(gene), plotting_info)  # needs translation to mpl UPDATE fcn is fixed, but I can't be bothered uncommenting this at the moment

    # make mean mute freq hists
    for rstr in ['all', 'cdr3'] + utils.regions:
        bounds = (0.0, 0.6) if rstr == 'd' else (0.0, 0.4)  # only d gets the wider range
        plotting.draw_no_root(self.mean_rates[rstr], plotname=rstr + '_mean-freq', plotdir=overall_plotdir, stats='mean', bounds=bounds, write_csv=True, only_csv=only_csv, shift_overflows=True)
        plotting.draw_no_root(self.mean_n_muted[rstr], plotname=rstr + '_mean-n-muted', plotdir=overall_plotdir, stats='mean', write_csv=True, only_csv=only_csv, shift_overflows=True)

    if only_csv:
        return

    # write html file and fix permissions
    for substr in self.subplotdirs:
        plotting.make_html(plotdir + '/' + substr)
def make_single_joyplot(self, sorted_clusters, annotations, repertoire_size, plotdir, plotname, plot_high_mutation=False, cluster_indices=None, title=None, debug=False):
    """
    Draw one horizontal line per cluster, split into one segment per number of mutations, where a segment's thickness scales with how many of the cluster's sequences have that many mutations.

    sorted_clusters: list of clusters (each a list of uids); they're grouped by size with itertools.groupby(), so clusters of equal size need to be adjacent  NOTE(review): presumably sorted by decreasing size -- confirm against caller
    annotations: dict keyed by ':'.join(cluster), where each value has an 'n_mutations' list  (presumably in the same order as the cluster's uids, since it's indexed with cluster.index(uid))
    repertoire_size: passed to self.get_repfracstr() to make the y tick labels
    plotdir, plotname, title: passed to self.plotting.mpl_finish()
    plot_high_mutation: if set, clusters with median above self.n_max_mutations are drawn rather than skipped, and the x axis runs from self.n_max_mutations up to the largest observed value
    cluster_indices: if set, dict keyed by ':'.join(cluster) whose values are written (along with the cluster size) next to each line
    debug: print a line of info for each cluster

    Returns the list of clusters that were skipped because their median was above self.n_max_mutations.
    """
    def gety(minval, maxval, xmax, x):  # linear interpolation: <minval> at x = 0, <maxval> at x = <xmax>
        slope = (maxval - minval) / xmax
        return slope * x + minval
    def getnmutelist(cluster):
        return annotations[':'.join(cluster)]['n_mutations']

    colors = ['#006600', '#3399ff', '#ffa500']
    # goldenrod '#daa520'
    # red '#cc0000',
    # dark red '#990012'
    # purple '#a821c7'
    # grey '#808080'

    dpi = 80
    xpixels = 450
    ypixels = max(400, 10 * len(sorted_clusters))
    _, ax = self.plotting.mpl_init(figsize=(xpixels / dpi, ypixels / dpi))

    min_linewidth = 0.3
    max_linewidth = 12
    # min_alpha = 0.1
    # max_alpha = 1.
    # linewidth = 7
    alpha = 0.55  # default for non-empty bins and for the text labels
    empty_bin_alpha = 0.4

    ymin, ymax = 9999, 0
    iclust_global = 0  # index within this plot
    yticks, yticklabels = [], []

    high_mutation_clusters = []
    biggest_n_mutations = None

    if debug:
        print(' %s %d x %d %s' % (plotname, xpixels, ypixels, utils.color('red', 'high mutation') if plot_high_mutation else ''))
        print(' size frac yval median mean')

    for csize, cluster_group in itertools.groupby(sorted_clusters, key=len):
        cluster_group = sorted(cluster_group, key=lambda c: numpy.median(getnmutelist(c)))  # within each size, order by median number of mutations
        repfracstr = self.get_repfracstr(csize, repertoire_size)
        for iclust, cluster in enumerate(cluster_group):  # index within the clusters of this size
            nmutelist = sorted(getnmutelist(cluster))
            nmedian = numpy.median(nmutelist)
            nmean = numpy.mean(nmutelist)  # maybe should use this instead of the median?
            if biggest_n_mutations is None or nmutelist[-1] > biggest_n_mutations:
                biggest_n_mutations = nmutelist[-1]

            if nmedian > self.n_max_mutations and not plot_high_mutation:  # skipped clusters get handed back to the caller (and don't use up a row in this plot)
                high_mutation_clusters.append(cluster)
                continue

            yval = len(sorted_clusters) - iclust_global
            if yval < ymin:
                ymin = yval
            if yval > ymax:
                ymax = yval
            yticks.append(yval)
            # yticklabels.append('%d' % csize)
            yticklabels.append(repfracstr)

            base_color = colors[iclust_global % len(colors)]
            qti_n_muted = {}
            if self.args.queries_to_include is not None:
                queries_to_include_in_this_cluster = set(cluster) & set(self.args.queries_to_include)
                if len(queries_to_include_in_this_cluster) > 0:
                    unsorted_nmutelist = getnmutelist(cluster)
                    qti_n_muted = {uid: unsorted_nmutelist[cluster.index(uid)] for uid in queries_to_include_in_this_cluster}  # add a red line for each of 'em (i.e. color that hist bin red)
                    if plot_high_mutation:
                        xtext = 1.1
                    elif float(nmedian) / self.n_max_mutations < 0.5:
                        xtext = 0.75
                    else:
                        xtext = 0.1
                    ax.text(xtext * self.n_max_mutations, yval, ' '.join(sorted(queries_to_include_in_this_cluster, key=lambda q: qti_n_muted[q])), color='red', fontsize=8)

            if debug:
                print(' %5s %-10s %4.1f %6.1f %6.1f' % ('%d' % csize if iclust == 0 else '', repfracstr if iclust == 0 else '', yval, nmedian, nmean))

            # one bin per integer number of mutations, from this cluster's smallest to its largest
            nbins = nmutelist[-1] - nmutelist[0] + 1
            hist = Hist(nbins, nmutelist[0] - 0.5, nmutelist[-1] + 0.5)
            for nm in nmutelist:
                hist.fill(nm)
            assert hist.overflow_contents() == 0.  # includes underflows
            xmax = max(hist.bin_contents)  # NOTE no relation to <ymax> above
            qti_bins = set(hist.find_bin(nmuted) for nmuted in qti_n_muted.values())  # bins that get colored red (worked out once, rather than again for every bin)
            for ibin in range(1, hist.n_bins + 1):
                linewidth = gety(min_linewidth, max_linewidth, xmax, hist.bin_contents[ibin])
                color = 'red' if ibin in qti_bins else base_color
                bin_alpha = alpha  # reset for every bin, so the dimmer value for empty bins doesn't stick around for later bins, clusters, and text
                # alpha = gety(min_alpha, max_alpha, xmax, hist.bin_contents[ibin])
                if hist.bin_contents[ibin] == 0.:  # empty bins are drawn as a thin, dim, grey line
                    color = 'grey'
                    linewidth = min_linewidth
                    bin_alpha = empty_bin_alpha
                ax.plot([hist.low_edges[ibin], hist.low_edges[ibin + 1]], [yval, yval], color=color, linewidth=linewidth, alpha=bin_alpha, solid_capstyle='butt')

            if cluster_indices is not None:
                xtext = nmutelist[-1] if plot_high_mutation else self.n_max_mutations  # NOTE reuse of <xtext> (arg)
                xwidth = ax.get_xlim()[1] - ax.get_xlim()[0] if plot_high_mutation else self.n_max_mutations
                ax.text(0.05 * xwidth + xtext, yval, str(cluster_indices[':'.join(cluster)]), color=base_color, fontsize=6, alpha=alpha, fontdict={'weight': 'bold'})
                ax.text(0.12 * xwidth + xtext, yval, str(csize), color=base_color, fontsize=6, alpha=alpha, fontdict={'weight': 'bold'})

            iclust_global += 1

    xbounds = [-0.2, self.n_max_mutations] if not plot_high_mutation else [self.n_max_mutations, biggest_n_mutations]
    ybounds = [0.95 * ymin, 1.05 * ymax]
    n_ticks = 5
    if len(yticks) > n_ticks:  # thin out the ticks and labels (with the same step for both, so they stay in sync)
        yticks = [yticks[i] for i in range(0, len(yticks), int(len(yticks) / float(n_ticks - 1)))]
        yticklabels = [yticklabels[i] for i in range(0, len(yticklabels), int(len(yticklabels) / float(n_ticks - 1)))]
    self.plotting.mpl_finish(ax, plotdir, plotname, xlabel='N mutations', ylabel='fraction of repertoire', title=title,  # ylabel = 'clonal family size'
                             xbounds=xbounds, ybounds=ybounds, yticks=yticks, yticklabels=yticklabels, adjust={'left': 0.25})

    return high_mutation_clusters
def make_hist_from_dict_of_counts(values, var_type, hist_label, is_log_x=False, xmin_force=0.0, xmax_force=0.0, sort_by_counts=False, default_n_bins=30):  # default_n_bins is only used if is_log_x set we're doing auto log bins
    """
    Fill (and return) a Hist with values from a dictionary (each key will correspond to one bin).

    values: dict from bin key to count (each bin's error is set to the square root of its count)
    var_type: 'int' or 'string' (anything else trips the assertion)
    hist_label: only used in the warning/exception messages
    is_log_x: passed to set_bins(); for integers it also switches to <default_n_bins> automatically-chosen bins
    xmin_force, xmax_force: if these differ they're used as the histogram bounds, otherwise bounds are worked out from the keys
    sort_by_counts: put string bins in order of decreasing count, rather than in sorted-key order. Ignored for integers, whose
        bins are positioned by value (and whose bin count and edges are taken from the smallest and largest keys)
    default_n_bins: number of bins for integers if <is_log_x> is set

    Returns Hist(1, 0, 1) if <values> is empty. Raises Exception if the first or last entry of hist.bin_contents is nonzero.
    """
    assert var_type == 'int' or var_type == 'string'  # floats should be handled by Hist class in hist.py

    if len(values) == 0:
        print('WARNING no values for %s in make_hist' % hist_label)
        return Hist(1, 0, 1)

    is_string = var_type == 'string'
    bin_labels = sorted(values)  # by default sort by keys in dict (i.e. these aren't usually actually string "labels")
    if sort_by_counts and is_string:  # instead sort by counts (only for strings: the integer bounds below need the labels sorted by value)
        bin_labels = sorted(values, key=values.get, reverse=True)

    if is_string:  # one bin per key
        n_bins = len(values)
    elif is_log_x:
        n_bins = default_n_bins
    else:  # one bin per integer from the smallest to the largest key
        n_bins = bin_labels[-1] - bin_labels[0] + 1

    if xmin_force != xmax_force:  # boundaries were set explicitly
        hist = Hist(n_bins, xmin_force, xmax_force)
    elif is_string or is_log_x:  # otherwise work out what they should be: either string bins, or automatic log-spaced bins
        xbins = [0. for _ in range(n_bins + 1)]  # NOTE the +1 is 'cause you need the lower edge of the overflow bin
        set_bins(bin_labels, n_bins, is_log_x, xbins, var_type)  # return value isn't used, so this is expected to fill <xbins> in place
        hist = Hist(n_bins, xbins[0], xbins[-1], xbins=xbins)
    else:  # for integers, just go from the first to the last bin label (they're sorted)
        hist = Hist(n_bins, bin_labels[0] - 0.5, bin_labels[-1] + 0.5)

    for ilabel, key in enumerate(bin_labels):
        count = values[key]
        if is_string:  # string bins are filled in label order (index zero is the underflow bin) and labeled with their key
            ibin, label = ilabel + 1, key
        else:  # integer bins are looked up by value and left unlabeled
            ibin, label = hist.find_bin(key), ''
        hist.set_ibin(ibin, count, error=math.sqrt(count), label=label)

    # make sure there's no overflows
    if hist.bin_contents[0] != 0.0 or hist.bin_contents[-1] != 0.0:
        for ibin in range(hist.n_bins + 2):
            print('%d %f %f' % (ibin, hist.low_edges[ibin], hist.bin_contents[ibin]))
        raise Exception('overflows in ' + hist_label)

    return hist