def sample(self, n_vals, include_overflows=False, debug_plot=False):
    """
    Draw <n_vals> random numbers from the x axis, according to the probabilities given by the bin contents.
    NOTE similarity to recombinator.choose_vdj_combo()

    Arguments:
      n_vals: number of values to draw
      include_overflows: has to be False (see the assert below); passed on to self.normalize() and self.ibiniter()
      debug_plot: if set, draw the original and the sampled distributions on one plot (written with name 'tmp')

    Returns a list of length <n_vals>, each entry of which is the center of the bin that was chosen for that draw.
    """
    assert not include_overflows  # probably doesn't really make sense (since contents of overflows could've been from anywhere below/above, but we'd only return bin center), this is just a way to remind that it doesn't make sense
    self.normalize(include_overflows=include_overflows)  # if this is going to get called a lot with n_vals of 1, this would be slow, but otoh we *really* want to make sure things are normalized with include_overflows the same as it is here
    centers = self.get_bin_centers()

    # upper edge (in cumulative probability) of each bin that we're allowed to choose
    ibins = list(self.ibiniter(include_overflows))
    cumulative_probs = numpy.cumsum([self.bin_contents[ibin] for ibin in ibins])

    pvals = numpy.random.uniform(0, 1, size=n_vals)
    # side='right' gives the first bin whose cumulative prob is strictly greater than pval, i.e. last_sum_prob <= pval < sum_prob (so empty bins never get chosen)
    ipositions = numpy.searchsorted(cumulative_probs, pvals, side='right')
    if len(pvals) > 0:
        populated = [ipos for ipos, ibin in enumerate(ibins) if self.bin_contents[ibin] > 0]
        assert len(populated) > 0  # can't sample from a histogram with nothing in it
        # round-off can leave the last cumulative prob a hair below 1, in which case a draw just under 1 ends up past the end: put it in the last populated bin
        ipositions = numpy.minimum(ipositions, populated[-1])
    return_vals = [centers[ibins[ipos]] for ipos in ipositions]

    if debug_plot:
        import plotting
        fig, ax = plotting.mpl_init()
        self.mpl_plot(ax, label='original')
        shist = Hist(value_list=return_vals, init_int_bins=True)
        shist.normalize(include_overflows=False)
        shist.mpl_plot(ax, label='sampled', color='red')
        plotting.mpl_finish(ax, '', 'tmp')

    return return_vals
def make_transition_plot(self, gene_name, model): """ NOTE shares a lot with make_mutefreq_plot() in python/paramutils.py """ fig, ax = plotting.mpl_init() fig.set_size_inches(plotting.plot_ratios[utils.get_region(gene_name)]) ibin = 0 print utils.color_gene(utils.unsanitize_name(gene_name)) legend_colors = set() # add a color to this the first time you plot it for state in model.states: # bin label ax.text(-0.5 + ibin, -0.075, paramutils.simplify_state_name(state.name), rotation='vertical', size=8) sorted_to_states = {} for name in state.transitions.keys(): if name.find('IG') == 0 or name.find('TR') == 0: sorted_to_states[name] = int(paramutils.simplify_state_name(name)) else: sorted_to_states[name] = name sorted_to_states = sorted(sorted_to_states.items(), key=operator.itemgetter(1)) total = 0.0 for to_state, simple_to_state in sorted_to_states: prob = state.transitions[to_state] alpha = 0.6 width = 3 if 'insert' in str(simple_to_state): label = 'insert' color = '#3498db' # blue elif str(simple_to_state) == 'end': label = 'end' color = 'red' else: # regional/internal states assert to_state.find('IG') == 0 or to_state.find('TR') == 0 label = 'internal' color = 'green' label_to_use = None if color not in legend_colors: label_to_use = label legend_colors.add(color) # horizontal line at height total+prob ax.plot([-0.5 + ibin, 0.5 + ibin], [total + prob, total + prob], color=color, linewidth=width, alpha=alpha, label=label_to_use) # vertical line from total to total + prob ax.plot([ibin, ibin], [total + 0.01, total + prob], color=color, alpha=alpha, linewidth=width) midpoint = 0.5*(prob + 2*total) # ax.text(ibin, midpoint, paramutils.simplify_state_name(to_state)) # nicely labels the midpoint of the chunk between lines, but there isn't really room for it total += prob ibin += 1 ax.get_xaxis().set_visible(False) plotting.mpl_finish(ax, self.base_plotdir + '/transitions', gene_name, ybounds=(-0.01, 1.01), xbounds=(-3, len(model.states) + 3), leg_loc=(0.95, 0.1), 
adjust={'left' : 0.1, 'right' : 0.8}, leg_prop={'size' : 8})
def make_transition_plot(self, gene_name, model): """ NOTE shares a lot with make_mutefreq_plot() in python/paramutils.py """ fig, ax = plotting.mpl_init() fig.set_size_inches(plotting.plot_ratios[utils.get_region(gene_name)]) ibin = 0 print utils.color_gene(utils.unsanitize_name(gene_name)) legend_colors = set() # add a color to this the first time you plot it for state in model.states: # bin label ax.text(-0.5 + ibin, -0.075, paramutils.simplify_state_name(state.name), rotation='vertical', size=8) sorted_to_states = {} for name in state.transitions.keys(): if name.find('IG') == 0: sorted_to_states[name] = int(paramutils.simplify_state_name(name)) else: sorted_to_states[name] = name sorted_to_states = sorted(sorted_to_states.items(), key=operator.itemgetter(1)) total = 0.0 for to_state, simple_to_state in sorted_to_states: prob = state.transitions[to_state] alpha = 0.6 width = 3 if 'insert' in str(simple_to_state): label = 'insert' color = '#3498db' # blue elif str(simple_to_state) == 'end': label = 'end' color = 'red' else: # regional/internal states assert to_state.find('IG') == 0 label = 'internal' color = 'green' label_to_use = None if color not in legend_colors: label_to_use = label legend_colors.add(color) # horizontal line at height total+prob ax.plot([-0.5 + ibin, 0.5 + ibin], [total + prob, total + prob], color=color, linewidth=width, alpha=alpha, label=label_to_use) # vertical line from total to total + prob ax.plot([ibin, ibin], [total + 0.01, total + prob], color=color, alpha=alpha, linewidth=width) midpoint = 0.5*(prob + 2*total) # ax.text(ibin, midpoint, paramutils.simplify_state_name(to_state)) # nicely labels the midpoint of the chunk between lines, but there isn't really room for it total += prob ibin += 1 ax.get_xaxis().set_visible(False) plotting.mpl_finish(ax, self.base_plotdir + '/transitions', gene_name, ybounds=(-0.01, 1.01), xbounds=(-3, len(model.states) + 3), leg_loc=(0.95, 0.1), adjust={'left' : 0.1, 'right' : 0.8}, leg_prop={'size' : 8})
def peruse_naive_seqs(): from hist import Hist # hall = Hist(n_set_list[-1], n_set_list[0] - 0.5, n_set_list[-1] + 0.5) means = [] for n_set in n_set_list: plotdir = baseplotdir + '/' + str(n_set) hist = Hist(fname=plotdir + '/hmm/hamming_to_true_naive.csv') print '%2d %.2f' % (n_set, hist.get_mean()), # hall.set_ibin(hall.find_bin(n_set), hist.get_mean()) means.append(hist.get_mean()) import plotting fig, ax = plotting.mpl_init() # hall.mpl_plot(ax) ax.plot(n_set_list, means, marker='.') plotting.mpl_finish(ax, baseplotdir, 'means', xlabel='N simultaneous seqs', ylabel='mean hamming to true naive', ybounds=(0, None))
def make_mutefreq_plot(plotdir, gene_name, positions): import plotting """ NOTE shares a lot with make_transition_plot() in bin/plot-hmms.py. """ nuke_colors = {'A' : 'red', 'C' : 'blue', 'G' : 'orange', 'T' : 'green'} fig, ax = plotting.mpl_init() fig.set_size_inches(plotting.plot_ratios[utils.get_region(gene_name)]) ibin = 0 print utils.color_gene(utils.unsanitize_name(gene_name)) legend_colors = set() for info in positions: posname = info['name'] # make label below bin ax.text(-0.5 + ibin, -0.075, simplify_state_name(posname), rotation='vertical', size=8) total = 0.0 alpha = 0.6 for nuke, prob in sorted(info['nuke_freqs'].items(), key=operator.itemgetter(1), reverse=True): color = nuke_colors[nuke] label_to_use = None if color not in legend_colors: label_to_use = nuke legend_colors.add(color) # horizontal line at height total+prob ax.plot([-0.5 + ibin, 0.5 + ibin], [total + prob, total + prob], color=color, alpha=alpha, linewidth=3, label=label_to_use) # vertical line from total to total + prob ax.plot([ibin, ibin], [total + 0.01, total + prob], color=color, alpha=alpha, linewidth=3) # # write [ACGT] at midpoint between total and total+prob # midpoint = 0.5*(prob + 2*total) # ... *redacted* total += prob ibin += 1 ax.get_xaxis().set_visible(False) plotting.mpl_finish(ax, plotdir, gene_name, ybounds=(-0.01, 1.01), xbounds=(-3, len(positions) + 3), leg_loc=(0.95, 0.1), adjust={'left' : 0.1, 'right' : 0.8}, leg_prop={'size' : 8})
def make_mutefreq_plot(plotdir, gene_name, positions): """ NOTE shares a lot with make_transition_plot() in bin/plot-hmms.py. """ nuke_colors = {'A' : 'red', 'C' : 'blue', 'G' : 'orange', 'T' : 'green'} fig, ax = plotting.mpl_init() fig.set_size_inches(plotting.plot_ratios[utils.get_region(gene_name)]) ibin = 0 print utils.color_gene(utils.unsanitize_name(gene_name)) legend_colors = set() for info in positions: posname = info['name'] # make label below bin ax.text(-0.5 + ibin, -0.075, simplify_state_name(posname), rotation='vertical', size=8) total = 0.0 alpha = 0.6 for nuke, prob in sorted(info['nuke_freqs'].items(), key=operator.itemgetter(1), reverse=True): color = nuke_colors[nuke] label_to_use = None if color not in legend_colors: label_to_use = nuke legend_colors.add(color) # horizontal line at height total+prob ax.plot([-0.5 + ibin, 0.5 + ibin], [total + prob, total + prob], color=color, alpha=alpha, linewidth=3, label=label_to_use) # vertical line from total to total + prob ax.plot([ibin, ibin], [total + 0.01, total + prob], color=color, alpha=alpha, linewidth=3) # # write [ACGT] at midpoint between total and total+prob # midpoint = 0.5*(prob + 2*total) # ... *redacted* total += prob ibin += 1 ax.get_xaxis().set_visible(False) plotting.mpl_finish(ax, plotdir, gene_name, ybounds=(-0.01, 1.01), xbounds=(-3, len(positions) + 3), leg_loc=(0.95, 0.1), adjust={'left' : 0.1, 'right' : 0.8}, leg_prop={'size' : 8})
def peruse_forward_scores(): _, reco_info = seqfileopener.get_seqfile_info(simfname, is_data=False) #, n_max_queries=10000) logprobs, partialcorr_logprobs, corr_logprobs = OrderedDict(), OrderedDict(), OrderedDict() for n_set in n_set_list: print n_set # if n_set != 5: # continue logprobs[n_set], partialcorr_logprobs[n_set], corr_logprobs[n_set] = OrderedDict(), OrderedDict(), OrderedDict() with open(outputdir + '/' + str(n_set) + '-forward.csv') as csvfile: reader = csv.DictReader(csvfile) for line in reader: uidlist = line['unique_ids'].split(':') assert utils.from_same_event(reco_info, uidlist) reco_id = reco_info[uidlist[0]]['reco_id'] if reco_id in logprobs[n_set]: raise Exception('already had %s' % reco_id) logprobs[n_set][reco_id] = float(line['logprob']) factor = 1. / n_set partialcorr_logprobs[n_set][reco_id] = factor * float(line['logprob']) factor = (1. - 0.24 / pow(float(n_set), 0.9)) / n_set # factor = 1. / (0.77547824*n_set + 0.20327936) corr_logprobs[n_set][reco_id] = factor * float(line['logprob']) i_baseline = -1 deviations = get_deviations(logprobs, i_baseline) # fit_stuff(n_set_list, deviations) partialcorr_deviations = get_deviations(partialcorr_logprobs, i_baseline) signed_partialcorr_deviations = get_deviations(partialcorr_logprobs, i_baseline, signed=True) corr_deviations = get_deviations(corr_logprobs, i_baseline) signed_corr_deviations = get_deviations(corr_logprobs, i_baseline, signed=True) import plotting fig, ax = plotting.mpl_init() ax.plot(n_set_list, deviations, marker='.') plotting.mpl_finish(ax, baseplotdir, 'forwards', xlabel='N simultaneous seqs', ylabel='log prob deviation to ' + str(n_set_list[i_baseline])) #, ybounds=(-0.02, 0.02)) # fig, ax = plotting.mpl_init() # ax.plot(n_set_list, partialcorr_deviations, marker='.') # ax.plot([n_set_list[0], n_set_list[-1]], [0, 0]) # plotting.mpl_finish(ax, baseplotdir, 'partially-corrected-forwards', xlabel='N simultaneous seqs', ylabel='log prob deviation to ' + 
str(n_set_list[i_baseline])) #, ybounds=(-0.02, 0.02)) fig, ax = plotting.mpl_init() ax.plot(n_set_list, partialcorr_deviations, marker='.', label='1/n (abs)') ax.plot(n_set_list, signed_partialcorr_deviations, marker='.', label='1/n') ax.plot(n_set_list, corr_deviations, marker='.', label='1/crap (abs)') ax.plot(n_set_list, signed_corr_deviations, marker='.', label='1/crap') ax.plot([n_set_list[0], n_set_list[-1]], [0, 0]) plotting.mpl_finish(ax, baseplotdir, 'corrected-forwards', xlabel='N simultaneous seqs', ylabel='log prob deviation to ' + str(n_set_list[i_baseline])) #, ybounds=(-0.02, 0.02)) fig, ax = plotting.mpl_init() ax.plot(n_set_list, signed_corr_deviations, marker='.') ax.plot([n_set_list[0], n_set_list[-1]], [0, 0]) plotting.mpl_finish(ax, baseplotdir, 'signed-corrected-forwards', xlabel='N simultaneous seqs', ylabel='log prob deviation to ' + str(n_set_list[i_baseline])) #, ybounds=(-0.02, 0.02))
def peruse_forward_scores(): _, reco_info = seqfileopener.get_seqfile_info( simfname, is_data=False) #, n_max_queries=10000) logprobs, partialcorr_logprobs, corr_logprobs = OrderedDict(), OrderedDict( ), OrderedDict() for n_set in n_set_list: print n_set # if n_set != 5: # continue logprobs[n_set], partialcorr_logprobs[n_set], corr_logprobs[ n_set] = OrderedDict(), OrderedDict(), OrderedDict() with open(outputdir + '/' + str(n_set) + '-forward.csv') as csvfile: reader = csv.DictReader(csvfile) for line in reader: uidlist = line['unique_ids'].split(':') assert utils.from_same_event(reco_info, uidlist) reco_id = reco_info[uidlist[0]]['reco_id'] if reco_id in logprobs[n_set]: raise Exception('already had %s' % reco_id) logprobs[n_set][reco_id] = float(line['logprob']) factor = 1. / n_set partialcorr_logprobs[n_set][reco_id] = factor * float( line['logprob']) factor = (1. - 0.24 / pow(float(n_set), 0.9)) / n_set # factor = 1. / (0.77547824*n_set + 0.20327936) corr_logprobs[n_set][reco_id] = factor * float(line['logprob']) i_baseline = -1 deviations = get_deviations(logprobs, i_baseline) # fit_stuff(n_set_list, deviations) partialcorr_deviations = get_deviations(partialcorr_logprobs, i_baseline) signed_partialcorr_deviations = get_deviations(partialcorr_logprobs, i_baseline, signed=True) corr_deviations = get_deviations(corr_logprobs, i_baseline) signed_corr_deviations = get_deviations(corr_logprobs, i_baseline, signed=True) import plotting fig, ax = plotting.mpl_init() ax.plot(n_set_list, deviations, marker='.') plotting.mpl_finish(ax, baseplotdir, 'forwards', xlabel='N simultaneous seqs', ylabel='log prob deviation to ' + str(n_set_list[i_baseline])) #, ybounds=(-0.02, 0.02)) # fig, ax = plotting.mpl_init() # ax.plot(n_set_list, partialcorr_deviations, marker='.') # ax.plot([n_set_list[0], n_set_list[-1]], [0, 0]) # plotting.mpl_finish(ax, baseplotdir, 'partially-corrected-forwards', xlabel='N simultaneous seqs', ylabel='log prob deviation to ' + 
str(n_set_list[i_baseline])) #, ybounds=(-0.02, 0.02)) fig, ax = plotting.mpl_init() ax.plot(n_set_list, partialcorr_deviations, marker='.', label='1/n (abs)') ax.plot(n_set_list, signed_partialcorr_deviations, marker='.', label='1/n') ax.plot(n_set_list, corr_deviations, marker='.', label='1/crap (abs)') ax.plot(n_set_list, signed_corr_deviations, marker='.', label='1/crap') ax.plot([n_set_list[0], n_set_list[-1]], [0, 0]) plotting.mpl_finish(ax, baseplotdir, 'corrected-forwards', xlabel='N simultaneous seqs', ylabel='log prob deviation to ' + str(n_set_list[i_baseline])) #, ybounds=(-0.02, 0.02)) fig, ax = plotting.mpl_init() ax.plot(n_set_list, signed_corr_deviations, marker='.') ax.plot([n_set_list[0], n_set_list[-1]], [0, 0]) plotting.mpl_finish(ax, baseplotdir, 'signed-corrected-forwards', xlabel='N simultaneous seqs', ylabel='log prob deviation to ' + str(n_set_list[i_baseline])) #, ybounds=(-0.02, 0.02))
himax = Hist(75, 0., 400) for uid in annotations: himax.fill(chfo[uid]['imax']) utils.prep_dir(args.plotdir, wildlings=['*.svg', '*.csv']) import matplotlib from matplotlib import pyplot as plt fig, ax = plotting.mpl_init() xvals, yvals = zip(*[(v['imax'], v['max_abs_diff']) for v in chfo.values()]) plt.scatter(xvals, yvals, alpha=0.4) print 'writing to %s' % args.plotdir plotting.mpl_finish(ax, args.plotdir, 'hexbin', title=args.title, xlabel='break point', ylabel='abs mfreq diff') plotting.draw_no_root(hmaxval, plotdir=args.plotdir, plotname='mfreq-diff', shift_overflows=True, xtitle='abs mfreq diff', ytitle='seqs') hmaxval.write('%s/%s.csv' % (args.plotdir, 'mfreq-diff')) plotting.draw_no_root(himax, plotdir=args.plotdir, plotname='imax', shift_overflows=True,
def plot(self, plotdir, only_csv=False): utils.prep_dir(plotdir, wildling=None, multilings=['*.csv', '*.svg', '*.root']) for column in self.values: if self.only_correct_gene_fractions and column not in bool_columns: continue if column in bool_columns: right = self.values[column]['right'] wrong = self.values[column]['wrong'] errs = fraction_uncertainty.err(right, right+wrong) print ' %s\n correct up to allele: %4d / %-4d = %4.4f (-%.3f, +%.3f)' % (column, right, right+wrong, float(right) / (right + wrong), errs[0], errs[1]) hist = plotting.make_bool_hist(right, wrong, self.name + '-' + column) plotting.draw_no_root(hist, plotname=column, plotdir=plotdir, write_csv=True, stats='0-bin', only_csv=only_csv) else: # TODO this is dumb... I should make the integer-valued ones histograms as well hist = plotting.make_hist_from_dict_of_counts(self.values[column], 'int', self.name + '-' + column, normalize=True) log = '' if column.find('hamming_to_true_naive') >= 0: # TODO why doesn't this just use the config dicts in plotheaders or wherever? hist.title = 'hamming distance' else: hist.title = 'inferred - true' plotting.draw_no_root(hist, plotname=column, plotdir=plotdir, write_csv=True, log=log, only_csv=only_csv) for column in self.hists: plotting.draw_no_root(self.hists[column], plotname=column, plotdir=plotdir, write_csv=True, log=log, only_csv=only_csv) # per-gene support crap for region in utils.regions: if self.hists[region + '_allele_right_vs_per_gene_support'].integral(include_overflows=True) == 0: continue xvals = self.hists[region + '_allele_right_vs_per_gene_support'].get_bin_centers() #ignore_overflows=True) right = self.hists[region + '_allele_right_vs_per_gene_support'].bin_contents wrong = self.hists[region + '_allele_wrong_vs_per_gene_support'].bin_contents yvals = [float(r) / (r + w) if r + w > 0. else 0. for r, w in zip(right, wrong)] # remove values corresponding to bins with no entries while yvals.count(0.) > 0: iv = yvals.index(0.) 
xvals.pop(iv) right.pop(iv) wrong.pop(iv) yvals.pop(iv) tmphilos = [fraction_uncertainty.err(r, r + w) for r, w in zip(right, wrong)] yerrs = [err[1] - err[0] for err in tmphilos] # fitting a line isn't particularly informative, actually # params, cov = numpy.polyfit(xvals, yvals, 1, w=[1./(e*e) if e > 0. else 0. for e in yerrs], cov=True) # slope, slope_err = params[0], math.sqrt(cov[0][0]) # y_icpt, y_icpt_err = params[1], math.sqrt(cov[1][1]) # print '%s slope: %5.2f +/- %5.2f y-intercept: %5.2f +/- %5.2f' % (region, slope, slope_err, y_icpt, y_icpt_err) # print '%s' % region # for iv in range(len(xvals)): # print ' %5.2f %5.0f / %5.0f = %5.2f +/- %.3f' % (xvals[iv], right[iv], right[iv] + wrong[iv], yvals[iv], yerrs[iv]) fig, ax = plotting.mpl_init() ax.errorbar(xvals, yvals, yerr=yerrs, markersize=10, linewidth=1, marker='.') ax.plot((0, 1), (0, 1), color='black', linestyle='--', linewidth=3) # line with slope 1 and intercept 0 # linevals = [slope*x + y_icpt for x in [0] + xvals] # fitted line # ax.plot([0] + xvals, linevals) plotting.mpl_finish(ax, plotdir, region + '_allele_fraction_correct_vs_per_gene_support', xlabel='support', ylabel='fraction correct', xbounds=(-0.1, 1.1), ybounds=(-0.1, 1.1)) if not only_csv: plotting.make_html(plotdir)
def make_mutefreq_plot(plotdir, gene_name, positions, debug=False): import plotting """ NOTE shares a lot with make_transition_plot() in bin/plot-hmms.py. """ nuke_colors = {'A': 'red', 'C': 'blue', 'G': 'orange', 'T': 'green'} fig, ax = plotting.mpl_init() fig.set_size_inches(plotting.plot_ratios[utils.get_region(gene_name)]) ibin = 0 if debug: print ' %s' % utils.color_gene(utils.unsanitize_name(gene_name)) legend_colors = set() for info in positions: posname = info['name'] # make label below bin for position and germline nuke ax.text(-0.5 + ibin, -0.075, simplify_state_name(posname), rotation='vertical', size=8) ax.text(-0.5 + ibin, -0.15, info.get('gl_nuke', '?'), fontsize=10, fontweight='bold') sorted_nukes, _ = zip(*sorted(info['nuke_freqs'].items(), key=operator.itemgetter(1), reverse=True)) if 'gl_nuke' in info and info['gl_nuke'] in info[ 'nuke_freqs']: # put the germline nuke first if we have it (second clause is for states with germline N)) sorted_nukes = [info['gl_nuke']] + [ n for n in sorted_nukes if n != info['gl_nuke'] ] total = 0.0 alpha = 0.6 for nuke in sorted_nukes: prob = info['nuke_freqs'][nuke] color = nuke_colors[nuke] label_to_use = None if color not in legend_colors: label_to_use = nuke legend_colors.add(color) # horizontal line at height total+prob ax.plot([-0.5 + ibin, 0.5 + ibin], [total + prob, total + prob], color=color, alpha=alpha, linewidth=3, label=label_to_use) # vertical line from total to total + prob ax.plot([ibin, ibin], [total + 0.01, total + prob], color=color, alpha=alpha, linewidth=3) # # write [ACGT] at midpoint between total and total+prob # midpoint = 0.5*(prob + 2*total) # ... *redacted* total += prob ibin += 1 ax.get_xaxis().set_visible(False) plotting.mpl_finish(ax, plotdir, gene_name, ybounds=(-0.01, 1.01), xbounds=(-3, len(positions) + 3), leg_loc=(0.95, 0.1), adjust={ 'left': 0.1, 'right': 0.8 }, leg_prop={'size': 8})
def make_single_hexbin_size_vs_shm_plot(self, sorted_clusters, annotations, repertoire_size, base_plotdir, plotname, n_max_mutations=100, log_cluster_size=False, debug=False):
    """
    Make a hexbin plot of cluster size vs mean number of mutations, using the clusters whose mean is less than <n_max_mutations>, and write it to <base_plotdir>/overall.
    Clusters that contain any of self.args.queries_to_include are marked in red and labelled with those queries.
    If <log_cluster_size> is set, the y axis is the log of the cluster size and '-log' is appended to <plotname>.
    """
    import plotting
    import matplotlib.pyplot as plt

    def mean_n_mutations(cluster):  # mean of the 'n_mutations' list from this cluster's annotation
        return numpy.mean(annotations[':'.join(cluster)]['n_mutations'])

    def ysize(cluster):  # y value for this cluster
        return math.log(len(cluster)) if log_cluster_size else len(cluster)

    fig, ax = plotting.mpl_init()

    points = []
    for cluster in sorted_clusters:
        if mean_n_mutations(cluster) < n_max_mutations:
            points.append([mean_n_mutations(cluster), len(cluster)])
    xvals, yvals = zip(*points)
    if log_cluster_size:
        yvals = [math.log(yv) for yv in yvals]
    ax.hexbin(xvals, yvals, gridsize=n_max_mutations, cmap=plt.cm.Blues, bins='log')

    # evenly spaced ticks between the first and last y values
    nticks = 5
    ystep = (yvals[-1] - yvals[0])
    yticks = [yvals[0] + itick * ystep / float(nticks - 1) for itick in range(nticks)]
    if log_cluster_size:
        yticklabels = []
        for size in [math.exp(yt) for yt in yticks]:  # labels are in terms of the actual size, not the log
            yticklabels.append(('%.0f' % size) if size > 5 else ('%.1f' % size))
    else:
        yticklabels = [int(yt) for yt in yticks]

    if self.args.queries_to_include is not None:
        for cluster in sorted_clusters:
            queries_to_include_in_this_cluster = set(cluster) & set(self.args.queries_to_include)
            if len(queries_to_include_in_this_cluster) == 0:
                continue
            xval = mean_n_mutations(cluster)
            yval = ysize(cluster)
            ax.plot([xval], [yval], color='red', marker='.', markersize=10)
            ax.text(xval, yval, ' '.join(queries_to_include_in_this_cluster), color='red', fontsize=8)

    ylabel = 'clonal family size'
    if log_cluster_size:
        ylabel += ' (log)'
        plotname += '-log'
    plotting.mpl_finish(ax, base_plotdir + '/overall', plotname,
                        xlabel='mean N mutations', ylabel=ylabel,
                        xbounds=[0, n_max_mutations],
                        yticks=yticks, yticklabels=yticklabels)
def make_single_size_vs_shm_plot(self, sorted_clusters, annotations, repertoire_size, base_plotdir, plotname, n_max_mutations=100, plot_high_mutation=False, title=None, debug=False): import plotting def gety(minval, maxval, xmax, x): slope = (maxval - minval) / xmax return slope * x + minval def getnmutelist(cluster): return annotations[':'.join(cluster)]['n_mutations'] colors = ['#006600', '#3399ff', '#ffa500'] # goldenrod '#daa520' # red '#cc0000', # dark red '#990012' # purple '#a821c7' # grey '#808080' dpi = 80 xpixels = 450 ypixels = max(400, 10 * len(sorted_clusters)) fig, ax = plotting.mpl_init(figsize=(xpixels / dpi, ypixels / dpi)) min_linewidth = 0.3 max_linewidth = 12 # min_alpha = 0.1 # max_alpha = 1. # linewidth = 7 alpha = 0.55 ymin, ymax = 9999, 0 iclust_global = 0 yticks, yticklabels = [], [] high_mutation_clusters = [] biggest_n_mutations = None if debug: print ' %s %d x %d' % ( plotname, xpixels, ypixels ) #, utils.color('red', 'high mutation') if plot_high_mutation else '') print ' size frac yval median mean' for csize, cluster_group in itertools.groupby(sorted_clusters, key=lambda c: len(c)): cluster_group = sorted(list(cluster_group), key=lambda c: numpy.median(getnmutelist(c))) n_clusters = len(cluster_group) repfracstr = self.get_repfracstr(csize, repertoire_size) for iclust in range(len(cluster_group)): cluster = cluster_group[iclust] nmutelist = sorted(getnmutelist(cluster)) nmedian = numpy.median(nmutelist) nmean = numpy.mean( nmutelist) # maybe should use this instead of the median? 
if biggest_n_mutations is None or nmutelist[ -1] > biggest_n_mutations: biggest_n_mutations = nmutelist[-1] yval = len(sorted_clusters) - iclust_global if yval < ymin: ymin = yval if yval > ymax: ymax = yval yticks.append(yval) yticklabels.append('%d' % csize) # yticklabels.append(repfracstr) base_color = colors[iclust_global % len(colors)] if self.args.queries_to_include is not None: queries_to_include_in_this_cluster = set(cluster) & set( self.args.queries_to_include) if len(queries_to_include_in_this_cluster) > 0: base_color = 'red' if plot_high_mutation: xtext = 1.1 elif float(nmedian) / n_max_mutations < 0.5: xtext = 0.75 else: xtext = 0.1 ax.text(xtext * n_max_mutations, yval, ' '.join(queries_to_include_in_this_cluster), color='red', fontsize=8) if debug: print ' %5s %-10s %4.1f %6.1f %6.1f' % ( '%d' % csize if iclust == 0 else '', repfracstr if iclust == 0 else '', yval, nmedian, nmean), if nmedian > n_max_mutations and not plot_high_mutation: if debug: print '%s' % utils.color('red', 'high mutation') high_mutation_clusters.append(cluster) continue if debug: print '' nbins = nmutelist[-1] - nmutelist[0] + 1 hist = Hist(nbins, nmutelist[0] - 0.5, nmutelist[-1] + 0.5) for nm in nmutelist: hist.fill(nm) assert hist.overflow_contents() == 0. 
# includes underflows xmax = max(hist.bin_contents) # float(csize) for ibin in range(1, hist.n_bins + 1): linewidth = gety(min_linewidth, max_linewidth, xmax, hist.bin_contents[ibin]) color = base_color # alpha = gety(min_alpha, max_alpha, xmax, hist.bin_contents[ibin]) if hist.bin_contents[ibin] == 0.: color = 'grey' linewidth = min_linewidth alpha = 0.4 ax.plot([hist.low_edges[ibin], hist.low_edges[ibin + 1]], [yval, yval], color=color, linewidth=linewidth, alpha=alpha, solid_capstyle='butt') iclust_global += 1 xbounds = [-0.2, n_max_mutations] if not plot_high_mutation else [ n_max_mutations, biggest_n_mutations ] ybounds = [0.95 * ymin, 1.05 * ymax] n_ticks = 5 if len(yticks) > n_ticks: yticks = [ yticks[i] for i in range(0, len(yticks), int(len(yticks) / float(n_ticks - 1))) ] yticklabels = [ yticklabels[i] for i in range(0, len(yticklabels), int(len(yticklabels) / float(n_ticks - 1))) ] plotting.mpl_finish(ax, base_plotdir + '/overall', plotname, xlabel='N mutations', ylabel='clonal family size', title=title, xbounds=xbounds, ybounds=ybounds, yticks=yticks, yticklabels=yticklabels, adjust={'left': 0.18}) return high_mutation_clusters
def fullplot(self, plotdir, plotname, **kwargs):
    """
    Do the full plotting process (i.e. not just the ax.plot type stuff in self.mpl_plot()): make a new figure, draw ourselves on it, finish it off with plotting.mpl_finish() (to which <kwargs> are passed), and write ourselves to <plotdir>/<plotname>.csv.
    """
    import plotting

    _, axes = plotting.mpl_init()  # this'll need to be updated when i want to use a kwarg for this fcn
    self.mpl_plot(axes)
    plotting.mpl_finish(axes, plotdir, plotname, **kwargs)

    csv_path = '%s/%s.csv' % (plotdir, plotname)
    self.write(csv_path)