def __init__(self, args, base_plotdir, skip_boring_states=''): raise Exception('needs to be converted off root') self.base_plotdir = base_plotdir self.skip_boring_states = skip_boring_states plot_types = ('transitions', 'emissions') for ptype in plot_types: plotdir = self.base_plotdir + '/' + ptype + '/plots' utils.prep_dir(plotdir, wildlings='*.png') if args.hmmdir != None: filelist = glob.glob(args.hmmdir + '/*.yaml') else: filelist = utils.get_arg_list(args.infiles) if len(filelist) == 0: print 'ERROR zero files passed to modelplotter' sys.exit() for infname in filelist: gene_name = os.path.basename(infname).replace('.yaml', '') # the sanitized name, actually # # ---------------------------------------------------------------------------------------- # if utils.get_region(gene_name) == 'v' and 'IGHV4-39_star_' not in gene_name: # continue # # ---------------------------------------------------------------------------------------- with open(infname) as infile: model = yaml.load(infile) self.make_transition_plot(gene_name, model) self.make_emission_plot(gene_name, model) for ptype in plot_types: check_call(['./bin/makeHtml', self.base_plotdir + '/' + ptype, '1', 'null', 'png']) check_call(['./bin/permissify-www', self.base_plotdir])
def plot(self, plotdir, partition=None, infiles=None, annotations=None, only_csv=None):
    """Plot cluster size distributions, either for one partition or averaged over several files.

    Either <partition> (together with <annotations>, and without <infiles>) or <infiles> has
    to be given; if both are None the final branch asserts. With <infiles>, the best partition
    is read from each file, the per-file histograms are written as csvs in <plotdir>, and their
    mean is what gets plotted. Html is made for each subdir unless <only_csv> is set.
    """
    print(' plotting partitions')
    sys.stdout.flush()
    start = time.time()

    for subdir in self.subplotdirs:
        utils.prep_dir(plotdir + '/' + subdir, wildlings=['*.csv', '*.svg'])

    if partition is not None:  # one partition
        assert infiles is None
        assert annotations is not None
        csize_hists = {'best': plotting.get_cluster_size_hist(partition)}
        self.plot_within_vs_between_hists(partition, annotations, plotdir)
    elif infiles is not None:  # plot the mean of a partition from each file
        def best_partition_hist(fname):
            # cluster size hist for the best partition in <fname>
            cpath = ClusterPath()
            cpath.readfile(fname)
            return plotting.get_cluster_size_hist(cpath.partitions[cpath.i_best])

        subset_hists = [best_partition_hist(fname) for fname in infiles]
        csize_hists = {'best': plotting.make_mean_hist(subset_hists)}
        for isub, subhist in enumerate(subset_hists):
            subhist.write(plotdir + ('/subset-%d-cluster-sizes.csv' % isub))
    else:
        assert False

    plotting.plot_cluster_size_hists(plotdir + '/overall/cluster-sizes.svg', csize_hists, title='', log='x')

    if not only_csv:
        for subdir in self.subplotdirs:
            plotting.make_html(plotdir + '/' + subdir)

    print('(%.1f sec)' % (time.time() - start))
def clean_plots(self, plotdir):
    """Prepare the output directories under <plotdir> via utils.prep_dir().

    Covers the mutation frequency subdir (delegated to <self.mfreqer>), the 'overall' subdir,
    and one subdir for each column in <self.counts> that is also in
    <self.columns_to_subset_by_gene>.
    """
    self.mfreqer.clean_plots(plotdir + '/mute-freqs')
    utils.prep_dir(plotdir + '/overall', wildlings=('*.csv', '*.svg'))

    per_gene_columns = [column for column in self.counts if column in self.columns_to_subset_by_gene]
    for column in per_gene_columns:
        utils.prep_dir(plotdir + '/' + column, wildlings=['*.csv', '*.svg'])
def plot(self, plotdir, only_csv=False):
    """Draw every column in <self.values> and every histogram in <self.hists> into <plotdir>.

    Boolean columns (those in <bool_columns>) are drawn as right/wrong histograms, and their
    fraction correct is printed along with its uncertainty. All other columns are drawn as
    normalized integer histograms, unless <self.only_correct_gene_fractions> is set, in which
    case they are skipped. A csv is written for each plot; html is made unless <only_csv>.
    """
    utils.prep_dir(plotdir, wildling=None, multilings=['*.csv', '*.svg', '*.root'])

    # Set this up front: the <self.hists> loop below also passes it on, so it has to exist even
    # when the first loop never reaches its non-boolean branch (e.g. no such columns, or
    # <self.only_correct_gene_fractions> is set).
    log = ''

    for column in self.values:
        if self.only_correct_gene_fractions and column not in bool_columns:
            continue
        if column in bool_columns:
            right = self.values[column]['right']
            wrong = self.values[column]['wrong']
            errs = fraction_uncertainty.err(right, right + wrong)
            print(' %s\n correct up to allele: %4d / %-4d = %4.4f (-%.3f, +%.3f)' % (column, right, right + wrong, float(right) / (right + wrong), errs[0], errs[1]))
            hist = plotting.make_bool_hist(right, wrong, self.name + '-' + column)
            plotting.draw_no_root(hist, plotname=column, plotdir=plotdir, write_csv=True, stats='0-bin', only_csv=only_csv)
        else:
            # TODO this is dumb... I should make the integer-valued ones histograms as well
            hist = plotting.make_hist_from_dict_of_counts(self.values[column], 'int', self.name + '-' + column, normalize=True)
            if column.find('hamming_to_true_naive') >= 0:  # TODO why doesn't this just use the config dicts in plotheaders or wherever?
                hist.title = 'hamming distance'
            else:
                hist.title = 'inferred - true'
            plotting.draw_no_root(hist, plotname=column, plotdir=plotdir, write_csv=True, log=log, only_csv=only_csv)

    for column in self.hists:
        plotting.draw_no_root(self.hists[column], plotname=column, plotdir=plotdir, write_csv=True, log=log, only_csv=only_csv)

    if not only_csv:
        plotting.make_html(plotdir)
def clean_plots(self, plotdir, subset_by_gene):
    """Prepare the output directories under <plotdir> via utils.prep_dir().

    Always handles the mutation frequency subdir (delegated to <self.mfreqer>) and the
    'overall' subdir. If <subset_by_gene> is set, also prepares one subdir for each deletion
    and (real) insertion column in <self.counts>.
    """
    self.mfreqer.clean_plots(plotdir + '/mute-freqs')
    utils.prep_dir(plotdir + '/overall')  #, multilings=('*.csv', '*.svg'))

    for column in self.counts:
        if not subset_by_gene:
            continue
        # option to subset deletion and (real) insertion plots by gene
        if '_del' in column or column in ('vd_insertion', 'dj_insertion'):
            utils.prep_dir(plotdir + '/' + column, wildlings=['*.csv', '*.svg'])
def prep_cmdfo(iclust, seqfos, queries_to_include, color_scale_vals, title):
    """Write the fasta input for one cluster and return the info needed to run mds on it.

    Uses <self>, <plotdir> and <get_fname> from the enclosing scope (they are not parameters).
    Each sequence in <seqfos> is written to <workdir>/mds-<iclust>/seqs.fa, with its value from
    <color_scale_vals> (if it has one) appended to the header line. Returns a dict with keys
    'cmd_str', 'workdir', 'outfname' and 'workfnames'.
    """
    subworkdir = '%s/mds-%d' % (self.args.workdir, iclust)
    utils.prep_dir(subworkdir)
    tmpfname = '%s/seqs.fa' % subworkdir

    with open(tmpfname, 'w') as tmpfile:
        for sfo in seqfos:
            uid = sfo['name']
            csval = color_scale_vals[uid] if uid in color_scale_vals else None
            header_suffix = '' if csval is None else (' %d' % csval)
            tmpfile.write('>%s%s\n%s\n' % (uid, header_suffix, sfo['seq']))

    plotname = get_fname(iclust)
    cmd_pieces = ['./bin/mds-run.py %s --aligned --plotdir %s --plotname %s --workdir %s --seed %d' % (tmpfname, plotdir, plotname, subworkdir, self.args.seed)]
    if queries_to_include is not None:
        cmd_pieces.append(' --queries-to-include %s' % ':'.join(queries_to_include))
    if title is not None:
        cmd_pieces.append(' --title=%s' % title.replace(' ', '@'))  # spaces are swapped for '@' in the command line argument

    return {
        'cmd_str': ''.join(cmd_pieces),
        'workdir': subworkdir,
        'outfname': '%s/%s.svg' % (plotdir, plotname),
        'workfnames': [tmpfname],
    }
def write(self, base_outdir, mean_freq_outfname):
    """Write per-gene mutation frequency csvs, plus the mean rate files.

    One csv per gene in <self.counts> goes into <base_outdir>/mute-freqs, with one row per
    position (sorted) holding the overall mutation frequency and its errors followed by the
    per-nucleotide frequencies and their errors. <mean_freq_outfname> must contain the literal
    string 'REGION', which is replaced by 'all' and by each region to get the mean rate file
    names. Calls self.finalize() first if that hasn't happened yet.
    """
    if not self.finalized:
        self.finalize()

    outdir = base_outdir + '/mute-freqs'
    utils.prep_dir(outdir, '*.csv')

    # for each nucleotide: the frequency column, then its lower and upper error columns
    nuke_columns = [nuke + suffix for nuke in utils.nukes for suffix in ('', '_lo_err', '_hi_err')]
    fieldnames = ('position', 'mute_freq', 'lo_err', 'hi_err') + tuple(nuke_columns)

    for gene in self.counts:
        counts = self.counts[gene]
        freqs = self.freqs[gene]
        plotting_info = self.plotting_info[gene]  # not used below, but the lookup is kept as it was
        sorted_positions = sorted(counts)
        outfname = outdir + '/' + utils.sanitize_name(gene) + '.csv'
        with opener('w')(outfname) as outfile:
            writer = csv.DictWriter(outfile, fieldnames)
            writer.writeheader()
            for position in sorted_positions:
                position_counts = counts[position]
                row = {
                    'position': position,
                    'mute_freq': position_counts['freq'],
                    'lo_err': position_counts['freq_lo_err'],
                    'hi_err': position_counts['freq_hi_err'],
                }
                row.update((colname, freqs[position][colname]) for colname in nuke_columns)
                writer.writerow(row)

    assert 'REGION' in mean_freq_outfname
    self.mean_rates['all'].write(mean_freq_outfname.replace('REGION', 'all'))  # hackey hackey hackey replacement... *sigh*
    for region in utils.regions:
        self.mean_rates[region].write(mean_freq_outfname.replace('REGION', region))
def __init__(self, args, base_plotdir, skip_boring_states=''):
    """Plot the transitions and emissions of each hmm yaml model file, then build the html pages.

    The model files are <args.hmmdir>/*.yaml if <args.hmmdir> is set, otherwise whatever
    utils.get_arg_list() returns for <args.infiles>. If there are no files, an error is
    printed and the process exits with a non-zero status. A plot directory is prepared, and
    the html script run, for each of the three plot types, although pair emission plotting
    itself is currently commented out.
    """
    self.base_plotdir = base_plotdir
    self.skip_boring_states = skip_boring_states

    plot_types = ('transitions', 'emissions', 'pair-emissions')
    for ptype in plot_types:
        plotdir = self.base_plotdir + '/' + ptype + '/plots'
        utils.prep_dir(plotdir, '*.png')

    if args.hmmdir is not None:
        filelist = glob.glob(args.hmmdir + '/*.yaml')
    else:
        filelist = utils.get_arg_list(args.infiles)
    if not filelist:
        print('ERROR zero files passed to modelplotter')
        sys.exit(1)  # this is a failure, so don't report success to whoever called us

    for infname in filelist:
        gene_name = os.path.basename(infname).replace('.yaml', '')  # the sanitized name, actually
        # debugging filter, left switched off:
        # if utils.get_region(gene_name) == 'v' and 'IGHV4-39_star_' not in gene_name:
        #     continue
        with open(infname) as infile:
            # NOTE(review): yaml.load() can construct arbitrary python objects, so this should only be pointed
            # at trusted model files (or switched to yaml.safe_load() if the model files allow it -- confirm)
            model = yaml.load(infile)
            self.make_transition_plot(gene_name, model)
            self.make_emission_plot(gene_name, model)
            # self.make_pair_emission_plot(gene_name, model)

    for ptype in plot_types:
        check_call(['./bin/makeHtml', self.base_plotdir + '/' + ptype, '1', 'null', 'png'])
    check_call(['./bin/permissify-www', self.base_plotdir])
def write_vdjalign_input(self, base_infname, n_procs):
    """Split <self.remaining_queries> into <n_procs> consecutive chunks and write each chunk as a fasta file.

    The file for process <iproc> is <base_infname> inside self.subworkdir(iproc, n_procs); that
    directory is only prepared here when there is more than one process. For queries listed in
    <self.info['indels']>, the indel-reversed sequence is written instead of the input sequence.
    Raises an exception if any remaining query ends up in none of the files.
    """
    n_remaining = len(self.remaining_queries)
    n_queries_per_proc = int(math.ceil(float(n_remaining) / n_procs))
    written_queries = set()  # make sure we actually write each query TODO remove this when you work out where they're disappearing to
    if n_procs == 1:  # double check for rounding problems or whatnot
        assert n_queries_per_proc == n_remaining

    for iproc in range(n_procs):
        workdir = self.subworkdir(iproc, n_procs)
        if n_procs > 1:
            utils.prep_dir(workdir)
        istart = iproc * n_queries_per_proc
        istop = (iproc + 1) * n_queries_per_proc  # exclusive
        with opener('w')(workdir + '/' + base_infname) as sub_infile:
            # NOTE this is wasteful to loop of all the remaining queries for each process... but maybe not that wasteful
            for iquery, query_name in enumerate(self.remaining_queries):
                if iquery >= n_remaining:
                    break
                if not (istart <= iquery < istop):  # not for this process
                    continue
                sub_infile.write('>' + query_name + ' NUKES\n')
                seq = self.input_info[query_name]['seq']
                indelfos = self.info['indels']
                if query_name in indelfos:
                    seq = indelfos[query_name]['reversed_seq']  # use the query sequence with shm insertions and deletions reversed
                sub_infile.write(seq + '\n')
                written_queries.add(query_name)

    not_written = self.remaining_queries - written_queries
    if len(not_written) > 0:
        raise Exception("didn't write %s to %s" % (':'.join(not_written), self.args.workdir))
def plot(self, plotdir, subset_by_gene=False, cyst_positions=None, tryp_positions=None):
    """Plot a histogram of the counts for each column in <self.counts> (except 'all'), then the mutation frequencies.

    If <subset_by_gene> is set, the deletion and (real) insertion columns additionally get one
    plot per gene in their own subdir of <plotdir>. Whether a column is plotted as 'int' or
    'string' is decided by whether int() accepts its values (the last value looked at wins).
    The html and permission scripts are run on <plotdir> only if <has_root> is set. Prints the
    total time taken.
    """
    print(' plotting parameters')
    start = time.time()
    utils.prep_dir(plotdir + '/plots')  #, multilings=('*.csv', '*.svg'))
    for column in self.counts:
        if column == 'all':
            continue
        values, gene_values = {}, {}
        if len(self.counts[column]) == 0:
            print('ERROR no counts in %s' % column)
            assert False

        # option to subset deletion and (real) insertion plots by gene
        subset_this_column = subset_by_gene and ('_del' in column or column == 'vd_insertion' or column == 'dj_insertion')

        for index, count in self.counts[column].iteritems():
            gene = None
            if subset_this_column:
                region = column[0] if '_del' in column else column[1]
                assert region in utils.regions
                assert 'IGH' + region.upper() in index[1]  # NOTE this is hackey, but it works find now and will fail obviously
                gene = index[1]                            # if I ever change the correlations to be incompatible. so screw it
                if gene not in gene_values:
                    gene_values[gene] = {}

            column_val = index[0]
            if gene is not None:
                if column_val not in gene_values[gene]:
                    gene_values[gene][column_val] = 0.0
                gene_values[gene][column_val] += count
            if column_val not in values:
                values[column_val] = 0.0
            values[column_val] += count

            try:  # figure out whether this is an integer or string (only used outside this loop when we make the plots)
                int(column_val)
                var_type = 'int'
            except (TypeError, ValueError, OverflowError):  # only the ways int() itself fails, so that e.g. ctrl-c isn't swallowed
                var_type = 'string'

        if subset_this_column:
            thisplotdir = plotdir + '/' + column
            utils.prep_dir(thisplotdir + '/plots', multilings=['*.csv', '*.svg'])
            for gene in gene_values:
                plotname = utils.sanitize_name(gene) + '-' + column
                hist = plotting.make_hist_from_dict_of_counts(gene_values[gene], var_type, plotname, sort=True)
                plotting.draw(hist, var_type, plotname=plotname, plotdir=thisplotdir, errors=True, write_csv=True)
            check_call(['./bin/makeHtml', thisplotdir, '3', 'null', 'svg'])
            check_call(['./bin/permissify-www', thisplotdir])  # NOTE this should really permissify starting a few directories higher up

        plotname = column
        hist = plotting.make_hist_from_dict_of_counts(values, var_type, plotname, sort=True)
        plotting.draw(hist, var_type, plotname=plotname, plotdir=plotdir, errors=True, write_csv=True)

    self.mutefreqer.plot(plotdir, cyst_positions, tryp_positions)  #, mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv')  # REGION is replace by each region in the three output files

    if has_root:
        check_call(['./bin/makeHtml', plotdir, '3', 'null', 'svg'])
        check_call(['./bin/permissify-www', plotdir])  # NOTE this should really permissify starting a few directories higher up
    print(' parameter plot time: %.3f' % (time.time() - start))
def clean_plots(self, plotdir):
    """Prepare the output directories under <plotdir> via utils.prep_dir().

    Handles the mutation frequency subdir (delegated to <self.mfreqer>), the "overall" subdir,
    and one subdir per column of <self.counts> that is in <self.columns_to_subset_by_gene>.
    """
    self.mfreqer.clean_plots(plotdir + "/mute-freqs")
    utils.prep_dir(plotdir + "/overall")  # , multilings=('*.csv', '*.svg'))
    for column in self.counts:
        if column not in self.columns_to_subset_by_gene:
            continue
        utils.prep_dir("/".join((plotdir, column)), wildlings=["*.csv", "*.svg"])
def plot(self, plotdir):
    """Draw every column in <self.values> and every histogram in <self.hists>, then build the html.

    Boolean columns (those in <bool_columns>) are drawn as right/wrong histograms, and their
    fraction correct is printed along with its uncertainty. All other columns are drawn as
    normalized integer histograms, unless <self.only_correct_gene_fractions> is set, in which
    case they are skipped. Finishes by running the html and permission scripts on <plotdir>.
    """
    utils.prep_dir(plotdir + '/plots', wildling=None, multilings=['*.csv', '*.svg', '*.root'])

    # Set this up front: the <self.hists> loop below also passes it on, so it has to exist even
    # when the first loop never reaches its non-boolean branch (e.g. no such columns, or
    # <self.only_correct_gene_fractions> is set).
    log = ''

    for column in self.values:
        if self.only_correct_gene_fractions and column not in bool_columns:
            continue
        if column in bool_columns:
            right = self.values[column]['right']
            wrong = self.values[column]['wrong']
            errs = fraction_uncertainty.err(right, right + wrong)
            print(' %s\n correct up to allele: %4d / %-4d = %4.4f (-%.3f, +%.3f)' % (column, right, right + wrong, float(right) / (right + wrong), errs[0], errs[1]))
            hist = plotting.make_bool_hist(right, wrong, self.name + '-' + column)
            plotting.draw(hist, 'bool', plotname=column, plotdir=plotdir, write_csv=True)
        else:
            # TODO this is dumb... I should make the integer-valued ones histograms as well
            hist = plotting.make_hist_from_dict_of_counts(self.values[column], 'int', self.name + '-' + column, normalize=True)
            if column.find('hamming_to_true_naive') >= 0:
                hist.GetXaxis().SetTitle('hamming distance')
            else:
                hist.GetXaxis().SetTitle('inferred - true')
            plotting.draw(hist, 'int', plotname=column, plotdir=plotdir, write_csv=True, log=log)

    for column in self.hists:
        hist = plotting.make_hist_from_my_hist_class(self.hists[column], column)
        plotting.draw(hist, 'float', plotname=column, plotdir=plotdir, write_csv=True, log=log)

    check_call(['./bin/makeHtml', plotdir, '3', 'null', 'svg'])
    check_call(['./bin/permissify-www', plotdir])  # NOTE this should really permissify starting a few directories higher up
def plot(self, base_plotdir, only_csv=False):
    """Make the allele finding plots for the fitted positions of each v gene.

    Everything goes under <base_plotdir>/allele-finding. Any per-gene directories left over
    from a previous run are emptied and removed first; a non-directory entry in there raises an
    exception. With <only_csv> set, nothing is plotted (not implemented), but the directories
    are still cleaned. Calls self.finalize() first if that hasn't happened yet.
    """
    if not self.finalized:
        # NOTE(review): <debug> is neither a parameter nor a local here, so this relies on a name from an
        # enclosing scope -- confirm that one exists, otherwise this line raises a NameError
        self.finalize(debug=debug)

    plotdir = base_plotdir + '/allele-finding'
    csv_and_svg = ('*.csv', '*.svg')

    # has to be a bit more hackey than elsewhere, since we have no way of knowing what genes might have had
    # their own directories written last time we wrote to this dir
    for old_gene_dir in glob.glob(plotdir + '/*'):
        if not os.path.isdir(old_gene_dir):
            raise Exception('not a directory: %s' % old_gene_dir)
        utils.prep_dir(old_gene_dir, wildlings=csv_and_svg)
        os.rmdir(old_gene_dir)
    utils.prep_dir(plotdir, wildlings=csv_and_svg)

    if only_csv:  # not implemented
        return

    start = time.time()
    for gene in self.plotvals:
        if utils.get_region(gene) != 'v':
            continue
        gene_plotdir = plotdir + '/' + utils.sanitize_name(gene)
        for position in self.plotvals[gene]:
            # we can make plots for the positions we didn't fit, but there's a *lot* of them and they're slow
            if position in self.fitted_positions[gene]:
                plotting.make_allele_finding_plot(gene_plotdir, gene, position, self.plotvals[gene][position])
    print(' allele finding plot time: %.1f' % (time.time() - start))
def plot(self, plotdir, subset_by_gene=False, cyst_positions=None, tryp_positions=None):
    """Plot a histogram of the counts for each column in <self.counts> (except 'all'), then the mutation frequencies.

    If <subset_by_gene> is set, the deletion and (real) insertion columns additionally get one
    plot per gene in their own subdir of <plotdir>. Whether a column is plotted as 'int' or
    'string' is decided by whether int() accepts its values (the last value looked at wins).
    The html and permission scripts are run on <plotdir> only if <has_root> is set.
    """
    print(' plotting parameters')
    # start = time.time()
    utils.prep_dir(plotdir + '/plots')  #, multilings=('*.csv', '*.svg'))
    for column in self.counts:
        if column == 'all':
            continue
        values, gene_values = {}, {}
        if len(self.counts[column]) == 0:
            print('ERROR no counts in %s' % column)
            assert False

        # option to subset deletion and (real) insertion plots by gene
        subset_this_column = subset_by_gene and ('_del' in column or column == 'vd_insertion' or column == 'dj_insertion')

        for index, count in self.counts[column].iteritems():
            gene = None
            if subset_this_column:
                region = column[0] if '_del' in column else column[1]
                assert region in utils.regions
                assert 'IGH' + region.upper() in index[1]  # NOTE this is hackey, but it works find now and will fail obviously
                gene = index[1]                            # if I ever change the correlations to be incompatible. so screw it
                if gene not in gene_values:
                    gene_values[gene] = {}

            column_val = index[0]
            if gene is not None:
                if column_val not in gene_values[gene]:
                    gene_values[gene][column_val] = 0.0
                gene_values[gene][column_val] += count
            if column_val not in values:
                values[column_val] = 0.0
            values[column_val] += count

            try:  # figure out whether this is an integer or string (only used outside this loop when we make the plots)
                int(column_val)
                var_type = 'int'
            except (TypeError, ValueError, OverflowError):  # only the ways int() itself fails, so that e.g. ctrl-c isn't swallowed
                var_type = 'string'

        if subset_this_column:
            thisplotdir = plotdir + '/' + column
            utils.prep_dir(thisplotdir + '/plots', multilings=['*.csv', '*.svg'])
            for gene in gene_values:
                plotname = utils.sanitize_name(gene) + '-' + column
                hist = plotting.make_hist_from_dict_of_counts(gene_values[gene], var_type, plotname, sort=True)
                plotting.draw(hist, var_type, plotname=plotname, plotdir=thisplotdir, errors=True, write_csv=True)
            check_call(['./bin/makeHtml', thisplotdir, '3', 'null', 'svg'])
            check_call(['./bin/permissify-www', thisplotdir])  # NOTE this should really permissify starting a few directories higher up

        plotname = column
        hist = plotting.make_hist_from_dict_of_counts(values, var_type, plotname, sort=True)
        plotting.draw(hist, var_type, plotname=plotname, plotdir=plotdir, errors=True, write_csv=True)

    self.mutefreqer.plot(plotdir, cyst_positions, tryp_positions)  #, mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv')  # REGION is replace by each region in the three output files

    if has_root:
        check_call(['./bin/makeHtml', plotdir, '3', 'null', 'svg'])
        check_call(['./bin/permissify-www', plotdir])  # NOTE this should really permissify starting a few directories higher up
def __init__(self, args, seed, sublabel=None):
    """Set up the work dir, read the germline/parameter/gtr info, and generate the trees.

    With a <sublabel>, the work dir and output file are made specific to that subprocess.
    Raises an exception if <args.parameter_dir> doesn't exist. The tree file is read into
    <self.treeinfo> and then removed unless <args.no_clean> is set. Any existing output file
    is removed; otherwise its parent directory is created if it's missing.
    """
    self.args = args
    if sublabel is None:
        self.workdir = self.args.workdir + '/recombinator'
        self.outfname = self.args.outfname
    else:  # need a separate workdir for each subprocess
        self.workdir = self.args.workdir + '/recombinator-' + sublabel
        self.outfname = self.workdir + '/' + os.path.basename(self.args.outfname)
    utils.prep_dir(self.workdir)

    if not os.path.exists(self.args.parameter_dir):
        raise Exception('parameter dir ' + self.args.parameter_dir + ' d.n.e')

    # parameters that control recombination, erosion, and whatnot
    self.index_keys = {}  # this is kind of hackey, but I suspect indexing my huge table of freqs with a tuple is better than a dict
    self.version_freq_table = {}  # list of the probabilities with which each VDJ combo appears in data
    # self.treeinfo = []  # list of newick-formatted tree strings with region-specific branch info tacked at the end
    self.mute_models = {region: {model: {} for model in ['gtr', 'gamma']} for region in utils.regions}

    # first read info that doesn't depend on which person we're looking at
    self.glfo = utils.read_germline_set(self.args.datadir)

    # then read stuff that's specific to each person
    self.read_vdj_version_freqs(self.args.parameter_dir + '/' + utils.get_parameter_fname('all'))
    self.allowed_genes = self.get_allowed_genes(self.args.parameter_dir)  # only really used if <self.args.uniform_vj_choice_probs> is set, but it also checks the sensibility of <self.args.only_genes>
    self.insertion_content_probs = None
    self.read_insertion_content()

    # read shm info NOTE I'm not inferring the gtr parameters a.t.m., so I'm just (very wrongly) using the same ones for all individuals
    with opener('r')(self.args.gtrfname) as gtrfile:  # read gtr parameters
        for line in csv.DictReader(gtrfile):
            name_pieces = line['parameter'].split('.')
            region = name_pieces[0][3].lower()  # fourth character of the first piece
            assert region in ('v', 'd', 'j')
            model = name_pieces[1].lower()
            parameter_name = name_pieces[2]
            assert model in self.mute_models[region]
            self.mute_models[region][model][parameter_name] = line['value']

    treegen = treegenerator.TreeGenerator(args, self.args.parameter_dir, seed=seed)
    self.treefname = self.workdir + '/trees.tre'
    treegen.generate_trees(seed, self.treefname)
    with opener('r')(self.treefname) as treefile:  # read in the trees (and other info) that we just generated
        self.treeinfo = treefile.readlines()
    if not self.args.no_clean:
        os.remove(self.treefname)

    if os.path.exists(self.outfname):
        os.remove(self.outfname)
    else:
        outdir = os.path.dirname(os.path.abspath(self.outfname))
        if not os.path.exists(outdir):
            os.makedirs(outdir)
def __init__(self, args, seed, sublabel=None):
    """Set up the work dir, read the germline/parameter/gtr info, and generate the trees.

    With a <sublabel>, the work dir and output file are made specific to that subprocess.
    The parameter dir is <args.parameter_dir>, unless <args.simulate_partially_from_scratch> is
    set, in which case it's <args.scratch_mute_freq_dir>. Raises an exception if the parameter
    dir is None or doesn't exist. The tree file is read into <self.treeinfo> and then removed.
    Any existing output file is removed; otherwise its parent directory is created if missing.
    """
    self.args = args
    if sublabel is None:
        self.workdir = self.args.workdir + '/recombinator'
        self.outfname = self.args.outfname
    else:  # need a separate workdir for each subprocess
        self.workdir = self.args.workdir + '/recombinator-' + sublabel
        self.outfname = self.workdir + '/' + os.path.basename(self.args.outfname)
    utils.prep_dir(self.workdir)

    if not self.args.simulate_partially_from_scratch:
        parameter_dir = self.args.parameter_dir
    else:  # we start from scratch, except for the mute freq stuff
        parameter_dir = self.args.scratch_mute_freq_dir
    if parameter_dir is None or not os.path.exists(parameter_dir):
        # use formatting rather than '+' so that a None parameter dir gives this message instead of a TypeError
        raise Exception('parameter dir %s d.n.e' % parameter_dir)

    self.index_keys = {}  # this is kind of hackey, but I suspect indexing my huge table of freqs with a tuple is better than a dict
    self.mute_models = {}
    # self.treeinfo = []  # list of newick-formatted tree strings with region-specific branch info tacked at the end
    for region in utils.regions:
        self.mute_models[region] = {}
        for model in ['gtr', 'gamma']:
            self.mute_models[region][model] = {}

    self.glfo = glutils.read_glfo(self.args.initial_datadir, self.args.chain, only_genes=self.args.only_genes)

    self.allowed_genes = self.get_allowed_genes(parameter_dir)  # set of genes a) for which we read per-position mutation information and b) from which we choose when running partially from scratch
    self.version_freq_table = self.read_vdj_version_freqs(parameter_dir)  # list of the probabilities with which each VDJ combo (plus other rearrangement parameters) appears in data
    self.insertion_content_probs = self.read_insertion_content(parameter_dir)
    self.all_mute_freqs = {}
    self.parameter_dir = parameter_dir  # damnit, I guess I do need to save this in self

    # read shm info NOTE I'm not inferring the gtr parameters a.t.m., so I'm just (very wrongly) using the same ones for all individuals
    with opener('r')(self.args.gtrfname) as gtrfile:  # read gtr parameters
        reader = csv.DictReader(gtrfile)
        for line in reader:
            parameters = line['parameter'].split('.')
            region = parameters[0][3].lower()  # fourth character of the first piece
            assert region == 'v' or region == 'd' or region == 'j'
            model = parameters[1].lower()
            parameter_name = parameters[2]
            assert model in self.mute_models[region]
            self.mute_models[region][model][parameter_name] = line['value']

    treegen = treegenerator.TreeGenerator(args, parameter_dir, seed=seed)
    self.treefname = self.workdir + '/trees.tre'
    treegen.generate_trees(seed, self.treefname)
    with opener('r')(self.treefname) as treefile:  # read in the trees (and other info) that we just generated
        self.treeinfo = treefile.readlines()
    os.remove(self.treefname)

    if os.path.exists(self.outfname):
        os.remove(self.outfname)
    elif not os.path.exists(os.path.dirname(os.path.abspath(self.outfname))):
        os.makedirs(os.path.dirname(os.path.abspath(self.outfname)))
def run_sklearn_mds(n_components, n_clusters, seqfos, seed, reco_info=None, region=None, aligned=False, n_init=4, max_iter=300, eps=1e-3, n_jobs=-1, plotdir=None, debug=False):
    """Cluster <seqfos> with kmeans, optionally after embedding them with mds, and return the partition.

    The pairwise distance between sequences is utils.hamming_fraction(). If <n_components> is
    not None, mds is run on the distance matrix and kmeans on the resulting coordinates;
    otherwise kmeans is run directly on the distance matrix. Sequences are aligned first unless
    <aligned> is set. Raises an exception if the names in <seqfos> aren't unique. If <plotdir>
    is set, the result is plotted there, plus a second plot labeled with the true genes (for
    <region>) if <reco_info> is given.
    """
    # NOTE set <n_components> to None to run plain kmeans, without mds TODO clean this up
    start = time.time()
    assert n_clusters is not None

    # These are both slow af to import, even on local ssd, which is why they're not at the top of the file. Importing
    # them unconditionally is cheap if they're already loaded, and (unlike looking them up as attributes of
    # sys.modules['sklearn']) still works if something else imported sklearn without these submodules.
    from sklearn import manifold
    from sklearn.cluster import KMeans

    if len(set(sfo['name'] for sfo in seqfos)) != len(seqfos):
        raise Exception('duplicate sequence ids in <seqfos>')

    if not aligned:  # NOTE unlike the bios2mds version above, this modifies <seqfos>
        if debug:
            print('align')
        seqfos = utils.align_many_seqs(seqfos)

    if debug:
        print(' distances')
    # alternative that was tried, using scipy's own hamming distance:
    # translations = string.maketrans('ACGT-', '01234')
    # def convert(seq):
    #     return [int(c) for c in seq.translate(translations)]
    # converted_seqs = [convert(x['seq']) for x in seqfos]
    # similarities = scipy.spatial.distance.pdist(converted_seqs, 'hamming')
    # similarities = scipy.spatial.distance.squareform(similarities)
    similarities = scipy.spatial.distance.squareform([utils.hamming_fraction(seqfos[i]['seq'], seqfos[j]['seq']) for i in range(len(seqfos)) for j in range(i + 1, len(seqfos))])
    random_state = numpy.random.RandomState(seed=seed)

    pos = None
    if n_components is not None:
        if debug:
            print(' mds')
        mds = manifold.MDS(n_components=n_components, n_init=n_init, max_iter=max_iter, eps=eps, random_state=random_state, dissimilarity="precomputed", n_jobs=n_jobs)
        pos = mds.fit_transform(similarities)
        # pos = mds.fit(similarities).embedding_

    if debug:
        print(' kmeans clustering with %d clusters' % n_clusters)
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(pos if pos is not None else similarities)
    pcvals = {sfo['name']: (pos[iseq] if pos is not None else None) for iseq, sfo in enumerate(seqfos)}
    labels = {sfo['name']: kmeans.labels_[iseq] for iseq, sfo in enumerate(seqfos)}
    partition = utils.group_seqs_by_value(pcvals.keys(), lambda q: labels[q])

    if plotdir is not None:
        utils.prep_dir(plotdir, wildlings=['*.svg'])
        if debug:
            print(' plot')
        plot_mds(n_components, pcvals, plotdir, 'mds', partition=partition)
        if reco_info is not None:
            labels = {uid: reco_info[uid][region + '_gene'] for uid in pcvals}
            plot_mds(n_components, pcvals, plotdir, 'true-genes', labels=labels)

    if debug:
        print(' kmeans time %.1f' % (time.time() - start))

    return partition
def plot(self, plotdir, only_csv=False): print ' plotting performance', import fraction_uncertainty import plotting start = time.time() for substr in self.subplotdirs: utils.prep_dir(plotdir + '/' + substr, wildlings=('*.csv', '*.svg')) for column in self.values: if column in plotconfig.gene_usage_columns: right = self.values[column]['right'] wrong = self.values[column]['wrong'] lo, hi = fraction_uncertainty.err(right, right + wrong) hist = plotting.make_bool_hist(right, wrong, self.name + '-' + column) plotting.draw_no_root(hist, plotname=column, plotdir=plotdir + '/gene-call', write_csv=True, stats='0-bin', only_csv=only_csv) else: hist = plotting.make_hist_from_dict_of_counts(self.values[column], 'int', self.name + '-' + column, normalize=False) if 'hamming_to_true_naive' in column: xtitle = 'hamming distance' tmpplotdir = plotdir + '/mutation' else: xtitle = 'inferred - true' if 'muted' in column: tmpplotdir = plotdir + '/mutation' else: tmpplotdir = plotdir + '/boundaries' plotting.draw_no_root(hist, plotname=column, plotdir=tmpplotdir, write_csv=True, only_csv=only_csv, xtitle=xtitle, shift_overflows=True) for column in self.hists: if '_vs_mute_freq' in column or '_vs_per_gene_support' in column: # only really care about the fraction, which we plot below continue plotting.draw_no_root(self.hists[column], plotname=column, plotdir=plotdir + '/mutation', write_csv=True, only_csv=only_csv, ytitle='counts', xtitle='inferred - true', shift_overflows=True) # fraction correct vs mute freq for region in utils.regions: hright = self.hists[region + '_gene_right_vs_mute_freq'] hwrong = self.hists[region + '_gene_wrong_vs_mute_freq'] if hright.integral(include_overflows=True) == 0: continue plotting.make_fraction_plot(hright, hwrong, plotdir + '/gene-call', region + '_fraction_correct_vs_mute_freq', xlabel='mut freq', ylabel='fraction correct up to allele', xbounds=(0., 0.5), only_csv=only_csv, write_csv=True) # per-gene support stuff for region in utils.regions: if 
self.hists[region + '_allele_right_vs_per_gene_support'].integral(include_overflows=True) == 0: continue hright = self.hists[region + '_allele_right_vs_per_gene_support'] hwrong = self.hists[region + '_allele_wrong_vs_per_gene_support'] plotting.make_fraction_plot(hright, hwrong, plotdir + '/gene-call', region + '_allele_fraction_correct_vs_per_gene_support', xlabel='support', ylabel='fraction with correct allele', xbounds=(-0.1, 1.1), only_csv=only_csv, write_csv=True) if not only_csv: # write html file and fix permissiions for substr in self.subplotdirs: plotting.make_html(plotdir + '/' + substr, n_columns=4) print '(%.1f sec)' % (time.time()-start)
def __init__(self, args, glfo, seed, workdir, outfname):  # NOTE <gldir> is not in general the same as <args.initial_germline_dir>
    """Set up the work dir, choose the parameter dir, read the parameter/gtr info, and generate the trees.

    <self.parameter_dir> is <args.parameter_dir>/<args.parameter_type> unless rearranging from
    scratch, in which case it's None (if also mutating from scratch) or
    <args.scratch_mute_freq_dir>. The tree file is read into <self.treeinfo> and then removed.
    Any existing output file is removed; otherwise its parent directory is created if missing.
    """
    self.args = args
    self.glfo = glfo

    # NOTE in general *not* the same as <self.args.workdir> and <self.args.outfname>
    self.workdir = workdir
    self.outfname = outfname
    utils.prep_dir(self.workdir)

    # set <self.parameter_dir> (note that this is in general *not* the same as self.args.parameter_dir)
    if not self.args.rearrange_from_scratch:
        self.parameter_dir = self.args.parameter_dir + '/' + self.args.parameter_type
    elif self.args.mutate_from_scratch:  # currently not allowed to mutate from scratch without also rearranging from scratch (enforced in bin/partis)
        self.parameter_dir = None
    else:
        # if you make up mute freqs from scratch, unless you're really careful you tend to get nonsense results for a lot of things
        # (e.g. allele finding). So it's easier to copy over a reasonable set of mut freq parameters from somewhere.
        self.parameter_dir = self.args.scratch_mute_freq_dir

    self.index_keys = {}  # this is kind of hackey, but I suspect indexing my huge table of freqs with a tuple is better than a dict
    # self.treeinfo = []  # list of newick-formatted tree strings with region-specific branch info tacked at the end
    self.mute_models = {region: {model: {} for model in ['gtr', 'gamma']} for region in utils.regions}

    if args.allele_prevalence_fname is not None:
        self.allele_prevalence_freqs = glutils.read_allele_prevalence_freqs(args.allele_prevalence_fname)
    else:
        self.allele_prevalence_freqs = {}
    self.version_freq_table = self.read_vdj_version_freqs()  # list of the probabilities with which each VDJ combo (plus other rearrangement parameters) appears in data (none if rearranging from scratch)
    self.insertion_content_probs = self.read_insertion_content()  # dummy/uniform if rearranging from scratch
    self.all_mute_freqs = {}

    # read shm info NOTE I'm not inferring the gtr parameters a.t.m., so I'm just (very wrongly) using the same ones for all individuals
    with opener('r')(self.args.gtrfname) as gtrfile:  # read gtr parameters
        for line in csv.DictReader(gtrfile):
            name_pieces = line['parameter'].split('.')
            region = name_pieces[0][3].lower()  # fourth character of the first piece
            assert region in ('v', 'd', 'j')
            model = name_pieces[1].lower()
            parameter_name = name_pieces[2]
            assert model in self.mute_models[region]
            self.mute_models[region][model][parameter_name] = line['value']

    treegen = treegenerator.TreeGenerator(args, self.parameter_dir, seed=seed)
    self.treefname = self.workdir + '/trees.tre'
    treegen.generate_trees(seed, self.treefname)
    with opener('r')(self.treefname) as treefile:  # read in the trees (and other info) that we just generated
        self.treeinfo = treefile.readlines()
    os.remove(self.treefname)

    if os.path.exists(self.outfname):
        os.remove(self.outfname)
    else:
        outdir = os.path.dirname(os.path.abspath(self.outfname))
        if not os.path.exists(outdir):
            os.makedirs(outdir)
def plot(self, plotdir, only_csv=False): print ' plotting performance', start = time.time() for substr in self.subplotdirs: utils.prep_dir(plotdir + '/' + substr, wildlings=('*.csv', '*.svg')) for column in self.values: if column in bool_columns: right = self.values[column]['right'] wrong = self.values[column]['wrong'] lo, hi, _ = fraction_uncertainty.err(right, right + wrong) hist = plotting.make_bool_hist(right, wrong, self.name + '-' + column) plotting.draw_no_root(hist, plotname=column, plotdir=plotdir + '/gene-call', write_csv=True, stats='0-bin', only_csv=only_csv) else: hist = plotting.make_hist_from_dict_of_counts(self.values[column], 'int', self.name + '-' + column, normalize=False) if 'hamming_to_true_naive' in column: xtitle = 'hamming distance' tmpplotdir = plotdir + '/mutation' else: xtitle = 'inferred - true' if 'muted' in column: tmpplotdir = plotdir + '/mutation' else: tmpplotdir = plotdir + '/boundaries' plotting.draw_no_root(hist, plotname=column, plotdir=tmpplotdir, write_csv=True, only_csv=only_csv, xtitle=xtitle, shift_overflows=True) for column in self.hists: if '_vs_mute_freq' in column or '_vs_per_gene_support' in column: # only really care about the fraction, which we plot below continue plotting.draw_no_root(self.hists[column], plotname=column, plotdir=plotdir + '/mutation', write_csv=True, only_csv=only_csv, ytitle='counts', xtitle='inferred - true', shift_overflows=True) # fraction correct vs mute freq for region in utils.regions: hright = self.hists[region + '_gene_right_vs_mute_freq'] hwrong = self.hists[region + '_gene_wrong_vs_mute_freq'] if hright.integral(include_overflows=True) == 0: continue plotting.make_fraction_plot(hright, hwrong, plotdir + '/gene-call', region + '_fraction_correct_vs_mute_freq', xlabel='mut freq', ylabel='fraction correct up to allele', xbounds=(0., 0.5), only_csv=only_csv, write_csv=True) # per-gene support stuff for region in utils.regions: if self.hists[region + 
'_allele_right_vs_per_gene_support'].integral(include_overflows=True) == 0: continue hright = self.hists[region + '_allele_right_vs_per_gene_support'] hwrong = self.hists[region + '_allele_wrong_vs_per_gene_support'] plotting.make_fraction_plot(hright, hwrong, plotdir + '/gene-call', region + '_allele_fraction_correct_vs_per_gene_support', xlabel='support', ylabel='fraction with correct allele', xbounds=(-0.1, 1.1), only_csv=only_csv, write_csv=True) if not only_csv: # write html file and fix permissiions for substr in self.subplotdirs: plotting.make_html(plotdir + '/' + substr, n_columns=4) print '(%.1f sec)' % (time.time()-start)
def make_mean_plots(plotdir, subdirs, outdir):
    """Plot summaries of the means found in each subdir's plots/means.csv.

    Writes <outdir>/plots/hexmeans.png (hexbin of per-line variance vs
    mean) and <outdir>/plots/means.png (histogram of all the normalized
    means), then runs ./permissify-www on <outdir>.
    """
    line_means, line_variances, all_normalized = [], [], []
    for subdir in subdirs:
        with opener('r')(plotdir + '/' + subdir + '/plots/means.csv') as meanfile:
            for row in csv.DictReader(meanfile):
                # both columns hold colon-separated lists of floats
                values = [float(val) for val in row['means'].split(':')]
                line_means.append(numpy.mean(values))
                line_variances.append(numpy.var(values))
                all_normalized.extend([float(val) for val in row['normalized-means'].split(':')])

    # imported here rather than at file level; the backend is chosen before pyplot is pulled in
    import matplotlib
    matplotlib.use('Agg')
    from matplotlib import pyplot

    plot_outdir = outdir + '/plots'

    # first the hexbin plot
    pyplot.subplot(111)
    pyplot.hexbin(line_means, line_variances, gridsize=20, cmap=matplotlib.cm.gist_yarg, bins=None)
    pyplot.xlabel('mean')
    pyplot.ylabel('variance')
    colorbar = pyplot.colorbar()
    colorbar.set_label('mean value')
    utils.prep_dir(plot_outdir, multilings=['*.png', '*.svg', '*.csv'])
    pyplot.savefig(plot_outdir + '/hexmeans.png')
    pyplot.clf()

    # then the normalized mean plot
    pyplot.hist(all_normalized, 50)
    pyplot.xlabel(r'$(x_i - \mu) / \sigma_i$')
    sigma = math.sqrt(numpy.var(all_normalized))
    pyplot.title(r'$\sigma=' + str(sigma) + '$')
    pyplot.savefig(plot_outdir + '/means.png')

    check_call(['./permissify-www', outdir])  # NOTE this should really permissify starting a few directories higher up
def plot(self, plotdir, partition=None, infiles=None, annotations=None, only_csv=None):
    """Plot cluster sizes, either for one partition or averaged over files.

    Exactly one of <partition> (which also requires <annotations>) and
    <infiles> has to be given; with neither, an assertion fails. If
    <only_csv> is true no html is written.
    """
    import plotting
    print(' plotting partitions')
    sys.stdout.flush()
    t_begin = time.time()
    for subdir in self.subplotdirs:
        utils.prep_dir(plotdir + '/' + subdir, wildlings=['*.csv', '*.svg'])

    fnames = []  # handed on to plotting.make_html() at the end

    if partition is not None:  # one partition
        assert infiles is None
        assert annotations is not None
        csize_hists = {'best': plotting.get_cluster_size_hist(partition)}
        # (the within-vs-between hists are not plotted any more: self.plot_within_vs_between_hists(partition, annotations, plotdir))
        fnames.extend(self.plot_size_vs_shm(partition, annotations, plotdir))
    elif infiles is not None:  # mean over the best partition from each file
        subset_hists = []
        for fname in infiles:
            cpath = ClusterPath()
            cpath.readfile(fname)
            best_partition = cpath.partitions[cpath.i_best]
            subset_hists.append(plotting.get_cluster_size_hist(best_partition))
        csize_hists = {'best': plotting.make_mean_hist(subset_hists)}
        for ihist, hist in enumerate(subset_hists):
            hist.write(plotdir + ('/subset-%d-cluster-sizes.csv' % ihist))
    else:
        assert False

    plotting.plot_cluster_size_hists(plotdir + '/overall/cluster-sizes.svg', csize_hists, title='', log='x')
    fnames.append(['cluster-sizes.svg'])

    if not only_csv:
        for subdir in self.subplotdirs:
            plotting.make_html(plotdir + '/' + subdir, fnames=fnames, new_table_each_row=True)

    print('(%.1f sec)' % (time.time() - t_begin))
def write(self, base_outdir):
    """Write the mute freqs plus one count csv per column of <self.counts> into <base_outdir>."""
    print(' writing parameters')
    t_begin = time.time()
    utils.prep_dir(base_outdir, multilings=('*.csv', '*.svg'))

    t_mute = time.time()
    # REGION is replaced by each region in the three output files
    self.mutefreqer.write(base_outdir, mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv')
    print(' mut freq write time: %.3f' % (time.time() - t_mute))

    for column in self.counts:
        # which columns index this table, and which file it goes to
        if column == 'all':
            index = utils.index_columns
            outfname = base_outdir + '/' + utils.get_parameter_fname(column='all')
        elif '_content' in column:
            index = [column]
            outfname = base_outdir + '/' + column + '.csv'
        else:
            index = [column] + utils.column_dependencies[column]
            outfname = base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index)

        # get rid of any old file, or else make sure there is a directory to write to
        if os.path.isfile(outfname):
            os.remove(outfname)
        elif not os.path.exists(base_outdir):
            os.makedirs(base_outdir)

        with opener('w')(outfname) as outfile:
            writer = csv.DictWriter(outfile, list(index) + ['count'])
            writer.writeheader()
            for key, count in self.counts[column].iteritems():  # NOTE this will in general not be sorted
                row = {index[ikey]: val for ikey, val in enumerate(key)}
                row['count'] = count
                writer.writerow(row)

    print(' parameter write time: %.3f' % (time.time() - t_begin))
def compare_directories(args, plotdirlist, outdir):
    """Plot each variable found in any of <plotdirlist>, one hist per directory.

    The directories are labelled with the corresponding entries of
    <args.names>. A directory lacking a variable contributes a one-bin hist
    titled 'null' for it.
    """
    utils.prep_dir(outdir, wildlings=['*.png', '*.svg', '*.csv'])

    # read the hists from each directory
    hists_by_name = OrderedDict()
    varnames = set()  # every variable that appeared in any directory
    for idir, plotdir in enumerate(plotdirlist):
        name = args.names[idir]
        dirhists = get_hists_from_dir(plotdir, name)
        varnames |= set(dirhists.keys())
        hists_by_name[name] = dirhists

    # then plot each variable that turned up
    for varname in varnames:
        hlist = []
        for dirhists in hists_by_name.values():
            hlist.append(dirhists.get(varname, Hist(1, 0, 1, title='null')))
        plot_single_variable(args, varname, hlist, outdir, pathnameclues=plotdirlist[0])

    plotting.make_html(outdir, n_columns=4)
def kmeans_cluster_v_seqs(self, qr_seqs, swfo, plotdir=None, debug=False):
    """Cluster the sequences of each family with mds + kmeans.

    Args:
        qr_seqs, swfo: passed to self.get_family_groups(), which yields the
            sequence infos for each family.
        plotdir: if not None, plots for each family go to <plotdir>/<family>;
            existing subdirectories of <plotdir> are handed to
            utils.prep_dir() with rm_subdirs=True.
        debug: print the size of each family.

    Returns:
        The cluster infos from all families, largest cluster first.
    """
    if plotdir is not None:
        # os.listdir() raises if <plotdir> does not exist yet, so only look for old subdirs when it does
        old_subdirs = []
        if os.path.isdir(plotdir):
            old_subdirs = [d for d in os.listdir(plotdir) if os.path.isdir(plotdir + '/' + d)]
        utils.prep_dir(plotdir, wildlings=['*.svg'], subdirs=old_subdirs, rm_subdirs=True)

    clusterfos = []
    if debug:
        print('kmeans clustering')
        print(' seqs family')
    for family, seqfos in self.get_family_groups(qr_seqs, swfo).items():
        if debug:
            print(' %5d %s' % (len(seqfos), family))
        family_plotdir = plotdir + '/' + family if plotdir is not None else None
        partition = mds.bios2mds_kmeans_cluster(self.n_mds_components, self.XXX_n_kmeans_clusters, seqfos, self.args.workdir + '/mds', self.args.seed, reco_info=self.reco_info, region=self.region, plotdir=family_plotdir)
        # alternative implementation: mds.run_sklearn_mds(self.n_mds_components, self.XXX_n_kmeans_clusters, seqfos, self.args.seed, reco_info=self.reco_info, region=self.region, plotdir=family_plotdir)
        clusterfos += self.get_clusterfos_from_partition(partition, qr_seqs)

    clusterfos.sort(key=lambda cfo: len(cfo['seqfos']), reverse=True)
    return clusterfos
def __init__(self, args, base_plotdir):
    """Prepare the plot directories and collect the list of model files.

    The files are <args.hmmdir>/*.yaml if <args.hmmdir> is set, otherwise
    whatever utils.get_arg_list() makes of <args.infiles>; they are stored
    in <self.filelist>.

    Raises:
        Exception: if no files were found.
    """
    self.base_plotdir = base_plotdir
    self.eps_to_skip = 1e-3
    print('skipping eps %f' % self.eps_to_skip)

    for ptype in ('transitions', 'emissions'):
        utils.prep_dir(self.base_plotdir + '/' + ptype, wildlings=['*.png', '*.svg'])

    if args.hmmdir is not None:
        self.filelist = glob.glob(args.hmmdir + '/*.yaml')
    else:
        self.filelist = utils.get_arg_list(args.infiles)

    # truth test rather than len(), so that a None file list also ends in the intended error instead of a TypeError
    if not self.filelist:
        raise Exception('zero files passed to modelplotter')
def write(self, base_outdir, mean_freq_outfname):
    """Write one mute freq csv per gene, plus the mean rate files.

    The per-gene files go to <base_outdir>/mute-freqs. <mean_freq_outfname>
    has to contain the literal 'REGION', which is replaced by 'all' and by
    each region in turn to get the mean rate file names.
    """
    if not self.finalized:
        self.finalize()

    outdir = base_outdir + '/mute-freqs'
    utils.prep_dir(outdir, '*.csv')

    # four columns for each nucleotide, after the four overall ones
    fieldnames = ['position', 'mute_freq', 'lo_err', 'hi_err']
    for nuke in utils.nukes:
        fieldnames += [nuke, nuke + '_obs', nuke + '_lo_err', nuke + '_hi_err']
    fieldnames = tuple(fieldnames)

    for gene in self.counts:
        counts, freqs, plotting_info = self.counts[gene], self.freqs[gene], self.plotting_info[gene]  # <plotting_info> is looked up but not used here
        with opener('w')(outdir + '/' + utils.sanitize_name(gene) + '.csv') as outfile:
            writer = csv.DictWriter(outfile, fieldnames)
            writer.writeheader()
            for position in sorted(counts):
                pcounts, pfreqs = counts[position], freqs[position]
                row = {
                    'position': position,
                    'mute_freq': pcounts['freq'],
                    'lo_err': pcounts['freq_lo_err'],
                    'hi_err': pcounts['freq_hi_err'],
                }
                for nuke in utils.nukes:
                    row[nuke] = pfreqs[nuke]
                    row[nuke + '_obs'] = pcounts[nuke]
                    row[nuke + '_lo_err'] = pfreqs[nuke + '_lo_err']
                    row[nuke + '_hi_err'] = pfreqs[nuke + '_hi_err']
                writer.writerow(row)

    # hackey string replacement to get the per-region file names
    assert 'REGION' in mean_freq_outfname
    for label in ['all'] + list(utils.regions):
        self.mean_rates[label].write(mean_freq_outfname.replace('REGION', label))
def __init__(self, args, glfo, seed, workdir):
    """Read the simulation parameters and generate the trees.

    NOTE the <workdir> argument is not used: a fresh directory from
    tempfile.mkdtemp() becomes <self.workdir> (the original carries an
    "rm workdir" note).
    NOTE (kept from the original) <gldir> is not in general the same as
    <args.initial_germline_dir>.
    """
    self.args = args
    self.glfo = glfo

    n_v_genes = len(glfo['seqs']['v'])
    if n_v_genes > 100:  # not a great criterion, but the point is just to warn people who are simulating from data/germlines/human
        print(' note: simulating with a very large number (%d) of V genes (the use of realistic diploid sets can be controlled either by using inferred germline sets that you\'ve got lying around (--reco-parameter-dir), or with --generate-germline-set)' % n_v_genes)

    self.workdir = tempfile.mkdtemp()
    utils.prep_dir(self.workdir)

    # separate parameter dirs for rearrangement and shm; the plain one must not be set
    assert self.args.parameter_dir is None
    self.reco_parameter_dir = None
    if self.args.reco_parameter_dir is not None:
        self.reco_parameter_dir = self.args.reco_parameter_dir + '/' + self.args.parameter_type
    self.shm_parameter_dir = None
    if self.args.shm_parameter_dir is not None:
        self.shm_parameter_dir = self.args.shm_parameter_dir + '/' + self.args.parameter_type

    self.index_keys = {}  # kind of hackey, but indexing the huge table of freqs with a tuple is suspected to be better than a dict
    # one (initially empty) parameter dict per region and per mutation model
    self.mute_models = {region: {model: {} for model in ['gtr', 'gamma']} for region in utils.regions}

    if args.allele_prevalence_fname is None:
        self.allele_prevalence_freqs = {}
    else:
        self.allele_prevalence_freqs = glutils.read_allele_prevalence_freqs(args.allele_prevalence_fname)
    self.version_freq_table = self.read_vdj_version_freqs()  # probabilities with which each VDJ combo (plus other rearrangement parameters) appears in data (none if rearranging from scratch)
    self.insertion_content_probs = self.read_insertion_content()  # dummy/uniform if rearranging from scratch
    self.all_mute_freqs = {}

    # read shm info
    # NOTE the gtr parameters are not inferred a.t.m., so the same ones are (very wrongly) used for all individuals
    with open(self.args.gtrfname, 'r') as gtrfile:
        for row in csv.DictReader(gtrfile):
            name_parts = row['parameter'].split('.')  # <something with the region as 4th character>.<model>.<parameter name>
            region = name_parts[0][3].lower()
            assert region in ('v', 'd', 'j')
            model = name_parts[1].lower()
            parameter_name = name_parts[2]
            assert model in self.mute_models[region]
            self.mute_models[region][model][parameter_name] = row['value']

    tree_maker = treegenerator.TreeGenerator(args, self.shm_parameter_dir, seed=seed)
    self.treefname = self.workdir + '/trees.tre'
    tree_maker.generate_trees(seed, self.treefname, self.workdir)  # NOTE not really a newick file, since per-region branch length info is hacked on at the end of each line
    with open(self.treefname, 'r') as treefile:  # read back the trees (and other info) that were just generated
        self.treeinfo = treefile.readlines()
    os.remove(self.treefname)

    heights = {}
    for label in ['all'] + utils.regions:
        heights[label] = {'in' : [], 'out' : []}
    self.validation_values = {'heights' : heights}
def write(self, base_outdir): print " writing parameters", sys.stdout.flush() start = time.time() utils.prep_dir( base_outdir, subdirs=("hmms", "mute-freqs", "germline-sets"), wildlings=("*.csv", "*.yaml", "*.fasta") ) # it's kind of hackey to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, so it's pretty much necessary self.mfreqer.write( base_outdir + "/mute-freqs", mean_freq_outfname=base_outdir + "/REGION-mean-mute-freqs.csv" ) # REGION is replace by each region in the three output files) genes_with_counts = [g[0] for r in utils.regions for g in self.counts[r + "_gene"].keys()] glutils.write_glfo(base_outdir + "/" + glutils.glfo_dir, self.glfo, only_genes=genes_with_counts, debug=False) for column in self.counts: index = None outfname = None if column == "all": index = tuple(list(utils.index_columns) + ["cdr3_length"]) outfname = base_outdir + "/" + utils.get_parameter_fname(column="all") elif "_content" in column: index = [column] outfname = base_outdir + "/" + column + ".csv" else: index = [column] + utils.column_dependencies[column] outfname = base_outdir + "/" + utils.get_parameter_fname(column_and_deps=index) if os.path.isfile(outfname): os.remove(outfname) elif not os.path.exists(base_outdir): os.makedirs(base_outdir) with opener("w")(outfname) as outfile: out_fieldnames = list(index) out_fieldnames.append("count") out_data = csv.DictWriter(outfile, out_fieldnames) out_data.writeheader() # NOTE this will in general not be sorted for key, count in self.counts[column].iteritems(): line = {} for ic in range(len(key)): line[index[ic]] = key[ic] line["count"] = count out_data.writerow(line) print "(%.1f sec)" % (time.time() - start)
def run_sklearn_mds(n_components, n_clusters, seqfos, seed, reco_info=None, region=None, aligned=False, n_init=4, max_iter=300, eps=1e-3, n_jobs=-1, plotdir=None):
    """Cluster <seqfos> with sklearn: mds on hamming fractions, then kmeans.

    Args:
        n_components, n_init, max_iter, eps, n_jobs: passed to manifold.MDS.
        n_clusters: passed to KMeans.
        seqfos: list of dicts with 'name' and 'seq'; names must be unique.
        seed: seeds the single RandomState shared by MDS and KMeans.
        reco_info, region: if <reco_info> is given (and plots are being
            made), a second plot labelled by reco_info[uid][<region>_gene]
            is made.
        aligned: if False, the sequences are first run through
            utils.align_many_seqs().
        plotdir: if not None, plots are written there.

    Returns:
        The partition, as a list of lists of sequence names.

    Raises:
        Exception: if two entries in <seqfos> have the same name.
    """
    print('%s not testing this after moving these imports down here' % utils.color('red', 'hey'))
    # both of these are very slow to import (even on local ssd), hence not at file level
    from sklearn import manifold
    from sklearn.cluster import KMeans

    n_seqs = len(seqfos)
    if len(set(sfo['name'] for sfo in seqfos)) != n_seqs:
        raise Exception('duplicate sequence ids in <seqfos>')

    print('align')
    if not aligned:  # NOTE unlike the bios2mds version, this rebinds <seqfos>
        seqfos = utils.align_many_seqs(seqfos)

    print(' distances')
    # condensed (upper triangle, row by row) list of pairwise distances, expanded to the full square matrix
    # (an earlier attempt used scipy.spatial.distance.pdist() on integer-converted sequences)
    condensed = []
    for iseq in range(len(seqfos)):
        for jseq in range(iseq + 1, len(seqfos)):
            condensed.append(utils.hamming_fraction(seqfos[iseq]['seq'], seqfos[jseq]['seq']))
    similarities = scipy.spatial.distance.squareform(condensed)

    print(' mds')
    random_state = numpy.random.RandomState(seed=seed)
    mds = manifold.MDS(n_components=n_components, n_init=n_init, max_iter=max_iter, eps=eps, random_state=random_state, dissimilarity="precomputed", n_jobs=n_jobs)
    pos = mds.fit_transform(similarities)

    print(' kmeans')
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(pos)
    pcvals, cluster_labels = {}, {}
    for iseq, sfo in enumerate(seqfos):
        pcvals[sfo['name']] = pos[iseq]
        cluster_labels[sfo['name']] = kmeans.labels_[iseq]

    # group the names by kmeans label (should really be integrated with utils.collapse_naive_seqs()/utils.split_partition_with_criterion())
    get_label = cluster_labels.__getitem__
    partition = [list(group) for _, group in itertools.groupby(sorted(pcvals, key=get_label), key=get_label)]

    if plotdir is not None:
        utils.prep_dir(plotdir, wildlings=['*.svg'])
        print(' plot')
        plot_mds(n_components, pcvals, plotdir, 'mds', partition=partition)
        if reco_info is not None:
            true_genes = {uid : reco_info[uid][region + '_gene'] for uid in pcvals}
            plot_mds(n_components, pcvals, plotdir, 'true-genes', labels=true_genes)

    return partition
def make_mean_plots(plotdir, subdirs, outdir):
    """Summarize the means.csv file of each subdir in two plots.

    Reads <plotdir>/<subdir>/plots/means.csv for each subdir, then writes
    hexmeans.png (variance vs mean, one entry per csv line) and means.png
    (histogram of all normalized means) to <outdir>/plots and runs
    ./permissify-www on <outdir>.
    """
    meanlist = []
    variancelist = []
    normalized_means = []
    for subdir in subdirs:
        csvfname = plotdir + '/' + subdir + '/plots/means.csv'
        with opener('r')(csvfname) as meanfile:
            for row in csv.DictReader(meanfile):
                # each column is a colon-separated list of floats
                means = [float(mstr) for mstr in row['means'].split(':')]
                meanlist.append(numpy.mean(means))
                variancelist.append(numpy.var(means))
                normalized_means += [float(nmstr) for nmstr in row['normalized-means'].split(':')]

    # the backend has to be picked before pyplot is imported
    import matplotlib
    matplotlib.use('Agg')
    from matplotlib import pyplot

    # hexbin plot first
    pyplot.subplot(111)
    pyplot.hexbin(meanlist, variancelist, gridsize=20, cmap=matplotlib.cm.gist_yarg, bins=None)
    pyplot.xlabel('mean')
    pyplot.ylabel('variance')
    pyplot.colorbar().set_label('mean value')
    utils.prep_dir(outdir + '/plots', multilings=['*.png', '*.svg', '*.csv'])
    pyplot.savefig(outdir + '/plots/hexmeans.png')
    pyplot.clf()

    # then the normalized means
    pyplot.hist(normalized_means, 50)
    pyplot.xlabel(r'$(x_i - \mu) / \sigma_i$')
    title = r'$\sigma=' + str(math.sqrt(numpy.var(normalized_means))) + '$'
    pyplot.title(title)
    pyplot.savefig(outdir + '/plots/means.png')

    check_call(['./permissify-www', outdir])  # NOTE this should really permissify starting a few directories higher up
def write(self, base_outdir): print ' writing parameters', sys.stdout.flush() start = time.time() utils.prep_dir(base_outdir, subdirs=('hmms', 'mute-freqs', 'germline-sets'), wildlings=('*.csv', '*.yaml', '*.fasta')) # it's kind of hackey to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, so it's pretty much necessary self.mfreqer.write(base_outdir + '/mute-freqs', mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv') # REGION is replace by each region in the three output files) genes_with_counts = [g[0] for r in utils.regions for g in self.counts[r + '_gene'].keys()] glutils.write_glfo(base_outdir + '/' + glutils.glfo_dir, self.glfo, only_genes=genes_with_counts, debug=False) for column in self.counts: index = None outfname = None if column == 'all': index = tuple(list(utils.index_columns) + ['cdr3_length', ]) outfname = base_outdir + '/' + utils.get_parameter_fname(column='all') elif '_content' in column: index = [column,] outfname = base_outdir + '/' + column + '.csv' else: index = [column,] + utils.column_dependencies[column] outfname = base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index) if os.path.isfile(outfname): os.remove(outfname) elif not os.path.exists(base_outdir): os.makedirs(base_outdir) with open(outfname, 'w') as outfile: out_fieldnames = list(index) out_fieldnames.append('count') out_data = csv.DictWriter(outfile, out_fieldnames) out_data.writeheader() # NOTE this will in general not be sorted for key, count in self.counts[column].iteritems(): line = {} for ic in range(len(key)): line[index[ic]] = key[ic] line['count'] = count out_data.writerow(line) print '(%.1f sec)' % (time.time()-start)
def make_tree(all_genes, workdir, use_cache=False):
    """Align <all_genes> with muscle and build a tree from them with raxml.

    Args:
        all_genes: dict from gene name to sequence.
        workdir: directory in which both programs are run; it is handed to
            utils.prep_dir() first (unless <use_cache> is set).
        use_cache: if True nothing is run, and the name of the tree file
            from a previous run is returned (its existence is not checked).

    Returns:
        The path of the raxml 'result' tree file in <workdir>.

    NOTE <args> (for muscle_path and raxml_path) is not a parameter; it is
    expected to be visible at module level.
    """
    aligned_fname = workdir + '/all-aligned.fa'
    raxml_label = 'xxx'
    raxml_kinds = ['parsimonyTree', 'log', 'result', 'info', 'bestTree']
    raxml_output_fnames = ['%s/RAxML_%s.%s' % (workdir, kind, raxml_label) for kind in raxml_kinds]
    # pick the result file by its kind, not by searching for 'result' in the whole path (which goes wrong when <workdir> itself contains 'result')
    treefname = raxml_output_fnames[raxml_kinds.index('result')]
    if use_cache:  # don't re-run muscle & raxml, just use the previous run's output tree file
        return treefname

    aligned_basename = os.path.basename(aligned_fname)
    utils.prep_dir(workdir, wildlings=['*.' + raxml_label, aligned_basename, 'out', 'err', aligned_basename + '.reduced'])

    # write and align an .fa with all alleles from any gl set
    with tempfile.NamedTemporaryFile() as tmpfile:
        for name, seq in all_genes.items():
            tmpfile.write('>%s\n%s\n' % (name, seq))
        tmpfile.flush()  # muscle reads the file by name, so without this it may see an incomplete file
        cmdstr = '%s -in %s -out %s' % (args.muscle_path, tmpfile.name, aligned_fname)
        print(' %s %s' % (utils.color('red', 'run'), cmdstr))
        utils.run_cmds(get_cmdfos(cmdstr, workdir, aligned_fname), ignore_stderr=True)  # has to happen before the temporary file goes away

    # get a tree for the aligned .fa
    cmdstr = '%s -mGTRCAT -n%s -s%s -p1 -w%s' % (args.raxml_path, raxml_label, aligned_fname, workdir)
    print(' %s %s' % (utils.color('red', 'run'), cmdstr))
    utils.run_cmds(get_cmdfos(cmdstr, workdir, treefname), ignore_stderr=True)

    os.remove(aligned_fname)  # rm muscle output
    for fname in raxml_output_fnames:  # rm all the raxml outputs except the one file that is really wanted
        if fname != treefname:
            os.remove(fname)

    return treefname
def write(self, base_outdir, my_datadir=None): print ' writing parameters', sys.stdout.flush() start = time.time() utils.prep_dir(base_outdir, subdirs=('hmms', 'mute-freqs', 'germline-sets'), wildlings=('*.csv', '*.yaml', '*.fasta')) # it's kind of hackey to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, so it's pretty much necessary self.mfreqer.write(base_outdir + '/mute-freqs', mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv') # REGION is replace by each region in the three output files) genes_with_counts = [g[0] for r in utils.regions for g in self.counts[r + '_gene'].keys()] glutils.write_glfo(base_outdir + '/' + glutils.glfo_dir, self.glfo, only_genes=genes_with_counts, debug=True) for column in self.counts: index = None outfname = None if column == 'all': index = tuple(list(utils.index_columns) + ['cdr3_length', ]) outfname = base_outdir + '/' + utils.get_parameter_fname(column='all') elif '_content' in column: index = [column,] outfname = base_outdir + '/' + column + '.csv' else: index = [column,] + utils.column_dependencies[column] outfname = base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index) if os.path.isfile(outfname): os.remove(outfname) elif not os.path.exists(base_outdir): os.makedirs(base_outdir) with opener('w')(outfname) as outfile: out_fieldnames = list(index) out_fieldnames.append('count') out_data = csv.DictWriter(outfile, out_fieldnames) out_data.writeheader() # NOTE this will in general not be sorted for key, count in self.counts[column].iteritems(): line = {} for ic in range(len(key)): line[index[ic]] = key[ic] line['count'] = count out_data.writerow(line) print '(%.1f sec)' % (time.time()-start)
def plot(self, base_plotdir, cyst_positions=None, tryp_positions=None, only_csv=False):
    """Plot the per-position mute freqs of each gene, and the mean rate hists.

    Everything goes below <base_plotdir>/mute-freqs, with the per-gene
    plots in one subdirectory per region. If <cyst_positions> /
    <tryp_positions> are given, V / J gene plots get a vertical line at
    that position and a wider figure. If <only_csv> is set, the html and
    permission scripts are not run.
    """
    if not self.finalized:
        self.finalize()

    plotdir = base_plotdir + '/mute-freqs'
    csv_and_svg = ('*.csv', '*.svg')
    utils.prep_dir(plotdir + '/plots', multilings=csv_and_svg)
    for region in utils.regions:
        utils.prep_dir(plotdir + '/' + region + '/plots', multilings=csv_and_svg)
        # (the <region>-per-base plot dirs are not made any more)

    for gene in self.counts:
        counts, plotting_info = self.counts[gene], self.plotting_info[gene]  # <plotting_info> was only needed for the per-base plots
        positions = sorted(counts)
        first, last = positions[0], positions[-1]
        # one bin per position, centered on the position
        genehist = Hist(last - first + 1, first - 0.5, last + 0.5, xtitle='fixme', ytitle='fixme')
        for position in positions:
            pcounts = counts[position]
            hi_diff = abs(pcounts['freq'] - pcounts['freq_hi_err'])
            lo_diff = abs(pcounts['freq'] - pcounts['freq_lo_err'])
            genehist.set_ibin(genehist.find_bin(position), pcounts['freq'], error=0.5*(hi_diff + lo_diff))  # symmetrized error

        region = utils.get_region(gene)
        xline = None
        width = 3
        if region == 'v' and cyst_positions is not None:
            xline = cyst_positions[gene]['cysteine-position']
            width *= 3.5
        elif region == 'j' and tryp_positions is not None:
            xline = int(tryp_positions[gene])
            width *= 2
        plotting.draw_no_root(genehist, plotdir=plotdir + '/' + region, plotname=utils.sanitize_name(gene), errors=True, write_csv=True, xline=xline, figsize=[width, 3], only_csv=only_csv)
        # (the per-base plot, paramutils.make_mutefreq_plot(), needs translation to mpl)

    # make mean mute freq hists
    plotting.draw_no_root(self.mean_rates['all'], plotname='all-mean-freq', plotdir=plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True, only_csv=only_csv)
    for region in utils.regions:
        plotting.draw_no_root(self.mean_rates[region], plotname=region+'-mean-freq', plotdir=plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True, only_csv=only_csv)

    if only_csv:
        return

    # write html files and fix permissions
    check_call(['./bin/makeHtml', plotdir, '3', 'null', 'svg'])
    for region in utils.regions:
        check_call(['./bin/makeHtml', plotdir + '/' + region, '1', 'null', 'svg'])
    check_call(['./bin/permissify-www', plotdir])  # NOTE this should really permissify starting a few directories higher up
def write_vdjalign_input(self, base_infname, n_procs):
    """Write the remaining queries, split into <n_procs> fasta-style input files.

    With one process the file goes straight into <self.args.workdir>;
    otherwise each process gets its own sw-<iproc> subdirectory. Queries
    with an entry in self.info['indels'] are written with their
    'reversed_seq' in place of the input sequence.
    """
    n_queries = len(self.remaining_queries)
    n_queries_per_proc = int(math.ceil(float(n_queries) / n_procs))
    if n_procs == 1:  # double check for rounding problems or whatnot
        assert n_queries_per_proc == n_queries

    for iproc in range(n_procs):
        workdir = self.args.workdir
        if n_procs > 1:
            workdir += '/sw-' + str(iproc)
            utils.prep_dir(workdir)

        istart = iproc * n_queries_per_proc
        with opener('w')(workdir + '/' + base_infname) as sub_infile:
            # the slice simply comes up short (or empty) for the last process(es)
            for query_name in self.remaining_queries[istart : istart + n_queries_per_proc]:
                sub_infile.write('>' + query_name + ' NUKES\n')
                seq = self.input_info[query_name]['seq']
                if query_name in self.info['indels']:
                    seq = self.info['indels'][query_name]['reversed_seq']  # use the query sequence with shm insertions and deletions reversed
                sub_infile.write(seq + '\n')
def write(self, base_outdir): print ' writing parameters', sys.stdout.flush() start = time.time() utils.prep_dir(base_outdir, multilings=('*.csv', '*.svg')) # mute_start = time.time() self.mutefreqer.write(base_outdir, mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv') # REGION is replace by each region in the three output files) # print ' mut freq write time: %.3f' % (time.time() - mute_start) # print ' %d / %d cached' % (self.mutefreqer.n_cached, self.mutefreqer.n_cached + self.mutefreqer.n_not_cached) for column in self.counts: index = None outfname = None if column == 'all': index = utils.index_columns outfname = base_outdir + '/' + utils.get_parameter_fname(column='all') elif '_content' in column: index = [column,] outfname = base_outdir + '/' + column + '.csv' else: index = [column,] + utils.column_dependencies[column] outfname = base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index) if os.path.isfile(outfname): os.remove(outfname) elif not os.path.exists(base_outdir): os.makedirs(base_outdir) with opener('w')(outfname) as outfile: out_fieldnames = list(index) out_fieldnames.append('count') out_data = csv.DictWriter(outfile, out_fieldnames) out_data.writeheader() # NOTE this will in general not be sorted for key, count in self.counts[column].iteritems(): line = {} for ic in range(len(key)): line[index[ic]] = key[ic] line['count'] = count out_data.writerow(line) print '(%.1f sec)' % (time.time()-start)
def split_input(self, n_procs, infname=None, info=None, prefix='sub'):
    """
    Split some input into <n_procs> pieces of (at most) equal size.

    If <infname> is specified, split the csv info from it into <n_procs>
    input files (each with the original header and base name) in
    subdirectories labelled with '<prefix>-' within <self.args.workdir>,
    and return None.
    If <info> is specified, instead split the list <info> into pieces and
    return a list of the resulting lists.
    Exactly one of <infname> and <info> has to be specified.
    """
    fieldnames = None
    if info is None:
        assert infname is not None
        with opener('r')(infname) as infile:
            reader = csv.DictReader(infile)
            info = [line for line in reader]
            fieldnames = reader.fieldnames  # grab the header while the file is still open
    else:
        assert infname is None  # make sure only *one* of 'em is specified

    n_queries_per_proc = int(math.ceil(float(len(info)) / n_procs))
    outlists = []
    for iproc in range(n_procs):
        # slicing just comes up short (or empty) for the last process(es)
        chunk = info[iproc * n_queries_per_proc : (iproc + 1) * n_queries_per_proc]
        if infname is None:
            outlists.append(list(chunk))
            continue

        subworkdir = self.args.workdir + '/' + prefix + '-' + str(iproc)
        utils.prep_dir(subworkdir)
        # context manager, so that each sub file is flushed and closed before the function returns
        with opener('w')(subworkdir + '/' + os.path.basename(infname)) as sub_outfile:
            writer = csv.DictWriter(sub_outfile, fieldnames)
            writer.writeheader()
            for line in chunk:
                writer.writerow(line)

    if infname is None:
        return outlists
def finalize_tigger(self):
    """Run self.tigger_calcs() on every position of each V gene, printing as it goes.

    NOTE(review): this looks like work-in-progress debugging code. Unless
    asserts are disabled, execution stops at the unconditional
    `assert False`, so the second loop is not reached as written.
    """
    # NOTE(review): os.getenv('www') is None when the variable is not set, in which case this concatenation raises
    utils.prep_dir(os.getenv('www') + '/partis/tmp', wildling='*.svg')
    for gene in self.counts:
        if utils.get_region(gene) != 'v':  # only V genes are looked at
            continue
        print '\n%s' % gene
        print ' position x-icpt y-icpt slope mut / total'
        mean_x_icpt = {'sum' : 0., 'total' : 0.}  # handed to every tigger_calcs() call for this gene; presumably filled in there -- TODO confirm
        for position in sorted(self.counts[gene].keys()):
            self.freqs[gene][position]['tigger'] = self.tigger_calcs(position, self.counts[gene][position], mean_x_icpt)
        print mean_x_icpt
        if mean_x_icpt['total'] > 0.:  # avoid dividing by zero
            print mean_x_icpt['sum'] / mean_x_icpt['total']

    # NOTE(review): placement after (rather than inside) the loop above is an assumption, confirm against the intended nesting
    assert False
    for gene in self.freqs:
        if utils.get_region(gene) != 'v':
            continue
        info = {p : self.freqs[gene][p]['tigger-fits'] for p in self.freqs[gene]}
        # x intercept of each fitted line, skipping fits without an intercept or with a y intercept of 0.3 or more
        x_intercepts = [-v['intercept'] / v['slope'] for k, v in info.items() if v['intercept'] is not None and v['intercept'] < 0.3]
        print sorted(x_intercepts)
        print sum(x_intercepts) / float(len(x_intercepts))
        print numpy.median(x_intercepts)
def write_vdjalign_input(self, base_infname, n_procs):
    """Split the remaining queries over <n_procs> fasta-style input files.

    The file for each process is called <base_infname>; it goes into
    <self.args.workdir> if there is only one process, and into the
    sw-<iproc> subdirectory of it otherwise. The sequence written for a
    query is its 'reversed_seq' if it is in self.info['indels'], and its
    input sequence if not.
    """
    queries = self.remaining_queries
    n_queries_per_proc = int(math.ceil(float(len(queries)) / n_procs))
    if n_procs == 1:  # double check for rounding problems or whatnot
        assert n_queries_per_proc == len(queries)

    for iproc in range(n_procs):
        workdir = self.args.workdir
        if n_procs > 1:
            workdir += '/sw-' + str(iproc)
            utils.prep_dir(workdir)

        # this process's range of queries, cut off at the end of the list
        ifirst = iproc * n_queries_per_proc
        istop = min(ifirst + n_queries_per_proc, len(queries))
        with opener('w')(workdir + '/' + base_infname) as sub_infile:
            for iquery in range(ifirst, istop):
                query_name = queries[iquery]
                sub_infile.write('>' + query_name + ' NUKES\n')
                seq = self.input_info[query_name]['seq']
                if query_name in self.info['indels']:
                    seq = self.info['indels'][query_name]['reversed_seq']  # use the query sequence with shm insertions and deletions reversed
                sub_infile.write(seq + '\n')
def __init__(self, germlines, plotdir, name):
    """ Store the arguments, prepare <plotdir>/plots, and set up the (initially empty) value and hist containers. """
    self.germlines, self.plotdir, self.name = germlines, plotdir, name

    utils.prep_dir(self.plotdir + '/plots', wildling=None, multilings=['*.csv', '*.svg', '*.root'])

    self.values = {}
    for column in utils.index_columns:
        if column != 'cdr3_length':  # kind of finicky to figure out what this is, so I don't always set it
            # boolean columns start with right/wrong counters, everything else starts empty
            self.values[column] = {'right' : 0, 'wrong' : 0} if column in bool_columns else {}

    self.values['hamming_to_true_naive'] = {}
    for region in utils.regions:
        for suffix in ('', '_normed'):
            self.values[region + '_hamming_to_true_naive' + suffix] = {}

    self.hists = {'mute_freqs' : Hist(30, -0.05, 0.05)}
def __init__(self, args, base_plotdir): self.base_plotdir = base_plotdir self.eps_to_skip = 1e-3 print 'skipping eps %f' % self.eps_to_skip plot_types = ('transitions', 'emissions') for ptype in plot_types: plotdir = self.base_plotdir + '/' + ptype utils.prep_dir(plotdir, wildlings=['*.png', '*.svg']) if args.hmmdir != None: filelist = glob.glob(args.hmmdir + '/*.yaml') else: filelist = utils.get_arg_list(args.infiles) if len(filelist) == 0: raise Exception('zero files passed to modelplotter') for infname in filelist: gene_name = os.path.basename(infname).replace('.yaml', '') # the sanitized name, actually with open(infname) as infile: model = yaml.load(infile) self.make_transition_plot(gene_name, model) self.make_emission_plot(gene_name, model)
def write_vdjalign_input(self, base_infname):
    """
    Write the queries in <self.input_info>, split into <self.args.n_procs> contiguous chunks, to files named <base_infname>.

    With one process the file goes directly in <self.args.workdir>, otherwise chunk <iproc> goes in
    the subdirectory 'sw-<iproc>'. Each query is written as a '>' name line followed by its sequence.
    """
    # first make a list of query names so we can iterate over an ordered collection
    ordered_info = list(self.input_info)
    n_queries = len(ordered_info)
    n_queries_per_proc = int(math.ceil(float(n_queries) / self.args.n_procs))
    if self.args.n_procs == 1:  # double check for rounding problems or whatnot
        assert n_queries_per_proc == n_queries

    for iproc in range(self.args.n_procs):
        workdir = self.args.workdir
        if self.args.n_procs > 1:
            workdir += '/sw-' + str(iproc)
        utils.prep_dir(workdir)
        infname = workdir + '/' + base_infname
        with opener('w')(infname) as sub_infile:
            # slicing past the end just gives a short (or empty) chunk
            for query_name in ordered_info[iproc * n_queries_per_proc : (iproc + 1) * n_queries_per_proc]:
                sub_infile.write('>' + str(query_name) + ' NUKES\n')
                sub_infile.write(self.input_info[query_name]['seq'] + '\n')
def write_hmms(self, parameter_dir, sw_matches): print 'writing hmms with info from %s' % parameter_dir start = time.time() from hmmwriter import HmmWriter hmm_dir = parameter_dir + '/hmms' utils.prep_dir(hmm_dir, '*.yaml') gene_list = self.args.only_genes if gene_list == None: # if specific genes weren't specified, do the ones for which we have matches gene_list = [] for region in utils.regions: for gene in self.germline_seqs[region]: if sw_matches == None or gene in sw_matches: # shouldn't be None really, but I'm testing something gene_list.append(gene) for gene in gene_list: if self.args.debug: print ' %s' % utils.color_gene(gene) writer = HmmWriter(parameter_dir, hmm_dir, gene, self.args.naivety, self.germline_seqs[utils.get_region(gene)][gene], self.args) writer.write() print ' time to write hmms: %.3f' % (time.time()-start)
def __init__(self, args, base_plotdir): self.base_plotdir = base_plotdir self.eps_to_skip = 1e-3 print 'skipping eps %f' % self.eps_to_skip plot_types = ('transitions', 'emissions') for ptype in plot_types: plotdir = self.base_plotdir + '/' + ptype utils.prep_dir(plotdir, wildlings=['*.png', '*.svg']) if args.hmmdir != None: filelist = glob.glob(args.hmmdir + '/*.yaml') else: filelist = utils.get_arg_list(args.infiles) if len(filelist) == 0: raise Exception('zero files passed to modelplotter') for infname in filelist: gene_name = os.path.basename(infname).replace( '.yaml', '') # the sanitized name, actually with open(infname) as infile: model = yaml.load(infile) self.make_transition_plot(gene_name, model) self.make_emission_plot(gene_name, model)
def write_hmms(self, parameter_dir, sw_matches): print 'writing hmms with info from %s' % parameter_dir start = time.time() from hmmwriter import HmmWriter hmm_dir = parameter_dir + '/hmms' utils.prep_dir(hmm_dir, '*.yaml') gene_list = self.args.only_genes if gene_list == None: # if specific genes weren't specified, do the ones for which we have matches gene_list = [] for region in utils.regions: for gene in self.germline_seqs[region]: if sw_matches == None or gene in sw_matches: # shouldn't be None really, but I'm testing something gene_list.append(gene) for gene in gene_list: if self.args.debug: print ' %s' % utils.color_gene(gene) writer = HmmWriter( parameter_dir, hmm_dir, gene, self.args.naivety, self.germline_seqs[utils.get_region(gene)][gene], self.args) writer.write() print ' time to write hmms: %.3f' % (time.time() - start)
def write_vdjalign_input(self, base_infname):
    """
    Split the queries in <self.input_info> into <self.args.n_procs> contiguous chunks and write each chunk to a file named <base_infname>.

    The file goes in <self.args.workdir> if there's only one process, otherwise in its 'sw-<iproc>'
    subdirectory. Each query takes two lines: '>' plus the name, then the sequence.
    """
    # first make a list of query names so we can iterate over an ordered collection
    ordered_info = list(self.input_info)

    n_procs = self.args.n_procs
    n_queries_per_proc = int(math.ceil(float(len(ordered_info)) / n_procs))
    if n_procs == 1:  # double check for rounding problems or whatnot
        assert n_queries_per_proc == len(ordered_info)

    for iproc in range(n_procs):
        workdir = self.args.workdir
        if n_procs > 1:
            workdir += '/sw-' + str(iproc)
        utils.prep_dir(workdir)
        chunk = ordered_info[iproc * n_queries_per_proc : (iproc + 1) * n_queries_per_proc]  # short or empty for the last proc(s)
        with opener('w')(workdir + '/' + base_infname) as sub_infile:
            for query_name in chunk:
                sub_infile.write('>' + str(query_name) + ' NUKES\n')
                sub_infile.write(self.input_info[query_name]['seq'] + '\n')
def split_input(self, n_procs, infname=None, info=None, prefix='sub'):
    """
    Divide some queries among <n_procs> processes, in contiguous pieces.

    If <infname> is specified split the csv info from it into <n_procs> input files in
    subdirectories labelled with '<prefix>-<iproc>' within <self.args.workdir>; returns None.
    If <info> is specified, instead split the list <info> into pieces and return a list
    of the resulting lists.
    It's an error (assertion) to specify both, or neither.
    """
    if info is None:
        assert infname is not None
        info = []
        with opener('r')(infname) as infile:
            reader = csv.DictReader(infile)
            for line in reader:
                info.append(line)
            fieldnames = reader.fieldnames  # read while the input file is still open
    else:
        assert infname is None  # make sure only *one* of 'em is specified

    n_queries_per_proc = int(math.ceil(float(len(info)) / n_procs))  # rounded up, so later pieces may be short

    def piece(iproc):
        """ Return the list of entries of <info> that belong to process <iproc>. """
        ilast = min((iproc + 1) * n_queries_per_proc, len(info))
        return [info[iq] for iq in range(iproc * n_queries_per_proc, ilast)]

    if infname is None:
        return [piece(iproc) for iproc in range(n_procs)]

    for iproc in range(n_procs):
        subworkdir = self.args.workdir + '/' + prefix + '-' + str(iproc)
        utils.prep_dir(subworkdir)
        # <with> makes sure the sub file actually gets closed (and so flushed to disk)
        with opener('w')(subworkdir + '/' + os.path.basename(infname)) as sub_outfile:
            writer = csv.DictWriter(sub_outfile, fieldnames)
            writer.writeheader()
            for line in piece(iproc):
                writer.writerow(line)
def plot(self, base_plotdir, cyst_positions=None, tryp_positions=None):
    """
    Plot the per-position mutation frequency for each gene, plus the mean-frequency hists, within <base_plotdir>/mute-freqs.

    <cyst_positions>: if set, an x line is drawn at cyst_positions[gene]['cysteine-position'] for each v gene
    <tryp_positions>: if set, an x line is drawn at int(tryp_positions[gene]) for each j gene
    Calls self.finalize() first if that hasn't happened yet, and finishes by running the makeHtml
    and permissify-www scripts on the plot dirs.
    """
    if not self.finalized:
        self.finalize()

    plotdir = base_plotdir + '/mute-freqs'
    utils.prep_dir(plotdir + '/plots', multilings=('*.csv', '*.svg'))
    for region in utils.regions:
        utils.prep_dir(plotdir + '/' + region + '/plots', multilings=('*.csv', '*.svg'))
        utils.prep_dir(plotdir + '/' + region + '-per-base/plots', multilings=('*.csv', '*.png'))

    for gene in self.counts:
        counts, plotting_info = self.counts[gene], self.plotting_info[gene]
        gene_region, gene_name = utils.get_region(gene), utils.sanitize_name(gene)
        sorted_positions = sorted(counts)
        first_pos, last_pos = sorted_positions[0], sorted_positions[-1]
        # one bin per position, centered on the position
        hist = TH1D('hist_' + gene_name, '', last_pos - first_pos + 1, first_pos - 0.5, last_pos + 0.5)
        for position in sorted_positions:
            pcounts = counts[position]
            ibin = hist.FindBin(position)
            hist.SetBinContent(ibin, pcounts['freq'])
            hi_diff = abs(pcounts['freq'] - pcounts['freq_hi_err'])
            lo_diff = abs(pcounts['freq'] - pcounts['freq_lo_err'])
            hist.SetBinError(ibin, 0.5*(hi_diff + lo_diff))  # symmetrize the error by averaging the two sides

        xline = None
        if gene_region == 'v' and cyst_positions is not None:
            xline = cyst_positions[gene]['cysteine-position']
        elif gene_region == 'j' and tryp_positions is not None:
            xline = int(tryp_positions[gene])
        plotting.draw(hist, 'int', plotdir=plotdir + '/' + gene_region, plotname=gene_name, errors=True, write_csv=True, xline=xline, draw_str='e')  #, cwidth=4000, cheight=1000)
        paramutils.make_mutefreq_plot(plotdir + '/' + gene_region + '-per-base', gene_name, plotting_info)

    # make mean mute freq hists
    hist = plotting.make_hist_from_my_hist_class(self.mean_rates['all'], 'all-mean-freq')
    plotting.draw(hist, 'float', plotname='all-mean-freq', plotdir=plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True)
    for region in utils.regions:
        hist = plotting.make_hist_from_my_hist_class(self.mean_rates[region], region+'-mean-freq')
        plotting.draw(hist, 'float', plotname=region+'-mean-freq', plotdir=plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True)

    # then write html file and fix permissions
    check_call(['./bin/makeHtml', plotdir, '3', 'null', 'svg'])
    for region in utils.regions:
        check_call(['./bin/makeHtml', plotdir + '/' + region, '1', 'null', 'svg'])
        check_call(['./bin/makeHtml', plotdir + '/' + region + '-per-base', '1', 'null', 'png'])
    check_call(['./bin/permissify-www', plotdir])  # NOTE this should really permissify starting a few directories higher up
def plot(self, base_plotdir, cyst_positions=None, tryp_positions=None, only_csv=False):
    """
    Draw the per-position mutation frequency for each gene, and the mean-frequency hists, within <base_plotdir>/mute-freqs.

    <cyst_positions>, <tryp_positions>: if set, cyst_positions[gene] (v genes) or tryp_positions[gene] (j genes) is passed as the x line, and the figure is made wider
    <only_csv>: forwarded to the drawing functions; if set, the html isn't written
    """
    if not self.finalized:
        self.finalize()

    plotdir = base_plotdir + '/mute-freqs'
    overall_plotdir = plotdir + '/overall'
    csv_and_svg = ('*.csv', '*.svg')
    utils.prep_dir(overall_plotdir, multilings=csv_and_svg)
    for region in utils.regions:
        utils.prep_dir(plotdir + '/' + region, multilings=csv_and_svg)
    if self.tigger:
        utils.prep_dir(plotdir + '/tigger', multilings=csv_and_svg)

    for gene in self.freqs:
        gene_freqs = self.freqs[gene]
        positions = sorted(gene_freqs.keys())
        lo_pos, hi_pos = positions[0], positions[-1]
        genehist = Hist(hi_pos - lo_pos + 1, lo_pos - 0.5, hi_pos + 0.5, xtitle='fixme', ytitle='fixme')  #, title=utils.sanitize_name(gene))
        for position in positions:
            pfreq = gene_freqs[position]
            # average the distances to the upper and lower bounds to get a symmetric error
            err = 0.5*(abs(pfreq['freq'] - pfreq['freq_hi_err']) + abs(pfreq['freq'] - pfreq['freq_lo_err']))
            genehist.set_ibin(genehist.find_bin(position), pfreq['freq'], error=err)

        gene_region = utils.get_region(gene)
        xline, width_factor = None, 1
        if gene_region == 'v' and cyst_positions is not None:
            xline, width_factor = cyst_positions[gene], 3.5
        elif gene_region == 'j' and tryp_positions is not None:
            xline, width_factor = tryp_positions[gene], 2
        plotting.draw_no_root(genehist, plotdir=plotdir + '/' + gene_region, plotname=utils.sanitize_name(gene), errors=True, write_csv=True,
                              xline=xline, figsize=[3 * width_factor, 3], only_csv=only_csv)

    # make mean mute freq hists
    mean_kwargs = {'plotdir' : overall_plotdir, 'stats' : 'mean', 'bounds' : (0.0, 0.4), 'write_csv' : True, 'only_csv' : only_csv}
    plotting.draw_no_root(self.mean_rates['all'], plotname='all-mean-freq', **mean_kwargs)
    for region in utils.regions:
        plotting.draw_no_root(self.mean_rates[region], plotname=region+'-mean-freq', **mean_kwargs)

    if self.tigger:
        self.tigger_plot(only_csv)

    if only_csv:
        return
    # write html files
    plotting.make_html(overall_plotdir)
    for region in utils.regions:
        plotting.make_html(plotdir + '/' + region, n_columns=1)
def __init__(self, args, seed, sublabel=None, total_length_from_right=-1):
    """
    Set up the work dir and output file, then read the germline, parameter, and (if <args.naivety> is 'M') shm and tree info.

    <seed>: passed on to the tree generator
    <sublabel>: if set, the work dir is '<args.workdir>/recombinator-<sublabel>' and the output file goes inside it
    <total_length_from_right>: stored on self (see the comment where it's set)
    Raises an Exception if <args.parameter_dir> doesn't exist.
    """
    self.args = args

    if sublabel == None:
        self.workdir = self.args.workdir + '/recombinator'
        self.outfname = self.args.outfname
    else:  # need a separate workdir for each subprocess
        self.workdir = self.args.workdir + '/recombinator-' + sublabel
        self.outfname = self.workdir + '/' + os.path.basename(self.args.outfname)

    utils.prep_dir(self.workdir)
    if not os.path.exists(self.args.parameter_dir):
        raise Exception('ERROR ' + self.args.parameter_dir + ' d.n.e')

    # parameters that control recombination, erosion, and whatnot
    self.total_length_from_right = total_length_from_right  # measured from right edge of j, only write to file this much of the sequence (our read lengths are 130 by this def'n a.t.m.)

    self.all_seqs = {}  # all the Vs, all the Ds...
    self.index_keys = {}  # this is kind of hackey, but I suspect indexing my huge table of freqs with a tuple is better than a dict
    self.version_freq_table = {}  # list of the probabilities with which each VDJ combo appears in data
    self.mute_models = {}
    # self.treeinfo = []  # list of newick-formatted tree strings with region-specific branch info tacked at the end
    for region in utils.regions:  # an empty parameter dict for each region and model, filled from the gtr file below
        self.mute_models[region] = {}
        for model in ['gtr', 'gamma']:
            self.mute_models[region][model] = {}

    # first read info that doesn't depend on which person we're looking at
    self.all_seqs = utils.read_germlines(self.args.datadir)
    with opener('r')(self.args.datadir + '/v-meta.json') as json_file:  # get location of <begin> cysteine in each v region
        self.cyst_positions = json.load(json_file)
    with opener('r')(self.args.datadir + '/j_tryp.csv') as csv_file:  # get location of <end> tryptophan in each j region (TGG)
        tryp_reader = csv.reader(csv_file)
        self.tryp_positions = {row[0]: row[1] for row in tryp_reader}  # WARNING: this doesn't filter out the header line (and the values stay as strings)

    # then read stuff that's specific to each person
    self.read_vdj_version_freqs(self.args.parameter_dir + '/' + utils.get_parameter_fname('all'))
    self.read_insertion_content()
    if self.args.naivety == 'M':  # read shm info if non-naive is requested
        # NOTE I'm not inferring the gtr parameters a.t.m., so I'm just (very wrongly) using the same ones for all individuals
        with opener('r')(self.args.gtrfname) as gtrfile:  # read gtr parameters
            reader = csv.DictReader(gtrfile)
            for line in reader:
                # the 'parameter' column is dot-separated: the fourth character of the first piece is the region, the second piece is the model, and the third is the parameter name
                parameters = line['parameter'].split('.')
                region = parameters[0][3].lower()
                assert region == 'v' or region == 'd' or region == 'j'
                model = parameters[1].lower()
                parameter_name = parameters[2]
                assert model in self.mute_models[region]
                self.mute_models[region][model][parameter_name] = line['value']
        # NOTE(review): the tree generation is assumed to sit inside the naivety == 'M' branch -- confirm the nesting
        treegen = treegenerator.TreeGenerator(args, self.args.parameter_dir, seed=seed)
        self.treefname = self.workdir + '/trees.tre'
        treegen.generate_trees(seed, self.treefname)
        with opener('r')(self.treefname) as treefile:  # read in the trees (and other info) that we just generated
            self.treeinfo = treefile.readlines()
        if not self.args.no_clean:
            os.remove(self.treefname)

    # start from a clean slate: remove any existing output file, or else make sure its directory exists
    if os.path.exists(self.outfname):
        os.remove(self.outfname)
    elif not os.path.exists(os.path.dirname(os.path.abspath(self.outfname))):
        os.makedirs(os.path.dirname(os.path.abspath(self.outfname)))
def clean_plots(self, plotdir):
    """ Call utils.prep_dir(), with the csv and svg wildcards, on each of <self.subplotdirs> within <plotdir>. """
    wildcards = ('*.csv', '*.svg')
    for subdir in self.subplotdirs:
        subplotdir = plotdir + '/' + subdir
        utils.prep_dir(subplotdir, wildlings=wildcards)
default='partis') parser.add_argument('--overwrite', action='store_true') parser.add_argument('--igbdir', default='./packages/ncbi-igblast-1.6.1/bin') parser.add_argument('--glfo-dir') parser.add_argument('--simulation-germline-dir') parser.add_argument('--locus', default='igh') parser.add_argument('--region', default='v') parser.add_argument('--species', default='human') parser.add_argument('--slurm', action='store_true') parser.add_argument('--changeo-path', default=os.getenv('HOME') + '/.local') parser.add_argument('--condapath', default=os.getenv('HOME') + '/miniconda3/bin') args = parser.parse_args() # ---------------------------------------------------------------------------------------- outdir = os.path.dirname( args.outfname ) # kind of annoying having <args.workdir> and <outdir>, but the former is for stuff we don't want to keep (not much... maybe just .cmd file), and the latter is for stuff we do assert outdir.split('/')[-1] == args.locus outdir = outdir.rstrip('/' + args.locus) utils.prep_dir(args.workdir, wildlings=['*.cmd', '*.fa', '*.sh']) #'*.fmt7']) utils.prep_dir(outdir, allow_other_files=True) initialize_germline_info(outdir) # deal with igblast germline crap outfname = run_alignment( args, outdir) # get the alignments, either with igblast or partis run_tigger(outfname, args.outfname, outdir) os.rmdir(args.workdir)
utils.print_reco_event(annotations[uid]) n_above_cutoff = len( [_ for cfo in chfo.values() if cfo['max_abs_diff'] > args.cutoff]) chimeric_fraction = n_above_cutoff / float(len(chfo)) print ' %d / %d = %.3f above chimeric cutoff' % (n_above_cutoff, len(chfo), chimeric_fraction) hmaxval = Hist(45, 0., 0.65) for uid in annotations: hmaxval.fill(chfo[uid]['max_abs_diff']) himax = Hist(75, 0., 400) for uid in annotations: himax.fill(chfo[uid]['imax']) utils.prep_dir(args.plotdir, wildlings=['*.svg', '*.csv']) import matplotlib from matplotlib import pyplot as plt fig, ax = plotting.mpl_init() xvals, yvals = zip(*[(v['imax'], v['max_abs_diff']) for v in chfo.values()]) plt.scatter(xvals, yvals, alpha=0.4) print 'writing to %s' % args.plotdir plotting.mpl_finish(ax, args.plotdir, 'hexbin', title=args.title, xlabel='break point', ylabel='abs mfreq diff')
def __init__(self, args, glfo, seed, workdir, outfname):  # NOTE <gldir> is not in general the same as <args.initial_germline_dir> # rm workdir
    """
    Set up a temporary work dir, read the rearrangement and shm parameters, and generate the trees needed for simulation.

    <glfo>: stored on self
    <seed>: passed on to the tree generator
    <workdir>: not used in this function -- a fresh temporary directory is made instead (see the 'rm workdir' note on the signature)
    <outfname>: output file; removed if it already exists, otherwise its directory is created if necessary
    """
    self.args = args
    self.glfo = glfo

    # NOTE in general *not* the same as <self.args.workdir> and <self.args.outfname>
    self.workdir = tempfile.mkdtemp()
    self.outfname = outfname
    utils.prep_dir(self.workdir)

    # set <self.parameter_dir> (note that this is in general *not* the same as self.args.parameter_dir)
    if self.args.rearrange_from_scratch:  # currently not allowed to mutate from scratch without also rearranging from scratch (enforced in bin/partis)
        if self.args.mutate_from_scratch:
            self.parameter_dir = None
        else:
            self.parameter_dir = self.args.scratch_mute_freq_dir  # if you make up mute freqs from scratch, unless you're really careful you tend to get nonsense results for a lot of things (e.g. allele finding). So it's easier to copy over a reasonable set of mut freq parameters from somewhere.
    else:
        self.parameter_dir = self.args.parameter_dir + '/' + self.args.parameter_type

    self.index_keys = {}  # this is kind of hackey, but I suspect indexing my huge table of freqs with a tuple is better than a dict
    self.mute_models = {}
    # self.treeinfo = []  # list of newick-formatted tree strings with region-specific branch info tacked at the end
    for region in utils.regions:  # an empty parameter dict for each region and model, filled from the gtr file below
        self.mute_models[region] = {}
        for model in ['gtr', 'gamma']:
            self.mute_models[region][model] = {}

    self.allele_prevalence_freqs = glutils.read_allele_prevalence_freqs(args.allele_prevalence_fname) if args.allele_prevalence_fname is not None else {}
    self.version_freq_table = self.read_vdj_version_freqs()  # list of the probabilities with which each VDJ combo (plus other rearrangement parameters) appears in data (none if rearranging from scratch)
    self.insertion_content_probs = self.read_insertion_content()  # dummy/uniform if rearranging from scratch
    self.all_mute_freqs = {}

    # read shm info NOTE I'm not inferring the gtr parameters a.t.m., so I'm just (very wrongly) using the same ones for all individuals
    with open(self.args.gtrfname, 'r') as gtrfile:  # read gtr parameters
        reader = csv.DictReader(gtrfile)
        for line in reader:
            # the 'parameter' column is dot-separated: the fourth character of the first piece is the region, the second piece is the model, and the third is the parameter name
            parameters = line['parameter'].split('.')
            region = parameters[0][3].lower()
            assert region == 'v' or region == 'd' or region == 'j'
            model = parameters[1].lower()
            parameter_name = parameters[2]
            assert model in self.mute_models[region]
            self.mute_models[region][model][parameter_name] = line['value']
    treegen = treegenerator.TreeGenerator(args, self.parameter_dir, seed=seed)
    self.treefname = self.workdir + '/trees.tre'
    treegen.generate_trees(seed, self.treefname)  # NOTE not really a newick file, since I hack on the per-region branch length info at the end of each line
    with open(self.treefname, 'r') as treefile:  # read in the trees (and other info) that we just generated
        self.treeinfo = treefile.readlines()
    os.remove(self.treefname)  # everything we need is in <self.treeinfo> now

    # start from a clean slate: remove any existing output file, or else make sure its directory exists
    if os.path.exists(self.outfname):
        os.remove(self.outfname)
    elif not os.path.exists(os.path.dirname(os.path.abspath(self.outfname))):
        os.makedirs(os.path.dirname(os.path.abspath(self.outfname)))

    # empty 'in' and 'out' lists for 'all' and for each region
    self.validation_values = {'heights': {t: {'in': [], 'out': []} for t in ['all'] + utils.regions}}
def get_gls_gen_annotation_performance_plots(args, region, baseoutdir):
    """
    Compare the annotation performance hists of the methods in <args.methods>, for each test in range(<args.iteststart>, <args.n_tests>).

    For each test the per-method hists are read from csv files and drawn on top of each other; the hists
    from all tests are then summed into 'total-' plots, and the mean (over tests) of each hist's mean is printed.
    If <args.only_print> is set nothing is drawn and no directories are prepared.
    <region>: only 'v' is implemented
    Raises an Exception if any of the input hist directories doesn't exist.
    """
    assert region == 'v'  # needs to be implemented
    import plotting
    import plotconfig
    methcolors = {  # NOTE started from scolors in bin/plot-gl-set-trees.py htmlcolorcods.com, and slide each one a little rightward
        'tigger-default' : '#dd4d39',
        'igdiscover' : '#55ab7a', #60ac84',
        'partis' : '#6b83ca', #758bcd',
        'full' : '#858585',
    }
    lstyledict = {}  # 'tigger-default' : '--'}
    lwdict = {'full' : 9, 'igdiscover' : 8, 'partis' : 5, 'tigger-default' : 2}  # methods are sorted below, so it's always [full, igdiscover, partis, tigger]
    # per-method drawing styles, in the same order as <args.methods>
    linewidths = [lwdict[m] for m in args.methods]
    colors = [methcolors[meth] for meth in args.methods]
    linestyles = [lstyledict.get(m, '-') for m in args.methods]
    alphas = [0.8 if m in ['full', 'igdiscover'] else 1 for m in args.methods]

    varname = args.action
    varval = 'simu'
    plotnames = ['v_hamming_to_true_naive', 'v_muted_bases']
    xtitles = ['V distance to true naive', 'inferred - true']  # same order as <plotnames>
    meanvals = {pn : {m : [] for m in args.methods} for pn in plotnames}  # mean of each method's hist, one entry per test
    print ' annotations: %s' % get_outdir(args, baseoutdir, varname, varval, n_events=args.gls_gen_events)
    all_hists = {pn : [] for pn in plotnames}  # one {method : hist} dict per test (only filled when actually plotting)
    for iproc in range(args.iteststart, args.n_tests):
        outdir = get_outdir(args, baseoutdir, varname, varval, n_events=args.gls_gen_events) + '/' + str(iproc)  # duplicates code in bin/test-germline-inference.py
        plotdir = outdir + '/annotation-performance-plots'
        print ' %s' % plotdir
        if not args.only_print:
            utils.prep_dir(plotdir, wildlings=['*.png', '*.svg', '*.csv'])

        # shenanigans for the six (three easy and three hard) of 'em that go in the paper pdf
        make_legend = (iproc > 2) or (iproc == 0)  # and args.gls_gen_difficulty == 'easy')
        make_xtitle = (iproc > 2) or (iproc == 2)
        make_ytitle = (iproc > 2) or (args.gls_gen_difficulty == 'easy')

        for plotname in plotnames:
            hfnames = {meth : get_gls_fname(region, outdir, meth, args.locus, annotation_performance_plots=True) for meth in args.methods}
            for hfn in hfnames.values():
                if not os.path.exists(hfn):
                    raise Exception('%s d.n.e.: need to first run non-plotting (without --plot) --annotation-performance-plots (which involves re-running partis, I think the difference being partis is now running with --plot-annotation-performance' % hfn)
            hists = {meth : Hist(fname=hfnames[meth] + '/' + plotname + '.csv', title=methstr(meth) if make_legend else None) for meth in args.methods}
            for meth in args.methods:
                if hists[meth].overflow_contents() != 0.0:
                    print ' %s %s non-zero under/overflow %f' % (utils.color('red', 'error'), methstr(meth), hists[meth].overflow_contents())
                meanvals[plotname][meth].append(hists[meth].get_mean())
            if args.only_print:  # the mean values are all we need
                continue
            plotting.draw_no_root(hists[args.methods[0]], log='y', plotdir=plotdir, plotname=plotname, more_hists=[hists[m] for m in args.methods[1:]], colors=colors, ytitle='sequences' if make_ytitle else None,
                                  xtitle=xtitles[plotnames.index(plotname)] if make_xtitle else '',
                                  plottitle=gls_sim_str(args.gls_gen_difficulty, iproc),
                                  linewidths=linewidths, linestyles=linestyles, alphas=alphas, remove_empty_bins=True, square_bins=True)
            all_hists[plotname].append(hists)

    print ' total plots'
    plotdir = get_outdir(args, baseoutdir, varname, varval, n_events=args.gls_gen_events) + '/annotation-performance-plots'
    print ' %s' % plotdir
    # NOTE(review): the total-plot loop is assumed to sit inside this <if>, since <all_hists> is only filled when actually plotting -- confirm the nesting
    if not args.only_print:
        utils.prep_dir(plotdir, wildlings=['*.png', '*.svg', '*.csv'])
        for plotname in plotnames:
            total_hists = {}
            for meth in args.methods:
                # the total hist spans the union of the x ranges of this method's hists from all the tests
                xmin = min([hdict[meth].xmin for hdict in all_hists[plotname]])
                xmax = max([hdict[meth].xmax for hdict in all_hists[plotname]])
                total_hists[meth] = Hist(xmax - xmin, xmin, xmax, title=all_hists[plotname][0][meth].title)
                for hdict in all_hists[plotname]:
                    assert hdict[meth].integral(include_overflows=True) > 100  # make sure it isn't normalized (this is a shitty way to do this)
                    bin_centers = hdict[meth].get_bin_centers()
                    for ibin in range(len(hdict[meth].low_edges)):
                        xval = bin_centers[ibin]
                        for _ in range(int(hdict[meth].bin_contents[ibin])):  # fill once per entry in the bin
                            total_hists[meth].fill(xval)
            # NOTE(review): <make_ytitle> here is whatever the last iteration of the loop over tests left behind (and is undefined if that loop didn't run) -- confirm that's intended
            plotting.draw_no_root(total_hists[args.methods[0]], log='y', plotdir=plotdir, plotname='total-' + plotname, more_hists=[total_hists[m] for m in args.methods[1:]], colors=colors, ytitle='sequences' if make_ytitle else None,
                                  xtitle=xtitles[plotnames.index(plotname)],
                                  plottitle=gls_sim_str(args.gls_gen_difficulty, iproc=''),
                                  linewidths=linewidths, linestyles=linestyles, alphas=alphas, remove_empty_bins=True, square_bins=True)

    for plotname in plotnames:
        if 'muted_bases' in plotname:  # mean value isn't meaningful
            continue
        print plotname
        for meth in args.methods:
            mean = float(sum(meanvals[plotname][meth])) / len(meanvals[plotname][meth])
            err = numpy.std(meanvals[plotname][meth], ddof=1) / math.sqrt(len(meanvals[plotname][meth]))  # standard error on the mean, from the sample (ddof=1) standard deviation
            print ' %15s %6.3f / %d = %6.2f +/- %6.2f' % (methstr(meth), sum(meanvals[plotname][meth]), len(meanvals[plotname][meth]), mean, err)
def clean_plots(self, plotdir):
    """ Call utils.prep_dir(), with the csv and svg wildcards, on the 'overall' and per-region subdirs of <plotdir>. """
    subdirs = ['overall', ] + utils.regions  # + [r + '-per-base' for r in utils.regions]
    wildcards = ('*.csv', '*.svg')
    for subdir in subdirs:
        utils.prep_dir(plotdir + '/' + subdir, wildlings=wildcards)