class ParameterCounter(object):
    """ class to keep track of how many times we've seen each gene version, erosion length, insertion (length and base content), and mutation """
    def __init__(self, glfo, args):
        self.glfo = glfo  # germline set info
        self.args = args  # parsed command-line args (only .region_end_exclusions is read here)
        self.mfreqer = MuteFreqer(self.glfo, exclusions=args.region_end_exclusions)  # handles all per-position mutation-frequency counting
        self.reco_total = 0  # total number of recombination events
        self.mute_total = 0  # total number of sequences
        self.counts = {}  # column name --> {index tuple --> count}
        self.counts['all'] = {}
        for column in utils.column_dependencies:
            self.counts[column] = {}
        self.string_columns = set([r + '_gene' for r in utils.regions])  # columns whose values are strings rather than ints (controls histogram var_type in plot())
        for bound in utils.boundaries:
            self.counts[bound + '_insertion_content'] = {n: 0 for n in utils.nukes}  # base content of each insertion
            self.string_columns.add(bound + '_insertion_content')
        self.counts['aa_cdr3_length'] = {}
        self.counts['non_vj_length'] = {}
        self.counts['seq_content'] = {n: 0 for n in utils.nukes}  # now I'm adding the aa content, I wish this had nucleotide in the name, but I don't want to change it since it corresponds to a million existing file paths
        self.counts['cluster_size'] = {}
        self.init_aa_stuff()  # sets self.all_aa
        self.counts['seq_aa_content'] = {a: 0 for a in self.all_aa}
        self.string_columns.add('seq_content')
        self.string_columns.add('seq_aa_content')
        # don't write these to the parameter dir, since a) cdr3 length is better viewed as an output of more fundamental parameters (gene choice, insertion + deletion lengths) and b) I'm adding them waaay long after the others, and I don't want to add a new file to the established parameter directory structure. (I'm adding these because I want them plotted)
        self.no_write_columns = ['aa_cdr3_length', 'non_vj_length', 'seq_aa_content']
        self.columns_to_subset_by_gene = [e + '_del' for e in utils.all_erosions] + [b + '_insertion' for b in utils.boundaries]  # these columns also get per-gene plot subdirs
        self.mean_columns = ['aa_cdr3_length', 'non_vj_length']  # plots for these columns get a mean annotation

    # ----------------------------------------------------------------------------------------
    def init_aa_stuff(self):
        """ set self.all_aa to every amino-acid code reachable by translating all codons over A/C/G/T/N """
        codons = itertools.product(utils.nukes + ['N'], repeat=3)  # I cannot for the life of me find anything in Bio that will give me the list of amino acids, wtf, but I'm tired of googling, this will be fine
        self.all_aa = set([utils.ltranslate(''.join(c)) for c in codons])

    # ----------------------------------------------------------------------------------------
    def get_index(self, info, deps):
        """ return the tuple key into self.counts for annotation <info> over the columns in <deps> (insertion columns are keyed by their *length*) """
        index = []
        for ic in deps:
            if ic[2:] == '_insertion':  # insertion length
                index.append(len(info[ic]))
            else:
                assert 'insertion' not in ic
                assert 'content' not in ic
                index.append(info[ic])
        return tuple(index)

    # ----------------------------------------------------------------------------------------
    def increment(self, info):
        """ count one annotation: once for the family, plus once per constituent sequence """
        self.increment_per_family_params(info)
        for iseq in range(len(info['seqs'])):
            self.increment_per_sequence_params(info, iseq)

    # ----------------------------------------------------------------------------------------
    def increment_per_sequence_params(self, info, iseq):
        """ increment parameters that differ for each sequence within the clonal family """
        self.mute_total += 1
        self.mfreqer.increment(info, iseq)
        for nuke in utils.nukes:
            self.counts['seq_content'][nuke] += info['seqs'][iseq].count(nuke)
        # aa seq content stuff
        nseq = info['seqs'][iseq]
        if info['v_5p_del'] > 0:  # pad back out to the full V 5' end with ambiguous bases, so the reading frame is preserved
            nseq = info['v_5p_del'] * utils.ambig_base + nseq
        if len(info['fv_insertion']) > 0:  # NOTE(review): this strips from the front *after* the v_5p_del pad was prepended, so when both are non-empty it removes pad bases rather than the fv insertion itself -- confirm intended
            nseq = nseq[len(info['fv_insertion']):]
        if len(nseq) % 3 != 0:  # pad to a multiple of three so translation doesn't truncate
            nseq += utils.ambig_base * (3 - (len(nseq) % 3))  # I think I could replace this with the new utils.pad_nuc_seq()
        aaseq = utils.ltranslate(nseq)
        for aa in self.all_aa:
            self.counts['seq_aa_content'][aa] += aaseq.count(aa)

    # ----------------------------------------------------------------------------------------
    def increment_per_family_params(self, info):
        """ increment parameters that are the same for the entire clonal family """
        def sub_increment(column, index):  # bump self.counts[column][index], creating the entry if it's new
            if index not in self.counts[column]:
                self.counts[column][index] = 0
            self.counts[column][index] += 1
        self.reco_total += 1
        all_index = self.get_index(info, tuple(list(utils.index_columns) + ['cdr3_length', ]))  # NOTE this cdr3_length is for getting a unique index for the rearrangement event parameters, and is thus unrelated to the key aa_cdr3_length for plotting
        if all_index not in self.counts['all']:
            self.counts['all'][all_index] = 0
        self.counts['all'][all_index] += 1
        for deps in utils.column_dependency_tuples:
            column = deps[0]
            index = self.get_index(info, deps)
            sub_increment(column, index)
        # have to be done separately, since they're not index columns (and we don't want them to be, since they're better viewed as derivative -- see note in self.write())
        sub_increment('aa_cdr3_length', (info['cdr3_length'] / 3, ))  # oh, jeez, this has to be a tuple to match the index columns, that's ugly
        sub_increment('non_vj_length', (utils.get_non_vj_len(info), ))
        sub_increment('cluster_size', (len(info['unique_ids']), ))
        for bound in utils.boundaries:
            for nuke in info[bound + '_insertion']:
                if nuke == utils.ambig_base:  # don't count ambiguous bases in insertion content
                    continue
                self.counts[bound + '_insertion_content'][nuke] += 1

    # ----------------------------------------------------------------------------------------
    def clean_plots(self, plotdir):
        """ remove any pre-existing plot files so stale plots don't survive a re-run """
        self.mfreqer.clean_plots(plotdir + '/mute-freqs')
        utils.prep_dir(plotdir + '/overall', wildlings=('*.csv', '*.svg'))
        for column in self.counts:
            if column in self.columns_to_subset_by_gene:  # per-gene plots get their own subdir
                thisplotdir = plotdir + '/' + column
                utils.prep_dir(thisplotdir, wildlings=['*.csv', '*.svg'])

    # ----------------------------------------------------------------------------------------
    def plot(self, plotdir, only_csv=False, only_overall=False, make_per_base_plots=False):  # NOTE most of the time in here is taken up by mutefrequer.finalize() (if it write() wasn't called first, that is)
        """ write histograms (and csvs) of every counted parameter under <plotdir> """
        import plotting
        print ' plotting parameters in %s' % plotdir,
        sys.stdout.flush()
        start = time.time()
        self.clean_plots(plotdir)
        self.mfreqer.plot(plotdir + '/mute-freqs', only_csv=only_csv, only_overall=only_overall, make_per_base_plots=make_per_base_plots)
        overall_plotdir = plotdir + '/overall'
        for column in self.counts:
            if column == 'all':  # the full joint index isn't plottable as a 1-d histogram
                continue
            values, gene_values = {}, {}  # marginal counts, and counts subset by gene
            for index, count in self.counts[column].iteritems():
                column_val = index[0]  # marginalize over everything except the column itself
                if column_val not in values:
                    values[column_val] = 0.0
                values[column_val] += count
                if column in self.columns_to_subset_by_gene:
                    gene = index[1]  # NOTE this is hackey, but it works fine now and will fail obviously if I ever change the correlations to be incompatible. so screw it
                    utils.split_gene(gene)  # checks validity of gene
                    if gene not in gene_values:
                        gene_values[gene] = {}
                    if column_val not in gene_values[gene]:
                        gene_values[gene][column_val] = 0.0
                    gene_values[gene][column_val] += count
            var_type = 'string' if column in self.string_columns else 'int'
            hist = hutils.make_hist_from_dict_of_counts(values, var_type, column)
            plotting.draw_no_root(hist, plotname=column, plotdir=overall_plotdir, xtitle=plotconfig.xtitles.get(column, column), plottitle=plotconfig.plot_titles.get(column, column), errors=True, write_csv=True, only_csv=only_csv, stats='mean' if column in self.mean_columns else None, normalize=True)
            if column in self.columns_to_subset_by_gene and not only_overall:  # per-gene versions of the same plot
                thisplotdir = plotdir + '/' + column
                for gene in gene_values:
                    plotname = utils.sanitize_name(gene) + '-' + column
                    hist = hutils.make_hist_from_dict_of_counts(gene_values[gene], var_type, plotname)
                    plotting.draw_no_root(hist, plotname=plotname, plotdir=thisplotdir, xtitle=plotconfig.plot_titles.get(column, column), plottitle=gene, errors=True, write_csv=True, only_csv=only_csv)
                if not only_csv:
                    plotting.make_html(thisplotdir)
        if not only_csv:
            plotting.make_html(overall_plotdir)
        print '(%.1f sec)' % (time.time() - start)

    # ----------------------------------------------------------------------------------------
    def write(self, base_outdir):  # NOTE most of the time in here is taken up by mutefrequer.finalize() (if it plot() wasn't called first, that is)
        """ write all counted parameters (csvs plus germline set) under <base_outdir> """
        print ' writing parameters to %s' % base_outdir,
        sys.stdout.flush()
        start = time.time()
        if os.path.exists(base_outdir + '/' + glutils.glfo_dir):  # clear out any existing per-locus germline sets before re-writing
            for tmploc in [l for l in utils.loci if os.path.exists(base_outdir + '/' + glutils.glfo_dir + '/' + l)]:
                glutils.remove_glfo_files(base_outdir + '/' + glutils.glfo_dir, tmploc, print_warning=False)
        utils.prep_dir(base_outdir, subdirs=('hmms', 'mute-freqs', glutils.glfo_dir), wildlings=('*.csv', '*.yaml', '*.fasta'))  # it's kind of hackey to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, so it's pretty much necessary
        self.mfreqer.write(base_outdir + '/mute-freqs', mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv')  # REGION is replaced by each region in the three output files
        genes_with_counts = [g[0] for r in utils.regions for g in self.counts[r + '_gene'].keys()]  # only write germline genes we actually observed
        glutils.write_glfo(base_outdir + '/' + glutils.glfo_dir, self.glfo, only_genes=genes_with_counts, debug=False)
        for column in self.counts:
            index = None
            outfname = None
            if column in self.no_write_columns:  # see note in __init__
                continue
            elif column == 'all':
                index = tuple(list(utils.index_columns) + ['cdr3_length', ])
                outfname = base_outdir + '/' + utils.get_parameter_fname(column='all')
            elif '_content' in column or column == 'cluster_size':
                index = [column, ]
                outfname = base_outdir + '/' + column + '.csv'
            else:
                index = [column, ] + utils.column_dependencies[column]
                outfname = base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index)
            if os.path.isfile(outfname):
                os.remove(outfname)
            elif not os.path.exists(base_outdir):
                os.makedirs(base_outdir)
            with open(outfname, 'w') as outfile:
                out_fieldnames = list(index)
                out_fieldnames.append('count')
                out_data = csv.DictWriter(outfile, out_fieldnames)
                out_data.writeheader()
                # NOTE this will in general not be sorted
                for key, count in self.counts[column].iteritems():
                    line = {}
                    for ic in range(len(key)):
                        line[index[ic]] = key[ic]
                    line['count'] = count
                    out_data.writerow(line)
        print '(%.1f sec)' % (time.time() - start)
class ParameterCounter(object):
    """ class to keep track of how many times we've seen each gene version, erosion length, insertion (length and base content), and mutation """
    def __init__(self, glfo, args):
        self.glfo = glfo  # germline set info
        self.args = args  # NOTE(review): not read anywhere in this class -- presumably kept for ctor-signature consistency, confirm
        self.mfreqer = MuteFreqer(self.glfo)  # handles per-position mutation-frequency counting
        self.reco_total = 0  # total number of recombination events
        self.mute_total = 0  # total number of sequences
        self.counts = {}  # column name --> {index tuple --> count}
        self.counts['all'] = {}
        for column in utils.column_dependencies:
            self.counts[column] = {}
        for bound in utils.boundaries:
            self.counts[bound + '_insertion_content'] = {n : 0 for n in utils.nukes}  # base content of each insertion
        self.counts['seq_content'] = {n : 0 for n in utils.nukes}  # overall nucleotide content of observed sequences

    # ----------------------------------------------------------------------------------------
    def get_index(self, info, deps):
        """ return the tuple key into self.counts for annotation <info> over the columns in <deps> (insertion columns are keyed by their *length*) """
        index = []
        for ic in deps:
            if ic[2:] == '_insertion':  # insertion length
                index.append(len(info[ic]))
            else:
                assert 'insertion' not in ic
                assert 'content' not in ic
                index.append(info[ic])
        return tuple(index)

    # ----------------------------------------------------------------------------------------
    def increment_all_params(self, info):
        """ count one annotation: per-sequence params, then per-family params """
        self.increment_per_sequence_params(info)
        self.increment_per_family_params(info)

    # ----------------------------------------------------------------------------------------
    def increment_per_sequence_params(self, info):
        """ increment parameters that differ for each sequence within the clonal family """
        self.mute_total += 1
        self.mfreqer.increment(info)
        seq = info['seq']  # single-sequence annotation (this version predates multi-seq 'seqs')
        for nuke in seq:
            if nuke in utils.ambiguous_bases:  # skip ambiguous bases
                continue
            self.counts['seq_content'][nuke] += 1

    # ----------------------------------------------------------------------------------------
    def increment_per_family_params(self, info):
        """ increment parameters that are the same for the entire clonal family """
        self.reco_total += 1
        all_index = self.get_index(info, tuple(list(utils.index_columns) + ['cdr3_length', ]))
        if all_index not in self.counts['all']:
            self.counts['all'][all_index] = 0
        self.counts['all'][all_index] += 1
        for deps in utils.column_dependency_tuples:
            column = deps[0]
            index = self.get_index(info, deps)
            if index not in self.counts[column]:
                self.counts[column][index] = 0
            self.counts[column][index] += 1
        for bound in utils.boundaries:
            for nuke in info[bound + '_insertion']:
                if nuke in utils.ambiguous_bases:  # skip ambiguous bases
                    continue
                self.counts[bound + '_insertion_content'][nuke] += 1

    # ----------------------------------------------------------------------------------------
    def clean_plots(self, plotdir, subset_by_gene):
        """ remove any pre-existing plot files so stale plots don't survive a re-run """
        self.mfreqer.clean_plots(plotdir + '/mute-freqs')
        utils.prep_dir(plotdir + '/overall')  #, multilings=('*.csv', '*.svg'))
        for column in self.counts:
            if subset_by_gene and ('_del' in column or column == 'vd_insertion' or column == 'dj_insertion'):  # option to subset deletion and (real) insertion plots by gene
                thisplotdir = plotdir + '/' + column
                utils.prep_dir(thisplotdir, wildlings=['*.csv', '*.svg'])

    # ----------------------------------------------------------------------------------------
    def plot(self, plotdir, subset_by_gene=False, cyst_positions=None, tryp_positions=None, only_csv=False):
        """ write histograms (and csvs) of every counted parameter under <plotdir> """
        print ' plotting parameters',
        sys.stdout.flush()
        start = time.time()
        self.clean_plots(plotdir, subset_by_gene)
        self.mfreqer.plot(plotdir + '/mute-freqs', cyst_positions, tryp_positions, only_csv=only_csv)  #, mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv') # REGION is replaced by each region in the three output files
        overall_plotdir = plotdir + '/overall'
        for column in self.counts:
            if column == 'all':  # the full joint index isn't plottable as a 1-d histogram
                continue
            values, gene_values = {}, {}  # marginal counts, and counts subset by gene
            if len(self.counts[column]) == 0:
                raise Exception('no counts in %s' % column)
            for index, count in self.counts[column].iteritems():
                gene = None
                if subset_by_gene and ('_del' in column or column == 'vd_insertion' or column == 'dj_insertion'):  # option to subset deletion and (real) insertion plots by gene
                    if '_del' in column:
                        region = column[0]  # e.g. 'v' from 'v_3p_del'
                    else:
                        region = column[1]  # e.g. 'd' from 'vd_insertion'
                    assert region in utils.regions
                    assert 'IGH' + region.upper() in index[1]  # NOTE this is hackey, but it works fine now and will fail obviously
                    gene = index[1]                            # if I ever change the correlations to be incompatible. so screw it
                    if gene not in gene_values:
                        gene_values[gene] = {}
                column_val = index[0]
                if gene is not None:
                    if column_val not in gene_values[gene]:
                        gene_values[gene][column_val] = 0.0
                    gene_values[gene][column_val] += count
                if column_val not in values:
                    values[column_val] = 0.0
                values[column_val] += count
                try:  # figure out whether this is an integer or string (only used outside this loop when we make the plots)
                    int(column_val)
                    var_type = 'int'
                except:
                    var_type = 'string'
            if subset_by_gene and ('_del' in column or column == 'vd_insertion' or column == 'dj_insertion'):  # option to subset deletion and (real) insertion plots by gene
                thisplotdir = plotdir + '/' + column
                for gene in gene_values:
                    plotname = utils.sanitize_name(gene) + '-' + column
                    hist = plotting.make_hist_from_dict_of_counts(gene_values[gene], var_type, plotname, sort=True)
                    plotting.draw_no_root(hist, plotname=plotname, plotdir=thisplotdir, errors=True, write_csv=True, only_csv=only_csv)
                if not only_csv:
                    plotting.make_html(thisplotdir)
            plotname = column
            hist = plotting.make_hist_from_dict_of_counts(values, var_type, plotname, sort=True)
            plotting.draw_no_root(hist, plotname=plotname, plotdir=overall_plotdir, errors=True, write_csv=True, only_csv=only_csv)
        if not only_csv:
            plotting.make_html(overall_plotdir)
        print '(%.1f sec)' % (time.time()-start)

    # ----------------------------------------------------------------------------------------
    def write(self, base_outdir, my_datadir=None):
        """ write all counted parameters (csvs plus germline set) under <base_outdir> """
        print ' writing parameters',
        sys.stdout.flush()
        start = time.time()
        utils.prep_dir(base_outdir, subdirs=('hmms', 'mute-freqs', 'germline-sets'), wildlings=('*.csv', '*.yaml', '*.fasta'))  # it's kind of hackey to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, so it's pretty much necessary
        self.mfreqer.write(base_outdir + '/mute-freqs', mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv')  # REGION is replaced by each region in the three output files
        genes_with_counts = [g[0] for r in utils.regions for g in self.counts[r + '_gene'].keys()]  # only write germline genes we actually observed
        glutils.write_glfo(base_outdir + '/' + glutils.glfo_dir, self.glfo, only_genes=genes_with_counts, debug=True)
        for column in self.counts:
            index = None
            outfname = None
            if column == 'all':
                index = tuple(list(utils.index_columns) + ['cdr3_length', ])
                outfname = base_outdir + '/' + utils.get_parameter_fname(column='all')
            elif '_content' in column:
                index = [column,]
                outfname = base_outdir + '/' + column + '.csv'
            else:
                index = [column,] + utils.column_dependencies[column]
                outfname = base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index)
            if os.path.isfile(outfname):
                os.remove(outfname)
            elif not os.path.exists(base_outdir):
                os.makedirs(base_outdir)
            with opener('w')(outfname) as outfile:
                out_fieldnames = list(index)
                out_fieldnames.append('count')
                out_data = csv.DictWriter(outfile, out_fieldnames)
                out_data.writeheader()
                # NOTE this will in general not be sorted
                for key, count in self.counts[column].iteritems():
                    line = {}
                    for ic in range(len(key)):
                        line[index[ic]] = key[ic]
                    line['count'] = count
                    out_data.writerow(line)
        print '(%.1f sec)' % (time.time()-start)
class ParameterCounter(object):
    """ class to keep track of how many times we've seen each gene version, erosion length, insertion (length and base content), and mutation """
    def __init__(self, germline_seqs):  #, base_outdir='', plotdir='', write_parameters=True, plot_parameters=True):
        self.total = 0  # total number of events counted
        self.counts = {}  # column name --> {index tuple --> count}
        self.counts['all'] = {}
        for column in utils.column_dependencies:
            self.counts[column] = {}
        for bound in utils.boundaries:
            self.counts[bound + '_insertion_content'] = {'A': 0, 'C': 0, 'G': 0, 'T': 0}  # base content of each insertion
        self.counts['seq_content'] = {'A': 0, 'C': 0, 'G': 0, 'T': 0}  # overall nucleotide content of observed sequences
        self.mutefreqer = MuteFreqer(germline_seqs)  #, self.base_outdir, self.plotdir, write_parameters=self.write_parameters, plot_parameters=self.plot_parameters)

    # ----------------------------------------------------------------------------------------
    def clean(self):
        """ remove all the parameter files """
        self.mutefreqer.clean()
        for column in self.counts:
            if column == 'all':
                # NOTE(review): self.base_outdir is never set in __init__ (it's commented out of the signature), so this looks like it would raise AttributeError -- confirm whether clean() is still called anywhere
                os.remove(self.base_outdir + '/' + utils.get_parameter_fname(column='all'))
            else:
                index = [column, ] + utils.column_dependencies[column]
                os.remove(self.base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index))

    # ----------------------------------------------------------------------------------------
    def get_index(self, info, deps):
        """ return the tuple key into self.counts for annotation <info> over the columns in <deps> (insertion columns are keyed by their *length*) """
        index = []
        for ic in deps:
            if ic[2:] == '_insertion':  # insertion length
                index.append(len(info[ic]))
            else:
                assert 'insertion' not in ic
                assert 'content' not in ic
                index.append(info[ic])
        return tuple(index)

    # ----------------------------------------------------------------------------------------
    def increment(self, info):
        """ increment all counts for one rearrangement event <info> """
        self.total += 1
        all_index = self.get_index(info, utils.index_columns)
        if all_index not in self.counts['all']:
            self.counts['all'][all_index] = 0
        self.counts['all'][all_index] += 1
        for deps in utils.column_dependency_tuples:
            column = deps[0]
            index = self.get_index(info, deps)
            if index not in self.counts[column]:
                self.counts[column][index] = 0
            self.counts[column][index] += 1
        for bound in utils.boundaries:
            for nuke in info[bound + '_insertion']:
                # NOTE(review): no ambiguous-base guard here (later versions skip N), so an N in an insertion would KeyError on the A/C/G/T dict -- confirm inputs are unambiguous
                self.counts[bound + '_insertion_content'][nuke] += 1
        for nuke in info['seq']:
            self.counts['seq_content'][nuke] += 1
        self.mutefreqer.increment(info)

    # ----------------------------------------------------------------------------------------
    def __str__(self):
        """ tabular summary of the counts; prints caveats to stdout as a side effect """
        return_str = []
        print 'hm I think I was too lazy to put \'all\' in this string'
        print ' or [vdj]_insertion_content or seq_content'
        # NOTE(review): the loop below still iterates over *every* key in self.counts, and utils.column_dependencies[column] presumably has no entry for 'all'/'seq_content'/'*_insertion_content' -- looks like it'd KeyError, confirm
        for column in self.counts:
            return_str.append('%s\n' % column)
            return_str.append('%20s' % column)
            for dep in utils.column_dependencies[column]:
                return_str.append('%20s' % dep)
            return_str.append('\n')
            for index, count in self.counts[column].iteritems():
                for val in index:
                    return_str.append('%20s' % str(val))
                return_str.append(' %d / %d = %f\n' % (count, self.total, float(count) / self.total))
        return ''.join(return_str)

    # ----------------------------------------------------------------------------------------
    def plot(self, plotdir, subset_by_gene=False, cyst_positions=None, tryp_positions=None):
        """ write ROOT-based histograms of every counted parameter under <plotdir> """
        print ' plotting parameters'
        start = time.time()
        utils.prep_dir(plotdir + '/plots')  #, multilings=('*.csv', '*.svg'))
        for column in self.counts:
            if column == 'all':  # the full joint index isn't plottable as a 1-d histogram
                continue
            values, gene_values = {}, {}  # marginal counts, and counts subset by gene
            if len(self.counts[column]) == 0:
                print 'ERROR no counts in %s' % column
                assert False
            for index, count in self.counts[column].iteritems():
                gene = None
                if subset_by_gene and ('_del' in column or column == 'vd_insertion' or column == 'dj_insertion'):  # option to subset deletion and (real) insertion plots by gene
                    if '_del' in column:
                        region = column[0]  # e.g. 'v' from 'v_3p_del'
                    else:
                        region = column[1]  # e.g. 'd' from 'vd_insertion'
                    assert region in utils.regions
                    assert 'IGH' + region.upper() in index[1]  # NOTE this is hackey, but it works fine now and will fail obviously
                    gene = index[1]                            # if I ever change the correlations to be incompatible. so screw it
                    if gene not in gene_values:
                        gene_values[gene] = {}
                column_val = index[0]
                if gene is not None:
                    if column_val not in gene_values[gene]:
                        gene_values[gene][column_val] = 0.0
                    gene_values[gene][column_val] += count
                if column_val not in values:
                    values[column_val] = 0.0
                values[column_val] += count
                try:  # figure out whether this is an integer or string (only used outside this loop when we make the plots)
                    int(column_val)
                    var_type = 'int'
                except:
                    var_type = 'string'
            if subset_by_gene and ('_del' in column or column == 'vd_insertion' or column == 'dj_insertion'):  # option to subset deletion and (real) insertion plots by gene
                thisplotdir = plotdir + '/' + column
                utils.prep_dir(thisplotdir + '/plots', multilings=['*.csv', '*.svg'])  # NOTE(review): 'multilings' doesn't match the 'wildlings' kwarg used elsewhere in this file -- likely a typo, confirm against utils.prep_dir's signature
                for gene in gene_values:
                    plotname = utils.sanitize_name(gene) + '-' + column
                    hist = plotting.make_hist_from_dict_of_counts(gene_values[gene], var_type, plotname, sort=True)
                    plotting.draw(hist, var_type, plotname=plotname, plotdir=thisplotdir, errors=True, write_csv=True)
                check_call(['./bin/makeHtml', thisplotdir, '3', 'null', 'svg'])
                check_call(['./bin/permissify-www', thisplotdir])  # NOTE this should really permissify starting a few directories higher up
            plotname = column
            hist = plotting.make_hist_from_dict_of_counts(values, var_type, plotname, sort=True)
            plotting.draw(hist, var_type, plotname=plotname, plotdir=plotdir, errors=True, write_csv=True)
        self.mutefreqer.plot(plotdir, cyst_positions, tryp_positions)  #, mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv') # REGION is replaced by each region in the three output files
        if has_root:
            check_call(['./bin/makeHtml', plotdir, '3', 'null', 'svg'])
            check_call(['./bin/permissify-www', plotdir])  # NOTE this should really permissify starting a few directories higher up
        print ' parameter plot time: %.3f' % (time.time() - start)

    # ----------------------------------------------------------------------------------------
    def write(self, base_outdir):
        """ write all counted parameters as csvs under <base_outdir> """
        print ' writing parameters'
        start = time.time()
        utils.prep_dir(base_outdir, multilings=('*.csv', '*.svg'))  # NOTE(review): 'multilings' vs 'wildlings' elsewhere -- likely a typo, confirm against utils.prep_dir's signature
        mute_start = time.time()
        self.mutefreqer.write(base_outdir, mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv')  # REGION is replaced by each region in the three output files
        print ' mut freq write time: %.3f' % (time.time() - mute_start)
        # print ' %d / %d cached' % (self.mutefreqer.n_cached, self.mutefreqer.n_cached + self.mutefreqer.n_not_cached)
        for column in self.counts:
            index = None
            outfname = None
            if column == 'all':
                index = utils.index_columns
                outfname = base_outdir + '/' + utils.get_parameter_fname(column='all')
            elif '_content' in column:
                index = [column, ]
                outfname = base_outdir + '/' + column + '.csv'
            else:
                index = [column, ] + utils.column_dependencies[column]
                outfname = base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index)
            if os.path.isfile(outfname):
                os.remove(outfname)
            elif not os.path.exists(base_outdir):
                os.makedirs(base_outdir)
            with opener('w')(outfname) as outfile:
                out_fieldnames = list(index)
                out_fieldnames.append('count')
                out_data = csv.DictWriter(outfile, out_fieldnames)
                out_data.writeheader()
                # NOTE this will in general not be sorted
                for key, count in self.counts[column].iteritems():
                    line = {}
                    for ic in range(len(key)):
                        line[index[ic]] = key[ic]
                    line['count'] = count
                    out_data.writerow(line)
        print ' parameter write time: %.3f' % (time.time() - start)
class ParameterCounter(object):
    """ class to keep track of how many times we've seen each gene version, erosion length, insertion (length and base content), and mutation """
    def __init__(self, glfo, args):
        self.glfo = glfo  # germline set info
        self.args = args  # parsed command-line args (only .region_end_exclusions is read here)
        self.mfreqer = MuteFreqer(self.glfo, exclusions=args.region_end_exclusions)  # handles per-position mutation-frequency counting
        self.reco_total = 0  # total number of recombination events
        self.mute_total = 0  # total number of sequences
        self.counts = {}  # column name --> {index tuple --> count}
        self.counts['all'] = {}
        for column in utils.column_dependencies:
            self.counts[column] = {}
        self.string_columns = set([r + '_gene' for r in utils.regions])  # columns whose values are strings rather than ints (controls histogram var_type in plot())
        for bound in utils.boundaries:
            self.counts[bound + '_insertion_content'] = {n: 0 for n in utils.nukes}  # base content of each insertion
            self.string_columns.add(bound + '_insertion_content')
        self.counts['seq_content'] = {n: 0 for n in utils.nukes}  # overall nucleotide content of observed sequences
        self.string_columns.add('seq_content')
        self.columns_to_subset_by_gene = [e + '_del' for e in utils.all_erosions] + [b + '_insertion' for b in utils.boundaries]  # these columns also get per-gene plot subdirs

    # ----------------------------------------------------------------------------------------
    def get_index(self, info, deps):
        """ return the tuple key into self.counts for annotation <info> over the columns in <deps> (insertion columns are keyed by their *length*) """
        index = []
        for ic in deps:
            if ic[2:] == '_insertion':  # insertion length
                index.append(len(info[ic]))
            else:
                assert 'insertion' not in ic
                assert 'content' not in ic
                index.append(info[ic])
        return tuple(index)

    # ----------------------------------------------------------------------------------------
    def increment(self, info):
        """ count one annotation: once for the family, plus once per constituent sequence """
        self.increment_per_family_params(info)
        for iseq in range(len(info['seqs'])):
            self.increment_per_sequence_params(info, iseq)

    # ----------------------------------------------------------------------------------------
    def increment_per_sequence_params(self, info, iseq):
        """ increment parameters that differ for each sequence within the clonal family """
        self.mute_total += 1
        self.mfreqer.increment(info, iseq)
        for nuke in info['seqs'][iseq]:
            if nuke in utils.ambiguous_bases:  # skip ambiguous bases
                continue
            self.counts['seq_content'][nuke] += 1

    # ----------------------------------------------------------------------------------------
    def increment_per_family_params(self, info):
        """ increment parameters that are the same for the entire clonal family """
        self.reco_total += 1
        all_index = self.get_index(info, tuple(list(utils.index_columns) + ['cdr3_length', ]))
        if all_index not in self.counts['all']:
            self.counts['all'][all_index] = 0
        self.counts['all'][all_index] += 1
        for deps in utils.column_dependency_tuples:
            column = deps[0]
            index = self.get_index(info, deps)
            if index not in self.counts[column]:
                self.counts[column][index] = 0
            self.counts[column][index] += 1
        for bound in utils.boundaries:
            for nuke in info[bound + '_insertion']:
                if nuke in utils.ambiguous_bases:  # skip ambiguous bases
                    continue
                self.counts[bound + '_insertion_content'][nuke] += 1

    # ----------------------------------------------------------------------------------------
    def clean_plots(self, plotdir):
        """ remove any pre-existing plot files so stale plots don't survive a re-run """
        self.mfreqer.clean_plots(plotdir + '/mute-freqs')
        utils.prep_dir(plotdir + '/overall', wildlings=('*.csv', '*.svg'))
        for column in self.counts:
            if column in self.columns_to_subset_by_gene:  # per-gene plots get their own subdir
                thisplotdir = plotdir + '/' + column
                utils.prep_dir(thisplotdir, wildlings=['*.csv', '*.svg'])

    # ----------------------------------------------------------------------------------------
    def plot(self, plotdir, only_csv=False, only_overall=False):
        """ write histograms (and csvs) of every counted parameter under <plotdir> """
        import plotting
        print ' plotting parameters',
        sys.stdout.flush()
        start = time.time()
        self.clean_plots(plotdir)
        self.mfreqer.plot(plotdir + '/mute-freqs', only_csv=only_csv, only_overall=only_overall)
        overall_plotdir = plotdir + '/overall'
        for column in self.counts:
            if column == 'all':  # the full joint index isn't plottable as a 1-d histogram
                continue
            values, gene_values = {}, {}  # marginal counts, and counts subset by gene
            for index, count in self.counts[column].iteritems():
                column_val = index[0]  # marginalize over everything except the column itself
                if column_val not in values:
                    values[column_val] = 0.0
                values[column_val] += count
                if column in self.columns_to_subset_by_gene:
                    gene = index[1]  # NOTE this is hackey, but it works fine now and will fail obviously if I ever change the correlations to be incompatible. so screw it
                    utils.split_gene(gene)  # checks validity of gene
                    if gene not in gene_values:
                        gene_values[gene] = {}
                    if column_val not in gene_values[gene]:
                        gene_values[gene][column_val] = 0.0
                    gene_values[gene][column_val] += count
            var_type = 'string' if column in self.string_columns else 'int'
            hist = plotting.make_hist_from_dict_of_counts(values, var_type, column, sort=True)
            plotting.draw_no_root(hist, plotname=column, plotdir=overall_plotdir, xtitle=plotconfig.xtitles.get(column, column), plottitle=plotconfig.plot_titles.get(column, column), errors=True, write_csv=True, only_csv=only_csv)
            if column in self.columns_to_subset_by_gene and not only_overall:  # per-gene versions of the same plot
                thisplotdir = plotdir + '/' + column
                for gene in gene_values:
                    plotname = utils.sanitize_name(gene) + '-' + column
                    hist = plotting.make_hist_from_dict_of_counts(gene_values[gene], var_type, plotname, sort=True)
                    plotting.draw_no_root(hist, plotname=plotname, plotdir=thisplotdir, xtitle=plotconfig.plot_titles.get(column, column), plottitle=gene, errors=True, write_csv=True, only_csv=only_csv)
                if not only_csv:
                    plotting.make_html(thisplotdir)
        if not only_csv:
            plotting.make_html(overall_plotdir)
        print '(%.1f sec)' % (time.time() - start)

    # ----------------------------------------------------------------------------------------
    def write(self, base_outdir):
        """ write all counted parameters (csvs plus germline set) under <base_outdir> """
        print ' writing parameters',
        sys.stdout.flush()
        start = time.time()
        if os.path.exists(base_outdir + '/' + glutils.glfo_dir):
            glutils.remove_glfo_files(base_outdir + '/' + glutils.glfo_dir, self.glfo['locus'])  # NOTE I think this will fail if I ever start having multiple loci in one dir
        utils.prep_dir(base_outdir, subdirs=('hmms', 'mute-freqs', glutils.glfo_dir), wildlings=('*.csv', '*.yaml', '*.fasta'))  # it's kind of hackey to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, so it's pretty much necessary
        self.mfreqer.write(base_outdir + '/mute-freqs', mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv')  # REGION is replaced by each region in the three output files
        genes_with_counts = [g[0] for r in utils.regions for g in self.counts[r + '_gene'].keys()]  # only write germline genes we actually observed
        glutils.write_glfo(base_outdir + '/' + glutils.glfo_dir, self.glfo, only_genes=genes_with_counts, debug=False)
        for column in self.counts:
            index = None
            outfname = None
            if column == 'all':
                index = tuple(list(utils.index_columns) + ['cdr3_length', ])
                outfname = base_outdir + '/' + utils.get_parameter_fname(column='all')
            elif '_content' in column:
                index = [column, ]
                outfname = base_outdir + '/' + column + '.csv'
            else:
                index = [column, ] + utils.column_dependencies[column]
                outfname = base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index)
            if os.path.isfile(outfname):
                os.remove(outfname)
            elif not os.path.exists(base_outdir):
                os.makedirs(base_outdir)
            with open(outfname, 'w') as outfile:
                out_fieldnames = list(index)
                out_fieldnames.append('count')
                out_data = csv.DictWriter(outfile, out_fieldnames)
                out_data.writeheader()
                # NOTE this will in general not be sorted
                for key, count in self.counts[column].iteritems():
                    line = {}
                    for ic in range(len(key)):
                        line[index[ic]] = key[ic]
                    line['count'] = count
                    out_data.writerow(line)
        print '(%.1f sec)' % (time.time() - start)
class ParameterCounter(object):
    """ class to keep track of how many times we've seen each gene version, erosion length, insertion (length and base content), and mutation """
    def __init__(self, glfo, args):
        self.glfo = glfo
        self.args = args
        self.mfreqer = MuteFreqer(self.glfo)
        self.reco_total = 0  # total number of recombination events
        self.mute_total = 0  # total number of sequences
        self.counts = {}  # maps column name -> {index tuple -> count}
        self.counts["all"] = {}
        for column in utils.column_dependencies:
            self.counts[column] = {}
        # columns whose values are categorical strings rather than integers (affects plot binning)
        self.string_columns = set([r + "_gene" for r in utils.regions])
        for bound in utils.boundaries:
            self.counts[bound + "_insertion_content"] = {n: 0 for n in utils.nukes}  # base content of each insertion
            self.string_columns.add(bound + "_insertion_content")
        self.counts["seq_content"] = {n: 0 for n in utils.nukes}
        self.string_columns.add("seq_content")
        # deletion and insertion columns additionally get per-gene subsetted plots
        self.columns_to_subset_by_gene = [e + "_del" for e in utils.real_erosions + utils.effective_erosions] + [
            b + "_insertion" for b in utils.boundaries
        ]

    # ----------------------------------------------------------------------------------------
    def get_index(self, info, deps):
        """ build the count-dict key for <deps> from the annotation <info>: insertion columns contribute their *length*, everything else its value """
        index = []
        for ic in deps:
            if ic[2:] == "_insertion":  # insertion length
                index.append(len(info[ic]))
            else:
                assert "insertion" not in ic
                assert "content" not in ic
                index.append(info[ic])
        return tuple(index)

    # ----------------------------------------------------------------------------------------
    def increment(self, info):
        """ count one annotated clonal family: family-level parameters once, sequence-level parameters once per sequence """
        self.increment_per_family_params(info)
        for iseq in range(len(info["seqs"])):
            self.increment_per_sequence_params(info, iseq)

    # ----------------------------------------------------------------------------------------
    def increment_per_sequence_params(self, info, iseq):
        """ increment parameters that differ for each sequence within the clonal family """
        self.mute_total += 1
        self.mfreqer.increment(info, iseq)
        for nuke in info["seqs"][iseq]:
            if nuke in utils.ambiguous_bases:  # e.g. 'N' -- not a key in the content dicts
                continue
            self.counts["seq_content"][nuke] += 1

    # ----------------------------------------------------------------------------------------
    def increment_per_family_params(self, info):
        """ increment parameters that are the same for the entire clonal family """
        self.reco_total += 1
        all_index = self.get_index(info, tuple(list(utils.index_columns) + ["cdr3_length"]))
        if all_index not in self.counts["all"]:
            self.counts["all"][all_index] = 0
        self.counts["all"][all_index] += 1
        for deps in utils.column_dependency_tuples:
            column = deps[0]  # first entry of each dependency tuple is the column itself
            index = self.get_index(info, deps)
            if index not in self.counts[column]:
                self.counts[column][index] = 0
            self.counts[column][index] += 1
        for bound in utils.boundaries:
            for nuke in info[bound + "_insertion"]:
                if nuke in utils.ambiguous_bases:
                    continue
                self.counts[bound + "_insertion_content"][nuke] += 1

    # ----------------------------------------------------------------------------------------
    def clean_plots(self, plotdir):
        """ remove pre-existing plot files so stale plots don't survive a re-run """
        self.mfreqer.clean_plots(plotdir + "/mute-freqs")
        utils.prep_dir(plotdir + "/overall")  # , multilings=('*.csv', '*.svg'))
        for column in self.counts:
            if column in self.columns_to_subset_by_gene:
                thisplotdir = plotdir + "/" + column
                utils.prep_dir(thisplotdir, wildlings=["*.csv", "*.svg"])  # NOTE(review): wildlings presumably glob patterns of files to wipe -- confirm in utils.prep_dir

    # ----------------------------------------------------------------------------------------
    def plot(self, plotdir, only_csv=False, only_overall=False):
        """ write histograms (and csvs) of all counted parameters under <plotdir>; <only_csv> skips image/html output, <only_overall> skips the per-gene subsetted plots """
        print " plotting parameters",
        sys.stdout.flush()
        start = time.time()
        self.clean_plots(plotdir)
        self.mfreqer.plot(plotdir + "/mute-freqs", only_csv=only_csv, only_overall=only_overall)
        overall_plotdir = plotdir + "/overall"
        for column in self.counts:
            if column == "all":
                continue
            values, gene_values = {}, {}  # marginal counts over index[0], and the same subsetted by gene
            for index, count in self.counts[column].iteritems():
                column_val = index[0]
                if column_val not in values:
                    values[column_val] = 0.0
                values[column_val] += count
                if column in self.columns_to_subset_by_gene:
                    gene = index[1]  # NOTE this is hackey, but it works find now and will fail obviously if I ever change the correlations to be incompatible. so screw it
                    utils.split_gene(gene)  # checks validity of gene
                    if gene not in gene_values:
                        gene_values[gene] = {}
                    if column_val not in gene_values[gene]:
                        gene_values[gene][column_val] = 0.0
                    gene_values[gene][column_val] += count
            var_type = "string" if column in self.string_columns else "int"
            hist = plotting.make_hist_from_dict_of_counts(values, var_type, column, sort=True)
            plotting.draw_no_root(
                hist,
                plotname=column,
                plotdir=overall_plotdir,
                xtitle=plotconfig.xtitles.get(column, column),
                plottitle=plotconfig.plot_titles.get(column, column),
                errors=True,
                write_csv=True,
                only_csv=only_csv,
            )
            if column in self.columns_to_subset_by_gene and not only_overall:
                thisplotdir = plotdir + "/" + column
                for gene in gene_values:
                    plotname = utils.sanitize_name(gene) + "-" + column
                    hist = plotting.make_hist_from_dict_of_counts(gene_values[gene], var_type, plotname, sort=True)
                    plotting.draw_no_root(
                        hist,
                        plotname=plotname,
                        plotdir=thisplotdir,
                        xtitle=plotconfig.plot_titles.get(column, column),
                        plottitle=gene,
                        errors=True,
                        write_csv=True,
                        only_csv=only_csv,
                    )
                if not only_csv:
                    plotting.make_html(thisplotdir)
        if not only_csv:
            plotting.make_html(overall_plotdir)
        print "(%.1f sec)" % (time.time() - start)

    # ----------------------------------------------------------------------------------------
    def write(self, base_outdir):
        """ write all accumulated counts to csv files under <base_outdir>, plus mutation frequencies and the germline set restricted to genes we actually saw """
        print "    writing parameters",
        sys.stdout.flush()
        start = time.time()
        utils.prep_dir(
            base_outdir, subdirs=("hmms", "mute-freqs", "germline-sets"), wildlings=("*.csv", "*.yaml", "*.fasta")
        )  # it's kind of hackey to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, so it's pretty much necessary
        self.mfreqer.write(
            base_outdir + "/mute-freqs", mean_freq_outfname=base_outdir + "/REGION-mean-mute-freqs.csv"
        )  # REGION is replace by each region in the three output files)
        # keys of self.counts[<region>_gene] are 1-tuples, so g[0] is the gene name
        genes_with_counts = [g[0] for r in utils.regions for g in self.counts[r + "_gene"].keys()]
        glutils.write_glfo(base_outdir + "/" + glutils.glfo_dir, self.glfo, only_genes=genes_with_counts, debug=False)
        for column in self.counts:
            index = None  # csv header names for the key components
            outfname = None
            if column == "all":
                index = tuple(list(utils.index_columns) + ["cdr3_length"])
                outfname = base_outdir + "/" + utils.get_parameter_fname(column="all")
            elif "_content" in column:
                index = [column]
                outfname = base_outdir + "/" + column + ".csv"
            else:
                index = [column] + utils.column_dependencies[column]
                outfname = base_outdir + "/" + utils.get_parameter_fname(column_and_deps=index)
            if os.path.isfile(outfname):
                os.remove(outfname)
            elif not os.path.exists(base_outdir):
                os.makedirs(base_outdir)
            with opener("w")(outfname) as outfile:
                out_fieldnames = list(index)
                out_fieldnames.append("count")
                out_data = csv.DictWriter(outfile, out_fieldnames)
                out_data.writeheader()
                # NOTE this will in general not be sorted
                for key, count in self.counts[column].iteritems():
                    line = {}
                    for ic in range(len(key)):
                        line[index[ic]] = key[ic]
                    line["count"] = count
                    out_data.writerow(line)
        print "(%.1f sec)" % (time.time() - start)
class ParameterCounter(object): """ class to keep track of how many times we've seen each gene version, erosion length, insertion (length and base content), and mutation """ def __init__(self, germline_seqs): #, base_outdir='', plotdir='', write_parameters=True, plot_parameters=True): self.total = 0 self.counts = {} self.counts['all'] = {} for column in utils.column_dependencies: self.counts[column] = {} for bound in utils.boundaries: self.counts[bound + '_insertion_content'] = {'A':0, 'C':0, 'G':0, 'T':0} # base content of each insertion self.counts['seq_content'] = {'A':0, 'C':0, 'G':0, 'T':0} self.mutefreqer = MuteFreqer(germline_seqs) #, self.base_outdir, self.plotdir, write_parameters=self.write_parameters, plot_parameters=self.plot_parameters) # ---------------------------------------------------------------------------------------- def clean(self): """ remove all the parameter files """ self.mutefreqer.clean() for column in self.counts: if column == 'all': os.remove(self.base_outdir + '/' + utils.get_parameter_fname(column='all')) else: index = [column,] + utils.column_dependencies[column] os.remove(self.base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index)) # ---------------------------------------------------------------------------------------- def get_index(self, info, deps): index = [] for ic in deps: if ic[2:] == '_insertion': # insertion length index.append(len(info[ic])) else: assert 'insertion' not in ic assert 'content' not in ic index.append(info[ic]) return tuple(index) # ---------------------------------------------------------------------------------------- def increment(self, info): self.total += 1 all_index = self.get_index(info, utils.index_columns) if all_index not in self.counts['all']: self.counts['all'][all_index] = 0 self.counts['all'][all_index] += 1 for deps in utils.column_dependency_tuples: column = deps[0] index = self.get_index(info, deps) if index not in self.counts[column]: self.counts[column][index] = 0 
self.counts[column][index] += 1 for bound in utils.boundaries: for nuke in info[bound + '_insertion']: self.counts[bound + '_insertion_content'][nuke] += 1 for nuke in info['seq']: self.counts['seq_content'][nuke] += 1 self.mutefreqer.increment(info) # ---------------------------------------------------------------------------------------- def __str__(self): return_str = [] print 'hm I think I was too lazy to put \'all\' in this string' print ' or [vdj]_insertion_content or seq_content' for column in self.counts: return_str.append('%s\n' % column) return_str.append('%20s' % column) for dep in utils.column_dependencies[column]: return_str.append('%20s' % dep) return_str.append('\n') for index, count in self.counts[column].iteritems(): for val in index: return_str.append('%20s' % str(val)) return_str.append(' %d / %d = %f\n' % (count, self.total, float(count) / self.total)) return ''.join(return_str) # ---------------------------------------------------------------------------------------- def plot(self, plotdir, subset_by_gene=False, cyst_positions=None, tryp_positions=None): print ' plotting parameters' start = time.time() utils.prep_dir(plotdir + '/plots') #, multilings=('*.csv', '*.svg')) for column in self.counts: if column == 'all': continue values, gene_values = {}, {} if len(self.counts[column]) == 0: print 'ERROR no counts in %s' % column assert False for index, count in self.counts[column].iteritems(): gene = None if subset_by_gene and ('_del' in column or column == 'vd_insertion' or column == 'dj_insertion'): # option to subset deletion and (real) insertion plots by gene if '_del' in column: region = column[0] else: region = column[1] assert region in utils.regions assert 'IGH' + region.upper() in index[1] # NOTE this is hackey, but it works find now and will fail obviously gene = index[1] # if I ever change the correlations to be incompatible. 
so screw it if gene not in gene_values: gene_values[gene] = {} column_val = index[0] if gene is not None: if column_val not in gene_values[gene]: gene_values[gene][column_val] = 0.0 gene_values[gene][column_val] += count if column_val not in values: values[column_val] = 0.0 values[column_val] += count try: # figure out whether this is an integer or string (only used outside this loop when we make the plots) int(column_val) var_type = 'int' except: var_type = 'string' if subset_by_gene and ('_del' in column or column == 'vd_insertion' or column == 'dj_insertion'): # option to subset deletion and (real) insertion plots by gene thisplotdir = plotdir + '/' + column utils.prep_dir(thisplotdir + '/plots', multilings=['*.csv', '*.svg']) for gene in gene_values: plotname = utils.sanitize_name(gene) + '-' + column hist = plotting.make_hist_from_dict_of_counts(gene_values[gene], var_type, plotname, sort=True) plotting.draw(hist, var_type, plotname=plotname, plotdir=thisplotdir, errors=True, write_csv=True) check_call(['./bin/makeHtml', thisplotdir, '3', 'null', 'svg']) check_call(['./bin/permissify-www', thisplotdir]) # NOTE this should really permissify starting a few directories higher up plotname = column hist = plotting.make_hist_from_dict_of_counts(values, var_type, plotname, sort=True) plotting.draw(hist, var_type, plotname=plotname, plotdir=plotdir, errors=True, write_csv=True) self.mutefreqer.plot(plotdir, cyst_positions, tryp_positions) #, mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv') # REGION is replace by each region in the three output files if has_root: check_call(['./bin/makeHtml', plotdir, '3', 'null', 'svg']) check_call(['./bin/permissify-www', plotdir]) # NOTE this should really permissify starting a few directories higher up print ' parameter plot time: %.3f' % (time.time()-start) # ---------------------------------------------------------------------------------------- def write(self, base_outdir): print ' writing parameters' start = 
time.time() utils.prep_dir(base_outdir, multilings=('*.csv', '*.svg')) mute_start = time.time() self.mutefreqer.write(base_outdir, mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv') # REGION is replace by each region in the three output files) print ' mut freq write time: %.3f' % (time.time() - mute_start) # print ' %d / %d cached' % (self.mutefreqer.n_cached, self.mutefreqer.n_cached + self.mutefreqer.n_not_cached) for column in self.counts: index = None outfname = None if column == 'all': index = utils.index_columns outfname = base_outdir + '/' + utils.get_parameter_fname(column='all') elif '_content' in column: index = [column,] outfname = base_outdir + '/' + column + '.csv' else: index = [column,] + utils.column_dependencies[column] outfname = base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index) if os.path.isfile(outfname): os.remove(outfname) elif not os.path.exists(base_outdir): os.makedirs(base_outdir) with opener('w')(outfname) as outfile: out_fieldnames = list(index) out_fieldnames.append('count') out_data = csv.DictWriter(outfile, out_fieldnames) out_data.writeheader() # NOTE this will in general not be sorted for key, count in self.counts[column].iteritems(): line = {} for ic in range(len(key)): line[index[ic]] = key[ic] line['count'] = count out_data.writerow(line) print ' parameter write time: %.3f' % (time.time()-start)