def print_stuff(line): cluster_index = sorted_clusters.index(cluster) naive_cdr3, matureiseq0_cdr3 = utils.subset_sequences( line, iseq=0, restrict_to_region='cdr3' ) # line['naive_seq'][(line['codon_positions']['v']):((line['codon_positions']['j'])+3)] #get nt sequence of CDR3 from first base of cysteine through last base of tryptophan # mature_cdr3_seqs = [] # trying to translate the consensus cdr3 so I can search these with my seed seqs # for iseq in range(len(line['unique_ids'])): # naive_cdr3_seq, mature_cdr3_seq = utils.subset_sequences(line, iseq=iseq, restrict_to_region='cdr3') # mature_cdr3_seqs.append(mature_cdr3_seq) # translated_cdr3 = Seq().... not done cdr3_aa = '%-30s' % Seq(naive_cdr3).translate() if any('-ig' in s for s in line['unique_ids']): cdr3_aa = utils.color('red', cdr3_aa, width=30) print '%4s %s %s %s %5d %5d %5d %7.3f %8.4f %2d %s %4.2f' % ( cluster_index, utils.color_gene(line['v_gene'], width=15), utils.color_gene(line['d_gene'], width=15), utils.color_gene(line['j_gene'], width=10), len(line['unique_ids']), numpy.mean(line['n_mutations']), numpy.median(line['n_mutations']), numpy.mean(line['mut_freqs']), float(len(cluster)) / n_total, (line['cdr3_length'] / 3), cdr3_aa, utils.fay_wu_h(line, debug=False), )
def print_match(self, region, gene, query_seq, score, glbounds, qrbounds, codon_pos, warnings, skipping=False): if self.debug < 2: return out_str_list = [] buff_str = (20 - len(gene)) * ' ' tmp_val = score if self.args.apply_choice_probs_in_sw and self.get_choice_prob(region, gene) != 0.0: tmp_val = score / self.get_choice_prob(region, gene) if self.args.apply_choice_probs_in_sw: out_str_list.append('%8s%s%s%9.1e * %3.0f = %-6.1f' % (' ', utils.color_gene(gene), buff_str, self.get_choice_prob(region, gene), tmp_val, score)) else: out_str_list.append('%8s%s%s%9s%3s %6.0f ' % (' ', utils.color_gene(gene), '', '', buff_str, score)) out_str_list.append('%4d%4d %s\n' % (glbounds[0], glbounds[1], self.germline_seqs[region][gene][glbounds[0]:glbounds[1]])) out_str_list.append('%46s %4d%4d' % ('', qrbounds[0], qrbounds[1])) out_str_list.append(' %s ' % (utils.color_mutants(self.germline_seqs[region][gene][glbounds[0]:glbounds[1]], query_seq[qrbounds[0]:qrbounds[1]]))) if region != 'd': out_str_list.append('(%s %d)' % (utils.conserved_codon_names[region], codon_pos)) if warnings[gene] != '': out_str_list.append('WARNING ' + warnings[gene]) if skipping: out_str_list.append('skipping!') if self.args.outfname is None: print ''.join(out_str_list) else: out_str_list.append('\n') self.outfile.write(''.join(out_str_list))
def add_new_allele(glfo, newfo, remove_template_genes=False, debug=False): """ Add a new allele to <glfo>, specified by <newfo> which is of the form: {'gene' : 'IGHV3-71*01+C35T.T47G', 'seq' : 'ACTG yadda yadda CGGGT', 'template-gene' : 'IGHV3-71*01'} If <remove_template_genes>, we also remove 'template-gene' from <glfo>. """ template_gene = newfo["template-gene"] region = utils.get_region(template_gene) if template_gene not in glfo["seqs"][region]: raise Exception("unknown template gene %s" % template_gene) new_gene = newfo["gene"] if region == "v": glfo["cyst-positions"][new_gene] = glfo["cyst-positions"][template_gene] elif region == "j": glfo["tryp-positions"][new_gene] = glfo["tryp-positions"][template_gene] glfo["seqs"][region][new_gene] = newfo["seq"] if debug: print " adding new allele to glfo:" print " template %s %s" % (glfo["seqs"][region][template_gene], utils.color_gene(template_gene)) print " new %s %s" % ( utils.color_mutants(glfo["seqs"][region][template_gene], newfo["seq"]), utils.color_gene(new_gene), ) if remove_template_genes: remove_gene(glfo, template_gene, debug=True)
def make_transition_plot(self, gene_name, model): """ NOTE shares a lot with make_mutefreq_plot() in python/paramutils.py """ fig, ax = plotting.mpl_init() fig.set_size_inches(plotting.plot_ratios[utils.get_region(gene_name)]) ibin = 0 print utils.color_gene(utils.unsanitize_name(gene_name)) legend_colors = set() # add a color to this the first time you plot it for state in model.states: # bin label ax.text(-0.5 + ibin, -0.075, paramutils.simplify_state_name(state.name), rotation='vertical', size=8) sorted_to_states = {} for name in state.transitions.keys(): if name.find('IG') == 0: sorted_to_states[name] = int(paramutils.simplify_state_name(name)) else: sorted_to_states[name] = name sorted_to_states = sorted(sorted_to_states.items(), key=operator.itemgetter(1)) total = 0.0 for to_state, simple_to_state in sorted_to_states: prob = state.transitions[to_state] alpha = 0.6 width = 3 if 'insert' in str(simple_to_state): label = 'insert' color = '#3498db' # blue elif str(simple_to_state) == 'end': label = 'end' color = 'red' else: # regional/internal states assert to_state.find('IG') == 0 label = 'internal' color = 'green' label_to_use = None if color not in legend_colors: label_to_use = label legend_colors.add(color) # horizontal line at height total+prob ax.plot([-0.5 + ibin, 0.5 + ibin], [total + prob, total + prob], color=color, linewidth=width, alpha=alpha, label=label_to_use) # vertical line from total to total + prob ax.plot([ibin, ibin], [total + 0.01, total + prob], color=color, alpha=alpha, linewidth=width) midpoint = 0.5*(prob + 2*total) # ax.text(ibin, midpoint, paramutils.simplify_state_name(to_state)) # nicely labels the midpoint of the chunk between lines, but there isn't really room for it total += prob ibin += 1 ax.get_xaxis().set_visible(False) plotting.mpl_finish(ax, self.base_plotdir + '/transitions', gene_name, ybounds=(-0.01, 1.01), xbounds=(-3, len(model.states) + 3), leg_loc=(0.95, 0.1), adjust={'left' : 0.1, 'right' : 0.8}, leg_prop={'size' : 8})
def add_new_allele(self, gene, fitfo, n_candidate_snps, debug=False): # figure out what the new nukes are old_seq = self.glfo['seqs'][utils.get_region(gene)][gene] new_seq = old_seq mutfo = {} for pos in sorted(fitfo['candidates'][n_candidate_snps]): obs_counts = {nuke : self.counts[gene][pos][n_candidate_snps][nuke] for nuke in utils.nukes} # NOTE it's super important to only use the counts from sequences with <n_candidate_snps> total mutations sorted_obs_counts = sorted(obs_counts.items(), key=operator.itemgetter(1), reverse=True) original_nuke = self.mfreqer.counts[gene][pos]['gl_nuke'] new_nuke = None for nuke, _ in sorted_obs_counts: # take the most common one that isn't the existing gl nuke if nuke != original_nuke: new_nuke = nuke break print ' %3d (%s --> %s)' % (pos, original_nuke, new_nuke), assert old_seq[pos] == original_nuke mutfo[pos] = {'original' : original_nuke, 'new' : new_nuke} new_seq = new_seq[:pos] + new_nuke + new_seq[pos+1:] new_name, mutfo = glutils.get_new_allele_name_and_change_mutfo(gene, mutfo) print '' print ' %s %s' % (old_seq, utils.color_gene(gene)) print ' %s %s' % (utils.color_mutants(old_seq, new_seq), utils.color_gene(new_name)) # and add it to the set of new alleles for this gene self.new_allele_info.append({ 'template-gene' : gene, 'gene' : new_name, 'seq' : new_seq, 'aligned-seq' : None })
def get_dbg_str(indelfo): if len(indelfo['qr_gap_seq']) != len(indelfo['gl_gap_seq']): print indelfo['qr_gap_seq'] print indelfo['gl_gap_seq'] raise Exception('different length qr and gl gap seqs (see previous lines)') qrprintstr, glprintstr = [], [] for ich in range(len(indelfo['qr_gap_seq'])): qrb, glb = indelfo['qr_gap_seq'][ich], indelfo['gl_gap_seq'][ich] qrcolor, glcolor = None, None if qrb in utils.gap_chars or glb in utils.gap_chars: qrcolor = 'light_blue' glcolor = 'light_blue' elif qrb in utils.ambiguous_bases: qrcolor = 'light_blue' elif glb in utils.ambiguous_bases: glcolor = 'light_blue' elif qrb != glb: qrcolor = 'red' qrprintstr.append(utils.color(qrcolor, qrb if qrb not in utils.gap_chars else '*')) # change it to a start just cause that's what it originally was... at some point should switch to just leaving it whatever gap char it was glprintstr.append(utils.color(glcolor, glb if glb not in utils.gap_chars else '*')) qrprintstr = ''.join(qrprintstr) glprintstr = ''.join(glprintstr) gene_str = '' gwidth = str(len('query')) if 'v' in indelfo['genes']: gene_str = utils.color_gene(indelfo['genes']['v'], width=int(gwidth), leftpad=True) gwidth = str(utils.len_excluding_colors(gene_str)) dj_gene_str = ' '.join([utils.color_gene(indelfo['genes'][r]) for r in 'dj' if r in indelfo['genes']]) dbg_str_list = [(' %' + gwidth + 's %s %s') % (gene_str, glprintstr, dj_gene_str), (' %' + gwidth + 's %s') % ('query', qrprintstr)] for idl in indelfo['indels']: dbg_str_list.append('%10s: %d base%s at %d (%s)' % (idl['type'], idl['len'], utils.plural(idl['len']), idl['pos'], idl['seqstr'])) return '\n'.join(dbg_str_list)
def reassign_template_counts(self, msa_info, new_alleles, debug=False):
    """
    Fill self.adjusted_glcounts with per-gene counts, after deciding for each cluster whether to move counts from a template gene to its new allele.

    For each cluster in <msa_info> (largest first), and for each gene (and its counts) from self.get_glcounts(): if
    the gene was the template for one of <new_alleles>, the cluster's consensus sequence is compared to both the
    template and the new allele, and the counts go to the new allele if it's closer (in hamming distance) to the
    consensus. Returns without doing anything (i.e. without setting self.adjusted_glcounts) if <new_alleles> is empty.
    """
    # XXX need to update family_groups here
    if len(new_alleles) == 0:
        return

    if debug:
        # column headers (the trailing commas keep everything on one line until the final empty print)
        print ' template new'
        print ' size snps snps assigned',
        if self.reco_info is not None:
            print ' true',
        print ''

    dbg_print = debug  # don't print all the tiny clusters
    templates = {newfo['template-gene'] : newfo['gene'] for newfo in new_alleles.values()}  # map from each template gene to the name of the new allele that was inferred from it
    self.adjusted_glcounts = {}
    for clusterfo in sorted(msa_info, key=lambda cfo: len(cfo['seqfos']), reverse=True):
        sorted_glcounts, true_sorted_glcounts = self.get_glcounts(clusterfo)  # it would be nice to not re-call this for the clusters we already called it on above
        for gene, counts in sorted_glcounts:  # <gene> is the one assigned by sw before allele clustering
            if debug and len(clusterfo['seqfos']) < 5:
                # clusters are sorted by decreasing size, so once we get to a small one we turn off printing for good
                if dbg_print:
                    print ' not printing clusters smaller than 5'
                dbg_print = False

            if gene not in self.adjusted_glcounts:  # add it before we decide whether to switch it, so a template gene with zero counts will be in there with zero counts
                self.adjusted_glcounts[gene] = 0
            if gene in templates:  # if this was a template for a new allele, we have to decide whether to apportion some or all of the sequences in this cluster to that new allele
                template_gene = gene
                template_cpos = utils.cdn_pos(self.glfo, self.region, template_gene)
                cons_seq = clusterfo['cons_seq']
                template_seq = self.glfo['seqs'][self.region][template_gene]
                new_allele_seq = new_alleles[templates[template_gene]]['seq']

                # NOTE this doesn't account for indels, i.e. the template and consensus sequences are in general different lengths, but that's ok, it'll just inflate the hamming distance for sequences that differ from consensus by indels, and all we care is finding the one that doesn't have any indels
                compare_len = min([template_cpos, len(cons_seq), len(template_seq), len(new_allele_seq)])
                n_template_snps = utils.hamming_distance(cons_seq[:compare_len], template_seq[:compare_len])
                n_new_snps = utils.hamming_distance(cons_seq[:compare_len], new_allele_seq[:compare_len])

                if debug and dbg_print:
                    print ' %5d %3d %3d' % (len(clusterfo['seqfos']), n_template_snps, n_new_snps),

                if n_new_snps < n_template_snps:  # reassign to the new allele
                    gene = templates[template_gene]
                    if gene not in self.adjusted_glcounts:  # the new allele also needs an entry before we add to it below
                        self.adjusted_glcounts[gene] = 0

                if debug and dbg_print:
                    print ' %s' % utils.color_gene(gene, width=15),
                    if self.reco_info is not None:
                        true_gene = true_sorted_glcounts[0][0]  # NOTE this is the most *common* simulated gene in the cluster, not necessarily the one corresponding to these particular sequences... but clusters with new alleles should generally be dominated by one gene, so oh, well
                        if true_gene == gene:
                            print ' %s' % utils.color('green', 'ok'),
                        else:
                            print ' %s' % utils.color_gene(true_gene, width=15),
                    print ''

            self.adjusted_glcounts[gene] += counts  # <gene> is the new allele at this point if we reassigned above

    if debug:
        print ' final counts:'
        for gene, counts in sorted(self.adjusted_glcounts.items(), key=operator.itemgetter(1), reverse=True):
            print ' %4d %s' % (counts, utils.color_gene(gene))
def make_transition_plot(self, gene_name, model): """ NOTE shares a lot with make_mutefreq_plot() in python/paramutils.py """ fig, ax = plotting.mpl_init() fig.set_size_inches(plotting.plot_ratios[utils.get_region(gene_name)]) ibin = 0 print utils.color_gene(utils.unsanitize_name(gene_name)) legend_colors = set() # add a color to this the first time you plot it for state in model.states: # bin label ax.text(-0.5 + ibin, -0.075, paramutils.simplify_state_name(state.name), rotation='vertical', size=8) sorted_to_states = {} for name in state.transitions.keys(): if name.find('IG') == 0 or name.find('TR') == 0: sorted_to_states[name] = int(paramutils.simplify_state_name(name)) else: sorted_to_states[name] = name sorted_to_states = sorted(sorted_to_states.items(), key=operator.itemgetter(1)) total = 0.0 for to_state, simple_to_state in sorted_to_states: prob = state.transitions[to_state] alpha = 0.6 width = 3 if 'insert' in str(simple_to_state): label = 'insert' color = '#3498db' # blue elif str(simple_to_state) == 'end': label = 'end' color = 'red' else: # regional/internal states assert to_state.find('IG') == 0 or to_state.find('TR') == 0 label = 'internal' color = 'green' label_to_use = None if color not in legend_colors: label_to_use = label legend_colors.add(color) # horizontal line at height total+prob ax.plot([-0.5 + ibin, 0.5 + ibin], [total + prob, total + prob], color=color, linewidth=width, alpha=alpha, label=label_to_use) # vertical line from total to total + prob ax.plot([ibin, ibin], [total + 0.01, total + prob], color=color, alpha=alpha, linewidth=width) midpoint = 0.5*(prob + 2*total) # ax.text(ibin, midpoint, paramutils.simplify_state_name(to_state)) # nicely labels the midpoint of the chunk between lines, but there isn't really room for it total += prob ibin += 1 ax.get_xaxis().set_visible(False) plotting.mpl_finish(ax, self.base_plotdir + '/transitions', gene_name, ybounds=(-0.01, 1.01), xbounds=(-3, len(model.states) + 3), leg_loc=(0.95, 0.1), 
adjust={'left' : 0.1, 'right' : 0.8}, leg_prop={'size' : 8})
def finalize(self, sorted_gene_counts, debug=False):
    """
    Decide which genes to keep, setting self.genes_to_keep and self.genes_to_remove (and self.finalized).

    <sorted_gene_counts> is a list of (gene, counts) pairs. Genes are split into classes by
    self.separate_into_classes(), and within each class we keep the first gene, plus subsequent ones until we have
    self.args.n_alleles_per_gene of them, skipping any whose sequence is identical to that of the first gene in the
    class. Genes whose fraction of the total counts is below self.args.min_allele_prevalence_fraction are never
    kept, and we stop adding genes if we reach self.args.n_max_total_alleles (if it's set).
    Everything in self.glfo['seqs'][self.region] that we didn't keep goes in self.genes_to_remove.
    """
    # NOTE <sorted_gene_counts> is usually/always floats instead of integers
    assert not self.finalized
    easycounts = {gene : counts for gene, counts in sorted_gene_counts}
    total_counts = sum([counts for counts in easycounts.values()])

    self.genes_to_keep = set()

    if debug:
        print ' removing least likely genes (%.1f total counts)' % total_counts
        print ' %-20s %5s (%s) removed genes (counts)' % ('genes to keep', 'counts', 'snps'),
        def count_str(cnt):  # one decimal place for small counts, none for larger ones (only used for debug printing)
            if cnt < 10.:
                return '%.1f' % cnt
            else:
                return '%.0f' % cnt

    class_counts = self.separate_into_classes(sorted_gene_counts, easycounts)
    for iclass in range(len(class_counts)):
        gclass = class_counts[iclass]
        n_from_this_class = 0
        for ig in range(len(gclass)):
            gfo = gclass[ig]
            if self.args.n_max_total_alleles is not None and len(self.genes_to_keep) >= self.args.n_max_total_alleles:  # command line can specify the total number of alleles
                break  # NOTE only breaks out of this class, but the same check stops us adding anything from later classes

            if float(gfo['counts']) / total_counts < self.args.min_allele_prevalence_fraction:  # always skip everybody that's super uncommon
                pass
            elif ig == 0:  # keep the first one from this class
                self.genes_to_keep.add(gfo['gene'])
                n_from_this_class += 1
            elif utils.hamming_distance(gclass[0]['seq'], gclass[ig]['seq']) == 0:  # don't keep it if it's indistinguishable from the most common one (the matches are probably mostly really the best one)
                pass  # don't keep it
            elif n_from_this_class < self.args.n_alleles_per_gene:  # always keep the most common <self.args.n_alleles_per_gene> in each class
                self.genes_to_keep.add(gfo['gene'])
                n_from_this_class += 1
            else:
                pass  # don't keep it

            # each kept gene starts a new line (the leading '\n' ends the previous line, which was left open by a trailing comma)
            if debug and gfo['gene'] in self.genes_to_keep:
                snpstr = ' ' if ig == 0 else '(%d)' % utils.hamming_distance(gclass[0]['seq'], gfo['seq'])
                print '\n %-s %7s %-3s' % (utils.color_gene(gfo['gene'], width=20), count_str(gfo['counts']), snpstr),
        if debug:
            if n_from_this_class == 0:
                print '\n %-s %7s %-3s' % (utils.color('blue', 'none', width=20, padside='right'), '-', ''),
            # the genes from this class that we didn't keep go at the end of the line for the last kept gene
            removedfo = [gfo for gfo in gclass if gfo['gene'] not in self.genes_to_keep]
            if len(removedfo) > 0:
                removal_strs = ['%s (%s)' % (utils.color_gene(gfo['gene']), count_str(gfo['counts'])) for gfo in removedfo]
                print ' %s' % ' '.join(removal_strs),
    if debug:
        print ''

    self.genes_to_remove = set(self.glfo['seqs'][self.region]) - self.genes_to_keep

    print ' keeping %d / %d %s gene%s' % (len(self.genes_to_keep), len(self.glfo['seqs'][self.region]), self.region, utils.plural(len(self.genes_to_keep)))
    # print ' removing %d %s genes: %d with no matches, %d with unconvincing matches' % (len(self.genes_to_remove), self.region, len(set(self.glfo['seqs'][self.region]) - set(easycounts)), len(set(easycounts) - self.genes_to_keep))

    self.finalized = True
def remove_gene(glfo, gene, debug=False): """ remove <gene> from <glfo> """ region = utils.get_region(gene) if gene in glfo["seqs"][region]: if debug: print " removing %s from glfo" % utils.color_gene(gene) del glfo["seqs"][region][gene] if region in utils.conserved_codons[glfo["chain"]]: del glfo[utils.conserved_codons[glfo["chain"]][region] + "-positions"][gene] else: if debug: print " can't remove %s from glfo, it's not there" % utils.color_gene(gene)
def remove_gene(glfo, gene, debug=False): """ remove <gene> from <glfo> """ region = utils.get_region(gene) if gene in glfo['seqs'][region]: if debug: print ' removing %s from glfo' % utils.color_gene(gene) del glfo['seqs'][region][gene] if region in utils.conserved_codons[glfo['locus']]: del glfo[utils.conserved_codons[glfo['locus']][region] + '-positions'][gene] else: if debug: print ' can\'t remove %s from glfo, it\'s not there' % utils.color_gene(gene)
def find_partial_failures(self, fostream_name): unique_ids = [] for line in open(fostream_name.replace('.fostream', '')).readlines(): if len(self.sim_need) == 0: return if len(line.strip()) == 0: # skip blank lines continue line = line.replace('"', '') line = line.split(';') unique_id = line[0] if 'NA' not in line: # skip lines that were ok unique_ids.append(unique_id) continue if unique_id not in self.sim_need: continue if unique_id not in self.siminfo: continue # not looking for this <unique_id> a.t.m. info = {} info['unique_id'] = unique_id for stuff in line: for region in utils.regions: # add the first instance of IGH[VDJ] (if it's there at all) if 'IGH' + region.upper( ) in stuff and region + '_gene' not in info: genes = re.findall( 'IGH' + region.upper() + '[^ ][^ ]*', stuff) if len(genes) == 0: print 'ERROR no %s genes in %s' % (region, stuff) gene = genes[0] if gene not in self.germline_seqs[region]: print 'ERROR bad gene %s for %s' % (gene, unique_id) sys.exit() info[region + '_gene'] = gene self.perfplotter.add_partial_fail(self.siminfo[unique_id], info) if self.args.debug: print '%-20s partial fail %s %s %s' % ( unique_id, utils.color_gene(info['v_gene']) if 'v_gene' in info else '', utils.color_gene(info['d_gene']) if 'd_gene' in info else '', utils.color_gene(info['j_gene']) if 'j_gene' in info else ''), print ' (true %s %s %s)' % tuple([ self.siminfo[unique_id][region + '_gene'] for region in utils.regions ]) self.failtails[unique_id] = info self.n_partially_failed += 1 self.sim_need.remove(unique_id) return unique_ids
def find_partial_failures(self, fostream_name): unique_ids = [] for line in open(fostream_name.replace(".fostream", "")).readlines(): if len(self.sim_need) == 0: return if len(line.strip()) == 0: # skip blank lines continue line = line.replace('"', "") line = line.split(";") unique_id = line[0] if "NA" not in line: # skip lines that were ok unique_ids.append(unique_id) continue if unique_id not in self.sim_need: continue if unique_id not in self.siminfo: continue # not looking for this <unique_id> a.t.m. info = {} info["unique_id"] = unique_id for stuff in line: for region in utils.regions: # add the first instance of IGH[VDJ] (if it's there at all) if "IGH" + region.upper() in stuff and region + "_gene" not in info: genes = re.findall("IGH" + region.upper() + "[^ ][^ ]*", stuff) if len(genes) == 0: print "ERROR no %s genes in %s" % (region, stuff) gene = genes[0] if gene not in self.germline_seqs[region]: print "ERROR bad gene %s for %s" % (gene, unique_id) sys.exit() info[region + "_gene"] = gene self.perfplotter.add_partial_fail(self.siminfo[unique_id], info) if self.args.debug: print "%-20s partial fail %s %s %s" % ( unique_id, utils.color_gene(info["v_gene"]) if "v_gene" in info else "", utils.color_gene(info["d_gene"]) if "d_gene" in info else "", utils.color_gene(info["j_gene"]) if "j_gene" in info else "", ), print " (true %s %s %s)" % tuple( [self.siminfo[unique_id][region + "_gene"] for region in utils.regions] ) self.failtails[unique_id] = info self.n_partially_failed += 1 self.sim_need.remove(unique_id) return unique_ids
def write_hmm_input(self, csv_fname, sw_info, parameter_dir, preclusters=None, hmm_type='', pair_hmm=False, stripped=False): print ' writing input' csvfile = opener('w')(csv_fname) start = time.time() # write header header = ['names', 'k_v_min', 'k_v_max', 'k_d_min', 'k_d_max', 'only_genes', 'seqs'] # I wish I had a good c++ csv reader csvfile.write(' '.join(header) + '\n') skipped_gene_matches = set() assert hmm_type != '' if hmm_type == 'k=1': # single vanilla hmm nsets = [[qn] for qn in self.input_info.keys()] elif hmm_type == 'k=2': # pair hmm nsets = self.get_pairs(preclusters) elif hmm_type == 'k=preclusters': # run the k-hmm on each cluster in <preclusters> assert preclusters != None nsets = [ val for key, val in preclusters.id_clusters.items() if len(val) > 1 ] # <nsets> is a list of sets (well, lists) of query names # nsets = [] # for cluster in preclusters.id_clusters.values(): # nsets += itertools.combinations(cluster, 5) elif hmm_type == 'k=nsets': # run on *every* combination of queries which has length <self.args.n_sets> if self.args.all_combinations: nsets = itertools.combinations(self.input_info.keys(), self.args.n_sets) else: # put the first n together, and the second group of n (not the self.input_info is and OrderedDict) nsets = [] keylist = self.input_info.keys() this_set = [] for iquery in range(len(keylist)): if iquery % self.args.n_sets == 0: # every nth query, start a new group if len(this_set) > 0: nsets.append(this_set) this_set = [] this_set.append(keylist[iquery]) if len(this_set) > 0: nsets.append(this_set) else: assert False for query_names in nsets: non_failed_names = self.remove_sw_failures(query_names, sw_info) if len(non_failed_names) == 0: continue combined_query = self.combine_queries(sw_info, non_failed_names, parameter_dir, stripped=stripped, skipped_gene_matches=skipped_gene_matches) if len(combined_query) == 0: # didn't find all regions continue csvfile.write('%s %d %d %d %d %s %s\n' % # NOTE csv.DictWriter can handle tsvs, 
so this should really be switched to use that (':'.join([str(qn) for qn in non_failed_names]), combined_query['k_v']['min'], combined_query['k_v']['max'], combined_query['k_d']['min'], combined_query['k_d']['max'], ':'.join(combined_query['only_genes']), ':'.join(combined_query['seqs']))) if len(skipped_gene_matches) > 0: print ' not found in %s, i.e. were never the best sw match for any query, so removing from consideration for hmm:' % (parameter_dir) for region in utils.regions: print ' %s: %s' % (region, ' '.join([utils.color_gene(gene) for gene in skipped_gene_matches if utils.get_region(gene) == region])) csvfile.close() print ' input write time: %.3f' % (time.time()-start)
def skip_gene(gene): if self.args.debug: print ' %s in list of genes to skip' % utils.color_gene(gene) if gene not in genes_actually_skipped: genes_actually_skipped[gene] = 0 genes_actually_skipped[gene] += 1 qr_info['skip_gene'] = True
def sim_gene_count_str(kgene):
    """
    Return a string with the simulation genes (and their counts) for the uids that were assigned to <kgene>.

    The counts come first, sorted in decreasing order (blue if the simulation gene is <kgene>, otherwise red),
    followed by the colored simulation genes in the same order.
    Returns an empty string if either <annotations> or self.reco_info is None.
    NOTE <annotations>, <region>, and <self> come from the enclosing scope.
    """
    if annotations is None or self.reco_info is None:
        return ''

    gene_key = region + '_gene'
    uids_this_gene = [uid for uid, line in annotations.items() if line[gene_key] == kgene]

    # simulation genes for the uids that we assigned to <kgene> (note that self.simcounts doesn't have this per-uid information)
    sim_genes = {}
    for uid in uids_this_gene:
        sgene = self.reco_info[uid][gene_key]
        sim_genes[sgene] = sim_genes.get(sgene, 0) + 1

    by_decreasing_count = sorted(sim_genes.items(), key=operator.itemgetter(1), reverse=True)
    count_strs, gene_strs = [], []
    for sgene, count in by_decreasing_count:
        count_strs.append(utils.color('blue' if sgene == kgene else 'red', str(count)))
    for sgene, _ in by_decreasing_count:
        gene_strs.append(utils.color_gene(sgene))
    return '%s %s' % (' '.join(count_strs), ' '.join(gene_strs))
def add_new_allele(glfo, newfo, remove_template_genes, debug=False): """ Add a new allele to <glfo>, specified by <newfo> which is of the form: {'template-gene' : 'IGHV3-71*01', 'gene' : 'IGHV3-71*01+C35T.T47G', 'seq' : 'ACTG yadda yadda CGGGT'} If <remove_template_genes>, we also remove 'template-gene' from <glfo>. """ template_gene = newfo['template-gene'] region = utils.get_region(template_gene) if template_gene not in glfo['seqs'][region]: raise Exception('unknown template gene %s' % template_gene) new_gene = newfo['gene'] if region == 'v': glfo['cyst-positions'][new_gene] = glfo['cyst-positions'][template_gene] elif region == 'j': glfo['tryp-positions'][new_gene] = glfo['tryp-positions'][template_gene] glfo['seqs'][region][new_gene] = newfo['seq'] if debug: print ' adding new allele to glfo:' print ' template %s %s' % (glfo['seqs'][region][template_gene], utils.color_gene(template_gene)) print ' new %s %s' % (utils.color_mutants(glfo['seqs'][region][template_gene], newfo['seq']), utils.color_gene(new_gene)) if remove_template_genes: remove_gene(glfo, template_gene, debug=True)
def skip_gene(gene): print ' %s in list of genes to skip' % utils.color_gene( gene) if gene not in genes_actually_skipped: genes_actually_skipped[gene] = 0 genes_actually_skipped[gene] += 1 line['skip_gene'] = True
def choose_some_alleles(region, genes_to_use, allelic_groups, n_alleles_per_gene, debug=False):
    """
    Choose a gene (i.e. a primary and sub-version) from <allelic_groups>, and its attendant alleles.

    The number of alleles is chosen at random from n_alleles_per_gene[<region>], then a (primary version,
    sub-version) pair is chosen uniformly at random from those with at least that many alleles (if there aren't
    any, we choose a new number of alleles and try again), and finally that many of its alleles are chosen at random.
    The chosen alleles are added to <genes_to_use>.
    NOTE also modifies <allelic_groups>: the chosen sub-version is removed (as is its primary version, if it has no sub-versions left).

    Raises an Exception if there's nothing left in allelic_groups[<region>].
    """
    if len(allelic_groups[region]) == 0:
        raise Exception('ran out of %s alleles (either --n-genes-per-region or --n-alleles-per-gene are probably too big)' % region)

    available_versions = None
    while available_versions is None or len(available_versions) == 0:
        if available_versions is not None:  # i.e. not the first time through
            print(' %s couldn\'t find any versions that have %d alleles, so trying again' % (utils.color('red', 'warning'), n_alleles))
        n_alleles = numpy.random.choice(n_alleles_per_gene[region])
        available_versions = [(pv, subv) for pv in allelic_groups[region] for subv in allelic_groups[region][pv] if len(allelic_groups[region][pv][subv]) >= n_alleles]

    # numpy.random.choice() can't handle list of tuples (and barfs if you give it only one thing to choose from), so choose an index instead
    # NOTE the upper bound of numpy.random.randint() is exclusive, so it has to be the full length (otherwise we'd never choose the last version)
    ichoice = numpy.random.randint(0, len(available_versions)) if len(available_versions) > 1 else 0
    primary_version, sub_version = available_versions[ichoice]
    new_alleles = set(numpy.random.choice(list(allelic_groups[region][primary_version][sub_version]), size=n_alleles, replace=False))
    if debug:
        print(' %8s %5s %s' % (primary_version, sub_version, ' '.join([utils.color_gene(g, width=15) for g in new_alleles])))

    assert len(new_alleles & genes_to_use) == 0  # make sure none of the new alleles are already in <genes_to_use>
    genes_to_use |= new_alleles  # actually add them to the final set

    # remove stuff we've used from <allelic_groups>
    del allelic_groups[region][primary_version][sub_version]  # remove this sub-version (we don't want any more alleles from it)
    if len(allelic_groups[region][primary_version]) == 0:
        del allelic_groups[region][primary_version]
def add_new_allele(glfo, newfo, remove_template_genes=False, debug=False): """ Add a new allele to <glfo>, specified by <newfo> which is of the form: {'gene' : 'IGHV3-71*01+C35T.T47G', 'seq' : 'ACTG yadda yadda CGGGT', 'template-gene' : 'IGHV3-71*01'} If <remove_template_genes>, we also remove 'template-gene' from <glfo>. """ template_gene = newfo['template-gene'] region = utils.get_region(template_gene) if template_gene not in glfo['seqs'][region]: raise Exception('unknown template gene %s' % template_gene) new_gene = newfo['gene'] if region == 'v': glfo['cyst-positions'][new_gene] = glfo['cyst-positions'][template_gene] elif region == 'j': glfo['tryp-positions'][new_gene] = glfo['tryp-positions'][template_gene] glfo['seqs'][region][new_gene] = newfo['seq'] if debug: print ' adding new allele to glfo:' print ' template %s %s' % (glfo['seqs'][region][template_gene], utils.color_gene(template_gene)) print ' new %s %s' % (utils.color_mutants(glfo['seqs'][region][template_gene], newfo['seq']), utils.color_gene(new_gene)) if remove_template_genes: remove_gene(glfo, template_gene, debug=True)
def trim_and_remove_genes(region, gene, seq, glfo, template_glfo, debug=False): nearest_template_gene = glutils.find_nearest_gene_using_names( template_glfo, gene) nearest_template_seq = template_glfo['seqs'][region][nearest_template_gene] # extra_bases = glfo['cyst-positions'][gene] - template_glfo['cyst-positions'][nearest_template_gene] # not right if there's some internal gaps in the alignment aligned_nearest_template_seq, aligned_seq = utils.align_seqs( nearest_template_seq, seq) if debug: print ' %s' % utils.color_gene(gene) utils.color_mutants(aligned_nearest_template_seq, aligned_seq, print_result=True, ref_label='template ', extra_str=' ') if aligned_seq[0] not in utils.gap_chars and aligned_nearest_template_seq[ 0] not in utils.gap_chars: if debug: print ' ok' elif aligned_seq[0] in utils.gap_chars: if debug: print ' %s, removing' % utils.color('red', 'too small') glutils.remove_gene(glfo, gene) else: if debug: print ' extra bases %s' % utils.color_gene(gene) extra_bases = len(aligned_nearest_template_seq) - len( aligned_nearest_template_seq.lstrip('-')) seq = seq[extra_bases:] if debug: print ' removed %d bases' % extra_bases if seq in glfo['seqs'][region].values(): print ' trimmed seq already in glfo under name %s, so removing it' % ' '.join( [ utils.color_gene(g) for g, s in glfo['seqs'][region].items() if s == seq ]) glutils.remove_gene(glfo, gene, debug=True) return glfo['seqs'][region][gene] = seq glfo['cyst-positions'][gene] -= extra_bases # utils.color_mutants(nearest_template_seq, seq, print_result=True, ref_label='template ', align=True, extra_str=' ') assert utils.codon_unmutated('cyst', glfo['seqs'][region][gene], glfo['cyst-positions'][gene], debug=True)
def print_match(self, region, gene, query_seq, score, glbounds, qrbounds, codon_pos, warnings, skipping=False): if self.debug < 2: return out_str_list = [] buff_str = (20 - len(gene)) * ' ' tmp_val = score if self.args.apply_choice_probs_in_sw and self.get_choice_prob( region, gene) != 0.0: tmp_val = score / self.get_choice_prob(region, gene) if self.args.apply_choice_probs_in_sw: out_str_list.append( '%8s%s%s%9.1e * %3.0f = %-6.1f' % (' ', utils.color_gene(gene), buff_str, self.get_choice_prob(region, gene), tmp_val, score)) else: out_str_list.append( '%8s%s%s%9s%3s %6.0f ' % (' ', utils.color_gene(gene), '', '', buff_str, score)) out_str_list.append( '%4d%4d %s\n' % (glbounds[0], glbounds[1], self.germline_seqs[region][gene][glbounds[0]:glbounds[1]])) out_str_list.append('%46s %4d%4d' % ('', qrbounds[0], qrbounds[1])) out_str_list.append(' %s ' % (utils.color_mutants( self.germline_seqs[region][gene][glbounds[0]:glbounds[1]], query_seq[qrbounds[0]:qrbounds[1]]))) if region != 'd': out_str_list.append( '(%s %d)' % (utils.conserved_codon_names[region], codon_pos)) if warnings[gene] != '': out_str_list.append('WARNING ' + warnings[gene]) if skipping: out_str_list.append('skipping!') if self.args.outfname is None: print ''.join(out_str_list) else: out_str_list.append('\n') self.outfile.write(''.join(out_str_list))
def parse_ramesh_seqs(glseqs, outdir, debug=False): for locus in glseqs: glutils.remove_glfo_files(outdir, locus) # write to a glfo dir without extra info for region in glseqs[locus]: fn = glutils.get_fname(outdir, locus, region) if not os.path.exists(os.path.dirname(fn)): os.makedirs(os.path.dirname(fn)) with open(fn, 'w') as ofile: for gene, seq in glseqs[locus][region].items(): ofile.write('>%s\n%s\n' % (gene, seq)) # figure out extra info template_glfo = glutils.read_glfo('data/germlines/macaque', locus) glfo = glutils.read_glfo(outdir, locus, template_glfo=template_glfo, remove_bad_genes=True, debug=True) # trim non-coding stuff upstream of v (and remove non-full-length ones) gene_groups = {} for region in ['v']: group_labels = sorted( set([utils.gene_family(g) for g in glfo['seqs'][region]])) gene_groups[region] = [(glabel, { g: glfo['seqs'][region][g] for g in glfo['seqs'][region] if utils.gene_family(g) == glabel }) for glabel in group_labels] for region in [r for r in utils.regions if r in gene_groups]: if debug: print '%s' % utils.color('reverse_video', utils.color('green', region)) for group_label, group_seqs in gene_groups[ region]: # ok, this isn't really doing anything any more if debug: print ' %s' % utils.color('blue', group_label) for gene, seq in group_seqs.items(): trim_and_remove_genes(region, gene, seq, glfo, template_glfo, debug=debug) # remove any seqs with ambiguous bases for region in [r for r in utils.regions if r in glfo['seqs']]: for gene, seq in glfo['seqs'][region].items(): if utils.ambig_frac(seq) > 0.: if debug: print ' %d ambiguous bases: %s' % ( len(seq) * utils.ambig_frac(seq), utils.color_gene(gene)) glutils.remove_gene(glfo, gene) # glutils.print_glfo(glfo) # write final result glutils.write_glfo(outdir, glfo, debug=True)
def finalize(self, debug=False): assert not self.finalized self.mfreqer.finalize() start = time.time() gene_results = {'not_enough_obs_to_fit' : set(), 'didnt_find_anything_with_fit' : set(), 'new_allele' : set()} if debug: print '\nlooking for new alleles:' for gene in sorted(self.mfreqer.counts): if utils.get_region(gene) != 'v': continue if debug: print '\n%s (observed %d %s)' % (utils.color_gene(gene), self.gene_obs_counts[gene], utils.plural_str('time', self.gene_obs_counts[gene])) positions_to_try_to_fit, xyvals = self.get_positions_to_fit(gene, gene_results, debug=debug) if positions_to_try_to_fit is None: continue fitfo = {n : {} for n in ('min_snp_ratios', 'candidates')} for istart in range(1, self.n_max_snps): if debug: if istart == 1: print ' resid. / ndof' print ' position ratio (m=0 / m>%5.2f) muted / obs ' % self.big_y_icpt_bounds[0] print ' %d %s' % (istart, utils.plural_str('snp', istart)) subxyvals = {pos : {k : v[istart : istart + self.max_fit_length] for k, v in xyvals[pos].items()} for pos in positions_to_try_to_fit} self.fit_istart(gene, istart, positions_to_try_to_fit, subxyvals, fitfo, debug=debug) if istart not in fitfo['candidates']: # if it didn't get filled, we didn't have enough observations to do the fit break istart_candidates = [] if debug: print ' evaluating each snp hypothesis' print ' snps min ratio' for istart in fitfo['candidates']: if debug: print ' %2d %9s' % (istart, fstr(fitfo['min_snp_ratios'][istart])), if self.is_a_candidate(gene, fitfo, istart, debug=debug): istart_candidates.append(istart) if len(istart_candidates) > 0: n_candidate_snps = min(istart_candidates) # add the candidate with the smallest number of snps to the germline set, and run again (if the firs gene_results['new_allele'].add(gene) print '\n found a new allele candidate separated from %s by %d %s at %s:' % (utils.color_gene(gene), n_candidate_snps, utils.plural_str('snp', n_candidate_snps), utils.plural_str('position', n_candidate_snps)), 
self.add_new_allele(gene, fitfo, n_candidate_snps, debug=debug) else: gene_results['didnt_find_anything_with_fit'].add(gene) if debug: print ' no new alleles' if debug: print 'found new alleles for %d %s (there were also %d without new alleles, and %d without enough observations to fit)' % (len(gene_results['new_allele']), utils.plural_str('gene', len(gene_results['new_allele'])), len(gene_results['didnt_find_anything_with_fit']), len(gene_results['not_enough_obs_to_fit'])) print ' allele finding time: %.1f' % (time.time()-start) self.finalized = True
def find_partial_failures(self, fostream_name): unique_ids = [] for line in open(fostream_name.replace('.fostream', '')).readlines(): if len(self.sim_need) == 0: return if len(line.strip()) == 0: # skip blank lines continue line = line.replace('"', '') line = line.split(';') unique_id = line[0] if 'NA' not in line: # skip lines that were ok unique_ids.append(unique_id) continue if unique_id not in self.sim_need: continue if unique_id not in self.siminfo: continue # not looking for this <unique_id> a.t.m. info = {} info['unique_id'] = unique_id for stuff in line: for region in utils.regions: # add the first instance of IGH[VDJ] (if it's there at all) if 'IGH'+region.upper() in stuff and region+'_gene' not in info: genes = re.findall('IGH' + region.upper() + '[^ ][^ ]*', stuff) if len(genes) == 0: print 'ERROR no %s genes in %s' % (region, stuff) gene = genes[0] if gene not in self.germline_seqs[region]: print 'ERROR bad gene %s for %s' % (gene, unique_id) sys.exit() info[region + '_gene'] = gene self.perfplotter.add_partial_fail(self.siminfo[unique_id], info) if self.args.debug: print '%-20s partial fail %s %s %s' % (unique_id, utils.color_gene(info['v_gene']) if 'v_gene' in info else '', utils.color_gene(info['d_gene']) if 'd_gene' in info else '', utils.color_gene(info['j_gene']) if 'j_gene' in info else ''), print ' (true %s %s %s)' % tuple([self.siminfo[unique_id][region + '_gene'] for region in utils.regions]) self.failtails[unique_id] = info self.n_partially_failed += 1 self.sim_need.remove(unique_id) return unique_ids
def skip_gene(gene): if self.args.debug: print ' %s in list of genes to skip' % utils.color_gene( gene) if gene not in genes_actually_skipped: genes_actually_skipped[gene] = 0 genes_actually_skipped[gene] += 1 qr_info['skip_gene'] = True
def remove_gene(glfo, gene, debug=False): """ remove <gene> from <glfo> """ if debug: print ' removing %s from glfo' % utils.color_gene(gene) region = utils.get_region(gene) if region in utils.conserved_codons: del glfo[utils.conserved_codons[region] + '-positions'][gene] del glfo['seqs'][region][gene]
def set_per_gene_support(self, true_line, inf_line, region): if inf_line[region + '_per_gene_support'].keys()[0] != inf_line[region + '_gene']: print ' WARNING best-supported gene %s not same as viterbi gene %s' % (utils.color_gene(inf_line[region + '_per_gene_support'].keys()[0]), utils.color_gene(inf_line[region + '_gene'])) support = inf_line[region + '_per_gene_support'].values()[0] # sorted, ordered dict with gene : logprob key-val pairs if true_line[region + '_gene'] == inf_line[region + '_gene']: # NOTE this requires allele to be correct, but set_bool_column() does not self.hists[region + '_allele_right_vs_per_gene_support'].fill(support) else: self.hists[region + '_allele_wrong_vs_per_gene_support'].fill(support)
def get_single_performance(region, outdir, method, debug=False): sglfo = glutils.read_glfo(outdir + '/germlines/simulation', locus=args.locus) iglfo = glutils.read_glfo(outdir + '/' + method + '/sw/germline-sets', locus=args.locus) glutils.synchronize_glfos(ref_glfo=sglfo, new_glfo=iglfo, region=region) missing_alleles = set(sglfo['seqs'][region]) - set(iglfo['seqs'][region]) spurious_alleles = set(iglfo['seqs'][region]) - set(sglfo['seqs'][region]) if debug: if len(missing_alleles) > 0: print ' %2d missing %s' % (len(missing_alleles), ' '.join([utils.color_gene(g) for g in missing_alleles])) if len(spurious_alleles) > 0: print ' %2d spurious %s' % (len(spurious_alleles), ' '.join([utils.color_gene(g) for g in spurious_alleles])) if len(missing_alleles) == 0 and len(spurious_alleles) == 0: print ' none missing' return { 'missing' : len(missing_alleles), 'spurious' : len(spurious_alleles), 'total' : len([g for g in sglfo['seqs'][region] if '+' in g]), # anybody with a '+' should be a new allele }
def set_per_gene_support(self, true_line, inf_line, region): if inf_line[region + '_per_gene_support'].keys()[0] != inf_line[region + '_gene']: print ' WARNING best-supported gene %s not same as viterbi gene %s' % ( utils.color_gene( inf_line[region + '_per_gene_support'].keys()[0]), utils.color_gene(inf_line[region + '_gene'])) support = inf_line[region + '_per_gene_support'].values()[ 0] # sorted, ordered dict with gene : logprob key-val pairs if true_line[region + '_gene'] == inf_line[ region + '_gene']: # NOTE this requires allele to be correct, but set_bool_column() does not self.hists[region + '_allele_right_vs_per_gene_support'].fill(support) else: self.hists[region + '_allele_wrong_vs_per_gene_support'].fill(support)
def print_results(gl_sets): tmpfo = { 'missing': set(gl_sets['sim']) - set(gl_sets['inf']), 'spurious': set(gl_sets['inf']) - set(gl_sets['sim']), 'ok': set(gl_sets['inf']) & set(gl_sets['sim']) } for name, genes in tmpfo.items(): print ' %9s %2d: %s' % (name, len(genes), ' '.join( [utils.color_gene(g) for g in genes]))
def build_v_gene_set(glfo, introns): total_d_counts = {} refseqs = {} for d_gene, counts in introns.items(): total_d_counts[d_gene] = sum(counts.values()) for d_gene, _ in sorted(total_d_counts.items(), key=operator.itemgetter(1), reverse=True): counts = introns[d_gene] # first decide on the reference sequences refseq, column_counts = None, None for seq in sorted(counts, key=len, reverse=True): if refseq is None: # first one, i.e. the longest refseq = seq column_counts = [{n : 0 for n in utils.nukes} for i in range(len(refseq))] ioffset = len(refseq) - len(seq) partial_refseq = refseq[ioffset:] assert len(partial_refseq) == len(seq) for ibase in range(ioffset, len(refseq)): column_counts[ibase][seq[ibase - ioffset]] += counts[seq] refseqs[d_gene] = [] for basecounts in column_counts: most_common_base = sorted(basecounts.items(), key=operator.itemgetter(1), reverse=True)[0][0] refseqs[d_gene].append(most_common_base) refseqs[d_gene] = ''.join(refseqs[d_gene]) n_ok = 0 mutecounts = {} for seq in sorted(counts, key=len, reverse=True): # print ' %3d %150s' % (count, seq) partial_refseq = refseqs[d_gene][len(refseqs[d_gene]) - len(seq):] if seq == partial_refseq: n_ok += counts[seq] else: # utils.color_mutants(partial_refseq, seq, print_result=True, extra_str=' ') n_mutes = utils.hamming_distance(partial_refseq, seq) if n_mutes not in mutecounts: mutecounts[n_mutes] = 0 mutecounts[n_mutes] += counts[seq] print ' %s %4d / %-4d ok' % (utils.color_gene(d_gene, width=10), n_ok, n_ok + sum(mutecounts.values())), if len(mutecounts) > 0: print '(mean of %.1f mutations among the other %d' % (numpy.average(mutecounts.keys(), weights=mutecounts.values()), sum(mutecounts.values())), print '' # add the intronic v genes to glfo for d_gene, refseq in refseqs.items(): glfo['seqs']['v'][utils.generate_dummy_v(d_gene)] = refseq glfo['cyst-positions'][utils.generate_dummy_v(d_gene)] = len(refseq) - 3 # write a glfo dir with everything glutils.write_glfo(outdir + 
'/germlines/imgt-and-intronic', glfo, debug=True) # remove the original v genes, and write a glfo dir with just the intronic ones glutils.remove_genes(glfo, [g for g in glfo['seqs']['v'] if 'xDx' not in g], debug=True) glutils.write_glfo(outdir + '/germlines/intronic', glfo, debug=True)
def make_mutefreq_plot(plotdir, gene_name, positions): import plotting """ NOTE shares a lot with make_transition_plot() in bin/plot-hmms.py. """ nuke_colors = {'A' : 'red', 'C' : 'blue', 'G' : 'orange', 'T' : 'green'} fig, ax = plotting.mpl_init() fig.set_size_inches(plotting.plot_ratios[utils.get_region(gene_name)]) ibin = 0 print utils.color_gene(utils.unsanitize_name(gene_name)) legend_colors = set() for info in positions: posname = info['name'] # make label below bin ax.text(-0.5 + ibin, -0.075, simplify_state_name(posname), rotation='vertical', size=8) total = 0.0 alpha = 0.6 for nuke, prob in sorted(info['nuke_freqs'].items(), key=operator.itemgetter(1), reverse=True): color = nuke_colors[nuke] label_to_use = None if color not in legend_colors: label_to_use = nuke legend_colors.add(color) # horizontal line at height total+prob ax.plot([-0.5 + ibin, 0.5 + ibin], [total + prob, total + prob], color=color, alpha=alpha, linewidth=3, label=label_to_use) # vertical line from total to total + prob ax.plot([ibin, ibin], [total + 0.01, total + prob], color=color, alpha=alpha, linewidth=3) # # write [ACGT] at midpoint between total and total+prob # midpoint = 0.5*(prob + 2*total) # ... *redacted* total += prob ibin += 1 ax.get_xaxis().set_visible(False) plotting.mpl_finish(ax, plotdir, gene_name, ybounds=(-0.01, 1.01), xbounds=(-3, len(positions) + 3), leg_loc=(0.95, 0.1), adjust={'left' : 0.1, 'right' : 0.8}, leg_prop={'size' : 8})
def getvalstr(gene, val):
    """
    Return the table cell string for <gene> with value <val> (printed as 100 * val), or a placeholder cell of
    dashes if <gene> is None or is a d gene from a locus without d genes.
    Uses <cstr>, <estr>, <latex>, and <emph_genes> from the enclosing scope.
    """
    no_gene = gene is None or (utils.get_region(gene) == 'd' and not utils.has_d_gene(utils.get_locus(gene)))
    if no_gene:
        return '%s %5.2s %s %-16s%s' % (cstr, ' - ', cstr, ' - ', 4 * ' ' if latex else '')

    if not latex:
        gstr = utils.color_gene(gene, width=18)
    else:
        gstr = utils.shorten_gene_name(gene, use_one_based_indexing=True, n_max_mutstrs=5)
        if emph_genes is not None and gene in emph_genes:  # emphasized genes get bold red in the latex output
            gstr = '\\color{red}{\\textbf{%s}}' % gstr
    return '%s %s%5.2f%s %s %-20s' % (cstr, estr, 100 * val, estr, cstr, gstr)
def write_inf_glfo( args ): # read default glfo, restrict it to the specified alleles, and write to somewhere where all the methods can read it # NOTE this dir should *not* be modified by any of the methods inf_glfo = glutils.read_glfo('data/germlines/human', locus=args.locus, only_genes=args.inf_v_genes + args.dj_genes) print ' writing initial inference glfo with %d v: %s' % (len( inf_glfo['seqs']['v']), ' '.join( [utils.color_gene(g) for g in inf_glfo['seqs']['v']])) glutils.write_glfo(args.inf_glfo_dir, inf_glfo)
def print_data_pair_results(gl_sets): assert len(gl_sets) == 2 # would need to update ds_1, ds_2 = gl_sets.keys() tmpfo = { ds_1: set(gl_sets[ds_1]) - set(gl_sets[ds_2]), ds_2: set(gl_sets[ds_2]) - set(gl_sets[ds_1]), 'both': set(gl_sets[ds_2]) & set(gl_sets[ds_1]) } for name, genes in tmpfo.items(): print ' %9s %2d: %s' % (name, len(genes), ' '.join( [utils.color_gene(g) for g in genes]))
def make_mutefreq_plot(plotdir, gene_name, positions): """ NOTE shares a lot with make_transition_plot() in bin/plot-hmms.py. """ nuke_colors = {'A' : 'red', 'C' : 'blue', 'G' : 'orange', 'T' : 'green'} fig, ax = plotting.mpl_init() fig.set_size_inches(plotting.plot_ratios[utils.get_region(gene_name)]) ibin = 0 print utils.color_gene(utils.unsanitize_name(gene_name)) legend_colors = set() for info in positions: posname = info['name'] # make label below bin ax.text(-0.5 + ibin, -0.075, simplify_state_name(posname), rotation='vertical', size=8) total = 0.0 alpha = 0.6 for nuke, prob in sorted(info['nuke_freqs'].items(), key=operator.itemgetter(1), reverse=True): color = nuke_colors[nuke] label_to_use = None if color not in legend_colors: label_to_use = nuke legend_colors.add(color) # horizontal line at height total+prob ax.plot([-0.5 + ibin, 0.5 + ibin], [total + prob, total + prob], color=color, alpha=alpha, linewidth=3, label=label_to_use) # vertical line from total to total + prob ax.plot([ibin, ibin], [total + 0.01, total + prob], color=color, alpha=alpha, linewidth=3) # # write [ACGT] at midpoint between total and total+prob # midpoint = 0.5*(prob + 2*total) # ... *redacted* total += prob ibin += 1 ax.get_xaxis().set_visible(False) plotting.mpl_finish(ax, plotdir, gene_name, ybounds=(-0.01, 1.01), xbounds=(-3, len(positions) + 3), leg_loc=(0.95, 0.1), adjust={'left' : 0.1, 'right' : 0.8}, leg_prop={'size' : 8})
def print_gene_choice(self): print ' chose: gene length' for region in utils.regions: print ' %s %-18s %-3d' % ( region, utils.color_gene(self.genes[region], width=18), len(self.original_seqs[region])), if region in self.pre_erosion_codon_positions: print ' (%s: %d)' % ( utils.conserved_codons[self.glfo['locus']][region], self.pre_erosion_codon_positions[region]) else: print ''
def __init__(self, base_indir, outdir, gene_name, glfo, args, debug=False): self.region = utils.get_region(gene_name) self.raw_name = gene_name # i.e. unsanitized self.germline_seqs = glfo['seqs'] # all germline alleles self.germline_seq = self.germline_seqs[self.region][gene_name] # germline sequence for this hmm self.indir = base_indir self.args = args self.debug = debug self.codon_positions = {r : glfo[c + '-positions'] for r, c in utils.conserved_codons[args.locus].items()} # parameters with values that I more or less made up self.precision = '16' # number of digits after the decimal for probabilities self.eps = 1e-6 # NOTE I also have an eps defined in utils, and they should in principle be combined self.n_max_to_interpolate = args.min_observations_to_write self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25} # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths self.erosion_pseudocount_length = 10 # if we're closer to the end of the gene than this, make sure erosion probability isn't zero self.outdir = outdir self.smallest_entry_index = -1 # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there self.insertions = [] if self.region == 'v': self.insertions.append('fv') elif self.region == 'd': self.insertions.append('vd') elif self.region == 'j': self.insertions.append('dj') self.insertions.append('jf') assert len(utils.ambiguous_bases) == 1 and utils.ambiguous_bases[0] == 'N' # maybe need to update some stuff below if this changes if self.debug: print '%s' % utils.color_gene(gene_name) self.n_occurences = utils.read_single_gene_count(self.indir, gene_name, debug=self.debug) # how many times did we observe this gene in data? 
replacement_genes = None if self.n_occurences < self.args.min_observations_to_write: # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us if self.debug: print ' only saw it %d times (wanted %d), so use info from all other genes' % (self.n_occurences, self.args.min_observations_to_write) replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, debug=self.debug) self.erosion_probs = self.read_erosion_info(gene_name, replacement_genes) self.insertion_probs, self.insertion_content_probs = self.read_insertion_info(gene_name, replacement_genes) self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, locus=self.args.locus, approved_genes=replacement_genes) # actual info in <self.mute_obs> isn't actually used a.t.m. self.track = Track('nukes', utils.nukes) self.saniname = utils.sanitize_name(gene_name) self.hmm = HMM(self.saniname, self.track.getdict()) # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name)) # if we really didn't see this gene at all, take pity on it and kick it an eps tmp_mean_freq_hist = Hist(fname=self.indir + '/all-mean-mute-freqs.csv') self.hmm.extras['overall_mute_freq'] = tmp_mean_freq_hist.get_mean()
def __init__(self, base_indir, outdir, gene_name, glfo, args, debug=False): self.region = utils.get_region(gene_name) self.raw_name = gene_name # i.e. unsanitized self.germline_seqs = glfo['seqs'] # all germline alleles self.germline_seq = self.germline_seqs[self.region][gene_name] # germline sequence for this hmm self.indir = base_indir self.args = args self.debug = debug self.codon_positions = {r : glfo[c + '-positions'] for r, c in utils.conserved_codons[args.chain].items()} # parameters with values that I more or less made up self.precision = '16' # number of digits after the decimal for probabilities self.eps = 1e-6 # NOTE I also have an eps defined in utils, and they should in principle be combined self.n_max_to_interpolate = args.min_observations_to_write self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25} # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths self.erosion_pseudocount_length = 10 # if we're closer to the end of the gene than this, make sure erosion probability isn't zero self.outdir = outdir self.smallest_entry_index = -1 # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there self.insertions = [] if self.region == 'v': self.insertions.append('fv') elif self.region == 'd': self.insertions.append('vd') elif self.region == 'j': self.insertions.append('dj') self.insertions.append('jf') assert len(utils.ambiguous_bases) == 1 and utils.ambiguous_bases[0] == 'N' # maybe need to update some stuff below if this changes if self.debug: print '%s' % utils.color_gene(gene_name) self.n_occurences = utils.read_single_gene_count(self.indir, gene_name, debug=self.debug) # how many times did we observe this gene in data? 
replacement_genes = None if self.n_occurences < self.args.min_observations_to_write: # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us if self.debug: print ' only saw it %d times (wanted %d), so use info from all other genes' % (self.n_occurences, self.args.min_observations_to_write) replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, debug=self.debug) self.erosion_probs = self.read_erosion_info(gene_name, replacement_genes) self.insertion_probs, self.insertion_content_probs = self.read_insertion_info(gene_name, replacement_genes) self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, chain=self.args.chain, approved_genes=replacement_genes) # actual info in <self.mute_obs> isn't actually used a.t.m. self.track = Track('nukes', utils.nukes) self.saniname = utils.sanitize_name(gene_name) self.hmm = HMM(self.saniname, self.track.getdict()) # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name)) # if we really didn't see this gene at all, take pity on it and kick it an eps tmp_mean_freq_hist = Hist(fname=self.indir + '/all-mean-mute-freqs.csv') self.hmm.extras['overall_mute_freq'] = tmp_mean_freq_hist.get_mean()
def print_stuff(line): cluster_index = sorted_clusters.index(cluster) naive_cdr3, matureiseq0_cdr3 = utils.subset_sequences(line, iseq=0, restrict_to_region='cdr3') # returns the CDR3 nt sequence for naive, and the first mutated sequence (iseq0); CDR3 = first base of cysteine through last base of tryptophan # mature_cdr3_seqs = [] # trying to translate the consensus cdr3 so I can search these with my seed seqs # for iseq in range(len(line['unique_ids'])): # naive_cdr3_seq, mature_cdr3_seq = utils.subset_sequences(line, iseq=iseq, restrict_to_region='cdr3') # mature_cdr3_seqs.append(mature_cdr3_seq) # mature_cdr3_seqs # translated_cdr3 = mature_cdr3_seqs.translate() cdr3_aa = '%-30s' % Seq(naive_cdr3).translate() # If a cluster contains one of our seed seqs, color this CDR3 red if any('-ig' in s for s in line['unique_ids']): cdr3_aa = utils.color('red', cdr3_aa, width=30) if args.cdr3 in cdr3_aa: # Only print clusters with naive CDR3 that matches our specified --cdr3 argument print 'index genes size n muts SHM rep frac CDR3 FayWuH' print ' mean med len seq' print '%4s %s %s %s %5d %5d %5d %7.3f %8.4f %2d %s %4.2f' % ( cluster_index, utils.color_gene(line['v_gene'], width=15), utils.color_gene(line['d_gene'], width=15), utils.color_gene(line['j_gene'], width=10), len(line['unique_ids']), numpy.mean(line['n_mutations']), numpy.median(line['n_mutations']), numpy.mean(line['mut_freqs']), float(len(cluster)) / n_total, (line['cdr3_length']/3), cdr3_aa, utils.fay_wu_h(line, debug=False), ) # print 'number of mutations per sequence in cluster', sorted(line['n_mutations']) print len(line['naive_seq']), 'length of naive seq' # utils.print_reco_event(utils.synthesize_single_seq_line(line, iseq=0)) # print ascii-art representation of the rearrangement event print 'unique_ids: ', getkey(line['unique_ids']) print print utils.print_reco_event(line)
def read_mute_counts( indir, gene, locus, extra_genes=None, debug=False ): # NOTE I'm adding the <extra_genes> arg in a hackish way because i need this to not crash in one specific instance (running bin/test-germline-inference.py) where the file for <gene> doesn't exist, but I don't remember/understand how this fcn and the following function work well enough to do this more sensibly # NOTE also that this new hack that allows a different gene's counts to be used might break something later on if the genes have different lengths? I have no idea # ---------------------------------------------------------------------------------------- def read_single_file(gtmp): mfname = indir + '/mute-freqs/' + utils.sanitize_name(gtmp) + '.csv' if not os.path.exists(mfname): return None observed_counts = {} with open(mfname, 'r') as mutefile: reader = csv.DictReader(mutefile) for line in reader: pos = int(line['position']) assert pos not in observed_counts observed_counts[pos] = { n: int(line[n + '_obs']) for n in utils.nukes } if debug: print ' read %d per-base mute counts from %s' % ( len(observed_counts), mfname) return observed_counts # ---------------------------------------------------------------------------------------- if extra_genes is not None: # I don't want to fix it cause it'd be kinda hard, and also I don't think it ever happens under normal circumstances -- it's only called with this arg from simulation, in which case you should always have parameters for the gene you're asking for print '%s Reading per-base mutation counts for genes (%s) in addition to the desired one (%s), which doesn\'t really make sense, since the counts will be wrong at the positions at which the genes differ.' % ( utils.color('red', 'warning'), utils.color_genes(extra_genes), utils.color_gene(gene)) print ' This should only happen if you\'re doing something weird, probably running simulation asking for genes for which you don\'t have parameters.' 
print ' If this is the case and you only care that it doesn\'t crash, and not that the mutation model is particularly accurate, this is fine.' if gene == glutils.dummy_d_genes[locus]: return {} if extra_genes is None: approved_genes = [gene] else: assert gene not in extra_genes approved_genes = [gene] + extra_genes for gtmp in approved_genes: observed_counts = read_single_file(gtmp) if observed_counts is not None: # HACK this just uses the first one that's there (in the vast majority of cases it'll just be <gene> -- i think the only way it can be missing is if you hard code a specific gene (e.g. in bin/test-germline-inference.py) and it isn't in the parameter directory you passed break return observed_counts # raw per-{ACGT} counts for each position, summed over genes ("raw" as in not a weighted average over a bunch of genes as in read_mute_freqs_with_weights())
def read_allele_prevalence_freqs(fname, debug=False): # NOTE kinda weird to mash all the regions into one file here (as compared to parametercounter), but it seems to make more sense allele_prevalence_freqs = {r: {} for r in utils.regions} with open(fname) as pfile: reader = csv.DictReader(pfile) for line in reader: allele_prevalence_freqs[utils.get_region(line["gene"])][line["gene"]] = float(line["freq"]) for region in utils.regions: if len(allele_prevalence_freqs[region]) == 0: continue if debug: for gene, freq in allele_prevalence_freqs[region].items(): print "%14.8f %s" % (freq, utils.color_gene(gene)) assert utils.is_normed(allele_prevalence_freqs[region]) return allele_prevalence_freqs
def read_allele_prevalence_freqs(fname, debug=False): # NOTE kinda weird to mash all the regions into one file here (as compared to parametercounter), but it seems to make more sense allele_prevalence_freqs = {r : {} for r in utils.regions} with open(fname) as pfile: reader = csv.DictReader(pfile) for line in reader: allele_prevalence_freqs[utils.get_region(line['gene'])][line['gene']] = float(line['freq']) for region in utils.regions: if len(allele_prevalence_freqs[region]) == 0: continue if debug: for gene, freq in allele_prevalence_freqs[region].items(): print '%14.8f %s' % (freq, utils.color_gene(gene)) assert utils.is_normed(allele_prevalence_freqs[region]) return allele_prevalence_freqs
def print_match(self, region, gene, query_seq, score, glbounds, qrbounds, codon_pos, warnings, skipping=False): out_str_list = [] buff_str = (20 - len(gene)) * ' ' out_str_list.append('%8s%s%s%9s%3s %6.0f ' % (' ', utils.color_gene(gene), '', '', buff_str, score)) out_str_list.append('%4d%4d %s\n' % (glbounds[0], glbounds[1], self.glfo['seqs'][region][gene][glbounds[0]:glbounds[1]])) out_str_list.append('%46s %4d%4d' % ('', qrbounds[0], qrbounds[1])) out_str_list.append(' %s ' % (utils.color_mutants(self.glfo['seqs'][region][gene][glbounds[0]:glbounds[1]], query_seq[qrbounds[0]:qrbounds[1]]))) if region != 'd': out_str_list.append('(%s %d)' % (utils.conserved_codons[region], codon_pos)) if warnings[gene] != '': out_str_list.append('WARNING ' + warnings[gene]) if skipping: out_str_list.append('skipping!') print ''.join(out_str_list)
def choose_some_alleles(region, genes_to_use, allelic_groups, n_alleles_per_gene, debug=False): """ choose a gene (i.e. a primary and sub-version) from <allelic_groups>, and its attendant alleles """ # NOTE also modifies <allelic_groups> if len(allelic_groups[region]) == 0: raise Exception( "ran out of %s alleles (either --n-genes-per-region or --n-alleles-per-gene are probably too big)" % region ) available_versions = None while available_versions is None or len(available_versions) == 0: if available_versions is not None: print " %s couldn't find any versions that have %d alleles, so trying again" % ( utils.color("red", "warning"), n_alleles, ) n_alleles = numpy.random.choice(n_alleles_per_gene[region]) available_versions = [ (pv, subv) for pv in allelic_groups[region] for subv in allelic_groups[region][pv] if len(allelic_groups[region][pv][subv]) >= n_alleles ] ichoice = ( numpy.random.randint(0, len(available_versions) - 1) if len(available_versions) > 1 else 0 ) # numpy.random.choice() can't handle list of tuples (and barfs if you give it only one thing to choose from) primary_version, sub_version = available_versions[ichoice] new_alleles = set( numpy.random.choice(list(allelic_groups[region][primary_version][sub_version]), size=n_alleles, replace=False) ) if debug: print " %8s %5s %s" % ( primary_version, sub_version, " ".join([utils.color_gene(g, width=15) for g in new_alleles]), ) assert len(new_alleles & genes_to_use) == 0 # make sure none of the new alleles are already in <genes_to_use> genes_to_use |= new_alleles # actually add them to the final set # remove stuff we've used from <allelic_groups> del allelic_groups[region][primary_version][ sub_version ] # remove this sub-version (we don't want any more alleles from it) if len(allelic_groups[region][primary_version]) == 0: del allelic_groups[region][primary_version]
def read_fasta_file(seqs, fname, skip_pseudogenes, aligned=False): n_skipped_pseudogenes = 0 seq_to_gene_map = {} for seqfo in utils.read_fastx(fname): # first get gene name if seqfo['name'][:2] != 'IG' and seqfo['name'][:2] != 'TR': # if it's an imgt file, with a bunch of header info (and the accession number first) gene = seqfo['infostrs'][imgt_info_indices.index('gene')] functionality = seqfo['infostrs'][imgt_info_indices.index('functionality')] if functionality not in functionalities: raise Exception('unexpected functionality %s in %s' % (functionality, fname)) if skip_pseudogenes and functionality in pseudogene_funcionalities: n_skipped_pseudogenes += 1 continue else: # plain fasta with just the gene name after the '>' gene = seqfo['name'] utils.split_gene(gene) # just to check if it's a valid gene name if not aligned and utils.get_region(gene) != utils.get_region(os.path.basename(fname)): # if <aligned> is True, file name is expected to be whatever raise Exception('gene %s from %s has unexpected region %s' % (gene, os.path.basename(fname), utils.get_region(gene))) if gene in seqs[utils.get_region(gene)]: raise Exception('gene name %s appears twice in %s' % (gene, fname)) # then the sequence seq = seqfo['seq'] if not aligned: seq = utils.remove_gaps(seq) if 'Y' in seq: print ' replacing Y --> N (%d of \'em) in %s' % (seq.count('Y'), utils.color_gene(gene)) seq = seq.replace('Y', 'N') if len(seq.strip(''.join(utils.expected_characters))) > 0: # return the empty string if it only contains expected characters raise Exception('unexpected character %s in %s (expected %s)' % (seq.strip(''.join(utils.expected_characters)), seq, ' '.join(utils.expected_characters))) if seq not in seq_to_gene_map: seq_to_gene_map[seq] = [] seq_to_gene_map[seq].append(gene) seqs[utils.get_region(gene)][gene] = seq tmpcounts = [len(gl) for gl in seq_to_gene_map.values()] # number of names corresponding to each sequence (should all be ones) if tmpcounts.count(1) != len(tmpcounts): print ' 
mutliple names in %s for the following sequences:' % fname for seq, genelist in seq_to_gene_map.items(): if len(genelist) > 1: print ' %-50s %s' % (' '.join(genelist), seq) raise Exception('please de-duplicate the fasta and re-run.') if n_skipped_pseudogenes > 0: print ' skipped %d %s pseudogenes (leaving %d)' % (n_skipped_pseudogenes, utils.get_region(os.path.basename(fname)), len(seqs[utils.get_region(os.path.basename(fname))]))
def print_cluster(self, iclust, clusterfo, sorted_glcounts, new_seq, true_sorted_glcounts, mean_cluster_mfreqs, has_indels): if iclust > 0: print '' print ' %-3d %4d %6.3f' % (iclust, len(clusterfo['seqfos']), mean_cluster_mfreqs['v'] / mean_cluster_mfreqs['j']), for igene in range(len(sorted_glcounts)): if igene > 0: print '%22s' % '', gene, counts = sorted_glcounts[igene] print ' %-s %4d %2d%s' % (utils.color_gene(gene, width=20), counts, utils.hamming_distance(new_seq, self.glfo['seqs'][self.region][gene], align=True), ' (%s)' % utils.color('blue', 'x') if has_indels else ' '), if igene < len(sorted_glcounts) - 1 or self.reco_info is not None: print '' if self.reco_info is not None: for igene in range(len(true_sorted_glcounts)): gene, counts = true_sorted_glcounts[igene] print '%17s %s %-s %4d %s %2d ' % ('', utils.color('green', '['), utils.color_gene(gene[:23], width=20), counts, utils.color('green', ']'), utils.hamming_distance(new_seq, self.simglfo['seqs'][self.region][gene], align=True)), if igene < len(true_sorted_glcounts) - 1: print ''
def check_allele_prevalence_freqs(outfname, glfo, allele_prevalence_fname, only_region=None): allele_prevalence_freqs = read_allele_prevalence_freqs(allele_prevalence_fname) counts = {r: {g: 0 for g in glfo["seqs"][r]} for r in utils.regions} with open(outfname) as outfile: reader = csv.DictReader(outfile) for line in reader: for region in utils.regions: counts[region][line[region + "_gene"]] += 1 print " checking allele prevalence freqs" for region in utils.regions: if only_region is not None and region != only_region: continue total = sum(counts[region].values()) print " %s obs / tot = freq expected" % region for gene in glfo["seqs"][region]: print " %4d / %-4d = %.3f %.3f %s" % ( counts[region][gene], total, float(counts[region][gene]) / total, allele_prevalence_freqs[region][gene], utils.color_gene(gene, width=15), )
def write_hmms(self, parameter_dir, sw_matches): print 'writing hmms with info from %s' % parameter_dir start = time.time() from hmmwriter import HmmWriter hmm_dir = parameter_dir + '/hmms' utils.prep_dir(hmm_dir, '*.yaml') gene_list = self.args.only_genes if gene_list == None: # if specific genes weren't specified, do the ones for which we have matches gene_list = [] for region in utils.regions: for gene in self.germline_seqs[region]: if sw_matches == None or gene in sw_matches: # shouldn't be None really, but I'm testing something gene_list.append(gene) for gene in gene_list: if self.args.debug: print ' %s' % utils.color_gene(gene) writer = HmmWriter( parameter_dir, hmm_dir, gene, self.args.naivety, self.germline_seqs[utils.get_region(gene)][gene], self.args) writer.write() print ' time to write hmms: %.3f' % (time.time() - start)
def write_hmms(self, parameter_dir, sw_matches): print 'writing hmms with info from %s' % parameter_dir start = time.time() from hmmwriter import HmmWriter hmm_dir = parameter_dir + '/hmms' utils.prep_dir(hmm_dir, '*.yaml') gene_list = self.args.only_genes if gene_list == None: # if specific genes weren't specified, do the ones for which we have matches gene_list = [] for region in utils.regions: for gene in self.germline_seqs[region]: if sw_matches == None or gene in sw_matches: # shouldn't be None really, but I'm testing something gene_list.append(gene) for gene in gene_list: if self.args.debug: print ' %s' % utils.color_gene(gene) writer = HmmWriter(parameter_dir, hmm_dir, gene, self.args.naivety, self.germline_seqs[utils.get_region(gene)][gene], self.args) writer.write() print ' time to write hmms: %.3f' % (time.time()-start)
def __init__(self, args): self.args = args self.germline_seqs = utils.read_germlines(self.args.datadir) perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir, 'imgt') # get sequence info that was passed to imgt self.seqinfo = {} with opener('r')(self.args.simfname) as simfile: reader = csv.DictReader(simfile) iline = 0 for line in reader: if self.args.queries != None and line['unique_id'] not in self.args.queries: continue if len(re.findall('_[FP]', line['j_gene'])) > 0: line['j_gene'] = line['j_gene'].replace(re.findall('_[FP]', line['j_gene'])[0], '') self.seqinfo[line['unique_id']] = line iline += 1 if self.args.n_queries > 0 and iline >= self.args.n_queries: break paragraphs, csv_info = None, None if self.args.infname != None and '.html' in self.args.infname: print 'reading', self.args.infname with opener('r')(self.args.infname) as infile: soup = BeautifulSoup(infile) paragraphs = soup.find_all('pre') summarydir = self.args.indir[ : self.args.indir.rfind('/')] # one directoy up from <indir>, which has the detailed per-sequence files summary_fname = glob.glob(summarydir + '/1_Summary_*.txt') assert len(summary_fname) == 1 summary_fname = summary_fname[0] get_genes_to_skip(summary_fname, self.germline_seqs) n_failed, n_skipped, n_total, n_not_found, n_found = 0, 0, 0, 0, 0 for unique_id in self.seqinfo: if self.args.debug: print unique_id, imgtinfo = [] # print 'true' # utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id]) if self.args.infname != None and '.html' in self.args.infname: for pre in paragraphs: # NOTE this loops over everything an awful lot of times. 
Shouldn't really matter for now, though if unique_id in pre.text: imgtinfo.append(pre.text) else: n_total += 1 assert self.args.infname == None infnames = glob.glob(self.args.indir + '/' + unique_id + '*') assert len(infnames) <= 1 if len(infnames) != 1: if self.args.debug: print ' couldn\'t find it' n_not_found += 1 continue n_found += 1 with opener('r')(infnames[0]) as infile: full_text = infile.read() if len(re.findall('[123]. Alignment for [VDJ]-GENE', full_text)) < 3: failregions = re.findall('No [VDJ]-GENE has been identified', full_text) if self.args.debug and len(failregions) > 0: print ' ', failregions n_failed += 1 continue # loop over the paragraphs I want position = full_text.find(unique_id) # don't need this one for ir in range(4): position = full_text.find(unique_id, position+1) pgraph = full_text[position : full_text.find('\n\n', position+1)] if 'insertion(s) and/or deletion(s) which are not dealt in this release' in pgraph: ir -= 1 continue imgtinfo.append(pgraph) # query seq paragraph if len(imgtinfo) == 0: print '%s no info' % unique_id continue else: if self.args.debug: print '' line = self.parse_query_text(unique_id, imgtinfo) if 'skip_gene' in line: # assert self.args.skip_missing_genes n_skipped += 1 continue try: assert 'failed' not in line joinparser.add_insertions(line, debug=self.args.debug) joinparser.resolve_overlapping_matches(line, debug=False, germlines=self.germline_seqs) except (AssertionError, KeyError): print ' giving up' n_failed += 1 perfplotter.add_partial_fail(self.seqinfo[unique_id], line) # print ' perfplotter: not sure what to do with a fail' continue perfplotter.evaluate(self.seqinfo[unique_id], line) if self.args.debug: utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id], label='true:') utils.print_reco_event(self.germline_seqs, line, label='inferred:') perfplotter.plot() print 'failed: %d / %d = %f' % (n_failed, n_total, float(n_failed) / n_total) print 'skipped: %d / %d = %f' % (n_skipped, n_total, 
float(n_skipped) / n_total) print ' ', for g, n in genes_actually_skipped.items(): print ' %d %s' % (n, utils.color_gene(g)) print '' if n_not_found > 0: print ' not found: %d / %d = %f' % (n_not_found, n_not_found + n_found, n_not_found / float(n_not_found + n_found))
def skip_gene(gene): print ' %s in list of genes to skip' % utils.color_gene(gene) if gene not in genes_actually_skipped: genes_actually_skipped[gene] = 0 genes_actually_skipped[gene] += 1 line['skip_gene'] = True
def get_positions_to_fit(self, gene, gene_results, debug=False):
    """
    Work out which positions in <gene> have enough observations to be worth fitting.

    Resets <self.fitted_positions[gene]> to an empty set, and gets the xy values for every position in
    <self.mfreqer.counts[gene]>. A position is kept if it has either more than <self.n_muted_min> mutations
    or more than <self.n_total_min> total observations.

    Returns (positions to fit, xy values for all positions), also storing the xy values of the kept positions
    in <self.plotvals[gene]>. If too few positions are kept, adds <gene> to <gene_results['not_enough_obs_to_fit']>
    and returns (None, None).
    """
    self.fitted_positions[gene] = set()
    positions = sorted(self.mfreqer.counts[gene].keys())
    xyvals = {}
    for pos in positions:
        xyvals[pos] = self.get_allele_finding_xyvals(gene, pos)

    def has_enough_obs(pos):  # i.e. ignore positions with neither enough mutations nor enough total observations
        return sum(xyvals[pos]['obs']) > self.n_muted_min or sum(xyvals[pos]['total']) > self.n_total_min

    positions_to_try_to_fit = [pos for pos in positions if has_enough_obs(pos)]

    n_positions_required = self.n_max_snps - 1 + self.min_non_candidate_positions_to_fit
    if len(positions_to_try_to_fit) < n_positions_required:
        gene_results['not_enough_obs_to_fit'].add(gene)
        if debug:
            print(' not enough positions with enough observations to fit %s' % utils.color_gene(gene))
        return None, None

    n_ignored = len(positions) - len(positions_to_try_to_fit)
    if debug and n_ignored > 0:
        print(' skipping %d / %d positions (with fewer than %d mutations and %d observations)' % (n_ignored, len(positions), self.n_muted_min, self.n_total_min))

    self.plotvals[gene] = dict((pos, xyvals[pos]) for pos in positions_to_try_to_fit)
    return positions_to_try_to_fit, xyvals
def parse_query_text(self, unique_id, query_info): if len(query_info) == 0: # one for the query sequence, then one for v, d, and j print 'no info for',unique_id return {} elif len(query_info) < 4: regions_ok = '' for info in query_info: for region in utils.regions: if 'IGH' + region.upper() in info: regions_ok += region for region in utils.regions: if region not in regions_ok: print ' ERROR no %s matches' % region return {} assert False # shouldn't get here elif len(query_info) != 4: print 'info for', unique_id, 'all messed up' for info in query_info: print info sys.exit() full_qr_seq = query_info[0].replace('>', '').replace(unique_id, '') # strip off the unique id full_qr_seq = ''.join(full_qr_seq.split()).upper() # strip off white space and uppercase it assert full_qr_seq == self.seqinfo[unique_id]['seq'] line = {} line['unique_id'] = unique_id line['seq'] = full_qr_seq for ireg in range(len(utils.regions)): region = utils.regions[ireg] info = query_info[ireg + 1].splitlines() while unique_id not in info[0]: # remove the line marking cdr3 and framework regions info.pop(0) if len(info) <= 1: print info assert len(info) > 1 assert len(info[0].split()) == 2 qr_seq = info[0].split()[1].upper() # this line should be '<unique_id> .............<query_seq>' true_gene = self.seqinfo[unique_id][region + '_gene'] imatch = 1 # which match to take match_name = str(info[imatch].split()[2]) while match_name in just_always_friggin_skip and len(info) > imatch+1 and len(info[imatch+1].split()) > 2: imatch += 1 old_one = match_name match_name = str(info[imatch].split()[2]) if self.args.debug: print ' %s: taking next match: %s --> %s)' % (unique_id, utils.color_gene(old_one), utils.color_gene(match_name)) infer_gene = match_name for gset in equivalent_genes: if match_name in gset and true_gene in gset and match_name != true_gene: # if the true gene and the inferred gene are in the same equivalence set, treat it as correct, i.e. 
just pretend it inferred the right name if self.args.debug: print ' %s: replacing name %s with true name %s' % (unique_id, match_name, true_gene) infer_gene = true_gene # ---------------------------------------------------------------------------------------- # skipping bullshit def skip_gene(gene): print ' %s in list of genes to skip' % utils.color_gene(gene) if gene not in genes_actually_skipped: genes_actually_skipped[gene] = 0 genes_actually_skipped[gene] += 1 line['skip_gene'] = True if infer_gene not in self.germline_seqs[region]: print ' couldn\'t find %s in germlines (skipping)' % infer_gene skip_gene(infer_gene) return line if infer_gene in just_always_friggin_skip: skip_gene(infer_gene) return line if true_gene in just_always_friggin_skip: skip_gene(true) return line if not self.args.dont_skip_or15_genes and '/OR1' in true_gene: skip_gene(true_gene) return line if self.args.skip_missing_genes: if infer_gene in genes_to_skip: skip_gene(infer_gene) return line if true_gene in genes_to_skip: skip_gene(true_gene) return line gl_seq = info[imatch].split()[4].upper() if qr_seq.replace('.', '') not in self.seqinfo[unique_id]['seq']: # if self.args.debug: print ' qr_seq not found in seqinfo' line['failed'] = True return line if self.args.debug: if utils.are_alleles(infer_gene, true_gene): regionstr = utils.color('bold', utils.color('blue', region)) truestr = '(originally %s)' % match_name else: regionstr = utils.color('bold', utils.color('red', region)) truestr = '(true: %s)' % utils.color_gene(true_gene).replace(region, '') print ' %s %s %s' % (regionstr, utils.color_gene(infer_gene).replace(region, ''), truestr) print ' gl', gl_seq print ' ', qr_seq # replace the dots (gaps) in the gl match new_qr_seq, new_gl_seq = [], [] for inuke in range(min(len(qr_seq), len(gl_seq))): if gl_seq[inuke] == '.': pass else: new_qr_seq.append(qr_seq[inuke]) # this should only be out of range if the v match extends through the whole query sequence, i.e. 
friggin never new_gl_seq.append(gl_seq[inuke]) for inuke in range(len(gl_seq), len(qr_seq)): new_qr_seq.append(qr_seq[inuke]) for inuke in range(len(qr_seq), len(gl_seq)): new_gl_seq.append(gl_seq[inuke]) qr_seq = ''.join(new_qr_seq) gl_seq = ''.join(new_gl_seq) # work out the erosions qr_ldots = qr_seq.rfind('.') + 1 # first strip off any dots on the left of query seq qr_seq = qr_seq[qr_ldots : ] gl_seq = gl_seq[qr_ldots : ] gl_ldots = gl_seq.rfind('.') + 1 # then remove dots on the left of the germline seq qr_seq = qr_seq[gl_ldots : ] gl_seq = gl_seq[gl_ldots : ] del_5p = qr_ldots + gl_ldots jf_insertion = '' if region == 'j': jf_insertion = qr_seq[len(gl_seq) : ] qr_seq = qr_seq[ : len(gl_seq)] # then strip the right-hand portion of the query sequence that isn't aligned to the germline del_3p = len(gl_seq) - len(qr_seq) # then do the same for the germline overhanging on the right of the query gl_seq = gl_seq[ : len(qr_seq)] assert len(gl_seq) == len(qr_seq) new_gl_seq = [] for inuke in range(len(gl_seq)): # replace dashes (matched bases) assert gl_seq[inuke] != '.' # hoping there's no gaps in here if gl_seq[inuke] == '-': new_gl_seq.append(qr_seq[inuke]) else: new_gl_seq.append(gl_seq[inuke]) gl_seq = ''.join(new_gl_seq) if self.germline_seqs[region][infer_gene].find(gl_seq) != del_5p: # why the *@*!! can't they make this consistent? 
if self.germline_seqs[region][infer_gene].find(gl_seq) < 0: print 'whooooaa' print self.germline_seqs[region][infer_gene] print gl_seq line['failed'] = True return line del_5p += self.germline_seqs[region][infer_gene].find(gl_seq) try: assert del_5p + len(gl_seq) + del_3p + len(jf_insertion) == len(self.germline_seqs[region][infer_gene]) except: print ' ERROR lengths failed for %s' % unique_id # print del_5p, len(gl_seq), del_3p, del_5p + len(gl_seq) + del_3p , len(self.germline_seqs[region][infer_gene]) # print gl_seq # print self.germline_seqs[region][infer_gene] line['failed'] = True return line # assert False if self.args.debug: utils.color_mutants(gl_seq, qr_seq, ref_label='gl ', extra_str=' ', print_result=True, post_str=' del: %d %d' % (del_5p, del_3p)) # try: # infer_gene = joinparser.figure_out_which_damn_gene(self.germline_seqs, infer_gene, gl_seq, debug=self.args.debug) # except: # print 'ERROR couldn\'t figure out the gene for %s' % infer_gene # return {} line[region + '_gene'] = infer_gene line[region + '_qr_seq'] = qr_seq line[region + '_gl_seq'] = gl_seq line[region + '_5p_del'] = del_5p line[region + '_3p_del'] = del_3p if region == 'j': line['jf_insertion'] = jf_insertion return line
linefo = [p.replace('>', '').strip() for p in line.split('|')] gene = None for piece in linefo: if piece[:2] == 'IG': gene = piece if gene is None: raise Exception('couldn\'t fine gene in %s' % line) if len(linefo) > 1: functionality = linefo[glutils.imgt_info_indices.index('functionality')] if functionality not in glutils.functionalities: raise Exception('unexpected functionality %s' % functionality) if functionality == 'P': n_skipped_pseudogenes += 1 continue genes[fname].add(gene) if n_skipped_pseudogenes > 0: print ' skipped %d pseudogenes' % n_skipped_pseudogenes readfile(args.file1) readfile(args.file2) print 'file1: %d' % len(genes[args.file1]) print 'file2: %d' % len(genes[args.file2]) print 'both: %d' % len(genes[args.file1] & genes[args.file2]) only_file1 = genes[args.file1] - genes[args.file2] print 'only file1: %d (%s)' % (len(only_file1), ' '.join([utils.color_gene(g) for g in only_file1])) only_file2 = genes[args.file2] - genes[args.file1] print 'only file2: %d (%s)' % (len(only_file2), ' '.join([utils.color_gene(g) for g in only_file2]))
def get_genes_to_skip(fname, germlines, method='imgt', debug=False): with opener('r')(fname) as infile: if method == 'imgt': reader = csv.DictReader(infile, delimiter='\t') imgt_genes = set() # genes that imgt spit out at least once iline = 0 no_matches = {region:0 for region in utils.regions} for line in reader: iline += 1 for region in utils.regions: matchstr = line[region.upper() + '-GENE and allele'] if len(matchstr) == 0: no_matches[region] += 1 # print ' no %s match' % region continue try: gene = matchstr.split()[1] except IndexError: raise Exception('match problem in %s: %s' % (region, matchstr)) # print '%12s %s' % (gene in germlines[region], utils.color_gene(gene)) imgt_genes.add(gene) # if len(imgt_genes) > 10: # # for g in imgt_genes: # # print utils.color_gene(g), # break print 'read %d lines, no match (v/d/j): %d/%d/%d' % tuple([iline, ] + [no_matches[region] for region in utils.regions]) elif method == 'igblast': filestr = infile.read() imgt_genes = set(re.findall('IGH[VDJ][^*]*\*[0-9][0-9]', filestr)) # ok, igblast genes, but it's not so bad to leave the variable name like that... 
else: raise Exception('bad method %s' % method) print '%s genes: ' % method, if debug: print '' for g in sorted(imgt_genes): print ' ', utils.color_gene(g) else: print len(imgt_genes) print '\nin %s output, not in simulation: ' % method for gene in sorted(imgt_genes): if gene not in germlines[utils.get_region(gene)]: if debug: print ' ', utils.color_gene(gene) genes_to_skip.add(gene) if not debug: print len(genes_to_skip) print '\nin simulation, not in %s output: ' % method for region in utils.regions: for gene in sorted(germlines[region]): if gene not in imgt_genes: if debug: print ' ', utils.color_gene(gene) genes_to_skip.add(gene) if not debug: print len(genes_to_skip) simulation_genes = set(germlines['v']) | set(germlines['d']) | set(germlines['j']) genes_to_use = imgt_genes & simulation_genes # print '\ngenes to use: %s' % len(genes_to_use) # if debug: # for g in sorted(genes_to_use): # print ' ', utils.color_gene(g) if len(genes_to_use & genes_to_skip) > 0: raise Exception('non zero intersection: %d' % len(genes_to_use & genes_to_skip))
def read_fasta_file(seqs, fname, skip_pseudogenes, aligned=False): n_skipped_pseudogenes = 0 seq_to_gene_map = {} for seq_record in SeqIO.parse(fname, "fasta"): linefo = [p.strip() for p in seq_record.description.split("|")] # first get gene name if linefo[0][:2] != "IG": # if it's an imgt file, with a bunch of header info (and the accession number first) gene = linefo[imgt_info_indices.index("gene")] functionality = linefo[imgt_info_indices.index("functionality")] if functionality not in functionalities: raise Exception("unexpected functionality %s in %s" % (functionality, fname)) if skip_pseudogenes and functionality in pseudogene_funcionalities: n_skipped_pseudogenes += 1 continue else: # plain fasta with just the gene name after the '>' gene = linefo[0] utils.split_gene(gene) # just to check if it's a valid gene name if not aligned and utils.get_region(gene) != utils.get_region( os.path.basename(fname) ): # if <aligned> is True, file name is expected to be whatever raise Exception( "gene %s from %s has unexpected region %s" % (gene, os.path.basename(fname), utils.get_region(gene)) ) if gene in seqs[utils.get_region(gene)]: raise Exception("gene name %s appears twice in %s" % (gene, fname)) # then the sequence seq = str(seq_record.seq).upper() if not aligned: seq = utils.remove_gaps(seq) if "Y" in seq: print " replacing Y --> N (%d of 'em) in %s" % (seq.count("Y"), utils.color_gene(gene)) seq = seq.replace("Y", "N") if ( len(seq.strip("".join(utils.expected_characters))) > 0 ): # return the empty string if it only contains expected characters raise Exception( "unexpected character %s in %s (expected %s)" % (seq.strip("".join(utils.expected_characters)), seq, " ".join(utils.expected_characters)) ) if seq not in seq_to_gene_map: seq_to_gene_map[seq] = [] seq_to_gene_map[seq].append(gene) seqs[utils.get_region(gene)][gene] = seq tmpcounts = [ len(gl) for gl in seq_to_gene_map.values() ] # number of names corresponding to each sequence (should all be ones) if 
tmpcounts.count(1) != len(tmpcounts): print " mutliple names in %s for the following sequences:" % fname for seq, genelist in seq_to_gene_map.items(): if len(genelist) > 1: print " %-50s %s" % (" ".join(genelist), seq) raise Exception("please de-duplicate the fasta and re-run.") if n_skipped_pseudogenes > 0: print " skipped %d %s pseudogenes (leaving %d)" % ( n_skipped_pseudogenes, utils.get_region(os.path.basename(fname)), len(seqs[utils.get_region(os.path.basename(fname))]), )
def get_new_alignments(glfo, region, debug=False): aligned_seqs = {} genes_with_alignments = set( aligned_seqs ) # used to already have some sequences aligned, and may as well keep around the code to handle that case genes_without_alignments = set(glfo["seqs"][region]) - set(aligned_seqs) if len(genes_without_alignments) == 0: if debug: print " no missing %s alignments" % region return if debug: print " missing alignments for %d %s genes" % (len(genes_without_alignments), region) if len(aligned_seqs) > 0: print " existing alignments:" for g, seq in aligned_seqs.items(): print " %s %s" % (seq, utils.color_gene(g)) # find the longest aligned sequence, so we can pad everybody else with dots on the right out to that length biggest_length = None for gene in genes_with_alignments: if biggest_length is None or len(aligned_seqs[gene]) > biggest_length: biggest_length = len(aligned_seqs[gene]) tmpdir = tempfile.mkdtemp() already_aligned_fname = tmpdir + "/already-aligned.fasta" not_aligned_fname = tmpdir + "/not-aligned.fasta" msa_table_fname = tmpdir + "/msa-table.txt" aligned_and_not_fnamefname = tmpdir + "/aligned-and-not.fasta" mafft_outfname = tmpdir + "/everybody-aligned.fasta" with open(already_aligned_fname, "w") as tmpfile, open(msa_table_fname, "w") as msafile: mysterious_index = 1 msa_str = "" for gene in genes_with_alignments: dotstr = "." 
* (biggest_length - len(aligned_seqs[gene])) alistr = aligned_seqs[gene] + dotstr tmpfile.write(">%s\n%s\n" % (gene, alistr.replace(".", "-"))) msa_str += " " + str(mysterious_index) mysterious_index += 1 msafile.write("%s # %s\n" % (msa_str, already_aligned_fname)) with open(not_aligned_fname, "w") as tmpfile: for gene in genes_without_alignments: tmpfile.write(">%s\n%s\n" % (gene, glfo["seqs"][region][gene])) check_call("cat " + already_aligned_fname + " " + not_aligned_fname + " >" + aligned_and_not_fnamefname, shell=True) # actually run mafft cmd = ( "mafft --merge " + msa_table_fname + " " + aligned_and_not_fnamefname + " >" + mafft_outfname ) # options= # "--localpair --maxiterate 1000" if debug: print " RUN %s" % cmd proc = Popen(cmd, shell=True, stderr=PIPE) _, err = proc.communicate() # debug info goes to err if debug and False: # aw, screw it, I don't even know what any of mafft's output means # deal with debug info (for err -- out gets redirected to a file) err = err.replace("\r", "\n") printstrs = [] for errstr in err.split("\n"): # remove the stupid progress bar things matches = re.findall("[0-9][0-9]* / [0-9][0-9]*", errstr) if len(matches) == 1 and errstr.strip() == matches[0]: continue if len(errstr) == 0: continue printstrs.append(errstr) print " " + "\n ".join(printstrs) # deal with fasta output for seq_record in SeqIO.parse(mafft_outfname, "fasta"): gene = seq_record.name.split("|")[0] seq = str(seq_record.seq).upper() if ( gene not in glfo["seqs"][region] ): # only really possible if there's a bug in the preceding fifty lines, but oh well, you can't be too careful raise Exception("unexpected gene %s in mafft output" % gene) aligned_seqs[gene] = seq # overwrite the old alignment with the new one if debug and False: # too damn verbose with all the v genes print " new alignments:" for g, seq in aligned_seqs.items(): print " %s %s %s" % (seq, utils.color_gene(g), "<--- new" if g in genes_without_alignments else "") os.remove(already_aligned_fname) 
os.remove(not_aligned_fname) os.remove(msa_table_fname) os.remove(aligned_and_not_fnamefname) os.remove(mafft_outfname) os.rmdir(tmpdir) return aligned_seqs
def remove_genes(glfo, genes, debug=False): """ remove <genes> from <glfo> """ if debug: print " removing %s from glfo" % " ".join([utils.color_gene(g) for g in genes]) for gene in genes: remove_gene(glfo, gene)