def read_germline_seqs(gldir, locus, skip_pseudogenes):
    seqs = {r : OrderedDict() for r in utils.regions}
    for fname in glfo_fasta_fnames(locus):
        read_fasta_file(seqs, gldir + '/' + locus + '/' + fname, skip_pseudogenes)
    if not utils.has_d_gene(locus):  # choose a sequence for the dummy d
        seqs['d'][dummy_d_genes[locus]] = 'A'  # this (arbitrary) choice is also made in packages/ham/src/bcrutils.cc
    return seqs
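# ----------------------------------------------------------------------------------------
# Usage sketch (illustrative only, not from the original source): read a germline set for a
# light-chain locus, which gets the single-base dummy d gene added above. The directory path
# is an assumption about where a partis-style germline set might live.
def _example_read_germline_seqs():
    gldir = 'data/germlines/human'  # assumed layout: <gldir>/<locus>/<locus><region>.fasta
    seqs = read_germline_seqs(gldir, 'igk', skip_pseudogenes=True)
    for region in seqs:
        print '  %s: %d genes' % (region, len(seqs[region]))  # igk's 'd' region holds only the dummy gene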
def getvalstr(gene, val):
    if gene is None or (utils.get_region(gene) == 'd' and not utils.has_d_gene(utils.get_locus(gene))):
        return '%s %5.2s %s %-16s%s' % (cstr, ' - ', cstr, ' - ', 4 * ' ' if latex else '')
    else:
        if latex:
            gstr = utils.shorten_gene_name(gene, use_one_based_indexing=True, n_max_mutstrs=5)
            if emph_genes is not None and gene in emph_genes:
                gstr = '\\color{red}{\\textbf{%s}}' % gstr
        else:
            gstr = utils.color_gene(gene, width=18)
        return '%s %s%5.2f%s %s %-20s' % (cstr, estr, 100 * val, estr, cstr, gstr)
def try_scratch_erode_insert(self, tmpline, debug=False):
    utils.remove_all_implicit_info(tmpline)
    for erosion in utils.real_erosions:  # includes various contortions to avoid eroding the entire gene
        region = erosion[0]
        gene_length = len(self.glfo['seqs'][region][tmpline[region + '_gene']])
        if region == 'd' and not utils.has_d_gene(self.args.locus):  # dummy d genes: always erode the whole thing from the left
            assert gene_length == 1 and tmpline['d_gene'] == glutils.dummy_d_genes[self.args.locus]
            tmpline[erosion + '_del'] = 1 if '5p' in erosion else 0
        else:
            max_erosion = max(0, gene_length/2 - 2)  # heuristic
            if region in utils.conserved_codons[self.args.locus]:  # make sure not to erode a conserved codon
                codon_pos = utils.cdn_pos(self.glfo, region, tmpline[region + '_gene'])
                if '3p' in erosion:
                    n_bases_to_codon = gene_length - codon_pos - 3
                elif '5p' in erosion:
                    n_bases_to_codon = codon_pos
                max_erosion = min(max_erosion, n_bases_to_codon)
            tmpline[erosion + '_del'] = min(max_erosion, numpy.random.geometric(1. / utils.scratch_mean_erosion_lengths[erosion]) - 1)
    for bound in utils.boundaries:
        mean_length = utils.scratch_mean_insertion_lengths[self.args.locus][bound]
        length = 0 if mean_length == 0 else numpy.random.geometric(1. / mean_length) - 1
        probs = [self.insertion_content_probs[bound][n] for n in utils.nukes]
        tmpline[bound + '_insertion'] = ''.join(numpy.random.choice(utils.nukes, size=length, p=probs))

    if debug:
        print '    erosions:   %s' % ('  '.join([('%s %d' % (e, tmpline[e + '_del'])) for e in utils.real_erosions]))
        print '    insertions: %s' % ('  '.join([('%s %s' % (b, tmpline[b + '_insertion'])) for b in utils.boundaries]))

    # have to add some things by hand so utils.add_implicit_info() doesn't barf (this duplicates code later on in recombinator)
    gl_seqs = {r : self.glfo['seqs'][r][tmpline[r + '_gene']] for r in utils.regions}
    for erosion in utils.real_erosions:
        region = erosion[0]
        e_length = tmpline[erosion + '_del']
        if '5p' in erosion:
            gl_seqs[region] = gl_seqs[region][e_length:]
        elif '3p' in erosion:
            gl_seqs[region] = gl_seqs[region][:len(gl_seqs[region]) - e_length]
    tmpline['seqs'] = [gl_seqs['v'] + tmpline['vd_insertion'] + gl_seqs['d'] + tmpline['dj_insertion'] + gl_seqs['j'], ]
    tmpline['unique_ids'] = [None]  # this is kind of hackey, but some things in the implicit info adder use it to get the number of sequences
    tmpline['input_seqs'] = copy.deepcopy(tmpline['seqs'])  # NOTE has to be updated _immediately_ so seqs and input_seqs don't get out of sync
    tmpline['indelfos'] = [indelutils.get_empty_indel(), ]
    utils.add_implicit_info(self.glfo, tmpline)
    assert len(tmpline['in_frames']) == 1
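# ----------------------------------------------------------------------------------------
# Standalone sketch (not from the original source) of the length sampling used above: both
# erosion and insertion lengths are drawn as numpy.random.geometric(1. / mean) - 1, i.e. a
# geometric variable shifted so that zero-length results are possible, with mean <mean> - 1.
# The mean length used here (4.) is made up, not one of partis' configured scratch means.
def _example_scratch_length_sampling(mean_length=4., n_samples=10000):
    import numpy
    lengths = [0 if mean_length == 0 else numpy.random.geometric(1. / mean_length) - 1 for _ in range(n_samples)]
    print '  empirical mean %.2f (expect about %.2f)' % (sum(lengths) / float(len(lengths)), mean_length - 1)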
def print_seq_in_reco_event(original_line, iseq, extra_str='', label='', one_line=False, seed_uid=None, check_line_integrity=False):
    """ Print ascii summary of recombination event and mutation. If <one_line>, then skip the germline lines, and only print the final_seq line. """
    line = original_line
    if check_line_integrity:  # it's very important not to modify <line> -- this lets you verify that you aren't
        line = copy.deepcopy(original_line)  # copy that we can modify without changing <line>

    delstrs = {d : '.' * line[d + '_del'] for d in utils.all_erosions}  # NOTE len(delstrs[<del>]) is not in general the same as len(line[<del>_del])
    if len(delstrs['v_5p']) > 50:  # don't print a million dots if left-side v deletion is really big
        delstrs['v_5p'] = '.%d.' % len(delstrs['v_5p'])

    # if there isn't enough space for dots in the vj line, we add some dashes to everybody so things fit (rare in heavy chain rearrangements, but pretty common in light chain)
    d_plus_inserts_length = len(line['vd_insertion'] + line['d_gl_seq'] + line['dj_insertion'])
    if line['v_3p_del'] + line['j_5p_del'] > d_plus_inserts_length:  # if dots for v and j interior deletions will be longer than <d_plus_inserts_length>
        delstrs['v_3p'] = '.%d.' % line['v_3p_del']
        delstrs['j_5p'] = '.%d.' % line['j_5p_del']
        gapstr = '-' * (len(delstrs['v_3p'] + delstrs['j_5p']) - d_plus_inserts_length)
        gap_insert_point = len(line['fv_insertion'] + delstrs['v_5p'] + line['v_gl_seq'])  # it doesn't really matter exactly where we put the blue dashes, as long as it's the same place in all four lines, but this is a good spot
        extra_space_because_of_fixed_nospace = max(0, d_plus_inserts_length - len(delstrs['v_3p'] + delstrs['j_5p']))  # if shortening the <delstrs> already over-compensated for the lack of space (i.e., if the number of dashes necessary is zero), then we need to add some dots to the vj line below
    else:
        gapstr = ''
        gap_insert_point = None
        extra_space_because_of_fixed_nospace = 0

    eroded_seqs_dots = {r : delstrs[r + '_5p'] + line[r + '_gl_seq'] + delstrs[r + '_3p'] for r in utils.regions}

    # build the three germline lines
    insert_line = ' ' * (len(line['fv_insertion']) + line['lengths']['v'] + len(delstrs['v_5p'])) \
                  + line['vd_insertion'] + ' ' * line['lengths']['d'] + line['dj_insertion'] \
                  + ' ' * (line['lengths']['j'] + line['j_3p_del'] + len(line['jf_insertion']))
    germline_d_start = len(line['fv_insertion']) + line['lengths']['v'] + len(line['vd_insertion']) - line['d_5p_del']
    germline_d_end = germline_d_start + line['d_5p_del'] + line['lengths']['d'] + line['d_3p_del']
    d_line = ' ' * (germline_d_start + len(delstrs['v_5p'])) \
             + eroded_seqs_dots['d'] \
             + ' ' * (len(line['j_gl_seq']) + len(line['dj_insertion']) - line['d_3p_del'] + line['j_3p_del'] + len(line['jf_insertion']))
    germline_v_end = len(line['fv_insertion']) + len(line['v_gl_seq']) + line['v_3p_del'] - 1  # position in the query sequence at which we find the last base of the v match. NOTE we subtract off the v_5p_del because we're *not* adding dots for that deletion (it's just too long)
    germline_j_start = germline_d_end + 1 - line['d_3p_del'] + len(line['dj_insertion']) - line['j_5p_del']
    vj_line = ' ' * len(line['fv_insertion']) + eroded_seqs_dots['v'] + '.' * extra_space_because_of_fixed_nospace \
              + ' ' * (germline_j_start - germline_v_end - 2) + eroded_seqs_dots['j'] + ' ' * len(line['jf_insertion'])

    # and the query line
    qrseq_line = ' ' * len(delstrs['v_5p']) + line['seqs'][iseq] + ' ' * line['j_3p_del']

    outstrs = [insert_line, d_line, vj_line, qrseq_line]
    check_outsr_lengths(line, outstrs, fix=True)  # I think the only way they can be different is if the d right side erosion is so long that it hangs over the right side of the j

    if gap_insert_point is not None:
        for istr in [0, 1, 3]:  # everybody except the vj line, which already has the modified interior delstrs above
            outstrs[istr] = outstrs[istr][:gap_insert_point] + gapstr + outstrs[istr][gap_insert_point:]

    check_outsr_lengths(line, outstrs, fix=True)

    colors = [[[] for _ in range(len(ostr))] for ostr in outstrs]
    if indelutils.has_indels(line['indelfos'][iseq]):
        # outstrs, colors = old_indel_shenanigans(line, iseq, outstrs, colors)
        outstrs, colors = indel_shenanigans(line, iseq, outstrs, colors)
    outstrs = add_colors(outstrs, colors, line)

    suffixes = ['insert%s\n' % ('s' if utils.has_d_gene(utils.get_locus(line['v_gene'])) else ''),
                '%s\n' % (utils.color_gene(line['d_gene'])),
                '%s %s\n' % (utils.color_gene(line['v_gene']), utils.color_gene(line['j_gene'])),
                '%s %4.2f mut %s\n' % (get_uid_str(line, iseq, seed_uid), line['mut_freqs'][iseq], utils.color('red', utils.is_functional_dbg_str(line, iseq)))]
    outstrs = ['%s%s %s' % (extra_str, ostr, suf) for ostr, suf in zip(outstrs, suffixes)]

    if label != '':  # this doesn't really work if the edge of the removed string is the middle of a color code... but oh well, it doesn't really happen any more since I shortened the kbound label from waterer.py
        offset = max(0, len(extra_str) - 2)  # skootch <label> this many positions leftward into <extra_str>
        removed_str = outstrs[0][offset : offset + utils.len_excluding_colors(label)]
        outstrs[0] = outstrs[0][:offset] + label + outstrs[0][utils.len_excluding_colors(label) + offset:]  # NOTE this *replaces* the bases in <extra_str> with <label>, which is only fine if they're spaces
        if removed_str.strip() != '':
            print '%s%s (covered by label \'%s\')' % (' ' * offset, utils.color('red', removed_str), label)

    if one_line:
        outstrs = outstrs[-1:]  # remove all except the query seq line
    elif not utils.has_d_gene(utils.get_locus(line['v_gene'])):
        outstrs.pop(1)  # remove the d germline line

    print ''.join(outstrs),

    if check_line_integrity:
        if set(line.keys()) != set(original_line.keys()):
            raise Exception('ack 1')
        for k in line:
            if line[k] != original_line[k]:
                print 'key %s differs:\n  %s\n  %s ' % (k, line[k], original_line[k])
                raise Exception('')
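# ----------------------------------------------------------------------------------------
# Toy illustration (not real partis output) of the gap-insertion step above: splicing the
# same gap string into each output line at the same index keeps the lines' columns aligned,
# which is all the ascii alignment needs.
def _example_gap_insertion():
    toy_lines = ['AC    GT', '  GGGG  ', 'ACGGGGGT']
    gapstr, gap_insert_point = '---', 2
    for tline in toy_lines:
        print '  ' + tline[:gap_insert_point] + gapstr + tline[gap_insert_point:]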
import sys
import os
import random
import re
import numpy
import glob
from collections import OrderedDict
import csv
from subprocess import check_call, Popen, PIPE

import utils

# ----------------------------------------------------------------------------------------
glfo_dir = 'germline-sets'  # always put germline info into a subdir with this name
dummy_d_genes = {l : l.upper() + 'Dx-x*x' if not utils.has_d_gene(l) else None for l in utils.loci}  # e.g. IGKDx-x*x for igk, None for igh

# single-locus file names
extra_fname = 'extras.csv'
def glfo_fasta_fnames(locus):
    return [locus + r + '.fasta' for r in utils.getregions(locus)]
def glfo_fnames(locus):
    return [extra_fname, ] + glfo_fasta_fnames(locus)

csv_headers = ['gene', 'cyst_position', 'tryp_position', 'phen_position', 'aligned_seq']
functionalities = [(sep[0] + f + sep[1]).strip() for f in ['F', 'ORF', 'P'] for sep in [' ', '()', '[]']]  # not actually sure what the parentheses and brackets mean
pseudogene_funcionalities = ['P', '[P]', '(P)']
duplicate_names = {
    'v' : [
def clean_pair_info(cpaths, antn_lists, n_max_clusters=3, debug=False):
    # ----------------------------------------------------------------------------------------
    def check_droplet_id_groups(tdbg=False):
        # check against the droplet id method (we could just do it this way, but it would only work for 10x, and only until they change their naming convention)
        pgroup_strs = set(':'.join(sorted(pg)) for pg in pid_groups)
        all_uids = list(set([su for l in cpaths for c in cpaths[l].best() for u in c for su in [u] + utils.per_seq_val(all_antns[u], 'paired-uids', u)]))
        n_not_found = 0
        for dropid, drop_queries in itertools.groupby(sorted(all_uids, key=utils.get_droplet_id), key=utils.get_droplet_id):
            dqlist = list(drop_queries)
            found = ':'.join(sorted(dqlist)) in pgroup_strs
            if not found:
                overlaps = [g for g in pgroup_strs if dropid in g]
                overlaps = utils.get_single_entry(overlaps)
                n_not_found += 1
            if tdbg or not found:
                print '  %25s %s  %s  %s' % (utils.color('green', '-') if found else utils.color('red', 'x'), dropid, ' '.join(sorted(utils.get_contig_id(q) for q in dqlist)), utils.color('red', ' '.join(sorted(utils.get_contig_id(q) for q in overlaps.split(':'))) if not found else ''))
        if n_not_found > 0:
            print '  %s droplet id group check failed for %d groups' % (utils.color('red', 'error'), n_not_found)
    # ----------------------------------------------------------------------------------------
    def getloc(uid):
        if uid not in all_antns:
            return '?'
        return utils.per_seq_val(all_antns[uid], 'loci', uid)
    # ----------------------------------------------------------------------------------------
    def gval(uid, key):  # get per-seq val for <uid>
        if uid not in all_antns:
            return None
        return utils.per_seq_val(all_antns[uid], key, uid)
    # ----------------------------------------------------------------------------------------
    def lgstr(lgroup, sort=True):
        return ' '.join(utils.locstr(l) for l in (sorted if sort else utils.pass_fcn)([getloc(u) for u in lgroup]))
    # ----------------------------------------------------------------------------------------
    def choose_seqs_to_remove(chain_ids, max_hdist=4, tdbg=False):  # choose one of <chain_ids> to eliminate
        # look for pairs with the same locus that
        ids_to_remove = set(u for u in chain_ids if getloc(u) == '?')
        if tdbg and len(ids_to_remove) > 0:  # i think this actually can't happen a.t.m. TODO maybe remove it
            print '      removed %d with missing annotations' % len(ids_to_remove)

        dbgstr = []
        n_equivalent = 0
        for tpair in itertools.combinations(chain_ids, 2):
            if len(set(getloc(u) for u in tpair)) > 1:
                continue
            if len(set(len(gval(u, 'seqs')) for u in tpair)) > 1:
                continue
            hdist = utils.hamming_distance(*[gval(u, 'seqs') for u in tpair])
            if tdbg:
                dbgstr.append(utils.color('blue' if hdist == 0 else 'yellow', '%d' % hdist))
            if hdist <= max_hdist:  # TODO would be nice to be able to combine their sequences, but I think propagating the resulting annotation modifications would be hard
                # print '      identical sequence overlap, choosing longer one'
                better_id, worse_id = sorted(tpair, key=lambda q: utils.ambig_frac(gval(q, 'seqs')))  # TODO if we're tossing one with hdist > 0, maybe should take the lower-shm one if they're the same length?
                ids_to_remove.add(worse_id)
                n_equivalent += 1
        if tdbg and len(dbgstr) > 0:
            print '      %d pair%s equivalent with hdists %s' % (n_equivalent, utils.plural(n_equivalent), ' '.join(dbgstr))

        # remove unproductive
        dbgstr = []
        unproductive_ids = []
        for uid in chain_ids:
            if not utils.is_functional(all_antns[uid], all_antns[uid]['unique_ids'].index(uid)):
                unproductive_ids.append(uid)
                if tdbg:
                    dbgstr.append(utils.is_functional_dbg_str(all_antns[uid], all_antns[uid]['unique_ids'].index(uid), sep='+'))
        # unproductive_ids = [u for u in chain_ids if not utils.is_functional(all_antns[u], all_antns[u]['unique_ids'].index(u))]  # this way is only one line, which may or may not be nicer
        if tdbg and len(unproductive_ids) > 0:
            print '      %d unproductive  %s' % (len(unproductive_ids), ', '.join(dbgstr))
        ids_to_remove |= set(unproductive_ids)

        return ids_to_remove

    # ----------------------------------------------------------------------------------------
    antn_dicts = {l : utils.get_annotation_dict(antn_lists[l]) for l in antn_lists}

    # first make a map from each uid (for all loci) to its annotation
    pid_groups = []  # list of pid groups, i.e. each element is the uids from a single droplet (for 10x)
    pid_ids = {}  # map from each uid to the index of its pid group
    all_antns = {}
    if debug:
        print '  %s consolidating info for %d loci with cluster/sequence counts: %s' % (utils.color('blue', '+'.join(cpaths)), len(cpaths), '  '.join('%s: %d/%d' % (l, len(cpaths[l].best()), sum(len(c) for c in cpaths[l].best())) for l in sorted(cpaths)))
    for ltmp in sorted(cpaths):
        for cluster in cpaths[ltmp].best():
            cline = antn_dicts[ltmp][':'.join(cluster)]
            if 'paired-uids' not in cline:
                print '  %s no paired-uids in line' % utils.color('yellow', 'warning')
                continue  # maybe should still add to all_antns?
            for uid, pids in zip(cline['unique_ids'], cline['paired-uids']):
                pset = set([uid] + pids)
                found = False
                for ipg, pgroup in enumerate(pid_groups):
                    if any(p in pgroup for p in pset):  # TODO should maybe check for consistency if some of them are already in there (i.e. from reciprocal info in another chain)?
                        found = True
                        pgroup |= pset
                        break
                if not found:
                    pid_groups.append(pset)
                    ipg = len(pid_groups) - 1
                assert ipg is not None
                for pid in pset:
                    pid_ids[pid] = ipg

            cline['loci'] = [ltmp for _ in cline['unique_ids']]  # TODO maybe should add this somewhere else, like in partitiondriver? (eh, maybe not? the locus is always available in each file from the germline info anyway)
            for uid in cline['unique_ids']:
                all_antns[uid] = cline
    # for ipg, pg in enumerate(pid_groups):
    #     print '  %3d %s' % (ipg, ' '.join(pg))

    check_droplet_id_groups()
    # TODO handle/keep better track of failures

    # then go through each group and try to figure out which seqs are real
    print '  cleaning %d pid groups:' % len(pid_groups)
    n_ok = {}
    for ipg, pgroup in enumerate(pid_groups):
        pgroup = [u for u in pgroup if getloc(u) != '?']  # TODO figure out what to do with missing ones
        # print '    %s' % lgstr(pgroup),
        hids = [u for u in pgroup if utils.has_d_gene(getloc(u))]
        lids = [u for u in pgroup if u not in hids]
        if len(hids) < 2 and len(lids) < 2:
            # print '  both ok'
            if lgstr(pgroup) not in n_ok:
                n_ok[lgstr(pgroup)] = 0
            n_ok[lgstr(pgroup)] += 1
            pid_groups[ipg] = pgroup
            continue
        if debug:
            print '    %s' % lgstr(pgroup),
        for chain, idlist in zip(utils.chains, [hids, lids]):
            if len(idlist) < 2:
                continue
            if debug:
                print '\n      too many %s chains: %s' % (chain, lgstr(idlist))
            ids_to_remove = choose_seqs_to_remove(idlist)
            for rid in ids_to_remove:
                pgroup.remove(rid)
                idlist.remove(rid)
            if debug:
                print '      %s: removed %d, leaving %d' % (utils.color('green', 'fixed') if len(idlist) == 1 else utils.color('red', 'nope'), len(ids_to_remove), len(idlist))
                if len(idlist) > 1:
                    for uid in idlist:
                        prutils.print_seq_in_reco_event(all_antns[uid], all_antns[uid]['unique_ids'].index(uid), one_line=True, extra_str='        ', uid_extra_str=utils.locstr(getloc(uid)))
        pid_groups[ipg] = pgroup

    print '    N ok:'
    for lstr, count in sorted(n_ok.items(), key=operator.itemgetter(1), reverse=True):
        print '      %3d  %s' % (count, lstr)

    for ltmp in sorted(cpaths):
        print '%s' % utils.color('green', ltmp)
        cpaths[ltmp].print_partitions()
        for iclust, cluster in enumerate(sorted(cpaths[ltmp].best(), key=len, reverse=True)):
            cline = antn_dicts[ltmp][':'.join(cluster)]
            # before_strs = [lgstr(pids) for pids in cline['paired-uids']]
            cline['paired-uids'] = [[p for p in pid_groups[pid_ids[u]] if p != u] for u in cline['unique_ids']]

            # see what others in its family are paired with
            pfamilies = {}  # TODO rewrite comment: map, for each locus, of the families that are paired with each uid in <cluster> (family name str : family annotation)
            for uid, pids in zip(cline['unique_ids'], cline['paired-uids']):
                for pid in pids:
                    fline = all_antns[pid]
                    fkey = ':'.join(fline['unique_ids'])
                    floc = gval(pid, 'loci')
                    if fkey not in pfamilies:
                        pfamilies[fkey] = {'locus' : floc, 'count' : 0}
                    pfamilies[fkey]['count'] += 1
            print '            N   size  cdr3'
            for fkey, fdict in sorted(pfamilies.items(), key=lambda x: x[1]['count'], reverse=True):
                print '       %s %3d  %3d   %3d' % (utils.locstr(fdict['locus']), fdict['count'], len(antn_dicts[fdict['locus']][fkey]['unique_ids']), antn_dicts[fdict['locus']][fkey]['cdr3_length'])

            def pfkey(p):
                return ':'.join(all_antns[p]['unique_ids'])
            pfcounts = [[pfamilies[pfkey(p)]['count'] for p in pids] for pids in cline['paired-uids']]

            def lcstr(pids, pfcs):
                if len(pids) == 0:
                    return ''
                spids, spfcs = zip(*sorted(zip(pids, pfcs), key=operator.itemgetter(1), reverse=True))
                return '%s %s' % (lgstr(spids, sort=False), ' '.join(str(c) for c in spfcs))
            uid_extra_strs = [lcstr(pids, pfs) for pids, pfs in zip(cline['paired-uids'], pfcounts)]
            utils.print_reco_event(cline, uid_extra_strs=uid_extra_strs, extra_str='  ')

            if iclust >= n_max_clusters:
                break
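# ----------------------------------------------------------------------------------------
# Minimal sketch (made-up uids and a made-up parsing helper, not from the original source) of
# the droplet-id grouping that check_droplet_id_groups() uses as a cross check: with 10x-style
# names every contig from one droplet shares the droplet barcode prefix, so grouping on that
# prefix should reproduce the pid groups. The real parsing lives in utils.get_droplet_id() and
# utils.get_contig_id().
def _example_droplet_id_grouping():
    import itertools
    def droplet_id(uid):  # assumed 10x-style naming: <droplet barcode>_contig_<n>
        return uid.split('_contig_')[0]
    uids = ['AAACCTG-1_contig_1', 'AAACCTG-1_contig_2', 'TTTGGTT-1_contig_1']
    for dropid, drop_queries in itertools.groupby(sorted(uids, key=droplet_id), key=droplet_id):
        print '  %s: %s' % (dropid, ' '.join(drop_queries))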