def print_pairing_info(outfos, paired_uids): loci_by_uid = {sfo['name']: l for l in outfos for sfo in outfos[l] } # locus of each sequence, just for counting below print_cutoff = 0.01 print ' count frac paired with' for locus in utils.sub_loci(args.ig_or_tr): plocicounts = {} for sfo in outfos[locus]: plstr = ' '.join( utils.locstr(l) for l in sorted([ loci_by_uid.get(pid, '?') for pid in paired_uids[sfo['name']] ])) if plstr not in plocicounts: plocicounts[plstr] = 0 plocicounts[plstr] += 1 total = sum(plocicounts.values()) n_skipped = 0 for ipl, (plstr, counts) in enumerate( sorted(plocicounts.items(), key=operator.itemgetter(1), reverse=True)): if counts / float(total) < print_cutoff: n_skipped += counts continue print ' %s %6d %5.2f %s' % (utils.locstr(locus) if ipl == 0 else ' ', counts, counts / float(total), plstr) if n_skipped > 0: print ' +%d counts skipped with <%.3f each' % ( n_skipped, print_cutoff) # utils.color('yellow', 'note
def lpstr(spair):
    # Format a (locus, size) pair: the locus name (colored via utils.locstr when it's
    # the current <locus>, otherwise with its 'ig' prefix stripped) plus the size,
    # colored red when nonzero.  NOTE: reads <locus> from the enclosing scope.
    tloc, tsize = spair
    loc_str = utils.locstr(tloc) if tloc == locus else tloc.replace('ig', '')
    size_str = utils.color('red' if tsize != 0 else None, '%3d' % tsize)
    return '%s %s' % (loc_str, size_str)
def lgstr(lgroup, sort=True):
    # Return a one-line summary of the loci of the uids in <lgroup>, sorted by default.
    # NOTE: reads <getloc> from the enclosing scope.
    group_loci = [getloc(u) for u in lgroup]
    if sort:
        group_loci = sorted(group_loci)
    else:
        group_loci = utils.pass_fcn(group_loci)  # identity pass-through, matching the sorted branch's copy semantics
    return ' '.join(utils.locstr(l) for l in group_loci)
def clean_pair_info(cpaths, antn_lists, n_max_clusters=3, debug=False):
    # Reconcile pairing info across loci: merge each uid's 'paired-uids' into droplet-level
    # "pid groups", remove apparently-spurious extra chains from any group with more than one
    # heavy or more than one light chain, then rewrite each annotation's 'paired-uids' from
    # the cleaned groups and print summaries for the <n_max_clusters> largest clusters.
    # NOTE(review): modifies annotations in place (adds 'loci', rewrites 'paired-uids').
    #   cpaths: map from locus to cluster path object (has .best() and .print_partitions() -- presumably ClusterPath, confirm against caller)
    #   antn_lists: map from locus to list of annotations
    # ----------------------------------------------------------------------------------------
    def check_droplet_id_groups(tdbg=False):
        # check against the droplet id method (we could just do it this way, but it would only work for 10x, and only until they change their naming convention)
        pgroup_strs = set(':'.join(sorted(pg)) for pg in pid_groups)
        all_uids = list(set([su for l in cpaths for c in cpaths[l].best() for u in c for su in [u] + utils.per_seq_val(all_antns[u], 'paired-uids', u)]))
        n_not_found = 0
        for dropid, drop_queries in itertools.groupby(sorted(all_uids, key=utils.get_droplet_id), key=utils.get_droplet_id):
            dqlist = list(drop_queries)
            found = ':'.join(sorted(dqlist)) in pgroup_strs
            if not found:
                overlaps = [g for g in pgroup_strs if dropid in g]
                overlaps = utils.get_single_entry(overlaps)
                n_not_found += 1
            if tdbg or not found:
                print ' %25s %s %s %s' % (utils.color('green', '-') if found else utils.color('red', 'x'), dropid, ' '.join(sorted(utils.get_contig_id(q) for q in dqlist)), utils.color('red', ' '.join(sorted(utils.get_contig_id(q) for q in overlaps.split(':'))) if not found else ''))
        if n_not_found > 0:
            print ' %s droplet id group check failed for %d groups' % (utils.color('red', 'error'), n_not_found)
    # ----------------------------------------------------------------------------------------
    def getloc(uid):
        # locus of <uid>, or '?' if we have no annotation for it
        if uid not in all_antns:
            return '?'
        return utils.per_seq_val(all_antns[uid], 'loci', uid)
    # ----------------------------------------------------------------------------------------
    def gval(uid, key):  # get per-seq val for <uid>
        if uid not in all_antns:
            return None
        return utils.per_seq_val(all_antns[uid], key, uid)
    # ----------------------------------------------------------------------------------------
    def lgstr(lgroup, sort=True):
        # one-line summary of the loci of the uids in <lgroup>
        return ' '.join(utils.locstr(l) for l in (sorted if sort else utils.pass_fcn)([getloc(u) for u in lgroup]))
    # ----------------------------------------------------------------------------------------
    def choose_seqs_to_remove(chain_ids, max_hdist=4, tdbg=False):  # choose one of <chain_ids> to eliminate
        # look for pairs with the same locus that
        ids_to_remove = set(u for u in chain_ids if getloc(u) == '?')
        if tdbg and len(ids_to_remove) > 0:  # i think this actually can't happen a.t.m. TODO maybe remove it
            print ' removed %d with missing annotations' % len(ids_to_remove)
        # mark near-identical same-locus, same-length pairs (within <max_hdist>), keeping the less-ambiguous one
        dbgstr = []
        n_equivalent = 0
        for tpair in itertools.combinations(chain_ids, 2):
            if len(set(getloc(u) for u in tpair)) > 1:  # different loci: not comparable
                continue
            if len(set(len(gval(u, 'seqs')) for u in tpair)) > 1:  # different lengths: hamming distance undefined
                continue
            hdist = utils.hamming_distance(*[gval(u, 'seqs') for u in tpair])
            if tdbg:
                dbgstr.append(utils.color('blue' if hdist == 0 else 'yellow', '%d' % hdist))
            if hdist <= max_hdist:  # TODO would be nice to be able to combine their sequences, but I think propagating the resulting annotation modifications would be hard
                # print ' identical sequence overlap, choosing longer one'
                better_id, worse_id = sorted(tpair, key=lambda q: utils.ambig_frac(gval(q, 'seqs')))  # TODO if we're tossing one with hdist > 0, maybe should take the lower-shm one if they're the same length?
                ids_to_remove.add(worse_id)
                n_equivalent += 1
        if tdbg and len(dbgstr) > 0:
            print ' %d pair%s equivalent with hdists %s' % (n_equivalent, utils.plural(n_equivalent), ' '.join(dbgstr))
        # remove unproductive
        dbgstr = []
        unproductive_ids = []
        for uid in chain_ids:
            if not utils.is_functional(all_antns[uid], all_antns[uid]['unique_ids'].index(uid)):
                unproductive_ids.append(uid)
                if tdbg:
                    dbgstr.append(utils.is_functional_dbg_str(all_antns[uid], all_antns[uid]['unique_ids'].index(uid), sep='+'))
        # unproductive_ids = [u for u in chain_ids if not utils.is_functional(all_antns[u], all_antns[u]['unique_ids'].index(u))]  # this way is only one line, which may or may not be nicer
        if tdbg and len(unproductive_ids) > 0:
            print ' %d unproductive %s' % (len(unproductive_ids), ', '.join(dbgstr))
        ids_to_remove |= set(unproductive_ids)
        return ids_to_remove
    # ----------------------------------------------------------------------------------------
    antn_dicts = {l: utils.get_annotation_dict(antn_lists[l]) for l in antn_lists}
    # first make a map from each uid (for all loci) to its annotation
    pid_groups = []  # list of pid groups, i.e. each element is the uids from a single droplet (for 10x)
    pid_ids = {}  # map from each uid to the index of its pid group
    all_antns = {}  # map from each uid (over all loci) to its annotation
    if debug:
        print ' %s consolidating info for %d loci with cluster/sequence counts: %s' % (utils.color('blue', '+'.join(cpaths)), len(cpaths), ' '.join('%s: %d/%d' % (l, len(cpaths[l].best()), sum(len(c) for c in cpaths[l].best())) for l in sorted(cpaths)))
    for ltmp in sorted(cpaths):
        for cluster in cpaths[ltmp].best():
            cline = antn_dicts[ltmp][':'.join(cluster)]
            if 'paired-uids' not in cline:
                print ' %s no paired-uids in line' % utils.color('yellow', 'warning')
                continue  # maybe should still add to all_antns?
            for uid, pids in zip(cline['unique_ids'], cline['paired-uids']):
                pset = set([uid] + pids)  # the uid plus everything it's paired with
                found = False
                for ipg, pgroup in enumerate(pid_groups):
                    if any(p in pgroup for p in pset):  # TODO should maybe check for consistency if some of them are already in there (i.e. from reciprocal info in another chain)?
                        found = True
                        pgroup |= pset  # merge into the existing group
                        break
                if not found:
                    pid_groups.append(pset)
                    ipg = len(pid_groups) - 1
                assert ipg is not None
                for pid in pset:
                    pid_ids[pid] = ipg
            cline['loci'] = [ltmp for _ in cline['unique_ids']]  # TODO maybe should add this somewhere else, like in partitiondriver? (eh, maybe not? the locus is always available in each file from the germline info anyway)
            for uid in cline['unique_ids']:
                all_antns[uid] = cline
    # for ipg, pg in enumerate(pid_groups):
    #     print ' %3d %s' % (ipg, ' '.join(pg))
    check_droplet_id_groups()
    # TODO handle/keep better track of failures
    # then go through each group and try to figure out which seqs are real
    print ' cleaning %d pid groups:' % len(pid_groups)
    n_ok = {}  # counts of already-clean groups, keyed by their locus-summary string
    for ipg, pgroup in enumerate(pid_groups):
        pgroup = [u for u in pgroup if getloc(u) != '?']  # TODO figure out what to do with missing ones
        # print ' %s' % lgstr(pgroup),
        hids = [u for u in pgroup if utils.has_d_gene(getloc(u))]  # heavy chain uids
        lids = [u for u in pgroup if u not in hids]  # light chain uids
        if len(hids) < 2 and len(lids) < 2:  # at most one of each chain: nothing to clean
            # print ' both ok'
            if lgstr(pgroup) not in n_ok:
                n_ok[lgstr(pgroup)] = 0
            n_ok[lgstr(pgroup)] += 1
            pid_groups[ipg] = pgroup
            continue
        if debug:
            print ' %s' % lgstr(pgroup),
        for chain, idlist in zip(utils.chains, [hids, lids]):
            if len(idlist) < 2:  # this chain has no extras
                continue
            if debug:
                print '\n too many %s chains: %s' % (chain, lgstr(idlist))
            ids_to_remove = choose_seqs_to_remove(idlist)
            for rid in ids_to_remove:
                pgroup.remove(rid)
                idlist.remove(rid)
            if debug:
                print ' %s: removed %d, leaving %d' % (utils.color('green', 'fixed') if len(idlist) == 1 else utils.color('red', 'nope'), len(ids_to_remove), len(idlist))
            if len(idlist) > 1:  # still ambiguous: show the surviving candidates
                for uid in idlist:
                    prutils.print_seq_in_reco_event(all_antns[uid], all_antns[uid]['unique_ids'].index(uid), one_line=True, extra_str=' ', uid_extra_str=utils.locstr(getloc(uid)))
        pid_groups[ipg] = pgroup
    print ' N ok:'
    for lstr, count in sorted(n_ok.items(), key=operator.itemgetter(1), reverse=True):
        print ' %3d %s' % (count, lstr)
    # rewrite each annotation's 'paired-uids' from the cleaned groups, and print per-cluster summaries
    for ltmp in sorted(cpaths):
        print '%s' % utils.color('green', ltmp)
        cpaths[ltmp].print_partitions()
        for iclust, cluster in enumerate(sorted(cpaths[ltmp].best(), key=len, reverse=True)):
            cline = antn_dicts[ltmp][':'.join(cluster)]
            # before_strs = [lgstr(pids) for pids in cline['paired-uids']]
            cline['paired-uids'] = [[p for p in pid_groups[pid_ids[u]] if p != u] for u in cline['unique_ids']]
            # see what others in its family are paired with
            pfamilies = {}  # TODO rewrite comment: map, for each locus, of the families that are paired with each uid in <cluster> (family name str : family annotation)
            for uid, pids in zip(cline['unique_ids'], cline['paired-uids']):
                for pid in pids:
                    fline = all_antns[pid]
                    fkey = ':'.join(fline['unique_ids'])
                    floc = gval(pid, 'loci')
                    if fkey not in pfamilies:
                        pfamilies[fkey] = {'locus': floc, 'count': 0}
                    pfamilies[fkey]['count'] += 1
            print ' N size cdr3'
            for fkey, fdict in sorted(pfamilies.items(), key=lambda x: x[1]['count'], reverse=True):
                print ' %s %3d %3d %3d' % (utils.locstr(fdict['locus']), fdict['count'], len(antn_dicts[fdict['locus']][fkey]['unique_ids']), antn_dicts[fdict['locus']][fkey]['cdr3_length'])
            def pfkey(p):
                # family key (colon-joined unique ids) of the family that paired uid <p> belongs to
                return ':'.join(all_antns[p]['unique_ids'])
            pfcounts = [[pfamilies[pfkey(p)]['count'] for p in pids] for pids in cline['paired-uids']]
            def lcstr(pids, pfcs):
                # loci of <pids> plus their family counts, sorted together by decreasing count
                if len(pids) == 0:
                    return ''
                spids, spfcs = zip(*sorted(zip(pids, pfcs), key=operator.itemgetter(1), reverse=True))
                return '%s %s' % (lgstr(spids, sort=False), ' '.join(str(c) for c in spfcs))
            uid_extra_strs = [lcstr(pids, pfs) for pids, pfs in zip(cline['paired-uids'], pfcounts)]
            utils.print_reco_event(cline, uid_extra_strs=uid_extra_strs, extra_str=' ')
            if iclust >= n_max_clusters:
                break