예제 #1
0
def read_germline_seqs(gldir, locus, skip_pseudogenes):
    """
    Read the germline sequences for <locus> from the fasta files in <gldir>/<locus>.

    Builds a dict with one (initially empty) OrderedDict for each region in utils.regions, then hands
    that dict, each fasta path, and <skip_pseudogenes> to read_fasta_file() (defined elsewhere --
    presumably it fills the dict in place). For a locus without a d gene, a one-base dummy d
    sequence is added under the name from dummy_d_genes.

    Returns the dict of per-region sequences.
    """
    seqs = {}
    for region in utils.regions:
        seqs[region] = OrderedDict()

    locus_dir = gldir + '/' + locus
    for fasta_name in glfo_fasta_fnames(locus):
        read_fasta_file(seqs, locus_dir + '/' + fasta_name, skip_pseudogenes)

    if utils.has_d_gene(locus):
        return seqs

    # no real d gene for this locus, so choose a sequence for the dummy d
    seqs['d'][dummy_d_genes[locus]] = 'A'  # this (arbitrary) choice is also made in packages/ham/src/bcrutils.cc
    return seqs
예제 #2
0
 def getvalstr(gene, val):
     """
     Format one table cell for <gene> and its value <val>.

     Returns a placeholder cell (dashes) if <gene> is None, or if it is a d gene for a locus that
     has no d gene. Otherwise returns a cell with <val> printed as a percentage (100 * val) followed
     by the gene name. <cstr>, <estr>, <latex> and <emph_genes> are read from the enclosing scope
     (not visible in this block).
     """
     is_dummy_d = gene is not None and utils.get_region(gene) == 'd' and not utils.has_d_gene(utils.get_locus(gene))
     if gene is None or is_dummy_d:
         padding = 4 * ' ' if latex else ''
         return '%s  %5.2s  %s  %-16s%s' % (cstr, ' - ', cstr, ' - ', padding)

     if not latex:
         gstr = utils.color_gene(gene, width=18)
     else:
         gstr = utils.shorten_gene_name(gene, use_one_based_indexing=True, n_max_mutstrs=5)
         if emph_genes is not None and gene in emph_genes:  # highlight the genes we were asked to emphasize
             gstr = '\\color{red}{\\textbf{%s}}' % gstr
     return '%s  %s%5.2f%s %s %-20s' % (cstr, estr, 100 * val, estr, cstr, gstr)
예제 #3
0
    def try_scratch_erode_insert(self, tmpline, debug=False):
        """
        Choose random erosion lengths and insertions for <tmpline>, then rebuild its sequence and implicit info.

        <tmpline> is modified in place:
          - all implicit info is removed (utils.remove_all_implicit_info())
          - '<erosion>_del' is set for each erosion in utils.real_erosions
          - '<bound>_insertion' is set for each boundary in utils.boundaries
          - 'seqs', 'unique_ids', 'input_seqs', and 'indelfos' are filled in by hand, after which
            utils.add_implicit_info() is called

        Erosion lengths are drawn from a geometric distribution and capped so that neither (roughly) half
        the gene nor a conserved codon is eroded. If <debug>, the chosen erosions and insertions are printed.
        """
        utils.remove_all_implicit_info(tmpline)
        for erosion in utils.real_erosions:  # includes various contortions to avoid eroding the entire gene
            region = erosion[0]
            gene_length = len(self.glfo['seqs'][region][tmpline[region + '_gene']])
            if region == 'd' and not utils.has_d_gene(self.args.locus):  # dummy d genes: always erode the whole thing from the left
                assert gene_length == 1 and tmpline['d_gene'] == glutils.dummy_d_genes[self.args.locus]
                tmpline[erosion + '_del'] = 1 if '5p' in erosion else 0
            else:
                # heuristic; floor division is explicit so the erosion length stays an integer (it's used as a slice index below) regardless of python version
                max_erosion = max(0, gene_length // 2 - 2)
                if region in utils.conserved_codons[self.args.locus]:  # make sure not to erode a conserved codon
                    codon_pos = utils.cdn_pos(self.glfo, region, tmpline[region + '_gene'])
                    if '3p' in erosion:
                        n_bases_to_codon = gene_length - codon_pos - 3
                    elif '5p' in erosion:
                        n_bases_to_codon = codon_pos
                    max_erosion = min(max_erosion, n_bases_to_codon)
                # subtract one from the geometric draw so that an erosion of zero is possible
                tmpline[erosion + '_del'] = min(max_erosion, numpy.random.geometric(1. / utils.scratch_mean_erosion_lengths[erosion]) - 1)
        for bound in utils.boundaries:
            mean_length = utils.scratch_mean_insertion_lengths[self.args.locus][bound]
            length = 0 if mean_length == 0 else numpy.random.geometric(1. / mean_length) - 1
            probs = [self.insertion_content_probs[bound][n] for n in utils.nukes]
            tmpline[bound + '_insertion'] = ''.join(numpy.random.choice(utils.nukes, size=length, p=probs))

        if debug:
            # single parenthesized expression, so this prints the same thing whether print is a statement or a function
            print('    erosions:  %s' % ('   '.join([('%s %d' % (e, tmpline[e + '_del'])) for e in utils.real_erosions])))
            print('    insertions:  %s' % ('   '.join([('%s %s' % (b, tmpline[b + '_insertion'])) for b in utils.boundaries])))

        # have to add some things by hand so utils.add_implicit_info() doesn't barf (this duplicates code later on in recombinator)
        gl_seqs = {r : self.glfo['seqs'][r][tmpline[r + '_gene']] for r in utils.regions}
        for erosion in utils.real_erosions:  # trim the eroded bases off of each germline sequence
            region = erosion[0]
            e_length = tmpline[erosion + '_del']
            if '5p' in erosion:
                gl_seqs[region] = gl_seqs[region][e_length:]
            elif '3p' in erosion:
                gl_seqs[region] = gl_seqs[region][:len(gl_seqs[region]) - e_length]
        tmpline['seqs'] = [gl_seqs['v'] + tmpline['vd_insertion'] + gl_seqs['d'] + tmpline['dj_insertion'] + gl_seqs['j'], ]
        tmpline['unique_ids'] = [None]  # this is kind of hackey, but some things in the implicit info adder use it to get the number of sequences
        tmpline['input_seqs'] = copy.deepcopy(tmpline['seqs'])  # NOTE has to be updated _immediately_ so seqs and input_seqs don't get out of sync
        tmpline['indelfos'] = [indelutils.get_empty_indel(), ]
        utils.add_implicit_info(self.glfo, tmpline)
        assert len(tmpline['in_frames']) == 1
예제 #4
0
def print_seq_in_reco_event(original_line,
                            iseq,
                            extra_str='',
                            label='',
                            one_line=False,
                            seed_uid=None,
                            check_line_integrity=False):
    """
    Print ascii summary of recombination event and mutation.
    If <one_line>, then skip the germline lines, and only print the final_seq line.

    Arguments:
      original_line: annotation dict for the rearrangement event (this function reads the '*_del', '*_insertion',
                     '*_gl_seq', '*_gene', 'lengths', 'seqs', 'indelfos', and 'mut_freqs' keys)
      iseq: index, within the per-sequence lists in the annotation, of the sequence to print
      extra_str: string prepended to every output line
      label: if non-empty, overwrites part of the first output line, starting two characters before the end of <extra_str>
      one_line: if set, only the query sequence line is printed
      seed_uid: passed through to get_uid_str()
      check_line_integrity: if set, work on a deep copy of <original_line>, and at the end raise an Exception if
                            the copy and the original no longer have the same keys and values
    """
    line = original_line
    if check_line_integrity:  # it's very important not to modify <line> -- this lets you verify that you aren't
        line = copy.deepcopy(
            original_line)  # copy that we can modify without changing <line>

    # one string of dots for each erosion, with as many dots as there are deleted bases
    delstrs = {
        d: '.' * line[d + '_del']
        for d in utils.all_erosions
    }  # NOTE len(delstrs[<del>]) is not in general the same as len(line[<del>_del])
    if len(
            delstrs['v_5p']
    ) > 50:  # don't print a million dots if left-side v deletion is really big
        delstrs['v_5p'] = '.%d.' % len(delstrs['v_5p'])

    # if there isn't enough space for dots in the vj line, we add some dashes to everybody so things fit (rare in heavy chain rearrangements, but pretty common in light chain)
    d_plus_inserts_length = len(line['vd_insertion'] + line['d_gl_seq'] +
                                line['dj_insertion'])
    if line['v_3p_del'] + line[
            'j_5p_del'] > d_plus_inserts_length:  # if dots for v and j interior deletions will be longer than <d_plus_inserts_length>
        # replace the interior dots with the compact '.N.' form, then work out how many dashes are still needed
        delstrs['v_3p'] = '.%d.' % line['v_3p_del']
        delstrs['j_5p'] = '.%d.' % line['j_5p_del']
        gapstr = '-' * (len(delstrs['v_3p'] + delstrs['j_5p']) -
                        d_plus_inserts_length)
        gap_insert_point = len(
            line['fv_insertion'] + delstrs['v_5p'] + line['v_gl_seq']
        )  # it doesn't really matter exactly where we put the blue dashes, as long as it's the same place in all four lines, but this is a good spot
        extra_space_because_of_fixed_nospace = max(
            0, d_plus_inserts_length - len(delstrs['v_3p'] + delstrs['j_5p'])
        )  # if shortening the <delstrs> already over-compensated for the lack of space (i.e., if the number of dashes necessary is zero), then we need to add some dots to the vj line below
    else:
        gapstr = ''
        gap_insert_point = None
        extra_space_because_of_fixed_nospace = 0

    # germline sequence for each region, with the deletion dots attached on either side
    eroded_seqs_dots = {
        r: delstrs[r + '_5p'] + line[r + '_gl_seq'] + delstrs[r + '_3p']
        for r in utils.regions
    }

    # build the three germline lines
    # first the insertion line: the vd and dj insertions, with spaces everywhere else
    insert_line = ' ' * (len(line['fv_insertion']) + line['lengths']['v'] + len(delstrs['v_5p'])) \
                  + line['vd_insertion'] + ' ' * line['lengths']['d'] + line['dj_insertion'] \
                  + ' ' * (line['lengths']['j'] + line['j_3p_del'] + len(line['jf_insertion']))
    # then the d line: the d germline sequence (plus deletion dots), padded with spaces on both sides
    germline_d_start = len(line['fv_insertion']) + line['lengths']['v'] + len(
        line['vd_insertion']) - line['d_5p_del']
    germline_d_end = germline_d_start + line['d_5p_del'] + line['lengths'][
        'd'] + line['d_3p_del']
    d_line = ' ' * (germline_d_start + len(delstrs['v_5p'])) \
             + eroded_seqs_dots['d'] \
             + ' ' * (len(line['j_gl_seq']) + len(line['dj_insertion']) - line['d_3p_del'] + line['j_3p_del'] + len(line['jf_insertion']))
    # then the vj line: the v and j germline sequences (plus deletion dots), separated by spaces
    # NOTE(review): the comment below mentions subtracting v_5p_del, but no such subtraction appears in the expression -- confirm which is intended
    germline_v_end = len(line['fv_insertion']) + len(line['v_gl_seq']) + line[
        'v_3p_del'] - 1  # position in the query sequence at which we find the last base of the v match. NOTE we subtract off the v_5p_del because we're *not* adding dots for that deletion (it's just too long)
    germline_j_start = germline_d_end + 1 - line['d_3p_del'] + len(
        line['dj_insertion']) - line['j_5p_del']
    vj_line = ' ' * len(line['fv_insertion']) + eroded_seqs_dots['v'] + '.' * extra_space_because_of_fixed_nospace \
              + ' ' * (germline_j_start - germline_v_end - 2) + eroded_seqs_dots['j'] + ' ' * len(line['jf_insertion'])
    # and the query line
    qrseq_line = ' ' * len(
        delstrs['v_5p']) + line['seqs'][iseq] + ' ' * line['j_3p_del']

    outstrs = [insert_line, d_line, vj_line, qrseq_line]
    check_outsr_lengths(
        line, outstrs, fix=True
    )  # I think the only way they can be different is if the d right side erosion is so long that it hangs over the right side of the j

    # insert the dashes at the same position in each line that needs them
    if gap_insert_point is not None:
        for istr in [
                0, 1, 3
        ]:  # everybody except the vj line, which already has the modified interior delstrs above
            outstrs[
                istr] = outstrs[istr][:gap_insert_point] + gapstr + outstrs[
                    istr][gap_insert_point:]

    check_outsr_lengths(line, outstrs, fix=True)

    # one (initially empty) list of colors for each character in each line
    colors = [[[] for _ in range(len(ostr))] for ostr in outstrs]
    if indelutils.has_indels(line['indelfos'][iseq]):
        # outstrs, colors = old_indel_shenanigans(line, iseq, outstrs, colors)
        outstrs, colors = indel_shenanigans(line, iseq, outstrs, colors)
    outstrs = add_colors(outstrs, colors, line)

    # text that goes to the right of each of the four lines (each ends with a newline)
    suffixes = [
        'insert%s\n' %
        ('s' if utils.has_d_gene(utils.get_locus(line['v_gene'])) else ''),
        '%s\n' % (utils.color_gene(line['d_gene'])),
        '%s %s\n' %
        (utils.color_gene(line['v_gene']), utils.color_gene(line['j_gene'])),
        '%s   %4.2f mut  %s\n' %
        (get_uid_str(line, iseq, seed_uid), line['mut_freqs'][iseq],
         utils.color('red', utils.is_functional_dbg_str(line, iseq)))
    ]
    outstrs = [
        '%s%s   %s' % (extra_str, ostr, suf)
        for ostr, suf in zip(outstrs, suffixes)
    ]

    if label != '':  # this doesn't really work if the edge of the removed string is the middle of a color code... but oh well, it doesn't really happen any more since I shortened the kbound label from waterer.py
        offset = max(
            0,
            len(extra_str) -
            2)  # skootch <label> this many positions leftward into <extra_str>
        removed_str = outstrs[0][offset:offset +
                                 utils.len_excluding_colors(label)]
        outstrs[0] = outstrs[0][:offset] + label + outstrs[0][
            utils.len_excluding_colors(label) +
            offset:]  # NOTE this *replaces* the bases in <extra_str> with <label>, which is only fine if they're spaces
        if removed_str.strip() != '':  # warn if the label covered up anything other than whitespace
            print '%s%s (covered by label \'%s\')' % (
                ' ' * offset, utils.color('red', removed_str), label)

    if one_line:
        outstrs = outstrs[-1:]  # remove all except the query seq line
    elif not utils.has_d_gene(utils.get_locus(line['v_gene'])):
        outstrs.pop(1)  # remove the d germline line

    # each line already ends with a newline (from <suffixes>), so the trailing comma suppresses the extra one
    print ''.join(outstrs),

    # make sure that nothing above modified the annotation
    if check_line_integrity:
        if set(line.keys()) != set(original_line.keys()):
            raise Exception('ack 1')
        for k in line:
            if line[k] != original_line[k]:
                print 'key %s differs:\n  %s\n  %s ' % (k, line[k],
                                                        original_line[k])
                raise Exception('')
예제 #5
0
import sys
import os
import random
import re
import numpy
import glob
from collections import OrderedDict
import csv
from subprocess import check_call, Popen, PIPE

import utils

# ----------------------------------------------------------------------------------------
# name of the subdirectory that germline info always gets put into
glfo_dir = 'germline-sets'

# placeholder d gene name for each locus that doesn't have a d gene (e.g. IGKDx-x*x for igk), and None for each locus that does (e.g. igh)
dummy_d_genes = dict((l, None if utils.has_d_gene(l) else l.upper() + 'Dx-x*x') for l in utils.loci)

# single-locus file names
extra_fname = 'extras.csv'
def glfo_fasta_fnames(locus):
    """Return the fasta file name (<locus><region>.fasta) for each region in utils.getregions(locus)."""
    fnames = []
    for region in utils.getregions(locus):
        fnames.append(locus + region + '.fasta')
    return fnames
def glfo_fnames(locus):
    """Return all single-locus germline file names: the extras csv first, then the fasta file for each region."""
    fnames = [extra_fname]
    fnames.extend(glfo_fasta_fnames(locus))
    return fnames

# column names for the csv file
csv_headers = ['gene', 'cyst_position', 'tryp_position', 'phen_position', 'aligned_seq']

# each functionality code (F, ORF, P), both bare and wrapped in parentheses or square brackets (not actually sure what the parentheses and brackets mean)
functionalities = []
for f in ['F', 'ORF', 'P']:
    for sep in ['  ', '()', '[]']:
        functionalities.append((sep[0] + f + sep[1]).strip())  # the two-space "separator" strips down to the bare code
# the subset of the above that corresponds to pseudogenes
pseudogene_funcionalities = ['P', '[P]', '(P)']

duplicate_names = {
    'v' : [
예제 #6
0
def clean_pair_info(cpaths, antn_lists, n_max_clusters=3, debug=False):
    # ----------------------------------------------------------------------------------------
    def check_droplet_id_groups(tdbg=False):
        # check against the droplet id method (we could just do it this way, but it would only work for 10x, and only until they change their naming convention)
        pgroup_strs = set(':'.join(sorted(pg)) for pg in pid_groups)
        all_uids = list(
            set([
                su for l in cpaths for c in cpaths[l].best() for u in c
                for su in [u] +
                utils.per_seq_val(all_antns[u], 'paired-uids', u)
            ]))
        n_not_found = 0
        for dropid, drop_queries in itertools.groupby(
                sorted(all_uids, key=utils.get_droplet_id),
                key=utils.get_droplet_id):
            dqlist = list(drop_queries)
            found = ':'.join(sorted(dqlist)) in pgroup_strs
            if not found:
                overlaps = [g for g in pgroup_strs if dropid in g]
                overlaps = utils.get_single_entry(overlaps)
                n_not_found += 1
            if tdbg or not found:
                print '  %25s %s  %s  %s' % (
                    utils.color('green', '-') if found else utils.color(
                        'red', 'x'), dropid, ' '.join(
                            sorted(utils.get_contig_id(q) for q in dqlist)),
                    utils.color(
                        'red', ' '.join(
                            sorted(
                                utils.get_contig_id(q)
                                for q in overlaps.split(':')))
                        if not found else ''))
        if n_not_found > 0:
            print '  %s droplet id group check failed for %d groups' % (
                utils.color('red', 'error'), n_not_found)

    # ----------------------------------------------------------------------------------------
    def getloc(uid):
        if uid not in all_antns:
            return '?'
        return utils.per_seq_val(all_antns[uid], 'loci', uid)

    # ----------------------------------------------------------------------------------------
    def gval(uid, key):  # get per-seq val for <uid>
        if uid not in all_antns:
            return None
        return utils.per_seq_val(all_antns[uid], key, uid)

    # ----------------------------------------------------------------------------------------
    def lgstr(lgroup, sort=True):
        return ' '.join(
            utils.locstr(l) for l in (sorted if sort else utils.pass_fcn
                                      )([getloc(u) for u in lgroup]))

    # ----------------------------------------------------------------------------------------
    def choose_seqs_to_remove(
            chain_ids,
            max_hdist=4,
            tdbg=False):  # choose one of <chain_ids> to eliminate
        # look for pairs with the same locus that
        ids_to_remove = set(u for u in chain_ids if getloc(u) == '?')
        if tdbg and len(
                ids_to_remove
        ) > 0:  # i think this actually can't happen a.t.m. TODO maybe remove it
            print '      removed %d with missing annotations' % len(
                ids_to_remove)

        dbgstr = []
        n_equivalent = 0
        for tpair in itertools.combinations(chain_ids, 2):
            if len(set(getloc(u) for u in tpair)) > 1:
                continue
            if len(set(len(gval(u, 'seqs')) for u in tpair)) > 1:
                continue
            hdist = utils.hamming_distance(*[gval(u, 'seqs') for u in tpair])
            if tdbg:
                dbgstr.append(
                    utils.color('blue' if hdist == 0 else 'yellow',
                                '%d' % hdist))
            if hdist <= max_hdist:  # TODO would be nice to be able to combine their sequences, but I think propagating the resulting annotation modifications would be hard
                # print '      identical sequence overlap, choosing longer one'
                better_id, worse_id = sorted(
                    tpair, key=lambda q: utils.ambig_frac(gval(q, 'seqs'))
                )  # TODO if we're tossing one with hdist > 0, maybe should take the lower-shm one if they're the same length?
                ids_to_remove.add(worse_id)
                n_equivalent += 1
        if tdbg and len(dbgstr) > 0:
            print '        %d pair%s equivalent with hdists %s' % (
                n_equivalent, utils.plural(n_equivalent), ' '.join(dbgstr))

        # remove unproductive
        dbgstr = []
        unproductive_ids = []
        for uid in chain_ids:
            if not utils.is_functional(
                    all_antns[uid], all_antns[uid]['unique_ids'].index(uid)):
                unproductive_ids.append(uid)
                if tdbg:
                    dbgstr.append(
                        utils.is_functional_dbg_str(
                            all_antns[uid],
                            all_antns[uid]['unique_ids'].index(uid),
                            sep='+'))
        # unproductive_ids = [u for u in chain_ids if not utils.is_functional(all_antns[u], all_antns[u]['unique_ids'].index(u))]  # this way is only one line, which may or may not be nicer
        if tdbg and len(unproductive_ids) > 0:
            print '        %d unproductive  %s' % (len(unproductive_ids),
                                                   ',  '.join(dbgstr))
            ids_to_remove |= set(unproductive_ids)

        return ids_to_remove

    # ----------------------------------------------------------------------------------------
    antn_dicts = {
        l: utils.get_annotation_dict(antn_lists[l])
        for l in antn_lists
    }

    # first make a map from each uid (for all loci) to its annotation
    pid_groups = [
    ]  # list of pid groups, i.e. each element is the uids from a single droplet (for 10x)
    pid_ids = {}  # map from each uid to the index of its pid group
    all_antns = {}
    if debug:
        print '  %s consolidating info for %d loci with cluster/sequence counts: %s' % (
            utils.color('blue', '+'.join(cpaths)), len(cpaths), '  '.join(
                '%s: %d/%d' % (l, len(cpaths[l].best()),
                               sum(len(c) for c in cpaths[l].best()))
                for l in sorted(cpaths)))
    for ltmp in sorted(cpaths):
        for cluster in cpaths[ltmp].best():
            cline = antn_dicts[ltmp][':'.join(cluster)]
            if 'paired-uids' not in cline:
                print '  %s no paired-uids in line' % utils.color(
                    'yellow', 'warning')
                continue  # maybe should still add to all_antns?
            for uid, pids in zip(cline['unique_ids'], cline['paired-uids']):
                pset = set([uid] + pids)
                found = False
                for ipg, pgroup in enumerate(pid_groups):
                    if any(
                            p in pgroup for p in pset
                    ):  # TODO should maybe check for consistency if some of them are already in there (i.e. from reciprocal info in another chain)?
                        found = True
                        pgroup |= pset
                        break
                if not found:
                    pid_groups.append(pset)
                    ipg = len(pid_groups) - 1
                assert ipg is not None
                for pid in pset:
                    pid_ids[pid] = ipg

            cline['loci'] = [
                ltmp for _ in cline['unique_ids']
            ]  # TODO maybe should add this somewhere else, like in partitiondriver? (eh, maybe not? the locus is always available in each file from the germline info anyway)
            for uid in cline['unique_ids']:
                all_antns[uid] = cline
    # for ipg, pg in enumerate(pid_groups):
    #     print '  %3d %s' % (ipg, ' '.join(pg))

    check_droplet_id_groups()
    # TODO handle/keep better track of failures

    # then go through each group and try to figure out which seqs are real
    print '  cleaning %d pid groups:' % len(pid_groups)
    n_ok = {}
    for ipg, pgroup in enumerate(pid_groups):
        pgroup = [u for u in pgroup if getloc(u) != '?'
                  ]  # TODO figure out what to do with missing ones
        # print '    %s' % lgstr(pgroup),
        hids = [u for u in pgroup if utils.has_d_gene(getloc(u))]
        lids = [u for u in pgroup if u not in hids]
        if len(hids) < 2 and len(lids) < 2:
            # print '  both ok'
            if lgstr(pgroup) not in n_ok:
                n_ok[lgstr(pgroup)] = 0
            n_ok[lgstr(pgroup)] += 1
            pid_groups[ipg] = pgroup
            continue
        if debug:
            print '    %s' % lgstr(pgroup),
        for chain, idlist in zip(utils.chains, [hids, lids]):
            if len(idlist) < 2:
                continue
            if debug:
                print '\n      too many %s chains: %s' % (chain, lgstr(idlist))
            ids_to_remove = choose_seqs_to_remove(idlist)
            for rid in ids_to_remove:
                pgroup.remove(rid)
                idlist.remove(rid)
            if debug:
                print '      %s: removed %d, leaving %d' % (utils.color(
                    'green', 'fixed') if len(idlist) == 1 else utils.color(
                        'red', 'nope'), len(ids_to_remove), len(idlist))
                if len(idlist) > 1:
                    for uid in idlist:
                        prutils.print_seq_in_reco_event(
                            all_antns[uid],
                            all_antns[uid]['unique_ids'].index(uid),
                            one_line=True,
                            extra_str='        ',
                            uid_extra_str=utils.locstr(getloc(uid)))

        pid_groups[ipg] = pgroup

    print '    N ok:'
    for lstr, count in sorted(n_ok.items(),
                              key=operator.itemgetter(1),
                              reverse=True):
        print '      %3d  %s' % (count, lstr)

    for ltmp in sorted(cpaths):
        print '%s' % utils.color('green', ltmp)
        cpaths[ltmp].print_partitions()
        for iclust, cluster in enumerate(
                sorted(cpaths[ltmp].best(), key=len, reverse=True)):
            cline = antn_dicts[ltmp][':'.join(cluster)]
            # before_strs = [lgstr(pids) for pids in cline['paired-uids']]
            cline['paired-uids'] = [[
                p for p in pid_groups[pid_ids[u]] if p != u
            ] for u in cline['unique_ids']]

            # see what others in its family are paired with
            pfamilies = {
            }  # TODO rewrite comment: map, for each locus, of the families that are paired with each uid in <cluster> (family name str : family annotation)
            for uid, pids in zip(cline['unique_ids'], cline['paired-uids']):
                for pid in pids:
                    fline = all_antns[pid]
                    fkey = ':'.join(fline['unique_ids'])
                    floc = gval(pid, 'loci')
                    if fkey not in pfamilies:
                        pfamilies[fkey] = {'locus': floc, 'count': 0}
                    pfamilies[fkey]['count'] += 1
            print '           N  size  cdr3'
            for fkey, fdict in sorted(pfamilies.items(),
                                      key=lambda x: x[1]['count'],
                                      reverse=True):
                print '       %s %3d  %3d   %3d' % (
                    utils.locstr(fdict['locus']), fdict['count'],
                    len(antn_dicts[fdict['locus']][fkey]['unique_ids']),
                    antn_dicts[fdict['locus']][fkey]['cdr3_length'])

            def pfkey(p):
                return ':'.join(all_antns[p]['unique_ids'])

            pfcounts = [[pfamilies[pfkey(p)]['count'] for p in pids]
                        for pids in cline['paired-uids']]

            def lcstr(pids, pfcs):
                if len(pids) == 0:
                    return ''
                spids, spfcs = zip(*sorted(
                    zip(pids, pfcs), key=operator.itemgetter(1), reverse=True))
                return '%s  %s' % (lgstr(spids, sort=False), ' '.join(
                    str(c) for c in spfcs))

            uid_extra_strs = [
                lcstr(pids, pfs)
                for pids, pfs in zip(cline['paired-uids'], pfcounts)
            ]
            utils.print_reco_event(cline,
                                   uid_extra_strs=uid_extra_strs,
                                   extra_str='      ')

            if iclust >= n_max_clusters:
                break