Пример #1
0
def read_ramesh_file(fname, outdir, debug=False):
    """
    Read the fasta file <fname> and return its non-constant germline sequences.

    The return value is a nested dict, glseqs[locus][region][gene] = sequence, with one
    entry for each locus in utils.loci whose name contains 'ig', and one sub-dict for each
    region listed for that locus.

    If the file is called 'coding.fa', the gene name is taken from the 'gene=<name>' entry among
    each sequence's bracketed info strings (sequences without one are reported and skipped);
    for any other file name, the sequence's own name is used as the gene name.
    Constant-region genes, and sequences already stored for the same locus and region, are skipped.

    NOTE <outdir> is not used by this function.
    """
    seqfos = utils.read_fastx(fname)
    is_coding_file = os.path.basename(fname) == 'coding.fa'

    # one empty {gene : seq} dict for each region of each ig locus
    glseqs = {}
    for locus in utils.loci:
        if 'ig' not in locus:
            continue
        glseqs[locus] = {}
        for reg in utils.loci[locus]:
            glseqs[locus][reg] = {}

    for sfo in seqfos:
        if is_coding_file:
            # info strings look like '[key=value]'; keep only the ones that split into exactly two pieces
            mdict = {}
            for istr in sfo['infostrs']:
                pieces = istr.strip('[]').split('=')
                if len(pieces) == 2:
                    mdict[pieces[0]] = pieces[1]
            if 'gene' not in mdict:
                print('no gene for %s' % sfo['infostrs'])
                continue
            gene = mdict['gene']
        else:
            gene = sfo['name']

        if debug:
            print(gene)

        if utils.is_constant_gene(gene):
            if debug:
                print('  constant')
            continue

        region = utils.get_region(gene)
        utils.split_gene(gene)  # return value is discarded; presumably called so that a malformed gene name raises here -- TODO confirm
        # NOTE any 'partial' entry in the info strings is currently ignored (i.e. not appended to the gene name)
        seqs_so_far = glseqs[utils.get_locus(gene)][region]
        if sfo['seq'] in seqs_so_far.values():  # same sequence is already stored under some gene name
            if debug:
                print('  duplicate')
            continue
        seqs_so_far[gene] = sfo['seq']

    return glseqs
Пример #2
0
def read_mute_freqs_with_weights(indir, approved_genes):
    """
    Combine the per-gene mutation frequency files for <approved_genes> into one dict.

    For each gene, reads <indir>/mute-freqs/<sanitized gene name>.csv (genes without a file are
    skipped), using the columns 'position', 'mute_freq', 'lo_err' and 'hi_err'.

    Returns a dict with:
      - one entry per position: the inverse error-weighted average mute freq over all genes in which that position was observed
      - 'overall_mean': inverse error-weighted mean over all observations (0. if there are none)
      - 'unweighted_overall_mean': plain mean over all observations (0. if there are none)
    If the first approved gene is the dummy d gene for its locus, only the two overall means are
    returned, both set to 0.5.

    Raises Exception if <approved_genes> is empty.

    NOTE it would be nice to eventually align the genes before combining, since positions from different genes are matched up as they are.
    """
    if not len(approved_genes):
        raise Exception('no approved genes')

    first_gene = approved_genes[0]
    if first_gene == glutils.dummy_d_genes[utils.get_locus(first_gene)]:
        return {'overall_mean' : 0.5, 'unweighted_overall_mean' : 0.5}

    # collect one observation per (position, gene) pair
    observed_freqs = {}
    for gene in approved_genes:
        mutefname = ''.join([indir, '/mute-freqs/', utils.sanitize_name(gene), '.csv'])
        if not os.path.exists(mutefname):
            continue
        with open(mutefname, 'r') as mutefile:
            for row in csv.DictReader(mutefile):
                pos = int(row['position'])
                freq = float(row['mute_freq'])
                lo_bound = float(row['lo_err'])  # NOTE despite the column names, these two are the lower and upper *bounds*, not errors
                hi_bound = float(row['hi_err'])
                assert all(val >= 0.0 for val in (freq, lo_bound, hi_bound))  # you just can't be too careful

                # if <freq> is too close to 0 or 1, replace it with the midpoint of its uncertainty band
                too_close_to_edge = freq < utils.eps or abs(1.0 - freq) < utils.eps
                if too_close_to_edge:
                    freq = 0.5 * (lo_bound + hi_bound)

                err = max(abs(freq - lo_bound), abs(freq - hi_bound))  # the larger of the two distances to the bounds
                observed_freqs.setdefault(pos, []).append({'freq' : freq, 'err' : err})

    # per-position value: average over genes, each weighted by the inverse of its error
    mute_freqs = {}
    for pos, observations in observed_freqs.items():
        weighted_total = 0.0
        weight_sum = 0.0
        for obs in observations:
            assert obs['err'] > 0.0
            inv_err = 1.0 / obs['err']
            weighted_total += inv_err * obs['freq']
            weight_sum += inv_err
        assert weight_sum > 0.0
        mute_freqs[pos] = weighted_total / weight_sum

    # every observation (all positions, all genes), in the same order as the per-position lists above
    all_obs = [obs for observations in observed_freqs.values() for obs in observations]

    # NOTE the inverse-error weighting is clearly sensible for comparing genes at one position, but it's less clear that it is for the overall mean (kept as is, since other code may depend on it)
    weighted_denom = sum(1. / obs['err'] for obs in all_obs)
    mute_freqs['overall_mean'] = sum(obs['freq'] / obs['err'] for obs in all_obs) / weighted_denom if weighted_denom > 0. else 0.

    # unweighted version, which is simpler for other people to use (e.g. when written to yaml files)
    unweighted_denom = len(all_obs)
    mute_freqs['unweighted_overall_mean'] = sum(obs['freq'] for obs in all_obs) / unweighted_denom if unweighted_denom > 0. else 0.

    return mute_freqs
Пример #3
0
 def getvalstr(gene, val):
     """
     Return the formatted table cell for <gene> and its value <val> (printed as 100 * <val>, to two decimals).

     If <gene> is None, or is a d gene from a locus without d genes, a placeholder cell of dashes is returned instead.
     NOTE <cstr>, <estr>, <latex> and <emph_genes> are taken from the enclosing scope.
     """
     skip = gene is None
     if not skip:  # only look at the region/locus if there's actually a gene
         skip = utils.get_region(gene) == 'd' and not utils.has_d_gene(utils.get_locus(gene))
     if skip:
         padding = 4 * ' ' if latex else ''
         return '%s  %5.2s  %s  %-16s%s' % (cstr, ' - ', cstr, ' - ', padding)

     if not latex:
         gstr = utils.color_gene(gene, width=18)
     else:
         gstr = utils.shorten_gene_name(gene, use_one_based_indexing=True, n_max_mutstrs=5)
         if emph_genes is not None and gene in emph_genes:  # emphasized genes are printed in bold red
             gstr = '\\color{red}{\\textbf{%s}}' % gstr
     return '%s  %s%5.2f%s %s %-20s' % (cstr, estr, 100 * val, estr, cstr, gstr)
Пример #4
0
def print_seq_in_reco_event(original_line,
                            iseq,
                            extra_str='',
                            label='',
                            one_line=False,
                            seed_uid=None,
                            check_line_integrity=False):
    """
    Print ascii summary of recombination event and mutation.
    If <one_line>, then skip the germline lines, and only print the final_seq line.

    Four strings are built (insertions, d germline, v/j germline, and the query sequence), padded so that they
    line up, colored, given a descriptive suffix, and then written to stdout. The d germline line is left out
    if the locus of the v gene has no d gene.

    Arguments:
      original_line: annotation dict for the event. Keys read directly in this function: '<erosion>_del' for each
        of utils.all_erosions, 'fv_insertion', 'vd_insertion', 'dj_insertion', 'jf_insertion', '<region>_gl_seq'
        for each of utils.regions, 'lengths', 'seqs', 'indelfos', 'mut_freqs', 'v_gene', 'd_gene' and 'j_gene'
        (the helper functions that it's passed to may read others).
      iseq: index of the sequence to print (indexes 'seqs', 'indelfos' and 'mut_freqs').
      extra_str: string prepended to each printed line.
      label: if not empty, written over the start of the first line, beginning max(0, len(extra_str) - 2) characters in.
      one_line: if set, print only the query sequence line.
      seed_uid: passed through to get_uid_str().
      check_line_integrity: if set, work on a deep copy of <original_line>, and at the end compare the copy
        to the original, raising Exception if the keys or any value differ.
    """
    line = original_line
    if check_line_integrity:  # it's very important not to modify <line> -- this lets you verify that you aren't
        line = copy.deepcopy(
            original_line)  # everything below then works on this copy, which is compared to <original_line> at the end of the function

    # one string of dots for each erosion, with one dot per deleted base
    delstrs = {
        d: '.' * line[d + '_del']
        for d in utils.all_erosions
    }  # NOTE len(delstrs[<del>]) is not in general the same as line[<del>_del], since some of them get replaced with the shortened form '.<n>.' below
    if len(
            delstrs['v_5p']
    ) > 50:  # don't print a million dots if left-side v deletion is really big
        delstrs['v_5p'] = '.%d.' % len(delstrs['v_5p'])

    # if there isn't enough space for dots in the vj line, we add some dashes to everybody so things fit (rare in heavy chain rearrangements, but pretty common in light chain)
    d_plus_inserts_length = len(line['vd_insertion'] + line['d_gl_seq'] +
                                line['dj_insertion'])
    if line['v_3p_del'] + line[
            'j_5p_del'] > d_plus_inserts_length:  # if dots for v and j interior deletions will be longer than <d_plus_inserts_length>
        delstrs['v_3p'] = '.%d.' % line['v_3p_del']
        delstrs['j_5p'] = '.%d.' % line['j_5p_del']
        gapstr = '-' * (len(delstrs['v_3p'] + delstrs['j_5p']) -
                        d_plus_inserts_length)
        gap_insert_point = len(
            line['fv_insertion'] + delstrs['v_5p'] + line['v_gl_seq']
        )  # it doesn't really matter exactly where we put the blue dashes, as long as it's the same place in all four lines, but this is a good spot
        extra_space_because_of_fixed_nospace = max(
            0, d_plus_inserts_length - len(delstrs['v_3p'] + delstrs['j_5p'])
        )  # if shortening the <delstrs> already over-compensated for the lack of space (i.e., if the number of dashes necessary is zero), then we need to add some dots to the vj line below
    else:
        gapstr = ''
        gap_insert_point = None
        extra_space_because_of_fixed_nospace = 0

    # germline sequence for each region, with the dots for its 5' and 3' deletions on either side
    eroded_seqs_dots = {
        r: delstrs[r + '_5p'] + line[r + '_gl_seq'] + delstrs[r + '_3p']
        for r in utils.regions
    }

    # build the three germline lines
    # first the insertion line: the vd and dj insertions, with spaces everywhere else
    insert_line = ' ' * (len(line['fv_insertion']) + line['lengths']['v'] + len(delstrs['v_5p'])) \
                  + line['vd_insertion'] + ' ' * line['lengths']['d'] + line['dj_insertion'] \
                  + ' ' * (line['lengths']['j'] + line['j_3p_del'] + len(line['jf_insertion']))
    # then the d line: the d germline sequence plus the dots for its deletions, with spaces on either side
    germline_d_start = len(line['fv_insertion']) + line['lengths']['v'] + len(
        line['vd_insertion']) - line['d_5p_del']
    germline_d_end = germline_d_start + line['d_5p_del'] + line['lengths'][
        'd'] + line['d_3p_del']
    d_line = ' ' * (germline_d_start + len(delstrs['v_5p'])) \
             + eroded_seqs_dots['d'] \
             + ' ' * (len(line['j_gl_seq']) + len(line['dj_insertion']) - line['d_3p_del'] + line['j_3p_del'] + len(line['jf_insertion']))
    # then the vj line: v and j germline sequences (with dots for their deletions), separated by spaces
    # NOTE(review): the comment on <germline_v_end> below mentions subtracting v_5p_del, but the expression doesn't reference it -- the comment may be stale, confirm
    germline_v_end = len(line['fv_insertion']) + len(line['v_gl_seq']) + line[
        'v_3p_del'] - 1  # position in the query sequence at which we find the last base of the v match. NOTE we subtract off the v_5p_del because we're *not* adding dots for that deletion (it's just too long)
    germline_j_start = germline_d_end + 1 - line['d_3p_del'] + len(
        line['dj_insertion']) - line['j_5p_del']
    vj_line = ' ' * len(line['fv_insertion']) + eroded_seqs_dots['v'] + '.' * extra_space_because_of_fixed_nospace \
              + ' ' * (germline_j_start - germline_v_end - 2) + eroded_seqs_dots['j'] + ' ' * len(line['jf_insertion'])
    # and the query line
    qrseq_line = ' ' * len(
        delstrs['v_5p']) + line['seqs'][iseq] + ' ' * line['j_3p_del']

    outstrs = [insert_line, d_line, vj_line, qrseq_line]
    check_outsr_lengths(
        line, outstrs, fix=True
    )  # I think the only way they can be different is if the d right side erosion is so long that it hangs over the right side of the j

    # insert the dashes (if any) at the same position in each line other than the vj line
    if gap_insert_point is not None:
        for istr in [
                0, 1, 3
        ]:  # everybody except the vj line, which already has the modified interior delstrs above
            outstrs[
                istr] = outstrs[istr][:gap_insert_point] + gapstr + outstrs[
                    istr][gap_insert_point:]

    check_outsr_lengths(line, outstrs, fix=True)

    # <colors> has one (initially empty) list for each character in each of the output strings
    colors = [[[] for _ in range(len(ostr))] for ostr in outstrs]
    if indelutils.has_indels(line['indelfos'][iseq]):
        # outstrs, colors = old_indel_shenanigans(line, iseq, outstrs, colors)
        outstrs, colors = indel_shenanigans(line, iseq, outstrs, colors)
    outstrs = add_colors(outstrs, colors, line)

    # text that goes on the right side of each line, in the same order as <outstrs> (each one ends with a newline)
    suffixes = [
        'insert%s\n' %
        ('s' if utils.has_d_gene(utils.get_locus(line['v_gene'])) else ''),
        '%s\n' % (utils.color_gene(line['d_gene'])),
        '%s %s\n' %
        (utils.color_gene(line['v_gene']), utils.color_gene(line['j_gene'])),
        '%s   %4.2f mut  %s\n' %
        (get_uid_str(line, iseq, seed_uid), line['mut_freqs'][iseq],
         utils.color('red', utils.is_functional_dbg_str(line, iseq)))
    ]
    outstrs = [
        '%s%s   %s' % (extra_str, ostr, suf)
        for ostr, suf in zip(outstrs, suffixes)
    ]

    if label != '':  # this doesn't really work if the edge of the removed string is the middle of a color code... but oh well, it doesn't really happen any more since I shortened the kbound label from waterer.py
        offset = max(
            0,
            len(extra_str) -
            2)  # skootch <label> this many positions leftward into <extra_str>
        removed_str = outstrs[0][offset:offset +
                                 utils.len_excluding_colors(label)]
        outstrs[0] = outstrs[0][:offset] + label + outstrs[0][
            utils.len_excluding_colors(label) +
            offset:]  # NOTE this *replaces* the bases in <extra_str> with <label>, which is only fine if they're spaces
        if removed_str.strip() != '':  # warn if the label covered up anything other than whitespace
            print '%s%s (covered by label \'%s\')' % (
                ' ' * offset, utils.color('red', removed_str), label)

    if one_line:
        outstrs = outstrs[-1:]  # remove all except the query seq line
    elif not utils.has_d_gene(utils.get_locus(line['v_gene'])):
        outstrs.pop(1)  # remove the d germline line

    # the trailing comma stops the (python 2) print statement from adding its own newline
    print ''.join(outstrs),

    # make sure that nothing above modified the copy that we worked on
    if check_line_integrity:
        if set(line.keys()) != set(original_line.keys()):
            raise Exception('ack 1')
        for k in line:
            if line[k] != original_line[k]:
                print 'key %s differs:\n  %s\n  %s ' % (k, line[k],
                                                        original_line[k])
                raise Exception('')