Example #1
    def run(self, args):
        open(self.logfname, 'w').close()

        for name, info in self.tests.items():
            if args.quick and name not in self.quick_tests:
                continue

            self.prepare_to_run(args, name, info)

            action = info['action']
            cmd_str = info['bin'] + ' ' + action
            cmd_str += ' ' + ' '.join(info['extras'] + self.common_extras)
            if name == 'simulate':
                cmd_str += ' --outfname ' + self.simfnames['new']
            elif 'cache-parameters-' not in name:
                cmd_str += ' --outfname ' + self.dirs['new'] + '/' + name + '.csv'

            logstr = '%s   %s' % (utils.color('green', name, width=30, padside='right'), cmd_str)
            print logstr
            if args.dry_run:
                continue
            logfile = open(self.logfname, 'a')
            logfile.write(logstr + '\n')
            logfile.close()
            start = time.time()
            try:
                check_call(cmd_str + ' 1>>' + self.logfname + ' 2>>' + self.logfname, shell=True)
            except CalledProcessError as err:
                # print err  # this just says it exited with code != 0
                print '  log tail:'
                print utils.pad_lines(check_output(['tail', self.logfname]))
                sys.exit(1)  # raise Exception('exited with error')
            self.run_times[name] = time.time() - start  # seconds
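
A note on the subprocess pattern above: both stdout and stderr get appended to a single log file via shell redirection, and CalledProcessError only reports the nonzero exit code, which is why the handler prints the log tail instead. A minimal, self-contained sketch (Unix shell assumed; the command and log path are hypothetical):

import time
from subprocess import check_call, check_output, CalledProcessError

logfname = '/tmp/run-test.log'  # hypothetical log path
cmd_str = 'echo hello'          # hypothetical command

start = time.time()
try:
    check_call(cmd_str + ' 1>>' + logfname + ' 2>>' + logfname, shell=True)  # append both streams to the log
except CalledProcessError:
    print('  log tail:')  # the exception itself only says the exit status was nonzero
    print(check_output(['tail', logfname]))
    raise
print('took %.1f sec' % (time.time() - start))
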
Example #2
    def run(self, args):
        if not args.dry_run:
            open(self.logfname, 'w').close()

        for name, info in self.tests.items():
            if args.quick and name not in self.quick_tests:
                continue

            self.prepare_to_run(args, name, info)

            action = info['action']
            cmd_str = info['bin'] + ' ' + action
            cmd_str += ' ' + ' '.join(info['extras'] + self.common_extras)
            if name == 'simulate':
                cmd_str += ' --outfname ' + self.infnames['new']['simu']
            elif 'cache-parameters-' not in name:
                cmd_str += ' --outfname ' + self.dirs['new'] + '/' + name + '.csv'

            logstr = '%s   %s%s' % (utils.color('green', name, width=30, padside='right'), cmd_str[:args.print_width], '[...]' if args.print_width < len(cmd_str) else '')
            print logstr
            if args.dry_run:
                continue
            logfile = open(self.logfname, 'a')
            logfile.write(logstr + '\n')
            logfile.close()
            start = time.time()
            try:
                check_call(cmd_str + ' 1>>' + self.logfname + ' 2>>' + self.logfname, shell=True)
            except CalledProcessError as err:
                # print err  # this just says it exited with code != 0
                print '  log tail:'
                print utils.pad_lines(check_output(['tail', self.logfname]))
                sys.exit(1)  # raise Exception('exited with error')
            self.run_times[name] = time.time() - start  # seconds
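
This version differs from Example #1 mainly in truncating the logged command line to --print-width characters. A standalone sketch of just that truncation (the helper name and example command are hypothetical):

def truncate_cmd(cmd_str, print_width):
    # cut the logged command at <print_width> characters and mark that
    # something was dropped, as in the logstr above
    return cmd_str[:print_width] + ('[...]' if print_width < len(cmd_str) else '')

print(truncate_cmd('partis cache-parameters --infname test.fa', 20))  # 'partis cache-paramet[...]'
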
Example #3
def add_indels(n_indels, qrseq, glseq, mean_length, codon_positions, indel_location=None, indel_positions=None, keep_in_frame=False, dbg_pad=0, debug=False):
    def getpos():  # if <pos> is specified we use that, otherwise we use <indel_location> to decide the region of the sequence from which to choose a position
        if indel_location is None:  # uniform over entire sequence
            return random.randint(5, len(qrseq) - 6)  # excludes the first and last five positions of the sequence (the exact margin isn't important)
        elif indel_location == 'v':  # within the meat of the v
            return random.randint(5, codon_positions['v'])  # NOTE this isn't actually right, since the codon positions get modified as we add each indel... but it won't usually make a difference
        elif indel_location == 'cdr3':  # inside cdr3
            return random.randint(codon_positions['v'], codon_positions['j'])
        else:
            assert False
    def getlen():
        length = numpy.random.geometric(1. / mean_length)
        if keep_in_frame:
            itry = 0
            while length % 3 != 0:
                length = numpy.random.geometric(1. / mean_length)
                itry += 1
                if itry > 9999:
                    raise Exception('tried too many times to get in-frame indel length')
        return length
    def overlaps(pos, length):  # see if there are any existing indels close to where we're thinking of putting this one. NOTE in practice this _really_ shouldn't happen much -- there should be only a couple of indels per sequence at most -- this just keeps other things (e.g. indelfo consistency checks) from getting confused and crashing
        for gapseq in (indelfo['qr_gap_seq'], indelfo['gl_gap_seq']):
            if len(gapseq) < pos + length + 1:
                return True
            if utils.gap_len(gapseq[pos - length : pos + length]) > 0:  # this leaves a pretty, albeit inexact, large buffer
                return True
        return False

    # choose positions and lengths
    if indel_positions is None:
        indel_positions = [None for _ in range(n_indels)]
    if debug:
        print '%sadding %d indel%s' % (dbg_pad * ' ', n_indels, utils.plural(n_indels))

    # then build the indelfo
    indelfo = get_empty_indel()
    indelfo['genes'] = {}  # it's kind of awkward to have the match info here, but I need some way to pass it between the aligner that's calling the indel (typically vsearch) and the aligner that's using it (typically sw)
    indelfo['qr_gap_seq'], indelfo['gl_gap_seq'] = qrseq, glseq
    indelfo['reversed_seq'] = qrseq
    for pos in indel_positions:
        length = getlen()
        while pos is None or overlaps(pos, length):
            pos = getpos()
        add_single_indel(indelfo, pos, length, codon_positions, keep_in_frame=keep_in_frame, debug=debug)

    # make the "input seq", i.e. without gaps, and account for this in the codon positions
    input_seq = filter(utils.alphabet.__contains__, indelfo['qr_gap_seq'])
    for region in codon_positions:
        codon_positions[region] -= utils.count_gap_chars(indelfo['qr_gap_seq'], aligned_pos=codon_positions[region])

    if debug:
        print utils.pad_lines(get_dbg_str(indelfo), dbg_pad + 4)

    return input_seq, indelfo
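
getlen() above draws indel lengths from a geometric distribution with the given mean and, with keep_in_frame set, rejection-samples until the length is a multiple of three. A self-contained sketch of that draw, assuming only numpy:

import numpy

def get_in_frame_length(mean_length, max_tries=10000):
    # rejection-sample a geometric length until it's divisible by three,
    # mirroring getlen() with keep_in_frame=True
    for _ in range(max_tries):
        length = numpy.random.geometric(1. / mean_length)
        if length % 3 == 0:
            return length
    raise Exception('tried too many times to get in-frame indel length')

print(get_in_frame_length(6))
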
Example #4
    def read_input_tree_file(self, outfname):
        if self.args.debug:
            print '  reading trees from %s' % self.args.input_simulation_treefname
        utils.simplerun('cp %s %s' %
                        (self.args.input_simulation_treefname, outfname),
                        debug=False)
        ages, treestrs = [], []
        with open(outfname) as treefile:
            for line in treefile:
                tstr = line.strip()
                if tstr == '':  # skip empty lines
                    continue
                dtree = treeutils.get_dendro_tree(
                    treestr=tstr, suppress_internal_node_taxa=True)
                if dtree.seed_node.edge_length is None:  # make sure root edge length is set (otherwise bppseqgen barfs)
                    dtree.seed_node.edge_length = 0.
                old_new_label_pairs = [
                    (l.taxon.label, 't%d' % (i + 1))
                    for i, l in enumerate(dtree.leaf_node_iter())
                ]
                treeutils.translate_labels(
                    dtree, old_new_label_pairs
                )  # rename the leaves to t1, t2, etc. (it would be nice to not have to do this, but a bunch of stuff in recombinator uses this  to check that e.g. bppseqgen didn't screw up the ordering)
                age = self.choose_full_sequence_branch_length()
                if self.args.debug > 1:  # it's easier to keep this debug line separate up here than make a tmp variable to keep track of the old height
                    print '    input tree %d (rescaled depth %.3f --> %.3f):' % (
                        len(ages), treeutils.get_mean_leaf_height(tree=dtree),
                        age)
                treeutils.rescale_tree(
                    age, dtree=dtree
                )  # I think this gets rescaled again for each event, so we could probably in principle avoid this rescaling, but if the input depth is greater than one stuff starts breaking, so may as well do it now
                ages.append(age)
                treestrs.append(dtree.as_string(schema='newick').strip())
                if self.args.debug > 1:
                    print utils.pad_lines(treeutils.get_ascii_tree(dtree))
        if any(a > 1. for a in ages):
            raise Exception(
                'tree depths must be less than 1., but trees read from %s don\'t satisfy this: %s'
                % (self.args.input_simulation_treefname, ages))
        if len(ages) != self.args.n_trees:
            print '    resetting --n-trees from %d to %d to match trees read from %s' % (
                self.args.n_trees, len(ages),
                self.args.input_simulation_treefname)
        self.args.n_trees = len(ages)

        return ages, treestrs
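
The per-line newick handling above reduces to a small reader: skip blank lines, make sure the root edge length is set, and re-serialize. A sketch assuming dendropy 4 (treeutils.get_dendro_tree presumably wraps similar parsing with extra taxon options):

import dendropy  # assumes dendropy 4

def read_newick_lines(treefname):
    # one tree per line; a missing root edge length is set to zero, since
    # downstream tools (bppseqgen above) reportedly choke on it
    treestrs = []
    with open(treefname) as treefile:
        for line in treefile:
            tstr = line.strip()
            if tstr == '':
                continue
            dtree = dendropy.Tree.get(data=tstr, schema='newick')
            if dtree.seed_node.edge_length is None:
                dtree.seed_node.edge_length = 0.
            treestrs.append(dtree.as_string(schema='newick').strip())
    return treestrs
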
Example #5
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    seqfos = utils.read_fastx(bcr_phylo_fasta_fname(
        outdir))  # output mutated sequences from bcr-phylo

    assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)
    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        mline = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(mline)
        del mline['tree']
        mline['unique_ids'] = [sfo['name']]
        mline['seqs'] = [
            sfo['seq']
        ]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['input_seqs'] = [
            sfo['seq']
        ]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['duplicates'] = [[]]
        reco_info[sfo['name']] = mline
        utils.add_implicit_info(glfo, mline)
    final_line = utils.synthesize_multi_seq_line_from_reco_info(
        [sfo['name'] for sfo in seqfos], reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        cmd = './bin/read-bcr-phylo-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s/kd-vals.csv --newick-tree-file %s/simu.nwk' % (
            outdir, args.extrastr, outdir, outdir)
        utils.run_ete_script(cmd, ete_path)
        nodefo = {}
        with open('%s/kd-vals.csv' % outdir) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                nodefo[line['uid']] = {
                    'kd': float(line['kd']),
                    'relative_kd': float(line['relative_kd']),
                    'lambda': line.get('lambda', None),
                    'target_index': int(line['target_index']),
                }
        if len(set(nodefo) - set(final_line['unique_ids'])) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print '        in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (
                set(nodefo) - set(final_line['unique_ids']))
        if len(set(final_line['unique_ids']) - set(nodefo)) > 0:
            print '        in final_line, but missing from kdvals: %s' % ' '.join(
                set(final_line['unique_ids']) - set(nodefo))
        final_line['affinities'] = [
            1. / nodefo[u]['kd'] for u in final_line['unique_ids']
        ]
        final_line['relative_affinities'] = [
            1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']
        ]
        final_line['lambdas'] = [
            nodefo[u]['lambda'] for u in final_line['unique_ids']
        ]
        final_line['nearest_target_indices'] = [
            nodefo[u]['target_index'] for u in final_line['unique_ids']
        ]
        tree = treeutils.get_dendro_tree(treefname='%s/simu.nwk' % outdir)
        tree.scale_edges(1. / numpy.mean([len(s) for s in final_line['seqs']]))
        if args.debug:
            print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree),
                                  padwidth=12)
        final_line['tree'] = tree.as_string(schema='newick')
    tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' %
                                     (outdir, args.extrastr))
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]

    return final_line
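
The kd-file step boils down to loading a csv keyed by uid and inverting kd to get affinity (smaller kd, higher affinity). A sketch with a hypothetical file path, reading only the 'kd' column:

import csv

def read_kd_file(kdfname):
    # hypothetical csv with at least 'uid' and 'kd' columns, as written by
    # the read-bcr-phylo-trees.py step above
    nodefo = {}
    with open(kdfname) as kdfile:
        for row in csv.DictReader(kdfile):
            nodefo[row['uid']] = {'kd': float(row['kd'])}
    return nodefo

nodefo = read_kd_file('kd-vals.csv')  # hypothetical path
affinities = [1. / nodefo[u]['kd'] for u in sorted(nodefo)]
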
Example #6
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    seqfos = utils.read_fastx(
        '%s/%s.fasta' %
        (outdir, args.extrastr))  # output mutated sequences from bcr-phylo

    assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)
    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        mline = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(mline)
        del mline['tree']
        mline['unique_ids'] = [sfo['name']]
        mline['seqs'] = [
            sfo['seq']
        ]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['input_seqs'] = [
            sfo['seq']
        ]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        reco_info[sfo['name']] = mline
        utils.add_implicit_info(glfo, mline)
    final_line = utils.synthesize_multi_seq_line_from_reco_info(
        [sfo['name'] for sfo in seqfos], reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        cmd = 'export PATH=%s:$PATH && xvfb-run -a python ./bin/view-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s/kd-vals.csv --newick-tree-file %s/simu.nwk' % (
            ete_path, outdir, args.extrastr, outdir, outdir)
        utils.simplerun(cmd, shell=True)
        kdvals = {}
        with open('%s/kd-vals.csv' % outdir) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                kdvals[line['uid']] = float(line['kd'])
        if len(set(kdvals) - set(final_line['unique_ids'])) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print '        in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (
                set(kdvals) - set(final_line['unique_ids']))
        if len(set(final_line['unique_ids']) - set(kdvals)) > 0:
            print '        in final_line, but missing from kdvals: %s' % ' '.join(
                set(final_line['unique_ids']) - set(kdvals))
        final_line['affinities'] = [
            1. / kdvals[u] for u in final_line['unique_ids']
        ]
        tree = treeutils.get_dendro_tree(treefname='%s/simu.nwk' % outdir)
        if args.debug:
            print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree),
                                  padwidth=12)
        final_line['tree'] = tree.as_string(schema='newick')
    tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' %
                                     (outdir, args.extrastr))
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]
    from Bio.Seq import Seq
    final_line['nearest_target_indices'] = []
    aa_targets = [Seq(seq).translate() for seq in final_line['target_seqs']]
    for mseq in final_line['input_seqs']:
        aa_mseq = Seq(mseq).translate()
        aa_hdists = [
            utils.hamming_distance(aa_t, aa_mseq, amino_acid=True)
            for aa_t in aa_targets
        ]
        imin = aa_hdists.index(min(aa_hdists))  # NOTE doesn't do anything differently if there's more than one min
        final_line['nearest_target_indices'].append(imin)

    return final_line
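
The nearest-target computation at the end translates nucleotide sequences to amino acids and picks the target with the minimum hamming distance, ties going to the first minimum. A standalone sketch assuming biopython, with a plain hamming helper standing in for utils.hamming_distance:

from Bio.Seq import Seq  # assumes biopython is installed

def hamming(a, b):
    # plain hamming distance on equal-length strings
    assert len(a) == len(b)
    return sum(c1 != c2 for c1, c2 in zip(a, b))

def nearest_target_index(mseq, target_seqs):
    # translate to amino acids, then take the closest target; list.index()
    # returns the first minimum, as noted in the code above
    aa_mseq = str(Seq(mseq).translate())
    aa_hdists = [hamming(str(Seq(t).translate()), aa_mseq) for t in target_seqs]
    return aa_hdists.index(min(aa_hdists))

print(nearest_target_index('ATGGCT', ['ATGGCA', 'TTTGCT']))  # 0
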
Example #7
def check_single_sequence_indels(line, iseq, print_on_err=True, debug=False):
    # debug = 2
    def check_single_ifo(old_ifo, new_ifo):
        if debug:
            print '  len %d  pos %d  seqstr %s' % (
                old_ifo['len'], old_ifo['pos'], old_ifo['seqstr']),
        if new_ifo != old_ifo:
            if debug:
                print '  %s' % utils.color('red', 'nope')
            new_seqstr, old_seqstr = utils.color_mutants(
                old_ifo['seqstr'],
                new_ifo['seqstr'],
                return_ref=True,
                align=True)  #len(old_ifo['seqstr']) != len(new_ifo['seqstr']))
            if print_on_err:
                print '  pos %d --> %s    len %d --> %s    seqstr %s --> %s' % (
                    old_ifo['pos'],
                    utils.color(
                        None if new_ifo['pos'] == old_ifo['pos'] else 'red',
                        '%d' % new_ifo['pos']), old_ifo['len'],
                    utils.color(
                        None if new_ifo['len'] == old_ifo['len'] else 'red',
                        '%d' % new_ifo['len']), old_seqstr, new_seqstr)
            return False
        else:
            if debug:
                print '  %s' % utils.color('green', 'ok')
            return True

    indelfo = line['indelfos'][iseq]
    if not has_indels(indelfo):
        return

    consistent = True

    new_indelfo = reconstruct_indelfo_from_gap_seqs_and_naive_seq(
        line['indelfos'][iseq]['qr_gap_seq'],
        line['indelfos'][iseq]['gl_gap_seq'],
        indelfo['genes'],
        line,
        iseq,
        debug=debug)

    if set(new_indelfo['genes']) != set(indelfo['genes']):
        if print_on_err:
            print '%s different indel regions before %s and after %s reconstruction' % (
                utils.color('red', 'error'), ' '.join(
                    (indelfo['genes'].keys())), ' '.join(
                        new_indelfo['genes'].keys()))
        consistent = False
    else:
        for region in indelfo['genes']:
            if new_indelfo['genes'][region] != indelfo['genes'][region]:
                if print_on_err:
                    print '%s different indel genes before %s and after %s reconstruction' % (
                        utils.color('red', 'error'),
                        utils.color_gene(indelfo['genes'][region]),
                        utils.color_gene(new_indelfo['genes'][region]))
                consistent = False

    if len(new_indelfo['indels']) != len(indelfo['indels']):
        if print_on_err:
            print '%s different number of indels before %d and after %d reconstruction' % (
                utils.color('red', 'error'), len(
                    indelfo['indels']), len(new_indelfo['indels']))
        consistent = False

    old_indel_list, new_indel_list = copy.deepcopy(
        indelfo['indels']), copy.deepcopy(new_indelfo['indels'])
    old_positions, new_positions = [ifo['pos'] for ifo in old_indel_list
                                    ], [ifo['pos'] for ifo in new_indel_list]
    if old_positions == new_positions:
        if debug:
            print '  same positions in old and new indelfos: %s' % ' '.join(
                [str(p) for p in old_positions])
    elif set(new_positions) == set(old_positions):
        if debug:  # I think this'll only happen on old simulation files (ok, I can't really call them "old" yet since I haven't fixed it, but at some point I will, and then everybody's positions will be sorted)
            print '  sorting both indel lists'
        old_indel_list = sorted(old_indel_list, key=lambda q: q['pos'])
        new_indel_list = sorted(new_indel_list, key=lambda q: q['pos'])
    else:
        consistent = False
        if print_on_err:
            print '  inconsistent position lists:\n  old  %s\n  new  %s' % (
                ' '.join([str(p) for p in sorted(old_positions)]), ' '.join(
                    [str(p) for p in sorted(new_positions)]))

    if consistent:  # i.e. if nothing so far has been inconsistent
        for old_ifo, new_ifo in zip(old_indel_list, new_indel_list):
            consistent &= check_single_ifo(old_ifo, new_ifo)

    if not consistent:
        if print_on_err:
            print '%s inconsistent indel info for %s (see previous lines)' % (
                utils.color('red', 'error'), ':'.join(line['unique_ids']))
            print '       original:'
            print utils.pad_lines(get_dbg_str(indelfo), 8)
            print '       reconstructed:'
            print utils.pad_lines(get_dbg_str(new_indelfo), 8)
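
The position comparison above has three cases: identical lists pass, the same positions in a different order just mean both lists get sorted before entry-by-entry comparison, and anything else is inconsistent. A minimal sketch of that decision:

def positions_consistent(old_indels, new_indels):
    # True if the two indel lists cover the same positions (in any order);
    # entry-by-entry checks like check_single_ifo() come afterward
    old_positions = [ifo['pos'] for ifo in old_indels]
    new_positions = [ifo['pos'] for ifo in new_indels]
    return old_positions == new_positions or set(old_positions) == set(new_positions)

print(positions_consistent([{'pos': 3}, {'pos': 10}], [{'pos': 10}, {'pos': 3}]))  # True
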
Example #8
def get_indelfo_from_cigar(cigarstr,
                           full_qrseq,
                           qrbounds,
                           full_glseq,
                           glbounds,
                           genes,
                           vsearch_conventions=False,
                           uid=None,
                           debug=False):
    # debug = 'D' in cigarstr or 'I' in cigarstr
    if debug:
        print '  initial%s:' % ((' for %s' % uid) if uid is not None else '')
        print '    %s' % color_cigar(cigarstr)
        print '    qr %3d %3d %s' % (qrbounds[0], qrbounds[1], full_qrseq)
        print '    gl %3d %3d %s' % (glbounds[0], glbounds[1], full_glseq)

    cigars = [
        split_cigarstr(cstr) for cstr in re.findall('[0-9]*[A-Z]', cigarstr)
    ]  # split cigar string into its parts, then split each part into the code and the length
    if vsearch_conventions:
        assert 'v' in genes  # would need to be generalized
        cigars = [
            (code.translate(string.maketrans('ID', 'DI')), length)
            for code, length in cigars
        ]  # vsearch reverses what's the query and what's the target/gene/whathaveyou compared to what ig-sw does
        for iend in [0, -1]:
            if cigars[iend][0] == 'I':  # qr extends beyond gl: ig-sw calls these soft-clips, vsearch calls them insertions
                cigars[iend] = ('S', cigars[iend][1])
            elif cigars[iend][0] == 'D':  # gl goes past qr: ig-sw just calls them not part of the alignment, vsearch calls them deletions
                cigars.pop(iend)
    cigars = [(code, length) for code, length in cigars
              if code != 'S']  # remove soft-clipping
    cigarstr = ''.join(['%d%s' % (l, c) for c, l in cigars])
    qrseq = full_qrseq[qrbounds[0]:qrbounds[1]]  # ...and trim qrseq and glseq
    glseq = full_glseq[glbounds[0]:glbounds[1]]

    if debug:
        print '  parsed:'
        print '    %s' % color_cigar(cigarstr)
        print '    %s' % '   '.join(['%s %d' % (c, l) for c, l in cigars])
        print '    qr %s' % qrseq
        print '    gl %s' % glseq

    check_cigar_len(cigars, qrseq, glseq, uid=uid)

    indelfo = get_empty_indel()  # replacement_seq: query seq with insertions removed and germline bases inserted at the position of deletions
    if 'I' not in cigarstr and 'D' not in cigarstr:  # has to happen after we've changed from vsearch conventions
        if debug:
            print '  no indels'
        return indelfo

    # each position is the cigar code corresponding to that position in the alignment
    codestr = ''.join([length * code for code, length in cigars])

    # add each indel to <indelfo['indels']>, and build <tmp_indices> to keep track of what's going on at each position
    indel_pos = 0  # position within the alignment (not, as this comment once claimed, the query sequence position)
    tmp_indices = [
    ]  # integer for each position in the alignment, giving the index of the indel that we're within (None if we're not in an indel)
    if debug:
        print '      code  length'
    for code, length in cigars:
        if debug:
            print '        %s     %3d' % (code, length)
        if code == 'I':  # advance qr seq but not gl seq
            indelfo['indels'].append(
                {
                    'type': 'insertion',
                    'pos': indel_pos,
                    'len': length,
                    'seqstr': []
                }
            )  # insertion begins at <pos> (note that 'seqstr' later on gets converted from a list to a string)
            tmp_indices += [
                len(indelfo['indels']) - 1 for _ in range(length)
            ]  # indel index corresponding to this position in the alignment
        elif code == 'D':  # advance gl seq but not qr seq
            indelfo['indels'].append(
                {
                    'type': 'deletion',
                    'pos': indel_pos,
                    'len': length,
                    'seqstr': []
                }
            )  # first deleted base is <pos> (well, first base which is in the position of the first deleted base)
            tmp_indices += [
                len(indelfo['indels']) - 1 for _ in range(length)
            ]  # indel index corresponding to this position in the alignment
        else:
            tmp_indices += [
                None for _ in range(length)
            ]  # indel index corresponding to this position in the alignment
        indel_pos += length

    if debug:
        print '      %s  codestr' % ''.join(
            [c if c not in 'ID' else utils.color('blue', c) for c in codestr])
        print '      %s  indel index' % ''.join(
            [str(ti if ti is not None else ' ') for ti in tmp_indices])

    # then construct the dbg strings, indel-reversed input sequence, and 'seqstr' entries in indelfo
    qr_gap_seq, gl_gap_seq = [], []
    iqr, igl = 0, 0
    for icode in range(len(codestr)):
        code = codestr[icode]
        if code == 'M':
            qr_gap_seq += [qrseq[iqr]]
            gl_gap_seq += [glseq[igl]]
        elif code == 'I':
            indelfo['indels'][tmp_indices[icode]]['seqstr'] += [
                qrseq[iqr]
            ]  # and to the sequence of just this indel
            qr_gap_seq += [qrseq[iqr]]
            gl_gap_seq += ['.']
            igl -= 1
        elif code == 'D':
            indelfo['indels'][tmp_indices[icode]]['seqstr'] += [
                glseq[igl]
            ]  # and to the sequence of just this indel
            qr_gap_seq += ['.']
            gl_gap_seq += [glseq[igl]]
            iqr -= 1
        else:
            raise Exception('unexpected cigar code %s' % code)
        iqr += 1
        igl += 1

    # convert character lists to strings (indels are rare enough that this probably isn't that much faster, but it just feels wrong not to)
    qr_gap_seq = ''.join(qr_gap_seq)
    gl_gap_seq = ''.join(gl_gap_seq)
    for ifo in indelfo['indels']:
        ifo['seqstr'] = ''.join(ifo['seqstr'])

    # at the start of this fcn we trimmed off the "non-matched" bits of the query and germline sequences, so now we have to account for them (it might be nicer to have it all done at once, but this is the way it is, for historical reasons) (where the definition of "non-matched" is a bit fuzzy depending on whether it's vsearch or ig-sw)
    for ifo in indelfo['indels']:
        ifo['pos'] += qrbounds[0]

    # NOTE gapped seqs do _not_ contain the v 5p and j 3p deletions or fv and jf insertions, because this makes it easier to combine indels from different regions later on
    indelfo['genes'] = genes
    indelfo['qr_gap_seq'] = qr_gap_seq
    indelfo['gl_gap_seq'] = gl_gap_seq
    indelfo['reversed_seq'] = get_reversed_seq(qr_gap_seq, gl_gap_seq,
                                               full_qrseq[:qrbounds[0]],
                                               full_qrseq[qrbounds[1]:])

    if debug:
        print utils.pad_lines(get_dbg_str(indelfo), 0)

    return indelfo
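
The cigar handling at the top splits the string into (code, length) pairs and later expands it into one code character per alignment column (the codestr). A sketch of both steps; the inline parse stands in for split_cigarstr, which presumably does something similar:

import re

def parse_cigar(cigarstr):
    # split e.g. '3M1I2M' into (code, length) pairs (a missing count means
    # length one), then expand to one code character per alignment column
    cigars = [(cstr[-1], int(cstr[:-1]) if cstr[:-1] else 1) for cstr in re.findall('[0-9]*[A-Z]', cigarstr)]
    codestr = ''.join(length * code for code, length in cigars)
    return cigars, codestr

print(parse_cigar('3M1I2M'))  # ([('M', 3), ('I', 1), ('M', 2)], 'MMMIMM')
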
Example #9
    def make_single_tree(self, partitions, annotations, uid_set, get_fasttrees=False, n_max_cons_seqs=10, debug=False):
        # NOTE don't call this externally -- if you want a single tree, call make_trees() with <i_only_cluster> set
        def getline(uidstr, uid_set=None):
            if uidstr in annotations:  # if we have this exact annotation
                return annotations[uidstr]
            else:
                if uid_set is None:
                    uid_set = set(uidstr.split(':'))  # should only get called if it's a singleton
                # note that for internal nodes in a fasttree-derived subtree, the uids will be out of order compared to the annotation keys
                for line in annotations.values():  # we may actually have the annotation for every subcluster (e.g. if --calculate-alternative-annotations was set), but in case we don't, this is fine
                    if len(uid_set & set(line['unique_ids'])) > 0:  # just take the first one with any overlap. Yeah, it's not necessarily the best, but its naive sequence probably isn't that different, and for just getting the fasttree it reeeeeeaaaallly doesn't matter
                        return line
            raise Exception('couldn\'t find uid %s in annotations' % uidstr)
        def getseq(uid):
            line = getline(uid)
            return line['seqs'][line['unique_ids'].index(uid)]
        def lget(uid_list):
            return ':'.join(uid_list)

        # check for repeated uids (was only from seed uid, which shouldn't happen any more, but the code below throws an infinite loop if we do, so may as well be careful)
        for partition in partitions:
            if sum(len(c) for c in partition) > len(set(u for c in partition for u in c)):
                repeated_uids = [u for u, count in collections.Counter([u for c in partition for u in c]).items() if count > 1]
                raise Exception('found %d uid%s in more than one cluster (%s)' % (len(repeated_uids), utils.plural(len(repeated_uids)), ', '.join(repeated_uids)))

        default_edge_length = 999999  # it's nice to have the edges all set to something that's numeric (so the trees print), but also obvious wrong, if we forget to set somebody
        assert len(partitions[-1]) == 1
        root_label = lget(partitions[-1][0])  # we want the order of the uids in the label to correspond to the order in self.partitions
        tns = dendropy.TaxonNamespace([root_label])
        root_node = dendropy.Node(taxon=tns.get_taxon(root_label))
        root_node.uids = uid_set  # each node keeps track of the uids of its children
        dtree = dendropy.Tree(taxon_namespace=tns, seed_node=root_node)
        if debug:
            print '    starting tree with %d leaves' % len(uid_set)
        for ipart in reversed(range(len(partitions) - 1)):  # dendropy seems to only have fcns to build a tree from the root downward, so we loop starting with the last partition (- 1 is because the last partition is guaranteed to be just one cluster)
            for lnode in dtree.leaf_node_iter():  # look for leaf nodes that contain uids from two clusters in this partition, and add those as children
                tclusts = [c for c in partitions[ipart] if len(set(c) & lnode.uids) > 0]
                if len(tclusts) < 2:
                    continue
                for tclust in tclusts:
                    ttaxon = dendropy.Taxon(lget(tclust))
                    tns.add_taxon(ttaxon)
                    child = lnode.new_child(taxon=ttaxon, edge_length=default_edge_length)
                    child.uids = set(tclust)
                if debug:
                    print '      ipart %d' % ipart
                    print '        split node: %d --> %s      %s --> %s' % (len(lnode.uids), ' '.join([str(len(tc)) for tc in tclusts]), lnode.taxon.label, ' '.join([c.taxon.label for c in lnode.child_node_iter()]))

        # split existing leaves, which are probably not singletons (they're probably from the initial naive sequence collapse step) into subtrees such that each leaf is a singleton
        for lnode in dtree.leaf_node_iter():
            if len(lnode.uids) == 1:
                continue
            if get_fasttrees and len(lnode.uids) > 2:
                seqfos = [{'name' : uid, 'seq' : getseq(uid)} for uid in lnode.taxon.label.split(':')]  # may as well add them in the right order, although I don't think it matters
                subtree = treeutils.get_fasttree_tree(seqfos, getline(lnode.taxon.label, uid_set=lnode.uids)['naive_seq'], suppress_internal_node_taxa=True)  # note that the fasttree distances get ignored below (no idea if they'd be better than what we set down there, but they probably wouldn't be consistent, so I'd rather ignore them)
                for tmpnode in subtree.postorder_node_iter():
                    if tmpnode.is_leaf():
                        tmpnode.uids = set([tmpnode.taxon.label])
                    else:
                        tmpnode.uids = set([uid for c in tmpnode.child_node_iter() for uid in c.uids])
                        ttaxon = dendropy.Taxon(lget(tmpnode.uids))
                        subtree.taxon_namespace.add_taxon(ttaxon)
                        tmpnode.taxon = ttaxon  # ...and use the string of leaf nodes, even though they'll be in the wrong order (I think these get ignored when I call label_nodes() below, but it's still tidier to have them right in the meantime, and anyway since I'm suppressing internal taxa I think I need to set them to something)

                if debug:
                    print '   adding subtree with %d leaves from fasttree at leaf node %s' % (len(seqfos), lnode.taxon.label)
                    print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=subtree))
                dtree.taxon_namespace.add_taxa(subtree.taxon_namespace)
                lnode.add_child(subtree.seed_node)
                assert len(lnode.child_edges()) == 1  # we're iterating over leaves, so this should always be true
                lnode.child_edges()[0].collapse()
            else:  # just add a star subtree
                for uid in lnode.taxon.label.split(':'):  # may as well add them in the right order, although I don't think it matters
                    ttaxon = dendropy.Taxon(uid)
                    tns.add_taxon(ttaxon)
                    child = lnode.new_child(taxon=ttaxon, edge_length=default_edge_length)
                    child.uids = set([uid])
                if debug:
                    print '      added %d singleton children for %s' % (len(lnode.uids), lnode.taxon.label)

        # in order to set edge lengths, we need node sequences, so first set leaf node seqs
        for lnode in dtree.leaf_node_iter():
            assert len(lnode.uids) == 1
            lnode.seq = getseq(lnode.taxon.label)
            lnode.n_descendent_leaves = 1  # keep track of how many leaf nodes contributed to each node's consensus sequence (these are leaves, so it's trivally 1). This is less accurate than keeping track of all the sequences, but also faster

        # then set internal node seqs as the consensus of their children, and set the distance as hamming distance to child seqs
        if debug:
            print '    adding edge lengths either from fasttree %s or cons seq %s' % (utils.color('blue', 'x'), utils.color('red', 'x'))
        min_edge_length = None  # setting this is nice for better debug viewing
        for node in dtree.postorder_internal_node_iter():  # includes root node
            child_cons_seq_counts = [c.n_descendent_leaves for c in node.child_node_iter()]
            total_descendent_leaves = sum(child_cons_seq_counts)
            if total_descendent_leaves > n_max_cons_seqs:  # if there's tons of descendent leaves, we don't want to pass them all to the consensus fcn since it's slow, so we choose them in proportion to their actual proportions, but scaled down to <n_max_cons_seqs>
                child_cons_seq_counts = [int(n_max_cons_seqs * csc / float(total_descendent_leaves)) for csc in child_cons_seq_counts]
                child_cons_seq_counts = [max(1, csc) for csc in child_cons_seq_counts]  # don't eliminate any sequences entirely (this makes the proportions less accurate (in some cases), but is the easy way to handle the case where there's a ton of singleton children)
            if debug:
                print '  %s' % utils.color('green', node.taxon.label)
                csc_str = '  (reduced: %s)' % ' '.join([str(csc) for csc in child_cons_seq_counts]) if total_descendent_leaves > n_max_cons_seqs else ''
                print '      desc leaves per child: %s%s' % (' '.join(str(c.n_descendent_leaves) for c in node.child_node_iter()), csc_str)
            child_seqfos = [{'name' : cn.taxon.label + '-leaf-' + str(il), 'seq' : cn.seq} for cn, count in zip(node.child_node_iter(), child_cons_seq_counts) for il in range(count)]
            node.seq = utils.cons_seq(0.01, aligned_seqfos=child_seqfos, tie_resolver_seq=getline(root_label)['naive_seq'])  #, debug=debug)  # the consensus has an N at every position where the constituent sequences gave a tie. But Ns screw up the distances (especially because once we *get* an N, we can't get rid of it and it's propagated all the way up the tree), and in almost all cases the correct choice should be the naive base, so we use that
            node.n_descendent_leaves = total_descendent_leaves
            for edge in node.child_edge_iter():
                from_fasttree = False
                if edge.length == default_edge_length:  # otherwise it was set by fasttree, and it's probably better than what we'd get from this (it'd be nice to skip the cons seq stuff for the whole fasttree subtree, but then we don't have the cons seqs we need for later)
                    edge.length = utils.hamming_distance(edge.head_node.seq, node.seq) / float(len(node.seq))
                else:
                    from_fasttree = True
                if min_edge_length is not None:
                    edge.length = max(min_edge_length, edge.length)
                if debug:
                    print '       %6.3f   %s  %s' % (edge.length, utils.color('blue' if from_fasttree else 'red', 'x'), edge.head_node.taxon.label)

        if debug:
            print '        naive seq %s' % getline(root_label)['naive_seq'] # NOTE might be worthwhile to add an edge connecting seed node and the actual naive sequence (i.e. for cases where our approximate naive is off)
            print '    root cons seq %s' % utils.color_mutants(getline(root_label)['naive_seq'], dtree.seed_node.seq)

        for node in dtree.preorder_node_iter():
            del node.uids
            del node.seq
            del node.n_descendent_leaves

        treeutils.label_nodes(dtree, ignore_existing_internal_node_labels=True, ignore_existing_internal_taxon_labels=True, debug=debug)
        dtree.update_bipartitions()  # probably don't really need this
        if debug:
            print treeutils.utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=dtree, width=250))

        return dtree
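
The star-subtree branch above can be isolated: a root node whose taxon label joins all the uids, with each uid added as a direct child leaf. A sketch assuming dendropy 4:

import dendropy  # assumes dendropy 4

def make_star_tree(uids, edge_length=1.):
    # one root labeled with all the uids joined by ':', each uid a direct child
    root_label = ':'.join(uids)
    tns = dendropy.TaxonNamespace([root_label])
    root_node = dendropy.Node(taxon=tns.get_taxon(root_label))
    dtree = dendropy.Tree(taxon_namespace=tns, seed_node=root_node)
    for uid in uids:
        ttaxon = dendropy.Taxon(uid)
        tns.add_taxon(ttaxon)
        root_node.new_child(taxon=ttaxon, edge_length=edge_length)
    return dtree

print(make_star_tree(['a', 'b', 'c']).as_string(schema='newick'))
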
Example #10
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    seqfos = utils.read_fastx(bcr_phylo_fasta_fname(outdir))  # output mutated sequences from bcr-phylo

    assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)
    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        mline = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(mline)
        del mline['tree']
        mline['unique_ids'] = [sfo['name']]
        mline['seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['input_seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['duplicates'] = [[]]
        reco_info[sfo['name']] = mline
        try:
            utils.add_implicit_info(glfo, mline)
        except:  # TODO not sure if I really want to leave this in long term, but it shouldn't hurt anything (it's crashing on unequal naive/mature sequence lengths, and I need this to track down which event it is) UPDATE: yeah it was just because something crashed in the middle of writing a .fa file
            print 'implicit info adding failed for ievent %d in %s' % (ievent, outdir)
            lines = traceback.format_exception(*sys.exc_info())
            print utils.pad_lines(''.join(lines))  # NOTE this will still crash on the next line if implicit info adding failed
    final_line = utils.synthesize_multi_seq_line_from_reco_info([sfo['name'] for sfo in seqfos], reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        kdfname, nwkfname = '%s/kd-vals.csv' % outdir, '%s/simu.nwk' % outdir
        if not utils.output_exists(args, kdfname, outlabel='kd/nwk conversion', offset=4):  # eh, don't really need to check for both kd and nwk files, chances of only one being missing are really small, and it'll just crash when it looks for it a couple lines later
            cmd = './bin/read-bcr-phylo-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s --newick-tree-file %s' % (outdir, args.extrastr, kdfname, nwkfname)
            utils.run_ete_script(cmd, ete_path, debug=args.n_procs==1)
        nodefo = {}
        with open(kdfname) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                nodefo[line['uid']] = {
                    'kd' : float(line['kd']),
                    'relative_kd' : float(line['relative_kd']),
                    'lambda' : line.get('lambda', None),
                    'target_index' : int(line['target_index']),
                }
        if len(set(nodefo) - set(final_line['unique_ids'])) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print '        in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (set(nodefo) - set(final_line['unique_ids']))
        if len(set(final_line['unique_ids']) - set(nodefo)) > 0:
            print '        in final_line, but missing from kdvals: %s' % ' '.join(set(final_line['unique_ids']) - set(nodefo))
        final_line['affinities'] = [1. / nodefo[u]['kd'] for u in final_line['unique_ids']]
        final_line['relative_affinities'] = [1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']]
        final_line['lambdas'] = [nodefo[u]['lambda'] for u in final_line['unique_ids']]
        final_line['nearest_target_indices'] = [nodefo[u]['target_index'] for u in final_line['unique_ids']]
        tree = treeutils.get_dendro_tree(treefname=nwkfname)
        tree.scale_edges(1. / numpy.mean([len(s) for s in final_line['seqs']]))
        if args.debug:
            print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree), padwidth=12)
        final_line['tree'] = tree.as_string(schema='newick')
    tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr))
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]

    return final_line
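
The scale_edges() call divides every branch length by the mean sequence length, which presumably converts edge lengths from absolute mutation counts to per-site distances. A toy sketch assuming dendropy 4 and hypothetical sequences:

import dendropy  # assumes dendropy 4

tree = dendropy.Tree.get(data='(a:3,b:6):0;', schema='newick')
seqs = ['ACGTAC', 'ACGTTT']  # hypothetical sequences, mean length 6
tree.scale_edges(1. / (sum(len(s) for s in seqs) / float(len(seqs))))  # multiply every edge by 1/6
print(tree.as_string(schema='newick'))  # a:0.5, b:1.0
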
Example #11
def get_indelfo_from_cigar(cigarstr,
                           qrseq,
                           qrbounds,
                           glseq,
                           glbounds,
                           gene,
                           vsearch_conventions=False,
                           debug=False):
    if debug:
        print '  initial:'
        print '    %s' % color_cigar(cigarstr)
        print '    qr %3d %3d %s' % (qrbounds[0], qrbounds[1], qrseq)
        print '    gl %3d %3d %s' % (glbounds[0], glbounds[1], glseq)

    cigars = [
        split_cigarstr(cstr) for cstr in re.findall('[0-9]*[A-Z]', cigarstr)
    ]  # split cigar string into its parts, then split each part into the code and the length
    if vsearch_conventions:
        assert utils.get_region(gene) == 'v'  # would need to be generalized
        cigars = [
            (code.translate(string.maketrans('ID', 'DI')), length)
            for code, length in cigars
        ]  # vsearch reverses what's the query and what's the target/gene/whathaveyou compared to what ig-sw does
        for iend in [0, -1]:
            if cigars[iend][0] == 'I':  # qr extends beyond gl: ig-sw calls these soft-clips, vsearch calls them insertions
                cigars[iend] = ('S', cigars[iend][1])
            elif cigars[iend][0] == 'D':  # gl goes past qr: ig-sw just calls them not part of the alignment, vsearch calls them deletions
                cigars.pop(iend)
    cigars = [(code, length) for code, length in cigars
              if code != 'S']  # remove soft-clipping
    cigarstr = ''.join(['%d%s' % (l, c) for c, l in cigars])
    qrseq = qrseq[qrbounds[0]:qrbounds[1]]  # ...and trim qrseq and glseq
    glseq = glseq[glbounds[0]:glbounds[1]]

    if debug:
        print '  parsed:'
        print '    %s' % color_cigar(cigarstr)
        print '    %s' % '   '.join(['%s %d' % (c, l) for c, l in cigars])
        print '    qr %s' % qrseq
        print '    gl %s' % glseq

    # check consistency between cigar and qr/gl seqs
    for seqtype, tmpseq, tmpcode in (('qr', qrseq, 'D'), ('gl', glseq, 'I')):
        cigar_len = sum([length for code, length in cigars if code != tmpcode])
        if cigar_len != len(tmpseq):
            raise Exception('cigar length %d doesn\'t match %s seq length %d' %
                            (cigar_len, seqtype, len(tmpseq)))

    indelfo = get_empty_indel()  # replacement_seq: query seq with insertions removed and germline bases inserted at the position of deletions
    # TODO should probably also ignore indels on either end (I think only relevant for vsearch)
    if 'I' not in cigarstr and 'D' not in cigarstr:  # has to happen after we've changed from vsearch conventions
        if debug:
            print '  no indels'
        return indelfo

    # add each indel to <indelfo['indels']>, and build <codestr> and <tmp_indices> to keep track of what's going on at each position
    codestr = ''.join(
        [length * code for code, length in cigars]
    )  # each position is cigar code corresponding to that position in the alignment
    qpos = 0  # position within the alignment (it's advanced by every cigar code, including deletions, so it's not strictly the query sequence position)
    tmp_indices = [
    ]  # integer for each position in the alignment, giving the index of the indel that we're within (None if we're not in an indel)
    if debug:
        print '      code  length'
    for code, length in cigars:
        if debug:
            print '        %s     %3d' % (code, length)
        if code == 'I':  # advance qr seq but not gl seq
            indelfo['indels'].append(
                {
                    'type': 'insertion',
                    'pos': qpos,
                    'len': length,
                    'seqstr': []
                }
            )  # insertion begins at <pos> (note that 'seqstr' later on gets converted from a list to a string)
            tmp_indices += [
                len(indelfo['indels']) - 1 for _ in range(length)
            ]  # indel index corresponding to this position in the alignment
        elif code == 'D':  # advance gl seq but not qr seq
            indelfo['indels'].append(
                {
                    'type': 'deletion',
                    'pos': qpos,
                    'len': length,
                    'seqstr': []
                }
            )  # first deleted base is <pos> (well, first base which is in the position of the first deleted base)
            tmp_indices += [
                len(indelfo['indels']) - 1 for _ in range(length)
            ]  # indel index corresponding to this position in the alignment
        else:
            tmp_indices += [
                None for _ in range(length)
            ]  # indel index corresponding to this position in the alignment
        qpos += length

    if debug:
        print '      %s  codestr' % ''.join(
            [c if c not in 'ID' else utils.color('blue', c) for c in codestr])
        print '      %s  indel index' % ''.join(
            [str(ti if ti is not None else ' ') for ti in tmp_indices])

    # then construct the dbg strings, indel-reversed input sequence, and 'seqstr' entries in indelfo
    qrprintstr, glprintstr, reversed_seq = [], [], []
    iqr, igl = 0, 0
    for icode in range(len(codestr)):
        code = codestr[icode]
        if code == 'M':
            qrbase = qrseq[iqr]
            if qrbase != glseq[igl]:
                qrbase = utils.color('red', qrbase)
            qrprintstr.append(qrbase)
            glprintstr.append(glseq[igl])
            reversed_seq.append(
                qrseq[iqr]
            )  # add the base to the overall sequence with all indels reversed
        elif code == 'S':
            continue
        elif code == 'I':
            qrprintstr.append(utils.color('light_blue', qrseq[iqr]))
            glprintstr.append(utils.color('light_blue', '*'))
            indelfo['indels'][tmp_indices[icode]]['seqstr'].append(
                qrseq[iqr])  # and to the sequence of just this indel
            igl -= 1
        elif code == 'D':
            qrprintstr.append(utils.color('light_blue', '*'))
            glprintstr.append(utils.color('light_blue', glseq[igl]))
            reversed_seq.append(
                glseq[igl]
            )  # add the base to the overall sequence with all indels reversed
            indelfo['indels'][tmp_indices[icode]]['seqstr'].append(
                glseq[igl])  # and to the sequence of just this indel
            iqr -= 1
        else:
            raise Exception('unhandled cigar code %s' % code)

        iqr += 1
        igl += 1

    # convert character lists to strings (indels are rare enough that this probably isn't that much faster, but it just feels wrong not to)
    qrprintstr = ''.join(qrprintstr)
    glprintstr = ''.join(glprintstr)
    indelfo['reversed_seq'] = ''.join(reversed_seq)
    for ifo in indelfo['indels']:
        ifo['seqstr'] = ''.join(ifo['seqstr'])

    # make the dbg str for indelfo
    gwidth = str(
        len(gene))  # doesn't account for color abbreviation, but oh well
    dbg_str_list = [
        ('%' + gwidth + 's  %s') %
        (utils.color_gene(gene, width=int(gwidth), leftpad=True), glprintstr),
        ('%' + gwidth + 's  %s') % ('query', qrprintstr)
    ]
    for idl in indelfo['indels']:
        dbg_str_list.append('%10s: %d base%s at %d (%s)' %
                            (idl['type'], idl['len'], utils.plural(
                                idl['len']), idl['pos'], idl['seqstr']))
    indelfo['dbg_str'] = '\n'.join(dbg_str_list)

    if debug:
        print utils.pad_lines(indelfo['dbg_str'], 0)

    return indelfo
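
Given the gap seqs built in Example #8, the indel-reversed sequence produced here is expressible as a single pass over aligned columns: drop query bases where the germline has a gap (insertions) and restore germline bases where the query has a gap (deletions). A sketch using '.' as the gap character:

def reversed_seq_from_gap_seqs(qr_gap_seq, gl_gap_seq, gap_char='.'):
    # equivalent to the M/I/D branches above, but driven by the gap seqs
    reversed_seq = []
    for qr_base, gl_base in zip(qr_gap_seq, gl_gap_seq):
        if gl_base == gap_char:       # insertion: drop the query base
            continue
        elif qr_base == gap_char:     # deletion: restore the germline base
            reversed_seq.append(gl_base)
        else:                         # match/mismatch: keep the query base
            reversed_seq.append(qr_base)
    return ''.join(reversed_seq)

print(reversed_seq_from_gap_seqs('ACG.T', 'AC.GT'))  # 'ACGT'
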
Example #12
    def get_mature_line(sfos,
                        naive_line,
                        glfo,
                        nodefo,
                        dtree,
                        target_sfos,
                        locus=None):
        assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
        assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
        if args.debug:
            utils.print_reco_event(naive_line)
        reco_info = collections.OrderedDict()
        for sfo in sfos:
            mline = utils.get_non_implicit_copy(naive_line)
            del mline['tree']
            mline['unique_ids'] = [sfo['name']]
            mline['seqs'] = [sfo['seq']]
            mline['input_seqs'] = [
                sfo['seq']
            ]  # it's really important to set both the seqs (since they're both already in there from the naive line)
            mline['duplicates'] = [[]]
            reco_info[sfo['name']] = mline
            try:
                utils.add_implicit_info(glfo, mline)
            except:  # TODO not sure if I really want to leave this in long term, but it shouldn't hurt anything (it's crashing on unequal naive/mature sequence lengths, and I need this to track down which event it is) UPDATE: yeah it was just because something crashed in the middle of writing a .fa file
                print 'implicit info adding failed for ievent %d in %s' % (
                    ievent, outdir)
                lines = traceback.format_exception(*sys.exc_info())
                print utils.pad_lines(
                    ''.join(lines)
                )  # NOTE this will still crash on the next line if implicit info adding failed
        final_line = utils.synthesize_multi_seq_line_from_reco_info(
            [sfo['name'] for sfo in sfos], reco_info)

        ftree = copy.deepcopy(dtree)
        if locus is not None:

            def ltr(u):
                return u + '-' + locus

            new_nodefo = {}
            for u_old in nodefo:
                new_nodefo[ltr(u_old)] = nodefo[u_old]
            nodefo = new_nodefo
            treeutils.translate_labels(ftree,
                                       [(u, ltr(u))
                                        for u in final_line['unique_ids']])
            final_line['unique_ids'] = [
                ltr(u) for u in final_line['unique_ids']
            ]
            assert len(sfos) == len(final_line['unique_ids'])
            for iseq, sfo in enumerate(sfos):
                naive_id = naive_line['unique_ids'][0]
                assert naive_id.count('-') == 1
                bstr = naive_id.replace('-' + locus, '')
                pids = final_line['paired-uids'][iseq]
                assert len(pids) == 1 and pids[0].find(bstr) == 0 and pids[0].count('-') == 1 and pids[0].split('-')[1] in utils.loci  # if uid is xxx-igh, paired id should be e.g. xxx-igk
                final_line['paired-uids'][iseq] = [
                    p.replace(bstr, sfo['name']) for p in pids
                ]

        if args.debug:
            utils.print_reco_event(final_line)

        # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
        if len(set(nodefo) - set(final_line['unique_ids'])) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print '        in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (
                set(nodefo) - set(final_line['unique_ids']))
        if len(set(final_line['unique_ids']) - set(nodefo)) > 0:
            print '        in final_line, but missing from kdvals: %s' % ' '.join(
                set(final_line['unique_ids']) - set(nodefo))
        final_line['affinities'] = [
            1. / nodefo[u]['kd'] for u in final_line['unique_ids']
        ]
        final_line['relative_affinities'] = [
            1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']
        ]
        final_line['lambdas'] = [
            nodefo[u]['lambda'] for u in final_line['unique_ids']
        ]
        final_line['nearest_target_indices'] = [
            nodefo[u]['target_index'] for u in final_line['unique_ids']
        ]
        ftree.scale_edges(1. / numpy.mean([len(s)
                                           for s in final_line['seqs']]))
        if args.debug:
            print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=ftree),
                                  padwidth=12)
        final_line['tree'] = ftree.as_string(schema='newick')

        tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
        tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything
        final_line['target_seqs'] = [tfo['seq'] for tfo in target_sfos]
        return final_line
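
The locus handling above renames every uid by appending the locus (e.g. 'xxx' becomes 'xxx-igh') and rekeys nodefo to match. A minimal sketch of that convention with hypothetical values:

def ltr(u, locus):
    # per-locus uid convention: base uid plus '-' plus locus
    return u + '-' + locus

nodefo = {'t1': {'kd': 1.5}}  # hypothetical per-node info keyed by bare uid
new_nodefo = {ltr(u, 'igh'): info for u, info in nodefo.items()}
print(list(new_nodefo))  # ['t1-igh']
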