예제 #1
0
    def infer_tree_from_leaves(self, region, in_tree, leafseqs, naive_seq):
        """Infer a tree from <leafseqs> with FastTree and print it next to <in_tree>.

        <naive_seq> is written as an extra sequence named 'naive', and the inferred tree is rerooted at that leaf.
        <in_tree> is a newick string; <region> is only used to label the printed output ('all' is shown as 'full sequence').
        If self.args.debug is set, also prints the two mean heights and three dendropy tree distances.
        Returns None. Raises subprocess.CalledProcessError if FastTree exits non-zero.
        """
        import dendropy  # function-scope import: dendropy is only needed when this check runs (just a name binding if it's already loaded)
        taxon_namespace = dendropy.TaxonNamespace()  # shared by both trees so they can be compared below

        with tempfile.NamedTemporaryFile() as tmpfile:
            # NOTE str is written to a file opened in the default (binary) mode, i.e. this relies on python 2 str semantics
            tmpfile.write('>%s\n%s\n' % ('naive', naive_seq))
            for iseq, leafseq in enumerate(leafseqs, 1):  # leaves are named t1, t2, ...
                tmpfile.write('>t%s\n%s\n' % (iseq, leafseq))  # NOTE the order of the leaves/names is checked when reading bppseqgen output
            tmpfile.flush()  # essential: FastTree opens the file by name, so buffered contents have to be on disk first
            with open(os.devnull, 'w') as fnull:  # discard FastTree's stderr
                # argument list rather than a shell string, so the temp file name is never interpreted by a shell
                out_tree = subprocess.check_output(
                    ['./bin/FastTree', '-gtr', '-nt', tmpfile.name],
                    stderr=fnull)

        # reroot the inferred tree at the naive sequence, then go back to a newick string
        out_dtree = dendropy.Tree.get_from_string(
            out_tree, 'newick', taxon_namespace=taxon_namespace)
        out_dtree.reroot_at_node(
            out_dtree.find_node_with_taxon_label('naive'),
            update_bipartitions=True)
        out_tree = out_dtree.as_string(schema='newick', suppress_rooting=True)

        in_height = treegenerator.get_mean_height(in_tree)
        out_height = treegenerator.get_mean_height(out_tree)
        base_width = 100
        # draw the output tree with a width proportional to its height relative to the input tree (a zero-height input tree would otherwise divide by zero, so then just use the base width)
        if in_height > 0:
            out_width = int(base_width * out_height / in_height)
        else:
            out_width = base_width

        print('  %s trees:' % ('full sequence' if region == 'all' else region))
        print('    %s' % utils.color('blue', 'input:'))
        print(treegenerator.get_ascii_tree(in_tree,
                                           extra_str='      ',
                                           width=base_width))
        print('    %s' % utils.color('blue', 'output:'))
        print(treegenerator.get_ascii_tree(out_tree,
                                           extra_str='        ',
                                           width=out_width))

        if not self.args.debug:
            return

        # the input tree is only needed as a dendropy object for the distance calculations, so only parse it here
        # NOTE(review): the output tree has the extra 'naive' leaf that was added above -- confirm the distances below are meaningful with that leaf present
        in_dtree = dendropy.Tree.get_from_string(
            in_tree, 'newick', taxon_namespace=taxon_namespace)
        treecompare = dendropy.calculate.treecompare
        print('                   heights: %.3f   %.3f' % (in_height, out_height))
        print('      symmetric difference: %d' % treecompare.symmetric_difference(in_dtree, out_dtree))
        print('        euclidean distance: %f' % treecompare.euclidean_distance(in_dtree, out_dtree))
        print('              r-f distance: %f' % treecompare.robinson_foulds_distance(in_dtree, out_dtree))
예제 #2
0
    def infer_tree_from_leaves(self, region, in_tree, leafseqs):
        """Infer a tree from <leafseqs> with FastTree and print it below <in_tree>.

        <in_tree> is a newick string; <region> is only used to label the printed output.
        If self.args.debug is set, also prints the two mean heights and three dendropy tree distances.
        Returns None. Raises subprocess.CalledProcessError if FastTree exits non-zero.
        """
        with tempfile.NamedTemporaryFile() as tmpfile:
            # NOTE str is written to a file opened in the default (binary) mode, i.e. this relies on python 2 str semantics
            for iseq, leafseq in enumerate(leafseqs, 1):  # leaves are named t1, t2, ...
                tmpfile.write('>t%s\n%s\n' % (iseq, leafseq))  # NOTE the order of the leaves/names is checked when reading bppseqgen output
            tmpfile.flush()  # essential: FastTree opens the file by name, so buffered contents have to be on disk first
            with open(os.devnull, 'w') as fnull:  # discard FastTree's stderr
                # argument list rather than a shell string, so the temp file name is never interpreted by a shell
                out_tree = subprocess.check_output(
                    ['./bin/FastTree', '-gtr', '-nt', tmpfile.name],
                    stderr=fnull)

        in_height = treegenerator.get_mean_height(in_tree)
        out_height = treegenerator.get_mean_height(out_tree)
        base_width = 100
        # draw the output tree with a width proportional to its height relative to the input tree (a zero-height input tree would otherwise divide by zero, so then just use the base width)
        if in_height > 0:
            out_width = int(base_width * out_height / in_height)
        else:
            out_width = base_width

        print('  %s trees: input/output' % region)
        print(treegenerator.get_ascii_tree(in_tree,
                                           extra_str='        ',
                                           width=base_width))
        print(treegenerator.get_ascii_tree(out_tree,
                                           extra_str='        ',
                                           width=out_width))

        if not self.args.debug:
            return

        # dendropy is only needed for the debug distances, so don't import it or parse anything unless we're going to print them
        import dendropy
        # NOTE(review): the two trees are parsed independently, so they presumably don't share a taxon set, which dendropy's tree comparisons may require -- confirm against the installed dendropy version
        in_dtree = dendropy.Tree.get_from_string(in_tree, 'newick')
        out_dtree = dendropy.Tree.get_from_string(out_tree, 'newick')
        print('                   heights: %.3f   %.3f' % (in_height, out_height))
        print('      symmetric difference: %d' % in_dtree.symmetric_difference(out_dtree))
        print('        euclidean distance: %f' % in_dtree.euclidean_distance(out_dtree))
        print('              r-f distance: %f' % in_dtree.robinson_foulds_distance(out_dtree))
예제 #3
0
    def add_mutants(self, reco_event, irandom):
        """Pick a random tree, mutate each region of <reco_event> along a rescaled copy of it, and store the results in <reco_event>.

        Appends one sequence per leaf to reco_event.final_seqs (with a matching entry in reco_event.final_codon_positions),
        then adds shm indels, sets the event's line using <irandom>, and checks the tree simulation.
        <irandom> is also passed as the seed when preparing each bppseqgen command.
        If the mutation multiplier is exactly zero, the unmutated recombined sequence is stored instead and nothing else is done.
        Returns None.
        """
        mute_mult = self.args.mutation_multiplier
        if mute_mult is not None and mute_mult == 0.:  # a multiplier of exactly zero breaks some of the steps below, so short-circuit with the unmutated sequence
            reco_event.final_seqs.append(reco_event.recombined_seq)
            reco_event.indelfos = [indelutils.get_empty_indel() for _ in reco_event.final_seqs]  # one empty indel per final sequence
            return

        # Each line in <self.treeinfo> is a newick tree followed by per-region ratios, e.g.
        #   (t2:0.003751736951,t1:0.003751736951):0.001248262937;v:0.98,d:1.8,j:0.87
        # The tree's branch lengths describe the whole sequence (i.e. the sequence-wide mutation frequency).
        # Since v, d, and j mutate at different rates, each region is simulated separately on a copy of the tree
        # that's been rescaled by that region's ratio (and by the mutation multiplier, if there is one).
        # NOTE the ratios are the same on every line of the file; they're just tacked on there as a way to pass them from TreeGenerator to here.
        itree = random.randint(0, len(self.treeinfo) - 1)
        treefostr = self.treeinfo[itree]
        assert treefostr.count(';') == 1
        isplit = treefostr.find(';') + 1
        chosen_tree = treefostr[:isplit]  # keep the semi-colon with the tree
        ratio_strs = treefostr[isplit:].split(',')  # everything after the semi-colon is '<region>:<ratio>' pairs

        mean_total_height = treegenerator.get_mean_height(chosen_tree)
        regional_heights = {}  # target height for each region's tree (mutation multiplier already applied)
        for ratio_str in ratio_strs:
            region, ratio = ratio_str.split(':')
            assert region in utils.regions
            ratio = float(ratio)
            if mute_mult is not None:  # scale every branch length by the requested factor
                ratio *= mute_mult
            regional_heights[region] = mean_total_height * ratio

        scaled_trees = {}
        for region in utils.regions:
            scaled_trees[region] = treegenerator.rescale_tree(chosen_tree, regional_heights[region])

        if self.args.debug:
            print('  chose tree with total height %f' % treegenerator.get_mean_height(chosen_tree))
            height_strs = []
            for region in utils.regions:
                actual_height = treegenerator.get_mean_height(scaled_trees[region])
                height_strs.append('%s %.3f  (expected %.3f)' % (region, actual_height, regional_heights[region]))
            print('    regional trees rescaled to heights:  %s' % '   '.join(height_strs))
            print(treegenerator.get_ascii_tree(chosen_tree, extra_str='    '))

        n_leaves = treegenerator.get_n_leaves(chosen_tree)

        # set up one bppseqgen command per region (entries are in the same order as <utils.regions>)
        cmdfos = []
        for region in utils.regions:
            seq_to_mutate = reco_event.eroded_seqs[region]
            if region == 'd':  # the insertions on either side of d get simulated together with it
                seq_to_mutate = reco_event.insertions['vd'] + seq_to_mutate + reco_event.insertions['dj']
            cmdfo = self.prepare_bppseqgen(seq_to_mutate,
                                           scaled_trees[region],
                                           n_leaves,
                                           reco_event.genes[region],
                                           reco_event,
                                           seed=irandom)
            cmdfos.append(cmdfo)

        cmds_to_run = [cfo for cfo in cmdfos if cfo is not None]  # a None entry means that region's sequence was zero length, so there's nothing to run
        utils.run_cmds(cmds_to_run, sleep=False)

        # collect the mutated sequences for each region
        mseqs = {}
        for region, cmdfo in zip(utils.regions, cmdfos):  # NOTE relies on <cmdfos> having been filled in <utils.regions> order by the loop above
            if cmdfo is None:
                mseqs[region] = [''] * n_leaves  # an empty string for each leaf node
            else:
                mseqs[region] = self.read_bppseqgen_output(cmdfo, n_leaves)

        assert len(reco_event.final_seqs) == 0
        for iseq in range(n_leaves):
            full_seq = ''.join(mseqs[region][iseq] for region in ('v', 'd', 'j'))
            full_seq = reco_event.revert_conserved_codons(full_seq, debug=self.args.debug)  # if mutation messed up the conserved codons, put them back how they started
            reco_event.final_seqs.append(full_seq)
            # each sequence gets its own copy of the codon positions, since shm indels can shift them independently
            reco_event.final_codon_positions.append(copy.deepcopy(reco_event.post_erosion_codon_positions))

        self.add_shm_indels(reco_event)

        # the line is set here because it's used when checking the tree simulation, and the uids should always get set at the same point in the workflow
        reco_event.setline(irandom)

        self.check_tree_simulation(mean_total_height, regional_heights, scaled_trees, mseqs, reco_event)

        if self.args.debug:
            utils.print_reco_event(reco_event.line, extra_str='    ')