Exemplo n.º 1
0
    def build_subproblems(self):
        (alignment, tree) = self.read_alignment_and_tree()

        if options().distance != 1:
            self.compute_distances(alignment)

        assert isinstance(tree, PhylogeneticTree)
        assert isinstance(alignment, MutableAlignment)

        tree.get_tree().resolve_polytomies()
        # Label edges with numbers so that we could assemble things back
        # at the end
        tree.lable_edges()
        ''' Make sure size values are set, and are meaningful. '''
        self.check_and_set_sizes(alignment.get_num_taxa())

        self._create_root_problem(tree, alignment)
        ''' Decompose the tree based on placement subsets'''
        placement_tree_map = PhylogeneticTree(Tree(
            tree.den_tree)).decompose_tree(
                self.options.placement_size,
                strategy=self.strategy,
                minSize=self.options.placement_size /
                int(self.options.exhaustive.placementminsubsetsizefacotr),
                tree_map={},
                pdistance=1,
                decomp_strategy=self.decomp_strategy,
                distances=self.distances,
                maxDiam=None)
        assert len(placement_tree_map) > 0, (
            "Tree could not be decomposed"
            " given the following settings; strategy:%s minsubsetsize:%s"
            " placement_size:%s" %
            (self.strategy, self.minsubsetsize, self.options.placement_size))
        _LOG.info("Breaking into %d placement subsets." %
                  len(placement_tree_map))
        ''' For placement subsets create a placement subproblem,
            and decompose further'''
        for (p_key, p_tree) in placement_tree_map.items():
            assert isinstance(p_tree, PhylogeneticTree)
            placement_problem = SeppProblem(p_tree.leaf_node_names(),
                                            self.root_problem)
            placement_problem.subtree = p_tree
            placement_problem.label = "P_%s" % str(p_key)
            _LOG.debug(
                "Placement subset %s has %d nodes" %
                (placement_problem.label, len(p_tree.leaf_node_names())))
            ''' Further decompose to alignment subsets '''
            alignment_tree_map = PhylogeneticTree(Tree(
                p_tree.den_tree)).decompose_tree(
                    self.options.alignment_size,
                    strategy=self.strategy,
                    minSize=self.minsubsetsize,
                    tree_map={},
                    decomp_strategy=self.options.decomp_strategy,
                    pdistance=options().distance,
                    distances=self.distances,
                    maxDiam=self.options.maxDiam)
            assert len(alignment_tree_map) > 0, (
                "Tree could not be decomposed"
                " given the following settings; strategy:%s"
                " minsubsetsize:%s alignmet_size:%s" %
                (self.strategy, self.minsubsetsize,
                 self.options.alignment_size))

            _LOG.debug("Placement subset %s has %d alignment subsets: %s" %
                       (placement_problem.label, len(alignment_tree_map),
                        str(sorted(alignment_tree_map.keys()))))
            _LOG.debug("Placement subset %s has %d taxa:" %
                       (placement_problem.label,
                        sum([
                            len(a_tree.leaf_node_names())
                            for a_tree in alignment_tree_map.values()
                        ])))
            for (a_key, a_tree) in alignment_tree_map.items():
                assert isinstance(a_tree, PhylogeneticTree)
                self.modify_tree(a_tree)
                alignment_problem = SeppProblem(a_tree.leaf_node_names(),
                                                placement_problem)
                alignment_problem.subtree = a_tree
                alignment_problem.label = "A_%s_%s" % (str(p_key), str(a_key))

        _LOG.info("Breaking into %d alignment subsets." %
                  (len(list(self.root_problem.iter_leaves()))))
        ''' Divide fragments into chunks, to help achieve better parallelism'''
        fragment_chunk_files = self.create_fragment_files()
        self.root_problem.fragment_chunks = len(fragment_chunk_files)
        for alignment_problem in self.root_problem.iter_leaves():
            for afc in range(0, self.root_problem.fragment_chunks):
                frag_chunk_problem = SeppProblem(alignment_problem.taxa,
                                                 alignment_problem)
                frag_chunk_problem.subtree = alignment_problem.subtree
                frag_chunk_problem.label = alignment_problem.label.replace(
                    "A_", "FC_") + "_" + str(afc)
                frag_chunk_problem.fragments = fragment_chunk_files[afc]

        _LOG.info("Breaking each alignment subset into %d fragment chunks." %
                  self.root_problem.fragment_chunks)
        _LOG.debug("Subproblem structure: %s" % str(self.root_problem))
        return self.root_problem
Exemplo n.º 2
0
    def testExtendedAlignment(self):
        print "======= starting testExtendedAlignment ========="

        subset = [
            "SFIF", "SFII", "SCFC", "SGHD", "SDCC", "SBGE", "SFBB", "SDI",
            "SCGB", "SJGF", "SGBI", "SCJA", "SGAD", "SHEB", "SFHB", "SDJI",
            "SHED", "SJJJ", "SBBE", "SCCH", "SDJB", "SDAC", "SHEH", "SFDC",
            "SFEI", "SHHB", "SC", "SIAB", "SDDI", "SBCB", "SJB", "SEBD",
            "SFGD", "SHA", "SIDA", "SGHI", "SGIB", "SBFJ", "SFIE", "SCJF",
            "SJHJ", "SJBG", "SEJI", "SFFF", "SJ", "SIII", "SJHH", "SEIH",
            "SBDC", "SHDJ", "SJDD", "SGDB", "SIHA", "SIBB", "SECC", "SCAD",
            "SGBB", "SGIF", "SJHC", "SFCD", "SEAA", "SEFF", "SDFG", "SDJE",
            "SCFG", "SFH", "SCJ", "SDDD", "SEGD", "SCIH", "SDAG", "SCJE",
            "SFAJ", "SIDJ", "SE", "SHBC", "SJFF", "SCHD", "SBHA", "SEDF",
            "SFAF", "SEDD", "SDHD", "SGJD", "SIBH", "SGDF", "SIFA", "SJGA",
            "SIJB", "SFI", "SGA", "SBFC", "SBJA", "SFFC", "SFDH", "SFEE",
            "SBDF", "SGBJ", "SDHE", "SJIB", "SHHI", "SIDE", "SJII"
        ]

        alg = MutableAlignment()
        alg.read_filepath("data/simulated/test.fasta")
        alg.delete_all_gap()
        tlen = alg.get_length()

        frg = MutableAlignment()
        frg.read_filepath("data/simulated/test.fas")
        #print frg.get_num_taxa()

        pp = SeppProblem(alg.keys())
        pp.fragments = frg
        pp.subalignment = alg

        cp1 = SeppProblem(subset, pp)
        cp2 = SeppProblem(list(set(alg.keys()) - set(subset)), pp)
        cp1.fragments = ReadonlySubalignment(
            [k for k in frg.keys() if int(k[-1]) >= 9], frg)
        cp2.fragments = ReadonlySubalignment(
            [k for k in frg.keys() if int(k[-1]) <= 1], frg)

        cp1labels = cp1.write_subalignment_without_allgap_columns(
            "data/tmp/cp1.fasta")
        cp2labels = cp2.write_subalignment_without_allgap_columns(
            "data/tmp/cp2.fasta")
        tmp = MutableAlignment().read_filepath("data/tmp/cp1.fasta")
        assert all(
            [not tmp.is_all_gap(pos) for pos in xrange(0, tmp.get_length())])
        tmp = MutableAlignment().read_filepath("data/tmp/cp2.fasta")
        assert all(
            [not tmp.is_all_gap(pos) for pos in xrange(0, tmp.get_length())])

        cp1.fragments.write_to_path("data/tmp/cp1.frags.fas")
        cp2.fragments.write_to_path("data/tmp/cp2.frags.fas")
        '''We have done the hmmalign before. don't worry about that right now'''

        ext1 = ExtendedAlignment(cp1.fragments)
        ext1.build_extended_alignment("data/tmp/cp1.fasta",
                                      "data/tmp/cp1.extended.sto")
        ext1.relabel_original_columns(cp1labels)
        ext2 = ExtendedAlignment(cp2.fragments)
        ext2.build_extended_alignment("data/tmp/cp2.fasta",
                                      "data/tmp/cp2.extended.sto")
        ext2.relabel_original_columns(cp2labels)

        extmerger = ExtendedAlignment([])
        extmerger.merge_in(ext1)
        mixed = extmerger.merge_in(ext2)

        extmerger.write_to_path("data/tmp/extended.merged.fasta")

        assert extmerger.is_aligned(), "Merged alignment is not aligned"
        in1 = len([x for x in ext1._col_labels if x < 0])
        in2 = len([x for x in ext2._col_labels if x < 0])
        print "Merged:%d. Insertion1:%d Insertion2:%d BaseLen:%d" % (
            extmerger.get_length(), in1, in2, tlen)
        assert (in1 + in2 + tlen - mixed) == extmerger.get_length(
        ), "Lengths don't match up after merging. Merged:%d. Insertion1:%d Insertion2:%d BaseLen:%d Mixed-insertion: %d" % (
            extmerger.get_length(), in1, in2, tlen, mixed)
        assert (in1 + in2 - mixed) == len(
            list(extmerger.iter_insertion_columns())
        ), "Columns are not correctly labeled after merging. Merged insertion count:%d. Insertion1:%d Insertion2:%d Mixed-insertion: %d" % (
            len(list(extmerger.iter_insertion_columns())), in1, in1, mixed)

        tmp = extmerger.get_base_readonly_alignment().get_mutable_alignment()
        tmp.delete_all_gap()
        assert tmp.is_aligned(), "merged alignment should be aligned!"
        assert tmp.get_length() == tlen, "merged alignment has wrong length"
        assert all([alg[k] == s for (k, s) in tmp.items()
                    ]), "merged alignment should match original alignment"

        print "======= finished testExtendedAlignment ========="
Exemplo n.º 3
0
    def testExtendedAlignment(self):
        print "======= starting testExtendedAlignment ========="

        subset = ["SFIF","SFII","SCFC","SGHD","SDCC","SBGE","SFBB","SDI","SCGB","SJGF","SGBI","SCJA","SGAD","SHEB","SFHB","SDJI","SHED","SJJJ","SBBE","SCCH","SDJB","SDAC","SHEH","SFDC","SFEI","SHHB","SC","SIAB","SDDI","SBCB","SJB","SEBD","SFGD","SHA","SIDA","SGHI","SGIB","SBFJ","SFIE","SCJF","SJHJ","SJBG","SEJI","SFFF","SJ","SIII","SJHH","SEIH","SBDC","SHDJ","SJDD","SGDB","SIHA","SIBB","SECC","SCAD","SGBB","SGIF","SJHC","SFCD","SEAA","SEFF","SDFG","SDJE","SCFG","SFH","SCJ","SDDD","SEGD","SCIH","SDAG","SCJE","SFAJ","SIDJ","SE","SHBC","SJFF","SCHD","SBHA","SEDF","SFAF","SEDD","SDHD","SGJD","SIBH","SGDF","SIFA","SJGA","SIJB","SFI","SGA","SBFC","SBJA","SFFC","SFDH","SFEE","SBDF","SGBJ","SDHE","SJIB","SHHI","SIDE","SJII"]
         
        alg = MutableAlignment()
        alg.read_filepath("data/simulated/test.fasta")
        alg.delete_all_gap()
        tlen = alg.get_length()                    
        
        frg = MutableAlignment()
        frg.read_filepath("data/simulated/test.fas")
        #print frg.get_num_taxa()
        
        pp = SeppProblem(alg.keys())
        pp.fragments = frg
        pp.subalignment = alg
        
        cp1 = SeppProblem(subset, pp)
        cp2 = SeppProblem(list(set(alg.keys()) -set(subset)), pp)
        cp1.fragments = ReadonlySubalignment([k for k in frg.keys() if int(k[-1]) >= 9], frg)
        cp2.fragments = ReadonlySubalignment([k for k in frg.keys() if int(k[-1]) <= 1], frg)
        
        cp1labels = cp1.write_subalignment_without_allgap_columns("data/tmp/cp1.fasta")
        cp2labels = cp2.write_subalignment_without_allgap_columns("data/tmp/cp2.fasta")
        tmp = MutableAlignment().read_filepath("data/tmp/cp1.fasta")
        assert all([not tmp.is_all_gap(pos) for pos in xrange(0,tmp.get_length())])        
        tmp = MutableAlignment().read_filepath("data/tmp/cp2.fasta")
        assert all([not tmp.is_all_gap(pos) for pos in xrange(0,tmp.get_length())])
        
        cp1.fragments.write_to_path("data/tmp/cp1.frags.fas")
        cp2.fragments.write_to_path("data/tmp/cp2.frags.fas")
        
        '''We have done the hmmalign before. don't worry about that right now'''
        
        ext1 = ExtendedAlignment(cp1.fragments)
        ext1.build_extended_alignment("data/tmp/cp1.fasta", "data/tmp/cp1.extended.sto")
        ext1.relabel_original_columns(cp1labels)
        ext2 = ExtendedAlignment(cp2.fragments)
        ext2.build_extended_alignment("data/tmp/cp2.fasta", "data/tmp/cp2.extended.sto")
        ext2.relabel_original_columns(cp2labels)
        
        extmerger = ExtendedAlignment([])
        extmerger.merge_in(ext1)
        mixed = extmerger.merge_in(ext2)
                        
        extmerger.write_to_path("data/tmp/extended.merged.fasta")        

        assert extmerger.is_aligned(), "Merged alignment is not aligned"
        in1 = len([x for x in ext1._col_labels if x<0])
        in2 = len([x for x in ext2._col_labels if x<0])
        print "Merged:%d. Insertion1:%d Insertion2:%d BaseLen:%d" %(extmerger.get_length(),in1 , in2 , tlen)
        assert ( in1 + in2 + tlen - mixed) == extmerger.get_length(), "Lengths don't match up after merging. Merged:%d. Insertion1:%d Insertion2:%d BaseLen:%d Mixed-insertion: %d"  %(extmerger.get_length(),in1, in2 , tlen, mixed)
        assert ( in1 + in2 - mixed) == len(list(extmerger.iter_insertion_columns())), "Columns are not correctly labeled after merging. Merged insertion count:%d. Insertion1:%d Insertion2:%d Mixed-insertion: %d"  %(len(list(extmerger.iter_insertion_columns())),in1 , in1, mixed)
         
        
        tmp = extmerger.get_base_readonly_alignment().get_mutable_alignment()
        tmp.delete_all_gap()
        assert tmp.is_aligned(), "merged alignment should be aligned!"
        assert tmp.get_length() == tlen, "merged alignment has wrong length"
        assert all([alg[k] == s for (k,s) in tmp.items()]), "merged alignment should match original alignment"

        
        print "======= finished testExtendedAlignment ========="
Exemplo n.º 4
0
for dir in dirs:
  print "Working on %s\n" % dir
  aligned_files = glob.glob('%sFC_*/hmmalign.results.*' % dir)
  sequence_files = glob.glob('%sFC_*/hmmalign.frag.*' % dir)
  base_alignment_file = glob.glob('%s/*.fasta' % dir)
  base_alignment = MutableAlignment()
  done = base_alignment.read_filepath(base_alignment_file[0])
  subbackbone = original_backbone.get_soft_sub_alignment(base_alignment.get_sequence_names())
  frags = MutableAlignment()
  sequence_names = []
  for file in sequence_files:
    seq = MutableAlignment()
    done = seq.read_filepath(file)
    done = sequence_names.extend(seq.get_sequence_names())
    for name, seq in seq.iteritems():
      frags[name] = seq.upper()
  problem = SeppProblem(sequence_names)  
  problem.set_subalignment(subbackbone)

  mut_subalg = problem.subalignment.get_mutable_alignment()
  remaining_cols = mut_subalg.delete_all_gap()        
  problem.annotations["ref.alignment.columns"] = remaining_cols
  problem.fragments = frags
  ap_alg = problem.read_extendend_alignment_and_relabel_columns\
                          (base_alignment_file, aligned_files)
  extendedAlignment.merge_in(ap_alg,convert_to_string=False)                        
                        
extendedAlignment.write_to_path("/projects/sate8/namphuon/ultra_large/1000000/upp_100_10_new/upp.unmasked.fasta")
extendedAlignment.remove_insertion_columns()
extendedAlignment.write_to_path("/projects/sate8/namphuon/ultra_large/1000000/upp_100_10_new/upp.masked.fasta")
Exemplo n.º 5
0
    def build_subproblems(self):
        (alignment, tree) = self.read_alignment_and_tree()
        assert isinstance(tree, PhylogeneticTree)
        assert isinstance(alignment, MutableAlignment)

        tree.get_tree().resolve_polytomies()
        # Label edges with numbers so that we could assemble things back
        # at the end
        tree.lable_edges()        

        ''' Make sure size values are set, and are meaningful. '''
        self.check_and_set_sizes(alignment.get_num_taxa())        

        self._create_root_problem(tree, alignment)

        ''' Decompose the tree based on placement subsets'''
        placement_tree_map = PhylogeneticTree(Tree(tree.den_tree)).decompose_tree(
                                        self.options.placement_size, 
                                        strategy=self.strategy, 
                                        minSize = self.minsubsetsize,
                                        tree_map = {})
        assert len(placement_tree_map) > 0, ("Tree could not be decomposed"
                " given the following settings; strategy:%s minsubsetsize:%s placement_size:%s" 
                %(self.strategy, self.minsubsetsize, self.options.placement_size))                    
        _LOG.info("Breaking into %d placement subsets." %len(placement_tree_map))

        ''' For placement subsets create a placement subproblem, and decompose further'''
        for (p_key,p_tree) in placement_tree_map.items():
            assert isinstance(p_tree, PhylogeneticTree)
            placement_problem  = SeppProblem(p_tree.leaf_node_names(), self.root_problem)
            placement_problem.subtree = p_tree
            placement_problem.label = "P_%s" %str(p_key)
            _LOG.debug("Placement subset %s has %d nodes" %(placement_problem.label,len(p_tree.leaf_node_names())))
            ''' Further decompose to alignment subsets '''
            alignment_tree_map = PhylogeneticTree(Tree(p_tree.den_tree)).decompose_tree(
                                        self.options.alignment_size, 
                                        strategy=self.strategy, 
                                        minSize = self.minsubsetsize,
                                        tree_map = {}, decomp_strategy = self.options.decomp_strategy)
            assert len(alignment_tree_map) > 0, ("Tree could not be decomposed"
            " given the following settings; strategy:%s minsubsetsize:%s alignmet_size:%s" 
            %(self.strategy, self.minsubsetsize, self.options.alignment_size))
                        
            _LOG.debug("Placement subset %s has %d alignment subsets: %s" %(placement_problem.label,len(alignment_tree_map.keys()),str(sorted(alignment_tree_map.keys()))))
            _LOG.debug("Placement subset %s has %d taxa:" %(placement_problem.label,sum([len(a_tree.leaf_node_names()) for a_tree in alignment_tree_map.values()])))
            for (a_key, a_tree) in alignment_tree_map.items():
                assert isinstance(a_tree, PhylogeneticTree)  
                self.modify_tree(a_tree)
                alignment_problem  = SeppProblem(a_tree.leaf_node_names(), 
                                                  placement_problem)
                alignment_problem.subtree = a_tree
                alignment_problem.label = "A_%s_%s" %(str(p_key),str(a_key))                                                       
        
        ''' Divide fragments into chunks, to help achieve better parallelism'''
        fragment_chunk_files = self.create_fragment_files()                
        for alignment_problem in self.root_problem.iter_leaves():       
            for afc in xrange(0,len(fragment_chunk_files)):
                frag_chunk_problem  = SeppProblem(alignment_problem.taxa, 
                                              alignment_problem)
                frag_chunk_problem.subtree = alignment_problem.subtree
                frag_chunk_problem.label = alignment_problem.label.replace("A_", "FC_") + "_" +str(afc)
                frag_chunk_problem.fragments = fragment_chunk_files[afc]
                    
        _LOG.info("Breaking into %d alignment subsets." %(len(list(self.root_problem.iter_leaves()))))    
        _LOG.info("Breaking each alignment subset into %d fragment chunks." %len(fragment_chunk_files))
        _LOG.info("Subproblem structure: %s" %str(self.root_problem))
        return self.root_problem