def setUp(self):
    """Create the named and unnamed sequence/alignment fixtures."""
    degen_gapped = RNA.Alphabets.DegenGapped

    # Sequences carrying explicit names.
    self.rna1 = RnaSequence('UCAGGG', Name='rna1')
    self.rna2 = RnaSequence('YCU-RG', Name='rna2')
    self.rna3 = RnaSequence('CAA-NR', Name='rna3')
    self.model1 = ModelSequence('UCAGGG', Name='rna1',
                                Alphabet=degen_gapped)
    self.model2 = ModelSequence('YCU-RG', Name='rna2',
                                Alphabet=degen_gapped)
    self.model3 = ModelSequence('CAA-NR', Name='rna3',
                                Alphabet=degen_gapped)
    named_rnas = [self.rna1, self.rna2, self.rna3]
    named_models = [self.model1, self.model2, self.model3]
    self.aln = Alignment(named_rnas, MolType=RNA)
    self.da = DenseAlignment(named_models, MolType=RNA,
                             Alphabet=degen_gapped)

    # The same sequences again, this time without names.
    self.nn_rna1 = RnaSequence('UCAGGG')
    self.nn_rna2 = RnaSequence('YCU-RG')
    self.nn_rna3 = RnaSequence('CAA-NR')
    self.nn_model1 = ModelSequence('UCAGGG', Alphabet=degen_gapped)
    self.nn_model2 = ModelSequence('YCU-RG', Alphabet=degen_gapped)
    self.nn_model3 = ModelSequence('CAA-NR', Alphabet=degen_gapped)
    unnamed_rnas = [self.nn_rna1, self.nn_rna2, self.nn_rna3]
    unnamed_models = [self.nn_model1, self.nn_model2, self.nn_model3]
    self.nn_aln = Alignment(unnamed_rnas, MolType=RNA)
    self.nn_da = DenseAlignment(unnamed_models, MolType=RNA,
                                Alphabet=degen_gapped)
def remove_outliers(seqs, num_sigmas, fraction_seqs_for_stats=.95):
    """Remove sequences very different from the majority consensus.

    Given aligned seqs, a majority consensus is calculated (most common
    symbol at each position of the alignment), together with the edit
    distance of each seq to that consensus.  Any seq whose distance is
    above the cutoff ``mean + num_sigmas * stddev`` is removed.

    When calculating the mean and stddev of the edit distance, only the
    best (lowest-distance) ``fraction_seqs_for_stats`` of the seqs are
    used.

    seqs: must be compatible with DenseAlignment, because
        ``DenseAlignment(data=seqs, MolType=DNA)`` is called on it.
    num_sigmas: number of standard deviations above the mean that a
        sequence may lie and still be kept.
    fraction_seqs_for_stats: fraction of the seqs used for the
        mean/stddev statistics.

    Returns the result of ``aln.getSubAlignment`` for the kept indices.
    """
    aln = DenseAlignment(data=seqs, MolType=DNA)
    cons = DenseAlignment(data=aln.majorityConsensus(), MolType=DNA)
    # True wherever an alignment symbol differs from the consensus symbol
    diff_mtx = cons.SeqData[:, 0] != aln.SeqData

    # consider only a fraction of seqs for mean, std
    seq_diffs = diff_mtx.sum(1)
    # round() yields a float on Python 2, and a float is not a valid
    # slice bound for a numpy array, so force an int
    num_to_consider = int(round(len(seq_diffs) * fraction_seqs_for_stats))
    seq_diffs_considered_sorted = \
        seq_diffs[seq_diffs.argsort()[:num_to_consider]]

    # mean + e.g.: 4 sigma
    diff_cutoff = seq_diffs_considered_sorted.mean() + \
        num_sigmas * seq_diffs_considered_sorted.std()

    seq_idxs_to_keep = numpy.arange(len(seq_diffs))[seq_diffs <= diff_cutoff]
    filtered_aln = aln.getSubAlignment(seq_idxs_to_keep)
    return filtered_aln
def test_subset_positions_DenseAlignment(self):
    """Column subsets of a DenseAlignment carry the expected data."""
    degen_gapped = RNA.Alphabets.DegenGapped
    expected_seqs = [
        ModelSequence('UCG', Name='rna1', Alphabet=degen_gapped),
        ModelSequence('YCG', Name='rna2', Alphabet=degen_gapped),
        ModelSequence('CAR', Name='rna3', Alphabet=degen_gapped),
    ]
    sub_da = DenseAlignment(expected_seqs, MolType=RNA,
                            Alphabet=degen_gapped)

    full_data = array([[0, 1, 2, 3, 3, 3],
                       [15, 1, 0, 4, 12, 3],
                       [1, 2, 2, 4, 10, 12]])
    sub_data = array([[0, 1, 3],
                      [15, 1, 3],
                      [1, 2, 12]])

    # Sanity-check the raw arrays of the fixture and of the expectation.
    self.assertEqual(self.da.ArraySeqs, full_data)
    self.assertEqual(self.da.ArrayPositions, transpose(full_data))
    self.assertEqual(sub_da.ArraySeqs, sub_data)
    self.assertEqual(sub_da.ArrayPositions, transpose(sub_data))

    obs_sub_da_TP = self.da.takePositions([0, 1, 5])
    obs_sub_da_SA = self.da.getSubAlignment(pos=[0, 1, 5])

    # getSubAlignment: both the object comparison and the data are right.
    self.assertEqual(obs_sub_da_SA, sub_da)
    self.assertNotEqual(obs_sub_da_SA, self.da)
    self.assertEqual(obs_sub_da_SA.ArraySeqs, sub_data)
    self.assertEqual(obs_sub_da_SA.ArrayPositions, transpose(sub_data))

    # takePositions: the original author wondered why the object
    # comparison would pass if the underlying data did not match,
    # so the data is checked explicitly as well.
    self.assertEqual(obs_sub_da_TP, sub_da)
    self.assertNotEqual(obs_sub_da_TP, self.da)
    self.assertEqual(obs_sub_da_TP.ArraySeqs, sub_data)
    self.assertEqual(obs_sub_da_TP.ArrayPositions, transpose(sub_data))
def test_aln_equality(self):
    """Pin down when two DenseAlignments compare equal."""
    # When does something compare equal?
    self.assertEqual(self.da == self.da, True)

    # one sequence less
    other_da1 = DenseAlignment([self.model1, self.model2],
                               MolType=RNA,
                               Alphabet=RNA.Alphabets.DegenGapped)
    self.assertEqual(self.da == other_da1, False)

    # seqs in different order -- doesn't matter
    other_da2 = DenseAlignment([self.model1, self.model3, self.model2],
                               MolType=RNA,
                               Alphabet=RNA.Alphabets.DegenGapped)
    self.assertEqual(self.da == other_da2, True)

    # seqs in different encoding -- doesn't matter, only looks at data
    other_da3 = DenseAlignment([self.model1, self.model2, self.model3])
    # Should this compare False even though the data is exactly the same?
    # The MolType is different...
    self.assertEqual(self.da == other_da3, True)
    # map() is lazy on Python 3; without list() alltrue would receive a
    # single (always truthy) iterator object and the check would be vacuous.
    # assertTrue is used instead of a bare assert so the check survives -O.
    self.assertTrue(alltrue(
        list(map(alltrue, self.da.ArraySeqs == other_da3.ArraySeqs))))
def test_DenseAlignment_without_moltype(self):
    """Expect MolType to be picked up from the sequences."""
    degen_gapped = RNA.Alphabets.DegenGapped
    first = ModelSequence('UCAG', Alphabet=degen_gapped, Name='rna1')
    second = ModelSequence('CCCR', Alphabet=degen_gapped, Name='rna2')
    da = DenseAlignment([first, second])

    # FASTA text: one header line and one sequence line per record,
    # terminated by a final newline.
    expected = '\n'.join(['>rna1', 'UCAG', '>rna2', 'CCCR']) + '\n'
    self.assertEqual(str(da), expected)
def setUp(self):
    """Initialize the alphabets and alignments shared by the tests."""
    self.canonical_abbrevs = 'ACDEFGHIKLMNPQRSTVWY'
    self.ambiguous_abbrevs = 'BXZ'

    # Alphabet definition that collapses every abbreviation onto 'A'.
    every_abbrev = self.canonical_abbrevs + self.ambiguous_abbrevs
    self.all_to_a = [('A', every_abbrev)]

    self.charge_2 = alphabets['charge_2']
    self.hydropathy_3 = alphabets['hydropathy_3']
    self.orig = alphabets['orig']

    # The same three sequences, once as DenseAlignment and once via LoadSeqs.
    self.aln = DenseAlignment(
        data={'1': 'CDDFBXZ', '2': 'CDD-BXZ', '3': 'AAAASS-'})
    self.aln2 = LoadSeqs(
        data={'1': 'CDDFBXZ', '2': 'CDD-BXZ', '3': 'AAAASS-'})
def test_recode_dense_alignment(self):
    """recode_dense_alignment: recode alignment to charge_2 alpha works
    """
    expected_c2 = DenseAlignment(
        data={'1': 'AKKAKAK', '2': 'AKK-KAK', '3': 'AAAAAA-'})
    expected_h3 = DenseAlignment(
        data={'1': 'PRRPRPR', '2': 'PRR-RPR', '3': 'PPPPYY-'})
    expected_aa = DenseAlignment(
        data={'1': 'AAAAAAA', '2': 'AAA-AAA', '3': 'AAAAAA-'})

    # Each case pairs the keyword arguments with the expected recoding;
    # alphabets are given both by id and by explicit definition.
    cases = [
        ({'alphabet_id': 'charge_2'}, expected_c2),
        ({'alphabet_def': self.charge_2}, expected_c2),
        ({'alphabet_id': 'hydropathy_3'}, expected_h3),
        ({'alphabet_def': self.hydropathy_3}, expected_h3),
        ({'alphabet_def': self.all_to_a}, expected_aa),
    ]
    for kwargs, expected_aln in cases:
        recoded = recode_dense_alignment(self.aln, **kwargs)
        self.assertEqual(recoded, expected_aln)

    # original characters which aren't remapped are left in original state
    untouched = recode_dense_alignment(self.aln,
                                       alphabet_def=[('a', 'b')])
    self.assertEqual(untouched, self.aln)

    # non-alphabetic character mapped same as alphabetic characters
    dotted = recode_dense_alignment(self.aln, alphabet_def=[('.', '-')])
    expected_dotted = DenseAlignment(
        data={'1': 'CDDFBXZ', '2': 'CDD.BXZ', '3': 'AAAASS.'})
    self.assertEqual(dotted, expected_dotted)
def test_subset_seqs_DenseAlignment(self):
    """takeSeqs by name and getSubAlignment by index should agree."""
    degen_gapped = RNA.Alphabets.DegenGapped
    short_models = [
        ModelSequence('UCG', Name='rna1', Alphabet=degen_gapped),
        ModelSequence('YCG', Name='rna2', Alphabet=degen_gapped),
        ModelSequence('CAR', Name='rna3', Alphabet=degen_gapped),
    ]
    # Built but not asserted against; construction is still exercised.
    sub_da = DenseAlignment(short_models, MolType=RNA,
                            Alphabet=degen_gapped)

    # Selecting by name should have the same effect as selecting by
    # sequence index.
    by_name = self.da.takeSeqs(['rna1'])
    by_index = self.da.getSubAlignment(seqs=[0])

    # These two are now the same. Fixed mapping of key to char array.
    self.assertEqual(by_name, by_index)
    self.assertEqual(str(by_name), str(by_index))
def recode_dense_alignment(aln, alphabet_id=None, alphabet_def=None):
    """Return new DenseAlignment recoded in the provided reduced-state alphabet

    aln: the DenseAlignment object to be recoded
    alphabet_id: string identifying an alphabet in
        cogent.util.recode_alignment.alphabets.
        (See cogent.util.recode_alignment.alphabets.keys()
        for valid alphabet_ids.)
    alphabet_def: list of two-element tuples where first element is
        the new alphabet character and the second element is an iterable
        object containing the old alphabet chars which should be mapped to
        the new char.
        e.g., [('A','CVILFMWAGSTPYH'),('B','QNDERKBZ')]
        (See cogent.util.recode_alignment.alphabets.values()
        for more examples.)

    Note: either alphabet_id OR alphabet_def must be passed. Either
    provide the alphabet, or have it looked up. If both are provided
    the alphabet_id is ignored.

    Returns a new DenseAlignment built from the recoded array, a copy of
    aln.Names and aln.MolType.
    """
    # Construct a dict mapping each character of the alignment's alphabet
    # to its integer index. It is used to look up the indices of the
    # old and new characters below.
    byte_map = dict(zip(aln.Alphabet, range(len(aln.Alphabet))))

    # Construct a dict mapping old characters to new characters.
    alphabet_map = build_alphabet_map(alphabet_id=alphabet_id,
                                      alphabet_def=alphabet_def)

    # Create the recoded version of seqs.Alphabet. It starts out as the
    # identity mapping and must be a real list: a Python 3 range object
    # does not support item assignment.
    new_indices = list(range(len(aln.Alphabet)))
    for old, new in alphabet_map.items():
        new_indices[byte_map[old]] = byte_map[new]

    # Map the old alphabet onto the new alphabet. Note: characters
    # that are not mapped keep their original index, i.e. are left
    # unchanged. Returns a new DenseAlignment.
    return DenseAlignment(take(new_indices, aln.ArraySeqs).transpose(),
                          aln.Names[:], MolType=aln.MolType)
def main():
    """Insert query sequences into a reference tree and write the result.

    Reads the command-line options, loads the query alignment, prepares
    the tool-specific parameters for the selected insertion method
    ('raxml_v730', 'pplacer' or 'parsinsert'), runs that tool and writes
    the resulting tree to '<output_dir>/<method>_final_placement.tre'.

    Raises IOError when pplacer is selected without a stats file.
    """
    option_parser, opts, args = \
        parse_command_line_parameters(**script_info)

    parameters = {}
    # Stays empty when no parameters file is given, so the per-tool
    # lookups below fall back to {} instead of hitting an unbound name.
    param_dict = {}

    # get the tree insertion method to use
    module = opts.insertion_method

    # create output directory
    output_dir = opts.output_dir
    create_dir(output_dir)

    # list of tree insertion methods
    tree_insertion_module_names = {
        'raxml_v730': brokit.raxml_v730,
        'parsinsert': brokit.parsinsert,
        'pplacer': brokit.pplacer,
    }

    # load input sequences and convert to phylip since the tools require
    # the query sequences to have phylip-compliant names
    # NOTE(review): mode 'U' is kept as in the original; it is rejected by
    # Python 3.11+ -- switch to 'r' if this script is ever ported.
    with open(opts.input_fasta_fp, 'U') as fasta_f:
        aln = DenseAlignment(parse_fasta(fasta_f))
    seqs, align_map = aln.toPhylip()

    if opts.method_params_fp:
        with open(opts.method_params_fp, 'U') as params_f:
            param_dict = parse_qiime_parameters(params_f)

    if module == 'raxml_v730':
        # load the reference sequences
        with open(opts.refseq_fp, 'U') as ref_f:
            load_ref_aln = DenseAlignment(parse_fasta(ref_f))

        # combine and load the reference plus query
        combined_aln = parse_fasta(
            StringIO(load_ref_aln.toFasta() + '\n' + aln.toFasta()))

        # overwrite the alignment map
        aln = DenseAlignment(combined_aln)
        seqs, align_map = aln.toPhylip()

        try:
            parameters = param_dict['raxml']
        except KeyError:
            parameters = {}

        tree = convert_tree_tips(align_map, opts.starting_tree_fp)

        # write out the tree with phylip labels
        updated_tree_fp = join(output_dir,
                               '%s_phylip_named_tree.tre' % (module))
        write_updated_tree_file(updated_tree_fp, tree)

        # set the primary parameters for raxml
        parameters['-w'] = abspath(output_dir) + '/'
        parameters["-n"] = split(splitext(get_tmp_filename())[0])[-1]
        parameters["-t"] = updated_tree_fp

        # user-supplied values for -f and -m take precedence
        if "-f" not in parameters:
            parameters["-f"] = 'v'
        if "-m" not in parameters:
            parameters["-m"] = 'GTRGAMMA'

    elif module == 'pplacer':
        try:
            parameters = param_dict['pplacer']
        except KeyError:
            parameters = {}

        # make sure stats file is passed
        if not opts.stats_fp:
            raise IOError(
                'When using pplacer, the RAxML produced info file is required.')

        # set the primary parameters for pplacer - allow for user-defined
        parameters['--out-dir'] = abspath(output_dir) + '/'
        parameters["-t"] = opts.starting_tree_fp
        parameters['-r'] = opts.refseq_fp
        parameters['-s'] = opts.stats_fp

    elif module == 'parsinsert':
        try:
            parameters = param_dict['parsinsert']
        except KeyError:
            parameters = {}

        # define log fp
        log_fp = join(output_dir, 'parsinsert.log')

        # define tax assignment values fp
        tax_assign_fp = join(output_dir, 'parsinsert_assignments.log')
        parameters["-l"] = log_fp
        parameters["-o"] = tax_assign_fp
        parameters["-s"] = opts.refseq_fp
        parameters["-t"] = opts.starting_tree_fp

    # call the module and return a tree object
    result = tree_insertion_module_names[module].insert_sequences_into_tree(
        seqs, moltype=DNA, params=parameters)

    # NOTE(review): result_tree is never used and the file below is written
    # from `result`; presumably the helper relabels `result` in place --
    # confirm against strip_and_rename_unwanted_labels_from_tree.
    result_tree = strip_and_rename_unwanted_labels_from_tree(align_map, result)

    # write out the resulting tree
    final_tree = join(output_dir, '%s_final_placement.tre' % (module))
    write_updated_tree_file(final_tree, result)
class AllTests(TestCase):
    """Behaviour checks for Alignment, DenseAlignment and ModelSequence."""

    def setUp(self):
        """Create the named and unnamed sequence/alignment fixtures."""
        degen_gapped = RNA.Alphabets.DegenGapped

        # Sequences carrying explicit names.
        self.rna1 = RnaSequence('UCAGGG', Name='rna1')
        self.rna2 = RnaSequence('YCU-RG', Name='rna2')
        self.rna3 = RnaSequence('CAA-NR', Name='rna3')
        self.model1 = ModelSequence('UCAGGG', Name='rna1',
                                    Alphabet=degen_gapped)
        self.model2 = ModelSequence('YCU-RG', Name='rna2',
                                    Alphabet=degen_gapped)
        self.model3 = ModelSequence('CAA-NR', Name='rna3',
                                    Alphabet=degen_gapped)
        named_rnas = [self.rna1, self.rna2, self.rna3]
        named_models = [self.model1, self.model2, self.model3]
        self.aln = Alignment(named_rnas, MolType=RNA)
        self.da = DenseAlignment(named_models, MolType=RNA,
                                 Alphabet=degen_gapped)

        # The same sequences again, this time without names.
        self.nn_rna1 = RnaSequence('UCAGGG')
        self.nn_rna2 = RnaSequence('YCU-RG')
        self.nn_rna3 = RnaSequence('CAA-NR')
        self.nn_model1 = ModelSequence('UCAGGG', Alphabet=degen_gapped)
        self.nn_model2 = ModelSequence('YCU-RG', Alphabet=degen_gapped)
        self.nn_model3 = ModelSequence('CAA-NR', Alphabet=degen_gapped)
        unnamed_rnas = [self.nn_rna1, self.nn_rna2, self.nn_rna3]
        unnamed_models = [self.nn_model1, self.nn_model2, self.nn_model3]
        self.nn_aln = Alignment(unnamed_rnas, MolType=RNA)
        self.nn_da = DenseAlignment(unnamed_models, MolType=RNA,
                                    Alphabet=degen_gapped)

    def test_printing_named_seqs(self):
        """Printing named seqs should work the same on Aln and DenseAln"""
        # The trailing newline is intentional: each FASTA-format record
        # should be separated from the next.
        records = ['>rna1', 'UCAGGG', '>rna2', 'YCU-RG', '>rna3', 'CAA-NR']
        expected = '\n'.join(records) + '\n'
        self.assertEqual(str(self.aln), expected)
        self.assertEqual(str(self.da), expected)

    def test_printing_unnamed_seqs(self):
        """Printing unnamed sequences should work the same on Aln and DenseAln
        """
        records = ['>seq_0', 'UCAGGG', '>seq_1', 'YCU-RG', '>seq_2',
                   'CAA-NR\n']
        expected = '\n'.join(records)
        self.assertEqual(str(self.nn_aln), expected)
        self.assertEqual(str(self.nn_da), expected)

    def test_DenseAlignment_without_moltype(self):
        """Expect MolType to be picked up from the sequences."""
        degen_gapped = RNA.Alphabets.DegenGapped
        first = ModelSequence('UCAG', Alphabet=degen_gapped, Name='rna1')
        second = ModelSequence('CCCR', Alphabet=degen_gapped, Name='rna2')
        da = DenseAlignment([first, second])
        expected = '\n'.join(['>rna1', 'UCAG', '>rna2', 'CCCR']) + '\n'
        self.assertEqual(str(da), expected)

    def test_names(self):
        """Names should be reported identically by both alignment types."""
        named = ['rna1', 'rna2', 'rna3']
        self.assertEqual(self.aln.Names, named)
        self.assertEqual(self.da.Names, named)
        # On unnamed sequences the behavior is now the same.
        generated = ['seq_0', 'seq_1', 'seq_2']
        self.assertEqual(self.nn_aln.Names, generated)
        self.assertEqual(self.nn_da.Names, generated)

    def test_seqFreqs(self):
        """seqFreqs should work the same on Alignment and DenseAlignment"""
        # Used alphabet: ('U', 'C', 'A', 'G', '-', 'B', 'D', 'H',
        #                 'K', 'M', 'N', 'S', 'R', 'W', 'V', 'Y')
        expected_counts = [
            [1, 1, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0],
            [0, 1, 2, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0],
        ]
        # This works
        self.assertEqual(self.da.getSeqFreqs().Data, expected_counts)
        # This used to raise an error, but now works
        self.assertEqual(self.aln.getSeqFreqs().Data, expected_counts)

    def test_subset_positions_DenseAlignment(self):
        """Column subsets of a DenseAlignment carry the expected data."""
        degen_gapped = RNA.Alphabets.DegenGapped
        expected_seqs = [
            ModelSequence('UCG', Name='rna1', Alphabet=degen_gapped),
            ModelSequence('YCG', Name='rna2', Alphabet=degen_gapped),
            ModelSequence('CAR', Name='rna3', Alphabet=degen_gapped),
        ]
        sub_da = DenseAlignment(expected_seqs, MolType=RNA,
                                Alphabet=degen_gapped)

        full_data = array([[0, 1, 2, 3, 3, 3],
                           [15, 1, 0, 4, 12, 3],
                           [1, 2, 2, 4, 10, 12]])
        sub_data = array([[0, 1, 3],
                          [15, 1, 3],
                          [1, 2, 12]])

        # First check some data
        self.assertEqual(self.da.ArraySeqs, full_data)
        self.assertEqual(self.da.ArrayPositions, transpose(full_data))
        self.assertEqual(sub_da.ArraySeqs, sub_data)
        self.assertEqual(sub_da.ArrayPositions, transpose(sub_data))

        obs_sub_da_TP = self.da.takePositions([0, 1, 5])
        obs_sub_da_SA = self.da.getSubAlignment(pos=[0, 1, 5])

        # When using the getSubAlignment method the data is right
        self.assertEqual(obs_sub_da_SA, sub_da)
        self.assertNotEqual(obs_sub_da_SA, self.da)
        self.assertEqual(obs_sub_da_SA.ArraySeqs, sub_data)
        self.assertEqual(obs_sub_da_SA.ArrayPositions, transpose(sub_data))

        # For the takePositions method: why would the object comparison
        # work if the data doesn't match? Check the data explicitly too.
        self.assertEqual(obs_sub_da_TP, sub_da)
        self.assertNotEqual(obs_sub_da_TP, self.da)
        self.assertEqual(obs_sub_da_TP.ArraySeqs, sub_data)
        self.assertEqual(obs_sub_da_TP.ArrayPositions, transpose(sub_data))
        # Shouldn't the __eq__ method check the data at least?

    def test_subset_positions_Alignment(self):
        """Column subsets of an Alignment match a hand-built alignment."""
        short_rnas = [
            RnaSequence('UCG', Name='rna1'),
            RnaSequence('YCG', Name='rna2'),
            RnaSequence('CAR', Name='rna3'),
        ]
        sub_aln = Alignment(short_rnas, MolType=RNA)

        obs_sub_aln = self.aln.takePositions([0, 1, 5])
        self.assertEqual(obs_sub_aln, sub_aln)
        self.assertNotEqual(obs_sub_aln, self.aln)
        # string representations should be the same. This fails right
        # now, because sequence order is not maintained. See separate test.
        self.assertEqual(str(obs_sub_aln), str(sub_aln))

    def test_takePositions_sequence_order(self):
        """Alignment takePositions should maintain seq order"""
        names_in_order = ['rna1', 'rna2', 'rna3']

        # This works
        self.assertEqual(self.da.Names, names_in_order)
        sub_da = self.da.getSubAlignment(pos=[0, 1, 5])
        self.assertEqual(sub_da.Names, names_in_order)

        # seq order not maintained in Alignment
        self.assertEqual(self.aln.Names, names_in_order)
        sub_aln = self.aln.takePositions([0, 1, 5])
        self.assertEqual(sub_aln.Names, names_in_order)

    def test_subset_seqs_Alignment(self):
        """takeSeqs on an Alignment selects the requested sequences."""
        rna1 = RnaSequence('UCG', Name='rna1')
        rna2 = RnaSequence('YCG', Name='rna2')
        rna3 = RnaSequence('CAR', Name='rna3')
        sub_aln = Alignment([rna2, rna3], MolType=RNA)
        whole_aln = Alignment([rna1, rna2, rna3], MolType=RNA)

        obs_sub_aln = whole_aln.takeSeqs(['rna2', 'rna3'])
        self.assertEqual(obs_sub_aln, sub_aln)
        self.assertEqual(str(obs_sub_aln), str(sub_aln))

        # Selected sequences should be in specified order?
        reversed_pick = self.aln.takeSeqs(['rna3', 'rna2'])
        forward_pick = self.aln.takeSeqs(['rna2', 'rna3'])
        self.assertNotEqual(str(reversed_pick), str(forward_pick))

    def test_subset_seqs_DenseAlignment(self):
        """takeSeqs by name and getSubAlignment by index should agree."""
        degen_gapped = RNA.Alphabets.DegenGapped
        short_models = [
            ModelSequence('UCG', Name='rna1', Alphabet=degen_gapped),
            ModelSequence('YCG', Name='rna2', Alphabet=degen_gapped),
            ModelSequence('CAR', Name='rna3', Alphabet=degen_gapped),
        ]
        # Built but not asserted against; construction is still exercised.
        sub_da = DenseAlignment(short_models, MolType=RNA,
                                Alphabet=degen_gapped)

        # takeSeqs by name should have the same effect as
        # getSubAlignment by seq idx?
        by_name = self.da.takeSeqs(['rna1'])
        by_index = self.da.getSubAlignment(seqs=[0])

        # These two are now the same. Fixed mapping of key to char array.
        self.assertEqual(by_name, by_index)
        self.assertEqual(str(by_name), str(by_index))

    def test_aln_equality(self):
        """Pin down when two DenseAlignments compare equal."""
        degen_gapped = RNA.Alphabets.DegenGapped

        # When does something compare equal?
        self.assertEqual(self.da == self.da, True)

        # one sequence less
        other_da1 = DenseAlignment([self.model1, self.model2],
                                   MolType=RNA, Alphabet=degen_gapped)
        self.assertEqual(self.da == other_da1, False)

        # seqs in different order -- doesn't matter
        other_da2 = DenseAlignment(
            [self.model1, self.model3, self.model2],
            MolType=RNA, Alphabet=degen_gapped)
        self.assertEqual(self.da == other_da2, True)

        # seqs in different encoding -- doesn't matter, only looks at data
        other_da3 = DenseAlignment([self.model1, self.model2, self.model3])
        # Should this compare False even though the data is exactly the same?
        # The MolType is different...
        self.assertEqual(self.da == other_da3, True)
        row_matches = [alltrue(row)
                       for row in self.da.ArraySeqs == other_da3.ArraySeqs]
        assert alltrue(row_matches)

    def test_seq_equality(self):
        """Two identically built ModelSequences should compare equal."""
        degen_gapped = RNA.Alphabets.DegenGapped
        model1 = ModelSequence('UCG', Name='rna1', Alphabet=degen_gapped)
        model2 = ModelSequence('UCG', Name='rna1', Alphabet=degen_gapped)
        # Shouldn't the above two sequences be equal?
        self.assertEqual(model1, model2)
        # string comparison is True
        self.assertEqual(str(model1), str(model2))

    def test_seq_ungapping(self):
        """degap should strip gaps from Sequence and ModelSequence alike."""
        gapped = 'U-C-A-G-'
        rna1 = RnaSequence(gapped, Name='rna1')
        model1 = ModelSequence(gapped, Name='rna1',
                               Alphabet=RNA.Alphabets.DegenGapped)

        self.assertEqual(rna1, 'U-C-A-G-')
        self.assertEqual(rna1.degap(), 'UCAG')

        # check it produces the right string from the beginning
        self.assertEqual(str(model1), 'U-C-A-G-')
        self.assertEqual(model1._data, [0, 4, 1, 4, 2, 4, 3, 4])
        # ModelSequence should maybe have the same degap method as normal Seq
        self.assertEqual(str(model1.degap()), 'UCAG')

    def test_the_rest_of_ModelSequence(self):
        """The class ModelSequence has 14 methods, but only 2 unittests.
        You might want to add some tests there..."""
        # note: mostly these are tested in derived classes, for convenience.
        pass
class AllTests(TestCase):
    """Behaviour checks for Alignment, DenseAlignment and ModelSequence.

    Several tests carry open questions from the original author about how
    the two alignment classes should behave; those are kept as comments.
    """

    def setUp(self):
        """setUp method for all tests"""
        # named sequences
        self.rna1 = RnaSequence('UCAGGG', Name='rna1')
        self.rna2 = RnaSequence('YCU-RG', Name='rna2')
        self.rna3 = RnaSequence('CAA-NR', Name='rna3')
        self.model1 = ModelSequence('UCAGGG', Name='rna1',
                                    Alphabet=RNA.Alphabets.DegenGapped)
        self.model2 = ModelSequence('YCU-RG', Name='rna2',
                                    Alphabet=RNA.Alphabets.DegenGapped)
        self.model3 = ModelSequence('CAA-NR', Name='rna3',
                                    Alphabet=RNA.Alphabets.DegenGapped)

        self.aln = Alignment([self.rna1, self.rna2, self.rna3], MolType=RNA)
        self.da = DenseAlignment([self.model1, self.model2, self.model3],
                                 MolType=RNA,
                                 Alphabet=RNA.Alphabets.DegenGapped)

        # seqs no name
        self.nn_rna1 = RnaSequence('UCAGGG')
        self.nn_rna2 = RnaSequence('YCU-RG')
        self.nn_rna3 = RnaSequence('CAA-NR')

        self.nn_model1 = ModelSequence('UCAGGG',
                                       Alphabet=RNA.Alphabets.DegenGapped)
        self.nn_model2 = ModelSequence('YCU-RG',
                                       Alphabet=RNA.Alphabets.DegenGapped)
        self.nn_model3 = ModelSequence('CAA-NR',
                                       Alphabet=RNA.Alphabets.DegenGapped)

        self.nn_aln = Alignment([self.nn_rna1, self.nn_rna2, self.nn_rna3],
                                MolType=RNA)
        self.nn_da = DenseAlignment(
            [self.nn_model1, self.nn_model2, self.nn_model3],
            MolType=RNA, Alphabet=RNA.Alphabets.DegenGapped)

    def test_printing_named_seqs(self):
        """Printing named seqs should work the same on Aln and DenseAln"""
        # Note: the newline trailing each sequence is intentional, because
        # we want each FASTA-format record to be separated.
        exp_lines_general = ['>rna1', 'UCAGGG', '>rna2', 'YCU-RG',
                             '>rna3', 'CAA-NR']
        self.assertEqual(str(self.aln), '\n'.join(exp_lines_general) + '\n')
        self.assertEqual(str(self.da), '\n'.join(exp_lines_general) + '\n')

    def test_printing_unnamed_seqs(self):
        """Printing unnamed sequences should work the same on Aln and DenseAln
        """
        exp_lines_gen = ['>seq_0', 'UCAGGG', '>seq_1', 'YCU-RG',
                         '>seq_2', 'CAA-NR\n']
        self.assertEqual(str(self.nn_aln), '\n'.join(exp_lines_gen))
        self.assertEqual(str(self.nn_da), '\n'.join(exp_lines_gen))

    def test_DenseAlignment_without_moltype(self):
        """Expect MolType to be picked up from the sequences."""
        m1 = ModelSequence('UCAG', Alphabet=RNA.Alphabets.DegenGapped,
                           Name='rna1')
        m2 = ModelSequence('CCCR', Alphabet=RNA.Alphabets.DegenGapped,
                           Name='rna2')
        da = DenseAlignment([m1, m2])
        exp_lines = ['>rna1', 'UCAG', '>rna2', 'CCCR']
        self.assertEqual(str(da), '\n'.join(exp_lines) + '\n')

    def test_names(self):
        """Names should be reported identically by both alignment types."""
        # Should both alignments handle names the same way?
        self.assertEqual(self.aln.Names, ['rna1', 'rna2', 'rna3'])
        self.assertEqual(self.da.Names, ['rna1', 'rna2', 'rna3'])
        # On unnamed sequences the behavior is now the same.
        self.assertEqual(self.nn_aln.Names, ['seq_0', 'seq_1', 'seq_2'])
        self.assertEqual(self.nn_da.Names, ['seq_0', 'seq_1', 'seq_2'])

    def test_seqFreqs(self):
        """seqFreqs should work the same on Alignment and DenseAlignment"""
        # Used alphabet: ('U', 'C', 'A', 'G', '-', 'B', 'D', 'H',
        #                 'K', 'M', 'N', 'S', 'R', 'W', 'V', 'Y')
        exp = [[1, 1, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
               [1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0],
               [0, 1, 2, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0]]
        # This works
        self.assertEqual(self.da.getSeqFreqs().Data, exp)
        # This used to raise an error, but now works
        self.assertEqual(self.aln.getSeqFreqs().Data, exp)

    def test_subset_positions_DenseAlignment(self):
        """Column subsets of a DenseAlignment carry the expected data."""
        model1 = ModelSequence('UCG', Name='rna1',
                               Alphabet=RNA.Alphabets.DegenGapped)
        model2 = ModelSequence('YCG', Name='rna2',
                               Alphabet=RNA.Alphabets.DegenGapped)
        model3 = ModelSequence('CAR', Name='rna3',
                               Alphabet=RNA.Alphabets.DegenGapped)
        sub_da = DenseAlignment([model1, model2, model3],
                                MolType=RNA,
                                Alphabet=RNA.Alphabets.DegenGapped)

        full_data = array([[0, 1, 2, 3, 3, 3],
                           [15, 1, 0, 4, 12, 3],
                           [1, 2, 2, 4, 10, 12]])
        sub_data = array([[0, 1, 3], [15, 1, 3], [1, 2, 12]])

        # First check some data
        self.assertEqual(self.da.ArraySeqs, full_data)
        self.assertEqual(self.da.ArrayPositions, transpose(full_data))
        self.assertEqual(sub_da.ArraySeqs, sub_data)
        self.assertEqual(sub_da.ArrayPositions, transpose(sub_data))

        obs_sub_da_TP = self.da.takePositions([0, 1, 5])
        obs_sub_da_SA = self.da.getSubAlignment(pos=[0, 1, 5])

        # When using the getSubAlignment method the data is right
        self.assertEqual(obs_sub_da_SA, sub_da)
        # assertNotEqual replaces the long-deprecated failIfEqual alias
        self.assertNotEqual(obs_sub_da_SA, self.da)
        self.assertEqual(obs_sub_da_SA.ArraySeqs, sub_data)
        self.assertEqual(obs_sub_da_SA.ArrayPositions, transpose(sub_data))

        # For the takePositions method: Why does this work
        self.assertEqual(obs_sub_da_TP, sub_da)
        self.assertNotEqual(obs_sub_da_TP, self.da)
        # If the data doesn't match?
        self.assertEqual(obs_sub_da_TP.ArraySeqs, sub_data)
        self.assertEqual(obs_sub_da_TP.ArrayPositions, transpose(sub_data))
        # Shouldn't the __eq__ method check the data at least?

    def test_subset_positions_Alignment(self):
        """Column subsets of an Alignment match a hand-built alignment."""
        rna1 = RnaSequence('UCG', Name='rna1')
        rna2 = RnaSequence('YCG', Name='rna2')
        rna3 = RnaSequence('CAR', Name='rna3')

        sub_aln = Alignment([rna1, rna2, rna3], MolType=RNA)

        obs_sub_aln = self.aln.takePositions([0, 1, 5])
        self.assertEqual(obs_sub_aln, sub_aln)
        self.assertNotEqual(obs_sub_aln, self.aln)
        # string representations should be the same. This fails right
        # now, because sequence order is not maintained. See separate test.
        self.assertEqual(str(obs_sub_aln), str(sub_aln))

    def test_takePositions_sequence_order(self):
        """Alignment takePositions should maintain seq order"""
        # This works
        self.assertEqual(self.da.Names, ['rna1', 'rna2', 'rna3'])
        sub_da = self.da.getSubAlignment(pos=[0, 1, 5])
        self.assertEqual(sub_da.Names, ['rna1', 'rna2', 'rna3'])
        # seq order not maintained in Alignment
        self.assertEqual(self.aln.Names, ['rna1', 'rna2', 'rna3'])
        sub_aln = self.aln.takePositions([0, 1, 5])
        self.assertEqual(sub_aln.Names, ['rna1', 'rna2', 'rna3'])

    def test_subset_seqs_Alignment(self):
        """takeSeqs on an Alignment selects the requested sequences."""
        rna1 = RnaSequence('UCG', Name='rna1')
        rna2 = RnaSequence('YCG', Name='rna2')
        rna3 = RnaSequence('CAR', Name='rna3')

        sub_aln = Alignment([rna2, rna3], MolType=RNA)
        aln = Alignment([rna1, rna2, rna3], MolType=RNA)
        obs_sub_aln = aln.takeSeqs(['rna2', 'rna3'])

        self.assertEqual(obs_sub_aln, sub_aln)
        self.assertEqual(str(obs_sub_aln), str(sub_aln))

        # Selected sequences should be in specified order?
        obs_sub_aln_1 = self.aln.takeSeqs(['rna3', 'rna2'])
        obs_sub_aln_2 = self.aln.takeSeqs(['rna2', 'rna3'])
        self.assertNotEqual(str(obs_sub_aln_1), str(obs_sub_aln_2))

    def test_subset_seqs_DenseAlignment(self):
        """takeSeqs by name and getSubAlignment by index should agree."""
        model1 = ModelSequence('UCG', Name='rna1',
                               Alphabet=RNA.Alphabets.DegenGapped)
        model2 = ModelSequence('YCG', Name='rna2',
                               Alphabet=RNA.Alphabets.DegenGapped)
        model3 = ModelSequence('CAR', Name='rna3',
                               Alphabet=RNA.Alphabets.DegenGapped)
        sub_da = DenseAlignment([model1, model2, model3],
                                MolType=RNA,
                                Alphabet=RNA.Alphabets.DegenGapped)

        # takeSeqs by name should have the same effect as
        # getSubAlignment by seq idx?
        obs_sub_da_TS = self.da.takeSeqs(['rna1'])
        obs_sub_da_SA = self.da.getSubAlignment(seqs=[0])

        # These two are now the same. Fixed mapping of key to char array.
        self.assertEqual(obs_sub_da_TS, obs_sub_da_SA)
        self.assertEqual(str(obs_sub_da_TS), str(obs_sub_da_SA))

    def test_aln_equality(self):
        """Pin down when two DenseAlignments compare equal."""
        # When does something compare equal?
        self.assertEqual(self.da == self.da, True)
        # one sequence less
        other_da1 = DenseAlignment([self.model1, self.model2],
                                   MolType=RNA,
                                   Alphabet=RNA.Alphabets.DegenGapped)
        self.assertEqual(self.da == other_da1, False)
        # seqs in different order -- doesn't matter
        other_da2 = DenseAlignment([self.model1, self.model3, self.model2],
                                   MolType=RNA,
                                   Alphabet=RNA.Alphabets.DegenGapped)
        self.assertEqual(self.da == other_da2, True)
        # seqs in different encoding -- doesn't matter, only looks at data
        other_da3 = DenseAlignment([self.model1, self.model2, self.model3])
        # Should this compare False even though the data is exactly the same?
        # The MolType is different...
        self.assertEqual(self.da == other_da3, True)
        # map() is lazy on Python 3; without list() alltrue would receive a
        # single (always truthy) iterator object and the check would be
        # vacuous. assertTrue is used so the check also survives -O.
        self.assertTrue(alltrue(
            list(map(alltrue, self.da.ArraySeqs == other_da3.ArraySeqs))))

    def test_seq_equality(self):
        """Two identically built ModelSequences should compare equal."""
        model1 = ModelSequence('UCG', Name='rna1',
                               Alphabet=RNA.Alphabets.DegenGapped)
        model2 = ModelSequence('UCG', Name='rna1',
                               Alphabet=RNA.Alphabets.DegenGapped)
        # Shouldn't the above two sequences be equal?
        self.assertEqual(model1, model2)
        # string comparison is True
        self.assertEqual(str(model1), str(model2))

    def test_seq_ungapping(self):
        """degap should strip gaps from Sequence and ModelSequence alike."""
        rna1 = RnaSequence('U-C-A-G-', Name='rna1')
        model1 = ModelSequence('U-C-A-G-', Name='rna1',
                               Alphabet=RNA.Alphabets.DegenGapped)

        self.assertEqual(rna1, 'U-C-A-G-')
        self.assertEqual(rna1.degap(), 'UCAG')

        # check it produces the right string from the beginning
        self.assertEqual(str(model1), 'U-C-A-G-')
        self.assertEqual(model1._data, [0, 4, 1, 4, 2, 4, 3, 4])
        # ModelSequence should maybe have the same degap method as normal Seq
        self.assertEqual(str(model1.degap()), 'UCAG')

    def test_the_rest_of_ModelSequence(self):
        """The class ModelSequence has 14 methods, but only 2 unittests.
        You might want to add some tests there..."""
        # note: mostly these are tested in derived classes, for convenience.
        pass
def main():
    """Insert query sequences into a reference tree and write the result.

    Reads the command-line options, loads the query alignment, prepares
    the tool-specific parameters for the selected insertion method
    ('raxml_v730', 'pplacer' or 'parsinsert'), runs that tool and writes
    the resulting tree to '<output_dir>/<method>_final_placement.tre'.

    Raises IOError when pplacer is selected without a stats file.
    """
    option_parser, opts, args = \
        parse_command_line_parameters(**script_info)

    parameters = {}
    # Stays empty when no parameters file is given, so the per-tool
    # lookups below fall back to {} instead of hitting an unbound name.
    param_dict = {}

    # get the tree insertion method to use
    module = opts.insertion_method

    # create output directory
    output_dir = opts.output_dir
    create_dir(output_dir)

    # list of tree insertion methods
    tree_insertion_module_names = {
        'raxml_v730': cogent.app.raxml_v730,
        'parsinsert': cogent.app.parsinsert,
        'pplacer': cogent.app.pplacer,
    }

    # load input sequences and convert to phylip since the tools require
    # the query sequences to have phylip-compliant names
    with open(opts.input_fasta_fp, 'U') as fasta_f:
        aln = DenseAlignment(MinimalFastaParser(fasta_f))
    seqs, align_map = aln.toPhylip()

    if opts.method_params_fp:
        with open(opts.method_params_fp, 'U') as params_f:
            param_dict = parse_qiime_parameters(params_f)

    if module == 'raxml_v730':
        # load the reference sequences
        with open(opts.refseq_fp, 'U') as ref_f:
            load_ref_aln = DenseAlignment(MinimalFastaParser(ref_f))

        # combine and load the reference plus query
        combined_aln = MinimalFastaParser(
            StringIO(load_ref_aln.toFasta() + '\n' + aln.toFasta()))

        # overwrite the alignment map
        aln = DenseAlignment(combined_aln)
        seqs, align_map = aln.toPhylip()

        try:
            parameters = param_dict['raxml']
        except KeyError:
            parameters = {}

        tree = convert_tree_tips(align_map, opts.starting_tree_fp)

        # write out the tree with phylip labels
        updated_tree_fp = join(output_dir,
                               '%s_phylip_named_tree.tre' % (module))
        write_updated_tree_file(updated_tree_fp, tree)

        # set the primary parameters for raxml
        parameters['-w'] = abspath(output_dir) + '/'
        parameters["-n"] = split(splitext(get_tmp_filename())[0])[-1]
        parameters["-t"] = updated_tree_fp

        # user-supplied values for -f and -m take precedence
        if "-f" not in parameters:
            parameters["-f"] = 'v'
        if "-m" not in parameters:
            parameters["-m"] = 'GTRGAMMA'

    elif module == 'pplacer':
        try:
            parameters = param_dict['pplacer']
        except KeyError:
            parameters = {}

        # make sure stats file is passed
        if not opts.stats_fp:
            # call form of raise: valid on both Python 2 and Python 3
            raise IOError(
                'When using pplacer, the RAxML produced info file is required.')

        # set the primary parameters for pplacer - allow for user-defined
        parameters['--out-dir'] = abspath(output_dir) + '/'
        parameters["-t"] = opts.starting_tree_fp
        parameters['-r'] = opts.refseq_fp
        parameters['-s'] = opts.stats_fp

    elif module == 'parsinsert':
        try:
            parameters = param_dict['parsinsert']
        except KeyError:
            parameters = {}

        # define log fp
        log_fp = join(output_dir, 'parsinsert.log')

        # define tax assignment values fp
        tax_assign_fp = join(output_dir, 'parsinsert_assignments.log')
        parameters["-l"] = log_fp
        parameters["-o"] = tax_assign_fp
        parameters["-s"] = opts.refseq_fp
        parameters["-t"] = opts.starting_tree_fp

    # call the module and return a tree object
    result = tree_insertion_module_names[module].insert_sequences_into_tree(
        seqs, moltype=DNA, params=parameters)

    # NOTE(review): result_tree is never used and the file below is written
    # from `result`; presumably the helper relabels `result` in place --
    # confirm against strip_and_rename_unwanted_labels_from_tree.
    result_tree = strip_and_rename_unwanted_labels_from_tree(align_map, result)

    # write out the resulting tree
    final_tree = join(output_dir, '%s_final_placement.tre' % (module))
    write_updated_tree_file(final_tree, result)
def VOR(alignment, n=1000, force_monte_carlo=False, mc_threshold=1000):
    """Return sequence weights computed with the Voronoi weighting method.

    alignment: Alignment object
    n: sample size, passed to the pseudo sequence generator (relevant when
        monte carlo sampling is used)
    force_monte_carlo: always generate the pseudo sequences by monte carlo
        sampling, even when only a small number of unique pseudo sequences
        is possible
    mc_threshold: monte carlo sampling is used when the number of possible
        pseudo sequences exceeds this value

    VOR differs from VA in the set of sequences each alignment sequence is
    compared against: instead of the alignment's own sequences, a set of
    pseudo sequences is used.

    Generating discrete random sequences:
    A discrete random sequence is built by picking, at every position and
    with equal likelihood, one of the residues observed at that position in
    the alignment. A single occurrence in the alignment column is enough to
    make a residue an option. Note that the choice is uniform over the
    observed residues, independent of their frequency at that position.

    Earlier versions of the algorithm chose characters either at the
    frequency with which they occur at a position or at the frequency with
    which they occur in the database. Both trials were unsuccessful because
    they deviate from random sampling (see Sibbald & Argos 1990).

    Depending on the number of possible pseudo sequences, either all of them
    are used or a random sample is taken (monte carlo).

    Example:
    Alignment: AA, AA, BB

        AA      AA      BB
    AA  0 (.5)  0 (.5)  2
    AB  1 (1/3) 1 (1/3) 1 (1/3)
    BA  1 (1/3) 1 (1/3) 1 (1/3)
    BB  2       2       0 (1)
    -----------------------------
    total 7/6     7/6     10/6
    norm  .291    .291    .418

    For a bigger example with more pseudo sequences, see Henikoff 1994.

    The optimization described there (pre-calculating the distance to the
    closest sequence) was tried; it has no advantage over the original
    method.
    """
    # exhaustive enumeration unless forced or the search space is too large
    use_monte_carlo = force_monte_carlo or \
        number_of_pseudo_seqs(alignment) > mc_threshold
    if use_monte_carlo:
        make_pseudo_seqs = pseudo_seqs_monte_carlo
    else:
        make_pseudo_seqs = pseudo_seqs_exact

    # dense representation of the input; one running vote total per sequence
    dense_aln = DenseAlignment(alignment, MolType=BYTES)
    totals = zeros(len(dense_aln.Names), Float64)

    # character arrays of the real sequences, built once up front
    real_rows = [array(text, 'c') for text in map(str, dense_aln.Seqs)]

    for pseudo in make_pseudo_seqs(dense_aln, n=n):
        pseudo_chars = array(pseudo, 'c')
        distances = array(
            [hamming_distance(real, pseudo_chars) for real in real_rows])
        # turn the distances into votes and accumulate them
        totals += row_to_vote(distances)

    result = Weights(dict(zip(dense_aln.Names, totals)))
    result.normalize()
    return result
def main():
    """Insert query sequences into a starting tree and write the new tree.

    Reads the command-line options described by ``script_info``, assembles
    the parameter dict for the selected insertion method (``raxml_v730``,
    ``pplacer`` or ``parsinsert``), calls that method's
    ``insert_sequences_into_tree`` and writes the resulting tree to
    ``<output_dir>/<method>_final_placement.tre``.

    User-supplied parameters (from ``opts.method_params_fp``) are used as the
    starting point; the file paths set below always override them.

    Raises:
        IOError: if pplacer is selected and ``opts.stats_fp`` is not given.
        KeyError: if ``opts.insertion_method`` is not a key of the method
            table defined below.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    parameters = {}

    # get the tree insertion method to use
    module = opts.insertion_method

    # create output directory
    output_dir = opts.output_dir
    create_dir(output_dir)

    # list of tree insertion methods
    tree_insertion_module_names = {
        "raxml_v730": qiime.pycogent_backports.raxml_v730,
        "parsinsert": qiime.pycogent_backports.parsinsert,
        "pplacer": qiime.pycogent_backports.pplacer,
    }

    # load input sequences and convert to phylip since the tools require
    # the query sequences to have phylip-compliant names
    # NOTE(review): closing the handle here assumes DenseAlignment consumes
    # the parser completely inside its constructor -- confirm.
    with open(opts.input_fasta_fp, "U") as query_f:
        aln = DenseAlignment(MinimalFastaParser(query_f))
    seqs, align_map = aln.toPhylip()

    # Always bind param_dict so the per-method lookups below can only fail
    # with KeyError (previously a bare except hid a NameError here).
    param_dict = {}
    if opts.method_params_fp:
        with open(opts.method_params_fp, "U") as params_f:
            param_dict = parse_qiime_parameters(params_f)

    if module == "raxml_v730":
        # load the reference sequences
        with open(opts.refseq_fp, "U") as ref_f:
            load_ref_aln = DenseAlignment(MinimalFastaParser(ref_f))

        # combine and load the reference plus query
        combined_aln = MinimalFastaParser(
            StringIO(load_ref_aln.toFasta() + "\n" + aln.toFasta()))

        # overwrite the alignment map
        aln = DenseAlignment(combined_aln)
        seqs, align_map = aln.toPhylip()

        try:
            parameters = param_dict["raxml"]
        except KeyError:
            parameters = {}

        tree = convert_tree_tips(align_map, opts.starting_tree_fp)

        # write out the tree with phylip labels
        updated_tree_fp = join(output_dir,
                               "%s_phylip_named_tree.tre" % (module))
        write_updated_tree_file(updated_tree_fp, tree)

        # set the primary parameters for raxml
        parameters["-w"] = abspath(output_dir) + "/"
        parameters["-n"] = split(splitext(get_tmp_filename())[0])[-1]
        parameters["-t"] = updated_tree_fp

        # only fill in algorithm/model when the user did not choose them
        if "-f" not in parameters:
            parameters["-f"] = "v"
        if "-m" not in parameters:
            parameters["-m"] = "GTRGAMMA"

    elif module == "pplacer":
        try:
            parameters = param_dict["pplacer"]
        except KeyError:
            parameters = {}

        # make sure stats file is passed
        if not opts.stats_fp:
            raise IOError(
                "When using pplacer, the RAxML produced info file is required.")

        # set the primary parameters for pplacer - allow for user-defined
        parameters["--out-dir"] = abspath(output_dir) + "/"
        parameters["-t"] = opts.starting_tree_fp
        parameters["-r"] = opts.refseq_fp
        parameters["-s"] = opts.stats_fp

    elif module == "parsinsert":
        try:
            parameters = param_dict["parsinsert"]
        except KeyError:
            parameters = {}

        # define log fp
        log_fp = join(output_dir, "parsinsert.log")

        # define tax assignment values fp
        tax_assign_fp = join(output_dir, "parsinsert_assignments.log")
        parameters["-l"] = log_fp
        parameters["-o"] = tax_assign_fp
        parameters["-s"] = opts.refseq_fp
        parameters["-t"] = opts.starting_tree_fp

    # call the module and return a tree object
    result = tree_insertion_module_names[module].insert_sequences_into_tree(
        seqs, moltype=DNA, params=parameters)

    # NOTE(review): the helper's return value was never used; the tree that
    # gets written is `result`. Presumably the helper relabels `result` in
    # place -- confirm before switching to its return value.
    strip_and_rename_unwanted_labels_from_tree(align_map, result)

    # write out the resulting tree
    final_tree = join(output_dir, "%s_final_placement.tre" % (module))
    write_updated_tree_file(final_tree, result)