def test_aligned_dna_fasta_format_validate_positive(self):
    # The 'aligned-dna-sequences.fasta' fixture is expected to validate.
    # There is no explicit assertion: the test fails only if validate()
    # raises.
    fasta_fp = self.get_data_path('aligned-dna-sequences.fasta')
    AlignedDNAFASTAFormat(fasta_fp, mode='r').validate()
def test_aligned_dna_fasta_format_validate_negative(self):
    # Validating the 'not-dna-sequences' fixture must raise a ValueError
    # whose message matches 'AlignedDNAFASTA'.
    bad_fp = self.get_data_path('not-dna-sequences')
    aligned_fmt = AlignedDNAFASTAFormat(bad_fp, mode='r')

    expect_failure = self.assertRaisesRegex(ValueError, 'AlignedDNAFASTA')
    with expect_failure:
        aligned_fmt.validate()
def test_aligned_dna_fasta_format_validate_negative(self):
    # Validating the 'not-dna-sequences' fixture must raise a
    # ValidationError whose message matches 'AlignedDNAFASTA'.
    bad_fp = self.get_data_path('not-dna-sequences')
    aligned_fmt = AlignedDNAFASTAFormat(bad_fp, mode='r')

    expect_failure = self.assertRaisesRegex(ValidationError,
                                            'AlignedDNAFASTA')
    with expect_failure:
        aligned_fmt.validate()
def _dna_iterator_to_aligned_fasta(iterator):
    """Write the items of *iterator* to a new aligned DNA FASTA file.

    A fresh ``AlignedDNAFASTAFormat`` is created, the items are written to
    its path in FASTA format via ``skbio.io.write``, and the format object
    is returned.
    """
    aligned_fasta = AlignedDNAFASTAFormat()
    records = iter(iterator)
    destination = str(aligned_fasta)
    skbio.io.write(records, format='fasta', into=destination)
    return aligned_fasta
def _mafft(sequences_fp, alignment_fp, n_threads, parttree, addfragments):
    """Align sequences with mafft and return the resulting alignment.

    Parameters
    ----------
    sequences_fp
        Path to the FASTA file of unaligned sequences.
    alignment_fp
        Path to an existing alignment in FASTA format, or ``None``. When
        given, the unaligned sequences are added to it.
    n_threads
        Value passed to mafft's ``--thread`` option. ``'auto'`` is
        translated to ``-1``.
    parttree
        When truthy, ``--parttree`` is passed to mafft.
    addfragments
        Only consulted when ``alignment_fp`` is given: truthy selects
        ``--addfragments``, otherwise ``--add`` is used.

    Returns
    -------
    AlignedDNAFASTAFormat
        The alignment, with the original sequence IDs restored.

    Raises
    ------
    ValueError
        If a sequence ID is duplicated within either input, is present in
        both inputs, or if there are more than 1,000,000 sequences and
        ``parttree`` is not set.
    AssertionError
        If mafft's output does not contain exactly one sequence per input
        ID.
    """
    # Long sequence IDs (~250 chars) can be truncated by mafft, so the
    # original IDs are recorded here and written back over the IDs in
    # mafft's output at the end of this function.
    #
    # https://github.com/qiime2/q2-alignment/issues/37
    #
    # Dicts are used rather than sets because restoring the IDs below
    # relies on iterating them in the order the IDs were read.
    ids_in_alignment = {}
    ids_in_sequences = {}

    if alignment_fp is not None:
        for record in skbio.io.read(alignment_fp, format='fasta',
                                    constructor=skbio.DNA):
            seq_id = record.metadata['id']
            if seq_id in ids_in_alignment:
                raise ValueError(
                    "A sequence ID is duplicated in the aligned sequences: "
                    "%r" % seq_id)
            ids_in_alignment[seq_id] = True

    for record in skbio.io.read(sequences_fp, format='fasta',
                                constructor=skbio.DNA):
        seq_id = record.metadata['id']
        if seq_id in ids_in_sequences:
            raise ValueError(
                "A sequence ID is duplicated in the unaligned sequences: "
                "%r" % seq_id)
        if seq_id in ids_in_alignment:
            raise ValueError(
                "A sequence ID is present in both the aligned and unaligned "
                "sequences: %r" % seq_id)
        ids_in_sequences[seq_id] = True

    result = AlignedDNAFASTAFormat()
    result_fp = str(result)

    # The two ID collections are disjoint (checked above), so listing the
    # aligned IDs followed by the unaligned IDs gives every input ID once.
    original_ids = [*ids_in_alignment, *ids_in_sequences]

    # mafft will fail if the number of sequences is larger than 1 million
    # unless parttree is used. parttree is an algorithm that builds an
    # approximate tree from a large number of unaligned sequences.
    # Raising here when the parttree flag is missing spares the user from
    # seeing mafft's own error, which can be confusing and intimidating.
    too_many_sequences = len(original_ids) > 1000000
    if too_many_sequences and not parttree:
        raise ValueError(
            "The number of sequences in your feature table is larger than "
            "1 million, please use the parttree parameter")

    # mafft's signal for utilizing all cores is -1. We want our users to
    # enter 'auto' for using all cores, to prevent any confusion and to
    # keep the UX consistent.
    threads = -1 if n_threads == 'auto' else n_threads

    # `--inputorder` must be turned on because the input and output need to
    # be in the same sequence order to replace the IDs below. This is
    # mafft's default behavior, but the flag is passed in case that changes
    # in the future.
    cmd = ["mafft", "--preservecase", "--inputorder",
           "--thread", str(threads)]
    if parttree:
        cmd.append('--parttree')
    if alignment_fp is None:
        cmd.append(sequences_fp)
    else:
        add_flag = '--addfragments' if addfragments else '--add'
        cmd.extend([add_flag, sequences_fp, alignment_fp])

    # NOTE(review): run_command is defined elsewhere; presumably it runs
    # `cmd` and stores mafft's output at `result_fp`, since that path is
    # read back immediately below -- confirm against its definition.
    run_command(cmd, result_fp)

    # Read the output alignment into memory, reassign the original sequence
    # IDs, and write the alignment back to disk.
    msa = skbio.TabularMSA.read(result_fp, format='fasta',
                                constructor=skbio.DNA)

    # Using `assert` because mafft would have had to add or drop sequences
    # while aligning, which would be a bug on mafft's end. This is just a
    # sanity check and is not expected to trigger in practice.
    assert len(original_ids) == len(msa)
    for seq_id, aligned_seq in zip(original_ids, msa):
        aligned_seq.metadata['id'] = seq_id

    # Roundtripping options are turned off to speed up writing. This is
    # safe because the sequence IDs are known to be roundtrip-safe: they
    # were read from a FASTA file above.
    #
    # http://scikit-bio.org/docs/latest/generated/
    #     skbio.io.format.fasta.html#writer-specific-parameters
    msa.write(result_fp,
              id_whitespace_replacement=None,
              description_newline_replacement=None)
    return result
def setUp(self):
    super().setUp()
    # Resolve the packaged trim-test alignment, then expose it both as a
    # file path and as an AlignedDNAIterator view of that file.
    fixture_fp = pkg_resources.resource_filename(
        'rescript.tests', 'data/trim-test-alignment.fasta')
    self.aligned_dna_path = fixture_fp

    fasta_fmt = AlignedDNAFASTAFormat(self.aligned_dna_path, mode='r')
    self.aligned_dna_seqs = fasta_fmt.view(AlignedDNAIterator)
def setUp(self):
    super().setUp()
    # Load the degap test alignment and keep an AlignedDNAIterator view of
    # it for the tests to consume.
    alignment_fp = self.get_data_path('degap-test-alignment.fasta')
    fasta_fmt = AlignedDNAFASTAFormat(alignment_fp, mode='r')
    self.alignedseqs = fasta_fmt.view(AlignedDNAIterator)
def setUp(self):
    super().setUp()

    semantic_type = 'FeatureData[AlignedSequence]'

    def read_fasta(fp):
        # Open an existing alignment file read-only.
        return AlignedDNAFASTAFormat(fp, mode='r')

    plain_fp = self.get_data_path('trim-test-alignment.fasta')
    with_primers_fp = self.get_data_path(
        'trim-test-alignment-with-primers.fasta')

    # The plain alignment, as an imported artifact and as a file format.
    self.aligned_seqs = qiime2.Artifact.import_data(semantic_type, plain_fp)
    self.aligned_seqs_fasta = read_fasta(plain_fp)

    self.primers_dict = {
        "forward": "GGGAATCTTCCACAATGG",
        "reverse": "GTGTTCTTCTCTAACAACAG"
    }

    # The alignment that includes the primers, in the same two forms.
    self.aligned_with_primers = qiime2.Artifact.import_data(
        semantic_type, with_primers_fp)
    self.aligned_with_primers_fasta = read_fasta(with_primers_fp)

    # Remaining alignment fixtures, each opened read-only.
    self.aligned_mess_fasta = read_fasta(
        self.get_data_path('trim-test-alignment-with-primers-mess.fasta'))
    self.aligned_with_fwd_fasta = read_fasta(
        self.get_data_path('trim-test-alignment-fwd.fasta'))
    self.aligned_with_rev_fasta = read_fasta(
        self.get_data_path('trim-test-alignment-rev.fasta'))
    self.trimmed_fasta = read_fasta(
        self.get_data_path('trim-test-sequences-trimmed.fasta'))

    # NOTE(review): FakeCtx is defined elsewhere; the integer keys are
    # presumably looked up by the code under test -- confirm against
    # FakeCtx.
    self.fake_ctx = FakeCtx({
        1: self.aligned_with_primers_fasta,
        2: self.aligned_with_fwd_fasta,
        3: self.aligned_with_rev_fasta
    })

    # Expected sequence strings keyed by 's1'..'s5'. Judging by the
    # attribute names these are the expected results for both primers,
    # forward primer only, and reverse primer only, respectively.
    self.exp_seqs_both_primers = {
        's1': ('GGGAATCTTCCACAATGGGTGCAAACCTGATGGAGCAATGCCGCGTGAG'
               'TGAAGANAGGTCTTCGGATCGTAAAGCTCTGTTGTTAGAGAAGAACAC'),
        's2': ('GGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCGACGCCGCGTGGG'
               'GGATGA-CGGCCTTCGGGTTGTAAACTCCTTTCGCCAGGGACGAAGCG'),
        's3': ('GGGAATATTGGACAATGGGCGAAAGCCTGATCCAGCCATGCCGCGTGTG'
               'TGAAGA-AGGCCTTTTGGTTGTAAAGCACTTTAAGTGGGGAGGAAAAG'),
        's4': ('GGGAATTTTGGACAATGGGGGCAACCCTGATCCAGCCATGCCGCGTGCG'
               'GGAAGANAGGCCTTCGGGTTGTAAACCGCTTTTGTTCGGGAAGAAATC'),
        's5': ('GGGAATATTGGACAATGGGCGAAAGCCTGATCCAGCCATGCCGCGTGTG'
               'TGAAGA-AGGCCTTTTGGTTGTAAAGCACTTTAAGTGGGGAGGAAAAG'),
    }
    self.exp_seqs_only_fwd = {
        's1': ('GGGAATCTTCCACAATGGGTGCAAACCTGATGGAGCAATGCCGCGTGAG'
               'TGAAGANAGGTCTTCGGATCGTAAAGCTCTGTTGTTAGAGAAGAACACG'
               'TGCTAGG--------'),
        's2': ('GGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCGACGCCGCGTGGG'
               'GGATGA-CGGCCTTCGGGTTGTAAACTCCTTTCGCCAGGGACGAAGCGT'
               'TTTG-----------'),
        's3': ('GGGAATATTGGACAATGGGCGAAAGCCTGATCCAGCCATGCCGCGTGTG'
               'TGAAGA-AGGCCTTTTGGTTGTAAAGCACTTTAAGTGGGGAGGAAAAGC'
               'TTATGGTTAAAAAAA'),
        's4': ('GGGAATTTTGGACAATGGGGGCAACCCTGATCCAGCCATGCCGCGTGCG'
               'GGAAGANAGGCCTTCGGGTTGTAAACCGCTTTTGTTCGGGAAGAAATCC'
               'TCTGGGCTAAAAAAA'),
        's5': ('GGGAATATTGGACAATGGGCGAAAGCCTGATCCAGCCATGCCGCGTGTG'
               'TGAAGA-AGGCCTTTTGGTTGTAAAGCACTTTAAGTGGGGAGGAAAAGC'
               'TTGTGGTTAA-----'),
    }
    self.exp_seqs_only_rev = {
        's1': ('-----TAGGGAATCTTCCACAATGGGTGCAAACCTGATGGAGCAATGC'
               'CGCGTGAGTGAAGANAGGTCTTCGGATCGTAAAGCTCTGTTGTTAGAG'
               'AAGAACAC'),
        's2': ('AATTTTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCGACGC'
               'CGCGTGGGGGATGA-CGGCCTTCGGGTTGTAAACTCCTTTCGCCAGGG'
               'ACGAAGCG'),
        's3': ('-----TGGGGAATATTGGACAATGGGCGAAAGCCTGATCCAGCCATGC'
               'CGCGTGTGTGAAGA-AGGCCTTTTGGTTGTAAAGCACTTTAAGTGGGG'
               'AGGAAAAG'),
        's4': ('-----TGGGGAATTTTGGACAATGGGGGCAACCCTGATCCAGCCATGC'
               'CGCGTGCGGGAAGANAGGCCTTCGGGTTGTAAACCGCTTTTGTTCGGG'
               'AAGAAATC'),
        's5': ('---AATGGGGAATATTGGACAATGGGCGAAAGCCTGATCCAGCCATGC'
               'CGCGTGTGTGAAGA-AGGCCTTTTGGTTGTAAAGCACTTTAAGTGGGG'
               'AGGAAAAG'),
    }