def test_align_optimal_simple(local, term, gap_penalty,
                              input1, input2, expect):
    """
    Check `align_optimal()` against hand-constructed test cases.

    Every returned alignment must be one of the expected string
    representations and its stored score must agree with the score
    recomputed by `align.score()`.
    """
    first = seq.NucleotideSequence(input1)
    second = seq.NucleotideSequence(input2)
    substitution_matrix = align.SubstitutionMatrix.std_nucleotide_matrix()

    found = align.align_optimal(
        first, second, substitution_matrix,
        gap_penalty=gap_penalty, terminal_penalty=term, local=local
    )

    # Each alignment that was found must be among the expected ones
    for alignment in found:
        assert str(alignment) in expect

    # The standalone scoring function must reproduce the stored score
    for alignment in found:
        recomputed = align.score(
            alignment, substitution_matrix,
            gap_penalty=gap_penalty, terminal_penalty=term
        )
        assert recomputed == alignment.score
def test_alignment_str():
    """
    Check that an alignment built from gapped strings is rendered back
    into the same strings by `str()`.
    """
    expected_lines = ["A-CCTGA----", "----T-ATGCT"]
    sequences = [
        seq.NucleotideSequence("ACCTGA"),
        seq.NucleotideSequence("TATGCT"),
    ]
    trace = align.Alignment.trace_from_strings(expected_lines)
    alignment = align.Alignment(sequences, trace, None)
    rendered_lines = str(alignment).split("\n")
    assert rendered_lines == expected_lines
def test_align_ungapped():
    """
    Check score and string representation of an ungapped alignment of
    two equally long nucleotide sequences.
    """
    substitution_matrix = align.SubstitutionMatrix.std_nucleotide_matrix()
    alignment = align.align_ungapped(
        seq.NucleotideSequence("ACCTGA"),
        seq.NucleotideSequence("ACTGGT"),
        substitution_matrix,
    )
    assert alignment.score == 3
    assert str(alignment) == "ACCTGA\nACTGGT"
def test_sequence_conversion():
    """
    Test the conversion between FASTA files and sequence objects.

    Covers reading a single sequence, a round trip of multiple
    sequences through a new file, writing a single sequence, reading a
    protein sequence (which is expected to emit a warning) and the
    failure on an invalid sequence.
    """
    path = os.path.join(data_dir("sequence"), "nuc.fasta")
    file = fasta.FastaFile.read(path)
    assert seq.NucleotideSequence("ACGCTACGT") == fasta.get_sequence(file)

    seq_dict = fasta.get_sequences(file)
    file2 = fasta.FastaFile()
    fasta.set_sequences(file2, seq_dict)
    seq_dict2 = fasta.get_sequences(file2)
    # `zip()` below stops at the shorter input, so a sequence lost in
    # the round trip would go unnoticed without this header check
    assert list(seq_dict.keys()) == list(seq_dict2.keys())
    # Cannot compare dicts directly, since the original RNA sequence is
    # now guessed as protein sequence
    for seq1, seq2 in zip(seq_dict.values(), seq_dict2.values()):
        assert str(seq1) == str(seq2)

    file3 = fasta.FastaFile()
    fasta.set_sequence(file3, seq.NucleotideSequence("AACCTTGG"))
    assert file3["sequence"] == "AACCTTGG"

    path = os.path.join(data_dir("sequence"), "prot.fasta")
    file4 = fasta.FastaFile.read(path)
    # Expect a warning for selenocysteine conversion
    with pytest.warns(UserWarning):
        assert seq.ProteinSequence("YAHCGFRTGS") == fasta.get_sequence(file4)

    path = os.path.join(data_dir("sequence"), "invalid.fasta")
    file5 = fasta.FastaFile.read(path)
    with pytest.raises(ValueError):
        seq.NucleotideSequence(fasta.get_sequence(file5))
def test_find_subsequence():
    """
    Check that all start positions of a subsequence are found.
    """
    haystack = seq.NucleotideSequence("ATACGCTTGCT")
    needle = seq.NucleotideSequence("GCT")
    positions = seq.find_subsequence(haystack, needle)
    assert list(positions) == [4, 8]
def test_sequence_conversion():
    """
    Test the conversion between FASTA files and sequence objects:
    single sequence, round trip of multiple sequences, writing a single
    sequence, protein sequence and failure on an invalid sequence.
    """
    # Single nucleotide sequence
    nuc_file = fasta.FastaFile()
    nuc_file.read(os.path.join(data_dir, "nuc.fasta"))
    assert seq.NucleotideSequence("ACGCTACGT") == fasta.get_sequence(nuc_file)

    # Round trip of all sequences through a fresh file
    original = fasta.get_sequences(nuc_file)
    copy_file = fasta.FastaFile()
    fasta.set_sequences(copy_file, original)
    restored = fasta.get_sequences(copy_file)
    assert original == restored

    # Writing a single sequence
    single_file = fasta.FastaFile()
    fasta.set_sequence(single_file, seq.NucleotideSequence("AACCTTGG"))
    assert single_file["sequence"] == "AACCTTGG"

    # Protein sequence
    prot_file = fasta.FastaFile()
    prot_file.read(os.path.join(data_dir, "prot.fasta"))
    assert seq.ProteinSequence("YAHGFRTGS") == fasta.get_sequence(prot_file)

    # Invalid symbols must be rejected
    invalid_file = fasta.FastaFile()
    invalid_file.read(os.path.join(data_dir, "invalid.fasta"))
    with pytest.raises(ValueError):
        seq.NucleotideSequence(fasta.get_sequence(invalid_file))
def test_nucleotide_construction():
    """
    Check that the alphabet is chosen according to the given symbols
    and that the string representation equals the input string.
    """
    unambiguous = "AATGCGTTA"
    ambiguous = "ANNGCBRTAN"

    sequence = seq.NucleotideSequence(unambiguous)
    assert sequence.get_alphabet() == seq.NucleotideSequence.alphabet_unamb
    assert str(sequence) == unambiguous

    sequence = seq.NucleotideSequence(ambiguous)
    assert sequence.get_alphabet() == seq.NucleotideSequence.alphabet_amb
    assert str(sequence) == ambiguous
def test_access_high_level():
    """
    Check that `get_sequences()` returns the expected header to
    sequence mapping for the nucleotide test file.
    """
    expected = {
        "dna sequence": seq.NucleotideSequence("ACGCTACGT", False),
        "another dna sequence": seq.NucleotideSequence("A", False),
        "third dna sequence": seq.NucleotideSequence("ACGT", False),
        "rna sequence": seq.NucleotideSequence("ACGT", False),
        "ambiguous rna sequence": seq.NucleotideSequence("ACGTNN", True),
    }
    fasta_path = os.path.join(data_dir("sequence"), "nuc.fasta")
    fasta_file = fasta.FastaFile.read(fasta_path)
    assert fasta.get_sequences(fasta_file) == expected
def test_write_iter(chars_per_line, n_sequences):
    """
    Test whether :class:`FastaFile.write()` and
    :class:`FastaFile.write_iter()` produce the same output file for
    random sequences.
    """
    LENGTH_RANGE = (50, 150)

    # Generate random sequences
    np.random.seed(0)
    sequences = []
    for _ in range(n_sequences):
        seq_length = np.random.randint(*LENGTH_RANGE)
        code = np.random.randint(
            len(seq.NucleotideSequence.alphabet_unamb),
            size=seq_length
        )
        sequence = seq.NucleotideSequence()
        sequence.code = code
        sequences.append(sequence)

    # Reference: fill a FastaFile object and write it as a whole
    fasta_file = fasta.FastaFile(chars_per_line)
    for i, sequence in enumerate(sequences):
        header = f"seq_{i}"
        fasta_file[header] = str(sequence)
    ref_file = io.StringIO()
    fasta_file.write(ref_file)

    # Test: write the same entries from a generator
    test_file = io.StringIO()
    fasta.FastaFile.write_iter(
        test_file,
        (
            (f"seq_{i}", str(sequence))
            for i, sequence in enumerate(sequences)
        ),
        chars_per_line
    )

    assert test_file.getvalue() == ref_file.getvalue()
def test_from_alignment():
    """
    Check symbol counts, gap counts and alphabet of a profile created
    from a pairwise nucleotide alignment.
    """
    gapped_strings = ["CGTCAT--", "--TCATGC"]
    aligned_sequences = [
        seq.NucleotideSequence("CGTCAT"),
        seq.NucleotideSequence("TCATGC"),
    ]
    alignment = align.Alignment(
        aligned_sequences,
        align.Alignment.trace_from_strings(gapped_strings),
        None,
    )

    profile = seq.SequenceProfile.from_alignment(alignment)

    # One row per alignment column, one column per symbol (A, C, G, T)
    expected_symbols = np.array([
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 2],
        [0, 2, 0, 0],
        [2, 0, 0, 0],
        [0, 0, 0, 2],
        [0, 0, 1, 0],
        [0, 1, 0, 0],
    ])
    expected_gaps = np.array([1, 1, 0, 0, 0, 0, 1, 1])
    expected_alphabet = seq.Alphabet(["A", "C", "G", "T"])

    assert np.array_equal(expected_symbols, profile.symbols)
    assert np.array_equal(expected_gaps, profile.gaps)
    assert (expected_alphabet == profile.alphabet)
def test_rna_conversion():
    """
    Check that the `as_rna` parameter of `set_sequence()` decides
    whether the sequence is stored with ``T`` or ``U`` in the FASTA
    file.
    """
    dna = seq.NucleotideSequence("ACGT")
    target = fasta.FastaFile()

    fasta.set_sequence(target, dna, "seq1", as_rna=False)
    fasta.set_sequence(target, dna, "seq2", as_rna=True)

    assert target["seq1"] == "ACGT"
    assert target["seq2"] == "ACGU"
def test_access():
    """
    Check indexing, iteration and slicing of a nucleotide sequence
    against the equivalent operations on the source string.
    """
    text = "AATGCGTTA"
    sequence = seq.NucleotideSequence(text)

    # Single symbol access
    assert text[2] == sequence[2]
    # Iteration yields the symbols in order
    assert text == "".join(symbol for symbol in sequence)
    # Slicing returns a sequence of the selected symbols
    sliced = sequence[3:-2]
    assert "GCGT" == str(sliced)
def test_find_symbol():
    """
    Check that all, the first and the last position of a symbol are
    found in a nucleotide sequence.
    """
    target = "T"
    sequence = seq.NucleotideSequence("ATACGCTTGCT")

    all_positions = list(seq.find_symbol(sequence, target))
    assert all_positions == [1, 6, 7, 10]
    assert seq.find_symbol_first(sequence, target) == 1
    assert seq.find_symbol_last(sequence, target) == 10
def test_rna_conversion():
    """
    Check that the `as_rna` parameter of `set_sequence()` decides
    whether the sequence is stored with ``T`` or ``U`` in the FASTQ
    file.
    """
    dna = seq.NucleotideSequence("ACGT")
    quality = np.array([0, 0, 0, 0])
    target = fastq.FastqFile(offset="Sanger")

    fastq.set_sequence(target, dna, quality, "seq1", as_rna=False)
    fastq.set_sequence(target, dna, quality, "seq2", as_rna=True)

    # Index 0 of an entry is the sequence string
    stored_dna = target["seq1"][0]
    stored_rna = target["seq2"][0]
    assert stored_dna == "ACGT"
    assert stored_rna == "ACGU"
def test_translation_met_start():
    """
    Check the 'met_start' parameter: the amino acid at the start codon
    must be replaced by methionine in the translated proteins.
    """
    # Use a codon table where 'AAA' acts as start codon
    table = seq.CodonTable.default_table().with_start_codons("AAA")
    sequence = seq.NucleotideSequence("GAAACTGAAATAAGAAC")

    translated, _ = sequence.translate(codon_table=table, met_start=True)

    protein_strings = [str(protein) for protein in translated]
    assert protein_strings == ["MLK*", "M*"]
def test_to_consensus_nuc_ambiguous():
    """
    Check the consensus sequence of a nucleotide profile whose first
    position has equal counts for two symbols.
    """
    # One row per position, one column per symbol (A, C, G, T)
    symbol_counts = np.array([
        [1, 1, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 2],
        [0, 2, 0, 0],
        [2, 0, 0, 0],
        [0, 0, 0, 2],
        [0, 0, 1, 0],
        [0, 1, 0, 0],
    ])
    gap_counts = np.array([1, 1, 0, 0, 0, 0, 1, 1])
    nucleotide_alphabet = seq.Alphabet(["A", "C", "G", "T"])

    profile = seq.SequenceProfile(
        symbol_counts, gap_counts, nucleotide_alphabet
    )

    expected = seq.NucleotideSequence("MGTCATGC")
    assert expected == profile.to_consensus()
def sample_app():
    """
    Create an `RNAfoldApp` for a sample sequence, run it via `start()`
    and `join()` and return the application object.
    """
    app = RNAfoldApp(
        seq.NucleotideSequence("CGACGTAGATGCTAGCTGACTCGATGC")
    )
    app.start()
    app.join()
    return app
def test_frame_translation(dna_str, protein_str_list):
    """
    Check that the incomplete translation of a DNA sequence yields the
    expected proteins and that the reported positions point to the
    corresponding coding regions.
    """
    expected = set(protein_str_list)
    sequence = seq.NucleotideSequence(dna_str)

    translated, positions = sequence.translate(complete=False)

    assert len(translated) == len(protein_str_list)
    assert {str(protein) for protein in translated} == expected

    # Verify the positions as well:
    # the complete translation of each reported slice must give the
    # same set of proteins
    retranslated = {
        str(sequence[start:stop].translate(complete=True))
        for start, stop in positions
    }
    assert retranslated == expected
def random_sequences(k, alphabet):
    """
    Return a list of nucleotide sequences whose codes are drawn
    randomly from the range of the given alphabet, using a fixed seed.

    `k` is accepted but not used in this function.
    """
    N_SEQS = 10
    SEQ_LENGTH = 1000

    np.random.seed(0)

    def _draw_sequence():
        # Fill an empty sequence with a random symbol code
        drawn = seq.NucleotideSequence()
        drawn.code = np.random.randint(len(alphabet), size=SEQ_LENGTH)
        return drawn

    return [_draw_sequence() for _ in range(N_SEQS)]
def test_manipulation():
    """
    Check item assignment on a sequence copy using an integer index,
    a slice, a boolean mask and an array of symbol codes.
    """
    template = seq.NucleotideSequence("ACGTA")

    # Single symbol via integer index
    edited = template.copy()
    edited[2] = "C"
    assert "ACCTA" == str(edited)

    # Slice is replaced by another slice of the same sequence
    edited = template.copy()
    edited[0:2] = edited[3:5]
    assert "TAGTA" == str(edited)

    # Boolean mask selects the first and last position
    edited = template.copy()
    first_and_last = np.array([True, False, False, False, True])
    edited[first_and_last] = "T"
    assert "TCGTT" == str(edited)

    # Symbol codes are assigned directly
    edited = template.copy()
    edited[1:4] = np.array([0, 1, 2])
    assert "AACGA" == str(edited)
def test_concatenation():
    """
    Check that concatenating nucleotide sequences is equivalent to
    concatenating their strings, including combinations with a
    sequence that consists of ``N`` symbols.
    """
    str1 = "AAGTTA"
    str2 = "CGA"
    str3 = "NNN"

    operand_pairs = [
        (str1, str2),
        (str1, str3),
        (str3, str1),
    ]
    for left, right in operand_pairs:
        joined = seq.NucleotideSequence(left) + seq.NucleotideSequence(right)
        assert left + right == str(joined)
def test_access(chars_per_line):
    """
    Check the dictionary-like access to a FASTQ file: length, keys,
    deletion, iteration over values and assignment of a new entry.
    """
    fastq_path = os.path.join(data_dir("sequence"), "random.fastq")
    file = fastq.FastqFile.read(
        fastq_path, offset=33, chars_per_line=chars_per_line
    )
    all_ids = [f"Read:{i+1:02d}" for i in range(20)]

    assert len(file) == 20
    assert list(file.keys()) == all_ids

    # Deleting an entry removes exactly that identifier
    del (file["Read:05"])
    assert len(file) == 19
    remaining_ids = [read_id for read_id in all_ids if read_id != "Read:05"]
    assert list(file.keys()) == remaining_ids

    for sequence, scores in file.values():
        assert len(sequence) == len(scores)
        assert (scores >= 0).all()

    # A newly assigned entry must be returned unchanged
    new_sequence = seq.NucleotideSequence("ACTCGGT")
    new_scores = np.array([10, 12, 20, 11, 0, 80, 42])
    file["test"] = new_sequence, new_scores
    stored_sequence, stored_scores = file["test"]
    assert new_sequence == stored_sequence
    assert np.array_equal(new_scores, stored_scores)
def test_nucleotide(simple_matrix, use_custom_matrix):
    """
    Check the repeat mask of a nucleotide sequence against a known
    example, where the expected masked region is written in lower case.
    """
    annotated = "TGCAAGCTATTAGGCTTAGGTCAGTGCttaagcttaggtcagtgcAACATA"
    # Lower case characters mark the positions expected to be masked
    ref_mask = [char.islower() for char in annotated]

    matrix = simple_matrix if use_custom_matrix else None
    test_mask = TantanApp.mask_repeats(
        seq.NucleotideSequence(annotated), matrix
    )

    assert len(test_mask) == len(ref_mask)
    assert np.all(test_mask.tolist() == ref_mask)
def test_large_sequence_mapping(length, excerpt_length, seed):
    """
    Check that an excerpt taken from a very large random sequence is
    aligned back to exactly the position it was taken from, when a
    banded alignment is performed.
    """
    BAND_WIDTH = 100

    np.random.seed(seed)

    # Random reference sequence and an excerpt at a random position
    sequence = seq.NucleotideSequence()
    sequence.code = np.random.randint(len(sequence.alphabet), size=length)
    excerpt_pos = np.random.randint(len(sequence) - excerpt_length)
    excerpt_end = excerpt_pos + excerpt_length
    excerpt = sequence[excerpt_pos : excerpt_end]

    # The band is centered on a diagonal close to the true position
    diagonal = np.random.randint(
        excerpt_pos - BAND_WIDTH, excerpt_pos + BAND_WIDTH
    )
    band = (diagonal - BAND_WIDTH, diagonal + BAND_WIDTH)
    print(band)
    print(len(sequence), len(excerpt))

    test_alignments = align.align_banded(
        excerpt,
        sequence,
        align.SubstitutionMatrix.std_nucleotide_matrix(),
        band=band,
    )

    # The excerpt should map uniquely to one location on the long
    # sequence
    assert len(test_alignments) == 1
    test_trace = test_alignments[0].trace

    # Expected: gapless trace starting at the excerpt position
    excerpt_indices = np.arange(len(excerpt))
    sequence_indices = np.arange(excerpt_pos, len(excerpt) + excerpt_pos)
    ref_trace = np.stack([excerpt_indices, sequence_indices], axis=1)
    assert np.array_equal(test_trace, ref_trace)
def test_affine_gap_penalty(local, term, gap_penalty, seed):
    """
    Expect the same alignment results for a linear gap penalty and an
    affine gap penalty with the same gap open and extension penalty.
    """
    LENGTH_RANGE = (10, 100)
    MAX_NUMBER = 1000

    # Create two random sequences of random length
    np.random.seed(seed)
    sequences = []
    for _ in range(2):
        sequence = seq.NucleotideSequence()
        length = np.random.randint(*LENGTH_RANGE)
        sequence.code = np.random.randint(
            len(sequence.alphabet), size=length
        )
        sequences.append(sequence)

    matrix = align.SubstitutionMatrix.std_nucleotide_matrix()

    # Reference uses the linear penalty, the test the equivalent
    # affine penalty tuple
    ref_alignments = align.align_optimal(
        *sequences, matrix, gap_penalty, term, local, MAX_NUMBER
    )
    test_alignments = align.align_optimal(
        *sequences, matrix, (gap_penalty, gap_penalty), term, local,
        MAX_NUMBER
    )

    assert test_alignments[0].score == ref_alignments[0].score
    assert len(test_alignments) == len(ref_alignments)
    # We can only expect to get the same alignments in the test and
    # reference, if we get all optimal alignments
    if len(test_alignments) < MAX_NUMBER:
        for alignment in test_alignments:
            try:
                assert alignment in ref_alignments
            except AssertionError:
                # Print diagnostics for the failed comparison only;
                # other exceptions (e.g. KeyboardInterrupt) pass
                # through untouched
                print("Test alignment:")
                print(alignment)
                print()
                print("First reference alignment")
                print(ref_alignments[0])
                raise
def test_annotated_sequence():
    """
    Check indexing of an `AnnotatedSequence` with integers, slices and
    features, as well as assignment via a feature.
    """
    walker = Feature(
        "misc_feature",
        [Location(1, 2), Location(11, 12)],
        {"note": "walker"},
    )
    poly_a = Feature(
        "misc_feature",
        [Location(16, 22)],
        {"note": "poly-A"},
    )
    annot_seq = AnnotatedSequence(
        Annotation([walker, poly_a]),
        seq.NucleotideSequence("ATGGCGTACGATTAGAAAAAAA"),
    )

    # The annotated sequence and its plain sequence give different
    # symbols for the same integer index
    assert annot_seq[2] == "T"
    assert annot_seq.sequence[2] == "G"

    # Slicing
    truncated = annot_seq[:16]
    assert truncated.sequence == seq.NucleotideSequence("ATGGCGTACGATTAG")

    # Indexing with a feature gives the sequence covered by it
    assert annot_seq[walker] == seq.NucleotideSequence("ATAT")
    assert annot_seq[poly_a] == seq.NucleotideSequence("AAAAAAA")

    # Assignment via a feature overwrites the covered positions
    annot_seq[walker] = seq.NucleotideSequence("CCCC")
    expected = seq.NucleotideSequence("CCGGCGTACGCCTAGAAAAAAA")
    assert annot_seq.sequence == expected
def test_masking(k, input_mask, ref_output_mask):
    """
    Explicitly test the conversion of removal masks to k-mer masks
    using known examples.

    The conversion function is private, so it is tested indirectly via
    the sequence positions that end up in the k-mer table.
    """
    input_mask = np.array(input_mask, dtype=bool)
    ref_output_mask = np.array(ref_output_mask, dtype=bool)

    # All-zero code with the same length as the input mask
    sequence = seq.NucleotideSequence()
    sequence.code = np.zeros(len(input_mask))

    table = align.KmerTable.from_sequences(
        k, [sequence], ignore_masks=[input_mask]
    )

    # Mark every sequence position that is listed in the table
    test_output_mask = np.zeros(len(ref_output_mask), dtype=bool)
    for kmer in table.get_kmers():
        listed_positions = table[kmer][:, 1]
        test_output_mask[listed_positions] = True

    assert test_output_mask.tolist() == ref_output_mask.tolist()
def test_max_table_size(gap_penalty, direction, score_only, should_raise):
    """
    Check if the `max_table_size` parameter in `align_local_gapped()`
    raises the expected `MemoryError` if the aligned regions get too
    large.
    """
    # The smaller table size is exceeded in this test case,
    # the larger one is not
    max_table_size = 1_000_000 if should_raise else 1_000_000_000

    # Align a long random sequence to itself,
    # effectively resulting in a global alignment
    np.random.seed(0)
    seq1 = seq.NucleotideSequence()
    seq1.code = np.random.randint(len(seq1.alphabet), size=10000)
    matrix = align.SubstitutionMatrix.std_nucleotide_matrix()

    # Local alignment starts in the center of the sequences
    center = len(seq1) // 2
    seed = (center, center)
    threshold = 100

    call_args = (
        seq1, seq1, matrix, seed, threshold, gap_penalty, 1,
        direction, score_only, max_table_size
    )

    if should_raise:
        with pytest.raises(MemoryError):
            align.align_local_gapped(*call_args)
        return

    result = align.align_local_gapped(*call_args)
    if not score_only and direction == "both":
        # Expect that no gaps are introduced
        assert len(result[0]) == len(seq1)
""" From A to T - The Sequence subpackage ===================================== .. currentmodule:: biotite.sequence :mod:`biotite.sequence` is a *Biotite* subpackage concerning maybe the most popular data type in computational molecular biology: sequences. The instantiation can be quite simple as """ import biotite.sequence as seq dna = seq.NucleotideSequence("AACTGCTA") print(dna) ######################################################################## # This example shows :class:`NucleotideSequence` which is a subclass of # the abstract base class :class:`Sequence`. # A :class:`NucleotideSequence` accepts an iterable object of strings, # where each string can be ``'A'``, ``'C'``, ``'G'`` or ``'T'``. # Each of these letters is called a *symbol*. # # In general the sequence implementation in *Biotite* allows for # *sequences of anything*. # This means any (immutable an hashable) *Python* object can be used as # a symbol in a sequence, as long as the object is part of the # :class:`Alphabet` of the particular :class:`Sequence`. # An :class:`Alphabet` object simply represents a list of objects that # are allowed to occur in a :class:`Sequence`. # The following figure shows how the symbols are stored in a
# An alignment is an instance of :class:`BlastAlignment`, a subclass of # :class:`biotite.sequence.align.Alignment`. # It contains some additional information as shown above. # The hit UID can be used to obtain the complete hit sequence via # :mod:`biotite.database.entrez`. # # The next alignment should be a bit more challenging. # We take a random part of the *E. coli* BL21 genome and distort it a # little bit. # Since we still expect a high similarity to the original sequence, # we decrease the E-value threshold. import biotite.application.blast as blast import biotite.sequence as seq bl21_seq = seq.NucleotideSequence( "CGGAAGCGCTCGGTCTCCTGGCCTTATCAGCCACTGCGCGACGATATGCTCGTCCGTTTCGAAGA") app = blast.BlastWebApp("blastn", bl21_seq, obey_rules=False) app.set_max_expect_value(0.1) app.start() app.join() alignments = app.get_alignments() best_ali = alignments[0] print(best_ali) print() print("HSP position in query: ", best_ali.query_interval) print("HSP position in hit: ", best_ali.hit_interval) print("Score: ", best_ali.score) print("E-value: ", best_ali.e_value) print("Hit UID: ", best_ali.hit_id) print("Hit name: ", best_ali.hit_definition)