def test_simple_alignment(gap_penalty, local, band_width):
    """
    Check `align_banded()` against `align_optimal()`.

    Two short and highly similar sequences are aligned with both
    functions and every banded alignment must be among the optimal ones.
    """
    # Cyclotide C, Uniprot: P86843
    cyclotide_c = seq.ProteinSequence("gvpcaescvwipctvtallgcsckdkvcyld")
    # Cyclotide F, Uniprot: P86846
    cyclotide_f = seq.ProteinSequence("gipcgescvfipcissvvgcsckskvcyld")
    substitution_matrix = align.SubstitutionMatrix.std_protein_matrix()

    optimal = align.align_optimal(
        cyclotide_c,
        cyclotide_f,
        substitution_matrix,
        gap_penalty=gap_penalty,
        local=local,
        terminal_penalty=False,
    )
    # Strip the terminal gaps from the reference, so that it becomes a
    # true semi-global alignment like the one `align_banded()` returns
    expected = [align.remove_terminal_gaps(reference) for reference in optimal]

    band = (-band_width, band_width)
    observed = align.align_banded(
        cyclotide_c,
        cyclotide_f,
        substitution_matrix,
        band,
        gap_penalty=gap_penalty,
        local=local,
    )

    assert len(observed) == len(expected)
    for banded_alignment in observed:
        assert banded_alignment in expected
def test_score_threshold_rule(kmer_alphabet, ref_kmer, threshold):
    """
    Compare the similar k-mers reported by :class:`ScoreThresholdRule`
    with the k-mers found by exhaustively scoring every possible k-mer.
    """
    matrix = align.SubstitutionMatrix.std_protein_matrix()

    def as_sequence(kmer_code):
        # Wrap the symbol codes of a k-mer code into a sequence object
        kmer_sequence = seq.ProteinSequence()
        kmer_sequence.code = kmer_alphabet.split(kmer_code)
        return kmer_sequence

    reference = as_sequence(ref_kmer)

    # Brute force: keep every k-mer whose ungapped alignment score
    # against the reference reaches the threshold
    expected = set()
    for candidate in range(len(kmer_alphabet)):
        candidate_score = align.align_ungapped(
            reference, as_sequence(candidate), matrix, score_only=True
        )
        if candidate_score >= threshold:
            expected.add(candidate)

    rule = align.ScoreThresholdRule(matrix, threshold)
    observed = set(rule.similar_kmers(kmer_alphabet, ref_kmer))

    assert observed == expected
def get_alignment(cls, seq1: str, seq2: str, local: bool = True):
    """
    Generate an alignment between two protein sequences.

    Parameters
    ----------
    seq1: str
        The first sequence to be aligned
    seq2: str
        The second sequence to be aligned
    local: bool
        If false, a global alignment is performed (based on the
        Needleman-Wunsch algorithm), otherwise a local alignment is
        performed (based on the Smith–Waterman algorithm).
        (Default: True)

    Returns
    -------
    Alignment
        An instance of `cls` wrapping the first alignment returned by
        ``align_optimal()``. Its metadata holds the keys ``"score"``,
        ``"sequence_identity"``, ``"symbols"`` and ``"codes"``.
    """
    import biotite.sequence as seq
    import biotite.sequence.align as align

    # create the default matrix
    # TODO add more options for the choice of matrix
    matrix = align.SubstitutionMatrix.std_protein_matrix()
    alignments = align.align_optimal(
        seq.ProteinSequence(seq1),
        seq.ProteinSequence(seq2),
        matrix,
        local=local,
    )
    # Only the first of the (possibly several) optimal alignments is used
    alignment = alignments[0]
    return cls(
        alignment=alignment,
        metadata={
            "score": alignment.score,
            "sequence_identity": align.get_sequence_identity(alignment),
            "symbols": align.get_symbols(alignment),
            "codes": align.get_codes(alignment),
        },
    )
def test_sequence_conversion():
    """
    Test the conversion between FASTA files and sequence objects:
    reading single and multiple sequences, writing them back and the
    handling of a file with invalid content.
    """
    path = os.path.join(data_dir("sequence"), "nuc.fasta")
    file = fasta.FastaFile.read(path)
    assert seq.NucleotideSequence("ACGCTACGT") == fasta.get_sequence(file)

    seq_dict = fasta.get_sequences(file)
    file2 = fasta.FastaFile()
    fasta.set_sequences(file2, seq_dict)
    seq_dict2 = fasta.get_sequences(file2)
    # Cannot compare dicts directly, since the original RNA sequence is
    # now guessed as protein sequence
    # zip() stops at the shorter input, so a lost entry would otherwise
    # go unnoticed
    assert len(seq_dict) == len(seq_dict2)
    for seq1, seq2 in zip(seq_dict.values(), seq_dict2.values()):
        assert str(seq1) == str(seq2)

    file3 = fasta.FastaFile()
    fasta.set_sequence(file3, seq.NucleotideSequence("AACCTTGG"))
    assert file3["sequence"] == "AACCTTGG"

    path = os.path.join(data_dir("sequence"), "prot.fasta")
    file4 = fasta.FastaFile.read(path)
    # Expect a warning for selenocysteine conversion
    with pytest.warns(UserWarning):
        assert seq.ProteinSequence("YAHCGFRTGS") == fasta.get_sequence(file4)

    path = os.path.join(data_dir("sequence"), "invalid.fasta")
    file5 = fasta.FastaFile.read(path)
    with pytest.raises(ValueError):
        seq.NucleotideSequence(fasta.get_sequence(file5))
def test_sequence_conversion():
    """
    Read sequences from FASTA files, write them back and verify that
    the content survives; a file with invalid content must be rejected.
    """
    # Single nucleotide sequence
    nuc_file = fasta.FastaFile()
    nuc_file.read(os.path.join(data_dir, "nuc.fasta"))
    assert seq.NucleotideSequence("ACGCTACGT") == fasta.get_sequence(nuc_file)

    # Round trip of all sequences of the file
    original_sequences = fasta.get_sequences(nuc_file)
    roundtrip_file = fasta.FastaFile()
    fasta.set_sequences(roundtrip_file, original_sequences)
    restored_sequences = fasta.get_sequences(roundtrip_file)
    assert original_sequences == restored_sequences

    # Writing a single sequence
    single_file = fasta.FastaFile()
    fasta.set_sequence(single_file, seq.NucleotideSequence("AACCTTGG"))
    assert single_file["sequence"] == "AACCTTGG"

    # Single protein sequence
    prot_file = fasta.FastaFile()
    prot_file.read(os.path.join(data_dir, "prot.fasta"))
    assert seq.ProteinSequence("YAHGFRTGS") == fasta.get_sequence(prot_file)

    # Invalid content
    invalid_file = fasta.FastaFile()
    invalid_file.read(os.path.join(data_dir, "invalid.fasta"))
    with pytest.raises(ValueError):
        seq.NucleotideSequence(fasta.get_sequence(invalid_file))
def sequences():
    """
    The protein sequences stored in ``cas9.fasta``
    (described upstream as 10 Cas9 sequences).
    """
    cas9_file = fasta.FastaFile()
    cas9_file.read(join(data_dir, "cas9.fasta"))
    cas9_sequences = []
    for sequence_string in cas9_file.values():
        cas9_sequences.append(seq.ProteinSequence(sequence_string))
    return cas9_sequences
def test_to_consensus_prot():
    """
    Align two protein sequences, build a profile from the alignment and
    compare the consensus of the profile to a known sequence.
    """
    # Avidin protein sequence
    avidin = seq.ProteinSequence(
        "MVHATSPLLLLLLLSLALVAPGLSARKCSLTGKWTNDLGSNMTIGAVNSRGEFTGTYITAVTATSNEIKESPLHGTQNTINKRTQP"
        "TFGFTVNWKFSESTTVFTGQCFIDRNGKEVLKTMWLLRSSVNDIGDDWKATRVGINIFTRLRTQKE"
    )
    # Streptavidin protein sequence
    streptavidin = seq.ProteinSequence(
        "MRKIVVAAIAVSLTTVSITASASADPSKDSKAQVSAAEAGITGTWYNQLGSTFIVTAGADGALTGTYESAVGNAESRYVLTGRYDSA"
        "PATDGSGTALGWTVAWKNNYRNAHSATTWSGQYVGGAEARINTQWLLTSGTTEANAWKSTLVGHDTFTKVKPSAASIDAAKKAGVNN"
        "GNPLDAVQQ"
    )
    substitution_matrix = align.SubstitutionMatrix.std_protein_matrix()
    optimal_alignments = align.align_optimal(
        avidin, streptavidin, substitution_matrix
    )
    profile = seq.SequenceProfile.from_alignment(optimal_alignments[0])

    expected_consensus = seq.ProteinSequence(
        "MRHIATAAIALSLLLLSITALASADPGKDSKAQLSAAEAGITGKWTNDLGSNFIIGAVGADGAFTGTYESAVGNAESNEIKEGPLD"
        "GAPATDGKGTALGWTFAFKNNWKFAESATTFSGQCFGGADARINGKELLTKGTMEANAWKSTLLGHDSFSKVKDIAADIDAAKKAG"
        "INIFNPLDAQKE"
    )
    assert expected_consensus == profile.to_consensus()
def _create_random_pair(seed, length=100, max_subsitutions=5,
                        max_insertions=5, max_deletions=5,
                        max_truncations=5):
    """
    Create a pair of related protein sequences.

    The first sequence is drawn at random. The second one is a copy of
    the first one with random substitutions, insertions and deletions.
    Finally both sequences are truncated independently at both ends.
    The global NumPy random state is seeded with `seed`.
    """
    np.random.seed(seed)

    def draw_positions(sequence, count):
        # `count` distinct random positions within `sequence`
        return np.random.choice(
            np.arange(len(sequence)), size=count, replace=False
        )

    def truncate(sequence):
        # The start offset is drawn before the stop offset;
        # at least one symbol is always cut from the end
        start = np.random.randint(max_truncations)
        stop = -(1 + np.random.randint(max_truncations))
        return sequence[start:stop]

    original = seq.ProteinSequence()
    alphabet_size = len(original.alphabet)
    original.code = np.random.randint(alphabet_size, size=length)
    mutant = original.copy()

    # Random substitutions
    substitution_count = np.random.randint(max_subsitutions)
    substitution_positions = draw_positions(mutant, substitution_count)
    mutant.code[substitution_positions] = np.random.randint(
        alphabet_size, size=substitution_count
    )

    # Random insertions
    insertion_count = np.random.randint(max_insertions)
    insertion_positions = draw_positions(mutant, insertion_count)
    inserted_codes = np.random.randint(alphabet_size, size=insertion_count)
    mutant.code = np.insert(mutant.code, insertion_positions, inserted_codes)

    # Random deletions
    deletion_count = np.random.randint(max_deletions)
    deletion_positions = draw_positions(mutant, deletion_count)
    mutant.code = np.delete(mutant.code, deletion_positions)

    # Truncate at both ends of original and mutant
    original = truncate(original)
    mutant = truncate(mutant)
    return original, mutant
def test_get_molecular_weight(monoisotopic, expected_mol_weight_protein):
    """
    Verify the molecular weight calculated for a protein sequence
    against the expected value (absolute tolerance of 0.01).
    """
    sequence = seq.ProteinSequence("ACDEFGHIKLMNPQRSTVW")
    observed_weight = sequence.get_molecular_weight(monoisotopic=monoisotopic)
    expected_weight = pytest.approx(expected_mol_weight_protein, abs=1e-2)
    assert observed_weight == expected_weight
def test_evalue():
    """
    Compare estimated E-values with empirical counts.

    For each test score the estimated E-value must approximately equal
    the number of randomly sampled sequence pairs whose local alignment
    reaches at least that score.
    Low scores, which give rather high E-values, are required to get a
    reasonable accuracy.
    """
    TEST_SCORES = [30, 40, 50]
    GAP_PENALTY = (-12, -1)
    N_SAMPLES = 10000
    SEQ_LENGTH = 300

    alphabet = seq.ProteinSequence.alphabet
    matrix = align.SubstitutionMatrix.std_protein_matrix()
    estimator = align.EValueEstimator.from_samples(
        alphabet, matrix, GAP_PENALTY, BACKGROUND
    )

    # Sample a large number of random sequence pairs and align them
    np.random.seed(0)
    sampled_codes = np.random.choice(
        len(alphabet), size=(N_SAMPLES, 2, SEQ_LENGTH), p=BACKGROUND
    )
    sample_scores = np.zeros(N_SAMPLES, dtype=int)
    for sample_index, (first_code, second_code) in enumerate(sampled_codes):
        first = seq.ProteinSequence()
        second = seq.ProteinSequence()
        first.code = first_code
        second.code = second_code
        best_alignment = align.align_optimal(
            first, second, matrix,
            local=True, gap_penalty=GAP_PENALTY, max_number=1
        )[0]
        sample_scores[sample_index] = best_alignment.score

    e_values = []
    counts = []
    for score in TEST_SCORES:
        log_e = estimator.log_evalue(score, SEQ_LENGTH, SEQ_LENGTH * N_SAMPLES)
        e_values.append(10**log_e)
        counts.append(np.count_nonzero(sample_scores >= score))

    assert e_values == pytest.approx(counts, rel=0.5)
def test_simple_alignment(gap_penalty, seed, threshold, direction, score_only):
    """
    Check `align_local_gapped()` against `align_optimal()`.

    Two short and highly similar sequences are aligned with both
    functions and the scores or the alignments are compared.
    """
    # Cyclotide C, Uniprot: P86843
    cyclotide_c = seq.ProteinSequence("gvpcaescvwipctvtallgcsckdkvcyld")
    # Cyclotide F, Uniprot: P86846
    cyclotide_f = seq.ProteinSequence("gipcgescvfipcissvvgcsckskvcyld")
    substitution_matrix = align.SubstitutionMatrix.std_protein_matrix()

    expected = align.align_optimal(
        cyclotide_c, cyclotide_f, substitution_matrix,
        gap_penalty=gap_penalty, local=True
    )
    # If the extension runs in one direction only, the reference
    # alignment is cut at the seed position and scored again
    for reference in expected:
        at_seed = np.where(reference.trace[:, 0] == seed[0])[0]
        seed_row = at_seed[0]
        if direction == "upstream":
            reference.trace = reference.trace[:seed_row + 1]
        elif direction == "downstream":
            reference.trace = reference.trace[seed_row:]
        reference.score = align.score(
            reference, substitution_matrix, gap_penalty
        )

    result = align.align_local_gapped(
        cyclotide_c, cyclotide_f, substitution_matrix, seed, threshold,
        gap_penalty, 1000, direction, score_only
    )

    if score_only:
        # All optimal alignments have the same score,
        # so comparing against the first one is sufficient
        assert result == expected[0].score
        return

    assert len(result) == len(expected)
    for gapped_alignment in result:
        assert gapped_alignment in expected
def sequence_alignment(seq1: str, seq2: str, matrix: str, gap: int,
                       local: bool = False) -> "seq_align.Alignment":
    """
    Perform an optimal pairwise alignment of two protein sequences.

    By default a global alignment (Needleman-Wunsch) is computed;
    a local alignment (Smith-Waterman) is computed if `local` is true.

    Parameters
    ----------
    seq1, seq2 : str
        The sequences to be aligned.
    matrix : str
        Passed to ``matrices()`` to obtain the substitution matrix
        used for scoring.
    gap : int or tuple(int, int)
        If an integer is provided, the value is interpreted as general
        gap penalty.
        If a tuple is provided, an affine gap penalty is used:
        the first integer in the tuple is the gap opening penalty,
        the second integer is the gap extension penalty.
        The values need to be negative.
    local : bool, optional, default=False
        Whether to use local alignment (Smith-Waterman) or global
        (Needleman-Wunsch).

    Returns
    -------
    Alignment
        The first optimal alignment of the two sequences, as returned
        by ``align_optimal()``. This is an alignment object, not a
        plain string.
    """
    # Keep the parameter untouched and bind the resolved matrix separately
    substitution_matrix = matrices(matrix)
    alignments = seq_align.align_optimal(
        seq.ProteinSequence(seq1),
        seq.ProteinSequence(seq2),
        substitution_matrix,
        gap_penalty=gap,
        local=local,
    )
    return alignments[0]
def test_identity():
    """
    Check the sequence identity of a fixed two-sequence alignment for
    each supported calculation mode.
    """
    gapped_strings = ["--HAKLPRDD--WL--", "FRHA--QRTDADWLHH"]
    ungapped_sequences = [
        seq.ProteinSequence(gapped.replace("-", ""))
        for gapped in gapped_strings
    ]
    trace = align.Alignment.trace_from_strings(gapped_strings)
    alignment = align.Alignment(ungapped_sequences, trace, score=None)

    # Expected identity for each mode
    expected_identity = {
        "all": 6 / 16,
        "not_terminal": 6 / 12,
        "shortest": 6 / 10,
    }
    for mode, expected in expected_identity.items():
        assert align.get_sequence_identity(alignment, mode=mode) == expected
def create_consensus(sequences):
    """
    Build the consensus of a list of sequences.

    For each position of the first sequence, the symbol code that
    occurs most often among all sequences at that position is chosen
    (on a tie the lowest code wins). The result is returned as
    ``ProteinSequence``.
    """
    n_positions = len(sequences[0])
    majority_code = np.zeros(n_positions, dtype=int)
    for position in range(n_positions):
        # Symbol codes of all sequences at this position
        column = [sequence.code[position] for sequence in sequences]
        # Occurrences per symbol code; the most frequent one is taken
        occurrences = np.bincount(column)
        majority_code[position] = np.argmax(occurrences)
    # An empty sequence object is filled with the consensus code
    consensus = seq.ProteinSequence()
    consensus.code = majority_code
    return consensus
def test_conversion_to_symbols():
    """
    Check that an alignment created from gapped strings converts back
    to the same strings.
    """
    gapped_strings = [
        "HAKLPRDD--WKL--",
        "HA--PRDDADWKLHH",
        "HA----DDADWKLHH",
    ]
    ungapped_sequences = []
    for gapped in gapped_strings:
        ungapped_sequences.append(seq.ProteinSequence(gapped.replace("-", "")))
    trace = align.Alignment.trace_from_strings(gapped_strings)
    alignment = align.Alignment(ungapped_sequences, trace, score=None)

    def to_gapped_string(symbol_list):
        # `None` marks a gap in the symbol list
        return "".join(
            ["-" if symbol is None else symbol for symbol in symbol_list]
        )

    # Test the conversion back to strings of symbols
    restored_strings = [
        to_gapped_string(symbol_list)
        for symbol_list in align.get_symbols(alignment)
    ]
    assert restored_strings == gapped_strings
def test_protein(use_custom_matrix):
    """
    Mask the repeats of a protein sequence and compare the mask with a
    known example, in which the masked region is written in lower case.
    """
    seq_string = "MAPKINASekinasekinase"
    sequence = seq.ProteinSequence(seq_string)

    matrix = (
        align.SubstitutionMatrix.std_protein_matrix()
        if use_custom_matrix
        else None
    )
    observed_mask = TantanApp.mask_repeats(sequence, matrix)

    # Lower case letters mark the expected masked positions
    expected_mask = [letter.islower() for letter in seq_string]

    assert len(observed_mask) == len(expected_mask)
    assert np.all(observed_mask.tolist() == expected_mask)
def test_create_spaced_kmers(kmer_alphabet, spaced_kmer_alphabet, seed):
    """
    Test :meth:`create_kmers()` for creating spaced *k-mers*.

    The spacing model in use is equivalent to non-spaced *k-mers*.
    Hence, for a random sequence the result must be equal to the result
    of :meth:`create_kmers()` without spacing.
    """
    MIN_LENGTH = 10
    MAX_LENGTH = 1000

    np.random.seed(seed)
    sequence = seq.ProteinSequence()
    # The sequence length is drawn first, then the symbol codes
    random_length = np.random.randint(MIN_LENGTH, MAX_LENGTH)
    sequence.code = np.random.randint(
        len(sequence.alphabet), size=random_length
    )

    expected_kmers = kmer_alphabet.create_kmers(sequence.code)
    observed_kmers = spaced_kmer_alphabet.create_kmers(sequence.code)

    assert len(observed_kmers) == len(expected_kmers)
    assert observed_kmers.tolist() == expected_kmers.tolist()
mol = info.residue(residue)

# First series of frames: rotate_residue() is called with 0 as second
# argument, the angle sweeps from -30 to 30 and back to 0.
# The angles are multiplied with pi/180, i.e. they are given in degrees.
thetas = np.linspace(-30, 30, 60)
thetas = np.append(thetas, np.linspace(30, 0, 30))
for i, theta in enumerate(thetas):
    mol_new = rotate_residue(mol, 0, theta * np.pi / 180)
    plot(mol_new, save_as=f"./plots/res_flex/{i}.png", show=False)

# Second series of frames: rotate_residue() is called with 1 as second
# argument, the angle sweeps from 0 to 30 and then down to -30.
thetas = np.linspace(0, 30, 60)
thetas = np.append(thetas, np.linspace(30, -30, 30))
# NOTE(review): `i` still holds the last frame index of the loop above,
# so the first frame written here reuses the file name of that last
# frame - confirm that this overlap is intended
for j, theta in enumerate(thetas):
    mol_new = rotate_residue(mol, 1, theta * np.pi / 180)
    plot(mol_new, save_as=f"./plots/res_flex/{i+j}.png", show=False)

# --- create directory ---
pth = "./data/psi4files/peptides"
os.makedirs(pth, exist_ok=True)

# Write one file per peptide length; each peptide consists of
# threonine ("T") residues only
n_res = [5, 10, 20, 30, 50, 100]
for n_residues in n_res:
    seq_str = "T" * n_residues
    sequence = seq.ProteinSequence(seq_str)
    peptide = assemble_peptide(sequence)
    pep_string = mkpsi4(peptide)
    # The context manager closes the file even if writing fails
    with open(f"{pth}/{seq_str}.txt", "w") as f:
        f.write(pep_string)
# Gene is surrounded by square brackets gene = line[gene_start : gene_end+1] \ .replace("[","").replace("]","") # Sometimes alternative gene names are separated via a # semicolon -> Choose the first gene name gene = gene.split(";")[0].strip() genes.append(gene) ids.append(ncbi_id) # Download sequences a file-like object and read the sequences from it fasta_file = fasta.FastaFile.read( entrez.fetch_single_file(ids, file_name=None, db_name="protein", ret_type="fasta")) sequences = [seq.ProteinSequence(seq_str) for seq_str in fasta_file.values()] # Create multiple sequence alignment with Clustal Omega alignment = clustalo.ClustalOmegaApp.align(sequences) # The distance measure required for the tree calculation is the # percentage of non-identical amino acids in the respective two # sequences distances = 1 - align.get_pairwise_sequence_identity(alignment, mode="shortest") # Create tree via neighbor joining tree = phylo.neighbor_joining(distances) # Convert to NetworkX graph #For the graph visualization, the edge directions are unnecessary graph = tree.as_graph().to_undirected() fig = plt.figure(figsize=(8.0, 8.0))
import matplotlib.pyplot as plt
import biotite
import biotite.sequence as seq
import biotite.sequence.align as align
import biotite.sequence.io.fasta as fasta
import biotite.database.entrez as entrez
import biotite.sequence.graphics as graphics

# Download and parse protein sequences of avidin and streptavidin
file_name = entrez.fetch_single_file(["CAC34569", "ACL82594"],
                                     biotite.temp_file("sequences.fasta"),
                                     "protein", "fasta")
file = fasta.FastaFile.read(file_name)
# Assign each entry to its protein via the accession in the header
# NOTE(review): the variables stay undefined if an accession is not
# found in any header
for name, sequence in file.items():
    if "CAC34569" in name:
        avidin_seq = seq.ProteinSequence(sequence)
    elif "ACL82594" in name:
        streptavidin_seq = seq.ProteinSequence(sequence)
# Get BLOSUM62 matrix
matrix = align.SubstitutionMatrix.std_protein_matrix()
# Perform pairwise sequence alignment with affine gap penalty
# Terminal gaps are not penalized
alignments = align.align_optimal(avidin_seq, streptavidin_seq, matrix,
                                 gap_penalty=(-10, -1), terminal_penalty=False)
# Draw first and only alignment
# The color intensity indicates the similarity
fig = plt.figure(figsize=(8.0, 2.5))
ax = fig.add_subplot(111)
import biotite.database.entrez as entrez # Search for protein products of LexA gene in UniProtKB/Swiss-Prot database query = entrez.SimpleQuery("luxA", "Gene Name") \ & entrez.SimpleQuery("srcdb_swiss-prot", "Properties") uids = entrez.search(query, db_name="protein") fasta_file = fasta.FastaFile.read( entrez.fetch_single_file(uids, None, db_name="protein", ret_type="fasta")) ids = [] sequences = [] for header, seq_str in fasta_file.items(): # Extract the UniProt Entry name from header identifier = header.split("|")[-1].split()[0] ids.append(identifier) sequences.append(seq.ProteinSequence(seq_str)) matrix = align.SubstitutionMatrix.std_protein_matrix() alignment, order, tree, distances = align.align_multiple( sequences, matrix, gap_penalty=(-10, -1), terminal_penalty=False) # Order alignment according to the guide tree alignment = alignment[:, order] ids = [ids[i] for i in order] fig = plt.figure(figsize=(8.0, 20.0)) ax = fig.add_subplot(111) graphics.plot_alignment_type_based(ax, alignment, labels=ids, show_numbers=True, spacing=2.0)
# :func:`NucleotideSequence.complement()` method. # Lower case characters are automatically capitalized seq1 = seq.NucleotideSequence("tacagtt") print("Original:", seq1) seq2 = seq1.reverse().complement() print("Reverse complement:", seq2) ######################################################################## # The other :class:`Sequence` type is :class:`ProteinSequence`. # It supports the letters for the 20 standard amino acids plus some # letters for ambiguous amino acids and a letter for a stop signal. # Furthermore, this class provides some utilities like # 3-letter to 1-letter translation (and vice versa). prot_seq = seq.ProteinSequence("BIQTITE") print("-".join( [seq.ProteinSequence.convert_letter_1to3(symbol) for symbol in prot_seq])) ######################################################################## # A :class:`NucleotideSequence` can be translated into a # :class:`ProteinSequence` via the # :func:`NucleotideSequence.translate()` method. # By default, the method searches for open reading frames (ORFs) in the # 3 frames of the sequence. # A 6 frame ORF search requires an # additional call of :func:`NucleotideSequence.translate()` with the # reverse complement of the sequence. # If you want to conduct a complete translation of the sequence, # irrespective of any start and stop codons, set the parameter # :obj:`complete` to true.
def sequences():
    """
    Return a list of protein sequences created from four fixed strings.
    """
    sequence_strings = ("BIQTITE", "TITANITE", "BISMITE", "IQLITE")
    protein_sequences = []
    for sequence_string in sequence_strings:
        protein_sequences.append(seq.ProteinSequence(sequence_string))
    return protein_sequences
def test_stop_removal():
    """
    Check that ``remove_stops()`` drops every stop symbol (``*``) and
    leaves the remaining symbols untouched.
    """
    with_stops = "LYG*GR*"
    without_stops = with_stops.replace("*", "")
    stripped = seq.ProteinSequence(with_stops).remove_stops()
    assert str(stripped) == without_stops
# :class:`Application` classes in depth. # # Finding homologous sequences with BLAST # --------------------------------------- # # .. currentmodule:: biotite.application.blast # # the :mod:`biotite.application.blast` subpackage provides an # interface to NCBI BLAST: the :class:`BlastWebApp` class. # Let's dive directly into the code, we try to find # homologous sequences to the miniprotein *TC5b*: import biotite.application.blast as blast import biotite.sequence as seq tc5b_seq = seq.ProteinSequence("NLYIQWLKDGGPSSGRPPPS") app = blast.BlastWebApp("blastp", tc5b_seq) app.start() app.join() alignments = app.get_alignments() best_ali = alignments[0] print(best_ali) print() print("HSP position in query: ", best_ali.query_interval) print("HSP position in hit: ", best_ali.hit_interval) print("Score: ", best_ali.score) print("E-value: ", best_ali.e_value) print("Hit UID: ", best_ali.hit_id) print("Hit name: ", best_ali.hit_definition) ########################################################################
"R": -4.5 } # Look for the Swiss-Prot entry contaning the human HCN1 channel query = entrez.SimpleQuery("HCN1", "Gene Name") \ & entrez.SimpleQuery("h**o sapiens", "Organism") \ & entrez.SimpleQuery("srcdb_swiss-prot", "Properties") uids = entrez.search(query, db_name="protein") file_name = entrez.fetch(uids[0], biotite.temp_dir(), "gp", db_name="protein", ret_type="gp") gp_file = gb.GenBankFile.read(file_name) hcn1 = seq.ProteinSequence(gb.get_sequence(gp_file, format="gp")) print(hcn1) ######################################################################## # The positional hydropathy is calculated and smoothened using # a moving average for clearer visualization. hydropathies = np.array([hydropathy_dict[symbol] for symbol in hcn1]) def moving_average(data_set, window_size): weights = np.full(window_size, 1 / window_size) return np.convolve(data_set, weights, mode='valid') # Apply moving average over 15 amino acids for clearer visualization
# :func:`NucleotideSequence.complement()` method. # Lower case characters are automatically capitalized seq1 = seq.NucleotideSequence("tacagtt") print("Original:", seq1) seq2 = seq1.reverse().complement() print("Reverse complement:", seq2) ######################################################################## # The other :class:`Sequence` type is :class:`ProteinSequence`. # It supports the letters for the 20 standard amino acids plus some # letters for ambiguous amino acids and a letter for a stop signal. # Furthermore, this class provides some utilities like # 3-letter to 1-letter translation (and vice versa). prot_seq = seq.ProteinSequence("BIQTITE") print("-".join( [seq.ProteinSequence.convert_letter_1to3(symbol) for symbol in prot_seq])) ######################################################################## # A :class:`NucleotideSequence` can be translated into a # :class:`ProteinSequence` via the # :func:`NucleotideSequence.translate()` method. # By default, the method searches for open reading frames (ORFs) in the # 3 frames of the sequence. # A 6-frame ORF search requires an # additional call of :func:`NucleotideSequence.translate()` with the # reverse complement of the sequence. # If you want to conduct a complete 1-frame translation of the sequence, # irrespective of any start and stop codons, set the parameter # :obj:`complete` to true.
annotation = gb.get_annotation(gb_file) # Find ID of strain in 'source' feature strain = None for feature in annotation: if feature.key == "source": strain = int(feature.qual["strain"]) assert strain is not None # Find corresponding protein sequence in 'CDS' feature sequence = None for feature in annotation: if feature.key == "CDS": sequence = seq.ProteinSequence( # Remove whitespace in sequence # resulting from line breaks feature.qual["translation"].replace(" ", "") ) assert sequence is not None sequences[strain] = sequence # None of the THCA synthase variants have an insertion or deletion # -> each one should have the same sequence length seq_len = len(list(sequences.values())[0]) for sequence in sequences.values(): assert len(sequence) == seq_len # Create consensus sequences for the drug-type and fiber-type cannabis # strains
disulfide_bonds = detect_disulfide_bonds(knottin) for sg1_index, sg2_index in disulfide_bonds: print(knottin[sg1_index]) print(knottin[sg2_index]) print() ######################################################################## # The found disulfide bonds are visualized with the help of # *Matplotlib*: # The amino acid sequence is written on the X-axis and the disulfide # bonds are depicted by yellow semi-ellipses. # Create a sequence object for each residue in the structure # As we want each residue to appear only once in the sequence, # we choose an atom that appears in each residue once: the CA sequence = seq.ProteinSequence(knottin.res_name[knottin.atom_name == "CA"]) figure = plt.figure(figsize=(4.0, 1.0)) ax = figure.gca() MARGIN = 0.2 ax.set_xlim(1-MARGIN, len(sequence)+MARGIN) ax.set_ylim(0, 1+MARGIN) ax.set_xticks(np.arange(1, len(sequence)+1)) ax.set_xticklabels(str(sequence)) ax.yaxis.set_tick_params( left=False, right=False, labelleft=False, labelright=False ) ax.xaxis.set_tick_params( bottom=True, top=False, labelbottom=True, labeltop=False, width=0 ) ax.set_frame_on(False) for sg1_index, sg2_index in disulfide_bonds:
import biotite.sequence as seq
import biotite.application.blast as blast
import numpy as np
from requests.exceptions import ConnectionError
import pytest
import os.path
from ..util import data_dir, cannot_connect_to


BLAST_URL = "https://blast.ncbi.nlm.nih.gov/Blast.cgi"

# Start of E. coli lacZ ORF (UID: AJ308295)
dna_seq = seq.NucleotideSequence("ATGACCATGATTACGCCAAGCTTTCCGGGGAATTCA")

# Start of E. coli lacZ, translated dna_seq (UID: AJ308295)
prot_seq = seq.ProteinSequence("MTMITPSFPGNS")


@pytest.mark.skipif(cannot_connect_to(BLAST_URL),
                    reason="NCBI BLAST is not available")
def test_blastn():
    """
    Run a ``blastn`` search for `dna_seq` via :class:`BlastWebApp` and
    check that both sequences of the first returned alignment are equal
    to the query.

    The test is skipped if `BLAST_URL` cannot be reached.
    """
    app = blast.BlastWebApp("blastn", dna_seq, obey_rules=False)
    app.set_max_expect_value(100)
    app.start()
    # Wait for the result with a timeout of 300 (presumably seconds)
    app.join(timeout=300)
    alignments = app.get_alignments()
    # BLAST should find original sequence as best hit
    assert dna_seq == alignments[0].sequences[0]
    assert dna_seq == alignments[0].sequences[1]