def needle(reference, query, gap_open=-15, gap_extend=0,
           matrix=submat.DNA_SIMPLE):
    '''Globally align a query against a reference (Needleman-Wunsch).

    :param reference: Reference sequence.
    :type reference: coral.DNA
    :param query: Sequence to align against the reference.
    :type query: coral.DNA
    :param gap_open: Penalty for opening a gap.
    :type gap_open: float
    :param gap_extend: Penalty for extending a gap.
    :type gap_extend: float
    :param matrix: Substitution matrix object; its ``matrix`` and
                   ``alphabet`` attributes are handed to the aligner and
                   the scorer.
    :returns: (aligned reference, aligned query, score)
    :rtype: tuple of two coral.DNA instances and the alignment score

    '''
    substitution = matrix.matrix
    alphabet = matrix.alphabet

    # Run the aligner in its 'global_cfe' mode on plain strings
    gapped_pair = aligner(str(reference), str(query), gap_open=gap_open,
                          gap_extend=gap_extend, method='global_cfe',
                          matrix=substitution, alphabet=alphabet)
    gapped_ref, gapped_query = gapped_pair

    # Score the gapped pair with the same penalties and matrix
    score = score_alignment(gapped_ref, gapped_query, gap_open, gap_extend,
                            substitution, alphabet)

    return cr.DNA(gapped_ref), cr.DNA(gapped_query), score
def needle_msa(reference, results, gap_open=-15, gap_extend=0,
               matrix=submat.DNA_SIMPLE):
    '''Create a multiple sequence alignment based on aligning every result
    sequence against the reference, then inserting gaps until every aligned
    reference is identical.

    :param reference: Sequence every result is aligned against (passed to
                      needle).
    :param results: Iterable of sequences to align against the reference.
    :param gap_open: Gap-open penalty forwarded to needle.
    :param gap_extend: Gap-extension penalty forwarded to needle.
    :param matrix: Substitution matrix forwarded to needle.
    :returns: List whose first element is the gapped reference taken from
              the first pairwise alignment, followed by one gapped sequence
              per result, in input order.
    :rtype: list of cr.DNA

    '''
    gap = '-'
    # Pairwise-align each result, keeping mutable [reference, result, score]
    # entries as plain strings so gaps can be spliced in below
    alignments = []
    for result in results:
        ref_dna, res_dna, score = needle(reference, result, gap_open=gap_open,
                                         gap_extend=gap_extend,
                                         matrix=matrix)
        alignments.append([str(ref_dna), str(res_dna), score])

    def insert_gap(sequence, position):
        # Return a copy of sequence with one gap character placed before
        # index `position`
        return sequence[:position] + gap + sequence[position:]

    i = 0
    while True:
        # Look at column i of every aligned reference
        refs = [alignment[0][i] for alignment in alignments]

        # If only some references have a gap in this column, insert a gap at
        # the same column into every pair that lacks one (both the reference
        # and its result, so the pair stays in register)
        gaps = [ref == gap for ref in refs]
        if any(gaps) and not all(gaps):
            for alignment in alignments:
                if alignment[0][i] != gap:
                    alignment[0] = insert_gap(alignment[0], i)
                    alignment[1] = insert_gap(alignment[1], i)

        # Once all aligned references are the same string, we're done
        alignment_set = set(alignment[0] for alignment in alignments)
        if len(alignment_set) == 1:
            break

        # Any pair whose reference ends at this column gets a trailing gap so
        # that the next column can still be indexed
        lens = [len(alignment[0]) for alignment in alignments]
        if i + 1 in lens:
            for alignment in alignments:
                if len(alignment[0]) == i + 1:
                    alignment[0] = alignment[0] + gap
                    alignment[1] = alignment[1] + gap

        i += 1

        # NOTE(review): hard stop once i exceeds 20 - references that still
        # differ beyond this column are returned as they are. This looks like
        # a leftover debugging guard; confirm before relying on alignments
        # longer than ~20 columns.
        if i > 20:
            break

    # Output format: gapped reference first, then each gapped result
    output_alignment = [cr.DNA(alignments[0][0])]
    for alignment in alignments:
        output_alignment.append(cr.DNA(alignment[1]))

    return output_alignment
def test_overhang(self):
    '''Primers designed with overhangs should yield an amplicon flanked by
    those overhangs.'''
    # Amplify an internal region so the primers anneal inside the template
    region = self.template[30:-30]
    overhang_fwd = cr.DNA('AGCGGGGGGGGGCTGGGGCTGAT')
    overhang_rev = cr.DNA('GGGTGGGGGGGGGGGGGGG')

    primer_fwd = cr.design.primer(region, overhang=overhang_fwd)
    antisense = region.reverse_complement()
    primer_rev = cr.design.primer(antisense, overhang=overhang_rev)

    # The reverse overhang appears reverse-complemented on the top strand
    expected = overhang_fwd + region + overhang_rev.reverse_complement()
    self.pcr_equal(expected, self.template, primer_fwd, primer_rev)
def __init__(self):
    '''Set up a linear template from part BBa_R0010 (pLac promoter).'''
    bba_r0010_chunks = [
        'caatacgcaaaccgcctctccccgcgcgttggccgattcattaatgcag',
        'ctggcacgacaggtttcccgactggaaagcgggcagtgagcgcaacgca',
        'attaatgtgagttagctcactcattaggcaccccaggctttacacttta',
        'tgcttccggctcgtatgttgtgtggaattgtgagcggataacaatttca',
        'caca',
    ]
    bba_r0010 = ''.join(bba_r0010_chunks)
    self.template = cr.DNA(bba_r0010, circular=False)
def random_dna(n):
    '''Build a random DNA sequence.

    :param n: Output sequence length.
    :type n: int
    :returns: Random DNA sequence of length n.
    :rtype: coral.DNA

    '''
    bases = []
    # One independent draw from the four bases per position
    for _ in range(n):
        bases.append(random.choice('ATGC'))
    return coral.DNA(''.join(bases))
def count_end_gaps(seq):
    '''Count the consecutive gap characters at the start of seq.

    :param seq: Sequence to inspect; iterated base by base.
    :returns: Number of gap bases seen before the first non-gap base.
    :rtype: int

    '''
    gap = coral.DNA('-')
    leading = 0
    for base in seq:
        # Stop at the first base that is not a gap
        if not base == gap:
            return leading
        leading += 1
    return leading
def test_primer_larger_than_template():
    '''PCR should work when a primer's overhang is longer than the
    template itself.'''
    template = cr.design.random_dna(50)
    long_overhang = cr.design.random_dna(200)
    no_overhang = cr.DNA('')
    expected = long_overhang + template

    fwd, rev = cr.design.primers(template,
                                 overhangs=[long_overhang, no_overhang],
                                 min_len=14)
    amplicon = cr.reaction.pcr(template, fwd, rev)
    assert_true(expected == amplicon)
def MAFFT(sequences, gap_open=1.53, gap_extension=0.0, retree=2):
    '''A Coral wrapper for the MAFFT command line multiple sequence aligner.

    The sequences are written as FASTA into a temporary directory, ``mafft``
    is run there, and the directory is removed afterwards (also on error).
    MAFFT's stderr output is discarded.

    :param sequences: A list of sequences to align.
    :type sequences: List of homogeneous sequences (all DNA, or all RNA,
                     etc.)
    :param gap_open: --op (gap open) penalty in MAFFT cli.
    :type gap_open: float
    :param gap_extension: --ep (gap extension) penalty in MAFFT cli.
    :type gap_extension: float
    :param retree: Number of times to build the guide tree.
    :type retree: int
    :returns: The aligned sequences parsed from MAFFT's FASTA output, in
              output order. Record names are not carried over.
    :rtype: list of coral.DNA

    '''
    arguments = ['mafft']
    arguments += ['--op', str(gap_open)]
    arguments += ['--ep', str(gap_extension)]
    arguments += ['--retree', str(retree)]
    # Relative name: the process is started with cwd=tempdir
    arguments.append('input.fasta')
    tempdir = tempfile.mkdtemp()
    try:
        with open(os.path.join(tempdir, 'input.fasta'), 'w') as f:
            for i, sequence in enumerate(sequences):
                if hasattr(sequence, 'name'):
                    name = sequence.name
                else:
                    name = 'sequence{}'.format(i)
                f.write('>{}\n'.format(name))
                f.write(str(sequence) + '\n')
        # Open the null device in a context manager so the handle is closed
        # once MAFFT has finished, instead of being leaked
        with open(os.devnull, 'w') as devnull:
            process = subprocess.Popen(arguments, stdout=subprocess.PIPE,
                                       stderr=devnull, cwd=tempdir)
            stdout = process.communicate()[0]
    finally:
        shutil.rmtree(tempdir)

    # Split the FASTA output on '>'; the piece before the first '>' is not a
    # record and is dropped
    records = stdout.split('>')[1:]
    aligned_list = []
    for record in records:
        lines = record.split('\n')
        # lines[0] is the record name; the rest is the wrapped sequence
        aligned_list.append(coral.DNA(''.join(lines[1:])))

    return aligned_list
def read_dna(path):
    '''Read DNA from file. Uses BioPython and coerces to coral format.

    The file format is chosen from the file extension: .gb/.ape (genbank),
    .fasta/.fa/.fsa/.seq (fasta) or .abi/.ab1 (abi).

    :param path: Full path to input file.
    :type path: str
    :returns: DNA sequence.
    :rtype: coral.DNA
    :raises: ValueError if the file extension is not recognized.

    '''
    filename, ext = os.path.splitext(os.path.split(path)[-1])

    genbank_exts = ['.gb', '.ape']
    fasta_exts = ['.fasta', '.fa', '.fsa', '.seq']
    abi_exts = ['.abi', '.ab1']

    if ext in genbank_exts:
        file_format = 'genbank'
    elif ext in fasta_exts:
        file_format = 'fasta'
    elif ext in abi_exts:
        file_format = 'abi'
    else:
        raise ValueError('File format not recognized.')

    seq = SeqIO.read(path, file_format)
    dna = coral.DNA(str(seq.seq))
    # A record name of '.' is replaced by the file's base name
    if seq.name == '.':
        dna.name = filename
    else:
        dna.name = seq.name

    # Features - those that raise FeatureNameError on conversion are skipped
    for feature in seq.features:
        try:
            dna.features.append(_seqfeature_to_coral(feature))
        except FeatureNameError:
            pass
    dna.features = sorted(dna.features, key=lambda feature: feature.start)

    # Used to use data_file_division, but it's inconsistent (not always the
    # molecule type). Instead, look for the word 'circular' on the first line
    # only - scanning the whole file would also match the word in later
    # comments or annotations.
    dna.topology = 'linear'
    with open(path) as f:
        first_line = f.readline().split()
    if 'circular' in first_line:
        dna.topology = 'circular'

    return dna
def write_map(self, path):
    '''Write a genbank map of the template with each overlap annotated as
    a feature.

    :param path: full path to .gb file to write.
    :type path: str

    '''
    features = []
    for i, index in enumerate(self.overlap_indices):
        # Each overlap spans from its start index over its own length
        start = index[0]
        stop = start + len(self.overlaps[i])
        label = 'overlap {}'.format(i + 1)
        features.append(coral.Feature(label, start, stop, 'misc', strand=0))

    seq_map = coral.DNA(self.template, features=features)
    coral.seqio.write_dna(seq_map, path)
def get_yeast_sequence(chromosome, start, end, reverse_complement=False):
    '''Acquire a sequence from SGD http://www.yeastgenome.org

    :param chromosome: Yeast chromosome.
    :type chromosome: int
    :param start: A biostart.
    :type start: int
    :param end: A bioend.
    :type end: int
    :param reverse_complement: Get the reverse complement.
    :type reverse_complement: bool
    :returns: A DNA sequence (empty when start equals end).
    :rtype: coral.DNA

    '''
    import requests

    # An empty interval needs no web request
    if start == end:
        return coral.DNA('')

    rev_option = '-REV' if reverse_complement else ''
    param_url = ''.join(['&chr=', str(chromosome),
                         '&beg=', str(start),
                         '&end=', str(end),
                         '&rev=', rev_option])
    url = 'http://www.yeastgenome.org/cgi-bin/getSeq?map=a2map' + param_url

    # SGD offers no raw-sequence endpoint for this, so the HTML page is
    # parsed instead: the sequence sits between <pre> and </pre>.
    page = requests.get(url).text
    open_tag = '<pre>'
    begin_index = page.index(open_tag) + len(open_tag)
    end_index = page.index('</pre>')

    # Strip the line breaks used to wrap the sequence
    sequence = page[begin_index:end_index]
    sequence = sequence.replace('\n', '').replace('\r', '')
    return coral.DNA(sequence)
def get_yeast_promoter_ypa(gene_name):
    '''Retrieve promoter from Yeast Promoter Atlas
    (http://ypa.csbb.ntu.edu.tw).

    :param gene_name: Common name for yeast gene.
    :type gene_name: str
    :returns: Double-stranded DNA sequence of the promoter.
    :rtype: coral.DNA

    '''
    import requests

    loc = get_yeast_gene_location(gene_name)
    gid = get_gene_id(gene_name)

    ypa_baseurl = 'http://ypa.csbb.ntu.edu.tw/do'
    params = {'act': 'download',
              'nucle': 'InVitro',
              'right': str(loc[2]),
              'left': str(loc[1]),
              'gene': str(gid),
              'chr': str(loc[0])}
    fasta = requests.get(ypa_baseurl, params=params).text

    # FASTA records are name-sequence pairs introduced by '>', e.g.
    # >my_dna_name
    # GACGATA
    # The text before the first '>' is not a record.
    # TODO: most of this is redundant, as we just want the 2nd record
    records = []
    for chunk in fasta.split('>')[1:]:
        lines = chunk.split('\n')
        record = coral.DNA(''.join(lines[1:]))
        record.name = lines[0]
        records.append(record)

    # The promoter is the second record of the download
    return records[1]
def get_yeast_sequence(chromosome, start, end, reverse_complement=False):
    """Acquire a sequence from SGD http://www.yeastgenome.org

    :param chromosome: Yeast chromosome.
    :type chromosome: int
    :param start: A biostart.
    :type start: int
    :param end: A bioend.
    :type end: int
    :param reverse_complement: Get the reverse complement.
    :type reverse_complement: bool
    :returns: A DNA sequence (empty when start equals end).
    :rtype: coral.DNA

    """
    # An empty interval needs no web request
    if start == end:
        return coral.DNA("")

    rev_option = "-REV" if reverse_complement else ""
    param_url = "".join(["&chr=", str(chromosome),
                         "&beg=", str(start),
                         "&end=", str(end),
                         "&rev=", rev_option])
    url = "http://www.yeastgenome.org/cgi-bin/getSeq?map=a2map" + param_url

    # SGD offers no raw-sequence endpoint for this, so the HTML page is
    # parsed instead: the sequence sits between <pre> and </pre>.
    page = requests.get(url).text
    open_tag = "<pre>"
    begin_index = page.index(open_tag) + len(open_tag)
    end_index = page.index("</pre>")

    # Strip the line breaks used to wrap the sequence
    sequence = page[begin_index:end_index]
    sequence = sequence.replace("\n", "").replace("\r", "")
    return coral.DNA(sequence)
def get_yeast_promoter_ypa(gene_name):
    """Retrieve promoter from Yeast Promoter Atlas
    (http://ypa.csbb.ntu.edu.tw).

    :param gene_name: Common name for yeast gene.
    :type gene_name: str
    :returns: Double-stranded DNA sequence of the promoter.
    :rtype: coral.DNA

    """
    loc = get_yeast_gene_location(gene_name)
    gid = get_gene_id(gene_name)

    ypa_baseurl = "http://ypa.csbb.ntu.edu.tw/do"
    params = {"act": "download",
              "nucle": "InVitro",
              "right": str(loc[2]),
              "left": str(loc[1]),
              "gene": str(gid),
              "chr": str(loc[0])}
    fasta = requests.get(ypa_baseurl, params=params).text

    # FASTA records are name-sequence pairs introduced by '>', e.g.
    # >my_dna_name
    # GACGATA
    # The text before the first '>' is not a record.
    # TODO: most of this is redundant, as we just want the 2nd record
    records = []
    for chunk in fasta.split(">")[1:]:
        lines = chunk.split("\n")
        record = coral.DNA("".join(lines[1:]))
        record.name = lines[0]
        records.append(record)

    # The promoter is the second record of the download
    return records[1]
except urllib2.HTTPError, e: print 'HTTP Error: {} {}'.format(e.code, url) print 'Falling back on default enzyme list' self._enzyme_dict = coral.constants.fallback_enzymes except urllib2.URLError, e: print 'URL Error: {} {}'.format(e.reason, url) print 'Falling back on default enzyme list' self._enzyme_dict = coral.constants.fallback_enzymes # Process into RestrictionSite objects? (depends on speed) print 'Processing into RestrictionSite instances.' self.restriction_sites = {} # TODO: make sure all names are unique for key, (site, cuts) in self._enzyme_dict.iteritems(): # Make a site try: r = coral.RestrictionSite(coral.DNA(site), cuts, name=key) # Add it to dict with name as key self.restriction_sites[key] = r except ValueError: # Encountered ambiguous sequence, have to ignore it until # coral.DNA can handle ambiguous DNA pass def get(self, name): '''Retrieve enzyme by name. :param name: Name of the restriction enzyme, e.g. EcoRV. :type name: str :returns: Restriction site matching the input name. :rtype: coral.RestrictionSite :raises: Exception when enzyme is not found in the database.
def __init__(self):
    '''Set up short DNA fixtures, their transcripts and a NUPACK
    instance.'''
    sequences = ('GATACTAGCG', 'TACGATT', 'GATACG')
    self.dnas = [cr.DNA(sequence) for sequence in sequences]
    self.rnas = [dna.transcribe() for dna in self.dnas]
    self.nupack = cr.analysis.NUPACK()
def convert_sequence(seq, to_material):
    '''Convert a sequence between DNA, RNA and peptide forms.

    The following conversions are supported:
        Transcription (seq is DNA, to_material is 'rna')
        Reverse transcription (seq is RNA, to_material is 'dna')
        Translation (seq is RNA, to_material is 'peptide')

    Translation reads codons from the first base and stops at the first
    stop codon; trailing bases that do not fill a codon are ignored.

    :param seq: DNA or RNA sequence.
    :type seq: coral.DNA or coral.RNA
    :param to_material: material to which to convert ('rna', 'dna', or
                        'peptide').
    :type to_material: str
    :returns: sequence of type coral.sequence.[material type]
    :raises: ValueError if gapped DNA is transcribed, or if the requested
             conversion is not one of the three listed above.

    '''
    if isinstance(seq, coral.DNA) and to_material == 'rna':
        # Transcribe. Can't transcribe a gap.
        if '-' in seq:
            raise ValueError('Cannot transcribe gapped DNA')
        # Map DNA chars to RNA chars position by position. The final DNA
        # alphabet symbol is left out of the mapping - presumably the gap
        # character; confirm against ALPHABETS.
        origin = ALPHABETS['dna'][:-1]
        destination = ALPHABETS['rna']
        code = dict(zip(origin, destination))
        # Characters without a mapping pass through unchanged
        converted = ''.join([code.get(str(k), str(k)) for k in seq])
        return coral.RNA(converted)

    if isinstance(seq, coral.RNA) and to_material == 'dna':
        # Reverse transcribe: the same mapping in the other direction
        origin = ALPHABETS['rna']
        destination = ALPHABETS['dna'][:-1]
        code = dict(zip(origin, destination))
        converted = ''.join([code.get(str(k), str(k)) for k in seq])
        return coral.DNA(converted)

    if isinstance(seq, coral.RNA) and to_material == 'peptide':
        # Translate: walk the sequence one whole codon at a time by index
        # (avoids repeatedly popping from the front of a list)
        rna = str(seq)
        residues = []
        for start in range(0, len(rna) - 2, 3):
            codon = rna[start:start + 3].upper()
            amino_acid = CODONS[codon]
            # Stop when stop codon is found
            if amino_acid == '*':
                break
            residues.append(amino_acid)
        return coral.Peptide(''.join(residues))

    # Every other combination is unsupported. Raising here covers the pairs
    # that previously fell through to the return with no result bound.
    msg1 = 'Conversion from '
    msg2 = '{0} to {1} is not supported.'.format(seq.__class__.__name__,
                                                 to_material)
    raise ValueError(msg1 + msg2)
def fetch_yeast_locus_sequence(locus_name, flanking_size=0):
    '''Acquire a sequence from SGD http://www.yeastgenome.org.

    :param locus_name: Common name or systematic name for the locus (e.g.
                       ACT1 or YFL039C).
    :type locus_name: str
    :param flanking_size: The length of flanking DNA (on each side) to
                          return. 0 requests the locus sequence itself.
    :type flanking_size: int
    :returns: Sequence taken from the first row of the query result. For
              any other flanking_size (e.g. a negative one) a message is
              printed and an empty sequence is returned.
    :rtype: coral.DNA

    '''
    from intermine.webservice import Service

    yeastmine = Service('http://yeastmine.yeastgenome.org/yeastmine/service')
    # All queries are made on the Gene class (table)
    query = yeastmine.new_query('Gene')

    if flanking_size > 0:
        # Output columns:
        # secondaryIdentifier: the systematic name (e.g. YFL039C)
        # symbol: short name (e.g. ACT1)
        # length: sequence length
        # flankingRegions.direction: Upstream or downstream (or both)
        # flankingRegions.sequence.length: length of the flanking regions
        # flankingRegions.sequence.residues: sequence of the flanking regions
        flank_columns = ('secondaryIdentifier',
                         'symbol',
                         'length',
                         'flankingRegions.direction',
                         'flankingRegions.sequence.length',
                         'flankingRegions.sequence.residues')
        query.add_view(*flank_columns)

        query.add_constraint('flankingRegions.direction', '=', 'both',
                             code='A')
        query.add_constraint('Gene', 'LOOKUP', locus_name, 'S. cerevisiae',
                             code='B')
        # The distance constraint is expressed in kb with one decimal
        distance = '{:.1f}kb'.format(flanking_size / 1000.)
        query.add_constraint('flankingRegions.distance', '=', distance,
                             code='C')
        query.set_logic('A and B and C')

        # TODO: What to do when there's more than one result?
        # FIXME: report the row's metadata through the logging module
        first_row = next(query.rows())
        # TODO: add more metadata
        return coral.DNA(first_row['flankingRegions.sequence.residues'])

    if flanking_size == 0:
        locus_columns = ('primaryIdentifier',
                         'secondaryIdentifier',
                         'symbol',
                         'name',
                         'sgdAlias',
                         'organism.shortName',
                         'sequence.length',
                         'sequence.residues',
                         'description',
                         'qualifier')
        query.add_view(*locus_columns)

        query.add_constraint('status', 'IS NULL', code='D')
        query.add_constraint('status', '=', 'Active', code='C')
        query.add_constraint('qualifier', 'IS NULL', code='B')
        query.add_constraint('qualifier', '!=', 'Dubious', code='A')
        query.add_constraint('Gene', 'LOOKUP', locus_name, 'S. cerevisiae',
                             code='E')
        # Constraint codes are combined with this logic
        query.set_logic('(A or B) and (C or D) and E')

        first_row = next(query.rows())
        return coral.DNA(first_row['sequence.residues'])

    print('Problem with the flanking region size....')
    return coral.DNA('')
def primer(dna, tm=65, min_len=10, tm_undershoot=1, tm_overshoot=3,
           end_gc=False, tm_parameters='cloning', overhang=None,
           structure=False):
    '''Design primer to a nearest-neighbor Tm setpoint.

    Candidates are prefixes of the first 90 bases of ``dna``, from min_len
    bases up to one base short of that region. They are generated until the
    Tm exceeds tm + tm_overshoot; the candidate whose Tm is closest to
    ``tm`` is returned.

    :param dna: Sequence for which to design a primer.
    :type dna: coral.DNA
    :param tm: Ideal primer Tm in degrees C.
    :type tm: float
    :param min_len: Minimum primer length.
    :type min_len: int
    :param tm_undershoot: Allowed Tm undershoot.
    :type tm_undershoot: float
    :param tm_overshoot: Allowed Tm overshoot.
    :type tm_overshoot: float
    :param end_gc: Obey the 'end on G or C' rule.
    :type end_gc: bool
    :param tm_parameters: Melting temp calculator method to use.
    :type tm_parameters: string
    :param overhang: Append the primer to this overhang sequence.
    :type overhang: coral.DNA
    :param structure: Evaluate primer for structure, with warning for high
                      structure.
    :type structure: bool
    :returns: A primer.
    :rtype: coral.Primer
    :raises: ValueError if the input sequence is lower than the Tm settings
             allow.
             ValueError if no primer satisfies the settings (including the
             'end on G or C' rule, or a min_len that the sequence cannot
             accommodate).

    '''
    # Check Tm of input sequence to see if it's already too low
    seq_tm = coral.analysis.tm(dna, parameters=tm_parameters)
    if seq_tm < (tm - tm_undershoot):
        msg = 'Input sequence Tm is lower than primer Tm setting'
        raise ValueError(msg)

    # Focus on first 90 bases - shouldn't need more than 90bp to anneal
    dna = dna[0:90]

    # Generate primers from min_len to 'tm' + tm_overshoot
    # TODO: this is a good place for optimization. Only calculate as many
    # primers as are needed. Use binary search.
    primers_tms = []
    last_tm = 0
    bases = min_len
    # '<' rather than '!=': when min_len is not smaller than len(dna) the
    # length counter starts at or past the end, and an inequality test would
    # never stop it - the loop could then run without bound or return a
    # primer shorter than min_len. For min_len < len(dna) both tests give
    # the same candidates.
    while last_tm <= tm + tm_overshoot and bases < len(dna):
        next_primer = dna[0:bases]
        last_tm = coral.analysis.tm(next_primer, parameters=tm_parameters)
        primers_tms.append((next_primer, last_tm))
        bases += 1

    # Trim primer list based on tm_undershoot and end_gc
    candidates = [(candidate, melt) for candidate, melt in primers_tms
                  if melt >= tm - tm_undershoot]
    if end_gc:
        candidates = [pair for pair in candidates
                      if pair[0][-1] == coral.DNA('C') or
                      pair[0][-1] == coral.DNA('G')]
    if not candidates:
        raise ValueError('No primers could be generated using these settings')

    # Find the primer closest to the set Tm (the shortest one on ties, as
    # candidates are in order of increasing length), make it single stranded
    best_primer, best_tm = min(candidates,
                               key=lambda pair: abs(pair[1] - tm))
    best_primer = best_primer.top

    # Apply overhang
    if overhang:
        overhang = overhang.top

    output_primer = coral.Primer(best_primer, best_tm, overhang=overhang)

    def _structure(primer):
        '''Check annealing sequence for structure.

        :param primer: Primer for which to evaluate structure
        :type primer: sequence.Primer

        '''
        # Check whole primer for high-probability structure, focus in on
        # annealing sequence, report average
        nupack = coral.analysis.Nupack(primer.primer())
        pairs = nupack.pairs(0)
        anneal_len = len(primer.anneal)
        # Mean over the last anneal_len entries, i.e. the annealing region
        pairs_mean = sum(pairs[-anneal_len:]) / anneal_len
        if pairs_mean < 0.5:
            warnings.warn('High probability structure', Warning)
        return pairs_mean

    if structure:
        _structure(output_primer)

    return output_primer
def fetch_yeast_locus_sequence(locus_name, flanking_size=0):
    """Acquire a sequence from SGD http://www.yeastgenome.org.

    :param locus_name: Common name or systematic name for the locus (e.g.
                       ACT1 or YFL039C).
    :type locus_name: str
    :param flanking_size: The length of flanking DNA (on each side) to
                          return. 0 requests the locus sequence itself.
    :type flanking_size: int
    :returns: Sequence taken from the first row of the query result. For
              any other flanking_size (e.g. a negative one) a message is
              printed and an empty sequence is returned.
    :rtype: coral.DNA

    """
    yeastmine = Service("http://yeastmine.yeastgenome.org/yeastmine/service")
    # All queries are made on the Gene class (table)
    query = yeastmine.new_query("Gene")

    if flanking_size > 0:
        # Output columns:
        # secondaryIdentifier: the systematic name (e.g. YFL039C)
        # symbol: short name (e.g. ACT1)
        # length: sequence length
        # flankingRegions.direction: Upstream or downstream (or both)
        # flankingRegions.sequence.length: length of the flanking regions
        # flankingRegions.sequence.residues: sequence of the flanking regions
        flank_columns = ("secondaryIdentifier",
                         "symbol",
                         "length",
                         "flankingRegions.direction",
                         "flankingRegions.sequence.length",
                         "flankingRegions.sequence.residues")
        query.add_view(*flank_columns)

        query.add_constraint("flankingRegions.direction", "=", "both",
                             code="A")
        query.add_constraint("Gene", "LOOKUP", locus_name, "S. cerevisiae",
                             code="B")
        # The distance constraint is expressed in kb with one decimal
        distance = "{:.1f}kb".format(flanking_size / 1000.)
        query.add_constraint("flankingRegions.distance", "=", distance,
                             code="C")
        query.set_logic("A and B and C")

        # TODO: What to do when there's more than one result?
        # FIXME: report the row's metadata through the logging module
        first_row = next(query.rows())
        # TODO: add more metadata
        return coral.DNA(first_row["flankingRegions.sequence.residues"])

    if flanking_size == 0:
        locus_columns = ("primaryIdentifier",
                         "secondaryIdentifier",
                         "symbol",
                         "name",
                         "sgdAlias",
                         "organism.shortName",
                         "sequence.length",
                         "sequence.residues",
                         "description",
                         "qualifier")
        query.add_view(*locus_columns)

        query.add_constraint("status", "IS NULL", code="D")
        query.add_constraint("status", "=", "Active", code="C")
        query.add_constraint("qualifier", "IS NULL", code="B")
        query.add_constraint("qualifier", "!=", "Dubious", code="A")
        query.add_constraint("Gene", "LOOKUP", locus_name, "S. cerevisiae",
                             code="E")
        # Constraint codes are combined with this logic
        query.set_logic("(A or B) and (C or D) and E")

        first_row = next(query.rows())
        return coral.DNA(first_row["sequence.residues"])

    print("Problem with the flanking region size....")
    return coral.DNA("")