示例#1
0
    def test_EMBL_CCDS_RefSeq(self):
        """Integration test for the EMBL/RefSeq/CCDS fetch-and-parse pipeline.

        Formats queries for the listed accessions, fetches them (this
        appears to hit the remote databases — presumably requires network
        access; confirm before running offline), splits and parses the raw
        entries, and compares the parsed CodingSequence objects against
        hand-transcribed expectations, ignoring result order.
        """
        # Expected coding sequences (DNA + translated protein) per accession.
        exp = [
            CodingSequence(
                'CR456855', 'EMBL',
                Seq(
                    'ATGGAGGGTCAACGCTGGCTGCCGCTGGAGGCCAATCCCGAGGTCACCAACCAGTTTCTTAAACAATTAGGTCTACATCCTAACTGGCAATTCGTTGATGTATATGGAATGGATCCTGAACTCCTTAGCATGGTACCAAGACCAGTCTGTGCAGTCTTACTTCTCTTTCCTATTACAGAAAAGTATGAAGTATTCAGAACAGAAGAGGAAGAAAAAATAAAATCTCAGGGACAAGATGTTACATCATCAGTATATTTCATGAAGCAAACAATCAGCAATGCCTGTGGAACAATTGGACTGATTCATGCTATTGCAAACAATAAAGACAAGATGCACTTTGAATCTGGATCAACCTTGAAAAAATTCCTGGAGGAATCTGTGTCAATGAGCCCTGAAGAACGAGCCAGATACCTGGAGAACTATGATGCCATCCGAGTTACTCATGAGACCAGTGCCCATGAAGGTCAGACTGAGGCACCAAGTATAGATGAGAAAGTAGATCTTCATTTTATTGCATTAGTTCATGTAGATGGGCATCTCTATGAATTAGATGGGCGGAAGCCATTTCCAATTAACCATGGTGAAACTAGTGATGAAACTTTATTAGAGGATGCCATAGAAGTTTGCAAGAAGTTTATGGAGCGCGACCCTGATGAACTAAGATTTAATGCGATTGCTCTTTCTGCAGCTTAA',
                    IUPACUnambiguousDNA()),
                Seq(
                    'MEGQRWLPLEANPEVTNQFLKQLGLHPNWQFVDVYGMDPELLSMVPRPVCAVLLLFPITEKYEVFRTEEEEKIKSQGQDVTSSVYFMKQTISNACGTIGLIHAIANNKDKMHFESGSTLKKFLEESVSMSPEERARYLENYDAIRVTHETSAHEGQTEAPSIDEKVDLHFIALVHVDGHLYELDGRKPFPINHGETSDETLLEDAIEVCKKFMERDPDELRFNAIALSAA',
                    ExtendedIUPACProtein())),
            CodingSequence(
                'DQ917642', 'EMBL',
                Seq(
                    'ATGACGGGCAATGCCGGGGAGTGGTGCCTCATGGAAAGCGACCCCGGAGTCTTCACCGAGCTCATTAAAGGATTCGGTTGCCGAGGAGCCCAAGTAGAAGAAATATGGAGTTTAGAGCCTGAGAATTTTGAAAAATTAAAGCCAGTTCATGGGCTGATTTTTCTTTTCAAGTGGCAGCCCGGAGAAGAACCAGCAGGCTCTGTGGTTCAGGACTCCCGACTTGACACGATATTTTTTGCCAAGCAGGTAATTAATAATGCTTGTGCTACTCAAGCCATAGTAAGTGTGTTATTGAACTGTACCCATCAGGATGTCCATTTAGGAGAGACATTGTCAGAGTTTAAGGAATTCTCACAAAGTTTTGATGCAGCTATGAAAGGTTTGGCCCTGAGTAATTCGGATGTGATTCGCCAAGTACACAACAGTTTCGCCAGACAGCAAATGTTTGAATTTGATGCAAAGACATCAGCAAAAGAAGAAGATGCTTTTCACTTTGTCAGTTACGTTCCTGTGAATGGAAGACTGTACGAATTAGATGGATTAAGAGAAGGACCGATCGATTTAGGTGCATGCAATCAAGATGACTGGATCAGCGCAGTGAGGCCAGTCATAGAAAAAAGGATACAAAAGTACAGTGAAGGTGAAATTCGATTTAACTTAATGGCCATTGTGTCTGACAGGAAAATGATATATGAACAGAAGATAGCAGAGTTACAAAGACAGCTTGCTGAGGAGGAACCCATGGATACAGATCAGGGTAGTAACATGTTAAGTGCTATTCAGTCAGAAGTTGCCAAAAATCAGATGCTTATTGAAGAAGAAGTACAGAAATTAAAAAGATATAAGATTGAAAACATCAGAAGGAAGCATAATTATCTGCCTTTCATTATGGAACTGTTAAAGACTTTAGCAGAACACCAGCAGTTAATACCTCTCGTAGAAAAGGCAAAAGAAAAACAGAATGCGAAGAAGGCACAGGAAACCAAATGA',
                    IUPACUnambiguousDNA()),
                Seq(
                    'MTGNAGEWCLMESDPGVFTELIKGFGCRGAQVEEIWSLEPENFEKLKPVHGLIFLFKWQPGEEPAGSVVQDSRLDTIFFAKQVINNACATQAIVSVLLNCTHQDVHLGETLSEFKEFSQSFDAAMKGLALSNSDVIRQVHNSFARQQMFEFDAKTSAKEEDAFHFVSYVPVNGRLYELDGLREGPIDLGACNQDDWISAVRPVIEKRIQKYSEGEIRFNLMAIVSDRKMIYEQKIAELQRQLAEEEPMDTDQGSNMLSAIQSEVAKNQMLIEEEVQKLKRYKIENIRRKHNYLPFIMELLKTLAEHQQLIPLVEKAKEKQNAKKAQETK',
                    ExtendedIUPACProtein())),
            CodingSequence(
                'NM_001270952', 'RefSeq',
                Seq(
                    'ATGGATCCTGAACTCCTTAGCATGGTACCAAGACCAGTCTGTGCAGTCTTACTTCTCTTTCCTATTACAGAAAAGTATGAAGTATTCAGAACAGAAGAGGAAGAAAAAATAAAATCTCAGGGACAAGATGTTACATCATCAGTATATTTCATGAAGCAAACAATCAGCAATGCCTGTGGAACAATTGGACTGATTCATGCTATTGCAAACAATAAAGACAAGATGCACTTTGAATCTGGATCAACCTTGAAAAAATTCCTGGAGGAATCTGTGTCAATGAGCCCTGAAGAACGAGCCAGATACCTGGAGAACTATGATGCCATCCGAGTTACTCATGAGACCAGTGCCCATGAAGGTCAGACTGAGGCACCAAGTATAGATGAGAAAGTAGATCTTCATTTTATTGCATTAGTTCATGTAGATGGGCATCTCTATGAATTAGATGGGCGGAAGCCATTTCCAATTAACCATGGTGAAACTAGTGATGAAACTTTATTAGAGGATGCCATAGAAGTTTGCAAGAAGTTTATGGAGCGCGACCCTGATGAACTAAGATTTAATGCGATTGCTCTTTCTGCAGCATAG',
                    IUPACUnambiguousDNA()),
                Seq(
                    'MDPELLSMVPRPVCAVLLLFPITEKYEVFRTEEEEKIKSQGQDVTSSVYFMKQTISNACGTIGLIHAIANNKDKMHFESGSTLKKFLEESVSMSPEERARYLENYDAIRVTHETSAHEGQTEAPSIDEKVDLHFIALVHVDGHLYELDGRKPFPINHGETSDETLLEDAIEVCKKFMERDPDELRFNAIALSAA',
                    ExtendedIUPACProtein())),
            CodingSequence(
                'CCDS73586.1', 'CCDS',
                Seq('ATGGATCCTGAACTCCTTAGCATGGTACCAAGACCAGTCTGTGCAGTCTTACTTCTCTTTCCTATTACAGAAAAGTATGAAGTATTCAGAACAGAAGAGGAAGAAAAAATAAAATCTCAGGGACAAGATGTTACATCATCAGTATATTTCATGAAGCAAACAATCAGCAATGCCTGTGGAACAATTGGACTGATTCATGCTATTGCAAACAATAAAGACAAGATGCACTTTGAATCTGGATCAACCTTGAAAAAATTCCTGGAGGAATCTGTGTCAATGAGCCCTGAAGAACGAGCCAGATACCTGGAGAACTATGATGCCATCCGAGTTACTCATGAGACCAGTGCCCATGAAGGTCAGACTGAGGCACCAAGTATAGATGAGAAAGTAGATCTTCATTTTATTGCATTAGTTCATGTAGATGGGCATCTCTATGAATTAGATGGGCGGAAGCCATTTCCAATTAACCATGGTGAAACTAGTGATGAAACTTTATTAGAGGATGCCATAGAAGTTTGCAAGAAGTTTATGGAGCGCGACCCTGATGAACTAAGATTTAATGCGATTGCTCTTTCTGCAGCATAG'
                    ),
                Seq('MDPELLSMVPRPVCAVLLLFPITEKYEVFRTEEEEKIKSQGQDVTSSVYFMKQTISNACGTIGLIHAIANNKDKMHFESGSTLKKFLEESVSMSPEERARYLENYDAIRVTHETSAHEGQTEAPSIDEKVDLHFIALVHVDGHLYELDGRKPFPINHGETSDETLLEDAIEVCKKFMERDPDELRFNAIALSAA'
                    )),
            CodingSequence(
                'CCDS86041.1', 'CCDS',
                Seq('ATGACGGGCAATGCCGGGGAGTGGTGCCTCATGGAAAGCGACCCCGGGGTCTTCACCGAGCTCATTAAAGGATTCGGTTGCCGAGGAGCCCAAGTAGAAGAAATATGGAGTTTAGAGCCTGAGAATTTTGAAAAATTAAAGCCAGTTCATGGGTTAATTTTTCTTTTCAAGTGGCAGCCAGGAGAAGAACCAGCAGGCTCTGTGGTTCAGGACTCCCGACTTGACACGATATTTTTTGCTAAGCAGGTAATTAATAATGCTTGTGCTACTCAAGCCATAGTGAGTGTGTTACTGAACTGTACCCACCAGGATGTCCATTTAGGCGAGACATTATCAGAGTTTAAAGAATTTTCACAAAGTTTTGATGCAGCTATGAAAGGCTTGGCACTGAGCAATTCAGATGTGATTCGACAAGTACACAACAGTTTCGCCAGACAGCAAATGTTTGAATTTGATACGAAGACATCAGCAAAAGAAGAAGATGCTTTTCACTTTGTCAGTTATGTTCCTGTTAATGGGAGACTGTATGAATTAGATGGATTAAGAGAAGGACCGATTGATTTAGGTGCATGCAATCAAGATGATTGGATCAGTGCAGTAAGGCCTGTCATAGAAAAAAGGATACAAAAAGACGGGTTTTCACCATGTTGCCCAGGCTGGTCTCAGACTCCTGAGCTCAAGCCATCCGCCTGCCTCGACCTCCCAAAGTGGTACAGTGAAGGTGAAATTCGATTTAATTTAATGGCCATTGTGTCTGACAGAAAAATGATATATGAGCAGAAGATAGCAGAGTTACAAAGACAACTTGCAGAGGAACCCATGGATACAGATCAAGGTAATAGTATGTTAAGTGCTATTCAGTCAGAAGTTGCCAAAAATCAGATGCTTATTGAAGAAGAAGTACAGAAATTAAAAAGATACAAGATTGAGAATATCAGAAGGAAGCATAATTATCTGCCTTTCATTATGGAATTGTTAAAGACTTTAGCAGAACACCAGCAGTTAATACCACTAGTAGAAAAGGCAAAAGAAAAACAGAACGCAAAGAAAGCTCAGGAAACCAAATGA'
                    ),
                Seq('MTGNAGEWCLMESDPGVFTELIKGFGCRGAQVEEIWSLEPENFEKLKPVHGLIFLFKWQPGEEPAGSVVQDSRLDTIFFAKQVINNACATQAIVSVLLNCTHQDVHLGETLSEFKEFSQSFDAAMKGLALSNSDVIRQVHNSFARQQMFEFDTKTSAKEEDAFHFVSYVPVNGRLYELDGLREGPIDLGACNQDDWISAVRPVIEKRIQKDGFSPCCPGWSQTPELKPSACLDLPKWYSEGEIRFNLMAIVSDRKMIYEQKIAELQRQLAEEPMDTDQGNSMLSAIQSEVAKNQMLIEEEVQKLKRYKIENIRRKHNYLPFIMELLKTLAEHQQLIPLVEKAKEKQNAKKAQETK'
                    ))
        ]

        # Accessions to fetch, grouped by source database.
        ids = {
            'EMBL': ['CR456855.1', 'DQ917642.1'],
            'RefSeq': ['NM_001270952.1'],
            'CCDS': ['CCDS73586.1', 'CCDS86041.1']
        }
        formatter = dba.UrlFormatter()
        queries = []
        for database, id_list in ids.items():
            queries += formatter.format(database, id_list)
        # Fetch all queries on one event loop, then split the raw responses
        # into individual entries before parsing.
        loop = asyncio.get_event_loop()
        fetcher = dba.Entry_fetcher()
        entries = loop.run_until_complete(fetcher.fetch_all(queries))
        splitter = dba.EntrySplitter()
        entries = splitter.split(entries)
        loop.close()
        parser = dba.DnaParser()
        res = parser.parse(entries)
        # Result order is not guaranteed: check membership both ways via
        # per-item containment plus an equal-length check.
        for item in res:
            self.assertTrue(item in exp)
        self.assertEqual(len(exp), len(res))
示例#2
0
def REsearch(goi='', goiFile='', mcs='', mcsFile=''):
    """Find restriction enzymes that cut the MCS but not the GOI.

    Args:
        goi: gene-of-interest sequence as a string (takes precedence).
        goiFile: path to a file holding the GOI sequence (used if goi is empty).
        mcs: multiple-cloning-site sequence as a string (takes precedence).
        mcsFile: path to a file holding the MCS sequence (used if mcs is empty).

    Returns:
        A list of tuples ``(enzyme_name, cut_site, cut_description, suppliers)``
        — one per cut site in the MCS — sorted by cut-site position.
        ``cut_description`` is ``"blunt"`` for blunt cutters, otherwise the
        enzyme's elucidated cut pattern.

    Raises:
        Exception: if no GOI or no MCS sequence could be obtained.
    """
    # Restrict the search to enzymes from these commercial suppliers.
    rb = RestrictionBatch(suppliers=[
        'C', 'B', 'E', 'I', 'K', 'J', 'M', 'O', 'N', 'Q', 'S', 'R', 'V', 'Y',
        'X'
    ])

    goi = Seq(goi, IUPACUnambiguousDNA()) if goi else read_seq(goiFile)
    if not goi:
        raise Exception('Please provide a GOI sequence!')
    mcs = Seq(mcs, IUPACUnambiguousDNA()) if mcs else read_seq(mcsFile)
    if not mcs:
        raise Exception('Please provide a MCS sequence!')

    result_mcs = rb.search(mcs)
    result_goi = rb.search(goi)
    # Enzymes with at least one site in the MCS, minus those that also
    # cut anywhere inside the GOI.
    cuts_mcs = {enzyme for enzyme, sites in result_mcs.items() if sites}
    cuts_goi = {enzyme for enzyme, sites in result_goi.items() if sites}
    REs = cuts_mcs - cuts_goi

    rows = []
    for enzyme in REs:
        for site in result_mcs[enzyme]:
            rows.append((str(enzyme), site,
                         "blunt" if enzyme.is_blunt() else enzyme.elucidate(),
                         ' '.join(enzyme.suppl)))

    rows.sort(key=lambda row: row[1])

    return rows
示例#3
0
    def pcr(
        cls,
        seq: Union[str, Seq, SeqRecord],
        fwd_padding: str = "",
        rev_padding: str = "",
    ) -> "Primers":
        """Create Primers to amplify a sequence-like object.

        Args:
            seq: the sequence-like object to amplify via primers

        Keyword Args:
            fwd_padding: Additional bp that are added to
                the 5' end of the FWD primer. Example use case:
                a restriction enzyme
            rev_padding: Additional bp that are added to
                the 5' end of the REV primer. Keep in mind that
                these are added to the 5' end of the rev primer,
                so they're the reverse complement bp of the
                template sequence

        Returns:
            A Primers object to amplify the SeqRecord
        """

        # Normalize str/Seq/SeqRecord input down to a plain template sequence.
        template = _get_seq(seq)

        fwd, rev = primers(template, add_fwd=fwd_padding, add_rev=rev_padding)

        # Wrap each primer's sequence as an unambiguous-DNA Seq alongside
        # its melting temperature.
        return Primers(
            Seq(fwd.seq, alphabet=IUPACUnambiguousDNA()),
            fwd.tm,
            Seq(rev.seq, alphabet=IUPACUnambiguousDNA()),
            rev.tm,
        )
示例#4
0
        def primer(side: str) -> Tuple[Seq, float]:
            """Return the (sequence, tm) pair for one primer end of p3_output."""

            assert side in ("LEFT", "RIGHT")
            key_prefix = f"PRIMER_{side}_0"
            primer_seq = p3_output[key_prefix + "_SEQUENCE"]
            primer_tm = float(p3_output[key_prefix + "_TM"])
            return Seq(primer_seq, alphabet=IUPACUnambiguousDNA()), primer_tm
示例#5
0
def gibson(records: List[SeqRecord],
           hifi: bool = False) -> Tuple[SeqRecord, List[Primers]]:
    """Create primers for records for a single Gibson Assembly.

    Create primers to mutate the records' sequences (after PCR) so they
    anneal to their neighboring records

    Args:
        records: list of records to assemble

    Keyword Args:
        hifi: whether to use HiFi DNA assembly

    Returns:
        1. assembled plasmid (SeqRecord)
        2. list of primer pairs (same length as records)
    """

    assert records

    # Uppercase everything up front; the plasmid starts as the first record.
    records = [r.upper() for r in records]
    plasmid = records[0].upper()
    primers: List[Primers] = [Primers.pcr(records[0])]

    # Walk adjacent pairs, wrapping around so the last record is compared
    # against the first (circular assembly).
    for i, f1 in enumerate(records):
        j = (i + 1) % len(records)
        f2 = records[j]

        # A primer pair for records[0] was created above; create one for
        # every other record as we first encounter it.
        if j != 0:
            primers.append(Primers.pcr(f2))

        # if hifi is false, mismatches is 0
        homology, homology_length, mismatch_lengths = _record_homology(
            str(f1.seq), str(f2.seq), hifi)

        # remove mismatches up to 10 bp if doing HiFi DNA assembly
        if hifi and mismatch_lengths:
            f1_mm_length, f2_mm_length = mismatch_lengths
            f1.seq = f1.seq[:len(f1.seq) - MAX_HOMOLOGY + f1_mm_length - 1]
            f2.seq = f2.seq[f2_mm_length:]

        if homology:  # homology already exists between records.
            # Append only the non-overlapping tail of the neighbor.
            plasmid += f2[homology_length:].upper()
            plasmid.id += f"|{f2.id}"
        else:
            # homology does not exist between records, introduce it to primers
            plasmid += f2.upper()
            plasmid.id += f"|{f2.id}"
            _mutate_primers(primers[i], primers[j], MIN_HOMOLOGY // 2)

    # NOTE(review): the incremental "|"-joined id built in the loop above is
    # fully overwritten here — confirm the loop-time id is intentional.
    plasmid.id = "+".join(r.id for r in records if r.id != "<unknown id>")
    plasmid.seq = Seq(str(plasmid.seq.upper()), alphabet=IUPACUnambiguousDNA())

    # extend primers in 5' direction to avoid duplicate junctions
    _fix_duplicate_junctions(records, primers)

    return plasmid, primers
 def parse(self, database, xml_soup):
     '''
     Parse a BeautifulSoup(xml) GenBank entry into a CodingSequence.

     Input: BeautifulSoup(xml) of a GenBank entry
     Output: CodingSequence(entry_id, database, DNA sequence, protein sequence)
     Raises NotAnORF if the entry is not mRNA/cDNA or its DNA does not
     contain a usable coding sequence (CDS).
     Raises SequenceNotFoundError if the entry carries no annotation at all.
     '''
     # Only mRNA / cDNA entries can hold a clean single ORF.
     moltype = xml_soup.find_all('GBSeq_moltype')[0].text.strip()
     if moltype not in ('mRNA', 'cDNA'):
         raise NotAnORF
     id_ = xml_soup.find_all('GBSeq_locus')[0].text.strip()
     features = xml_soup.find_all('GBFeature')
     if not features:  # entirely unannotated entry - ultra rare
         raise SequenceNotFoundError
     found = False
     for f in features:
         if f.GBFeature_key.text.strip() == 'CDS':
             loc = f.find_all('GBFeature_location')[0].text
             # Sometimes the location reads join(start..pos2,pos3..stop);
             # if pos2 != pos3, or more intervals are indicated, we reject
             # the entry.
             if loc.startswith('join('):
                 positions = []
                 for interval in loc[5:-1].split('..'):
                     positions.extend(interval.split(','))  # pos2,pos3
                 # Deduplicate while PRESERVING order — a plain set() has
                 # arbitrary iteration order and would scramble start/stop.
                 positions = list(dict.fromkeys(positions))
                 if len(positions) != 3:  # expect [start, pos, stop]
                     raise NotAnORF
                 start, stop = positions[0], positions[2]
             # mostly feature location is simply encoded as start..stop
             else:
                 start, stop = loc.split('..')
             if '<' in start or '>' in stop:  # start or stop codon not known
                 raise NotAnORF
             start, stop = int(start), int(stop)
             found = True
             break
     if not found:  # not sure this ever happens
         raise SequenceNotFoundError
     dna_seq = xml_soup.GBSeq_sequence.text.strip().upper()
     orf = dna_seq[start-1:stop]
     # Explicit ORF sanity checks (assert statements would be stripped
     # under `python -O`): start codon, stop codon, whole codons only.
     if (not orf.startswith('ATG')
             or not orf.endswith(('TAA', 'TGA', 'TAG'))
             or len(orf) % 3 != 0):
         raise NotAnORF
     cds = Seq(orf, IUPACUnambiguousDNA())
     return CodingSequence(id_, database, cds, cds.translate(cds=True))
示例#7
0
def hustle_bustle(seq):
    """Shuffle everything from the first in-frame stop codon onward.

    Translates the joined base list; if a stop ('*') appears in the
    peptide, the bases before the stop codon are kept as-is and a
    truffle_shuffle'd copy of the remainder (stop codon included) is
    appended. If no stop is found, the input is returned unchanged.
    """
    peptide = str(
        Seq("".join(seq), alphabet=IUPACUnambiguousDNA()).translate())
    if "*" not in peptide:
        return (seq)
    # DNA offset of the first stop codon (3 bases per residue).
    cut = peptide.index("*") * 3
    kept = seq[:cut]
    kept.extend(truffle_shuffle(seq[cut:]))
    return (kept)
 def parse(self, html_soup):
     '''
     Parse a CCDS report page into a CodingSequence.

     The page title looks like "Report for CCDS[id].[version] (current version)";
     both ".[version]" and " (current version)" are optional.
     '''
     # Both the version and the "(current version)" suffix are optional,
     # hence the trailing '?' on each group (the comment above always said
     # so; the old pattern made the version mandatory).
     titlematcher = re.compile(r'Report for CCDS[0-9]*(?:\.[0-9]*)?(?:\ \(current version\))?')
     id_ = html_soup.find_all(string=titlematcher)[0]  # find() does not take kwargs
     # '\.' — an unescaped '.' would match ANY character between id and version.
     idmatcher = r'CCDS[0-9]*(?:\.[0-9]*)?'
     id_ = re.search(idmatcher, id_).group(0)
     # Nucleotides and amino acids are rendered in <span id="nNN">/<span id="pNN">.
     nucleotides = html_soup.find_all('span', {'id': re.compile('n[0-9]+')})
     aminoacids = html_soup.find_all('span', {'id': re.compile('p[0-9]+')})
     dna_seq = Seq(''.join([nt.text for nt in nucleotides]),
                   IUPACUnambiguousDNA())
     aa_seq = Seq(''.join([aa.text for aa in aminoacids]), ExtendedIUPACProtein())
     # Sanity check: the scraped protein must equal the CDS translation.
     assert aa_seq == dna_seq.translate(cds=True)
     return CodingSequence(id_, 'CCDS', dna_seq, aa_seq)
 def parse_non_eukaryotes(self, database, xml):
     '''
     Parse a non-eukaryotic GenBank XML entry into CodingSequence objects.

     Input: raw XML string of a GenBank entry.
     Output: list of CodingSequence (one per translatable CDS feature).
     Raises SequenceNotFoundError if the entry has no features or no
     usable sequence; NotAnORF if no CDS translates cleanly.
     '''
     xml_soup = BeautifulSoup(xml, 'xml')
     id_ = str(xml_soup.find_all('GBSeq_primary-accession')[0].text).strip()
     features = xml_soup.find_all('GBFeature')
     if not features:  # entirely unannotated entry - ultra rare
         raise SequenceNotFoundError
     found = False
     #unlike eukaryotes, there are usually multiple CDS per entry, and they
     #might be on complementary strands...
     coding_sequences = []
     for f in features: 
         if f.GBFeature_key.text.strip() == 'CDS':
             # Re-read the full entry sequence for every CDS: the minus-strand
             # branch below reverse-complements dna_seq in place.
             try:
                 dna_seq = Seq(xml_soup.GBSeq_sequence.text.strip().upper(),
                               IUPACUnambiguousDNA()) #don't move out of loop...
             except AttributeError: #entry does not actually have a normal sequence (e.g. HOPD_ECOLX)
                 raise SequenceNotFoundError 
             loc = f.find_all('GBFeature_location')[0].text
             start, stop = loc.split('..')
             if '<' in start or '>' in stop:  # start or stop codon not known
                 continue
             try:
                 if not 'complement('.upper() in start.upper(): #cds on sense strand
                     start, stop = int(start), int(stop)
                 elif 'complement('.upper() in start.upper(): #cds on other strand
                     #complement([start]..[stop])
                     start = int(start.split('(')[-1])
                     stop = int(stop.replace(')', '')) 
                     #reverse complement dna and remap coordinates onto the
                     #reverse-complemented sequence (1-based, hence the +1)
                     dna_seq = dna_seq.reverse_complement()
                     temp = start
                     start = len(dna_seq) - stop +1
                     stop = len(dna_seq) - temp +1
             except ValueError: #some other abstruse way of indicating starts and stops
                 continue 
             orf = dna_seq[start-1:stop]
             try: 
                 protein_seq = orf.translate(table=11, cds=True) #note that we use bacterial codon table
                 coding_sequences.append(CodingSequence(id_, database, orf, protein_seq))
                 found = True
             except TranslationError: 
                 continue #not a good CDS
     if not found:  # not sure this ever happens
         raise NotAnORF
     return coding_sequences
示例#10
0
def do_it_all(motif_file,
              copy_rule1: int = 10,
              copy_rule2: int = 12,
              how_many_Ns: int = 1,
              nresults: int = 1) -> list:
    """
    :param motif_file: a file in a format that i need to come up with
    :param copy_rule1: pick this many random variants
    :param copy_rule2: make each possible variant in this many copies
    :param how_many_Ns: How many N's between motifs?
    :param nresults: number of motif assemblies to output
    return: actual results (a list of SeqRecord cassettes, one per shuffle)
    """

    # generate de-ambigulated motifs in the right copy numbers
    motifs = generate_parts_for_cassette(motif_file, copy_rule1, copy_rule2)

    # shuffle motif positions in the cassette
    # NOTE(review): this loop never terminates if fewer than nresults
    # distinct shuffles exist — confirm callers guarantee enough motifs.
    motif_set = set()
    while len(motif_set) != nresults:
        motif_set.add(tuple(shuffle_motifs(motifs)))

    cassette_strs = []
    # NOTE(review): iterating a set, so cassette numbering/order is not
    # deterministic across runs — confirm this is acceptable downstream.
    for i, x in enumerate(motif_set):
        # link with N's — start from an empty annotated SeqRecord and
        # append linker + motif for every (motif, de_motif) pair.
        cassette_str = SeqRecord(Seq("", IUPACUnambiguousDNA()),
                                 id=f"id_cassette_{i+1}",
                                 name=f"name_cassette_{i+1}",
                                 description=f"metmap generated cassette",
                                 annotations={'date': "08-MAR-1983"})
        current_pos = 0
        for (motif, de_motif) in x:
            cassette_str += deambigulate_random("N" * how_many_Ns)
            current_pos += how_many_Ns
            cassette_str += de_motif
            # Record where the de-ambiguated motif sits, keeping the
            # original (ambiguous) motif string in the feature note.
            cassette_str.features.append(
                SeqFeature(FeatureLocation(current_pos,
                                           current_pos + len(de_motif)),
                           type='misc_binding',
                           qualifiers={'note': motif}))
            current_pos += len(de_motif)
        cassette_strs.append(cassette_str)
    return cassette_strs
示例#11
0
def _get_seq_from_surrounding(record, start: int, end: int) -> str:
    """Fetch the [start, end) region surrounding *record* in transcript orientation.

    Reads the reference location from record.annotations. For minus-strand
    transcripts the region is measured back from 'reference_right_index'
    and the fetched sequence is reverse-complemented; otherwise it is
    measured forward from 'reference_left_index'.

    Args:
        record: object whose .annotations hold 'reference_species',
            'reference_chromosome_number', 'transcript_strand' and the
            left/right reference indices.
        start: region start, relative to the transcript (exclusive-range
            convention — see inline notes).
        end: region end, relative to the transcript.

    Returns:
        The fetched sequence as a plain str in transcript orientation.
    """
    species = record.annotations['reference_species']
    chr_num = record.annotations['reference_chromosome_number']
    transcript_strand = record.annotations['transcript_strand']
    if transcript_strand == -1:
        seq_end = record.annotations['reference_right_index']
        seq = _fetch_seq(
            species,
            chr_num,
            seq_end - end + 1,
            # Note: ensembl is inclusive range and biopython is exclusive
            seq_end - start)
        # str(...) so both branches return the same type: the annotation
        # promises str, but Seq.reverse_complement() returns a Seq.
        seq = str(Seq(seq, IUPACUnambiguousDNA()).reverse_complement())
    else:
        seq_start = record.annotations['reference_left_index']
        seq = _fetch_seq(
            species,
            chr_num,
            seq_start + start,
            # Note: ensembl is inclusive range and biopython is exclusive
            seq_start + end - 1)
    return seq
示例#12
0
class DNA(Polynucleotide):
    """A DNA polynucleotide with accessors for DNA, RNA, and protein forms."""

    # Alphabets used for sequence validation: 'strict' allows only ACGT,
    # 'permissive' also accepts IUPAC ambiguity codes.
    alphabet_dict = {'strict': IUPACUnambiguousDNA(), 'permissive': IUPACAmbiguousDNA()}

    @staticmethod
    def _render(x, as_string):
        # Shared return convention for the get_* accessors below:
        # stringify the sequence only when the caller asked for it.
        return str(x) if as_string else x

    def get_dna(self, *args, **kwargs):
        """Return the DNA sequence (as str if as_string=True is passed)."""
        as_string = kwargs.pop('as_string', False)
        return self._render(self.convert_sequence(*args, **kwargs), as_string)

    def get_rna(self, *args, **kwargs):
        """Return the transcribed RNA sequence (as str if as_string=True)."""
        as_string = kwargs.pop('as_string', False)
        return self._render(
            self.convert_sequence(*args, **kwargs).transcribe(), as_string)

    def get_protein(self, *args, **kwargs):
        """Return the translated protein sequence (as str if as_string=True)."""
        as_string = kwargs.pop('as_string', False)
        return self._render(self.translate(*args, **kwargs), as_string)
示例#13
0
#!python3
import pandas as pd
import numpy as np
import os
from Bio import AlignIO
from Bio.Alphabet.IUPAC import IUPACUnambiguousDNA
from Bio.Data.CodonTable import TranslationError
from ete3 import Tree

f = os.path.abspath('..') + "/DataEmpirical/Cetacea"

phy = AlignIO.read("{0}/datadryad/DATASET_B.phylip".format(f), format="phylip-relaxed", alphabet=IUPACUnambiguousDNA())
print("{0} taxa.".format(len(phy)))
taxa = Tree("{0}/rootedtree.nhx".format(f), format=1).get_leaf_names()

precision_dict = {}
coverage_dict = {}
with open("{0}/datadryad/Cetacea_gene_partition.txt".format(f), "r") as gene_partition:
    for line in gene_partition:
        name, pos = line.replace("DNA,", "").replace(" ", "").split("=")
        down, up = pos.split("-")
        down, up = int(down), int(up)
        diff = 1 + up - down
        if diff % 3 != 0:
            continue

        sequences = phy[:, down - 1:up]
        output = phy[:, :0]
        filtered = [rec for rec in sequences if rec.id in taxa]
        for pos in range(0, int(diff / 3)):
            keep_site = True
 def __init__(self, promoter_sequences_filename=None):
     '''
     Load promoter sequences from a FASTA file and re-key them by SGD ID.

     :param promoter_sequences_filename: path to a FASTA file; when None,
         Data/promoter_sequences.fasta is used.
     '''
     # Default the filename once, then run a single load path — the two
     # branches previously duplicated the parse + re-key logic verbatim.
     if promoter_sequences_filename is None:
         promoter_sequences_filename = path.join('Data', "promoter_sequences.fasta")
     self.promoter_sequences = SeqIO.to_dict(
         SeqIO.parse(promoter_sequences_filename, "fasta",
                     alphabet=IUPACUnambiguousDNA()))
     self.promoter_sequences = cks.change_keys_SGDID(self.promoter_sequences)
示例#15
0
    'T': 'A',
    'C': 'G',
    'G': 'C',
    'X': 'X',
    'N': 'N',
    'a': 't',
    't': 'a',
    'c': 'g',
    'g': 'c',
    'x': 'x',
    'n': 'n',
    '-': '-'
}

# Shortcuts to the sorted lists of unambiguous / ambiguous DNA bases.
# (sorted() iterates the letters string directly; wrapping it in list()
# first was redundant.)
bases = sorted(IUPACUnambiguousDNA().letters)
ambiguous_bases = sorted(IUPACAmbiguousDNA().letters)


def base_complement(k):
    """ Return complement of base.

    Performs the subsitutions: A<=>T, C<=>G, X=>X for both upper and lower
    case. The return value is identical to the argument for all other values.

    :param k: A base.
    :returns: Complement of base.
    :rtype: str

    """
    try:
示例#16
0
def alignment_summary(inFastaFileOne,
                      inFastaFileTwo,
                      outfileName=None,
                      printCounts=False):
    """ Write or print pairwise alignment summary information for sequences in two FASTA
        files, including SNPs, ambiguous bases, and indels.

        Args:
            inFastaFileOne: first FASTA file.
            inFastaFileTwo: second FASTA file; aligned per chromosome against
                the first with MUSCLE.
            outfileName: optional output path; totals are written as a
                tab-separated header row plus one value row.
            printCounts: if True, print per-chromosome and total counts.
    """
    gap = '-'
    ambiguous = 'N'
    # Hoisted out of the per-column loop: the unambiguous DNA alphabet is a
    # constant, so there is no need to re-instantiate IUPACUnambiguousDNA()
    # for every alignment column.
    unambig_letters = IUPACUnambiguousDNA().letters
    aligner = tools.muscle.MuscleTool()

    per_chr_fastas = interhost.transposeChromosomeFiles(
        [inFastaFileOne, inFastaFileTwo])

    # Running totals across all segments/chromosomes.
    results = OrderedDict()
    results["same_unambig"] = 0
    results["snp_unambig"] = 0
    results["indel_unambig"] = 0
    results["indel_ambig"] = 0
    results["ambig_one"] = 0
    results["ambig_two"] = 0
    results["ambig_both"] = 0
    results["unambig_both"] = 0

    for chr_fasta in per_chr_fastas:
        same_unambig = 0
        snp_unambig = 0
        indel_unambig = 0
        indel_ambig = 0
        ambig_one = 0
        ambig_two = 0
        ambig_both = 0
        unambig_both = 0

        alignOutFileName = util.file.mkstempfname('.fasta')
        aligner.execute(chr_fasta, alignOutFileName, fmt="clw")

        with open(alignOutFileName, "r") as f:
            alignment = Bio.AlignIO.read(f, "clustal")

            # Classify each aligned column by what the two sequences show.
            for col_idx in range(0, alignment.get_alignment_length()):
                col = alignment[:, col_idx]
                c1 = col[0]
                c2 = col[1]

                if (c1 in ambiguous and c2 in ambiguous):
                    ambig_both += 1
                elif c1 in ambiguous:
                    ambig_one += 1
                elif c2 in ambiguous:
                    ambig_two += 1

                if (c1 in unambig_letters
                        and c2 in unambig_letters):
                    unambig_both += 1
                    if c1 == c2:
                        same_unambig += 1
                    else:
                        snp_unambig += 1

                if ((c1 == gap and c2 in unambig_letters) or
                    (c2 == gap and c1 in unambig_letters)):
                    indel_unambig += 1

                if ((c1 == gap and c2 in ambiguous)
                        or (c2 == gap and c1 in ambiguous)):
                    indel_ambig += 1

        if printCounts:
            print("Counts for this segment/chromosome:")
            print("same_unambig ", same_unambig)
            print("snp_unambig  ", snp_unambig)
            print("indel_unambig", indel_unambig)
            print("indel_ambig  ", indel_ambig)
            print("ambig_one    ", ambig_one)
            print("ambig_two    ", ambig_two)
            print("ambig_both   ", ambig_both)
            print("unambig_both ", unambig_both)

        # Fold this segment's counts into the whole-sample totals.
        results["same_unambig"] += same_unambig
        results["snp_unambig"] += snp_unambig
        results["indel_unambig"] += indel_unambig
        results["indel_ambig"] += indel_ambig
        results["ambig_one"] += ambig_one
        results["ambig_two"] += ambig_two
        results["ambig_both"] += ambig_both
        results["unambig_both"] += unambig_both

    if printCounts:
        print("\nCounts for this sample:")
        print("same_unambig ", results["same_unambig"])
        print("snp_unambig  ", results["snp_unambig"])
        print("indel_unambig", results["indel_unambig"])
        print("indel_ambig  ", results["indel_ambig"])
        print("ambig_one    ", results["ambig_one"])
        print("ambig_two    ", results["ambig_two"])
        print("ambig_both   ", results["ambig_both"])
        print("unambig_both ", results["unambig_both"])

    if outfileName:
        with open(outfileName, "wt") as of:
            csvout = csv.writer(of, delimiter='\t')
            csvout.writerow(list(results.keys()))
            csvout.writerow(list(results.values()))
示例#17
0
def clone_combinatorial(
    record_set: List[SeqRecord],
    enzymes: List[RestrictionType],
    include: List[str] = None,
    min_count: int = -1,
    linear: bool = True,
) -> List[Tuple[List[SeqRecord], List[SeqRecord]]]:
    """Parse a single list of SeqRecords to find all circularizable plasmids.

    Turn each SeqRecord's post-digest seqs into a graph where the nodes are
    the overhangs and the edges are the linear fragments
    post-digest/catalyzing with BsaI/BpiI.

    Args:
        record_set: single record set that might circularize
        enzymes: list of enzymes to digest the input records with

    Keyword Args:
        include: the include to filter assemblies
        min_count: mininum number of SeqRecords for an assembly to be considered
        linear: Whether the individual SeqRecords are assumed to be linear

    Returns:
        A list of tuples with:
            1. plasmids that will form
            2. SeqRecords that went into each formed plasmid
    """

    graph = nx.MultiDiGraph()

    seen_seqs: Set[str] = set(
    )  # stored list of input seqs (not new combinations)
    for record in record_set:
        # Doubled sequence (and its reverse complement) so re-ligations of a
        # circular input are recognized regardless of rotation.
        seen_seqs.add(str(record.seq + record.seq).upper())
        seen_seqs.add(
            str((record.seq + record.seq).reverse_complement().upper()))

        for left, frag, right in _catalyze(record, enzymes, linear):
            graph.add_node(left)
            graph.add_node(right)
            graph.add_edge(left, right, frag=frag)

    try:  # find all circularizable cycles
        # NOTE(review): simple_cycles returns a lazy generator, so this
        # guard likely never fires here — kept for safety.
        cycles = simple_cycles(graph)
    except NetworkXNoCycle:
        return []

    # get the fragments, enzymes back out of the cycle
    ids_to_fragments: Dict[str, List[SeqRecord]] = defaultdict(list)
    ids_to_plasmids: Dict[str, List[SeqRecord]] = defaultdict(list)
    for cycle in cycles:
        # filter for the minimum number of SeqRecords
        if min_count > 0 and len(cycle) < min_count:
            continue

        combinations = CombinatorialBins()
        for i, overhang in enumerate(cycle):
            next_overhang = cycle[(i + 1) % len(cycle)]
            record_bin = []
            # Only walk edges leaving this overhang (passing the node avoids
            # scanning the whole edge set for every cycle position).
            for out_edge in graph.out_edges(overhang, keys=True):
                src, dest, index = out_edge
                if dest != next_overhang:
                    continue
                record_bin.append(graph.edges[src, dest, index]["frag"])
            combinations.append(record_bin)

        for fragments in combinations:
            # create the composite plasmid
            plasmid = SeqRecord(Seq("", IUPACUnambiguousDNA()))
            for fragment in fragments:
                plasmid += fragment.upper()

            # make sure it's not just a re-ligation of insert + backbone
            plasmid_seq = str(plasmid.seq)
            if any(plasmid_seq in seq for seq in seen_seqs):
                continue

            # filter for plasmids that have an 'include' feature
            if not _has_features(plasmid, include):
                continue

            # re-order the fragments to try and match the input order
            fragments = _reorder_fragments(record_set, fragments)

            seen_seqs.add(str(plasmid.seq + plasmid.seq))
            seen_seqs.add(str(
                (plasmid.seq + plasmid.seq).reverse_complement()))

            # make a unique id for the fragments
            fragments_id = _hash_fragments(fragments)
            ids_to_fragments[fragments_id] = fragments
            ids_to_plasmids[fragments_id].append(plasmid)

    plasmids_and_fragments: List[Tuple[List[SeqRecord], List[SeqRecord]]] = []
    for ids, fragments in ids_to_fragments.items():
        plasmids = ids_to_plasmids[ids]
        for i, plasmid in enumerate(plasmids):
            plasmid.id = "+".join(f.id for f in fragments
                                  if f.id != "<unknown id>")
            plasmid.description = f"cloned from {', '.join(str(e) for e in enzymes)}"

            # Disambiguate multiple plasmids built from the same fragments.
            if len(plasmids) > 1:
                plasmid.id += f"({i + 1})"
        plasmids_and_fragments.append((plasmids, fragments))
    return plasmids_and_fragments
示例#18
0
def _parse_row(row: str) -> Optional[SeqRecord]:
    """Convert one 'row' element of the iGEM XML dump into a SeqRecord.

    The third-party `regex` package is used rather than an XML parser
    because some sha1 fields contain characters that break the XML
    parsers that were tried; the regexes are slow but tolerant.

    Args:
        row: the raw text of a single 'row' element in the XML

    Returns:
        A SeqRecord carrying the part id, sequence, annotations and
        features, or None if the part has no sequence or is too large
    """

    def first_group(pattern) -> str:
        """Return the first capture group of `pattern` in `row`, or ""."""
        found = pattern.search(row)
        return found[1] if found else ""

    name, seq, desc_short, desc_long, cats, cache, ftype, nickname = (
        [first_group(pattern) for pattern in RES])

    # skip parts with no sequence and parts too large to be useful
    if not seq or len(seq) > 10_000:
        return None

    features: List[SeqFeature] = []
    cache_match = RE_FEATURES.search(cache)
    if cache_match:
        for raw_feature in cache_match[1].split("]"):
            raw_feature = raw_feature.replace(", [", "")

            # a usable feature has at least five comma-separated fields
            if raw_feature.count(",") < 4:
                continue

            fields = [
                field.replace("[", "").replace("'", "").replace("(", "").strip()
                for field in raw_feature.split(",")
            ]
            f_type, f_start, f_end, f_name, f_strand = fields[:5]

            start = int(f_start)
            end = int(f_end)

            # drop empty, inverted, or sub-word-size ranges
            if start >= end or end - start < DNA_WORD_SIZE:
                continue

            direction = 1 if f_strand == "0" else -1
            features.append(
                SeqFeature(
                    id=f_name,
                    # have to -1 here. more CDS are of a length % 3 == 0 w/ this
                    # I don't think this was enforced on iGEM teams when making features
                    location=FeatureLocation(start - 1, end, direction),
                    type=_get_type(f_name, f_type.lower()),
                    strand=direction,
                ))

    return SeqRecord(
        Seq(seq, IUPACUnambiguousDNA()),
        id=name,
        dbxrefs=[name],
        annotations={
            "short_desc": desc_short,
            "description": desc_long,
            "categories": cats,
            "nickname": nickname,
            "type": _get_type(name, ftype),
        },
        features=features,
    )
示例#19
0
def fetch_ensembl_transcript(ensembl_transcript_id: str,
                             is_retry: bool = False) -> SeqRecord:
    """Fetch the requested Ensembl transcript.

    Get the requested Ensembl transcript, together with exon and
    coding region (CDS) boundaries.

    Parameters
    ----------
    ensembl_transcript_id : str
      the ensembl transcript id, of the form ENST..., or a gene symbol
      which is first resolved to an ENST id via `_gene_to_enst`

    is_retry : bool
      internal flag: True when this call is the single retry made after
      hitting the Ensembl REST rate limit; prevents endless retrying

    Returns
    -------
    `Bio.SeqRecord`

      The requested transcript sequence, in 5' -> 3' order, together
      with exon and CDS features. The coordinates of exons and CDS
      features are relative to the sequence fragment.

    Raises
    ------
    ValueError
      if either REST query fails or the response cannot be parsed

    >>> fetch_ensembl_transcript('ENST00000398844').description
    'chromosome:GRCh38:5:134648789:134727823:1'

    >>> fetch_ensembl_transcript('ATL3').description
    'Reverse complement of chromosome:GRCh38:11:63624087:63671612:-1'

    >>> fs = fetch_ensembl_transcript('ENST00000398844').features
    >>> len([f for f in fs if f.type == 'exon'])
    23
    """
    base_url = "http://rest.ensembl.org"

    if not ensembl_transcript_id.startswith('ENS'):
        # could be a gene symbol
        ensembl_transcript_id = _gene_to_enst(ensembl_transcript_id)

    # First, fetch the transcript sequence
    url = base_url + f"/sequence/id/{ensembl_transcript_id}"

    log.debug(f"Querying Ensembl for sequence of {ensembl_transcript_id}")
    response = _cached_session.get(url,
                                   params={
                                       "type": "genomic",
                                       "content-type": "application/json"
                                   })
    log.debug('Request cached: {}'.format(
        getattr(response, 'from_cache', False)))
    try:
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        log.error("Ensembl sequence REST query returned error "
                  "{}".format(response.text))
        raise ValueError(response.text)

    response_data = response.json()

    try:
        # 'desc' looks like 'chromosome:GRCh38:5:134648789:134727823:1'
        description = response_data['desc'].split(':')
        # NOTE: field 1 is the assembly name (e.g. 'GRCh38'), not a
        # species name; the variable name is historical. It is compared
        # against 'assembly_name' in the overlap response below.
        species = description[1]
        chromosome_number = description[2]  # may be X
        sequence_left = int(description[3])
        sequence_right = int(description[4])
        transcript_strand = int(description[5])

        if sequence_left > sequence_right:
            raise ValueError(
                f"Expected left sequence boundary {sequence_left} "
                f"<= right sequence boundary {sequence_right}: did "
                "the format of the Ensembl REST response change?")

        sequence_id = response_data['id']

        seq_str = response_data['seq']

        log.debug(f"Retrieved sequence {response_data['desc']} of length "
                  f"{sequence_right - sequence_left} for species {species} on "
                  f"strand {transcript_strand}")
    except (KeyError, ValueError) as e:
        log.error(e)
        log.error(
            'Error parsing sequence metadata from Ensembl REST response - '
            'did the format of the response change?')
        raise ValueError(e)

    if transcript_strand == -1:
        # If the transcript strand is -1, the sequence returned by
        # Ensembl is on the strand opposite the reference strand,
        # which is the strand of the Ensembl coordinates for
        # exons/coding regions. In this case, we initially store the
        # reverse complement of the sequence, and after fetching the
        # exon/coding regions, we'll return the reverse complement of
        # the `Bio.SeqRecord` object, which will properly re-index the
        # exon/coding regions.
        seq = Seq(seq_str, IUPACUnambiguousDNA()).reverse_complement()
    else:
        seq = Seq(seq_str, IUPACUnambiguousDNA())

    record = SeqRecord(seq, id=sequence_id, description=":".join(description))

    # Second, fetch the overlapping exon and CDS features
    url = base_url + f"/overlap/id/{ensembl_transcript_id}"

    log.debug(f"Querying Ensembl for overlaps of {ensembl_transcript_id}")
    response = _cached_session.get(url,
                                   params={
                                       "feature": ["cds", "exon"],
                                       "content-type": "application/json"
                                   })
    try:
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        log.error("Ensembl sequence REST query returned error "
                  "{}".format(response.text))
        # You have exceeded the limit of 15 requests per second; please reduce your concurrent connections
        if not is_retry and \
                response.text.strip().startswith('You have exceeded the limit of'):
            # Logger.warn is a deprecated alias of Logger.warning
            log.warning(
                'Waiting 4 seconds then retrying fetch_ensembl_transcript')
            time.sleep(4)
            return fetch_ensembl_transcript(ensembl_transcript_id,
                                            is_retry=True)

        raise ValueError(response.text)

    response_data = response.json()

    try:
        # Handle the unlikely event of a single piece of information
        # overlapping a lonely transcript
        if not hasattr(response_data, '__iter__'):
            response_data = [response_data]

        for response_datum in response_data:
            # keep only features belonging to this transcript ...
            if response_datum['Parent'] != ensembl_transcript_id:
                continue

            # ... on the same assembly as the sequence we fetched
            if response_datum['assembly_name'] != species:
                continue

            # We store feature locations 0-indexed from the left-most
            # sequence boundary
            record.features.append(
                SeqFeature(location=FeatureLocation(
                    int(response_datum['start']) - sequence_left,
                    int(response_datum['end']) - sequence_left + 1,
                    strand=int(response_datum['strand'])),
                           type=response_datum['feature_type']))
        num_exon_boundaries = len(
            [f for f in record.features if f.type == 'exon'])

        num_cds_boundaries = len(
            [f for f in record.features if f.type == 'cds'])

        log.debug(f"Retrieved {num_exon_boundaries} exons and "
                  f"{num_cds_boundaries} coding regions for transcript "
                  f"{ensembl_transcript_id}")
    except (KeyError, ValueError) as e:
        log.error(e)
        log.error(
            'Error parsing overlap metadata from Ensembl REST response - '
            'did the format of the response change?')
        raise ValueError(e)

    if transcript_strand == -1:
        # By default `reverse_complement` doesn't preserve
        # description, so force it...
        record = record.reverse_complement(description=True)

        # ...but update the description to make clear the sequence
        # we're storing is the reverse complement of the sequence
        # described by the metadata in the description
        record.description = "Reverse complement of " + record.description

    record.annotations['reference_species'] = species
    record.annotations['reference_chromosome_number'] = chromosome_number
    record.annotations['reference_left_index'] = sequence_left
    record.annotations['reference_right_index'] = sequence_right
    record.annotations['transcript_strand'] = transcript_strand

    # Finally, sort features by their start locations
    record.features.sort(key=lambda f: f.location.start)

    return record
Created on Wed Oct 24 17:30:42 2018

@author: xies
"""

import numpy as np
import matplotlib.pylab as plt
import pandas as pd
from Bio import SeqIO, motifs, SeqRecord
from Bio.Alphabet.IUPAC import IUPACUnambiguousDNA
from timeit import default_timer

filename = '/data/crispri_hamming/nanog/nanog_chip_peaks.fa'
# Load every NANOG ChIP peak sequence from the FASTA file into memory.
peaks = [rec for rec in SeqIO.parse(filename, 'fasta')]
for rec in peaks:
    # Tag each sequence as unambiguous DNA (legacy Biopython alphabet API).
    rec.seq.alphabet = IUPACUnambiguousDNA()
Npeaks = len(peaks)
# Per-peak boolean flag — presumably set True later when a guide is found
# in that peak; TODO confirm against the rest of the script.
has_guide = np.zeros(Npeaks, dtype=bool)
# NOTE(review): assumes all peaks are the same length as the first record —
# verify that the FASTA really contains fixed-width peaks.
Lpeak = len(peaks[0])

# Load NANOG PWM and rewrite into JASPAR format
filename = '/data/crispri_hamming/nanog/nanog_GSE11724.jaspar'
with open(filename) as fh:
    # motifs.parse may yield several motifs; only the first is used below.
    m = [m for m in motifs.parse(fh, "jaspar")]
pwm = m[0]
# Position-specific scoring matrix (log-odds) derived from the counts.
pssm = pwm.pssm

Lmotif = len(pwm)

# Assuming 25-25-25-25 background, find min and max scores and 80% threshold
min_score = pssm.min