Пример #1
0
    def test_EMBL_CCDS_RefSeq(self):
        """Check the format -> fetch -> split -> parse pipeline for mixed databases.

        Five accessions (two EMBL, one RefSeq, two CCDS) are pushed through
        ``dba.UrlFormatter``, ``dba.Entry_fetcher``, ``dba.EntrySplitter`` and
        ``dba.DnaParser``; every parsed item must be one of the expected
        ``CodingSequence`` objects and the counts must match.
        """
        # Expected results. Each CodingSequence is built from
        # (accession, database name, nucleotide Seq, protein Seq).
        # The EMBL/RefSeq sequences carry explicit alphabets, the CCDS ones
        # are built without an alphabet argument.
        exp = [
            CodingSequence(
                'CR456855', 'EMBL',
                Seq(
                    'ATGGAGGGTCAACGCTGGCTGCCGCTGGAGGCCAATCCCGAGGTCACCAACCAGTTTCTTAAACAATTAGGTCTACATCCTAACTGGCAATTCGTTGATGTATATGGAATGGATCCTGAACTCCTTAGCATGGTACCAAGACCAGTCTGTGCAGTCTTACTTCTCTTTCCTATTACAGAAAAGTATGAAGTATTCAGAACAGAAGAGGAAGAAAAAATAAAATCTCAGGGACAAGATGTTACATCATCAGTATATTTCATGAAGCAAACAATCAGCAATGCCTGTGGAACAATTGGACTGATTCATGCTATTGCAAACAATAAAGACAAGATGCACTTTGAATCTGGATCAACCTTGAAAAAATTCCTGGAGGAATCTGTGTCAATGAGCCCTGAAGAACGAGCCAGATACCTGGAGAACTATGATGCCATCCGAGTTACTCATGAGACCAGTGCCCATGAAGGTCAGACTGAGGCACCAAGTATAGATGAGAAAGTAGATCTTCATTTTATTGCATTAGTTCATGTAGATGGGCATCTCTATGAATTAGATGGGCGGAAGCCATTTCCAATTAACCATGGTGAAACTAGTGATGAAACTTTATTAGAGGATGCCATAGAAGTTTGCAAGAAGTTTATGGAGCGCGACCCTGATGAACTAAGATTTAATGCGATTGCTCTTTCTGCAGCTTAA',
                    IUPACUnambiguousDNA()),
                Seq(
                    'MEGQRWLPLEANPEVTNQFLKQLGLHPNWQFVDVYGMDPELLSMVPRPVCAVLLLFPITEKYEVFRTEEEEKIKSQGQDVTSSVYFMKQTISNACGTIGLIHAIANNKDKMHFESGSTLKKFLEESVSMSPEERARYLENYDAIRVTHETSAHEGQTEAPSIDEKVDLHFIALVHVDGHLYELDGRKPFPINHGETSDETLLEDAIEVCKKFMERDPDELRFNAIALSAA',
                    ExtendedIUPACProtein())),
            CodingSequence(
                'DQ917642', 'EMBL',
                Seq(
                    'ATGACGGGCAATGCCGGGGAGTGGTGCCTCATGGAAAGCGACCCCGGAGTCTTCACCGAGCTCATTAAAGGATTCGGTTGCCGAGGAGCCCAAGTAGAAGAAATATGGAGTTTAGAGCCTGAGAATTTTGAAAAATTAAAGCCAGTTCATGGGCTGATTTTTCTTTTCAAGTGGCAGCCCGGAGAAGAACCAGCAGGCTCTGTGGTTCAGGACTCCCGACTTGACACGATATTTTTTGCCAAGCAGGTAATTAATAATGCTTGTGCTACTCAAGCCATAGTAAGTGTGTTATTGAACTGTACCCATCAGGATGTCCATTTAGGAGAGACATTGTCAGAGTTTAAGGAATTCTCACAAAGTTTTGATGCAGCTATGAAAGGTTTGGCCCTGAGTAATTCGGATGTGATTCGCCAAGTACACAACAGTTTCGCCAGACAGCAAATGTTTGAATTTGATGCAAAGACATCAGCAAAAGAAGAAGATGCTTTTCACTTTGTCAGTTACGTTCCTGTGAATGGAAGACTGTACGAATTAGATGGATTAAGAGAAGGACCGATCGATTTAGGTGCATGCAATCAAGATGACTGGATCAGCGCAGTGAGGCCAGTCATAGAAAAAAGGATACAAAAGTACAGTGAAGGTGAAATTCGATTTAACTTAATGGCCATTGTGTCTGACAGGAAAATGATATATGAACAGAAGATAGCAGAGTTACAAAGACAGCTTGCTGAGGAGGAACCCATGGATACAGATCAGGGTAGTAACATGTTAAGTGCTATTCAGTCAGAAGTTGCCAAAAATCAGATGCTTATTGAAGAAGAAGTACAGAAATTAAAAAGATATAAGATTGAAAACATCAGAAGGAAGCATAATTATCTGCCTTTCATTATGGAACTGTTAAAGACTTTAGCAGAACACCAGCAGTTAATACCTCTCGTAGAAAAGGCAAAAGAAAAACAGAATGCGAAGAAGGCACAGGAAACCAAATGA',
                    IUPACUnambiguousDNA()),
                Seq(
                    'MTGNAGEWCLMESDPGVFTELIKGFGCRGAQVEEIWSLEPENFEKLKPVHGLIFLFKWQPGEEPAGSVVQDSRLDTIFFAKQVINNACATQAIVSVLLNCTHQDVHLGETLSEFKEFSQSFDAAMKGLALSNSDVIRQVHNSFARQQMFEFDAKTSAKEEDAFHFVSYVPVNGRLYELDGLREGPIDLGACNQDDWISAVRPVIEKRIQKYSEGEIRFNLMAIVSDRKMIYEQKIAELQRQLAEEEPMDTDQGSNMLSAIQSEVAKNQMLIEEEVQKLKRYKIENIRRKHNYLPFIMELLKTLAEHQQLIPLVEKAKEKQNAKKAQETK',
                    ExtendedIUPACProtein())),
            CodingSequence(
                'NM_001270952', 'RefSeq',
                Seq(
                    'ATGGATCCTGAACTCCTTAGCATGGTACCAAGACCAGTCTGTGCAGTCTTACTTCTCTTTCCTATTACAGAAAAGTATGAAGTATTCAGAACAGAAGAGGAAGAAAAAATAAAATCTCAGGGACAAGATGTTACATCATCAGTATATTTCATGAAGCAAACAATCAGCAATGCCTGTGGAACAATTGGACTGATTCATGCTATTGCAAACAATAAAGACAAGATGCACTTTGAATCTGGATCAACCTTGAAAAAATTCCTGGAGGAATCTGTGTCAATGAGCCCTGAAGAACGAGCCAGATACCTGGAGAACTATGATGCCATCCGAGTTACTCATGAGACCAGTGCCCATGAAGGTCAGACTGAGGCACCAAGTATAGATGAGAAAGTAGATCTTCATTTTATTGCATTAGTTCATGTAGATGGGCATCTCTATGAATTAGATGGGCGGAAGCCATTTCCAATTAACCATGGTGAAACTAGTGATGAAACTTTATTAGAGGATGCCATAGAAGTTTGCAAGAAGTTTATGGAGCGCGACCCTGATGAACTAAGATTTAATGCGATTGCTCTTTCTGCAGCATAG',
                    IUPACUnambiguousDNA()),
                Seq(
                    'MDPELLSMVPRPVCAVLLLFPITEKYEVFRTEEEEKIKSQGQDVTSSVYFMKQTISNACGTIGLIHAIANNKDKMHFESGSTLKKFLEESVSMSPEERARYLENYDAIRVTHETSAHEGQTEAPSIDEKVDLHFIALVHVDGHLYELDGRKPFPINHGETSDETLLEDAIEVCKKFMERDPDELRFNAIALSAA',
                    ExtendedIUPACProtein())),
            CodingSequence(
                'CCDS73586.1', 'CCDS',
                Seq('ATGGATCCTGAACTCCTTAGCATGGTACCAAGACCAGTCTGTGCAGTCTTACTTCTCTTTCCTATTACAGAAAAGTATGAAGTATTCAGAACAGAAGAGGAAGAAAAAATAAAATCTCAGGGACAAGATGTTACATCATCAGTATATTTCATGAAGCAAACAATCAGCAATGCCTGTGGAACAATTGGACTGATTCATGCTATTGCAAACAATAAAGACAAGATGCACTTTGAATCTGGATCAACCTTGAAAAAATTCCTGGAGGAATCTGTGTCAATGAGCCCTGAAGAACGAGCCAGATACCTGGAGAACTATGATGCCATCCGAGTTACTCATGAGACCAGTGCCCATGAAGGTCAGACTGAGGCACCAAGTATAGATGAGAAAGTAGATCTTCATTTTATTGCATTAGTTCATGTAGATGGGCATCTCTATGAATTAGATGGGCGGAAGCCATTTCCAATTAACCATGGTGAAACTAGTGATGAAACTTTATTAGAGGATGCCATAGAAGTTTGCAAGAAGTTTATGGAGCGCGACCCTGATGAACTAAGATTTAATGCGATTGCTCTTTCTGCAGCATAG'
                    ),
                Seq('MDPELLSMVPRPVCAVLLLFPITEKYEVFRTEEEEKIKSQGQDVTSSVYFMKQTISNACGTIGLIHAIANNKDKMHFESGSTLKKFLEESVSMSPEERARYLENYDAIRVTHETSAHEGQTEAPSIDEKVDLHFIALVHVDGHLYELDGRKPFPINHGETSDETLLEDAIEVCKKFMERDPDELRFNAIALSAA'
                    )),
            CodingSequence(
                'CCDS86041.1', 'CCDS',
                Seq('ATGACGGGCAATGCCGGGGAGTGGTGCCTCATGGAAAGCGACCCCGGGGTCTTCACCGAGCTCATTAAAGGATTCGGTTGCCGAGGAGCCCAAGTAGAAGAAATATGGAGTTTAGAGCCTGAGAATTTTGAAAAATTAAAGCCAGTTCATGGGTTAATTTTTCTTTTCAAGTGGCAGCCAGGAGAAGAACCAGCAGGCTCTGTGGTTCAGGACTCCCGACTTGACACGATATTTTTTGCTAAGCAGGTAATTAATAATGCTTGTGCTACTCAAGCCATAGTGAGTGTGTTACTGAACTGTACCCACCAGGATGTCCATTTAGGCGAGACATTATCAGAGTTTAAAGAATTTTCACAAAGTTTTGATGCAGCTATGAAAGGCTTGGCACTGAGCAATTCAGATGTGATTCGACAAGTACACAACAGTTTCGCCAGACAGCAAATGTTTGAATTTGATACGAAGACATCAGCAAAAGAAGAAGATGCTTTTCACTTTGTCAGTTATGTTCCTGTTAATGGGAGACTGTATGAATTAGATGGATTAAGAGAAGGACCGATTGATTTAGGTGCATGCAATCAAGATGATTGGATCAGTGCAGTAAGGCCTGTCATAGAAAAAAGGATACAAAAAGACGGGTTTTCACCATGTTGCCCAGGCTGGTCTCAGACTCCTGAGCTCAAGCCATCCGCCTGCCTCGACCTCCCAAAGTGGTACAGTGAAGGTGAAATTCGATTTAATTTAATGGCCATTGTGTCTGACAGAAAAATGATATATGAGCAGAAGATAGCAGAGTTACAAAGACAACTTGCAGAGGAACCCATGGATACAGATCAAGGTAATAGTATGTTAAGTGCTATTCAGTCAGAAGTTGCCAAAAATCAGATGCTTATTGAAGAAGAAGTACAGAAATTAAAAAGATACAAGATTGAGAATATCAGAAGGAAGCATAATTATCTGCCTTTCATTATGGAATTGTTAAAGACTTTAGCAGAACACCAGCAGTTAATACCACTAGTAGAAAAGGCAAAAGAAAAACAGAACGCAAAGAAAGCTCAGGAAACCAAATGA'
                    ),
                Seq('MTGNAGEWCLMESDPGVFTELIKGFGCRGAQVEEIWSLEPENFEKLKPVHGLIFLFKWQPGEEPAGSVVQDSRLDTIFFAKQVINNACATQAIVSVLLNCTHQDVHLGETLSEFKEFSQSFDAAMKGLALSNSDVIRQVHNSFARQQMFEFDTKTSAKEEDAFHFVSYVPVNGRLYELDGLREGPIDLGACNQDDWISAVRPVIEKRIQKDGFSPCCPGWSQTPELKPSACLDLPKWYSEGEIRFNLMAIVSDRKMIYEQKIAELQRQLAEEPMDTDQGNSMLSAIQSEVAKNQMLIEEEVQKLKRYKIENIRRKHNYLPFIMELLKTLAEHQQLIPLVEKAKEKQNAKKAQETK'
                    ))
        ]

        # Accessions to request, grouped by database. The EMBL and RefSeq ids
        # are requested with a ".1" version suffix, while the expected objects
        # above carry those accessions without it.
        ids = {
            'EMBL': ['CR456855.1', 'DQ917642.1'],
            'RefSeq': ['NM_001270952.1'],
            'CCDS': ['CCDS73586.1', 'CCDS86041.1']
        }
        # Turn every (database, id list) pair into queries and pool them.
        formatter = dba.UrlFormatter()
        queries = []
        for database, id_list in ids.items():
            queries += formatter.format(database, id_list)
        # fetch_all is driven to completion on the current asyncio event loop.
        # NOTE(review): presumably this performs live network requests -
        # confirm before running in an offline environment.
        loop = asyncio.get_event_loop()
        fetcher = dba.Entry_fetcher()
        entries = loop.run_until_complete(fetcher.fetch_all(queries))
        splitter = dba.EntrySplitter()
        entries = splitter.split(entries)
        # The loop obtained from get_event_loop() is closed here and is not
        # reopened by this test.
        loop.close()
        parser = dba.DnaParser()
        res = parser.parse(entries)
        # Order is not checked: every parsed item must be among the expected
        # ones, and the totals must agree.
        for item in res:
            self.assertTrue(item in exp)
        self.assertEqual(len(exp), len(res))
Пример #2
0
def seqcategory(oneseq):
    """Classify a raw sequence string as DNA, RNA, protein or not a sequence.

    The string is wrapped in a ``Seq`` under three alphabets (ambiguous DNA,
    ambiguous RNA, extended protein) and checked with
    ``Alphabet._verify_alphabet`` in that order; the first alphabet that
    accepts it decides the answer.

    Returns one of 'DNA', 'RNA', 'protein', or 'noseq' when no alphabet fits.
    """
    # Build all three candidate sequences up front, in priority order.
    candidates = [
        ('DNA', Seq(oneseq, IUPACAmbiguousDNA())),
        ('RNA', Seq(oneseq, IUPACAmbiguousRNA())),
        ('protein', Seq(oneseq, ExtendedIUPACProtein())),
    ]
    for label, candidate in candidates:
        if Alphabet._verify_alphabet(candidate):
            return label
    # Nothing matched: the input is not a valid sequence in any alphabet.
    return 'noseq'
 def parse(self, html_soup):
     """Build a CodingSequence from a parsed CCDS report page.

     html_soup is expected to offer ``find_all`` the way a BeautifulSoup
     document does (assumption - the caller is not visible here).

     The CCDS identifier is read from the page title, which looks like
     "Report for CCDS<id>[.<version>][ (current version)]"; both the version
     and the " (current version)" suffix are optional. Nucleotides are taken
     from the <span> elements whose id matches n<digits>, amino acids from
     those whose id matches p<digits>.

     Returns a CodingSequence(id, 'CCDS', dna_seq, aa_seq).

     Raises IndexError when no title string matches, and AssertionError when
     the protein on the page is not the translation of the nucleotide
     sequence. Anything raised by ``translate(cds=True)`` propagates.
     """
     # The version group is optional ("?"), as the title format allows.
     titlematcher = re.compile(
         r'Report for CCDS[0-9]*(?:\.[0-9]*)?(?:\ \(current version\))?')
     # find_all is used because find() does not take kwargs.
     title = html_soup.find_all(string=titlematcher)[0]
     # The dot is escaped so that only a literal "." introduces the version;
     # an unescaped "." would also swallow the space that follows the id.
     idmatcher = r'CCDS[0-9]*(?:\.[0-9]*)?'
     id_ = re.search(idmatcher, title).group(0)
     nucleotides = html_soup.find_all('span', {'id': re.compile('n[0-9]+')})
     aminoacids = html_soup.find_all('span', {'id': re.compile('p[0-9]+')})
     dna_seq = Seq(''.join([nt.text for nt in nucleotides]),
                   IUPACUnambiguousDNA())
     aa_seq = Seq(''.join([aa.text for aa in aminoacids]),
                  ExtendedIUPACProtein())
     # Explicit raise instead of ``assert`` so the consistency check still
     # runs under ``python -O``; the exception type is unchanged.
     translated = dna_seq.translate(cds=True)
     if not (aa_seq == translated):
         raise AssertionError(
             'protein sequence of %s does not match the translated CDS' % id_)
     return CodingSequence(id_, 'CCDS', dna_seq, aa_seq)
Пример #4
0
    def __init__(self,
                 unknown1letter='X',
                 unknown3letter='UNK',
                 gapchar='',
                 separator='',
                 UPPER=1):
        """Store the conversion settings on the instance.

        unknown1letter -- letter used when an unknown code is encountered.
        unknown3letter -- code used when an unknown letter is encountered.
        gapchar        -- gap code for single-letter sequences.
        separator      -- separator placed between the codes of a returned
                          three-letter sequence.
        UPPER          -- flag telling whether upper/lower case should not be
                          distinguished in the input; output is always
                          uppercase.
        """
        self.alphabet = ExtendedIUPACProtein()
        # Placeholders for unknown residues, one-letter then three-letter.
        self.unknown1, self.unknown3 = unknown1letter, unknown3letter
        # Formatting options.
        self.gap_char, self.separator = gapchar, separator
        self.upper = UPPER

        self.blankseq3 = [
            'Xer',
            'Xaa',
            'Ter',
            'Sel',
        ]
                    type=int,
                    default='60',
                    help='pairwise alignment output width')
# Read the command line into ``args``.
args = parser.parse_args()

# Report a usage error as soon as one of the two input files is missing.
for file in (args.query_seq, args.target_seq):
    if path.isfile(file):
        continue
    parser.error("File %s doesn't exist" % file)

# Alphabet for the requested sequence type; anything that is neither
# 'dna' nor 'rna' is treated as protein.
args.seq_abc = (
    IUPACAmbiguousDNA() if args.seq_type == 'dna'
    else IUPACAmbiguousRNA() if args.seq_type == 'rna'
    else ExtendedIUPACProtein()
)

# Aligners setup: only the global aligner is instantiated at this point.
aligners = {'global': PairwiseAligner(), 'local': None}
aligners['global'].mode = 'global'
if args.seq_type not in ('dna', 'rna'):
    # Protein: score with the matrix named by --sub_matrix, looked up as an
    # attribute of Bio.SubsMat.MatrixInfo.
    sub_matrix = getattr(import_module('Bio.SubsMat.MatrixInfo'),
                         args.sub_matrix)
    aligners['global'].substitution_matrix = sub_matrix
else:
    # Nucleotides: plain match/mismatch scoring.
    aligners['global'].match = args.match_score
    aligners['global'].mismatch = args.mismatch_score
    # NOTE(review): any falsy gap score - including an explicit 0 - is
    # replaced by the default below; confirm that this is intended.
    args.open_gap_score = args.open_gap_score or -5
    args.extend_gap_score = args.extend_gap_score or -2
Пример #6
0
class Polypeptide(seq.SequenceMolecule):
    """Sequence molecule offering a strict and a permissive protein alphabet."""

    # 'strict' maps to the plain IUPAC protein alphabet, 'permissive' to the
    # extended one.
    alphabet_dict = dict(
        strict=IUPACProtein(),
        permissive=ExtendedIUPACProtein(),
    )
def realign(aln_file, path_contig_ma_prot):
    """
    Pad contig/subject alignment pairs with "-" so that every pair lines up
    with the full subject sequences read from the protein fasta file, and
    return the result as a list of SeqRecord objects.

    :param aln_file: aln file opened by open_aln function; read
                     positionally here (0: number of alignments,
                     1: contig names, 2: frames, 3-4: contig begin/end,
                     5: contig protein strings, 6: subject accessions,
                     8: subject begin, 10: subject protein strings)
    :param path_contig_ma_prot: path to fasta file containing
                            the contigs multi-aligned in proteins
    :type aln_file: list of lists
    :type path_contig_ma_prot: String

    :return: msa_prot
    :rtype: list of BioPython SeqRecord; for each subject accession the
            padded subject comes first, followed by its padded contigs

    .. note:: requires BioPython library
            meant to be used in main function
            to have an output, use realign_w_output
            the contig names held in aln_file[1] are modified in place
            when the same name occurs more than once
    """
    # result list, filled in the last section
    msa_prot = []

    # reading the fasta file: records and their ids, as returned by open_fasta
    prot_bioseq, prot_id0 = open_fasta(path_contig_ma_prot,
                                       ab=ExtendedIUPACProtein())

    # keep only the part of each fasta id that precedes '_ORGANISM'
    prot_id = []
    for i in range(len(prot_id0)):
        prot_id.append(prot_id0[i].split('_ORGANISM')[0])
    """	
    opening the aln file and creating dictionaries	
    """
    # positional fields of aln_file; indexes 7 and 9 are not used here
    nb_seq_aln = aln_file[0]
    contig_query = aln_file[1]
    contig_frame = aln_file[2]
    contig_beg, contig_end = aln_file[3:5]
    contig_prot = aln_file[5]
    subject_acc = aln_file[6]
    subject_beg = aln_file[8]
    subject_prot = aln_file[10]

    # some contig queries are the same so we change the second name:
    # a repeated name becomes
    # <name>__<occurrence number>__coord_<begin>_<end>_<frame flag>
    used_queries = []
    for i in range(len(contig_query)):
        if contig_query[i] in used_queries:
            # n = how many times this name has been seen before
            n = used_queries.count(contig_query[i])
            used_queries.append(contig_query[i])
            # defining the frame: True unless the frame value is negative
            frame = True
            if contig_frame[i] < 0:
                frame = False
            contig_query[i] = contig_query[i]+"__"+str(n+1)+"__coord_"+\
                str(contig_beg[i])+"_"+str(contig_end[i])+"_"+str(frame)
        else:
            used_queries.append(contig_query[i])
    """
    we build nested dictionaries, one dictionary per protein acc
    each one contains one dictionary per subject_query having
    start, contig_prot, contig_end as keys
    aln name of the main dict
    
    aln = {
            acc1 = {
                "entire",
                query1 = {"start",
                          "contig",
                          "subject"},
                query2:
                ...
                    },
            acc2 = { 
                    ...
                    },
            ...
                
        }
    """
    aln = {}

    # find all unique subject_acc, keeping first-seen order
    unique_acc = []
    for acc in subject_acc:
        if acc not in unique_acc:
            unique_acc.append(acc)

    # creating the dictionaries; accessions absent from the fasta ids are
    # skipped entirely
    for acc in unique_acc:
        if acc in prot_id:
            acc_in_fasta = prot_id.index(acc)
            aln.update({acc: {"entire": prot_bioseq[acc_in_fasta].seq}})
            # every alignment row whose subject is this accession
            indexes = [i for i in range(nb_seq_aln) if subject_acc[i] == acc]
            for index in indexes:
                aln[acc].update({
                    contig_query[index]: {
                        # subject_beg is shifted by one to get a 0-based start
                        "start": subject_beg[index] - 1,
                        "contig": contig_prot[index],
                        "subject": subject_prot[index]
                    }
                })
    """
    counting the number of indels that need to be added in front of the partial 
    sequence
    using indels_front function from realigning_functions module
    """
    # used_acc: accessions fully processed so far
    # used_queries (reset per accession): queries of the current accession
    # that have already been padded to full length
    used_acc = []
    for acc in aln:
        used_queries = []
        for query in aln[acc]:
            if query != "entire":
                aln[acc][query]["indels_front"] = indels_front(
                    aln[acc][query]["start"], aln[acc][query]["subject"],
                    aln[acc]["entire"])
                # adding the indels of this query on both the entire and partial sequence
                # position j of the partial subject is compared with position
                # no_indels_front + j of the entire sequence
                j = 0
                no_indels_front = aln[acc][query]["indels_front"]
                while j <= (len(aln[acc][query]["subject"])) - 1:
                    # gap in the entire sequence but not in the partial
                    # subject: insert the gap into this subject/contig pair
                    if aln[acc]["entire"][
                            no_indels_front +
                            j] == "-" and aln[acc][query]["subject"][j] != "-":
                        aln[acc][query]["subject"] = aln[acc][query][
                            "subject"][:j] + "-" + aln[acc][query]["subject"][
                                j:]
                        aln[acc][query]["contig"] = aln[acc][query][
                            "contig"][:j] + "-" + aln[acc][query]["contig"][j:]
                    # gap in the partial subject but not in the entire
                    # sequence: insert the gap into the entire sequence and
                    # propagate it to everything that shares its coordinates
                    if aln[acc][query]["subject"][j] == "-" and aln[acc][
                            "entire"][no_indels_front + j] != "-":
                        aln[acc]["entire"] = aln[acc]["entire"][:(no_indels_front + j)]\
                            +"-" \
                            +aln[acc]["entire"][(no_indels_front + j):]
                        # we add the same indel on the previous queries from this acc
                        for prev_query in used_queries:
                            aln[acc][prev_query]["subject"] = \
                                aln[acc][prev_query]["subject"][:(no_indels_front + j)] \
                                    + "-" \
                                    + aln[acc][prev_query]["subject"][(no_indels_front + j):]
                            aln[acc][prev_query]["contig"] = \
                                aln[acc][prev_query]["contig"][:(no_indels_front + j)] \
                                    + "-" \
                                    + aln[acc][prev_query]["contig"][(no_indels_front + j):]

                        # we add the same indel on the entires of acc not processed
                        for acc0 in aln:
                            if acc0 != acc and acc0 not in used_acc:
                                aln[acc0]["entire"] = aln[acc0]["entire"][:(no_indels_front + j)] \
                                    + "-" \
                                    + aln[acc0]["entire"][(no_indels_front + j):]

                        # we add the same indel everywhere on the sequences we already processed
                        for acc0 in aln:
                            if acc0 != acc and acc0 in used_acc:
                                for query0 in aln[acc0]:
                                    if query0 == "entire":
                                        aln[acc0][query0] = aln[acc0][query0][:(no_indels_front + j)] \
                                            + "-" \
                                                + aln[acc0][query0][(no_indels_front + j):]
                                    else:
                                        aln[acc0][query0]["subject"] = \
                                        aln[acc0][query0]["subject"][:(no_indels_front + j)] \
                                            + "-" \
                                             + aln[acc0][query0]["subject"][(no_indels_front + j):]
                                        aln[acc0][query0]["contig"] = \
                                        aln[acc0][query0]["contig"][:(no_indels_front + j)] \
                                             + "-" \
                                             + aln[acc0][query0]["contig"][(no_indels_front + j):]
                    j += 1
                # counting the indels at the back of subject: whatever is
                # left of the entire length after the front padding and the
                # (gapped) partial subject
                dico = aln[acc][query]
                dico["indels_back"] = len(aln[acc]["entire"]) - \
                    (dico["indels_front"] + len(dico["subject"]))
                aln[acc][query] = dico

                # adding the indels on the subject and on the contig, front
                # and back
                dico = aln[acc][query]
                dico["subject"] = "-" * dico["indels_front"] + dico["subject"] \
                                + "-" * dico["indels_back"]
                dico["contig"] = "-" * dico["indels_front"] \
                        + dico["contig"] \
                        + "-" * dico["indels_back"]
                used_queries.append(query)
                aln[acc][query] = dico
        used_acc.append(acc)
    """ 
    converting to BioPython protein sequences 
    """
    # NOTE(review): `Alphabet` is passed as the second argument of Seq below;
    # whether that name is an alphabet instance or something else cannot be
    # determined from this function - confirm against the file's imports.
    for acc in aln:
        aln[acc]["entire_bioseq"] = SeqRecord(Seq(str(aln[acc]["entire"]),
                                                  Alphabet),
                                              id=acc,
                                              name="No name",
                                              description="subject")
        msa_prot.append(aln[acc]["entire_bioseq"])
        for query in aln[acc]:
            # queries whose name does not contain "coord": the name is used
            # as the record id as is
            if query != "entire" and query != "entire_bioseq" and "coord" not in query:
                dico = aln[acc][query]
                dico["contig_bioseq"] = SeqRecord(Seq(dico["contig"],
                                                      Alphabet),
                                                  id=query,
                                                  name="No name",
                                                  description="contig")
                aln[acc][query] = dico
                msa_prot.append(aln[acc][query]["contig_bioseq"])

            # names containing "coord" (this includes the renamed
            # duplicates): the id is the part before "__coord" and the
            # description carries the last "__"-separated piece
            elif query != "entire" and query != "entire_bioseq":
                dico = aln[acc][query]
                dico["contig_bioseq"] = SeqRecord(
                    Seq(dico["contig"], Alphabet),
                    id=(query.split("__coord")[0]),
                    name="No name",
                    description="_contig_" + query.split("__")[-1])
                aln[acc][query] = dico
                msa_prot.append(aln[acc][query]["contig_bioseq"])

    return msa_prot