Пример #1
0
    def setUp(self):
        self.tt = TraceTracker()
        mock('helperlibs.bio.seqio.write', tracker=self.tt)
        self.options = Mock('options',
                            tracker=self.tt,
                            outputfoldername='test',
                            input_type='nucl')

        rec1_features = [
            generate_cluster_feature(23, 42, 'lantipeptide', 1),
            generate_cluster_feature(300, 500, 'nrps', 2)
        ]
        rec2_features = [
            generate_cluster_feature(50, 70, 'lassopeptide', 3),
            generate_cluster_feature(500, 700, 't1pks', 4)
        ]

        record1 = SeqRecord(UnknownSeq(1000),
                            'record1',
                            name='record1',
                            features=rec1_features)
        record2 = SeqRecord(UnknownSeq(1000),
                            'record2',
                            name='record2',
                            features=rec2_features)
        self.records = [record1, record2]

        self.expected_template = """\
Пример #2
0
    def test_join_UnknownSeq(self):
        """Check that UnknownSeq.join places the spacer between the joined items."""
        gap5 = UnknownSeq(5, character="-")
        gap0 = UnknownSeq(0, character="-")

        # Two 5-letter gap sequences joined by a 5-letter gap spacer.
        two_gaps = [UnknownSeq(5, character="-"), UnknownSeq(5, character="-")]
        self.assertEqual("-" * 15, gap5.join(two_gaps))
        # A Seq followed by a gap sequence.
        seq_then_gap = [Seq("NNNNN"), UnknownSeq(5, character="-")]
        self.assertEqual("N" * 5 + "-" * 10, gap5.join(seq_then_gap))

        plain = ["ATG"] * 4
        plain_and_seq = ["ATG", "ATG", Seq("ATG"), "ATG"]

        # Plain strings joined with the zero-length spacer.
        self.assertEqual(gap0.join(plain), "".join(plain))

        for gap in (gap5, gap0):
            gap_text = str(gap)
            self.assertEqual(gap.join(plain_and_seq), gap_text.join(plain))
            # A single sequence argument is joined letter by letter.
            for item in plain + plain_and_seq:
                self.assertEqual(gap_text.join(str(item)), str(gap.join(item)))
Пример #3
0
    def test_join_UnknownSeq(self):
        """Check that UnknownSeq.join concatenates with the spacer and keeps its alphabet."""
        # join is only expected to take an iterable of Seq objects and/or strings.
        empty_gap = UnknownSeq(0, character="-", alphabet=generic_dna)
        gaps = [empty_gap]
        gaps.append(UnknownSeq(5, character="-", alphabet=generic_dna))
        gaps.append(UnknownSeq(5, character="-", alphabet=generic_nucleotide))

        plain = ["ATG"] * 4
        plain_and_seq = ["ATG", "ATG", Seq("ATG", generic_dna), "ATG"]

        # Plain strings joined with the zero-length spacer.
        joined_plain = empty_gap.join(plain)
        self.assertEqual(str(joined_plain), "".join(plain))
        self.assertEqual(joined_plain.alphabet, empty_gap.alphabet)

        for gap in gaps:
            joined = gap.join(plain_and_seq)
            self.assertEqual(str(joined), str(gap).join(plain))
            self.assertEqual(joined.alphabet, gap.alphabet)
Пример #4
0
    def test_count_overlap_start_end_NN(self):
        """Exercise count_overlap with "NN" (and friends) over varying start/end bounds."""
        # Seq and MutableSeq: (start, end, expected count) on a sequence with no N.
        bounds_and_expected = [
            (1, 7, 0), (3, None, 0), (3, 6, 0), (4, 6, 0),
            (4, -1, 0), (-5, None, 0), (-5, 7, 0), (7, -5, 0),
            (-100, None, 0), (None, 100, 0), (-100, 1000, 0),
        ]
        letters = "GTAGGGGAG"
        for begin, stop, expected in bounds_and_expected:
            for seq_class in (Seq, MutableSeq):
                self.assertEqual(
                    seq_class(letters).count_overlap("NN", begin, stop),
                    expected)

        # Seq and MutableSeq on a more heterogeneous sequence.
        mixed_letters = "GGGTGGTAGGG"
        for bounds in [(), (2, 8), (-11, 6), (7, 2)]:
            for seq_class in (Seq, MutableSeq):
                self.assertEqual(
                    seq_class(mixed_letters).count_overlap("NN", *bounds), 0)
        self.assertEqual(Seq(mixed_letters).count_overlap("NN", -10, -2), 0)

        # UnknownSeq: (alphabet, character, start, end, expected count).
        unknown_cases = [
            (generic_rna, "N", 1, 7, 5),
            (generic_dna, "N", 1, 7, 5),
            (generic_rna, "N", -4, None, 3),
            (generic_dna, "N", -4, None, 3),
            (generic_protein, "X", 1, 7, 0),
        ]
        for alpha, char, begin, stop, expected in unknown_cases:
            unknown = UnknownSeq(12, alpha, char)
            self.assertEqual(unknown.count_overlap("NN", begin, stop), expected)
        self.assertEqual(
            UnknownSeq(12, character="X").count_overlap("NN", 1, 7), 0)

        # UnknownSeq: further cases, including unusual bounds
        # (substring, start, end, expected count).
        edge_cases = [
            ("N", 100, 105, 0), ("N", -1, 4, 0),
            ("N", 4, -1, 2), ("N", -8, -2, 5),
            ("N", -2, -8, 0), ("N", 8, 2, 0),
            ("N", 2, 8, 5), ("NN", 8, 2, 0),
            ("NN", 2, 8, 4), ("NN", -5, -1, 3),
            ("NN", 1, 5, 3), ("NNN", None, None, 5),
            ("NNNNNNNNN", None, None, 0), ("NNN", 1, 2, 0),
        ]
        for sub, begin, stop, expected in edge_cases:
            seven_n = UnknownSeq(7, character="N")
            self.assertEqual(seven_n.count_overlap(sub, begin, stop), expected)
        self.assertEqual(
            UnknownSeq(7, character="N").count_overlap("NN", 1), 5)
Пример #5
0
 def test_join_UnknownSeq_TypeError(self):
     """Check that joining sequences with incompatible alphabets raises TypeError."""
     dna_gap = UnknownSeq(5, character="-", alphabet=generic_dna)

     # RNA items joined with a DNA spacer.
     rna_gaps = [
         UnknownSeq(5, character="-", alphabet=generic_rna),
         UnknownSeq(5, character="-", alphabet=generic_rna),
     ]
     with self.assertRaises(TypeError):
         dna_gap.join(rna_gaps)

     # Protein items joined with a DNA spacer.
     protein_items = [
         Seq("NNNNN", generic_protein),
         UnknownSeq(5, character="-", alphabet=generic_protein),
     ]
     with self.assertRaises(TypeError):
         dna_gap.join(protein_items)
Пример #6
0
def NoStop(input_file_path, out_name):
    """Mask in-frame stop codons in every record of a FASTA file.

    Each record is walked codon by codon starting at position 0 (sequences
    are assumed to be in frame).  Any codon found in the hard-coded stop
    codon list is replaced by "???".  Two files are appended to, both named
    relative to the current working directory:

    * "<input basename>_<out_name>.fasta" receives the masked record.
    * "<input basename>_log.csv" receives one line per record holding the
      basename, the record name, the number of "???" occurrences and the
      position of the first one (-1 if there is none).

    :param input_file_path: Path of the FASTA file to read.
    :param out_name: Suffix appended to the input basename for the output file.
    :return: None
    """
    # Stop codons are hard coded here; change based on taxa/phyla.
    codon_stop_array = ["TGA", "TAG", "TAA", "UGA", "UAA", "UAG"]
    base_name = os.path.splitext(os.path.basename(input_file_path))[0]
    # NB/caution: assumes sequences are in frame!
    for record in SeqIO.parse(input_file_path, "fasta", generic_alphabet):
        temp_seq = Seq("", generic_alphabet)
        for index in range(0, len(record.seq), 3):
            codon = record.seq[index:index + 3]
            # Compare as text so the membership test does not depend on how
            # Seq objects implement equality against plain strings.
            if str(codon) in codon_stop_array:
                codon = UnknownSeq(3, character='?')
            temp_seq += codon
        # Append the masked record in FASTA format.  write() is used instead
        # of the Python-2-only "print >>" form, and the with-blocks close the
        # files even if writing fails.
        with open("%s_%s.fasta" % (base_name, out_name), "a+") as fasta_out:
            fasta_out.write(">%s\n%s\n" % (record.name, temp_seq))
        # Append the change log line: basename, taxon name, number of masked
        # stop codons and position of the first one.
        with open("%s_log.csv" % base_name, "a+") as log_out:
            log_out.write("%s , %s , %s , %s\n" % (
                base_name, record.name,
                temp_seq.count("???"), temp_seq.find("???")))
    return
Пример #7
0
    def test_join_UnknownSeq_TypeError_iter(self):
        """Check that join rejects non-iterables and iterables with unsupported items."""
        gap = UnknownSeq(5, character="-")
        bad_arguments = (
            5,  # not iterable at all
            ["ATG", "ATG", 5, "ATG"],  # iterable, but one item is an int
        )
        for bad in bad_arguments:
            self.assertRaises(TypeError, gap.join, bad)
Пример #8
0
 def test_generated(self):
     """Write and read back odd SeqRecord objects.

     A set of unusual records (very long descriptions, mutable, unknown and
     empty sequences, Solexa qualities, very large qualities) is written in
     each supported format, parsed back and compared with the originals.
     BiopythonWarning is silenced while doing so.
     """
     record1 = SeqRecord(Seq("ACGT" * 500, generic_dna), id="Test",
                         description="Long " * 500,
                         letter_annotations={"phred_quality": [40, 30, 20, 10] * 500})
     record2 = SeqRecord(MutableSeq("NGGC" * 1000), id="Mut",
                         description="very " * 1000 + "long",
                         letter_annotations={"phred_quality": [0, 5, 5, 10] * 1000})
     record3 = SeqRecord(UnknownSeq(2000, character="N"), id="Unk",
                         description="l" + ("o" * 1000) + "ng",
                         letter_annotations={"phred_quality": [0, 1] * 1000})
     record4 = SeqRecord(Seq("ACGT" * 500), id="no_descr",
                         description="", name="",
                         letter_annotations={"phred_quality": [40, 50, 60, 62] * 500})
     record5 = SeqRecord(Seq("", generic_dna), id="empty_p",
                         description="(could have been trimmed lots)",
                         letter_annotations={"phred_quality": []})
     record6 = SeqRecord(Seq(""), id="empty_s",
                         description="(could have been trimmed lots)",
                         letter_annotations={"solexa_quality": []})
     record7 = SeqRecord(Seq("ACNN" * 500), id="Test_Sol",
                         description="Long " * 500,
                         letter_annotations={"solexa_quality": [40, 30, 0, -5] * 500})
     record8 = SeqRecord(Seq("ACGT"), id="HighQual",
                         description="With very large qualities that even Sanger FASTQ can't hold!",
                         letter_annotations={"solexa_quality": [0, 10, 100, 1000]})
     # TODO - Record with no identifier?
     records = [record1, record2, record3, record4,
                record5, record6, record7, record8]
     # TODO - Have a Biopython defined "DataLossWarning?"
     # catch_warnings() restores the previous filter list on exit, also when
     # a comparison fails.  simplefilter() inserts its entry at the front of
     # the list, so popping the last entry afterwards would remove an
     # unrelated filter instead.
     with warnings.catch_warnings():
         warnings.simplefilter('ignore', BiopythonWarning)
         # TODO - Include phd output?
         for fmt in ["fasta", "fastq", "fastq-solexa", "fastq-illumina", "qual"]:
             handle = StringIO()
             SeqIO.write(records, handle, fmt)
             handle.seek(0)
             compare_records(records,
                             list(SeqIO.parse(handle, fmt)),
                             truncation_expected(fmt))
Пример #9
0
def merge(records, length=20, spacer='n'):
    """Join several SeqRecords into one record, separated by spacer records.

    The id, name, description and a fixed set of annotations of the first
    record are carried over to the merged record.

    :param records: Sequence of SeqRecords to merge (it is indexed and sliced)
    :param length: Length of the spacer in kbp
    :param spacer: Kind of spacer to use ('n' for UnknownSeq spacer, 'stop' for all-frame stop codon spacer)

    :return: A single SeqRecord that is the product of the merge; the first
             record itself when only one record is given.
    :raises ValueError: if the spacer kind is invalid or no records are given
    """
    if spacer != 'n' and spacer != 'stop':
        raise ValueError("Invalid spacer: %r, use either 'n' or 'stop'" % spacer)

    if len(records) == 0:
        raise ValueError("No records given")

    spacer_len = length * 1000
    if spacer == 'stop':
        spacer_seq = Seq(ALL_FRAME_STOP_MOTIF * 40 * length, Alphabet.generic_dna)
    else:
        spacer_seq = UnknownSeq(spacer_len, alphabet=Alphabet.generic_dna, character='N')

    merged = records[0]
    if len(records) == 1:
        return merged

    # Remember what identifies the first record; it is re-applied to the
    # merged record once all pieces have been added together.
    kept_id, kept_name, kept_desc = merged.id, merged.name, merged.description
    first_annotations = merged.annotations
    kept_annotations = [
        ("date", first_annotations.get('date', '')),
        ("source", first_annotations.get("source", '')),
        ("organism", first_annotations.get('organism', '')),
        ("taxonomy", first_annotations.get('taxonomy', [])),
        ("data_file_division", first_annotations.get('data_file_division', 'UNK')),
        ("topology", first_annotations.get('topology', 'linear')),
    ]

    for number, rec in enumerate(records[1:], 1):
        label = 'spacer_{}'.format(number)
        # Each spacer gets its own numbered misc_feature covering it.
        feature = SeqFeature(FeatureLocation(0, spacer_len, 0),
                             type='misc_feature', id=label,
                             qualifiers={'note': [label]})
        spacer_rec = SeqRecord(spacer_seq, id=label, name=label,
                               description=label, features=[feature])
        merged = merged + spacer_rec + rec

    merged.id = kept_id
    merged.name = kept_name
    merged.description = kept_desc
    for key, value in kept_annotations:
        merged.annotations[key] = value

    return merged
Пример #10
0
def replace_missing(alignment):
    '''Replace all-gap/all-N records of an alignment with unknown sequence.

       alignment is a MultipleSeqAlignment object.  A record whose sequence,
       upper-cased and with every "N" turned into "-", equals a run of "-"
       as long as the first record gets its seq replaced by an UnknownSeq of
       N's of the record's own length.  The alignment is modified in place
       and returned.'''
    for rec in alignment:
        all_gaps = '-' * len(alignment[0])
        normalised = str(rec.seq).upper().replace("N", "-")
        if normalised != all_gaps:
            continue
        rec.seq = UnknownSeq(len(rec.seq), character='N')
    return alignment
Пример #11
0
 def test_join_UnknownSeq_mixed_alpha(self):
     """Check UnknownSeq can join sequences whose alphabets differ from the spacer's."""
     dna_gap = UnknownSeq(5, character="-", alphabet=generic_dna)

     # RNA items joined with a DNA spacer.
     rna_gaps = [
         UnknownSeq(5, character="-", alphabet=generic_rna),
         UnknownSeq(5, character="-", alphabet=generic_rna),
     ]
     self.assertEqual("-" * 15, dna_gap.join(rna_gaps))

     # Protein items joined with a DNA spacer.
     protein_items = [
         Seq("NNNNN", generic_protein),
         UnknownSeq(5, character="-", alphabet=generic_protein),
     ]
     self.assertEqual("N" * 5 + "-" * 10, dna_gap.join(protein_items))
 def gff_write(self, out_rec_features, contig_id = None, contig_len = None, out_file = None, global_context = None):
   """Write the given features for one contig to out_file in GFF format.

   Nothing happens when out_file is None or out_rec_features is empty.
   Otherwise update_stashed() is called with the features and
   global_context, and the features are attached to a placeholder record
   (unknown sequence of contig_len, id contig_id) that is passed to
   GFF.write.
   """
   if out_file is None or not out_rec_features:
     return
   self.update_stashed(out_rec_features, global_context)
   placeholder = SeqRecord(UnknownSeq(length=contig_len), id = contig_id)
   placeholder.features = out_rec_features
   GFF.write([placeholder], out_file)
Пример #13
0
    def test_process_wgs_master_scaffolds_wgs(self):
        """Test process_wgs_master_scaffolds() gets contigs from NCBI if there are WGS/WGS_SCAFLD annotations"""
        # fix_wgs_master_record is mocked to hand back its argument in a list,
        # so the processed result should equal the input records.
        mock('utils.fix_wgs_master_record',
             tracker=self.tt,
             returns_func=lambda x: [x])
        wgs_record = FakeRecord(annotations={'wgs': 'foo'})
        scaffold_record = FakeRecord(annotations={'wgs_scafld': 'bar'})
        wgs_record.seq = UnknownSeq(23)
        scaffold_record.seq = UnknownSeq(42)
        fake_records = [wgs_record, scaffold_record]

        # One traced call of the mocked function is expected per record.
        expected_trace = (
            "Called utils.fix_wgs_master_record(\n"
            "    <antismash.test.test_utils.FakeRecord object at ...>)\n"
            "Called utils.fix_wgs_master_record(\n"
            "    <antismash.test.test_utils.FakeRecord object at ...>)\n"
        )
        processed = utils.process_wgs_master_scaffolds(fake_records)
        self.assertEqual(fake_records, processed)
        assert_same_trace(self.tt, expected_trace)
 def setUp(self):
     """Create a circular record plus a CDS and a gene feature sharing one compound location."""
     record = SeqRecord(UnknownSeq(21))
     # Two forward-strand segments joined as 12..21 followed by 0..9.
     segments = [
         FeatureLocation(12, 21, strand=1),
         FeatureLocation(0, 9, strand=1),
     ]
     location = CompoundLocation(segments, operator="join")
     self.seqrec = record
     self.seqcds = SeqFeature(location, type="CDS")
     self.seqgene = SeqFeature(location, type="gene")
     record.annotations["topology"] = "circular"
Пример #15
0
def _retrieve_seq(adaptor, primary_id):
    """Return the sequence stored for a bioentry, or None if it has no row.

    :param adaptor: object providing execute_and_fetchall(sql, params)
    :param primary_id: bioentry_id to look up in the biosequence table
    :return: None when no biosequence row exists; a DBSeq when sequence data
             is stored; an UnknownSeq of the recorded length when only the
             length is stored.
    :raises AssertionError: if more than one row matches, if the recorded
             length disagrees with the stored sequence length, or if the
             molecule type is not recognised.
    """
    # The database schema ensures there will be only one matching
    # row in the table.

    # If an UnknownSeq was recorded, seq will be NULL,
    # but length will be populated.  This means length(seq)
    # will return None.
    seqs = adaptor.execute_and_fetchall(
        "SELECT alphabet, length, length(seq) FROM biosequence WHERE bioentry_id = %s",
        (primary_id, ),
    )
    if not seqs:
        return
    assert len(seqs) == 1
    moltype, given_length, length = seqs[0]

    try:
        length = int(length)
    except TypeError:
        # No sequence data: fall back to the recorded length column.
        assert length is None
        seqs = adaptor.execute_and_fetchall(
            "SELECT alphabet, length, seq FROM biosequence WHERE bioentry_id = %s",
            (primary_id, ),
        )
        assert len(seqs) == 1
        moltype, given_length, seq = seqs[0]
        assert seq is None or seq == ""
        length = int(given_length)
        have_seq = False
        del seq
    else:
        # Cross-check the recorded length column against the length of the
        # stored sequence.  A missing length column is tolerated.
        if given_length is not None:
            assert length == int(given_length), (
                "Recorded length %r does not match sequence length %r"
                % (given_length, length))
        have_seq = True
    del given_length

    moltype = moltype.lower()  # might be upper case in database
    # We have no way of knowing if these sequences will use IUPAC
    # alphabets, and we certainly can't assume they are unambiguous!
    if moltype == "dna":
        alphabet = Alphabet.generic_dna
    elif moltype == "rna":
        alphabet = Alphabet.generic_rna
    elif moltype == "protein":
        alphabet = Alphabet.generic_protein
    elif moltype == "unknown":
        # This is used in BioSQL/Loader.py and would happen
        # for any generic or nucleotide alphabets.
        alphabet = Alphabet.single_letter_alphabet
    else:
        raise AssertionError("Unknown moltype: %s" % moltype)

    if have_seq:
        return DBSeq(primary_id, adaptor, alphabet, 0, int(length))
    else:
        return UnknownSeq(length, alphabet)
Пример #16
0
    def test_equality(self):
        """Check ==/!= between sequence objects and plain Python values."""
        # Seq and MutableSeq share the same expectations.
        for seq_class in (Seq, MutableSeq):
            self.assertEqual(seq_class("6"), "6")
            self.assertNotEqual(seq_class("6"), 6)
            self.assertEqual(seq_class(""), "")
            self.assertNotEqual(seq_class(""), None)
            self.assertEqual(seq_class("None"), "None")
            self.assertNotEqual(seq_class("None"), None)

        one_six = UnknownSeq(1, character="6")
        self.assertEqual(one_six, "6")
        self.assertNotEqual(UnknownSeq(1, character="6"), 6)
        empty_unknown = UnknownSeq(0)
        self.assertEqual(empty_unknown, "")
        self.assertNotEqual(UnknownSeq(0), None)
Пример #17
0
    def test_join_UnknownSeq_with_file(self):
        """Check that UnknownSeq.join concatenates sequences read from a file."""
        filename = 'Fasta/f003'
        seqlist = [record.seq for record in SeqIO.parse(filename, 'fasta')]
        texts = [str(seq) for seq in seqlist]

        empty_gap = UnknownSeq(0, character="-", alphabet=generic_dna)
        gap5 = UnknownSeq(5, character="-", alphabet=generic_dna)
        # Seq objects joined with the empty spacer ...
        joined_empty = empty_gap.join(seqlist)
        # ... and with the 5-letter spacer.
        joined_gap5 = gap5.join(seqlist)

        # Reference results built with plain string joins.
        expected_empty = str(empty_gap).join(texts)
        expected_gap5 = str(gap5).join(texts)

        self.assertEqual(str(joined_empty), expected_empty)
        self.assertEqual(str(joined_gap5), expected_gap5)
        # An iterable of SeqRecord objects (rather than sequences) is rejected.
        with self.assertRaises(TypeError):
            empty_gap.join(SeqIO.parse(filename, 'fasta'))
Пример #18
0
 def __init__(self, biopython_object=None):
     """Wrap a Biopython sequence object in a private SeqRecord.

     :param biopython_object: None (an empty nucleotide record is created),
         a Seq (deep-copied into a new record with empty id, name and
         description) or a SeqRecord (deep-copied as is).
     :raises TypeError: if biopython_object is of any other type.
     """
     # first we define our underlying SeqRecord object
     if biopython_object is None:
         self._record = SeqRecord(seq=UnknownSeq(0, alphabet=NucleotideAlphabet()), id='', name='', description='')
     elif isinstance(biopython_object, Seq):
         self._record = SeqRecord(seq=copy.deepcopy(biopython_object), id='', name='', description='')
     elif isinstance(biopython_object, SeqRecord):
         self._record = copy.deepcopy(biopython_object)
     else:
         # Fail here with a clear message instead of later with an
         # AttributeError about the missing _record attribute.
         raise TypeError("Expected None, Seq or SeqRecord, got %s"
                         % type(biopython_object).__name__)

     # define dictionary of features for faster lookup:
     # feature type -> list of indices into self._record.features
     self._features = {}
     for index, feature in enumerate(self._record.features):
         self._features.setdefault(feature.type, []).append(index)
def _retrieve_seq(adaptor, primary_id):
    """Return the sequence stored for a bioentry, or None if it has no row.

    :param adaptor: object providing execute_and_fetchall(sql, params)
    :param primary_id: bioentry_id to look up in the biosequence table
    :return: None when no biosequence row exists; a DBSeq when sequence data
             is stored; otherwise an UnknownSeq of the recorded length whose
             character is "N" for dna/rna, "X" for protein and "?" for
             anything else.
    :raises AssertionError: if more than one row matches or the recorded
             length disagrees with the stored sequence length.
    """
    # The database schema ensures there will be only one matching
    # row in the table.

    # If an UnknownSeq was recorded, seq will be NULL,
    # but length will be populated.  This means length(seq)
    # will return None.
    seqs = adaptor.execute_and_fetchall(
        "SELECT alphabet, length, length(seq) FROM biosequence WHERE bioentry_id = %s",
        (primary_id, ),
    )
    if not seqs:
        return
    assert len(seqs) == 1
    moltype, given_length, length = seqs[0]

    try:
        length = int(length)
    except TypeError:
        # No sequence data: fall back to the recorded length column.
        assert length is None
        seqs = adaptor.execute_and_fetchall(
            "SELECT alphabet, length, seq FROM biosequence WHERE bioentry_id = %s",
            (primary_id, ),
        )
        assert len(seqs) == 1
        moltype, given_length, seq = seqs[0]
        assert seq is None or seq == ""
        length = int(given_length)
        have_seq = False
        del seq
    else:
        # Cross-check the recorded length column against the length of the
        # stored sequence.  A missing length column is tolerated.
        if given_length is not None:
            assert length == int(given_length), (
                "Recorded length %r does not match sequence length %r"
                % (given_length, length))
        have_seq = True
    del given_length

    if have_seq:
        return DBSeq(primary_id,
                     adaptor,
                     alphabet=None,
                     start=0,
                     length=int(length))
    else:
        # Pick the placeholder letter from the molecule type.
        if moltype in ("dna", "rna"):
            character = "N"
        elif moltype == "protein":
            character = "X"
        else:
            character = "?"
        return UnknownSeq(length, character=character)
Пример #20
0
def concatenate(infiles, outfile):
    """Concatenate FASTA alignments column-wise and write the result as FASTA.

    Every input file is read as one alignment.  For each alignment, labels
    (sequence ids) that occur in other alignments but not in this one are
    padded with unknown data (UnknownSeq) of the alignment's length.

    :param infiles: iterable of paths of FASTA alignment files
    :param outfile: destination passed to AlignIO.write
    """
    alignments = []
    for path in infiles:
        # Close each input as soon as its alignment has been read instead of
        # leaving the open handle to the garbage collector.
        with open(path, "r") as handle:
            alignments.append(AlignIO.read(handle, "fasta"))

    # Get the full set of labels (i.e. sequence ids) for all the alignments
    all_labels = {seq.id for aln in alignments for seq in aln}

    # Make a dictionary to store info as we go along
    # (defaultdict is convenient -- asking for a missing key gives back an empty list)
    tmp = defaultdict(list)

    for aln in alignments:
        length = aln.get_alignment_length()

        # check if any labels are missing in the current alignment
        these_labels = {rec.id for rec in aln}
        missing = all_labels - these_labels

        # if any are missing, create unknown data of the right length,
        # stuff the string representation into the tmp dict
        for label in missing:
            new_seq = UnknownSeq(length)
            tmp[label].append(str(new_seq))

        # else stuff the string representation into the tmp dict
        for rec in aln:
            tmp[rec.id].append(str(rec.seq))

    # Stitch all the substrings together using join (most efficient way),
    # and build the Biopython data structures Seq, SeqRecord and MultipleSeqAlignment
    msa = MultipleSeqAlignment(
        SeqRecord(Seq(''.join(v)), id=k) for (k, v) in tmp.items())

    AlignIO.write(msa, outfile, "fasta")


#tmpdir = tempfile.TemporaryDirectory()
#print(tmpdir.name)
#timeit.timeit('concatenate(infiles,outfile)',
#    setup='infiles=simAlignments(10,10,tmpdir.name),outfile=tempfile.NamedTemporaryFile(dir=tmpdir).name')

# python -m timeit -s 'import tempfile; tmpdir=tempfile.TemporaryDirectory(); from concatenate import simAlignments; infiles=simAlignments(10,10,tmpdir.name); outf=tempfile.NamedTemporaryFile().name' "from concatenate import concatenate; concatenate(infiles,outf)"
#100 loops, best of 3: 2.94 msec per loop
Пример #21
0
 def _get_rec(self, base, info_dict):
     """Retrieve a record to add features to."""
     max_loc = info_dict.get("location", (0, 1))[1]
     try:
         cur_rec = base[info_dict["rec_id"]]
         # update generated unknown sequences with the expected maximum length
         if isinstance(cur_rec.seq, UnknownSeq):
             cur_rec.seq._length = max([max_loc, cur_rec.seq._length])
         return cur_rec, base
     except KeyError:
         if self._create_missing:
             new_rec = SeqRecord(UnknownSeq(max_loc), info_dict["rec_id"])
             base[info_dict["rec_id"]] = new_rec
             return new_rec, base
         else:
             raise
Пример #22
0
def concatenate(alignments):
    """Concatenate alignments column-wise into one MultipleSeqAlignment.

    Labels (sequence ids) missing from an alignment are padded with unknown
    data (UnknownSeq) of that alignment's length.  The alphabet of the first
    alignment is used for the padding and for the resulting sequences.
    Progress is reported through the module logger.
    """
    # Full set of labels (i.e. sequence ids) over all the alignments.
    all_labels = {seq.id for aln in alignments for seq in aln}
    logger.debug("extracted {} different labels in all alignments: {}".format(
        len(all_labels), all_labels))

    # label -> list of sequence strings, one per alignment.
    concat_buf = defaultdict(list)

    # All alignments are assumed to share the alphabet of the first one.
    alphabet = alignments[0]._alphabet
    logger.debug('detected alphabet: {}'.format(alphabet))

    for aln in alignments:
        length = aln.get_alignment_length()

        # Labels known overall but absent from this alignment.
        present = {rec.id for rec in aln}
        absent = all_labels - present
        logger.debug(
            "alignment of length {} with {} sequences, {} missing ({})".format(
                length, len(present), len(absent), absent))

        # Absent labels get unknown data of the right length ...
        for label in absent:
            filler = UnknownSeq(length, alphabet=alphabet)
            concat_buf[label].append(str(filler))

        # ... present ones contribute their own sequence.
        for rec in aln:
            concat_buf[rec.id].append(str(rec.seq))

    # Join the pieces per label and wrap them in Seq, SeqRecord and
    # MultipleSeqAlignment objects.
    joined_records = (
        SeqRecord(Seq(''.join(seq_arr), alphabet=alphabet), id=label)
        for (label, seq_arr) in concat_buf.items())
    msa = MultipleSeqAlignment(joined_records)
    logger.info(
        "concatenated MSA of {} taxa and total length {} created".format(
            len(msa), len(msa[0])))
    return msa
Пример #23
0
def concatenate(alignments):
    """
    Concatenate a list of Bio.Align.MultipleSeqAlignment objects.
    If any sequences are missing they are padded with unknown data
    (Bio.Seq.UnknownSeq).
    Returns a single Bio.Align.MultipleSeqAlignment.
    Limitations: any annotations in the sub-alignments are lost in
    the concatenated alignment.
    """
    # Full set of labels (i.e. sequence ids) over all the alignments.
    all_labels = {seq.id for aln in alignments for seq in aln}

    # label -> list of sequence strings, one per alignment.
    pieces = defaultdict(list)

    # All alignments are assumed to share the alphabet of the first one.
    alphabet = alignments[0]._alphabet

    for aln in alignments:
        length = aln.get_alignment_length()

        # Labels known overall but absent from this alignment get unknown
        # data of the right length.
        present = {rec.id for rec in aln}
        for label in all_labels - present:
            filler = UnknownSeq(length, alphabet=alphabet)
            pieces[label].append(str(filler))

        # Present labels contribute their own sequence.
        for rec in aln:
            pieces[rec.id].append(str(rec.seq))

    # Join the pieces per label and wrap them in Seq, SeqRecord and
    # MultipleSeqAlignment objects.
    joined_records = (
        SeqRecord(Seq(''.join(parts), alphabet=alphabet),
                  id=label, name=label, description=label)
        for (label, parts) in pieces.items())
    return MultipleSeqAlignment(joined_records)
Пример #24
0
def cfg_out_iterator(handle, alphabet=single_letter_alphabet):
    """Iterate over Centrifuge output, yielding one SeqRecord per parsed row.

    Each record carries a zero-length UnknownSeq; the parsed fields are
    stored in its id, name, description, dbxrefs and annotations.

    Arguments:
     - handle - input file
     - alphabet - optional alphabet
    """

    def adapt(raw_score, tax_id):
        """Turn a Centrifuge score into its "single hit equivalent length"."""
        try:
            return float(raw_score)**0.5 + 15
        except ValueError:
            print(f'Error parsing score ({raw_score}) for taxid {tax_id}'
                  f' in {handle}...')
            raise

    for row in simple_out_parser(handle):
        (read_id, seq_id, tax_id, score, second_score, hit_length,
         query_length, num_matches) = row
        words = read_id.split(None, 1)
        if words:
            first_word = words[0]
        else:
            assert not read_id, repr(read_id)
            # Should we use SeqRecord default for no ID?
            first_word = ""
        adapted_score = adapt(score, tax_id)
        adapted_2nd_score = adapt(second_score, tax_id)
        annotations = {
            'taxID': tax_id,
            'score': adapted_score,
            '2ndBestScore': adapted_2nd_score,
            'hitLength': hit_length,
            'queryLength': query_length,
            'numMatches': int(num_matches),
        }
        yield SeqRecord(UnknownSeq(0, alphabet),
                        id=first_word,
                        name=first_word,
                        description=read_id,
                        dbxrefs=[seq_id],
                        annotations=annotations)
Пример #25
0
def join_seqs(s1, s2, length=None):
    """Join read s1 with the reverse complement of read s2.

    When length is given (and non-zero), a pad record of "-" with quality 0
    is placed between the two so that the pieces add up to length; the
    program exits if that pad cannot be built (ValueError).  The joined
    record gets the id of s1 without its last two characters and an empty
    description.
    """
    if not length:
        joined = s1 + s2.reverse_complement()
    else:
        gap = length - len(s1) - len(s2)
        try:
            filler = SeqRecord(
                UnknownSeq(gap, character='-'),
                letter_annotations={'phred_quality': [0] * gap},
            )
        except ValueError:
            sys.exit(
                'Total length of the two reads exceeds given length (%s)' %
                (length))
        joined = s1 + filler + s2.reverse_complement()

    # assumes the read ID ends in a 2-char suffix for direction (e.g. _1)
    joined.id = s1.id[:-2]
    joined.description = ''  # not required for fastq
    return joined
Пример #26
0
def prepare_cluster_qual_files(work_dir, qual_file, cluster_seq_dir):
    """Write one QUAL file per cluster FASTA file and return their directory.

    Every regular file in ``cluster_seq_dir`` is parsed as FASTA. For each of
    its sequences the quality record with the same name is looked up in
    ``qual_file``, and the scores are written to
    ``<work_dir>/cluster_qual/<stem>.qual``, where ``<stem>`` is the cluster
    file name up to its first dot.

    Args:
        work_dir: Directory in which the ``cluster_qual`` folder is created.
        qual_file: Path of the QUAL file holding scores for all sequences.
        cluster_seq_dir: Directory containing the per-cluster FASTA files.

    Returns:
        Path of the newly created ``cluster_qual`` directory.

    Raises:
        OSError: If the ``cluster_qual`` directory cannot be created.
        KeyError: If a cluster sequence has no entry in ``qual_file``.
    """
    cluster_qual_dir = work_dir + "/cluster_qual"
    os.mkdir(cluster_qual_dir)

    # Load every quality record into memory, keyed by record id.
    # Plain "r" is used because the old "rU" mode is rejected by Python 3.11+.
    with open(qual_file, "r") as fd_qual:
        quals = SeqIO.to_dict(SeqIO.parse(fd_qual, "qual"))

    for cluster_seq_file in os.listdir(cluster_seq_dir):
        cluster_seq_path = cluster_seq_dir + "/" + cluster_seq_file
        # Only regular files are processed; no check is made that the file
        # really is FASTA.
        if not os.path.isfile(cluster_seq_path):
            continue

        cluster_quals = []
        with open(cluster_seq_path, "r") as fd_cluster_seq:
            for seq in SeqIO.parse(fd_cluster_seq, "fasta"):
                qual = quals[seq.name]
                scores = qual.letter_annotations["phred_quality"]
                # The sequence itself is irrelevant for QUAL output, so a
                # placeholder of matching length carries the scores.
                cluster_qual = SeqRecord(seq=UnknownSeq(len(scores)),
                                         id="",
                                         description=qual.description)
                cluster_qual.letter_annotations["phred_quality"] = scores
                cluster_quals.append(cluster_qual)

        cluster_qual_file = (cluster_qual_dir + "/" +
                             cluster_seq_file.split(".")[0] + ".qual")
        with open(cluster_qual_file, "w") as fd_cluster_qual:
            SeqIO.write(cluster_quals, fd_cluster_qual, "qual")

        # The records have an empty id, which leaves a space after the ">"
        # in each header. Strip it in Python rather than shelling out to
        # ``sed -i "s/> />/g"``: same substitution, but no dependence on an
        # external tool and no shell string built from an unquoted path.
        with open(cluster_qual_file, "r") as fd_cluster_qual:
            qual_text = fd_cluster_qual.read()
        with open(cluster_qual_file, "w") as fd_cluster_qual:
            fd_cluster_qual.write(qual_text.replace("> ", ">"))

    return cluster_qual_dir
Пример #27
0
 def test_MutableSeq_init_typeerror(self):
     """MutableSeq() must reject Seq and UnknownSeq arguments with TypeError."""
     seq_arg = Seq("A")
     with self.assertRaises(TypeError):
         MutableSeq(seq_arg)
     unknown_arg = UnknownSeq(1)
     with self.assertRaises(TypeError):
         MutableSeq(unknown_arg)
Пример #28
0
class StringMethodTests(unittest.TestCase):
    """Tests for the string-like and biological methods of the sequence classes."""

    # Shared fixtures: the same letters under several alphabets, followed by
    # UnknownSeq objects of different lengths, alphabets and characters.
    # NOTE: the count_overlap tests list one expected value per entry, in this
    # order, so entries must not be reordered without updating those lists.
    _examples = [
        # These are length 9, a multiple of 3 for translation tests:
        Seq("ACGTGGGGT", generic_protein),
        Seq("ACGTGGGGT", generic_nucleotide),
        Seq("ACGTGGGGT", generic_dna),
        Seq("ACGUGGGGU", generic_rna),
        Seq("GG", generic_protein),
        Seq("GG", generic_nucleotide),
        Seq("GG", generic_dna),
        Seq("GG", generic_rna),
        Seq("A", generic_protein),
        Seq("A", generic_nucleotide),
        Seq("A", generic_dna),
        Seq("A", generic_rna),
        UnknownSeq(1),
        UnknownSeq(1, character="n"),
        UnknownSeq(1, generic_rna),
        UnknownSeq(1, generic_rna, "n"),
        UnknownSeq(1, generic_rna, "N"),
        UnknownSeq(12, generic_rna, "N"),
        UnknownSeq(12, generic_dna, "N"),
        UnknownSeq(12, generic_nucleotide, "N"),
        UnknownSeq(12, generic_protein, "X"),
        UnknownSeq(12, character="X"),
        UnknownSeq(12),
    ]
    # Append a mutable copy of every example that is a Seq instance; the loop
    # walks a snapshot (``[:]``) so the appended copies are not revisited.
    # NOTE(review): the count_overlap tests double their expected lists, which
    # suggests the UnknownSeq entries also pass this isinstance check - confirm.
    for seq in _examples[:]:
        if isinstance(seq, Seq):
            _examples.append(seq.tomutable())
    # Bounds tried by the tests, both as start/end method arguments and as
    # slice indices; they include out-of-range values and None.
    _start_end_values = [0, 1, 2, 1000, -1, -2, -999, None]

    def _test_method(self,
                     method_name,
                     pre_comp_function=None,
                     start_end=False):
        """Check a sequence method against the string method of the same name.

        Every ordered pair taken from ``self._examples`` is exercised: the
        method is called on the first example with the second example as its
        argument (as a string, then as the object itself) and the result is
        compared with calling the same method on ``str(first)``. Examples
        that lack the method are skipped.

        Args:
            method_name: Name of the method to look up on both objects.
            pre_comp_function: Optional callable applied to both results
                before they are compared.
            start_end: When true, repeat the string-argument call with every
                start, and every start/end pair, from
                ``self._start_end_values``.

        Raises:
            ValueError: If a sequence result differs from the string result.
        """
        self.assertIsInstance(method_name, str)

        def compare(receiver, call_args, seq_result, str_result):
            """Raise ValueError unless both results agree after conversion."""
            if pre_comp_function:
                seq_result = pre_comp_function(seq_result)
                str_result = pre_comp_function(str_result)
            if seq_result != str_result:
                # %r copes with None bounds and with non-integer results
                # (str, list), which %i rejects with a TypeError.
                raise ValueError(
                    "%r.%s(%s) = %r, not %r" %
                    (receiver, method_name,
                     ", ".join(repr(arg) for arg in call_args),
                     seq_result, str_result))

        for example1 in self._examples:
            if not hasattr(example1, method_name):
                # e.g. MutableSeq does not support find
                continue
            str1 = str(example1)
            seq_method = getattr(example1, method_name)
            str_method = getattr(str1, method_name)

            for example2 in self._examples:
                if not hasattr(example2, method_name):
                    # e.g. MutableSeq does not support find
                    continue
                str2 = str(example2)

                compare(example1, (str2,),
                        seq_method(str2), str_method(str2))

                # Only the call with a sequence argument may legitimately
                # raise TypeError; keeping the comparison outside the try
                # stops a formatting TypeError from hiding a real mismatch.
                try:
                    seq_result = seq_method(example2)
                except TypeError:
                    # TODO - Check the alphabets do clash!
                    pass
                else:
                    compare(example1, (example2,),
                            seq_result, str_method(str2))

                if start_end:
                    for start in self._start_end_values:
                        compare(example1, (str2, start),
                                seq_method(str2, start),
                                str_method(str2, start))

                        for end in self._start_end_values:
                            compare(example1, (str2, start, end),
                                    seq_method(str2, start, end),
                                    str_method(str2, start, end))

    def test_str_count(self):
        """Compare count() with str.count(), including start/end arguments."""
        self._test_method(method_name="count", start_end=True)

    def test_str_count_overlap_GG(self):
        """Run count_overlap with GG queries over every shared example."""
        # One expected count per entry of self._examples, in order:
        # twelve Seq() entries followed by eleven UnknownSeq() entries.
        seq_counts = [3] * 4 + [1] * 4 + [0] * 4
        unknown_counts = [0] * 11
        expected = seq_counts + unknown_counts
        # The MutableSeq() copies repeat the same pattern.
        expected = expected + expected

        assert len(self._examples) == len(expected)

        for seq, exp in zip(self._examples, expected):
            # First with plain string queries, then with the same queries
            # wrapped in a Seq with generic alphabet.
            for as_query in (str, Seq):
                self.assertEqual(seq.count_overlap(as_query("GG")), exp)
                self.assertEqual(seq.count_overlap(as_query("G" * 5)), 0)

    def test_count_overlap_start_end_GG(self):
        """Run count_overlap for GG queries with assorted start/end bounds."""
        # (start, end, expected) for Seq() and MutableSeq() of testing_seq
        start_end_exp = [
            (1, 7, 3),
            (3, None, 3),
            (3, 6, 2),
            (4, 6, 1),
            (4, -1, 2),
            (-5, None, 2),
            (-5, 7, 2),
            (7, -5, 0),
            (-100, None, 3),
            (None, 100, 3),
            (-100, 1000, 3),
        ]
        testing_seq = "GTAGGGGAG"

        for start, end, exp in start_end_exp:
            for seq_type in (Seq, MutableSeq):
                self.assertEqual(
                    seq_type(testing_seq).count_overlap("GG", start, end),
                    exp)

        # A more heterogeneous sequence: (bounds passed after "GG", expected)
        mixed_seq = "GGGTGGTAGGG"
        for bounds, exp in [((), 5), ((2, 8), 1), ((-11, 6), 3), ((7, 2), 0)]:
            for seq_type in (Seq, MutableSeq):
                self.assertEqual(
                    seq_type(mixed_seq).count_overlap("GG", *bounds), exp)
        self.assertEqual(Seq(mixed_seq).count_overlap("GG", -2, -10), 0)

        # UnknownSeq(): (alphabet, character, start, end, expected)
        alphabet_char_start_end_exp = [
            (generic_rna, "N", 1, 7, 0),
            (generic_dna, "N", 1, 7, 0),
            (generic_rna, "N", -4, None, 0),
            (generic_dna, "N", -4, None, 0),
            (generic_protein, "X", 1, 7, 0),
        ]
        for alpha, char, start, end, exp in alphabet_char_start_end_exp:
            unknown = UnknownSeq(12, alpha, char)
            self.assertEqual(unknown.count_overlap("GG", start, end), exp)
        self.assertEqual(
            UnknownSeq(12, character="X").count_overlap("GG", 1, 7), 0)

        # UnknownSeq() again, including unusual edge cases:
        # (query, start, end, expected)
        substr_start_end_exp = [
            ("G", 100, 105, 0),
            ("G", -1, 4, 0),
            ("G", 4, -1, 0),
            ("G", -8, -2, 0),
            ("G", -2, -8, 0),
            ("G", 8, 2, 0),
            ("G", 2, 8, 0),
            ("GG", 8, 2, 0),
            ("GG", 2, 8, 0),
            ("GG", -5, -1, 0),
            ("GG", 1, 5, 0),
            ("GGG", None, None, 0),
            ("GGGGGGGGG", None, None, 0),
            ("GGG", 1, 2, 0),
        ]
        for substr, start, end, exp in substr_start_end_exp:
            unknown = UnknownSeq(7, character="N")
            self.assertEqual(unknown.count_overlap(substr, start, end), exp)
        self.assertEqual(
            UnknownSeq(7, character="N").count_overlap("GG", 1), 0)

    def test_str_count_overlap_NN(self):
        """Run count_overlap with NN queries over every shared example."""
        # One expected count per entry of self._examples, in order:
        # twelve Seq() entries followed by eleven UnknownSeq() entries.
        seq_counts = [0] * 12
        unknown_counts = [0] * 5 + [11] * 3 + [0] * 3
        expected = seq_counts + unknown_counts
        # The MutableSeq() copies repeat the same pattern.
        expected = expected + expected

        assert len(self._examples) == len(expected)

        for seq, exp in zip(self._examples, expected):
            # First with plain string queries, then with the same queries
            # wrapped in a Seq with generic alphabet.
            for as_query in (str, Seq):
                self.assertEqual(seq.count_overlap(as_query("NN")), exp)
                self.assertEqual(seq.count_overlap(as_query("N" * 13)), 0)

    def test_count_overlap_start_end_NN(self):
        """Run count_overlap for NN queries with assorted start/end bounds."""
        # (start, end, expected) for Seq() and MutableSeq() of testing_seq
        start_end_exp = [
            (1, 7, 0),
            (3, None, 0),
            (3, 6, 0),
            (4, 6, 0),
            (4, -1, 0),
            (-5, None, 0),
            (-5, 7, 0),
            (7, -5, 0),
            (-100, None, 0),
            (None, 100, 0),
            (-100, 1000, 0),
        ]
        testing_seq = "GTAGGGGAG"

        for start, end, exp in start_end_exp:
            for seq_type in (Seq, MutableSeq):
                self.assertEqual(
                    seq_type(testing_seq).count_overlap("NN", start, end),
                    exp)

        # A more heterogeneous sequence: (bounds passed after "NN", expected)
        mixed_seq = "GGGTGGTAGGG"
        for bounds, exp in [((), 0), ((2, 8), 0), ((-11, 6), 0), ((7, 2), 0)]:
            for seq_type in (Seq, MutableSeq):
                self.assertEqual(
                    seq_type(mixed_seq).count_overlap("NN", *bounds), exp)
        self.assertEqual(Seq(mixed_seq).count_overlap("NN", -10, -2), 0)

        # UnknownSeq(): (alphabet, character, start, end, expected)
        alphabet_char_start_end_exp = [
            (generic_rna, "N", 1, 7, 5),
            (generic_dna, "N", 1, 7, 5),
            (generic_rna, "N", -4, None, 3),
            (generic_dna, "N", -4, None, 3),
            (generic_protein, "X", 1, 7, 0),
        ]
        for alpha, char, start, end, exp in alphabet_char_start_end_exp:
            unknown = UnknownSeq(12, alpha, char)
            self.assertEqual(unknown.count_overlap("NN", start, end), exp)
        self.assertEqual(
            UnknownSeq(12, character="X").count_overlap("NN", 1, 7), 0)

        # UnknownSeq() again, including unusual edge cases:
        # (query, start, end, expected)
        substr_start_end_exp = [
            ("N", 100, 105, 0),
            ("N", -1, 4, 0),
            ("N", 4, -1, 2),
            ("N", -8, -2, 5),
            ("N", -2, -8, 0),
            ("N", 8, 2, 0),
            ("N", 2, 8, 5),
            ("NN", 8, 2, 0),
            ("NN", 2, 8, 4),
            ("NN", -5, -1, 3),
            ("NN", 1, 5, 3),
            ("NNN", None, None, 5),
            ("NNNNNNNNN", None, None, 0),
            ("NNN", 1, 2, 0),
        ]
        for substr, start, end, exp in substr_start_end_exp:
            unknown = UnknownSeq(7, character="N")
            self.assertEqual(unknown.count_overlap(substr, start, end), exp)
        self.assertEqual(
            UnknownSeq(7, character="N").count_overlap("NN", 1), 5)

    def test_str_find(self):
        """Compare find() with str.find(), including start/end arguments."""
        self._test_method(method_name="find", start_end=True)

    def test_str_rfind(self):
        """Compare rfind() with str.rfind(), including start/end arguments."""
        self._test_method(method_name="rfind", start_end=True)

    def test_str_startswith(self):
        """Compare startswith() on each example with str.startswith()."""
        self._test_method(method_name="startswith", start_end=True)
        self.assertTrue("ABCDE".startswith(("ABE", "OBE", "ABC")))

        # Repeat with a tuple of candidate prefixes instead of a single one
        for example1 in self._examples:
            if not hasattr(example1, "startswith"):
                # e.g. MutableSeq does not support this
                continue
            # Two-letter slices taken at every third position
            positions = range(0, len(example1) - 2, 3)
            subs = tuple(example1[pos:pos + 2] for pos in positions)
            subs_str = tuple(str(piece) for piece in subs)

            # Sequence-object prefixes
            self.assertEqual(
                str(example1).startswith(subs_str),
                example1.startswith(subs))
            # Plain-string prefixes
            self.assertEqual(
                str(example1).startswith(subs_str),
                example1.startswith(subs_str))
            # With a start, then with a start and an end
            self.assertEqual(
                str(example1).startswith(subs_str, 3),
                example1.startswith(subs, 3))
            self.assertEqual(
                str(example1).startswith(subs_str, 2, 6),
                example1.startswith(subs, 2, 6))

    def test_str_endswith(self):
        """Check matches the python string endswith method.

        Beyond the pairwise comparison done by ``_test_method``, a tuple of
        two-letter slices of each example is passed as the suffix argument,
        both as sequence objects and as plain strings.
        """
        self._test_method("endswith", start_end=True)
        self.assertTrue("ABCDE".endswith(("ABE", "OBE", "CDE")))

        # Now check with a tuple of sub sequences
        for example1 in self._examples:
            if not hasattr(example1, "endswith"):
                # e.g. MutableSeq does not support this
                continue
            subs = tuple(
                example1[start:start + 2]
                for start in range(0, len(example1) - 2, 3))
            subs_str = tuple(str(s) for s in subs)

            self.assertEqual(
                str(example1).endswith(subs_str), example1.endswith(subs))
            # Same check with plain strings. This used to call startswith()
            # on both sides, leaving endswith() with a tuple of strings
            # untested.
            self.assertEqual(
                str(example1).endswith(subs_str),
                example1.endswith(subs_str))  # strings!
            self.assertEqual(
                str(example1).endswith(subs_str, 3),
                example1.endswith(subs, 3))
            self.assertEqual(
                str(example1).endswith(subs_str, 2, 6),
                example1.endswith(subs, 2, 6))

    def test_str_strip(self):
        """Compare strip() with str.strip() after converting results to str."""
        self._test_method(method_name="strip", pre_comp_function=str)

    def test_str_rstrip(self):
        """Compare rstrip() with str.rstrip() after converting results to str."""
        self._test_method(method_name="rstrip", pre_comp_function=str)

    def test_str_split(self):
        """Check matches the python string split method."""
        # split() should return a list of Seq-like objects; apply str() to
        # each of them so the result can be compared with what str.split()
        # returns. (This test previously exercised "rstrip" by mistake.)
        self._test_method("split",
                          pre_comp_function=lambda x: [str(y) for y in x])

    def test_str_rsplit(self):
        """Check matches the python string rsplit method."""
        # rsplit() should return a list of Seq-like objects; apply str() to
        # each of them so the result can be compared with what str.rsplit()
        # returns. (This test previously exercised "rstrip" by mistake.)
        self._test_method("rsplit",
                          pre_comp_function=lambda x: [str(y) for y in x])

    def test_str_lsplit(self):
        """Check matches the python string lstrip method."""
        # str has no "lsplit" method, so this test is taken to be the
        # left-hand counterpart of the strip()/rstrip() tests; the method
        # name is kept as it was. It previously exercised "rstrip" again,
        # which left lstrip() without any coverage.
        self._test_method("lstrip", pre_comp_function=str)

    def test_str_length(self):
        """Compare len() of each example with len() of its string form."""
        for example1 in self._examples:
            self.assertEqual(len(example1), len(str(example1)))

    def test_str_upper(self):
        """Compare upper() with str.upper() for every non-MutableSeq example."""
        for example1 in self._examples:
            if not isinstance(example1, MutableSeq):
                as_text = str(example1)
                self.assertEqual(str(example1.upper()), as_text.upper())

    def test_str_lower(self):
        """Compare lower() with str.lower() for every non-MutableSeq example."""
        for example1 in self._examples:
            if not isinstance(example1, MutableSeq):
                as_text = str(example1)
                self.assertEqual(str(example1.lower()), as_text.lower())

    def test_str_hash(self):
        """Check hash() of each example matches hash() of its string form.

        MutableSeq examples are skipped.
        """
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            with warnings.catch_warnings():
                # Silence change in behaviour warning
                warnings.simplefilter('ignore', BiopythonWarning)
                # The message names the string that was hashed; it used to
                # show id(example1), a memory address unrelated to the hash.
                self.assertEqual(
                    hash(str(example1)), hash(example1),
                    "Hash mismatch, %r for %r vs %r for %r" %
                    (hash(str(example1)), str(example1), hash(example1),
                     example1))

    def test_str_comparison(self):
        """Compare the six rich comparisons with those of the string forms."""
        # Failure-message template paired with the comparison it describes
        checks = (
            ("Checking %r == %r", lambda a, b: a == b),
            ("Checking %r != %r", lambda a, b: a != b),
            ("Checking %r < %r", lambda a, b: a < b),
            ("Checking %r <= %r", lambda a, b: a <= b),
            ("Checking %r > %r", lambda a, b: a > b),
            ("Checking %r >= %r", lambda a, b: a >= b),
        )
        for example1 in self._examples:
            for example2 in self._examples:
                with warnings.catch_warnings():
                    # Silence alphabet warning
                    warnings.simplefilter('ignore', BiopythonWarning)
                    for template, compare in checks:
                        self.assertEqual(
                            compare(str(example1), str(example2)),
                            compare(example1, example2),
                            template % (example1, example2))

    def test_str_getitem(self):
        """Check slicing and indexing works like a string.

        Every value of ``self._start_end_values`` is used as an index (when
        its magnitude is below the sequence length), as a slice start, as a
        slice end, and in start:end:step slices with steps from -3 to 3.
        A step of zero must raise ValueError.
        """
        for example1 in self._examples:
            str1 = str(example1)
            for i in self._start_end_values:
                if i is not None and abs(i) < len(example1):
                    self.assertEqual(str(example1[i]), str1[i])
                self.assertEqual(str(example1[:i]), str1[:i])
                self.assertEqual(str(example1[i:]), str1[i:])
                for j in self._start_end_values:
                    self.assertEqual(str(example1[i:j]), str1[i:j])
                    for step in range(-3, 4):
                        if step == 0:
                            # assertRaises reports a missing ValueError as a
                            # test failure; the earlier self._assert(False)
                            # call does not exist on TestCase and would have
                            # raised AttributeError instead.
                            with self.assertRaises(ValueError):
                                example1[i:j:step]
                        else:
                            self.assertEqual(str(example1[i:j:step]),
                                             str1[i:j:step])

    def test_tomutable(self):
        """Convert each non-MutableSeq example with tomutable() and compare."""
        for example1 in self._examples:
            if not isinstance(example1, MutableSeq):
                mutable_copy = example1.tomutable()
                self.assertTrue(isinstance(mutable_copy, MutableSeq))
                # Letters and alphabet must survive the conversion
                self.assertEqual(str(mutable_copy), str(example1))
                self.assertEqual(mutable_copy.alphabet, example1.alphabet)

    def test_toseq(self):
        """Convert each example with toseq() and compare with the original."""
        for example1 in self._examples:
            try:
                converted = example1.toseq()
            except AttributeError:
                # Only Seq instances are allowed to lack toseq()
                self.assertTrue(isinstance(example1, Seq))
            else:
                self.assertTrue(isinstance(converted, Seq))
                # Letters and alphabet must survive the conversion
                self.assertEqual(str(converted), str(example1))
                self.assertEqual(converted.alphabet, example1.alphabet)

    def test_the_complement(self):
        """Compare complement() with a str.translate() letter mapping."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            try:
                comp = example1.complement()
            except ValueError as e:
                # The only accepted failure is the protein one
                self.assertEqual(str(e), "Proteins do not have complements!")
                continue
            letters = str(example1)
            # Pick the mapping; only the unambiguous letters are covered
            if ("U" in letters or "u" in letters
                    or example1.alphabet == generic_rna):
                mapping = maketrans("ACGUacgu", "UGCAugca")
            elif ("T" in letters or "t" in letters
                  or example1.alphabet == generic_dna
                  or example1.alphabet == generic_nucleotide):
                mapping = maketrans("ACGTacgt", "TGCAtgca")
            elif not ("A" in letters or "a" in letters):
                mapping = maketrans("CGcg", "GCgc")
            else:
                # TODO - look at alphabet?
                raise ValueError(example1)
            self.assertEqual(letters.translate(mapping), str(comp))
            self.assertEqual(comp.alphabet, example1.alphabet)

    def test_the_reverse_complement(self):
        """Compare reverse_complement() with a reversed str.translate() result."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            try:
                comp = example1.reverse_complement()
            except ValueError as e:
                # The only accepted failure is the protein one
                self.assertEqual(str(e), "Proteins do not have complements!")
                continue
            letters = str(example1)
            # Pick the mapping; only the unambiguous letters are covered
            if ("U" in letters or "u" in letters
                    or example1.alphabet == generic_rna):
                mapping = maketrans("ACGUacgu", "UGCAugca")
            elif ("T" in letters or "t" in letters
                  or example1.alphabet == generic_dna
                  or example1.alphabet == generic_nucleotide):
                mapping = maketrans("ACGTacgt", "TGCAtgca")
            elif not ("A" in letters or "a" in letters):
                mapping = maketrans("CGcg", "GCgc")
            else:
                # TODO - look at alphabet?
                continue
            complemented = letters.translate(mapping)
            self.assertEqual(complemented[::-1], str(comp))
            self.assertEqual(comp.alphabet, example1.alphabet)

    def test_the_transcription(self):
        """Compare transcribe() with a plain T-to-U string replacement."""
        # Error messages for which an example is skipped
        skipped_errors = ("Proteins cannot be transcribed!",
                          "RNA cannot be transcribed!")
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            try:
                tran = example1.transcribe()
            except ValueError as e:
                if str(e) in skipped_errors:
                    continue
                raise e
            letters = str(example1)
            if len(letters) % 3 != 0:
                # TODO - Check for or silence the expected warning?
                continue
            expected = letters.replace("T", "U").replace("t", "u")
            self.assertEqual(expected, str(tran))
            # based on limited examples
            self.assertEqual(tran.alphabet, generic_rna)

    def test_the_back_transcription(self):
        """Compare back_transcribe() with a plain U-to-T string replacement."""
        # Error messages for which an example is skipped
        skipped_errors = ("Proteins cannot be back transcribed!",
                          "DNA cannot be back transcribed!")
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            try:
                tran = example1.back_transcribe()
            except ValueError as e:
                if str(e) in skipped_errors:
                    continue
                raise e
            expected = str(example1).replace("U", "T").replace("u", "t")
            self.assertEqual(expected, str(tran))
            # based on limited examples
            self.assertEqual(tran.alphabet, generic_dna)

    def test_the_translate(self):
        """Translate each suitable example and check the result's alphabet."""
        for example1 in self._examples:
            # MutableSeq examples are skipped, as are lengths that are not a
            # multiple of three.
            # TODO - Check for or silence the expected warning?
            if isinstance(example1, MutableSeq) or len(example1) % 3 != 0:
                continue
            try:
                tran = example1.translate()
            except ValueError as e:
                if str(e) == "Proteins cannot be translated!":
                    continue
                raise e
            # This is based on the limited example not having stop codons:
            accepted = [extended_protein, protein, generic_protein]
            if tran.alphabet in accepted:
                continue
            print(tran.alphabet)
            self.fail()
            # TODO - check the actual translation, and all the optional args

    def test_the_translation_of_stops(self):
        """Translate sequences containing stop codons under several tables."""
        misc_stops = "TAATAGTGAAGAAGG"
        # (expected, positional args, keyword args) for obj.translate()
        method_cases = [
            ("***RR", (), {}),
            ("***RR", (1,), {}),
            ("***RR", ("SGC0",), {}),
            ("**W**", (), {"table": 2}),
            ("**WRR", (), {"table": 'Yeast Mitochondrial'}),
            ("**WSS", (), {"table": 5}),
            ("**WSS", (), {"table": 9}),
            ("**CRR", (), {"table": 'Euplotid Nuclear'}),
            ("***RR", (), {"table": 11}),
            ("***RR", (), {"table": '11'}),
            ("***RR", (), {"table": 'Bacterial'}),
            ("**GRR", (), {"table": 25}),
            ("", (), {"to_stop": True}),
            ("O*ORR", (), {"table": special_table}),
            ("*QWRR", (), {"table": Chilodonella_uncinata_table}),
        ]
        # (expected, keyword args) for the Bio.Seq.translate() function
        # - move these?
        function_cases = [
            ("*QWRR", {"table": Chilodonella_uncinata_table}),
            ("O*ORR", {"table": special_table}),
            ("", {"to_stop": True}),
            ("***RR", {"table": 'Bacterial'}),
            ("***RR", {"table": '11'}),
            ("***RR", {"table": 11}),
            ("**W**", {"table": 2}),
        ]
        sequences = [
            Seq(misc_stops),
            Seq(misc_stops, generic_nucleotide),
            Seq(misc_stops, generic_dna),
            Seq(misc_stops, unambiguous_dna),
        ]
        for nuc in sequences:
            for expected, args, kwargs in method_cases:
                self.assertEqual(expected,
                                 str(nuc.translate(*args, **kwargs)))
            for expected, kwargs in function_cases:
                self.assertEqual(expected, translate(str(nuc), **kwargs))

        # Single codons in upper, mixed and lower case
        codon_cases = [
            ("TAT", "Y"),
            ("TAR", "*"),
            ("TAN", "X"),
            ("NNN", "X"),
            ("TAt", "Y"),
            ("TaR", "*"),
            ("TaN", "X"),
            ("nnN", "X"),
            ("tat", "Y"),
            ("tar", "*"),
            ("tan", "X"),
            ("nnn", "X"),
        ]
        for codon, amino_acid in codon_cases:
            self.assertEqual(str(Seq(codon).translate()), amino_acid)

    def test_the_translation_of_invalid_codons(self):
        """Translating a codon with invalid letters must raise TranslationError."""
        for codon in ["TA?", "N-N", "AC_", "Ac_"]:
            candidates = [
                Seq(codon),
                Seq(codon, generic_nucleotide),
                Seq(codon, generic_dna),
                Seq(codon, unambiguous_dna),
            ]
            for nuc in candidates:
                try:
                    translated = nuc.translate()
                except TranslationError:
                    continue
                # Reaching this point means no error was raised
                print(translated)
                self.fail("Translating %s should fail" % codon)

    def test_the_translation_of_ambig_codons(self):
        """Check obj.translate() method with ambiguous codons.

        For every triplet built from the ambiguous DNA or RNA letters, the
        translation of the ambiguous codon is compared with the set of
        translations of all the codons it expands to.
        """
        # Ambiguous protein letters and the residues each one has to cover.
        # TODO - Use the Bio.Data.IUPACData module for the
        # ambiguous protein mappings?
        residue_sets = {"Z": "EQ", "B": "DN", "J": "LI"}
        alphabets = [
            (ambiguous_dna.letters, ambiguous_dna_values),
            (ambiguous_rna.letters, ambiguous_rna_values)
        ]
        for letters, ambig_values in alphabets:
            ambig = set(letters)
            for first in ambig:
                for second in ambig:
                    for third in ambig:
                        codon = first + second + third
                        # Translate every concrete codon the triplet allows.
                        values = set()
                        for a in ambig_values[first]:
                            for b in ambig_values[second]:
                                for c in ambig_values[third]:
                                    values.add(str(Seq(a + b + c).translate()))
                        t = str(Seq(codon).translate())
                        if t == "X":
                            # "X" is only acceptable when the expansions
                            # disagree with each other.
                            self.assertTrue(
                                len(values) > 1,
                                "translate('%s') = '%s' not '%s'" %
                                (codon, t, ",".join(values)))
                        else:
                            # Any other letter (stop included) must match the
                            # expansions exactly.
                            self.assertEqual(values, set(residue_sets.get(t, t)))

    def test_init_typeerror(self):
        """Check Seq __init__ gives TypeError exceptions."""
        # Strings and unicode are the only accepted data - an integer is
        # refused, and so is an already constructed Seq object.
        with self.assertRaises(TypeError):
            Seq(1066)
        existing = Seq("ACGT", generic_dna)
        with self.assertRaises(TypeError):
            Seq(existing)

    def test_MutableSeq_init_typeerror(self):
        """Check MutableSeq __init__ gives TypeError exceptions."""
        # Neither a Seq nor an UnknownSeq may be used as the initial data.
        immutable = Seq("A")
        with self.assertRaises(TypeError):
            MutableSeq(immutable)
        unknown = UnknownSeq(1)
        with self.assertRaises(TypeError):
            MutableSeq(unknown)

    def test_join_Seq_ValueError(self):
        """Check Seq.join() raises ValueError for unsupported arguments."""
        # Rejected inputs: an integer, a string or sequence object passed
        # directly rather than inside a collection, and a list that holds
        # an item of a non-accepted type.
        spacer = Seq('NNNNN')
        single_seq = Seq("ATG")
        single_mutable = MutableSeq("ATG")

        with self.assertRaises(ValueError):
            spacer.join(5)
        with self.assertRaises(ValueError):
            spacer.join("ATG")
        with self.assertRaises(ValueError):
            spacer.join(single_seq)
        with self.assertRaises(ValueError):
            spacer.join(single_mutable)
        with self.assertRaises(ValueError):
            spacer.join(["ATG", "ATG", 5, "ATG"])

    def test_join_UnknownSeq_ValueError(self):
        """Check UnknownSeq.join() raises ValueError for unsupported arguments."""
        # Rejected inputs: an integer, a string or sequence object passed
        # directly rather than inside a collection, and a list that holds
        # an item of a non-accepted type.
        spacer = UnknownSeq(5, character="-")
        single_seq = Seq("ATG")
        single_mutable = MutableSeq("ATG")

        with self.assertRaises(ValueError):
            spacer.join(5)
        with self.assertRaises(ValueError):
            spacer.join("ATG")
        with self.assertRaises(ValueError):
            spacer.join(single_seq)
        with self.assertRaises(ValueError):
            spacer.join(single_mutable)
        with self.assertRaises(ValueError):
            spacer.join(["ATG", "ATG", 5, "ATG"])

    def test_join_MutableSeq_ValueError(self):
        """Check MutableSeq.join() raises ValueError for unsupported arguments."""
        # Rejected inputs: an integer, a string or sequence object passed
        # directly rather than inside a collection, and a list that holds
        # an item of a non-accepted type.
        spacer = MutableSeq("MMMMM")
        single_seq = Seq("ATG")
        single_mutable = MutableSeq("ATG")

        with self.assertRaises(ValueError):
            spacer.join(5)
        with self.assertRaises(ValueError):
            spacer.join("ATG")
        with self.assertRaises(ValueError):
            spacer.join(single_seq)
        with self.assertRaises(ValueError):
            spacer.join(single_mutable)
        with self.assertRaises(ValueError):
            spacer.join(["ATG", "ATG", 5, "ATG"])

    def test_join_Seq_TypeError(self):
        """Check Seq.join() raises TypeError for incompatible alphabets."""
        # A DNA spacer is asked to join RNA sequences, then protein ones.
        spacer = Seq('NNNNN', generic_dna)

        rna_pair = [
            Seq('NNNNN', generic_rna),
            Seq('NNNNN', generic_rna),
        ]
        with self.assertRaises(TypeError):
            spacer.join(rna_pair)

        protein_pair = [
            Seq('NNNNN', generic_protein),
            Seq('NNNNN', generic_protein),
        ]
        with self.assertRaises(TypeError):
            spacer.join(protein_pair)

    def test_join_UnknownSeq_TypeError(self):
        """Check UnknownSeq.join() raises TypeError for incompatible alphabets."""
        # A DNA spacer is asked to join RNA sequences, then protein ones.
        spacer = UnknownSeq(5, character="-", alphabet=generic_dna)

        rna_pair = [
            UnknownSeq(5, character="-", alphabet=generic_rna),
            UnknownSeq(5, character="-", alphabet=generic_rna),
        ]
        with self.assertRaises(TypeError):
            spacer.join(rna_pair)

        protein_pair = [
            Seq('NNNNN', generic_protein),
            UnknownSeq(5, character="-", alphabet=generic_protein),
        ]
        with self.assertRaises(TypeError):
            spacer.join(protein_pair)

    def test_join_MutableSeq_TypeError(self):
        """Check MutableSeq.join() raises TypeError for incompatible alphabets."""
        # A DNA spacer is asked to join RNA sequences, then protein ones.
        spacer = MutableSeq('NNNNN', generic_dna)

        rna_pair = [
            MutableSeq('NNNNN', generic_rna),
            MutableSeq('NNNNN', generic_rna),
        ]
        with self.assertRaises(TypeError):
            spacer.join(rna_pair)

        protein_pair = [
            Seq('NNNNN', generic_protein),
            MutableSeq('NNNNN', generic_protein),
        ]
        with self.assertRaises(TypeError):
            spacer.join(protein_pair)

    def test_join_Seq(self):
        """Check Seq.join() concatenates the items around the spacer.

        The joined text must equal what str.join() gives for the same
        spacer text, and the result must carry the spacer's alphabet.
        """
        # Only Seq objects and/or strings inside an iterable are expected.
        empty_spacer = Seq('', generic_dna)
        all_spacers = [
            empty_spacer,
            Seq('NNNNN', generic_dna),
            Seq('GGG', generic_nucleotide),
        ]
        plain_parts = ["ATG", "ATG", "ATG", "ATG"]
        mixed_parts = ["ATG", "ATG", Seq("ATG", generic_dna), "ATG"]

        # Plain strings joined with the empty spacer.
        joined_plain = empty_spacer.join(plain_parts)
        self.assertEqual(str(joined_plain), "".join(plain_parts))
        self.assertEqual(joined_plain.alphabet, empty_spacer.alphabet)

        # A mix of strings and Seq objects joined with each spacer in turn.
        for current in all_spacers:
            joined_mixed = current.join(mixed_parts)
            expected_text = str(current).join(plain_parts)
            self.assertEqual(str(joined_mixed), expected_text)
            self.assertEqual(joined_mixed.alphabet, current.alphabet)

    def test_join_Seq_with_file(self):
        """Check Seq.join() on the sequences parsed from a FASTA file."""
        fasta_path = 'Fasta/f003'
        sequences = [rec.seq for rec in SeqIO.parse(fasta_path, 'fasta')]
        sequence_texts = [str(seq) for seq in sequences]

        filled_spacer = Seq('NNNNN')
        empty_spacer = Seq('')

        # Join the Seq objects with the filled spacer, then the empty one.
        joined_filled = filled_spacer.join(sequences)
        joined_empty = empty_spacer.join(sequences)

        # str.join() on the plain text gives the reference results.
        expected_filled = str(filled_spacer).join(sequence_texts)
        expected_empty = str(empty_spacer).join(sequence_texts)

        self.assertEqual(str(joined_filled), expected_filled)
        self.assertEqual(str(joined_empty), expected_empty)

        # Handing over the parsed records themselves (rather than their
        # .seq attributes) has to be refused.
        with self.assertRaises(TypeError):
            filled_spacer.join(SeqIO.parse(fasta_path, 'fasta'))

    def test_join_UnknownSeq(self):
        """Check UnknownSeq.join() concatenates the items around the spacer.

        The joined text must equal what str.join() gives for the same
        spacer text, and the result must carry the spacer's alphabet.
        """
        # Only Seq objects and/or strings inside an iterable are expected.
        empty_spacer = UnknownSeq(0, character="-", alphabet=generic_dna)
        all_spacers = [
            empty_spacer,
            UnknownSeq(5, character="-", alphabet=generic_dna),
            UnknownSeq(5, character="-", alphabet=generic_nucleotide),
        ]
        plain_parts = ["ATG", "ATG", "ATG", "ATG"]
        mixed_parts = ["ATG", "ATG", Seq("ATG", generic_dna), "ATG"]

        # Plain strings joined with the zero-length spacer.
        joined_plain = empty_spacer.join(plain_parts)
        self.assertEqual(str(joined_plain), "".join(plain_parts))
        self.assertEqual(joined_plain.alphabet, empty_spacer.alphabet)

        # A mix of strings and Seq objects joined with each spacer in turn.
        for current in all_spacers:
            joined_mixed = current.join(mixed_parts)
            expected_text = str(current).join(plain_parts)
            self.assertEqual(str(joined_mixed), expected_text)
            self.assertEqual(joined_mixed.alphabet, current.alphabet)

    def test_join_UnknownSeq_with_file(self):
        """Check UnknownSeq.join() on the sequences parsed from a FASTA file."""
        fasta_path = 'Fasta/f003'
        sequences = [rec.seq for rec in SeqIO.parse(fasta_path, 'fasta')]
        sequence_texts = [str(seq) for seq in sequences]

        empty_spacer = UnknownSeq(0, character="-", alphabet=generic_dna)
        dash_spacer = UnknownSeq(5, character="-", alphabet=generic_dna)

        # Join the Seq objects with the zero-length spacer, then with the
        # five-character one.
        joined_empty = empty_spacer.join(sequences)
        joined_dashes = dash_spacer.join(sequences)

        # str.join() on the plain text gives the reference results.
        expected_empty = str(empty_spacer).join(sequence_texts)
        expected_dashes = str(dash_spacer).join(sequence_texts)

        self.assertEqual(str(joined_empty), expected_empty)
        self.assertEqual(str(joined_dashes), expected_dashes)

        # Handing over the parsed records themselves (rather than their
        # .seq attributes) has to be refused.
        with self.assertRaises(TypeError):
            empty_spacer.join(SeqIO.parse(fasta_path, 'fasta'))

    def test_join_MutableSeq(self):
        """Check MutableSeq.join() concatenates the items around the spacer.

        The joined text must equal what str.join() gives for the same
        spacer text, and the result must carry the spacer's alphabet.
        """
        # Only Seq objects and/or strings inside an iterable are expected.
        empty_spacer = MutableSeq('', generic_dna)
        all_spacers = [
            empty_spacer,
            MutableSeq('NNNNN', generic_dna),
            MutableSeq('GGG', generic_nucleotide),
        ]
        plain_parts = ["ATG", "ATG", "ATG", "ATG"]
        mixed_parts = ["ATG", "ATG", Seq("ATG", generic_dna), "ATG"]

        # Plain strings joined with the empty spacer.
        joined_plain = empty_spacer.join(plain_parts)
        self.assertEqual(str(joined_plain), "".join(plain_parts))
        self.assertEqual(joined_plain.alphabet, empty_spacer.alphabet)

        # A mix of strings and Seq objects joined with each spacer in turn.
        for current in all_spacers:
            joined_mixed = current.join(mixed_parts)
            expected_text = str(current).join(plain_parts)
            self.assertEqual(str(joined_mixed), expected_text)
            self.assertEqual(joined_mixed.alphabet, current.alphabet)

    def test_join_MutableSeq_with_file(self):
        """Check MutableSeq.join() on the sequences parsed from a FASTA file."""
        fasta_path = 'Fasta/f003'
        sequences = [rec.seq for rec in SeqIO.parse(fasta_path, 'fasta')]
        sequence_texts = [str(seq) for seq in sequences]

        filled_spacer = MutableSeq('NNNNN')
        empty_spacer = MutableSeq('')

        # Join the Seq objects with the filled spacer, then the empty one.
        joined_filled = filled_spacer.join(sequences)
        joined_empty = empty_spacer.join(sequences)

        # str.join() on the plain text gives the reference results.
        expected_filled = str(filled_spacer).join(sequence_texts)
        expected_empty = str(empty_spacer).join(sequence_texts)

        self.assertEqual(str(joined_filled), expected_filled)
        self.assertEqual(str(joined_empty), expected_empty)

        # Handing over the parsed records themselves (rather than their
        # .seq attributes) has to be refused.
        with self.assertRaises(TypeError):
            filled_spacer.join(SeqIO.parse(fasta_path, 'fasta'))
Пример #29
0
# Working with the codon tables used for translation.
# NOTE(review): `bli`, `blu` and `IUPAC` are used below but are not defined
# in this snippet - presumably they come from earlier in the session; confirm.
from Bio.Data import CodonTable
std_table = CodonTable.unambiguous_dna_by_name["Standard"]
bact_table = CodonTable.unambiguous_dna_by_name["Bacterial"]
bact_table.start_codons
bact_table.stop_codons

# To compare sequences (mind the alphabet)
str(bli) == str(blu)

# Mutable sequences are also possible, see the tutorial

# To build unknown sequences, with N for nucleotides and X for proteins
from Bio.Seq import UnknownSeq
unk_dna = UnknownSeq(20, alphabet=IUPAC.ambiguous_dna)


# SeqRecord
from Bio.SeqRecord import SeqRecord
help(SeqRecord)  # to see the available fields
SeqRecord(bli)
from Bio import SeqIO
machin = SeqIO.read("hao.fasta", "fasta")  # for a file holding a single sequence
# print() is called as a function so the script also parses on Python 3.
print(machin)
print(machin.format("fasta"))
# The same kind of thing exists for GenBank-format files

for seq_record in SeqIO.parse("nosZ.fasta", "fasta"):
    print(seq_record.id)
    print(seq_record.seq)
Пример #30
0
def concatenate(alignments):
    """
    Concatenates a list of multiple sequence alignment objects.

    The alignments are concatenated based on their label, i.e. the
    sequences from the different alignments which have the same id/labels
    will become a single sequence. The rows of the result follow the order
    in which each label is first met while walking through the alignments.

    If any sequences are missing in one or several alignments, these parts
    are padded with unknown data (:py:class:`Bio.Seq.UnknownSeq`).

    :param alignments: the list of alignments objects, i.e.
        list(:py:class:`Bio.Align.MultipleSeqAlignment`). An entry that
        ``identify_input`` classifies as ``FILENAME`` is read from disk as
        a FASTA alignment instead.
    :returns: a single :py:class:`Bio.Align.MultipleSeqAlignment`
    :raises IndexError: if ``alignments`` is empty (the alphabet is taken
        from the first alignment).

    Example::

        >>> sequences = {'aln1': {'seq1': 'acgtca',
        ...                       'seq2': 'acgtt-',
        ...                       'seq3': 'ac-ta-'},
        ...              'aln2': {'seq2': 'ttg-cta',
        ...                       'seq3': 'tcgacta',
        ...                       'seq4': 'ttgacta'}}
        >>> alignments = [MultipleSeqAlignment([SeqRecord(Seq(sequence,
        ...                    alphabet=IUPAC.extended_dna), id=key)
        ...      for (key, sequence) in sequences[aln].items()])
        ...               for aln in ('aln1', 'aln2')]
        >>> con_alignment = concatenate(alignments)
        >>> con_alignment.sort()
        >>> print(con_alignment)
        ExtendedIUPACDNA() alignment with 4 rows and 13 columns
        acgtcaNNNNNNN seq1
        acgtt-ttg-cta seq2
        ac-ta-tcgacta seq3
        NNNNNNttgacta seq4

    :note:

       Limitations: any annotations in the sub-alignments are lost in
       the concatenated alignment.

    """

    # Entries may be filenames or Biopython alignments; anything that is
    # not recognised as a filename is assumed to be an alignment already.
    loaded = []
    for item in alignments:
        if identify_input(item).name == 'FILENAME':
            loaded.append(AlignIO.read(item, "fasta"))
        else:
            loaded.append(item)
    alignments = loaded

    # Collect the sequence ids of all the alignments in first-seen order.
    # A plain set would lose that order, and the rows of the result would
    # then come out in an arbitrary order.
    all_labels = []
    seen_labels = set()
    for aln in alignments:
        for rec in aln:
            if rec.id not in seen_labels:
                seen_labels.add(rec.id)
                all_labels.append(rec.id)

    # label -> list of sequence chunks, one chunk per alignment
    # (defaultdict is convenient -- a missing key gives back an empty list)
    tmp = defaultdict(list)

    # Assume all alignments have same alphabet
    alphabet = alignments[0]._alphabet

    for aln in alignments:
        length = aln.get_alignment_length()
        these_labels = set(rec.id for rec in aln)

        # Labels absent from this alignment get unknown data of the right
        # length so that every row keeps growing by the same amount.
        for label in all_labels:
            if label not in these_labels:
                new_seq = UnknownSeq(length, alphabet=alphabet)
                tmp[label].append(str(new_seq))

        # Labels that are present contribute their real sequence.
        for rec in aln:
            tmp[rec.id].append(str(rec.seq))

    # Stitch the chunks of each label together with join and build the
    # Biopython Seq, SeqRecord and MultipleSeqAlignment objects.
    return MultipleSeqAlignment(
        SeqRecord(Seq(''.join(tmp[label]), alphabet=alphabet), id=label)
        for label in all_labels)