def setUp(self): self.tt = TraceTracker() mock('helperlibs.bio.seqio.write', tracker=self.tt) self.options = Mock('options', tracker=self.tt, outputfoldername='test', input_type='nucl') rec1_features = [ generate_cluster_feature(23, 42, 'lantipeptide', 1), generate_cluster_feature(300, 500, 'nrps', 2) ] rec2_features = [ generate_cluster_feature(50, 70, 'lassopeptide', 3), generate_cluster_feature(500, 700, 't1pks', 4) ] record1 = SeqRecord(UnknownSeq(1000), 'record1', name='record1', features=rec1_features) record2 = SeqRecord(UnknownSeq(1000), 'record2', name='record2', features=rec2_features) self.records = [record1, record2] self.expected_template = """\
def test_join_UnknownSeq(self):
    """UnknownSeq.join places the spacer between every joined element."""
    dashes = UnknownSeq(5, character="-")
    empty = UnknownSeq(0, character="-")

    # Joining unknown runs, or a Seq plus an unknown run, with the dash spacer.
    two_unknown = [UnknownSeq(5, character="-"), UnknownSeq(5, character="-")]
    self.assertEqual("-" * 15, dashes.join(two_unknown))
    seq_then_unknown = [Seq("NNNNN"), UnknownSeq(5, character="-")]
    self.assertEqual("N" * 5 + "-" * 10, dashes.join(seq_then_unknown))

    plain = ["ATG", "ATG", "ATG", "ATG"]
    mixed = ["ATG", "ATG", Seq("ATG"), "ATG"]

    # Plain strings glued together by the zero-length spacer.
    self.assertEqual(empty.join(plain), "".join(plain))

    for spacer in (dashes, empty):
        as_text = str(spacer)
        self.assertEqual(spacer.join(mixed), as_text.join(plain))
        # A single sequence argument is joined letter by letter.
        for target in plain + mixed:
            self.assertEqual(as_text.join(str(target)), str(spacer.join(target)))
def test_join_UnknownSeq(self):
    """UnknownSeq.join concatenates strings/Seq objects and keeps the spacer alphabet."""
    # join() is only expected to accept an iterable of Seq objects and/or strings.
    no_gap = UnknownSeq(0, character="-", alphabet=generic_dna)
    dna_gap = UnknownSeq(5, character="-", alphabet=generic_dna)
    nuc_gap = UnknownSeq(5, character="-", alphabet=generic_nucleotide)

    plain = ["ATG", "ATG", "ATG", "ATG"]
    mixed = ["ATG", "ATG", Seq("ATG", generic_dna), "ATG"]

    # Plain strings with the empty spacer.
    joined_plain = no_gap.join(plain)
    self.assertEqual(str(joined_plain), "".join(plain))
    self.assertEqual(joined_plain.alphabet, no_gap.alphabet)

    for spacer in (no_gap, dna_gap, nuc_gap):
        joined_mixed = spacer.join(mixed)
        self.assertEqual(str(joined_mixed), str(spacer).join(plain))
        self.assertEqual(joined_mixed.alphabet, spacer.alphabet)
def test_count_overlap_start_end_NN(self):
    """count_overlap("NN") with assorted start/end bounds on Seq, MutableSeq, UnknownSeq."""
    # Seq and MutableSeq: every (start, end) window is expected to report zero.
    windows = [
        (1, 7), (3, None), (3, 6), (4, 6), (4, -1), (-5, None),
        (-5, 7), (7, -5), (-100, None), (None, 100), (-100, 1000),
    ]
    letters = "GTAGGGGAG"
    for lo, hi in windows:
        self.assertEqual(Seq(letters).count_overlap("NN", lo, hi), 0)
        self.assertEqual(MutableSeq(letters).count_overlap("NN", lo, hi), 0)

    # A more heterogeneous sequence, with and without explicit bounds.
    mixed = "GGGTGGTAGGG"
    for bounds in [(), (2, 8), (-11, 6), (7, 2)]:
        self.assertEqual(Seq(mixed).count_overlap("NN", *bounds), 0)
        self.assertEqual(MutableSeq(mixed).count_overlap("NN", *bounds), 0)
    # This final window is only checked on Seq.
    self.assertEqual(Seq(mixed).count_overlap("NN", -10, -2), 0)

    # UnknownSeq of length 12: (alphabet, character, start, end, expected).
    unknown_cases = [
        (generic_rna, "N", 1, 7, 5),
        (generic_dna, "N", 1, 7, 5),
        (generic_rna, "N", -4, None, 3),
        (generic_dna, "N", -4, None, 3),
        (generic_protein, "X", 1, 7, 0),
    ]
    for alpha, char, lo, hi, want in unknown_cases:
        got = UnknownSeq(12, alpha, char).count_overlap("NN", lo, hi)
        self.assertEqual(got, want)
    self.assertEqual(UnknownSeq(12, character="X").count_overlap("NN", 1, 7), 0)

    # UnknownSeq of length 7, including unusual bounds:
    # (substring, start, end, expected).
    edge_cases = [
        ("N", 100, 105, 0),
        ("N", -1, 4, 0),
        ("N", 4, -1, 2),
        ("N", -8, -2, 5),
        ("N", -2, -8, 0),
        ("N", 8, 2, 0),
        ("N", 2, 8, 5),
        ("NN", 8, 2, 0),
        ("NN", 2, 8, 4),
        ("NN", -5, -1, 3),
        ("NN", 1, 5, 3),
        ("NNN", None, None, 5),
        ("NNNNNNNNN", None, None, 0),
        ("NNN", 1, 2, 0),
    ]
    for needle, lo, hi, want in edge_cases:
        got = UnknownSeq(7, character="N").count_overlap(needle, lo, hi)
        self.assertEqual(got, want)
    self.assertEqual(UnknownSeq(7, character="N").count_overlap("NN", 1), 5)
def test_join_UnknownSeq_TypeError(self):
    """Joining sequences whose alphabets clash with the spacer raises TypeError."""
    spacer = UnknownSeq(5, character="-", alphabet=generic_dna)
    # Arguments are built before entering assertRaises so that only join()
    # itself is expected to raise.
    clashing_inputs = [
        [
            UnknownSeq(5, character="-", alphabet=generic_rna),
            UnknownSeq(5, character="-", alphabet=generic_rna),
        ],
        [
            Seq("NNNNN", generic_protein),
            UnknownSeq(5, character="-", alphabet=generic_protein),
        ],
    ]
    for items in clashing_inputs:
        with self.assertRaises(TypeError):
            spacer.join(items)
def NoStop(input_file_path, out_name):
    """Mask in-frame stop codons in every record of a FASTA file.

    Each record is scanned in steps of three from position 0 (the sequences
    are assumed to be in frame). Any codon found in the hard-coded stop list
    below is replaced by ``???``; change the list to suit the taxa/phyla.

    Two files are appended to in the current working directory, named after
    the input file's base name without extension:

    - ``<name>_<out_name>.fasta`` -- the masked sequences.
    - ``<name>_log.csv`` -- one row per record: name of the input, record
      name, number of masked codons, and position of the first one
      (-1 if none).

    :param input_file_path: path of the FASTA file to read.
    :param out_name: suffix appended to the input name for the FASTA output.
    :return: None
    """
    stop_codons = ("TGA", "TAG", "TAA", "UGA", "UAA", "UAG")
    stem, _ext = os.path.splitext(os.path.basename(input_file_path))
    fasta_path = "%s_%s.fasta" % (stem, out_name)
    log_path = "%s_log.csv" % stem

    # NB/caution: assumes sequences are in frame!
    for record in SeqIO.parse(input_file_path, "fasta", generic_alphabet):
        temp_seq = Seq("", generic_alphabet)
        for index in range(0, len(record.seq), 3):
            codon = record.seq[index:index + 3]
            # Compare as plain text so the check does not depend on how Seq
            # objects implement equality against strings.
            if str(codon) in stop_codons:
                codon = UnknownSeq(3, character='?')
            temp_seq += codon

        # Files are opened per record (append mode) so nothing is created
        # when the input holds no records; `with` closes them even if a
        # write fails. write() replaces the Python-2-only `print >>` form
        # and produces the same text.
        with open(fasta_path, "a+") as fasta_out, open(log_path, "a+") as log_out:
            fasta_out.write(">%s\n%s\n" % (record.name, temp_seq))
            # CSV row: input name, taxon name, number of stop codons, and
            # position of the first stop codon (-1 if none).
            log_out.write("%s , %s , %s , %s\n" % (
                stem, record.name, temp_seq.count("???"), temp_seq.find("???")))
    return
def test_join_UnknownSeq_TypeError_iter(self):
    """join() rejects non-iterables and iterables holding unsupported items."""
    spacer = UnknownSeq(5, character="-")
    # First a bare integer, then a list that hides an integer among strings.
    for bad_argument in (5, ["ATG", "ATG", 5, "ATG"]):
        with self.assertRaises(TypeError):
            spacer.join(bad_argument)
def test_generated(self):
    """Write and read back odd SeqRecord objects."""
    record1 = SeqRecord(Seq("ACGT" * 500, generic_dna),
                        id="Test",
                        description="Long " * 500,
                        letter_annotations={"phred_quality": [40, 30, 20, 10] * 500})
    record2 = SeqRecord(MutableSeq("NGGC" * 1000),
                        id="Mut",
                        description="very " * 1000 + "long",
                        letter_annotations={"phred_quality": [0, 5, 5, 10] * 1000})
    record3 = SeqRecord(UnknownSeq(2000, character="N"),
                        id="Unk",
                        description="l" + ("o" * 1000) + "ng",
                        letter_annotations={"phred_quality": [0, 1] * 1000})
    record4 = SeqRecord(Seq("ACGT" * 500),
                        id="no_descr",
                        description="",
                        name="",
                        letter_annotations={"phred_quality": [40, 50, 60, 62] * 500})
    record5 = SeqRecord(Seq("", generic_dna),
                        id="empty_p",
                        description="(could have been trimmed lots)",
                        letter_annotations={"phred_quality": []})
    record6 = SeqRecord(Seq(""),
                        id="empty_s",
                        description="(could have been trimmed lots)",
                        letter_annotations={"solexa_quality": []})
    record7 = SeqRecord(Seq("ACNN" * 500),
                        id="Test_Sol",
                        description="Long " * 500,
                        letter_annotations={"solexa_quality": [40, 30, 0, -5] * 500})
    record8 = SeqRecord(Seq("ACGT"),
                        id="HighQual",
                        description="With very large qualities that even Sanger FASTQ can't hold!",
                        letter_annotations={"solexa_quality": [0, 10, 100, 1000]})
    # TODO - Record with no identifier?
    records = [record1, record2, record3, record4,
               record5, record6, record7, record8]
    # TODO - Have a Biopython defined "DataLossWarning?"
    # catch_warnings() restores the filter list on exit. The previous
    # simplefilter()/filters.pop() pairing removed the *last* filter while
    # simplefilter() inserts at the *front*, so it dropped an unrelated
    # filter and left the "ignore" rule installed for later tests.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', BiopythonWarning)
        # TODO - Include phd output?
        for fmt in ["fasta", "fastq", "fastq-solexa", "fastq-illumina", "qual"]:
            handle = StringIO()
            SeqIO.write(records, handle, fmt)
            handle.seek(0)
            compare_records(records,
                            list(SeqIO.parse(handle, fmt)),
                            truncation_expected(fmt))
def merge(records, length=20, spacer='n'):
    """Merge multiple SeqRecords into one, separated by a defined spacer.

    The id, name, description and a fixed set of annotations of the first
    record are carried over to the merged record.

    :param records: Sequence of SeqRecords to be merged
    :param length: Length of the spacer in kbp
    :param spacer: Kind of spacer to use ('n' for UnknownSeq spacer,
                   'stop' for all-frame stop codon spacer)
    :return: A single SeqRecord that is the product of the merge; the first
             record itself when only one record is given.
    :raises ValueError: for an unknown spacer kind or when no records are given.
    """
    if spacer != 'n' and spacer != 'stop':
        raise ValueError("Invalid spacer: %r, use either 'n' or 'stop'" % spacer)
    if len(records) == 0:
        raise ValueError("No records given")

    if spacer == 'stop':
        spacer_seq = Seq(ALL_FRAME_STOP_MOTIF * 40 * length, Alphabet.generic_dna)
    else:
        spacer_seq = UnknownSeq(length * 1000, alphabet=Alphabet.generic_dna,
                                character='N')

    first = records[0]
    if len(records) == 1:
        return first

    # Remember what identifies the first record; record addition below
    # produces fresh records, so these values are re-applied at the end.
    identity = (first.id, first.name, first.description)
    annotation_defaults = (
        ('date', ''),
        ('source', ''),
        ('organism', ''),
        ('taxonomy', []),
        ('data_file_division', 'UNK'),
        ('topology', 'linear'),
    )
    carried = [(key, first.annotations.get(key, fallback))
               for key, fallback in annotation_defaults]

    merged = first
    for number, rec in enumerate(records[1:], 1):
        spacer_id = 'spacer_{}'.format(number)
        location = FeatureLocation(0, length * 1000, 0)
        spacer_feature = SeqFeature(location, type='misc_feature', id=spacer_id,
                                    qualifiers={'note': [spacer_id]})
        spacer_rec = SeqRecord(spacer_seq, id=spacer_id, name=spacer_id,
                               description=spacer_id, features=[spacer_feature])
        merged = merged + spacer_rec + rec

    merged.id, merged.name, merged.description = identity
    for key, value in carried:
        merged.annotations[key] = value
    return merged
def replace_missing(alignment):
    """Replace all-gap/all-N rows of an alignment with unknown ('N') sequence.

    A record counts as missing when its sequence consists solely of ``-``
    and ``N``/``n`` characters. Such a record gets its ``seq`` replaced by an
    ``UnknownSeq`` of the same length filled with ``N``. Records are modified
    in place.

    :param alignment: iterable of records with a ``seq`` attribute
                      (written for a MultipleSeqAlignment object).
    :return: the same ``alignment`` object that was passed in.
    """
    for record in alignment:
        seq_length = len(record.seq)
        # Normalise case and treat N as a gap, then compare against a gap-only
        # string of this record's own length (all rows of an alignment share
        # one length, so this equals comparing against the first row).
        if str(record.seq).upper().replace("N", "-") == "-" * seq_length:
            record.seq = UnknownSeq(seq_length, character='N')
    return alignment
def test_join_UnknownSeq_mixed_alpha(self):
    """A DNA spacer can still join items carrying other alphabets."""
    spacer = UnknownSeq(5, character="-", alphabet=generic_dna)

    rna_runs = [
        UnknownSeq(5, character="-", alphabet=generic_rna),
        UnknownSeq(5, character="-", alphabet=generic_rna),
    ]
    self.assertEqual("-" * 15, spacer.join(rna_runs))

    protein_items = [
        Seq("NNNNN", generic_protein),
        UnknownSeq(5, character="-", alphabet=generic_protein),
    ]
    self.assertEqual("N" * 5 + "-" * 10, spacer.join(protein_items))
def gff_write(self, out_rec_features, contig_id = None, contig_len = None, out_file = None, global_context = None):
    """Write one contig's features to ``out_file`` through ``GFF.write``.

    The features are first passed to ``self.update_stashed`` together with
    ``global_context``, then attached to a SeqRecord whose sequence is an
    ``UnknownSeq`` of ``contig_len`` and whose id is ``contig_id``.

    Does nothing when ``out_file`` is None.
    """
    # Without an output destination there is nothing to write.
    if out_file is None:
        return
    # NOTE(review): the source this was reconstructed from had lost its
    # indentation; the statements below are assumed to all sit under this
    # guard (i.e. nothing is written for an empty feature list) -- confirm.
    if out_rec_features:
        self.update_stashed(out_rec_features, global_context)
        # Placeholder sequence: only its length and the record id are supplied.
        out_rec = SeqRecord(UnknownSeq(length=contig_len), id = contig_id)
        out_rec.features = out_rec_features
        GFF.write([out_rec], out_file)
def test_process_wgs_master_scaffolds_wgs(self):
    """process_wgs_master_scaffolds() fetches contigs for WGS/WGS_SCAFLD records."""
    # Stub the NCBI fix-up so each record simply comes back wrapped in a list.
    mock('utils.fix_wgs_master_record', tracker=self.tt,
         returns_func=lambda x: [x])

    wgs_record = FakeRecord(annotations={'wgs': 'foo'})
    wgs_record.seq = UnknownSeq(23)
    scaffold_record = FakeRecord(annotations={'wgs_scafld': 'bar'})
    scaffold_record.seq = UnknownSeq(42)
    fake_records = [wgs_record, scaffold_record]

    # One traced call is expected per record.
    # NOTE(review): literal kept exactly as it appears in the reviewed source;
    # presumably the trace comparison tolerates whitespace differences -- confirm.
    trace = """Called utils.fix_wgs_master_record( <antismash.test.test_utils.FakeRecord object at ...>) Called utils.fix_wgs_master_record( <antismash.test.test_utils.FakeRecord object at ...>) """

    processed = utils.process_wgs_master_scaffolds(fake_records)
    self.assertEqual(fake_records, processed)
    assert_same_trace(self.tt, trace)
def setUp(self):
    """Create a 21 bp circular record plus a CDS and a gene sharing one location."""
    record = SeqRecord(UnknownSeq(21))
    record.annotations["topology"] = "circular"
    self.seqrec = record

    # Two-part join: 12..21 followed by 0..9, both on the forward strand.
    parts = [
        FeatureLocation(12, 21, strand=1),
        FeatureLocation(0, 9, strand=1),
    ]
    loc = CompoundLocation(parts, operator="join")
    self.seqcds = SeqFeature(loc, type="CDS")
    self.seqgene = SeqFeature(loc, type="gene")
def _retrieve_seq(adaptor, primary_id): # The database schema ensures there will be only one matching # row in the table. # If an UnknownSeq was recorded, seq will be NULL, # but length will be populated. This means length(seq) # will return None. seqs = adaptor.execute_and_fetchall( "SELECT alphabet, length, length(seq) FROM biosequence WHERE bioentry_id = %s", (primary_id, ), ) if not seqs: return assert len(seqs) == 1 moltype, given_length, length = seqs[0] try: length = int(length) given_length = int(length) assert length == given_length have_seq = True except TypeError: assert length is None seqs = adaptor.execute_and_fetchall( "SELECT alphabet, length, seq FROM biosequence WHERE bioentry_id = %s", (primary_id, ), ) assert len(seqs) == 1 moltype, given_length, seq = seqs[0] assert seq is None or seq == "" length = int(given_length) have_seq = False del seq del given_length moltype = moltype.lower() # might be upper case in database # We have no way of knowing if these sequences will use IUPAC # alphabets, and we certainly can't assume they are unambiguous! if moltype == "dna": alphabet = Alphabet.generic_dna elif moltype == "rna": alphabet = Alphabet.generic_rna elif moltype == "protein": alphabet = Alphabet.generic_protein elif moltype == "unknown": # This is used in BioSQL/Loader.py and would happen # for any generic or nucleotide alphabets. alphabet = Alphabet.single_letter_alphabet else: raise AssertionError("Unknown moltype: %s" % moltype) if have_seq: return DBSeq(primary_id, adaptor, alphabet, 0, int(length)) else: return UnknownSeq(length, alphabet)
def test_equality(self):
    """Sequence objects equal matching strings but never ints or None."""
    # Seq and MutableSeq are built from text and share the same expectations.
    for make in (Seq, MutableSeq):
        self.assertEqual(make("6"), "6")
        self.assertNotEqual(make("6"), 6)
        self.assertEqual(make(""), "")
        self.assertNotEqual(make(""), None)
        self.assertEqual(make("None"), "None")
        self.assertNotEqual(make("None"), None)

    # UnknownSeq is built from a length (and optionally a fill character).
    self.assertEqual(UnknownSeq(1, character="6"), "6")
    self.assertNotEqual(UnknownSeq(1, character="6"), 6)
    self.assertEqual(UnknownSeq(0), "")
    self.assertNotEqual(UnknownSeq(0), None)
def test_join_UnknownSeq_with_file(self):
    """UnknownSeq.join concatenates sequences parsed from a FASTA file."""
    filename = 'Fasta/f003'
    parsed = [record.seq for record in SeqIO.parse(filename, 'fasta')]
    as_text = [str(item) for item in parsed]

    empty_spacer = UnknownSeq(0, character="-", alphabet=generic_dna)
    dash_spacer = UnknownSeq(5, character="-", alphabet=generic_dna)

    # Seq objects joined with the empty spacer, then with the five-dash one.
    joined_empty = empty_spacer.join(parsed)
    joined_dashes = dash_spacer.join(parsed)

    self.assertEqual(str(joined_empty), str(empty_spacer).join(as_text))
    self.assertEqual(str(joined_dashes), str(dash_spacer).join(as_text))

    # Passing the SeqRecord iterator itself (not Seq objects) must be rejected.
    with self.assertRaises(TypeError):
        empty_spacer.join(SeqIO.parse(filename, 'fasta'))
def __init__(self, biopython_object=None):
    """Wrap a Biopython sequence in a private SeqRecord and index its features.

    :param biopython_object: ``None`` (start from an empty nucleotide
        record), a ``Seq`` (deep-copied into a new record with empty
        id/name/description) or a ``SeqRecord`` (deep-copied as is).
    :raises TypeError: for any other argument type.
    """
    # first we define our underlying SeqRecord object
    # `is None` rather than `== None`, so the argument's own __eq__ is never
    # consulted.
    if biopython_object is None:
        self._record = SeqRecord(seq=UnknownSeq(0, alphabet=NucleotideAlphabet()),
                                 id='', name='', description='')
    elif isinstance(biopython_object, Seq):
        self._record = SeqRecord(seq=copy.deepcopy(biopython_object),
                                 id='', name='', description='')
    elif isinstance(biopython_object, SeqRecord):
        self._record = copy.deepcopy(biopython_object)
    else:
        # Previously this fell through with `_record` unset and failed on the
        # attribute access below; report the real problem instead.
        raise TypeError(
            "biopython_object must be None, a Seq or a SeqRecord, not %s"
            % type(biopython_object).__name__)

    # define dictionary of features for faster lookup:
    # feature type -> list of indices into self._record.features
    self._features = {}
    for i, feature in enumerate(self._record.features):
        self._features.setdefault(feature.type, []).append(i)
def _retrieve_seq(adaptor, primary_id): # The database schema ensures there will be only one matching # row in the table. # If an UnknownSeq was recorded, seq will be NULL, # but length will be populated. This means length(seq) # will return None. seqs = adaptor.execute_and_fetchall( "SELECT alphabet, length, length(seq) FROM biosequence WHERE bioentry_id = %s", (primary_id, ), ) if not seqs: return assert len(seqs) == 1 moltype, given_length, length = seqs[0] try: length = int(length) given_length = int(length) assert length == given_length have_seq = True except TypeError: assert length is None seqs = adaptor.execute_and_fetchall( "SELECT alphabet, length, seq FROM biosequence WHERE bioentry_id = %s", (primary_id, ), ) assert len(seqs) == 1 moltype, given_length, seq = seqs[0] assert seq is None or seq == "" length = int(given_length) have_seq = False del seq del given_length if have_seq: return DBSeq(primary_id, adaptor, alphabet=None, start=0, length=int(length)) else: if moltype in ("dna", "rna"): character = "N" elif moltype == "protein": character = "X" else: character = "?" return UnknownSeq(length, character=character)
def concatenate(infiles, outfile):
    """Concatenate FASTA alignments column-wise and write the result as FASTA.

    Every sequence id seen in any input appears in the output. Where an id
    is missing from one input alignment, unknown data of that alignment's
    length is inserted in its place.

    :param infiles: paths of the FASTA alignment files, in concatenation order.
    :param outfile: destination passed to ``AlignIO.write``.
    """
    # Read each input inside a `with` block so its handle is closed promptly.
    alignments = []
    for path in infiles:
        with open(path, "r") as handle:
            alignments.append(AlignIO.read(handle, "fasta"))

    # Get the full set of labels (i.e. sequence ids) for all the alignments
    all_labels = set(seq.id for aln in alignments for seq in aln)

    # Make a dictionary to store info as we go along
    # (defaultdict is convenient -- asking for a missing key gives back an empty list)
    tmp = defaultdict(list)

    # Assume all alignments have same alphabet
    #alphabet = alignments[0]._alphabet

    for aln in alignments:
        length = aln.get_alignment_length()

        # check if any labels are missing in the current alignment
        these_labels = set(rec.id for rec in aln)
        missing = all_labels - these_labels

        # if any are missing, create unknown data of the right length,
        # stuff the string representation into the tmp dict
        for label in missing:
            #new_seq = UnknownSeq(length, alphabet=alphabet)
            new_seq = UnknownSeq(length)
            tmp[label].append(str(new_seq))

        # else stuff the string representation into the tmp dict
        for rec in aln:
            tmp[rec.id].append(str(rec.seq))

    # Stitch all the substrings together using join (most efficient way),
    # and build the Biopython data structures Seq, SeqRecord and MultipleSeqAlignment
    #msa = MultipleSeqAlignment(SeqRecord(Seq(''.join(v), alphabet=alphabet), id=k)
    msa = MultipleSeqAlignment(
        SeqRecord(Seq(''.join(v)), id=k) for (k, v) in tmp.items())
    # with open(outfile, "w") as out:
    AlignIO.write(msa, outfile, "fasta")

#tmpdir = tempfile.TemporaryDirectory()
#print(tmpdir.name)
#timeit.timeit('concatenate(infiles,outfile)',
#              setup='infiles=simAlignments(10,10,tmpdir.name),outfile=tempfile.NamedTemporaryFile(dir=tmpdir).name')
# python -m timeit -s 'import tempfile; tmpdir=tempfile.TemporaryDirectory(); from concatenate import simAlignments; infiles=simAlignments(10,10,tmpdir.name); outf=tempfile.NamedTemporaryFile().name' "from concatenate import concatenate; concatenate(infiles,outf)"
#100 loops, best of 3: 2.94 msec per loop
def _get_rec(self, base, info_dict): """Retrieve a record to add features to.""" max_loc = info_dict.get("location", (0, 1))[1] try: cur_rec = base[info_dict["rec_id"]] # update generated unknown sequences with the expected maximum length if isinstance(cur_rec.seq, UnknownSeq): cur_rec.seq._length = max([max_loc, cur_rec.seq._length]) return cur_rec, base except KeyError: if self._create_missing: new_rec = SeqRecord(UnknownSeq(max_loc), info_dict["rec_id"]) base[info_dict["rec_id"]] = new_rec return new_rec, base else: raise
def concatenate(alignments):
    """Concatenate alignments column-wise, padding absent ids with unknown data.

    The alphabet of the first alignment is used for the padding and for the
    resulting sequences. Returns a single MultipleSeqAlignment.
    """
    # Every sequence id that occurs in at least one alignment.
    all_labels = {seq.id for aln in alignments for seq in aln}
    logger.debug("extracted {} different labels in all alignments: {}".format(
        len(all_labels), all_labels))

    # label -> list of per-alignment substrings, in alignment order
    concat_buf = defaultdict(list)

    # Assume all alignments share the alphabet of the first one.
    alphabet = alignments[0]._alphabet
    logger.debug('detected alphabet: {}'.format(alphabet))

    for aln in alignments:
        length = aln.get_alignment_length()
        these_labels = {rec.id for rec in aln}
        missing = all_labels - these_labels
        logger.debug(
            "alignment of length {} with {} sequences, {} missing ({})".format(
                length, len(these_labels), len(missing), missing))

        # Ids absent from this alignment get unknown data of matching length ...
        for label in missing:
            concat_buf[label].append(str(UnknownSeq(length, alphabet=alphabet)))
        # ... ids that are present contribute their actual sequence text.
        for rec in aln:
            concat_buf[rec.id].append(str(rec.seq))

    # Stitch the pieces together with join and build the Biopython objects.
    rows = (SeqRecord(Seq(''.join(seq_arr), alphabet=alphabet), id=label)
            for (label, seq_arr) in concat_buf.items())
    msa = MultipleSeqAlignment(rows)
    logger.info(
        "concatenated MSA of {} taxa and total length {} created".format(
            len(msa), len(msa[0])))
    return msa
def concatenate(alignments):
    """Concatenate a list of Bio.Align.MultipleSeqAlignment objects.

    Sequences missing from an alignment are padded with unknown data
    (Bio.Seq.UnknownSeq) of that alignment's length.

    Returns a single Bio.Align.MultipleSeqAlignment.

    Limitations: any annotations in the sub-alignments are lost in the
    concatenated alignment.
    """
    # Every sequence id that occurs in at least one alignment.
    all_labels = {seq.id for aln in alignments for seq in aln}

    # label -> list of per-alignment substrings, in alignment order
    pieces = defaultdict(list)

    # Assume all alignments share the alphabet of the first one.
    alphabet = alignments[0]._alphabet

    for aln in alignments:
        length = aln.get_alignment_length()
        present = {rec.id for rec in aln}

        # Ids absent from this alignment get unknown data of matching length ...
        for label in all_labels - present:
            pieces[label].append(str(UnknownSeq(length, alphabet=alphabet)))
        # ... ids that are present contribute their actual sequence text.
        for rec in aln:
            pieces[rec.id].append(str(rec.seq))

    # Stitch the pieces together with join and build the Biopython objects.
    rows = (
        SeqRecord(Seq(''.join(v), alphabet=alphabet), id=k, name=k, description=k)
        for (k, v) in pieces.items()
    )
    return MultipleSeqAlignment(rows)
def cfg_out_iterator(handle, alphabet=single_letter_alphabet):
    """Generator to iterate Centrifuge output (as SeqRecord objects).

    Arguments:
     - handle - input file
     - alphabet - optional alphabet

    Each yielded record has an empty UnknownSeq; the parsed columns are
    stored in its id, name, description, dbxrefs and annotations.
    """

    def adapt(raw, tax_id):
        # From a Centrifuge score get the "single hit equivalent length";
        # report the offending value before propagating a parse failure.
        try:
            return float(raw)**0.5 + 15
        except ValueError:
            print(f'Error parsing score ({raw}) for taxid {tax_id}'
                  f' in {handle}...')
            raise

    for row in simple_out_parser(handle):
        (read_id, seq_id, tax_id, score, second_score,
         hit_length, query_length, num_matches) = row

        words = read_id.split(None, 1)
        if words:
            first_word = words[0]
        else:
            assert not read_id, repr(read_id)
            # Should we use SeqRecord default for no ID?
            first_word = ""

        adapted_score = adapt(score, tax_id)
        adapted_2nd_score = adapt(second_score, tax_id)

        annotations = {
            'taxID': tax_id,
            'score': adapted_score,
            '2ndBestScore': adapted_2nd_score,
            'hitLength': hit_length,
            'queryLength': query_length,
            'numMatches': int(num_matches),
        }
        yield SeqRecord(UnknownSeq(0, alphabet), id=first_word,
                        name=first_word, description=read_id,
                        dbxrefs=[seq_id], annotations=annotations)
def join_seqs(s1, s2, length=None):
    """Join read ``s1`` with the reverse complement of ``s2``.

    When ``length`` is given (and non-zero) the reads are separated by a
    '-' pad with zero phred quality so that the total equals ``length``;
    the program exits if the pad cannot be built (ValueError).
    The joined record takes the id of ``s1`` minus its last two characters.
    """
    if not length:
        joined = s1 + s2.reverse_complement()
    else:
        gap = length - len(s1) - len(s2)
        try:
            filler = SeqRecord(
                UnknownSeq(gap, character='-'),
                letter_annotations={'phred_quality': [0] * gap},
            )
        except ValueError:
            sys.exit(
                'Total length of the two reads exceeds given length (%s)' %
                (length))
        joined = s1 + filler + s2.reverse_complement()

    ## assumes the read ID ends in a 2-char suffix for direction (e.g. _1)
    joined.id = s1.id[:-2]
    joined.description = ''  ## not required for fastq
    return joined
def prepare_cluster_qual_files(work_dir, qual_file, cluster_seq_dir):
    """Write one ``.qual`` file per cluster FASTA file.

    Creates ``<work_dir>/cluster_qual`` and, for every regular file in
    ``cluster_seq_dir``, writes ``<name-before-first-dot>.qual`` holding the
    quality scores (looked up by sequence name in ``qual_file``) of the
    sequences in that cluster. Each written file is post-processed with
    ``sed`` to remove the space after ``>`` in the headers.

    :param work_dir: directory in which ``cluster_qual`` is created.
    :param qual_file: path of the QUAL file holding all quality scores.
    :param cluster_seq_dir: directory of per-cluster FASTA files.
    :return: path of the created ``cluster_qual`` directory.
    """
    import subprocess

    cluster_qual_dir = work_dir + "/cluster_qual"
    os.mkdir(cluster_qual_dir)

    # get a list of all quality scores
    # Plain "r": the old "rU" mode is rejected by Python 3.11+. to_dict()
    # consumes the parser completely, so the handle can be closed right away.
    with open(qual_file, "r") as fd_qual:
        quals = SeqIO.to_dict(SeqIO.parse(fd_qual, "qual"))

    # get quality scores for the clusters
    for cluster_seq_file in os.listdir(cluster_seq_dir):
        cluster_seq_path = cluster_seq_dir + "/" + cluster_seq_file
        # check if file, can do some more checking here e.g. is fasta file
        if not os.path.isfile(cluster_seq_path):
            continue

        cluster_quals = []
        with open(cluster_seq_path, "r") as fd_cluster_seq:
            for seq in SeqIO.parse(fd_cluster_seq, "fasta"):
                qual = quals[seq.name]
                scores = qual.letter_annotations["phred_quality"]
                # Placeholder sequence of matching length; only the quality
                # values and the description are carried over.
                cluster_qual = SeqRecord(seq=UnknownSeq(len(scores)),
                                         id="",
                                         description=qual.description)
                cluster_qual.letter_annotations["phred_quality"] = scores
                cluster_quals.append(cluster_qual)

        cluster_qual_file = (cluster_qual_dir + "/" +
                             cluster_seq_file.split(".")[0] + ".qual")
        with open(cluster_qual_file, "w") as fd_cluster_qual:
            SeqIO.write(cluster_quals, fd_cluster_qual, "qual")

        # need to replace the space after the > in header
        # Argument list, no shell: file names with spaces or shell
        # metacharacters are passed through literally. As before, the exit
        # status of sed is not checked.
        subprocess.call(["sed", "-i", "s/> />/g", cluster_qual_file])

    return cluster_qual_dir
def test_MutableSeq_init_typeerror(self):
    """MutableSeq() refuses Seq and UnknownSeq arguments with TypeError."""
    # Arguments are built first so only the MutableSeq call may raise.
    for rejected in (Seq("A"), UnknownSeq(1)):
        with self.assertRaises(TypeError):
            MutableSeq(rejected)
class StringMethodTests(unittest.TestCase):
    """Check Seq, UnknownSeq and MutableSeq objects against plain strings.

    Most tests call a method on each example object and compare the result
    with the same operation applied to ``str(example)``.
    """

    _examples = [
        # These are length 9, a multiple of 3 for translation tests:
        Seq("ACGTGGGGT", generic_protein),
        Seq("ACGTGGGGT", generic_nucleotide),
        Seq("ACGTGGGGT", generic_dna),
        Seq("ACGUGGGGU", generic_rna),
        Seq("GG", generic_protein),
        Seq("GG", generic_nucleotide),
        Seq("GG", generic_dna),
        Seq("GG", generic_rna),
        Seq("A", generic_protein),
        Seq("A", generic_nucleotide),
        Seq("A", generic_dna),
        Seq("A", generic_rna),
        UnknownSeq(1),
        UnknownSeq(1, character="n"),
        UnknownSeq(1, generic_rna),
        UnknownSeq(1, generic_rna, "n"),
        UnknownSeq(1, generic_rna, "N"),
        UnknownSeq(12, generic_rna, "N"),
        UnknownSeq(12, generic_dna, "N"),
        UnknownSeq(12, generic_nucleotide, "N"),
        UnknownSeq(12, generic_protein, "X"),
        UnknownSeq(12, character="X"),
        UnknownSeq(12),
    ]
    # Append a mutable copy of every example; iterate over a copy of the
    # list because it is extended inside the loop.
    for seq in _examples[:]:
        if isinstance(seq, Seq):
            _examples.append(seq.tomutable())
    _start_end_values = [0, 1, 2, 1000, -1, -2, -999, None]

    def _test_method(self, method_name, pre_comp_function=None,
                     start_end=False):
        """Check this method matches the plain string's method.

        :param method_name: name of the method to call on each example and
            on its string form.
        :param pre_comp_function: optional callable applied to both results
            before they are compared (e.g. ``str``).
        :param start_end: when true, also call the method with every
            combination of start and end taken from ``_start_end_values``.
        :raises ValueError: when a result differs from the string result.
        """
        self.assertTrue(isinstance(method_name, str))
        for example1 in self._examples:
            if not hasattr(example1, method_name):
                # e.g. MutableSeq does not support find
                continue
            str1 = str(example1)

            for example2 in self._examples:
                if not hasattr(example2, method_name):
                    # e.g. MutableSeq does not support find
                    continue
                str2 = str(example2)

                # Messages use %r throughout: results may be str, list or
                # bool and start/end may be None, so %i could itself fail.
                i = getattr(example1, method_name)(str2)
                j = getattr(str1, method_name)(str2)
                if pre_comp_function:
                    i = pre_comp_function(i)
                    j = pre_comp_function(j)
                if i != j:
                    raise ValueError(
                        "%r.%s(%r) = %r, not %r"
                        % (example1, method_name, str2, i, j))

                # Only the call itself may legitimately raise TypeError
                # (clashing alphabets); keep the comparison outside the try
                # so that a formatting TypeError cannot hide a mismatch.
                try:
                    i = getattr(example1, method_name)(example2)
                except TypeError:
                    # TODO - Check the alphabets do clash!
                    pass
                else:
                    j = getattr(str1, method_name)(str2)
                    if pre_comp_function:
                        i = pre_comp_function(i)
                        j = pre_comp_function(j)
                    if i != j:
                        raise ValueError(
                            "%r.%s(%r) = %r, not %r"
                            % (example1, method_name, example2, i, j))

                if start_end:
                    for start in self._start_end_values:
                        i = getattr(example1, method_name)(str2, start)
                        j = getattr(str1, method_name)(str2, start)
                        if pre_comp_function:
                            i = pre_comp_function(i)
                            j = pre_comp_function(j)
                        if i != j:
                            raise ValueError(
                                "%r.%s(%r, %r) = %r, not %r"
                                % (example1, method_name, str2, start, i, j))

                        for end in self._start_end_values:
                            i = getattr(example1, method_name)(str2, start,
                                                               end)
                            j = getattr(str1, method_name)(str2, start, end)
                            if pre_comp_function:
                                i = pre_comp_function(i)
                                j = pre_comp_function(j)
                            if i != j:
                                raise ValueError(
                                    "%r.%s(%r, %r, %r) = %r, not %r"
                                    % (example1, method_name, str2, start,
                                       end, i, j))

    def test_str_count(self):
        """Check matches the python string count method."""
        self._test_method("count", start_end=True)

    def test_str_count_overlap_GG(self):
        """Check our count_overlap method using GG."""
        # Testing with self._examples
        expected = [
            3, 3, 3, 3, 1, 1, 1, 1, 0, 0, 0, 0,  # Seq() Tests
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # UnknownSeq() Tests
        ]
        expected *= 2  # MutableSeq() Tests

        # A real assertion (not a bare assert) so it survives python -O.
        self.assertEqual(len(self._examples), len(expected))

        for seq, exp in zip(self._examples, expected):
            # Using search term GG as a string
            self.assertEqual(seq.count_overlap("GG"), exp)
            self.assertEqual(seq.count_overlap("G" * 5), 0)
            # Using search term GG as a Seq with generic alphabet
            self.assertEqual(seq.count_overlap(Seq("GG")), exp)
            self.assertEqual(seq.count_overlap(Seq("G" * 5)), 0)

    def test_count_overlap_start_end_GG(self):
        """Check our count_overlap method using GG with variable ends and starts."""
        # Testing Seq() and MutableSeq() with variable start and end arguments
        start_end_exp = [(1, 7, 3), (3, None, 3), (3, 6, 2), (4, 6, 1),
                         (4, -1, 2), (-5, None, 2), (-5, 7, 2), (7, -5, 0),
                         (-100, None, 3), (None, 100, 3), (-100, 1000, 3)]

        testing_seq = "GTAGGGGAG"

        for start, end, exp in start_end_exp:
            self.assertEqual(
                Seq(testing_seq).count_overlap("GG", start, end), exp)
            self.assertEqual(
                MutableSeq(testing_seq).count_overlap("GG", start, end), exp)

        # Testing Seq() and MutableSeq() with a more heterogeneous sequence
        self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("GG"), 5)
        self.assertEqual(MutableSeq("GGGTGGTAGGG").count_overlap("GG"), 5)
        self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("GG", 2, 8), 1)
        self.assertEqual(
            MutableSeq("GGGTGGTAGGG").count_overlap("GG", 2, 8), 1)
        self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("GG", -11, 6), 3)
        self.assertEqual(
            MutableSeq("GGGTGGTAGGG").count_overlap("GG", -11, 6), 3)
        self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("GG", 7, 2), 0)
        self.assertEqual(
            MutableSeq("GGGTGGTAGGG").count_overlap("GG", 7, 2), 0)
        self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("GG", -2, -10), 0)

        # Testing UnknownSeq() with variable start and end arguments
        alphabet_char_start_end_exp = [(generic_rna, "N", 1, 7, 0),
                                       (generic_dna, "N", 1, 7, 0),
                                       (generic_rna, "N", -4, None, 0),
                                       (generic_dna, "N", -4, None, 0),
                                       (generic_protein, "X", 1, 7, 0)]

        for alpha, char, start, end, exp in alphabet_char_start_end_exp:
            self.assertEqual(
                UnknownSeq(12, alpha, char).count_overlap("GG", start, end),
                exp)
        self.assertEqual(
            UnknownSeq(12, character="X").count_overlap("GG", 1, 7), 0)

        # Testing UnknownSeq() with some more cases including unusual edge cases
        substr_start_end_exp = [("G", 100, 105, 0), ("G", -1, 4, 0),
                                ("G", 4, -1, 0), ("G", -8, -2, 0),
                                ("G", -2, -8, 0), ("G", 8, 2, 0),
                                ("G", 2, 8, 0), ("GG", 8, 2, 0),
                                ("GG", 2, 8, 0), ("GG", -5, -1, 0),
                                ("GG", 1, 5, 0), ("GGG", None, None, 0),
                                ("GGGGGGGGG", None, None, 0),
                                ("GGG", 1, 2, 0)]

        for substr, start, end, exp in substr_start_end_exp:
            self.assertEqual(
                UnknownSeq(7, character="N").count_overlap(substr, start,
                                                           end),
                exp)
        self.assertEqual(
            UnknownSeq(7, character="N").count_overlap("GG", 1), 0)

    def test_str_count_overlap_NN(self):
        """Check our count_overlap method using NN."""
        # Testing with self._examples
        expected = [
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # Seq() Tests
            0, 0, 0, 0, 0, 11, 11, 11, 0, 0, 0,  # UnknownSeq() Tests
        ]
        expected *= 2  # MutableSeq() Tests

        # A real assertion (not a bare assert) so it survives python -O.
        self.assertEqual(len(self._examples), len(expected))

        for seq, exp in zip(self._examples, expected):
            # Using search term NN as a string
            self.assertEqual(seq.count_overlap("NN"), exp)
            self.assertEqual(seq.count_overlap("N" * 13), 0)
            # Using search term NN as a Seq with generic alphabet
            self.assertEqual(seq.count_overlap(Seq("NN")), exp)
            self.assertEqual(seq.count_overlap(Seq("N" * 13)), 0)

    def test_count_overlap_start_end_NN(self):
        """Check our count_overlap method using NN with variable ends and starts."""
        # Testing Seq() and MutableSeq() with variable start and end arguments
        start_end_exp = [(1, 7, 0), (3, None, 0), (3, 6, 0), (4, 6, 0),
                         (4, -1, 0), (-5, None, 0), (-5, 7, 0), (7, -5, 0),
                         (-100, None, 0), (None, 100, 0), (-100, 1000, 0)]

        testing_seq = "GTAGGGGAG"

        for start, end, exp in start_end_exp:
            self.assertEqual(
                Seq(testing_seq).count_overlap("NN", start, end), exp)
            self.assertEqual(
                MutableSeq(testing_seq).count_overlap("NN", start, end), exp)

        # Testing Seq() and MutableSeq() with a more heterogeneous sequence
        self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("NN"), 0)
        self.assertEqual(MutableSeq("GGGTGGTAGGG").count_overlap("NN"), 0)
        self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("NN", 2, 8), 0)
        self.assertEqual(
            MutableSeq("GGGTGGTAGGG").count_overlap("NN", 2, 8), 0)
        self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("NN", -11, 6), 0)
        self.assertEqual(
            MutableSeq("GGGTGGTAGGG").count_overlap("NN", -11, 6), 0)
        self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("NN", 7, 2), 0)
        self.assertEqual(
            MutableSeq("GGGTGGTAGGG").count_overlap("NN", 7, 2), 0)
        self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("NN", -10, -2), 0)

        # Testing UnknownSeq() with variable start and end arguments
        alphabet_char_start_end_exp = [(generic_rna, "N", 1, 7, 5),
                                       (generic_dna, "N", 1, 7, 5),
                                       (generic_rna, "N", -4, None, 3),
                                       (generic_dna, "N", -4, None, 3),
                                       (generic_protein, "X", 1, 7, 0)]

        for alpha, char, start, end, exp in alphabet_char_start_end_exp:
            self.assertEqual(
                UnknownSeq(12, alpha, char).count_overlap("NN", start, end),
                exp)
        self.assertEqual(
            UnknownSeq(12, character="X").count_overlap("NN", 1, 7), 0)

        # Testing UnknownSeq() with some more cases including unusual edge cases
        substr_start_end_exp = [("N", 100, 105, 0), ("N", -1, 4, 0),
                                ("N", 4, -1, 2), ("N", -8, -2, 5),
                                ("N", -2, -8, 0), ("N", 8, 2, 0),
                                ("N", 2, 8, 5), ("NN", 8, 2, 0),
                                ("NN", 2, 8, 4), ("NN", -5, -1, 3),
                                ("NN", 1, 5, 3), ("NNN", None, None, 5),
                                ("NNNNNNNNN", None, None, 0),
                                ("NNN", 1, 2, 0)]

        for substr, start, end, exp in substr_start_end_exp:
            self.assertEqual(
                UnknownSeq(7, character="N").count_overlap(substr, start,
                                                           end),
                exp)
        self.assertEqual(
            UnknownSeq(7, character="N").count_overlap("NN", 1), 5)

    def test_str_find(self):
        """Check matches the python string find method."""
        self._test_method("find", start_end=True)

    def test_str_rfind(self):
        """Check matches the python string rfind method."""
        self._test_method("rfind", start_end=True)

    def test_str_startswith(self):
        """Check matches the python string startswith method."""
        self._test_method("startswith", start_end=True)
        self.assertTrue("ABCDE".startswith(("ABE", "OBE", "ABC")))

        # Now check with a tuple of sub sequences
        for example1 in self._examples:
            if not hasattr(example1, "startswith"):
                # e.g. MutableSeq does not support this
                continue
            subs = tuple([
                example1[start:start + 2]
                for start in range(0, len(example1) - 2, 3)
            ])
            subs_str = tuple([str(s) for s in subs])

            self.assertEqual(
                str(example1).startswith(subs_str),
                example1.startswith(subs))
            self.assertEqual(
                str(example1).startswith(subs_str),
                example1.startswith(subs_str))  # strings!
            self.assertEqual(
                str(example1).startswith(subs_str, 3),
                example1.startswith(subs, 3))
            self.assertEqual(
                str(example1).startswith(subs_str, 2, 6),
                example1.startswith(subs, 2, 6))

    def test_str_endswith(self):
        """Check matches the python string endswith method."""
        self._test_method("endswith", start_end=True)
        self.assertTrue("ABCDE".endswith(("ABE", "OBE", "CDE")))

        # Now check with a tuple of sub sequences
        for example1 in self._examples:
            if not hasattr(example1, "endswith"):
                # e.g. MutableSeq does not support this
                continue
            subs = tuple([
                example1[start:start + 2]
                for start in range(0, len(example1) - 2, 3)
            ])
            subs_str = tuple([str(s) for s in subs])

            self.assertEqual(
                str(example1).endswith(subs_str),
                example1.endswith(subs))
            # This used to re-check startswith (copy-paste slip); the tuple
            # of plain strings must be accepted by endswith as well.
            self.assertEqual(
                str(example1).endswith(subs_str),
                example1.endswith(subs_str))  # strings!
            self.assertEqual(
                str(example1).endswith(subs_str, 3),
                example1.endswith(subs, 3))
            self.assertEqual(
                str(example1).endswith(subs_str, 2, 6),
                example1.endswith(subs, 2, 6))

    def test_str_strip(self):
        """Check matches the python string strip method."""
        self._test_method("strip", pre_comp_function=str)

    def test_str_rstrip(self):
        """Check matches the python string rstrip method."""
        self._test_method("rstrip", pre_comp_function=str)

    def test_str_split(self):
        """Check matches the python string split method."""
        # Calling (r)split should return a list of Seq-like objects, we'll
        # just apply str() to each of them so it matches the string method
        self._test_method("split",
                          pre_comp_function=lambda x: [str(y) for y in x])

    def test_str_rsplit(self):
        """Check matches the python string rsplit method."""
        # Calling (r)split should return a list of Seq-like objects, we'll
        # just apply str() to each of them so it matches the string method
        self._test_method("rsplit",
                          pre_comp_function=lambda x: [str(y) for y in x])

    def test_str_lsplit(self):
        """Check matches the python string lstrip method.

        Python strings have no ``lsplit`` method; this test previously
        re-ran the rstrip check. NOTE(review): ``lstrip`` is presumed to be
        the intended target, as it was the only strip/split variant left
        untested -- confirm.
        """
        self._test_method("lstrip", pre_comp_function=str)

    def test_str_length(self):
        """Check matches the python string __len__ method."""
        for example1 in self._examples:
            str1 = str(example1)
            self.assertEqual(len(example1), len(str1))

    def test_str_upper(self):
        """Check matches the python string upper method."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            str1 = str(example1)
            self.assertEqual(str(example1.upper()), str1.upper())

    def test_str_lower(self):
        """Check matches the python string lower method."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            str1 = str(example1)
            self.assertEqual(str(example1.lower()), str1.lower())

    def test_str_hash(self):
        """Check the hash matches the hash of the plain string."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            with warnings.catch_warnings():
                # Silence change in behaviour warning
                warnings.simplefilter('ignore', BiopythonWarning)
                self.assertEqual(
                    hash(str(example1)), hash(example1),
                    "Hash mismatch, %r for %r vs %r for %r"
                    % (hash(str(example1)), id(example1),
                       hash(example1), example1))

    def test_str_comparison(self):
        """Check the rich comparisons match those of the plain strings."""
        for example1 in self._examples:
            for example2 in self._examples:
                with warnings.catch_warnings():
                    # Silence alphabet warning
                    warnings.simplefilter('ignore', BiopythonWarning)
                    self.assertEqual(
                        str(example1) == str(example2),
                        example1 == example2,
                        "Checking %r == %r" % (example1, example2))
                    self.assertEqual(
                        str(example1) != str(example2),
                        example1 != example2,
                        "Checking %r != %r" % (example1, example2))
                    self.assertEqual(
                        str(example1) < str(example2),
                        example1 < example2,
                        "Checking %r < %r" % (example1, example2))
                    self.assertEqual(
                        str(example1) <= str(example2),
                        example1 <= example2,
                        "Checking %r <= %r" % (example1, example2))
                    self.assertEqual(
                        str(example1) > str(example2),
                        example1 > example2,
                        "Checking %r > %r" % (example1, example2))
                    self.assertEqual(
                        str(example1) >= str(example2),
                        example1 >= example2,
                        "Checking %r >= %r" % (example1, example2))

    def test_str_getitem(self):
        """Check slicing and indexing works like a string."""
        for example1 in self._examples:
            str1 = str(example1)
            for i in self._start_end_values:
                if i is not None and abs(i) < len(example1):
                    self.assertEqual(str(example1[i]), str1[i])
                self.assertEqual(str(example1[:i]), str1[:i])
                self.assertEqual(str(example1[i:]), str1[i:])
                for j in self._start_end_values:
                    self.assertEqual(str(example1[i:j]), str1[i:j])
                    for step in range(-3, 4):
                        if step == 0:
                            # A zero step must be rejected, as for strings.
                            # (The old code called a non-existent
                            # self._assert if no exception was raised.)
                            with self.assertRaises(ValueError):
                                example1[i:j:step]
                        else:
                            self.assertEqual(str(example1[i:j:step]),
                                             str1[i:j:step])

    def test_tomutable(self):
        """Check obj.tomutable() method."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            mut = example1.tomutable()
            self.assertTrue(isinstance(mut, MutableSeq))
            self.assertEqual(str(mut), str(example1))
            self.assertEqual(mut.alphabet, example1.alphabet)

    def test_toseq(self):
        """Check obj.toseq() method."""
        for example1 in self._examples:
            try:
                seq = example1.toseq()
            except AttributeError:
                # Only objects which already are Seq may lack toseq()
                self.assertTrue(isinstance(example1, Seq))
                continue
            self.assertTrue(isinstance(seq, Seq))
            self.assertEqual(str(seq), str(example1))
            self.assertEqual(seq.alphabet, example1.alphabet)

    def test_the_complement(self):
        """Check obj.complement() method."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            try:
                comp = example1.complement()
            except ValueError as e:
                self.assertEqual(str(e), "Proteins do not have complements!")
                continue
            str1 = str(example1)
            # This only does the unambiguous cases
            if any(("U" in str1, "u" in str1,
                    example1.alphabet == generic_rna)):
                mapping = maketrans("ACGUacgu", "UGCAugca")
            elif any(("T" in str1, "t" in str1,
                      example1.alphabet == generic_dna,
                      example1.alphabet == generic_nucleotide)):
                mapping = maketrans("ACGTacgt", "TGCAtgca")
            elif "A" not in str1 and "a" not in str1:
                mapping = maketrans("CGcg", "GCgc")
            else:
                # TODO - look at alphabet?
                raise ValueError(example1)
            self.assertEqual(str1.translate(mapping), str(comp))
            self.assertEqual(comp.alphabet, example1.alphabet)

    def test_the_reverse_complement(self):
        """Check obj.reverse_complement() method."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            try:
                comp = example1.reverse_complement()
            except ValueError as e:
                self.assertEqual(str(e), "Proteins do not have complements!")
                continue
            str1 = str(example1)
            # This only does the unambiguous cases
            if any(("U" in str1, "u" in str1,
                    example1.alphabet == generic_rna)):
                mapping = maketrans("ACGUacgu", "UGCAugca")
            elif any(("T" in str1, "t" in str1,
                      example1.alphabet == generic_dna,
                      example1.alphabet == generic_nucleotide)):
                mapping = maketrans("ACGTacgt", "TGCAtgca")
            elif "A" not in str1 and "a" not in str1:
                mapping = maketrans("CGcg", "GCgc")
            else:
                # TODO - look at alphabet?
                continue
            self.assertEqual(str1.translate(mapping)[::-1], str(comp))
            self.assertEqual(comp.alphabet, example1.alphabet)

    def test_the_transcription(self):
        """Check obj.transcribe() method."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            try:
                tran = example1.transcribe()
            except ValueError as e:
                if str(e) == "Proteins cannot be transcribed!":
                    continue
                if str(e) == "RNA cannot be transcribed!":
                    continue
                raise e
            str1 = str(example1)
            if len(str1) % 3 != 0:
                # TODO - Check for or silence the expected warning?
                continue
            self.assertEqual(
                str1.replace("T", "U").replace("t", "u"), str(tran))
            # based on limited examples
            self.assertEqual(tran.alphabet, generic_rna)

    def test_the_back_transcription(self):
        """Check obj.back_transcribe() method."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            try:
                tran = example1.back_transcribe()
            except ValueError as e:
                if str(e) == "Proteins cannot be back transcribed!":
                    continue
                if str(e) == "DNA cannot be back transcribed!":
                    continue
                raise e
            str1 = str(example1)
            self.assertEqual(
                str1.replace("U", "T").replace("u", "t"), str(tran))
            # based on limited examples
            self.assertEqual(tran.alphabet, generic_dna)

    def test_the_translate(self):
        """Check obj.translate() method."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            if len(example1) % 3 != 0:
                # TODO - Check for or silence the expected warning?
                continue
            try:
                tran = example1.translate()
            except ValueError as e:
                if str(e) == "Proteins cannot be translated!":
                    continue
                raise e
            # This is based on the limited example not having stop codons:
            self.assertIn(
                tran.alphabet,
                [extended_protein, protein, generic_protein],
                "Unexpected alphabet %r" % tran.alphabet)
            # TODO - check the actual translation, and all the optional args

    def test_the_translation_of_stops(self):
        """Check obj.translate() method with stop codons."""
        misc_stops = "TAATAGTGAAGAAGG"
        for nuc in [
            Seq(misc_stops),
            Seq(misc_stops, generic_nucleotide),
            Seq(misc_stops, generic_dna),
            Seq(misc_stops, unambiguous_dna),
        ]:
            self.assertEqual("***RR", str(nuc.translate()))
            self.assertEqual("***RR", str(nuc.translate(1)))
            self.assertEqual("***RR", str(nuc.translate("SGC0")))
            self.assertEqual("**W**", str(nuc.translate(table=2)))
            self.assertEqual("**WRR",
                             str(nuc.translate(table='Yeast Mitochondrial')))
            self.assertEqual("**WSS", str(nuc.translate(table=5)))
            self.assertEqual("**WSS", str(nuc.translate(table=9)))
            self.assertEqual("**CRR",
                             str(nuc.translate(table='Euplotid Nuclear')))
            self.assertEqual("***RR", str(nuc.translate(table=11)))
            self.assertEqual("***RR", str(nuc.translate(table='11')))
            self.assertEqual("***RR", str(nuc.translate(table='Bacterial')))
            self.assertEqual("**GRR", str(nuc.translate(table=25)))
            self.assertEqual("", str(nuc.translate(to_stop=True)))
            self.assertEqual("O*ORR", str(nuc.translate(table=special_table)))
            self.assertEqual(
                "*QWRR",
                str(nuc.translate(table=Chilodonella_uncinata_table)))
            # These test the Bio.Seq.translate() function - move these?:
            self.assertEqual(
                "*QWRR",
                translate(str(nuc), table=Chilodonella_uncinata_table))
            self.assertEqual("O*ORR", translate(str(nuc), table=special_table))
            self.assertEqual("", translate(str(nuc), to_stop=True))
            self.assertEqual("***RR", translate(str(nuc), table='Bacterial'))
            self.assertEqual("***RR", translate(str(nuc), table='11'))
            self.assertEqual("***RR", translate(str(nuc), table=11))
            self.assertEqual("**W**", translate(str(nuc), table=2))
        self.assertEqual(str(Seq("TAT").translate()), "Y")
        self.assertEqual(str(Seq("TAR").translate()), "*")
        self.assertEqual(str(Seq("TAN").translate()), "X")
        self.assertEqual(str(Seq("NNN").translate()), "X")
        self.assertEqual(str(Seq("TAt").translate()), "Y")
        self.assertEqual(str(Seq("TaR").translate()), "*")
        self.assertEqual(str(Seq("TaN").translate()), "X")
        self.assertEqual(str(Seq("nnN").translate()), "X")
        self.assertEqual(str(Seq("tat").translate()), "Y")
        self.assertEqual(str(Seq("tar").translate()), "*")
        self.assertEqual(str(Seq("tan").translate()), "X")
        self.assertEqual(str(Seq("nnn").translate()), "X")

    def test_the_translation_of_invalid_codons(self):
        """Check obj.translate() method with invalid codons."""
        for codon in ["TA?", "N-N", "AC_", "Ac_"]:
            for nuc in [
                Seq(codon),
                Seq(codon, generic_nucleotide),
                Seq(codon, generic_dna),
                Seq(codon, unambiguous_dna),
            ]:
                try:
                    print(nuc.translate())
                    self.fail("Translating %s should fail" % codon)
                except TranslationError:
                    pass

    def test_the_translation_of_ambig_codons(self):
        """Check obj.translate() method with ambiguous codons."""
        for letters, ambig_values in [
            (ambiguous_dna.letters, ambiguous_dna_values),
            (ambiguous_rna.letters, ambiguous_rna_values),
        ]:
            ambig = set(letters)
            for c1 in ambig:
                for c2 in ambig:
                    for c3 in ambig:
                        # Translate every unambiguous codon the ambiguous
                        # one could stand for.
                        values = set(
                            str(Seq(a + b + c).translate())
                            for a in ambig_values[c1]
                            for b in ambig_values[c2]
                            for c in ambig_values[c3])
                        t = str(Seq(c1 + c2 + c3).translate())
                        if t == "*":
                            self.assertEqual(values, set("*"))
                        elif t == "X":
                            self.assertTrue(
                                len(values) > 1,
                                "translate('%s') = '%s' not '%s'"
                                % (c1 + c2 + c3, t, ",".join(values)))
                        elif t == "Z":
                            self.assertEqual(values, set("EQ"))
                        elif t == "B":
                            self.assertEqual(values, set("DN"))
                        elif t == "J":
                            self.assertEqual(values, set("LI"))
                        else:
                            self.assertEqual(values, set(t))
                        # TODO - Use the Bio.Data.IUPACData module for the
                        # ambiguous protein mappings?

    def test_init_typeerror(self):
        """Check Seq __init__ gives TypeError exceptions."""
        # Only expect it to take strings and unicode - not Seq objects!
        self.assertRaises(TypeError, Seq, (1066))
        self.assertRaises(TypeError, Seq, (Seq("ACGT", generic_dna)))

    def test_MutableSeq_init_typeerror(self):
        """Check MutableSeq __init__ gives TypeError exceptions."""
        self.assertRaises(TypeError, MutableSeq, (Seq("A")))
        self.assertRaises(TypeError, MutableSeq, (UnknownSeq(1)))

    def test_join_Seq_ValueError(self):
        """Checks that a ValueError is thrown for all non-iterable types."""
        # No iterable types which contain non-accepted types either.
        spacer = Seq('NNNNN')
        self.assertRaises(ValueError, spacer.join, 5)
        self.assertRaises(ValueError, spacer.join, "ATG")
        self.assertRaises(ValueError, spacer.join, Seq("ATG"))
        self.assertRaises(ValueError, spacer.join, MutableSeq("ATG"))
        self.assertRaises(ValueError, spacer.join, ["ATG", "ATG", 5, "ATG"])

    def test_join_UnknownSeq_ValueError(self):
        """Checks that a ValueError is thrown for all non-iterable types."""
        # No iterable types which contain non-accepted types either.
        spacer = UnknownSeq(5, character="-")
        self.assertRaises(ValueError, spacer.join, 5)
        self.assertRaises(ValueError, spacer.join, "ATG")
        self.assertRaises(ValueError, spacer.join, Seq("ATG"))
        self.assertRaises(ValueError, spacer.join, MutableSeq("ATG"))
        self.assertRaises(ValueError, spacer.join, ["ATG", "ATG", 5, "ATG"])

    def test_join_MutableSeq_ValueError(self):
        """Checks that a ValueError is thrown for all non-iterable types."""
        # No iterable types which contain non-accepted types either.
        spacer = MutableSeq("MMMMM")
        self.assertRaises(ValueError, spacer.join, 5)
        self.assertRaises(ValueError, spacer.join, "ATG")
        self.assertRaises(ValueError, spacer.join, Seq("ATG"))
        self.assertRaises(ValueError, spacer.join, MutableSeq("ATG"))
        self.assertRaises(ValueError, spacer.join, ["ATG", "ATG", 5, "ATG"])

    def test_join_Seq_TypeError(self):
        """Checks that a TypeError is thrown for incompatible alphabets."""
        spacer = Seq('NNNNN', generic_dna)
        self.assertRaises(
            TypeError, spacer.join,
            [Seq('NNNNN', generic_rna), Seq('NNNNN', generic_rna)])
        self.assertRaises(
            TypeError, spacer.join,
            [Seq('NNNNN', generic_protein), Seq('NNNNN', generic_protein)])

    def test_join_UnknownSeq_TypeError(self):
        """Checks that a TypeError is thrown for incompatible alphabets."""
        spacer = UnknownSeq(5, character="-", alphabet=generic_dna)
        self.assertRaises(TypeError, spacer.join, [
            UnknownSeq(5, character="-", alphabet=generic_rna),
            UnknownSeq(5, character="-", alphabet=generic_rna)
        ])
        self.assertRaises(TypeError, spacer.join, [
            Seq('NNNNN', generic_protein),
            UnknownSeq(5, character="-", alphabet=generic_protein)
        ])

    def test_join_MutableSeq_TypeError(self):
        """Checks that a TypeError is thrown for incompatible alphabets."""
        spacer = MutableSeq('NNNNN', generic_dna)
        self.assertRaises(TypeError, spacer.join, [
            MutableSeq('NNNNN', generic_rna),
            MutableSeq('NNNNN', generic_rna)
        ])
        self.assertRaises(TypeError, spacer.join, [
            Seq('NNNNN', generic_protein),
            MutableSeq('NNNNN', generic_protein)
        ])

    def test_join_Seq(self):
        """Checks if Seq join correctly concatenates sequence with the spacer."""
        # Only expect it to take Seq objects and/or strings in an iterable!

        spacer1 = Seq('', generic_dna)
        spacers = [
            spacer1,
            Seq('NNNNN', generic_dna),
            Seq('GGG', generic_nucleotide)
        ]
        example_strings = ["ATG", "ATG", "ATG", "ATG"]
        example_strings_seqs = ["ATG", "ATG", Seq("ATG", generic_dna), "ATG"]

        # strings with empty spacer
        str_concatenated = spacer1.join(example_strings)

        self.assertEqual(str(str_concatenated), "".join(example_strings))
        self.assertEqual(str_concatenated.alphabet, spacer1.alphabet)

        for spacer in spacers:
            seq_concatenated = spacer.join(example_strings_seqs)
            self.assertEqual(str(seq_concatenated),
                             str(spacer).join(example_strings))
            self.assertEqual(seq_concatenated.alphabet, spacer.alphabet)

    def test_join_Seq_with_file(self):
        """Checks if Seq join correctly concatenates sequence from a file with the spacer."""
        filename = 'Fasta/f003'
        seqlist = [record.seq for record in SeqIO.parse(filename, 'fasta')]
        seqlist_as_strings = [str(_) for _ in seqlist]

        spacer = Seq('NNNNN')
        spacer1 = Seq('')
        # seq objects with spacer
        seq_concatenated = spacer.join(seqlist)
        # seq objects with empty spacer
        seq_concatenated1 = spacer1.join(seqlist)

        ref_data = str(spacer).join(seqlist_as_strings)
        ref_data1 = str(spacer1).join(seqlist_as_strings)

        self.assertEqual(str(seq_concatenated), ref_data)
        self.assertEqual(str(seq_concatenated1), ref_data1)
        # SeqRecord objects (rather than their sequences) are not accepted
        with self.assertRaises(TypeError):
            spacer.join(SeqIO.parse(filename, 'fasta'))

    def test_join_UnknownSeq(self):
        """Checks if UnknownSeq join correctly concatenates sequence with the spacer."""
        # Only expect it to take Seq objects and/or strings in an iterable!

        spacer1 = UnknownSeq(0, character="-", alphabet=generic_dna)
        spacers = [
            spacer1,
            UnknownSeq(5, character="-", alphabet=generic_dna),
            UnknownSeq(5, character="-", alphabet=generic_nucleotide)
        ]

        example_strings = ["ATG", "ATG", "ATG", "ATG"]
        example_strings_seqs = ["ATG", "ATG", Seq("ATG", generic_dna), "ATG"]

        # strings with empty spacer
        str_concatenated = spacer1.join(example_strings)

        self.assertEqual(str(str_concatenated), "".join(example_strings))
        self.assertEqual(str_concatenated.alphabet, spacer1.alphabet)

        for spacer in spacers:
            seq_concatenated = spacer.join(example_strings_seqs)
            self.assertEqual(str(seq_concatenated),
                             str(spacer).join(example_strings))
            self.assertEqual(seq_concatenated.alphabet, spacer.alphabet)

    def test_join_UnknownSeq_with_file(self):
        """Checks if UnknownSeq join correctly concatenates sequence from a file with the spacer."""
        filename = 'Fasta/f003'
        seqlist = [record.seq for record in SeqIO.parse(filename, 'fasta')]
        seqlist_as_strings = [str(_) for _ in seqlist]

        spacer = UnknownSeq(0, character="-", alphabet=generic_dna)
        spacer1 = UnknownSeq(5, character="-", alphabet=generic_dna)
        # seq objects with empty (zero length) spacer
        seq_concatenated = spacer.join(seqlist)
        # seq objects with five character spacer
        seq_concatenated1 = spacer1.join(seqlist)

        ref_data = str(spacer).join(seqlist_as_strings)
        ref_data1 = str(spacer1).join(seqlist_as_strings)

        self.assertEqual(str(seq_concatenated), ref_data)
        self.assertEqual(str(seq_concatenated1), ref_data1)
        # SeqRecord objects (rather than their sequences) are not accepted
        with self.assertRaises(TypeError):
            spacer.join(SeqIO.parse(filename, 'fasta'))

    def test_join_MutableSeq(self):
        """Checks if MutableSeq join correctly concatenates sequence with the spacer."""
        # Only expect it to take Seq objects and/or strings in an iterable!

        spacer1 = MutableSeq('', generic_dna)
        spacers = [
            spacer1,
            MutableSeq('NNNNN', generic_dna),
            MutableSeq('GGG', generic_nucleotide)
        ]
        example_strings = ["ATG", "ATG", "ATG", "ATG"]
        example_strings_seqs = ["ATG", "ATG", Seq("ATG", generic_dna), "ATG"]

        # strings with empty spacer
        str_concatenated = spacer1.join(example_strings)

        self.assertEqual(str(str_concatenated), "".join(example_strings))
        self.assertEqual(str_concatenated.alphabet, spacer1.alphabet)

        for spacer in spacers:
            seq_concatenated = spacer.join(example_strings_seqs)
            self.assertEqual(str(seq_concatenated),
                             str(spacer).join(example_strings))
            self.assertEqual(seq_concatenated.alphabet, spacer.alphabet)

    def test_join_MutableSeq_with_file(self):
        """Checks if MutableSeq join correctly concatenates sequence from a file with the spacer."""
        filename = 'Fasta/f003'
        seqlist = [record.seq for record in SeqIO.parse(filename, 'fasta')]
        seqlist_as_strings = [str(_) for _ in seqlist]

        spacer = MutableSeq('NNNNN')
        spacer1 = MutableSeq('')
        # seq objects with spacer
        seq_concatenated = spacer.join(seqlist)
        # seq objects with empty spacer
        seq_concatenated1 = spacer1.join(seqlist)

        ref_data = str(spacer).join(seqlist_as_strings)
        ref_data1 = str(spacer1).join(seqlist_as_strings)

        self.assertEqual(str(seq_concatenated), ref_data)
        self.assertEqual(str(seq_concatenated1), ref_data1)
        # SeqRecord objects (rather than their sequences) are not accepted
        with self.assertRaises(TypeError):
            spacer.join(SeqIO.parse(filename, 'fasta'))
#manipulation des tables pour la traduction from Bio.Data import CodonTable std_table = CodonTable.unambiguous_dna_by_name["Standard"] bact_table = CodonTable.unambiguous_dna_by_name["Bacterial"] bact_table.start_codons bact_table.stop_codons #pour comparer séquences (attention à l'alphabet) str(bli) == str(blu) #on peut faire des séquences mutables, cf tuto #pour faire des séquences inconnues, avec des N pour nucléotides et X pour les protéines from Bio.Seq import UnknownSeq unk_dna = UnknownSeq(20, alphabet=IUPAC.ambiguous_dna) #SeqRecord from Bio.SeqRecord import SeqRecord help(SeqRecord) #pour voir les différents champs SeqRecord(bli) from Bio import SeqIO machin = SeqIO.read("hao.fasta", "fasta") #pour fichier avec une seule séquence print machin print machin.format("fasta") #mêmes types de choses existent pour les .gnk (format GeneBank) for seq_record in SeqIO.parse("nosZ.fasta", "fasta"): print seq_record.id print seq_record.seq
def concatenate(alignments):
    """
    Concatenates a list of multiple sequence alignment objects.

    The alignments are concatenated based on their label, i.e. the
    sequences from the different alignments which have the same id/labels
    will become a single sequence. The order is preserved: rows appear in
    the order in which their labels are first encountered while walking
    the alignments from first to last.

    If any sequences are missing in one or several alignments, these parts
    are padded with unknown data (:py:class:`Bio.Seq.UnknownSeq`).

    :param alignments: the list of alignments objects, i.e.
        list(:py:class:`Bio.Align.MultipleSeqAlignment`). Items that
        ``identify_input`` classifies as ``FILENAME`` are read from disk
        as FASTA alignments first.
    :returns: a single :py:class:`Bio.Align.MultipleSeqAlignment`; an
        empty alignment when ``alignments`` is empty.

    Example::

        >>> sequences = {'aln1': {'seq1': 'acgtca',
        ...                       'seq2': 'acgtt-',
        ...                       'seq3': 'ac-ta-'},
        ...              'aln2': {'seq2': 'ttg-cta',
        ...                       'seq3': 'tcgacta',
        ...                       'seq4': 'ttgacta'}}
        >>> alignments = [MultipleSeqAlignment([SeqRecord(Seq(sequence,
        ...                    alphabet=IUPAC.extended_dna), id=key)
        ...      for (key, sequence) in sequences[aln].items()])
        ...               for aln in ('aln1', 'aln2')]
        >>> con_alignment = concatenate(alignments)
        >>> con_alignment.sort()
        >>> print(con_alignment)
        ExtendedIUPACDNA() alignment with 4 rows and 13 columns
        acgtcaNNNNNNN seq1
        acgtt-ttg-cta seq2
        ac-ta-tcgacta seq3
        NNNNNNttgacta seq4

    :note: Limitations: any annotations in the sub-alignments are lost in
        the concatenated alignment.
    """
    # Each item is either a filename of a FASTA alignment or an alignment
    # object; anything not identified as a filename is used as given.
    loaded = [
        AlignIO.read(item, "fasta")
        if identify_input(item).name == 'FILENAME' else item
        for item in alignments
    ]

    # Nothing to concatenate: return an empty alignment instead of failing
    # on the alphabet lookup below.
    if not loaded:
        return MultipleSeqAlignment([])

    # Collect the labels (sequence ids) in first-seen order. A plain set
    # would make the row order of the result depend on hash iteration order.
    labels = []
    seen = set()
    for aln in loaded:
        for rec in aln:
            if rec.id not in seen:
                seen.add(rec.id)
                labels.append(rec.id)

    # Assume all alignments have same alphabet
    alphabet = loaded[0]._alphabet

    # One list of sequence chunks per label, one chunk per alignment.
    pieces = dict((label, []) for label in labels)
    for aln in loaded:
        length = aln.get_alignment_length()
        present = set(rec.id for rec in aln)
        missing = [label for label in labels if label not in present]
        if missing:
            # The padding is identical for every label missing from this
            # alignment, so build its string form once.
            padding = str(UnknownSeq(length, alphabet=alphabet))
            for label in missing:
                pieces[label].append(padding)
        for rec in aln:
            pieces[rec.id].append(str(rec.seq))

    # Stitch all the substrings together using join (most efficient way),
    # and build the Biopython data structures Seq, SeqRecord and
    # MultipleSeqAlignment.
    return MultipleSeqAlignment(
        SeqRecord(Seq(''.join(pieces[label]), alphabet=alphabet), id=label)
        for label in labels)