def _prepResults(self):
    """Turn the parsed hit data into alignment objects on ``self.outList``.

    If no hits were recorded and ``self.state['eof']`` is false, one empty
    ``MultipleSeqAlignment`` is queued.  Then, for every domain of every hit
    in ``self.hitInfo``, a two-row ``HMMERAlign`` (query, target) is queued.
    """
    no_hits = len(self.hitInfo) == 0
    if no_hits and not self.state['eof']:
        self.outList.append(MultipleSeqAlignment([], self.alphabet))

    for hit in self.hitInfo:
        for domain in self.hitRecord[hit]:
            # The aligned columns are stored as fragments; glue them together.
            query_text = "".join(self.alignMap[hit][domain]['query'])
            target_text = "".join(self.alignMap[hit][domain]['target'])

            query_row = SeqRecord(
                Seq(query_text, self.alphabet),
                id=self.state['queryName'],
                description=self.state.get('desc', ""),
                annotations={},
            )
            target_row = SeqRecord(
                Seq(target_text, self.alphabet),
                id=hit,
                annotations={},
            )

            pairwise = HMMERAlign([query_row, target_row], self.alphabet)
            # The per-domain record dict is attached itself (not a copy), so
            # the two keys added below also appear in ``self.hitRecord``.
            pairwise._annotations = self.hitRecord[hit][domain]
            pairwise._annotations['seqName'] = self.state['queryName']
            pairwise._annotations['hmmName'] = hit
            self.outList.append(pairwise)
def __next__(self):
    """Parse the next Stockholm alignment from ``self.handle``.

    Reads one ``# STOCKHOLM 1.0`` block.  As a side effect the parsed data is
    stored on the instance as ``self.ids``, ``self.sequences``,
    ``self.seq_annotation`` (#=GS) and ``self.seq_col_annotation`` (#=GR).

    Returns:
        A ``MultipleSeqAlignment`` with one ``SeqRecord`` per identifier; the
        #=GR data is attached as ``alignment._annotations``.

    Raises:
        StopIteration: at end of file, or when the block has no sequences.
        ValueError: missing header, a sequence line that cannot be split into
            identifier and sequence, an unexpected record count, or
            sequences of differing lengths.
    """
    handle = self.handle

    if self._header is None:
        line = handle.readline()
    else:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = self._header
        self._header = None

    if not line:
        # Empty file - just give up.
        raise StopIteration
    if line.strip() != '# STOCKHOLM 1.0':
        raise ValueError("Did not find STOCKHOLM header")

    # Note: If this file follows the PFAM conventions, there should be
    # a line containing the number of sequences, e.g. "#=GF SQ 67"
    # We do not check for this - perhaps we should, and verify that
    # if present it agrees with our parsing.
    seqs = {}
    ids = OrderedDict()  # Really only need an OrderedSet, but python lacks this
    gs = {}
    gr = {}
    gf = {}  # Parsed (which validates the line) but not attached to the result
    passed_end_alignment = False
    while True:
        line = handle.readline()
        if not line:
            break  # end of file
        line = line.strip()  # remove trailing \n
        if line == '# STOCKHOLM 1.0':
            # Start of the next alignment; keep it for the following call.
            self._header = line
            break
        elif line == "//":
            # The "//" line indicates the end of the alignment.
            # There may still be more meta-data
            passed_end_alignment = True
        elif line == "":
            # blank line, ignore
            pass
        elif line[0] != "#":
            # Sequence
            # Format: "<seqname> <sequence>"
            assert not passed_end_alignment
            parts = [x.strip() for x in line.split(" ", 1)]
            if len(parts) != 2:
                # This might be someone attempting to store a zero length sequence?
                raise ValueError("Could not split line into identifier "
                                 + "and sequence:\n" + line)
            seq_id, seq = parts
            ids.setdefault(seq_id, True)
            # Interleaved files repeat the identifier; concatenate the pieces.
            seqs[seq_id] = seqs.get(seq_id, '') + seq.replace(".", "-")
        elif len(line) >= 5:
            # Comment line or meta-data
            if line[:5] == "#=GF ":
                # Generic per-File annotation, free text
                # Format: #=GF <feature> <free text>
                feature, text = line[5:].strip().split(None, 1)
                # Each feature key could be used more than once,
                # so store the entries as a list of strings.
                gf.setdefault(feature, []).append(text)
            elif line[:5] == '#=GC ':
                # Generic per-Column annotation, exactly 1 char per column
                # Format: "#=GC <feature> <exactly 1 char per column>"
                pass
            elif line[:5] == '#=GS ':
                # Generic per-Sequence annotation, free text
                # Format: "#=GS <seqname> <feature> <free text>"
                fields = line[5:].strip().split(None, 2)
                if len(fields) == 2:
                    # The free text can be empty; record it as "" instead of
                    # failing on the three-way unpack below.
                    fields.append("")
                seq_id, feature, text = fields
                gs.setdefault(seq_id, {}).setdefault(feature, []).append(text)
            elif line[:5] == "#=GR ":
                # Generic per-Sequence AND per-Column markup
                # Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
                seq_id, feature, text = line[5:].strip().split(None, 2)
                per_seq = gr.setdefault(seq_id, {})
                # append to any previous entry
                per_seq[feature] = per_seq.get(feature, "") + text.strip()
                # TODO - Should we check the length matches the alignment length?
                #        For interlaced sequences the GR data can be split over
                #        multiple lines
        # Next line...

    assert len(seqs) <= len(ids)

    self.ids = ids.keys()
    self.sequences = seqs
    self.seq_annotation = gs
    self.seq_col_annotation = gr

    if ids and seqs:
        if self.records_per_alignment is not None \
                and self.records_per_alignment != len(ids):
            raise ValueError(
                "Found %i records in this alignment, told to expect %i"
                % (len(ids), self.records_per_alignment))

        alignment_length = len(list(seqs.values())[0])
        records = []  # Alignment obj will put them all in a list anyway
        for seq_id in ids:
            seq = seqs[seq_id]
            if alignment_length != len(seq):
                raise ValueError(
                    "Sequences have different lengths, or repeated identifier"
                )
            name, start, end = self._identifier_split(seq_id)
            record = SeqRecord(Seq(seq, self.alphabet),
                               id=seq_id, name=name, description=seq_id,
                               annotations={"accession": name})
            # Accession will be overridden by _populate_meta_data if an explicit
            # accession is provided:
            record.annotations["accession"] = name

            if start is not None:
                record.annotations["start"] = start
            if end is not None:
                record.annotations["end"] = end

            self._populate_meta_data(seq_id, record)
            records.append(record)
        alignment = MultipleSeqAlignment(records, self.alphabet)

        # TODO - Introduce an annotated alignment class?
        # For now, store the annotation a new private property:
        alignment._annotations = gr

        return alignment
    else:
        raise StopIteration
def __next__(self):
    """Parse the next Stockholm alignment from ``self.handle``.

    Reads one ``# STOCKHOLM 1.0`` block.  As a side effect the parsed data is
    stored on the instance as ``self.ids`` (a list, in file order),
    ``self.sequences``, ``self.seq_annotation`` (#=GS) and
    ``self.seq_col_annotation`` (#=GR).

    Returns:
        A ``MultipleSeqAlignment`` with one ``SeqRecord`` per identifier; the
        #=GR data is attached as ``alignment._annotations``.

    Raises:
        StopIteration: at end of file, or when the block has no sequences.
        ValueError: missing header, a sequence line that cannot be split into
            identifier and sequence, an unexpected record count, or
            sequences of differing lengths.
    """
    handle = self.handle

    if self._header is None:
        line = handle.readline()
    else:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = self._header
        self._header = None

    if not line:
        # Empty file - just give up.
        raise StopIteration
    if line.strip() != '# STOCKHOLM 1.0':
        raise ValueError("Did not find STOCKHOLM header")

    # Note: If this file follows the PFAM conventions, there should be
    # a line containing the number of sequences, e.g. "#=GF SQ 67"
    # We do not check for this - perhaps we should, and verify that
    # if present it agrees with our parsing.
    seqs = {}
    ids = []
    gs = {}
    gr = {}
    gf = {}  # Parsed (which validates the line) but not attached to the result
    passed_end_alignment = False
    while True:
        line = handle.readline()
        if not line:
            break  # end of file
        line = line.strip()  # remove trailing \n
        if line == '# STOCKHOLM 1.0':
            # Start of the next alignment; keep it for the following call.
            self._header = line
            break
        elif line == "//":
            # The "//" line indicates the end of the alignment.
            # There may still be more meta-data
            passed_end_alignment = True
        elif line == "":
            # blank line, ignore
            pass
        elif line[0] != "#":
            # Sequence
            # Format: "<seqname> <sequence>"
            assert not passed_end_alignment
            parts = [x.strip() for x in line.split(" ", 1)]
            if len(parts) != 2:
                # This might be someone attempting to store a zero length sequence?
                raise ValueError("Could not split line into identifier "
                                 + "and sequence:\n" + line)
            seq_id, seq = parts
            # ``seqs`` always has exactly the identifiers held in ``ids``, so
            # test membership on the dict (O(1)) rather than on the list.
            if seq_id not in seqs:
                ids.append(seq_id)
                seqs[seq_id] = ''
            seqs[seq_id] += seq.replace(".", "-")
        elif len(line) >= 5:
            # Comment line or meta-data
            if line[:5] == "#=GF ":
                # Generic per-File annotation, free text
                # Format: #=GF <feature> <free text>
                feature, text = line[5:].strip().split(None, 1)
                # Each feature key could be used more than once,
                # so store the entries as a list of strings.
                if feature not in gf:
                    gf[feature] = [text]
                else:
                    gf[feature].append(text)
            elif line[:5] == '#=GC ':
                # Generic per-Column annotation, exactly 1 char per column
                # Format: "#=GC <feature> <exactly 1 char per column>"
                pass
            elif line[:5] == '#=GS ':
                # Generic per-Sequence annotation, free text
                # Format: "#=GS <seqname> <feature> <free text>"
                fields = line[5:].strip().split(None, 2)
                if len(fields) == 2:
                    # The free text can be empty; record it as "" instead of
                    # failing on the three-way unpack below.
                    fields.append("")
                seq_id, feature, text = fields
                if seq_id not in gs:
                    gs[seq_id] = {}
                if feature not in gs[seq_id]:
                    gs[seq_id][feature] = [text]
                else:
                    gs[seq_id][feature].append(text)
            elif line[:5] == "#=GR ":
                # Generic per-Sequence AND per-Column markup
                # Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
                seq_id, feature, text = line[5:].strip().split(None, 2)
                if seq_id not in gr:
                    gr[seq_id] = {}
                if feature not in gr[seq_id]:
                    gr[seq_id][feature] = ""
                gr[seq_id][feature] += text.strip()  # append to any previous entry
                # TODO - Should we check the length matches the alignment length?
                #        For interlaced sequences the GR data can be split over
                #        multiple lines
        # Next line...

    assert len(seqs) <= len(ids)

    self.ids = ids
    self.sequences = seqs
    self.seq_annotation = gs
    self.seq_col_annotation = gr

    if ids and seqs:
        if self.records_per_alignment is not None \
                and self.records_per_alignment != len(ids):
            raise ValueError("Found %i records in this alignment, told to expect %i"
                             % (len(ids), self.records_per_alignment))

        alignment_length = len(list(seqs.values())[0])
        records = []  # Alignment obj will put them all in a list anyway
        for seq_id in ids:
            seq = seqs[seq_id]
            if alignment_length != len(seq):
                raise ValueError("Sequences have different lengths, or repeated identifier")
            name, start, end = self._identifier_split(seq_id)
            record = SeqRecord(Seq(seq, self.alphabet),
                               id=seq_id, name=name, description=seq_id,
                               annotations={"accession": name})
            # Accession will be overridden by _populate_meta_data if an explicit
            # accession is provided:
            record.annotations["accession"] = name

            if start is not None:
                record.annotations["start"] = start
            if end is not None:
                record.annotations["end"] = end

            self._populate_meta_data(seq_id, record)
            records.append(record)
        alignment = MultipleSeqAlignment(records, self.alphabet)

        # TODO - Introduce an annotated alignment class?
        # For now, store the annotation a new private property:
        alignment._annotations = gr

        return alignment
    else:
        raise StopIteration
def build_hsp():
    """Build the pairwise alignment for the current query/match pair.

    Works entirely on parser state from the enclosing scope (``query_seq``,
    ``query_tags``, ``match_seq``, ``match_tags``, ``header_tags``,
    ``align_tags``, ``global_tags``, ``alphabet``, ``handle`` ...).

    Returns:
        A two-row ``MultipleSeqAlignment`` (query first, match second) whose
        ``_annotations`` dict merges the query header tags and alignment tags.

    Raises:
        ValueError: if there is no tag data at all, or if the extracted query
            and match regions differ in length.
    """
    if not query_tags and not match_tags:
        raise ValueError("No data for query %r, match %r"
                         % (query_id, match_id))
    assert query_tags, query_tags
    assert match_tags, match_tags
    tool = global_tags.get("tool", "").upper()

    q = _extract_alignment_region(query_seq, query_tags)
    if tool == "TFASTX" and len(match_seq) == len(q):
        m = match_seq
        # Quick hack until I can work out how -, * and / characters
        # and the apparent mix of aa and bp coordinates works.
    else:
        m = _extract_alignment_region(match_seq, match_tags)
    if len(q) != len(m):
        message = (
            "Darn... amino acids vs nucleotide coordinates? "
            "tool: {0} query_seq: {1} query_tags: {2} "
            "{3} length: {4} match_seq: {5} match_tags: {6} "
            "{7} length: {8} handle.name: {9} "
        ).format(tool, query_seq, query_tags, q, len(q),
                 match_seq, match_tags, m, len(m), handle.name)
        raise ValueError(message)

    assert alphabet is not None
    alignment = MultipleSeqAlignment([], alphabet)

    # TODO - Introduce an annotated alignment class?
    # See also Bio/AlignIO/MafIO.py for same requirement.
    # For now, store the annotation a new private property:
    alignment._annotations = {}

    # Want to record both the query header tags, and the alignment tags.
    # Alignment tags are applied second, so they win on a key clash.
    alignment._annotations.update(header_tags)
    alignment._annotations.update(align_tags)

    def _add_row(text, tags, seq_id, role, descr):
        """Append one aligned sequence (query or match) to ``alignment``."""
        record = SeqRecord(
            Seq(text, alphabet),
            id=seq_id,
            name=role,
            description=descr,
            annotations={"original_length": int(tags["sq_len"])})
        # TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(tags["al_start"])
        record._al_stop = int(tags["al_stop"])
        # Appended before the alphabet is refined below, as the generic
        # alphabet is what the (empty) alignment was created with.
        alignment.append(record)

        # TODO - What if a specific alphabet has been requested?
        # TODO - Use an IUPAC alphabet?
        # TODO - Can FASTA output RNA?
        # This is still a very crude way of dealing with the alphabet:
        if alphabet == single_letter_alphabet and "sq_type" in tags:
            if tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in text:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

    _add_row(q, query_tags, query_id, "query", query_descr)
    _add_row(m, match_tags, match_id, "match", match_descr)

    return alignment
def MafIterator(handle, seq_count=None, alphabet=single_letter_alphabet):
    """Yield one MultipleSeqAlignment per alignment block of a MAF handle.

    ``handle`` is iterated line by line.  An "a" line opens a block, "s"
    lines add sequences to it, and a blank line (or the end of the input)
    closes it.  The key=value pairs of the "a" line are stored on the
    yielded alignment as ``_annotations``.  SeqRecord IDs are taken from the
    source field of each "s" line (generally species names).

    If ``seq_count`` is given, every block is asserted to hold exactly that
    many sequences.  Malformed lines raise ValueError.
    """
    in_a_bundle = False
    annotations = []
    records = []

    while True:
        # An exhausted handle is treated like a blank line, so the final
        # block is closed by the same code path as every other block.
        line = next(handle, "")

        if not in_a_bundle:
            if line.startswith("a"):
                # start a bundle of records
                in_a_bundle = True
                pairs = line.strip().split()[1:]
                if len(pairs) != line.count("="):
                    raise ValueError(
                        "Error parsing alignment - invalid key in 'a' line")
                annotations = dict(pair.split("=") for pair in pairs)
            elif line.startswith("#"):
                # ignore comments
                pass
            elif not line:
                break
            continue

        if line.startswith("s"):
            # add a SeqRecord to the bundle
            # s (literal), src (ID), start, size, strand, srcSize, text (sequence)
            fields = line.strip().split()
            if len(fields) != 7:
                raise ValueError(
                    "Error parsing alignment - 's' line must have 7 fields"
                )

            # convert MAF-style +/- strand to biopython-type 1/-1
            # (anything other than "-" is treated as the forward strand)
            strand = -1 if fields[4] == "-" else 1

            anno = {
                "start": int(fields[2]),
                "size": int(fields[3]),
                "strand": strand,
                "srcSize": int(fields[5]),
            }

            sequence = fields[6]
            # interpret a dot/period to mean same the first sequence
            if "." in sequence:
                if not records:
                    raise ValueError(
                        "Found dot/period in first sequence of alignment")
                reference = str(records[0].seq)
                sequence = "".join(
                    ref_char if char == "." else char
                    for char, ref_char in zip(sequence, reference)
                )

            records.append(
                SeqRecord(Seq(sequence, alphabet),
                          id=fields[1],
                          name=fields[1],
                          description="",
                          annotations=anno))
        elif line.startswith(("i", "e", "q")):
            # TODO: "i" - information about what is in the aligned species DNA
            #       before and after the immediately preceding "s" line
            # TODO: "e" - information about the size of the gap between the
            #       alignments that span the current block
            # TODO: "q" - quality of each aligned base for the species.
            #       Need to find documentation on this, looks like ASCII 0-9
            #       or gap?  Could be stored in each SeqRecord's
            #       .letter_annotations dictionary.
            pass
        elif not line.strip():
            # end a bundle of records
            if seq_count is not None:
                assert len(records) == seq_count

            alignment = MultipleSeqAlignment(records, alphabet)
            # TODO - Introduce an annotated alignment class?
            # See also Bio/AlignIO/FastaIO.py for same requirement.
            # For now, store the annotation a new private property:
            alignment._annotations = annotations

            yield alignment

            in_a_bundle = False
            annotations = []
            records = []
        else:
            raise ValueError(
                "Error parsing alignment - unexpected line:\n%s" % (line, ))
def build_hsp():
    """Assemble the query/match pair currently held by the parser.

    All inputs come from the enclosing scope.  The result is a
    ``MultipleSeqAlignment`` with the query row followed by the match row;
    header and alignment tags are merged into its ``_annotations`` dict.
    ValueError is raised when no tag data exists or when the two extracted
    regions have different lengths.
    """
    if not (query_tags or match_tags):
        raise ValueError("No data for query %r, match %r" % (query_id, match_id))
    assert query_tags, query_tags
    assert match_tags, match_tags
    evalue = align_tags.get("fa_expect")
    tool = global_tags.get("tool", "").upper()

    query_region = _extract_alignment_region(query_seq, query_tags)
    use_raw_match = tool in ["TFASTX"] and len(match_seq) == len(query_region)
    if use_raw_match:
        # Quick hack until I can work out how -, * and / characters
        # and the apparent mix of aa and bp coordinates works.
        match_region = match_seq
    else:
        match_region = _extract_alignment_region(match_seq, match_tags)

    if len(query_region) != len(match_region):
        template = (
            "Darn... amino acids vs nucleotide coordinates? "
            "tool: {0} query_seq: {1} query_tags: {2} "
            "{3} length: {4} match_seq: {5} match_tags: {6} "
            "{7} length: {8} handle.name: {9} "
        )
        raise ValueError(template.format(
            tool, query_seq, query_tags, query_region, len(query_region),
            match_seq, match_tags, match_region, len(match_region),
            handle.name))

    assert alphabet is not None
    alignment = MultipleSeqAlignment([], alphabet)

    # TODO - Introduce an annotated alignment class?
    # See also Bio/AlignIO/MafIO.py for same requirement.
    # For now, store the annotation a new private property.
    # Want to record both the query header tags, and the alignment tags.
    alignment._annotations = {}
    for source in (header_tags, align_tags):
        for key, value in source.items():
            alignment._annotations[key] = value

    # Query row first, then the match row; both are built the same way.
    rows = (
        (query_region, query_tags, query_id, "query", query_descr),
        (match_region, match_tags, match_id, "match", match_descr),
    )
    for text, tags, seq_id, role, descr in rows:
        record = SeqRecord(Seq(text, alphabet),
                           id=seq_id,
                           name=role,
                           description=descr,
                           annotations={"original_length": int(tags["sq_len"])})
        # TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(tags["al_start"])
        record._al_stop = int(tags["al_stop"])
        alignment.append(record)

        # TODO - What if a specific alphabet has been requested?
        # TODO - Use an IUPAC alphabet?
        # TODO - Can FASTA output RNA?
        # This is still a very crude way of dealing with the alphabet:
        if alphabet == single_letter_alphabet and "sq_type" in tags:
            seq_type = tags["sq_type"]
            if seq_type == "D":
                record.seq.alphabet = generic_dna
            elif seq_type == "p":
                record.seq.alphabet = generic_protein
        if "-" in text and not hasattr(record.seq.alphabet, "gap_char"):
            record.seq.alphabet = Gapped(record.seq.alphabet, "-")

    return alignment
def FastaM10Iterator(handle, alphabet=single_letter_alphabet):
    """Alignment iterator for the FASTA tool's pairwise alignment output.

    This is for reading the pairwise alignments output by Bill Pearson's
    FASTA program when called with the -m 10 command line option for machine
    readable output.  For more details about the FASTA tools, see the website
    http://fasta.bioch.virginia.edu/ and the paper:

         W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448

    This class is intended to be used via the Bio.AlignIO.parse() function
    by specifying the format as "fasta-m10" as shown in the following code:

        from Bio import AlignIO
        handle = ...
        for a in AlignIO.parse(handle, "fasta-m10"):
            assert len(a) == 2, "Should be pairwise!"
            print("Alignment length %i" % a.get_alignment_length())
            for record in a:
                print("%s %s %s" % (record.seq, record.name, record.id))

    Note that this is not a full blown parser for all the information
    in the FASTA output - for example, most of the header and all of the
    footer is ignored.  Also, the alignments are not batched according to
    the input queries.

    Also note that there can be up to about 30 letters of flanking region
    included in the raw FASTA output as contextual information.  This is NOT
    part of the alignment itself, and is not included in the resulting
    MultipleSeqAlignment objects returned.
    """
    if alphabet is None:
        alphabet = single_letter_alphabet

    state_PREAMBLE = -1
    state_NONE = 0
    state_QUERY_HEADER = 1
    state_ALIGN_HEADER = 2
    state_ALIGN_QUERY = 3
    state_ALIGN_MATCH = 4
    state_ALIGN_CONS = 5

    def build_hsp():
        """Build the two-row alignment for the current query/match pair.

        Raises ValueError if there is no tag data, or if the extracted query
        and match regions differ in length.
        """
        if not query_tags and not match_tags:
            raise ValueError("No data for query %r, match %r"
                             % (query_id, match_id))
        assert query_tags, query_tags
        assert match_tags, match_tags
        tool = global_tags.get("tool", "").upper()

        q = _extract_alignment_region(query_seq, query_tags)
        if tool == "TFASTX" and len(match_seq) == len(q):
            m = match_seq
            # Quick hack until I can work out how -, * and / characters
            # and the apparent mix of aa and bp coordinates works.
        else:
            m = _extract_alignment_region(match_seq, match_tags)
        if len(q) != len(m):
            # Report the mismatch in the exception rather than printing to
            # stdout, and do not rely on ``assert`` (stripped under -O).
            message = "\n".join([
                "Darn... amino acids vs nucleotide coordinates?",
                "tool: {0}",
                "query_seq: {1}",
                "query_tags: {2}",
                "{3} length: {4}",
                "match_seq: {5}",
                "match_tags: {6}",
                "{7} length: {8}",
                "handle.name: {9}",
            ]).format(tool, query_seq, query_tags, q, len(q),
                      match_seq, match_tags, m, len(m), handle.name)
            raise ValueError(message)

        assert alphabet is not None
        alignment = MultipleSeqAlignment([], alphabet)

        # TODO - Introduce an annotated alignment class?
        # For now, store the annotation a new private property:
        alignment._annotations = {}

        # Want to record both the query header tags, and the alignment tags.
        # ``items()`` works on both Python 2 and Python 3.
        for key, value in header_tags.items():
            alignment._annotations[key] = value
        for key, value in align_tags.items():
            alignment._annotations[key] = value

        # Query
        # =====
        record = SeqRecord(
            Seq(q, alphabet),
            id=query_id,
            name="query",
            description=query_descr,
            annotations={"original_length": int(query_tags["sq_len"])})
        # TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(query_tags["al_start"])
        record._al_stop = int(query_tags["al_stop"])
        alignment.append(record)

        # TODO - What if a specific alphabet has been requested?
        # TODO - Use an IUPAC alphabet?
        # TODO - Can FASTA output RNA?
        if alphabet == single_letter_alphabet and "sq_type" in query_tags:
            if query_tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif query_tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in q:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        # Match
        # =====
        record = SeqRecord(
            Seq(m, alphabet),
            id=match_id,
            name="match",
            description=match_descr,
            annotations={"original_length": int(match_tags["sq_len"])})
        # TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(match_tags["al_start"])
        record._al_stop = int(match_tags["al_stop"])
        alignment.append(record)

        # This is still a very crude way of dealing with the alphabet:
        if alphabet == single_letter_alphabet and "sq_type" in match_tags:
            if match_tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif match_tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in m:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        return alignment
def FastaM10Iterator(handle, alphabet=single_letter_alphabet):
    """Alignment iterator for the FASTA tool's pairwise alignment output.

    This is for reading the pairwise alignments output by Bill Pearson's
    FASTA program when called with the -m 10 command line option for machine
    readable output.  For more details about the FASTA tools, see the website
    http://fasta.bioch.virginia.edu/ and the paper:

         W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448

    This class is intended to be used via the Bio.AlignIO.parse() function
    by specifying the format as "fasta-m10" as shown in the following code:

        from Bio import AlignIO
        handle = ...
        for a in AlignIO.parse(handle, "fasta-m10"):
            assert len(a) == 2, "Should be pairwise!"
            print("Alignment length %i" % a.get_alignment_length())
            for record in a:
                print("%s %s %s" % (record.seq, record.name, record.id))

    Note that this is not a full blown parser for all the information
    in the FASTA output - for example, most of the header and all of the
    footer is ignored.  Also, the alignments are not batched according to
    the input queries.

    Also note that there can be up to about 30 letters of flanking region
    included in the raw FASTA output as contextual information.  This is NOT
    part of the alignment itself, and is not included in the resulting
    MultipleSeqAlignment objects returned.
    """
    if alphabet is None:
        alphabet = single_letter_alphabet

    state_PREAMBLE = -1
    state_NONE = 0
    state_QUERY_HEADER = 1
    state_ALIGN_HEADER = 2
    state_ALIGN_QUERY = 3
    state_ALIGN_MATCH = 4
    state_ALIGN_CONS = 5

    def build_hsp():
        """Build the two-row alignment for the current query/match pair.

        Raises ValueError if there is no tag data, or if the extracted query
        and match regions differ in length.
        """
        if not query_tags and not match_tags:
            raise ValueError("No data for query %r, match %r" % (query_id, match_id))
        assert query_tags, query_tags
        assert match_tags, match_tags
        tool = global_tags.get("tool", "").upper()

        q = _extract_alignment_region(query_seq, query_tags)
        if tool == "TFASTX" and len(match_seq) == len(q):
            m = match_seq
            # Quick hack until I can work out how -, * and / characters
            # and the apparent mix of aa and bp coordinates works.
        else:
            m = _extract_alignment_region(match_seq, match_tags)
        if len(q) != len(m):
            # Carry the diagnostics in the exception instead of printing
            # them; the check must also survive ``python -O``.
            details = [
                "Darn... amino acids vs nucleotide coordinates?",
                "tool: %s" % (tool,),
                "query_seq: %s" % (query_seq,),
                "query_tags: %s" % (query_tags,),
                "%s length: %i" % (q, len(q)),
                "match_seq: %s" % (match_seq,),
                "match_tags: %s" % (match_tags,),
                "%s length: %i" % (m, len(m)),
                "handle.name: %s" % (handle.name,),
            ]
            raise ValueError("\n".join(details))

        assert alphabet is not None
        alignment = MultipleSeqAlignment([], alphabet)

        # TODO - Introduce an annotated alignment class?
        # For now, store the annotation a new private property:
        alignment._annotations = {}

        # Want to record both the query header tags, and the alignment tags.
        # Alignment tags are applied second, so they win on a key clash.
        alignment._annotations.update(header_tags)
        alignment._annotations.update(align_tags)

        def _add_row(text, tags, seq_id, role, descr):
            """Append one aligned sequence (query or match) to the alignment."""
            record = SeqRecord(
                Seq(text, alphabet),
                id=seq_id,
                name=role,
                description=descr,
                annotations={"original_length": int(tags["sq_len"])},
            )
            # TODO - handle start/end coordinates properly. Short term hack for now:
            record._al_start = int(tags["al_start"])
            record._al_stop = int(tags["al_stop"])
            alignment.append(record)

            # TODO - What if a specific alphabet has been requested?
            # TODO - Use an IUPAC alphabet?
            # TODO - Can FASTA output RNA?
            # This is still a very crude way of dealing with the alphabet:
            if alphabet == single_letter_alphabet and "sq_type" in tags:
                if tags["sq_type"] == "D":
                    record.seq.alphabet = generic_dna
                elif tags["sq_type"] == "p":
                    record.seq.alphabet = generic_protein
            if "-" in text:
                if not hasattr(record.seq.alphabet, "gap_char"):
                    record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        _add_row(q, query_tags, query_id, "query", query_descr)
        _add_row(m, match_tags, match_id, "match", match_descr)

        return alignment
def __next__(self):
    """Read one Stockholm block from the handle and return it as an alignment.

    The parsed identifiers, sequences, #=GS and #=GR data are also stored on
    the instance (``ids``, ``sequences``, ``seq_annotation``,
    ``seq_col_annotation``).  #=GC lines become column annotations of the
    returned ``MultipleSeqAlignment``.

    Raises StopIteration at end of input or when the block holds no
    sequences, and ValueError for a missing header or inconsistent content.
    """
    header_text = "# STOCKHOLM 1.0"
    handle = self.handle

    if self._header is not None:
        # Header we saved from when we were parsing
        # the previous alignment.
        line, self._header = self._header, None
    else:
        line = handle.readline()

    if not line:
        # Empty file - just give up.
        raise StopIteration
    if line.strip() != header_text:
        raise ValueError("Did not find STOCKHOLM header")

    # Note: If this file follows the PFAM conventions, there should be
    # a line containing the number of sequences, e.g. "#=GF SQ 67"
    # We do not check for this - perhaps we should, and verify that
    # if present it agrees with our parsing.
    seqs = {}
    ids = {}  # Really only need an OrderedSet, but python lacks this
    gs = {}
    gr = {}
    gf = {}
    gc = {}
    passed_end_alignment = False

    while True:
        line = handle.readline()
        if not line:
            break  # end of file
        line = line.strip()  # remove trailing \n

        if line == header_text:
            self._header = line
            break
        if line == "//":
            # The "//" line indicates the end of the alignment.
            # There may still be more meta-data
            passed_end_alignment = True
            continue
        if line == "":
            # blank line, ignore
            continue

        if line[0] != "#":
            # Sequence
            # Format: "<seqname> <sequence>"
            assert not passed_end_alignment
            parts = [piece.strip() for piece in line.split(" ", 1)]
            if len(parts) != 2:
                # This might be someone attempting to store a zero length sequence?
                raise ValueError(
                    "Could not split line into identifier and sequence:\n" + line)
            seq_id, seq = parts
            ids.setdefault(seq_id, True)
            seqs[seq_id] = seqs.get(seq_id, "") + seq.replace(".", "-")
            continue

        # Comment line or meta-data.  Lines shorter than five characters
        # cannot match any of the markers below and are ignored.
        marker = line[:5]
        payload = line[5:].strip()

        if marker == "#=GF ":
            # Generic per-File annotation, free text
            # Format: #=GF <feature> <free text>
            feature, text = payload.split(None, 1)
            # Each feature key could be used more than once,
            # so store the entries as a list of strings.
            gf.setdefault(feature, []).append(text)
        elif marker == "#=GC ":
            # Generic per-Column annotation, exactly 1 char per column
            # Format: "#=GC <feature> <exactly 1 char per column>"
            feature, text = payload.split(None, 2)
            # append to any previous entry
            # Might be interleaved blocks, so can't check length yet
            gc[feature] = gc.get(feature, "") + text.strip()
        elif marker == "#=GS ":
            # Generic per-Sequence annotation, free text
            # Format: "#=GS <seqname> <feature> <free text>"
            fields = payload.split(None, 2)
            if len(fields) == 3:
                seq_id, feature, text = fields
            else:
                # Free text can sometimes be empty.
                # See https://github.com/biopython/biopython/issues/2982 for more details
                seq_id, feature = payload.split(None, 1)
                text = ""
            gs.setdefault(seq_id, {}).setdefault(feature, []).append(text)
        elif marker == "#=GR ":
            # Generic per-Sequence AND per-Column markup
            # Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
            seq_id, feature, text = payload.split(None, 2)
            markup = gr.setdefault(seq_id, {})
            # append to any previous entry
            # Might be interleaved blocks, so can't check length yet
            markup[feature] = markup.get(feature, "") + text.strip()
        # Next line...

    assert len(seqs) <= len(ids)

    self.ids = ids.keys()
    self.sequences = seqs
    self.seq_annotation = gs
    self.seq_col_annotation = gr

    if not (ids and seqs):
        raise StopIteration

    expected = self.records_per_alignment
    if expected is not None and expected != len(ids):
        raise ValueError(
            "Found %i records in this alignment, told to expect %i"
            % (len(ids), self.records_per_alignment))

    alignment_length = len(next(iter(seqs.values())))
    records = []  # Alignment obj will put them all in a list anyway
    for seq_id in ids:
        seq = seqs[seq_id]
        if len(seq) != alignment_length:
            raise ValueError(
                "Sequences have different lengths, or repeated identifier"
            )
        name, start, end = self._identifier_split(seq_id)
        record = SeqRecord(
            Seq(seq),
            id=seq_id,
            name=name,
            description=seq_id,
            annotations={"accession": name},
        )
        # Accession will be overridden by _populate_meta_data if an explicit
        # accession is provided:
        record.annotations["accession"] = name

        if start is not None:
            record.annotations["start"] = start
        if end is not None:
            record.annotations["end"] = end

        self._populate_meta_data(seq_id, record)
        records.append(record)

    # All interleaved pieces are joined now, so lengths can be verified.
    for feature, markup in gc.items():
        if len(markup) != alignment_length:
            raise ValueError("%s length %i, expected %i"
                             % (feature, len(markup), alignment_length))

    alignment = MultipleSeqAlignment(records)

    for feature, markup in sorted(gc.items()):
        if feature in self.pfam_gc_mapping:
            key = self.pfam_gc_mapping[feature]
        elif feature.endswith("_cons") and feature[:-5] in self.pfam_gr_mapping:
            key = self.pfam_gr_mapping[feature[:-5]]
        else:
            # Ignore it?
            key = "GC:" + feature
        alignment.column_annotations[key] = markup

    # TODO - Introduce an annotated alignment class?
    # For now, store the annotation a new private property:
    alignment._annotations = gr

    return alignment
def next(self):
    """Read from the handle to construct and return the next alignment.

    Returns the pairwise alignment of the query and match/library
    sequences as a MultipleSeqAlignment object containing two rows
    (the query record first, then the match record).  The query header
    tags and the per-alignment tags are merged into the alignment's
    private ``_annotations`` dictionary.

    Raises:
        StopIteration: at end of file, or once a line containing the
            ``>>><<<`` end marker is reached.
        ValueError: if the expected ``>>`` / ``>`` lines are missing, if
            the extracted query and match regions differ in length, or if
            that length disagrees with the reported ``sw_overlap`` tag.
    """
    handle = self.handle

    try:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        raise StopIteration

    if line.startswith("#"):
        # Skip the file header before the alignments.
        line = self._skip_file_header(line)
    while ">>>" in line and not line.startswith(">>>"):
        # Moved onto the next query sequence!
        self._query_descr = ""
        self._query_header_annotation = {}
        # Read in the query header
        line = self._parse_query_header(line)
        # Now should be some alignments, but if not we move onto the next query
    if not line:
        # End of file
        raise StopIteration
    if ">>><<<" in line:
        # Reached the end of the alignments, no need to read the footer...
        raise StopIteration

    # Should start >>... and not >>>...
    assert line[0:2] == ">>" and not line[2] == ">", line

    query_seq_parts, match_seq_parts = [], []
    query_annotation, match_annotation = {}, {}
    match_descr = ""
    alignment_annotation = {}

    # This should be followed by the target match ID line, then more tags.
    # e.g.
    #
    # >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
    # ; fa_frame: f
    # ; fa_initn: 52
    # ; fa_init1: 52
    # ; fa_opt: 70
    # ; fa_z-score: 105.5
    # ; fa_bits: 27.5
    # ; fa_expect: 0.082
    # ; sw_score: 70
    # ; sw_ident: 0.279
    # ; sw_sim: 0.651
    # ; sw_overlap: 43
    if (not line[0:2] == ">>") or line[0:3] == ">>>":
        raise ValueError("Expected target line starting '>>'")
    match_descr = line[2:].strip()
    # Handle the following "alignment hit" tagged data
    line = handle.readline()
    line = self._parse_tag_section(line, alignment_annotation)
    assert not line[0:2] == "; "

    # Then we have the alignment numbers and sequence for the query, e.g.
    #
    # >gi|10955265| ..
    # ; sq_len: 346
    # ; sq_offset: 1
    # ; sq_type: p
    # ; al_start: 197
    # ; al_stop: 238
    # ; al_display_start: 167
    # DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
    # QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
    # GEYFTENKPKYIIREIHQET
    if not (line[0] == ">" and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..'")
    assert self._query_descr.startswith(line[1:].split(None, 1)[0])

    # Handle the following "query alignment" tagged data
    line = handle.readline()
    line = self._parse_tag_section(line, query_annotation)
    assert not line[0:2] == "; "

    # Now should have the aligned query sequence (with leading flanking region)
    while not line[0] == ">":
        query_seq_parts.append(line.strip())
        line = handle.readline()

    # Handle the following "match alignment" data, e.g.
    #
    # >gi|152973545|ref|YP_001338596.1| ..
    # ; sq_len: 242
    # ; sq_type: p
    # ; al_start: 52
    # ; al_stop: 94
    # ; al_display_start: 22
    # IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
    # RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
    # QDFAFTRKMRREARQVEQSW

    # Match identifier
    if not (line[0] == ">" and line.strip().endswith("..")):
        raise ValueError(
            "Expected line starting '>' and ending '..', got '%s'" % repr(line))
    assert match_descr.startswith(line[1:].split(None, 1)[0])

    # Tagged data
    line = handle.readline()
    line = self._parse_tag_section(line, match_annotation)
    assert not line[0:2] == "; "

    # Now should have the aligned match sequence with flanking region...
    # After it, since FASTA 35.4.1, there can be a consensus section, e.g.
    #
    # ; al_cons:
    # .::. : :. ---. :: :. . : ..-:::-: :.: ..:...:
    # etc
    while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
        match_seq_parts.append(line.strip())
        line = handle.readline()
    if line[0:2] == "; ":
        assert line.strip() == "; al_cons:"
        align_consensus_parts = []
        line = handle.readline()
        while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
            align_consensus_parts.append(line.strip())
            line = handle.readline()
        # If we do anything with this in future, must remove any flanking region.
        align_consensus = "".join(align_consensus_parts)
        del align_consensus_parts
        assert not line[0:2] == "; "
    else:
        align_consensus = None
    assert (line[0] == ">" or ">>>" in line)
    # Save the line we stopped on; the next call starts from it.
    self._header = line

    # We built a list of strings and then joined them because
    # it is faster than appending to a string.
    query_seq = "".join(query_seq_parts)
    match_seq = "".join(match_seq_parts)
    del query_seq_parts, match_seq_parts
    # Note, query_seq and match_seq will usually be of different lengths, apparently
    # because in the m10 format leading gaps are added but not trailing gaps!

    # Remove the flanking regions,
    query_align_seq = self._extract_alignment_region(
        query_seq, query_annotation)
    match_align_seq = self._extract_alignment_region(
        match_seq, match_annotation)
    # How can we do this for the (optional) consensus?

    # The "sq_offset" values can be specified with the -X command line option.
    # They appear to just shift the origin used in the calculation of the coordinates.

    if len(query_align_seq) != len(match_align_seq):
        raise ValueError(
            "Problem parsing the alignment sequence coordinates, "
            "following should be the same length but are not:\n"
            "%s - len %i\n%s - len %i" % (query_align_seq,
                                          len(query_align_seq),
                                          match_align_seq,
                                          len(match_align_seq)))
    if "sw_overlap" in alignment_annotation:
        if int(alignment_annotation["sw_overlap"]) != len(query_align_seq):
            raise ValueError(
                "Specified sw_overlap = %s does not match expected value %i"
                % (alignment_annotation["sw_overlap"],
                   len(query_align_seq)))

    # TODO - Look at the "sq_type" to assign a sensible alphabet?
    alphabet = self.alphabet
    alignment = MultipleSeqAlignment([], alphabet)

    # TODO - Introduce an annotated alignment class?
    # For now, store the annotation a new private property:
    alignment._annotations = {}

    # Want to record both the query header tags, and the alignment tags.
    # dict.update() is used instead of the Python 2 only iteritems() so this
    # runs on Python 3 as well; alignment tags still override header tags.
    alignment._annotations.update(self._query_header_annotation)
    alignment._annotations.update(alignment_annotation)

    # Query
    # =====
    record = SeqRecord(
        Seq(query_align_seq, alphabet),
        id=self._query_descr.split(None, 1)[0].strip(","),
        name="query",
        description=self._query_descr,
        annotations={"original_length": int(query_annotation["sq_len"])})
    # TODO - handle start/end coordinates properly. Short term hack for now:
    record._al_start = int(query_annotation["al_start"])
    record._al_stop = int(query_annotation["al_stop"])
    alignment.append(record)

    # TODO - What if a specific alphabet has been requested?
    # TODO - Use an IUPAC alphabet?
    # TODO - Can FASTA output RNA?
    if alphabet == single_letter_alphabet and "sq_type" in query_annotation:
        if query_annotation["sq_type"] == "D":
            record.seq.alphabet = generic_dna
        elif query_annotation["sq_type"] == "p":
            record.seq.alphabet = generic_protein
    if "-" in query_align_seq:
        if not hasattr(record.seq.alphabet, "gap_char"):
            record.seq.alphabet = Gapped(record.seq.alphabet, "-")

    # Match
    # =====
    record = SeqRecord(
        Seq(match_align_seq, alphabet),
        id=match_descr.split(None, 1)[0].strip(","),
        name="match",
        description=match_descr,
        annotations={"original_length": int(match_annotation["sq_len"])})
    # TODO - handle start/end coordinates properly. Short term hack for now:
    record._al_start = int(match_annotation["al_start"])
    record._al_stop = int(match_annotation["al_stop"])
    alignment.append(record)

    # This is still a very crude way of dealing with the alphabet:
    if alphabet == single_letter_alphabet and "sq_type" in match_annotation:
        if match_annotation["sq_type"] == "D":
            record.seq.alphabet = generic_dna
        elif match_annotation["sq_type"] == "p":
            record.seq.alphabet = generic_protein
    if "-" in match_align_seq:
        if not hasattr(record.seq.alphabet, "gap_char"):
            record.seq.alphabet = Gapped(record.seq.alphabet, "-")

    return alignment
def next(self):
    """Read from the handle to construct and return the next alignment.

    Returns the pairwise alignment of the query and match/library
    sequences as a MultipleSeqAlignment object containing two rows
    (the query record first, then the match record).  The query header
    tags and the per-alignment tags are merged into the alignment's
    private ``_annotations`` dictionary.

    Returns None (rather than raising StopIteration) at end of file, or
    once a line containing the ``>>><<<`` end marker is reached.

    Raises:
        ValueError: if the expected ``>>`` / ``>`` lines are missing, if
            the extracted query and match regions differ in length, or if
            that length disagrees with the reported ``sw_overlap`` tag.
    """
    handle = self.handle

    try:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        return None

    if line.startswith("#"):
        # Skip the file header before the alignments.
        line = self._skip_file_header(line)
    while ">>>" in line and not line.startswith(">>>"):
        # Moved onto the next query sequence!
        self._query_descr = ""
        self._query_header_annotation = {}
        # Read in the query header
        line = self._parse_query_header(line)
        # Now should be some alignments, but if not we move onto the next query
    if not line:
        # End of file
        return None
    if ">>><<<" in line:
        # Reached the end of the alignments, no need to read the footer...
        return None

    # Should start >>... and not >>>...
    assert line[0:2] == ">>" and not line[2] == ">", line

    query_seq_parts, match_seq_parts = [], []
    query_annotation, match_annotation = {}, {}
    match_descr = ""
    alignment_annotation = {}

    # This should be followed by the target match ID line, then more tags.
    # e.g.
    #
    # >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
    # ; fa_frame: f
    # ; fa_initn: 52
    # ; fa_init1: 52
    # ; fa_opt: 70
    # ; fa_z-score: 105.5
    # ; fa_bits: 27.5
    # ; fa_expect: 0.082
    # ; sw_score: 70
    # ; sw_ident: 0.279
    # ; sw_sim: 0.651
    # ; sw_overlap: 43
    if (not line[0:2] == ">>") or line[0:3] == ">>>":
        raise ValueError("Expected target line starting '>>'")
    match_descr = line[2:].strip()
    # Handle the following "alignment hit" tagged data
    line = handle.readline()
    line = self._parse_tag_section(line, alignment_annotation)
    assert not line[0:2] == "; "

    # Then we have the alignment numbers and sequence for the query, e.g.
    #
    # >gi|10955265| ..
    # ; sq_len: 346
    # ; sq_offset: 1
    # ; sq_type: p
    # ; al_start: 197
    # ; al_stop: 238
    # ; al_display_start: 167
    # DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
    # QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
    # GEYFTENKPKYIIREIHQET
    if not (line[0] == ">" and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..'")
    assert self._query_descr.startswith(line[1:].split(None, 1)[0])

    # Handle the following "query alignment" tagged data
    line = handle.readline()
    line = self._parse_tag_section(line, query_annotation)
    assert not line[0:2] == "; "

    # Now should have the aligned query sequence (with leading flanking region)
    while not line[0] == ">":
        query_seq_parts.append(line.strip())
        line = handle.readline()

    # Handle the following "match alignment" data, e.g.
    #
    # >gi|152973545|ref|YP_001338596.1| ..
    # ; sq_len: 242
    # ; sq_type: p
    # ; al_start: 52
    # ; al_stop: 94
    # ; al_display_start: 22
    # IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
    # RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
    # QDFAFTRKMRREARQVEQSW

    # Match identifier
    if not (line[0] == ">" and line.strip().endswith("..")):
        raise ValueError(
            "Expected line starting '>' and ending '..', got '%s'" % repr(line))
    assert match_descr.startswith(line[1:].split(None, 1)[0])

    # Tagged data
    line = handle.readline()
    line = self._parse_tag_section(line, match_annotation)
    assert not line[0:2] == "; "

    # Now should have the aligned match sequence with flanking region...
    # After it, since FASTA 35.4.1, there can be a consensus section, e.g.
    #
    # ; al_cons:
    # .::. : :. ---. :: :. . : ..-:::-: :.: ..:...:
    # etc
    while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
        match_seq_parts.append(line.strip())
        line = handle.readline()
    if line[0:2] == "; ":
        assert line.strip() == "; al_cons:"
        align_consensus_parts = []
        line = handle.readline()
        while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
            align_consensus_parts.append(line.strip())
            line = handle.readline()
        # If we do anything with this in future, must remove any flanking region.
        align_consensus = "".join(align_consensus_parts)
        del align_consensus_parts
        assert not line[0:2] == "; "
    else:
        align_consensus = None
    assert (line[0] == ">" or ">>>" in line)
    # Save the line we stopped on; the next call starts from it.
    self._header = line

    # We built a list of strings and then joined them because
    # it is faster than appending to a string.
    query_seq = "".join(query_seq_parts)
    match_seq = "".join(match_seq_parts)
    del query_seq_parts, match_seq_parts
    # Note, query_seq and match_seq will usually be of different lengths, apparently
    # because in the m10 format leading gaps are added but not trailing gaps!

    # Remove the flanking regions,
    query_align_seq = self._extract_alignment_region(query_seq, query_annotation)
    match_align_seq = self._extract_alignment_region(match_seq, match_annotation)
    # How can we do this for the (optional) consensus?

    # The "sq_offset" values can be specified with the -X command line option.
    # They appear to just shift the origin used in the calculation of the coordinates.

    if len(query_align_seq) != len(match_align_seq):
        raise ValueError("Problem parsing the alignment sequence coordinates, "
                         "following should be the same length but are not:\n"
                         "%s - len %i\n%s - len %i" % (query_align_seq,
                                                       len(query_align_seq),
                                                       match_align_seq,
                                                       len(match_align_seq)))
    if "sw_overlap" in alignment_annotation:
        if int(alignment_annotation["sw_overlap"]) != len(query_align_seq):
            raise ValueError(
                "Specified sw_overlap = %s does not match expected value %i"
                % (alignment_annotation["sw_overlap"],
                   len(query_align_seq)))

    # TODO - Look at the "sq_type" to assign a sensible alphabet?
    alphabet = self.alphabet
    alignment = MultipleSeqAlignment([], alphabet)

    # TODO - Introduce an annotated alignment class?
    # For now, store the annotation a new private property:
    alignment._annotations = {}

    # Want to record both the query header tags, and the alignment tags.
    # dict.update() is used instead of the Python 2 only iteritems() so this
    # runs on Python 3 as well; alignment tags still override header tags.
    alignment._annotations.update(self._query_header_annotation)
    alignment._annotations.update(alignment_annotation)

    # Query
    # =====
    record = SeqRecord(
        Seq(query_align_seq, alphabet),
        id=self._query_descr.split(None, 1)[0].strip(","),
        name="query",
        description=self._query_descr,
        annotations={"original_length": int(query_annotation["sq_len"])})
    # TODO - handle start/end coordinates properly. Short term hack for now:
    record._al_start = int(query_annotation["al_start"])
    record._al_stop = int(query_annotation["al_stop"])
    alignment.append(record)

    # TODO - What if a specific alphabet has been requested?
    # TODO - Use an IUPAC alphabet?
    # TODO - Can FASTA output RNA?
    if alphabet == single_letter_alphabet and "sq_type" in query_annotation:
        if query_annotation["sq_type"] == "D":
            record.seq.alphabet = generic_dna
        elif query_annotation["sq_type"] == "p":
            record.seq.alphabet = generic_protein
    if "-" in query_align_seq:
        if not hasattr(record.seq.alphabet, "gap_char"):
            record.seq.alphabet = Gapped(record.seq.alphabet, "-")

    # Match
    # =====
    record = SeqRecord(
        Seq(match_align_seq, alphabet),
        id=match_descr.split(None, 1)[0].strip(","),
        name="match",
        description=match_descr,
        annotations={"original_length": int(match_annotation["sq_len"])})
    # TODO - handle start/end coordinates properly. Short term hack for now:
    record._al_start = int(match_annotation["al_start"])
    record._al_stop = int(match_annotation["al_stop"])
    alignment.append(record)

    # This is still a very crude way of dealing with the alphabet:
    if alphabet == single_letter_alphabet and "sq_type" in match_annotation:
        if match_annotation["sq_type"] == "D":
            record.seq.alphabet = generic_dna
        elif match_annotation["sq_type"] == "p":
            record.seq.alphabet = generic_protein
    if "-" in match_align_seq:
        if not hasattr(record.seq.alphabet, "gap_char"):
            record.seq.alphabet = Gapped(record.seq.alphabet, "-")

    return alignment
def MafIterator(handle, seq_count=None, alphabet=single_letter_alphabet):
    """Yield one MultipleSeqAlignment per alignment bundle of a MAF handle.

    ``handle`` is iterated line by line.  An "a" line opens a bundle and its
    ``key=value`` fields become the alignment's private ``_annotations``
    dictionary.  Each following "s" line adds a SeqRecord whose id and name
    are the source field (generally a species name).  A blank line, or the
    end of the handle, closes the bundle and yields the alignment.

    If ``seq_count`` is given, every bundle is asserted to hold exactly that
    many records.  ValueError is raised for malformed "a"/"s" lines, for a
    dot/period in the first sequence of a bundle, and for unrecognised lines
    inside a bundle.
    """
    # MAF-style +/- strand mapped to biopython-type 1/-1
    strand_codes = {"+": 1, "-": -1}
    # Line types that are recognised inside a bundle but not used:
    #   i - TODO: information about what is in the aligned species DNA
    #       before and after the immediately preceding "s" line
    #   e - TODO: information about the size of the gap between the
    #       alignments that span the current block
    #   q - TODO: quality of each aligned base for the species.  Need to find
    #       documentation on this, looks like ASCII 0-9 or gap?  Can then
    #       store in each SeqRecord's .letter_annotations dictionary, perhaps
    #       as the raw string or turned into integers / None for gap?
    #   # - comments (not sure whether they are in the maf specification)
    skipped_prefixes = ("i", "e", "q", "#")

    inside_bundle = False
    bundle_annotations = []
    bundle_records = []

    while True:
        # An exhausted handle is treated as one more empty line, so the
        # last bundle is closed by the same code as every other bundle.
        try:
            line = next(handle)
        except StopIteration:
            line = ""

        if not inside_bundle:
            if line.startswith("a"):
                # start a bundle of records
                inside_bundle = True
                pairs = line.strip().split()[1:]
                if len(pairs) != line.count("="):
                    raise ValueError("Error parsing alignment - invalid key in 'a' line")
                bundle_annotations = dict(pair.split("=") for pair in pairs)
            elif line.startswith("#"):
                # ignore comments
                pass
            elif not line:
                break
            continue

        if line.startswith("s"):
            # add a SeqRecord to the bundle
            fields = line.strip().split()
            if len(fields) != 7:
                raise ValueError("Error parsing alignment - 's' line must have 7 fields")

            # s (literal), src (ID), start, size, strand, srcSize, text (sequence)
            _, src, start, size, strand_symbol, src_size, text = fields

            # TODO: issue warning, set to 0? (unknown strand symbols map to 1)
            anno = {"start": int(start),
                    "size": int(size),
                    "strand": strand_codes.get(strand_symbol, 1),
                    "srcSize": int(src_size)}

            # interpret a dot/period to mean the same as the first sequence
            if "." in text:
                if not bundle_records:
                    raise ValueError("Found dot/period in first sequence of alignment")
                reference = str(bundle_records[0].seq)
                text = "".join(
                    ref_letter if letter == "." else letter
                    for letter, ref_letter in zip(text, reference)
                )

            bundle_records.append(SeqRecord(Seq(text, alphabet),
                                            id=src,
                                            name=src,
                                            description="",
                                            annotations=anno))
        elif line.startswith(skipped_prefixes):
            pass
        elif not line.strip():
            # end a bundle of records
            if seq_count is not None:
                assert len(bundle_records) == seq_count

            alignment = MultipleSeqAlignment(bundle_records, alphabet)
            # TODO - Introduce an annotated alignment class?
            # See also Bio/AlignIO/FastaIO.py for same requirement.
            # For now, store the annotation a new private property:
            alignment._annotations = bundle_annotations

            yield alignment

            inside_bundle = False
            bundle_annotations = []
            bundle_records = []
        else:
            raise ValueError("Error parsing alignment - unexpected line:\n%s" % (line,))
def MafIterator(handle, seq_count = None, alphabet = single_letter_alphabet):
    """Iterate over a MAF file-like handle, yielding MultipleSeqAlignment objects.

    An "a" line opens a bundle and its ``key=value`` fields become the
    alignment's private ``_annotations`` dictionary.  Each following "s"
    line adds a SeqRecord; SeqRecord IDs generally correspond to species
    names.  A blank line, or the end of the handle, closes the bundle and
    yields the alignment.  "e", "i" and "q" lines are accepted but ignored.

    If ``seq_count`` is given, every bundle is asserted to hold exactly that
    many records.

    Raises:
        ValueError: for an "s" line without 7 fields, a dot/period in the
            first sequence of a bundle, an invalid "a" line, or any other
            unexpected line inside a bundle.
    """
    in_a_bundle = False

    annotations = []
    records = []

    while True:
        # allows parsing of the last bundle without duplicating code
        try:
            # The next() builtin works on Python 2.6+ and Python 3 alike;
            # the handle.next() method only exists on Python 2 iterators.
            line = next(handle)
        except StopIteration:
            line = ""

        if in_a_bundle:
            if line.startswith("s"):
                # add a SeqRecord to the bundle
                line_split = line.strip().split()

                # "!=" replaces the "<>" operator, which is a syntax error
                # on Python 3.
                if len(line_split) != 7:
                    raise ValueError("Error parsing alignment - 's' line must have 7 fields")

                # convert MAF-style +/- strand to biopython-style +1/-1
                # NOTE(review): these are strings, not the integers 1/-1;
                # kept as-is so existing callers see the same values.
                if line_split[4] == "+":
                    strand = "+1"
                elif line_split[4] == "-":
                    strand = "-1"
                else:
                    strand = "+1"

                # s (literal), src (ID), start, size, strand, srcSize, text (sequence)
                anno = {"start": int(line_split[2]),
                        "size": int(line_split[3]),
                        "strand": strand,
                        "srcSize": int(line_split[5])}

                sequence = line_split[6]

                # interpret a dot/period to mean the same as the first sequence
                if "." in sequence:
                    if not records:
                        raise ValueError("Found dot/period in first sequence of alignment")

                    ref = str(records[0].seq)
                    sequence = "".join(
                        [r if s == "." else s for (s, r) in zip(sequence, ref)]
                    )

                records.append(SeqRecord(Seq(sequence, alphabet),
                                         id = line_split[1],
                                         name = line_split[1],
                                         description = "",
                                         annotations = anno))
            elif line.startswith(("e", "i", "q")):
                # not implemented
                pass
            elif not line.strip():
                # end a bundle of records
                if seq_count is not None:
                    assert len(records) == seq_count

                alignment = MultipleSeqAlignment(records, alphabet)
                # TODO - Introduce an annotated alignment class?
                # See also Bio/AlignIO/FastaIO.py for same requirement.
                # For now, store the annotation a new private property:
                alignment._annotations = annotations

                yield alignment

                in_a_bundle = False

                annotations = []
                records = []
            else:
                raise ValueError("Error parsing alignment - unexpected line:\n%s" % (line,))
        elif line.startswith("a"):
            # start a bundle of records
            in_a_bundle = True
            annot_strings = line.strip().split()[1:]

            if len(annot_strings) != line.count("="):
                raise ValueError("Error parsing alignment - invalid key in 'a' line")

            annotations = dict([x.split("=") for x in annot_strings])
        elif line.startswith("#"):
            # ignore comments
            pass
        elif not line:
            break
def build_hsp():
    """Build a two-row MultipleSeqAlignment from the current query/match data.

    Takes no arguments: the sequences, tag dictionaries, identifiers,
    descriptions and the input handle (``query_seq``, ``match_seq``,
    ``query_tags``, ``match_tags``, ``align_tags``, ``header_tags``,
    ``global_tags``, ``query_id``, ``match_id``, ``query_descr``,
    ``match_descr``, ``handle``) are read from the surrounding scope.

    Returns:
        The alignment, with the query record first and the match record
        second.  Header tags and alignment tags are merged into its private
        ``_annotations`` dictionary (alignment tags win on a key clash).

    Raises:
        ValueError: if both tag dictionaries are empty, or if the extracted
            query and match regions differ in length.
    """
    if not query_tags and not match_tags:
        raise ValueError("No data for query %r, match %r" % (query_id, match_id))
    assert query_tags, query_tags
    assert match_tags, match_tags
    tool = global_tags.get("tool", "").upper()

    q = _extract_alignment_region(query_seq, query_tags)
    if tool in ["TFASTX"] and len(match_seq) == len(q):
        m = match_seq
        # Quick hack until I can work out how -, * and / characters
        # and the apparent mix of aa and bp coordinates works.
    else:
        m = _extract_alignment_region(match_seq, match_tags)
    if len(q) != len(m):
        # Handles such as StringIO have no .name attribute; fall back to a
        # placeholder so the informative ValueError below is what surfaces.
        handle_name = getattr(handle, "name", "<unknown>")
        raise ValueError(
            f"""\
Darn... amino acids vs nucleotide coordinates?
tool: {tool}
query_seq: {query_seq}
query_tags: {query_tags}
{q} length: {len(q)}
match_seq: {match_seq}
match_tags: {match_tags}
{m} length: {len(m)}
handle.name: {handle_name}
"""
        )

    alignment = MultipleSeqAlignment([])

    # TODO - Introduce an annotated alignment class?
    # See also Bio/AlignIO/MafIO.py for same requirement.
    # For now, store the annotation a new private property:
    alignment._annotations = {}

    # Want to record both the query header tags, and the alignment tags.
    alignment._annotations.update(header_tags)
    alignment._annotations.update(align_tags)

    # Query
    # =====
    record = SeqRecord(
        Seq(q),
        id=query_id,
        name="query",
        description=query_descr,
        annotations={"original_length": int(query_tags["sq_len"])},
    )
    # TODO - handle start/end coordinates properly. Short term hack for now:
    record._al_start = int(query_tags["al_start"])
    record._al_stop = int(query_tags["al_stop"])
    alignment.append(record)

    # TODO - Can FASTA output RNA?
    if "sq_type" in query_tags:
        if query_tags["sq_type"] == "D":
            record.annotations["molecule_type"] = "DNA"
        elif query_tags["sq_type"] == "p":
            record.annotations["molecule_type"] = "protein"

    # Match
    # =====
    record = SeqRecord(
        Seq(m),
        id=match_id,
        name="match",
        description=match_descr,
        annotations={"original_length": int(match_tags["sq_len"])},
    )
    # TODO - handle start/end coordinates properly. Short term hack for now:
    record._al_start = int(match_tags["al_start"])
    record._al_stop = int(match_tags["al_stop"])
    alignment.append(record)

    if "sq_type" in match_tags:
        if match_tags["sq_type"] == "D":
            record.annotations["molecule_type"] = "DNA"
        elif match_tags["sq_type"] == "p":
            record.annotations["molecule_type"] = "protein"

    return alignment