class Record:
    """Container for Saf data laid out much like the original record.

    Meant for callers who only want easy access to the Saf data.

    Attributes:
    alignment -- an Alignment created with Bio.Alphabet.generic_alphabet
    """

    def __init__(self):
        self.alignment = Alignment(Bio.Alphabet.generic_alphabet)

    def __str__(self):
        # One description line followed by the formatted sequence, per record.
        chunks = []
        for entry in self.alignment.get_all_seqs():
            chunks.append('%s\n' % entry.description)
            chunks.append(out_sequence(entry.seq.data))
        return ''.join(chunks)
class Record:
    """Keep Saf information in a shape close to the source record.

    The class exists so that Saf data is easy to reach when that is all
    the caller is interested in.

    Attributes:
    alignment -- an Alignment created with Bio.Alphabet.generic_alphabet
    """

    def __init__(self):
        self.alignment = Alignment(Bio.Alphabet.generic_alphabet)

    def __str__(self):
        # Emit "<description>\n" and then the formatted sequence per record.
        pieces = []
        for seq_rec in self.alignment.get_all_seqs():
            header = '%s\n' % seq_rec.description
            body = out_sequence(seq_rec.seq.data)
            pieces.extend((header, body))
        return ''.join(pieces)
def next(self):
    """Parse and return the next alignment from a Stockholm format handle.

    Starts from the header line saved in ``self._header`` by the previous
    call (if any), otherwise from the next line of ``self.handle``, and
    reads until the following "# STOCKHOLM 1.0" header or end of file.

    Side effects: stores the parsed data on the instance as ``self.ids``
    (identifiers in order of first appearance), ``self.sequences``
    (identifier -> sequence string with "." replaced by "-"),
    ``self.seq_annotation`` (#=GS data) and ``self.seq_col_annotation``
    (#=GR data).

    Returns an Alignment, or None when the handle is exhausted or the
    entry held no sequences.

    Raises ValueError if the header is missing, a sequence line cannot be
    split into identifier and sequence, the record count disagrees with
    ``self.records_per_alignment``, or the sequences differ in length.
    """
    try:
        line = self._header
        del self._header
    except AttributeError:
        line = self.handle.readline()
    if not line:
        # Empty file - just give up.
        return
    if line.strip() != "# STOCKHOLM 1.0":
        raise ValueError("Did not find STOCKHOLM header")

    # Note: If this file follows the PFAM conventions, there should be
    # a line containing the number of sequences, e.g. "#=GF SQ 67"
    # We do not check for this - perhaps we should, and verify that
    # if present it agrees with our parsing.
    seqs = {}
    ids = []
    gs = {}
    gr = {}
    gf = {}  # collected, but not attached to the alignment in this method
    passed_end_alignment = False
    while True:
        line = self.handle.readline()
        if not line:
            break  # end of file
        line = line.strip()  # remove trailing \n
        if line == "# STOCKHOLM 1.0":
            # Start of the next entry; keep it for the following call.
            self._header = line
            break
        elif line == "//":
            # The "//" line indicates the end of the alignment.
            # There may still be more meta-data
            passed_end_alignment = True
        elif line == "":
            # blank line, ignore
            pass
        elif line[0] != "#":
            # Sequence
            # Format: "<seqname> <sequence>"
            assert not passed_end_alignment
            parts = [x.strip() for x in line.split(" ", 1)]
            if len(parts) != 2:
                # This might be someone attempting to store a zero length
                # sequence?
                raise ValueError("Could not split line into identifier "
                                 + "and sequence:\n" + line)
            seq_id, seq = parts
            # ids and seqs always gain a key together, so the dict lookup
            # answers "seen before?" without scanning the list.
            if seq_id not in seqs:
                ids.append(seq_id)
                seqs[seq_id] = ""
            seqs[seq_id] += seq.replace(".", "-")
        elif len(line) >= 5:
            # Comment line or meta-data
            tag = line[:5]
            if tag == "#=GF ":
                # Generic per-File annotation, free text
                # Format: #=GF <feature> <free text>
                feature, text = line[5:].strip().split(None, 1)
                # Each feature key could be used more than once,
                # so store the entries as a list of strings.
                gf.setdefault(feature, []).append(text)
            elif tag == "#=GC ":
                # Generic per-Column annotation, exactly 1 char per column
                # Format: "#=GC <feature> <exactly 1 char per column>"
                pass
            elif tag == "#=GS ":
                # Generic per-Sequence annotation, free text
                # Format: "#=GS <seqname> <feature> <free text>"
                seq_id, feature, text = line[5:].strip().split(None, 2)
                gs.setdefault(seq_id, {}).setdefault(feature, []).append(text)
            elif tag == "#=GR ":
                # Generic per-Sequence AND per-Column markup
                # Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
                seq_id, feature, text = line[5:].strip().split(None, 2)
                markup = gr.setdefault(seq_id, {})
                # For interlaced sequences the GR data can be split over
                # multiple lines, so append to any previous entry.
                # TODO - Should we check the length matches the alignment
                # length?
                markup[feature] = markup.get(feature, "") + text.strip()
        # Next line...

    assert len(seqs) <= len(ids)

    self.ids = ids
    self.sequences = seqs
    self.seq_annotation = gs
    self.seq_col_annotation = gr

    if not (ids and seqs):
        return None

    if (self.records_per_alignment is not None
            and self.records_per_alignment != len(ids)):
        raise ValueError(
            "Found %i records in this alignment, told to expect %i"
            % (len(ids), self.records_per_alignment)
        )

    alignment = Alignment(self.alphabet)

    # TODO - Introduce an annotated alignment class?
    # For now, store the annotation a new private property:
    alignment._annotations = gr

    # Use the first identifier in file order as the reference length;
    # every sequence is compared against it below.
    alignment_length = len(seqs[ids[0]])
    for seq_id in ids:
        seq = seqs[seq_id]
        if alignment_length != len(seq):
            raise ValueError("Sequences have different lengths, or repeated identifier")
        name, start, end = self._identifier_split(seq_id)
        alignment.add_sequence(seq_id, seq, start=start, end=end)

        record = alignment.get_all_seqs()[-1]

        assert record.id == seq_id or record.description == seq_id

        record.id = seq_id
        record.name = name
        record.description = seq_id

        # will be overridden by _populate_meta_data if an explicit
        # accession is provided:
        record.annotations["accession"] = name

        self._populate_meta_data(seq_id, record)
    return alignment
def next(self):
    """Reads from the handle to construct and return the next alignment.

    This returns the pairwise alignment of query and match/library
    sequences as an Alignment object containing two rows.

    Parsing resumes from the line saved in ``self._header`` by the
    previous call, if there is one; otherwise the next line is read from
    ``self.handle``.

    Returns None when the handle is exhausted, when a line starting "#-"
    is reached, or when the file ends part way through a record.

    Raises ValueError if the target or identifier lines do not have the
    expected form, or if a "sw_overlap" tag disagrees with the length of
    the extracted query alignment.
    """
    handle = self.handle

    try:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        return None

    if line.startswith('#-'):
        # Reached the end of the alignments, no need to read the footer...
        return None
    if line.startswith("##"):
        # Skip the file header before the alignments.
        line = self._skip_file_header(line)
        assert line.startswith('#'), line
        while not line.startswith('#='):
            line = handle.readline()
            if not line:
                # End of file before any query header; readline() would
                # keep returning "" forever.
                return None

    if line.startswith('#='):
        # Moved onto the next query sequence!
        self._query_descr = ""
        self._query_header_annotation = {}
        # Read in the query header
        line = self._parse_query_header(line)
    if not line:
        # End of file
        return None

    assert line.startswith(">>") and not line.startswith(">>>"), line

    query_seq_parts, match_seq_parts = [], []
    query_annotation, match_annotation = {}, {}
    match_descr = ""
    alignment_annotation = {}

    # This should be followed by the target match numbering line, then
    # more tags, e.g.
    #   >>#2
    #   ; sw_score: 41.0
    #   ; sw_ident: 0.846
    #   ; sw_overlap: 13
    if not line.startswith(">>"):
        raise ValueError("Expected target line starting '>>'")
    match_descr = line[2:].strip()
    # Handle the following "alignment hit" tagged data
    line = handle.readline()
    line = self._parse_tag_section(line, alignment_annotation)
    assert not line.startswith("; ")

    # Then we have the alignment numbers and sequence for the query, e.g.
    #   >gi|10955265| ..
    #   ; sq_len: 346
    #   ; sq_offset: 1
    #   ; sq_type: p
    #   ; al_start: 197
    #   ; al_stop: 238
    #   ; al_display_start: 167
    #   DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
    #   QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
    #   GEYFTENKPKYIIREIHQET
    if not (line.startswith(">") and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..'")
    assert self._query_descr.startswith(line[1:].split()[0])

    # Handle the following "query alignment" tagged data
    line = handle.readline()
    line = self._parse_tag_section(line, query_annotation)
    assert not line.startswith("; ")

    # Now should have the aligned query sequence (with leading flanking
    # region)
    while not line.startswith(">"):
        if not line:
            # End of file inside the query sequence.
            return None
        query_seq_parts.append(line.strip())
        line = handle.readline()

    # Handle the following "match alignment" data, e.g.
    #   >gi|152973545|ref|YP_001338596.1| ..
    #   ; sq_len: 242
    #   ; sq_type: p
    #   ; al_start: 52
    #   ; al_stop: 94
    #   ; al_display_start: 22
    #   IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
    #   RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
    #   QDFAFTRKMRREARQVEQSW
    # Match identifier
    if not (line.startswith(">") and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..', got '%s'" % repr(line))
    # The target line only carried the match number, so the identifier
    # from this line is put in front of it.
    match_descr = line[1:].split()[0] + match_descr

    # Tagged data,
    line = handle.readline()
    line = self._parse_tag_section(line, match_annotation)
    assert not line.startswith("; ")

    # Now should have the aligned match sequence with flanking region...
    while not (line.startswith(">") or ">>>" in line) and not line.startswith('#'):
        match_seq_parts.append(line.strip())
        line = handle.readline()
        if not line:
            # End of file
            return None
    if line.startswith('>') or '>>>' in line:
        self._header = line

    # We built a list of strings and then joined them because
    # its faster than appending to a string.
    query_seq = "".join(query_seq_parts)
    match_seq = "".join(match_seq_parts)
    del query_seq_parts, match_seq_parts
    # Note, query_seq and match_seq will usually be of different lengths,
    # apparently because in the m10 format leading gaps are added but not
    # trailing gaps!

    # Remove the flanking regions,
    query_align_seq = self._extract_alignment_region(query_seq, query_annotation)
    match_align_seq = self._extract_alignment_region(match_seq, match_annotation)

    # The "sq_offset" values can be specified with the -X command line
    # option. They appear to just shift the origin used in the calculation
    # of the coordinates. Offsets other than "1" are currently accepted
    # without any special handling.
    # TODO - Work out how exactly the use of -X offsets changes things.

    # The two aligned sequences are deliberately not required to have the
    # same length here (that check is not useful when using stretcher).

    if "sw_overlap" in alignment_annotation:
        if int(alignment_annotation["sw_overlap"]) != len(query_align_seq):
            raise ValueError("Specified sw_overlap = %s does not match expected value %i"
                             % (alignment_annotation["sw_overlap"],
                                len(query_align_seq)))

    # TODO - Look at the "sq_type" to assign a sensible alphabet?
    alignment = Alignment(self.alphabet)

    # TODO - Introduce an annotated alignment class?
    # For now, store the annotation a new private property:
    alignment._annotations = {}

    # Want to record both the query header tags, and the alignment tags.
    alignment._annotations.update(self._query_header_annotation)
    alignment._annotations.update(alignment_annotation)

    # TODO - Once the alignment object gets an append method, use it.
    # (i.e. an add SeqRecord method)
    alignment.add_sequence(self._query_descr, query_align_seq)
    record = alignment.get_all_seqs()[-1]
    assert record.id == self._query_descr or record.description == self._query_descr
    assert record.seq.tostring() == query_align_seq
    record.id = self._query_descr.split()[0].strip(",")
    record.name = "query"
    record.annotations["original_length"] = int(query_annotation["sq_len"])
    # Local addition: copy every parsed query tag onto the record.
    for key, value in query_annotation.items():
        record.annotations[key] = value

    alignment.add_sequence(match_descr, match_align_seq)
    record = alignment.get_all_seqs()[-1]
    assert record.id == match_descr or record.description == match_descr
    assert record.seq.tostring() == match_align_seq
    record.id = match_descr.split()[0].strip(",")
    record.name = "match"
    record.annotations["original_length"] = int(match_annotation["sq_len"])
    # Local addition: copy every parsed match tag onto the record. The
    # keys must come from match_annotation itself - the query may carry
    # tags (e.g. sq_offset) that the match does not.
    for key, value in match_annotation.items():
        record.annotations[key] = value

    return alignment
def next(self):
    """Parse and return the next alignment from a PHYLIP format handle.

    The first line (taken from ``self._header`` if the previous call
    saved one, otherwise read from ``self.handle``) must hold two
    integers: the number of sequences and the sequence length. The first
    block gives a ten character name followed by sequence data on each
    line; any further blocks hold sequence data only.

    Returns an Alignment, or None when the handle is exhausted.

    Raises ValueError if the first line is not two integers, the record
    count disagrees with ``self.records_per_alignment``, the file ends in
    the middle of a block, sequence data follows a header that declared
    no sequences, or a sequence does not have the declared length.
    """
    handle = self.handle

    try:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()

    if not line:
        return
    line = line.strip()
    # split() with no arguments never yields empty strings, so no extra
    # filtering is needed (and the result is always a list).
    parts = line.split()
    if len(parts) != 2:
        raise ValueError("First line should have two integers")
    try:
        number_of_seqs = int(parts[0])
        length_of_seqs = int(parts[1])
    except ValueError:
        raise ValueError("First line should have two integers")

    assert self._is_header(line)

    if (self.records_per_alignment is not None
            and self.records_per_alignment != number_of_seqs):
        raise ValueError("Found %i records in this alignment, told to expect %i"
                         % (number_of_seqs, self.records_per_alignment))

    ids = []
    seqs = []

    # Expects STRICT truncation/padding to 10 characters
    # Does not require any white space between name and seq.
    for _ in range(number_of_seqs):
        line = handle.readline().rstrip()
        ids.append(line[:10].strip())  # first ten characters
        seqs.append([line[10:].strip().replace(" ", "")])

    # Look for further blocks
    line = ""
    while True:
        # Skip any blank lines between blocks...
        while "" == line.strip():
            line = handle.readline()
            if not line:
                break  # end of file
        if not line:
            break  # end of file

        if self._is_header(line):
            # Looks like the start of a concatenated alignment
            self._header = line
            break

        if number_of_seqs < 1:
            # Nothing below would consume this line, so the loop would
            # never advance.
            raise ValueError("Found sequence data but the header declared no sequences")

        for i in range(number_of_seqs):
            seqs[i].append(line.strip().replace(" ", ""))
            line = handle.readline()
            if (not line) and i + 1 < number_of_seqs:
                raise ValueError("End of file mid-block")
        if not line:
            break  # end of file

    alignment = Alignment(self.alphabet)
    for index, (name, chunks) in enumerate(zip(ids, seqs)):
        seq = "".join(chunks)
        if len(seq) != length_of_seqs:
            raise ValueError("Sequence %i length %i, expected length %i"
                             % (index + 1, len(seq), length_of_seqs))
        alignment.add_sequence(name, seq)

        record = alignment.get_all_seqs()[-1]
        assert name == record.id or name == record.description
        record.id = name
        record.name = name
        record.description = name
    return alignment
def next(self):
    """Reads from the handle to construct and return the next alignment.

    This returns the pairwise alignment of query and match/library
    sequences as an Alignment object containing two rows.

    Parsing resumes from the line saved in ``self._header`` by the
    previous call, if there is one; otherwise the next line is read from
    ``self.handle``.

    Returns None when the handle is exhausted, when a line starting "#-"
    is reached, or when the file ends part way through a record.

    Raises ValueError if the target or identifier lines do not have the
    expected form, or if a "sw_overlap" tag disagrees with the length of
    the extracted query alignment.
    """
    handle = self.handle

    try:
        # Header we saved from when we were parsing
        # the previous alignment.
        saved = self._header
    except AttributeError:
        line = handle.readline()
    else:
        del self._header
        line = saved
    if not line:
        return None

    if line.startswith('#-'):
        # Reached the end of the alignments, no need to read the footer...
        return None
    if line.startswith("##"):
        # Skip the file header before the alignments.
        line = self._skip_file_header(line)
        assert line.startswith('#'), line
        while not line.startswith('#='):
            line = handle.readline()
            if not line:
                # End of file before any query header; without this the
                # loop would spin on the empty string forever.
                return None

    if line.startswith('#='):
        # Moved onto the next query sequence!
        self._query_descr = ""
        self._query_header_annotation = {}
        # Read in the query header
        line = self._parse_query_header(line)
    if not line:
        # End of file
        return None

    assert line.startswith(">>") and not line.startswith(">>>"), line

    query_seq_parts, match_seq_parts = [], []
    query_annotation, match_annotation = {}, {}
    match_descr = ""
    alignment_annotation = {}

    # This should be followed by the target match numbering line, then
    # more tags, e.g.
    #   >>#2
    #   ; sw_score: 41.0
    #   ; sw_ident: 0.846
    #   ; sw_overlap: 13
    if not line.startswith(">>"):
        raise ValueError("Expected target line starting '>>'")
    match_descr = line[2:].strip()
    # Handle the following "alignment hit" tagged data
    line = self._parse_tag_section(handle.readline(), alignment_annotation)
    assert not line.startswith("; ")

    # Then we have the alignment numbers and sequence for the query, e.g.
    #   >gi|10955265| ..
    #   ; sq_len: 346
    #   ; sq_offset: 1
    #   ; sq_type: p
    #   ; al_start: 197
    #   ; al_stop: 238
    #   ; al_display_start: 167
    #   DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
    #   QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
    #   GEYFTENKPKYIIREIHQET
    if not (line.startswith(">") and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..'")
    assert self._query_descr.startswith(line[1:].split()[0])

    # Handle the following "query alignment" tagged data
    line = self._parse_tag_section(handle.readline(), query_annotation)
    assert not line.startswith("; ")

    # Now should have the aligned query sequence (with leading flanking
    # region)
    while not line.startswith(">"):
        if not line:
            # End of file inside the query sequence.
            return None
        query_seq_parts.append(line.strip())
        line = handle.readline()

    # Handle the following "match alignment" data, e.g.
    #   >gi|152973545|ref|YP_001338596.1| ..
    #   ; sq_len: 242
    #   ; sq_type: p
    #   ; al_start: 52
    #   ; al_stop: 94
    #   ; al_display_start: 22
    #   IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
    #   RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
    #   QDFAFTRKMRREARQVEQSW
    # Match identifier
    if not (line.startswith(">") and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..', got '%s'" % repr(line))
    # The target line only carried the match number, so the identifier
    # from this line is put in front of it.
    match_descr = line[1:].split()[0] + match_descr

    # Tagged data,
    line = self._parse_tag_section(handle.readline(), match_annotation)
    assert not line.startswith("; ")

    # Now should have the aligned match sequence with flanking region...
    while not (line.startswith(">") or ">>>" in line) and not line.startswith('#'):
        match_seq_parts.append(line.strip())
        line = handle.readline()
        if not line:
            # End of file
            return None
    if line.startswith('>') or '>>>' in line:
        self._header = line

    # We built a list of strings and then joined them because
    # its faster than appending to a string.
    query_seq = "".join(query_seq_parts)
    match_seq = "".join(match_seq_parts)
    del query_seq_parts, match_seq_parts
    # Note, query_seq and match_seq will usually be of different lengths,
    # apparently because in the m10 format leading gaps are added but not
    # trailing gaps!

    # Remove the flanking regions,
    query_align_seq = self._extract_alignment_region(query_seq, query_annotation)
    match_align_seq = self._extract_alignment_region(match_seq, match_annotation)

    # The "sq_offset" values can be specified with the -X command line
    # option. They appear to just shift the origin used in the calculation
    # of the coordinates. Offsets other than "1" are currently accepted
    # without any special handling.
    # TODO - Work out how exactly the use of -X offsets changes things.

    # The two aligned sequences are deliberately not required to have the
    # same length here (that check is not useful when using stretcher).

    if "sw_overlap" in alignment_annotation:
        declared_overlap = alignment_annotation["sw_overlap"]
        if int(declared_overlap) != len(query_align_seq):
            raise ValueError("Specified sw_overlap = %s does not match expected value %i"
                             % (declared_overlap, len(query_align_seq)))

    # TODO - Look at the "sq_type" to assign a sensible alphabet?
    alignment = Alignment(self.alphabet)

    # TODO - Introduce an annotated alignment class?
    # For now, store the annotation a new private property:
    alignment._annotations = {}

    # Want to record both the query header tags, and the alignment tags.
    alignment._annotations.update(self._query_header_annotation)
    alignment._annotations.update(alignment_annotation)

    # TODO - Once the alignment object gets an append method, use it.
    # (i.e. an add SeqRecord method)
    alignment.add_sequence(self._query_descr, query_align_seq)
    record = alignment.get_all_seqs()[-1]
    assert record.id == self._query_descr or record.description == self._query_descr
    assert record.seq.tostring() == query_align_seq
    record.id = self._query_descr.split()[0].strip(",")
    record.name = "query"
    record.annotations["original_length"] = int(query_annotation["sq_len"])
    # Local addition: copy every parsed query tag onto the record.
    for key, value in query_annotation.items():
        record.annotations[key] = value

    alignment.add_sequence(match_descr, match_align_seq)
    record = alignment.get_all_seqs()[-1]
    assert record.id == match_descr or record.description == match_descr
    assert record.seq.tostring() == match_align_seq
    record.id = match_descr.split()[0].strip(",")
    record.name = "match"
    record.annotations["original_length"] = int(match_annotation["sq_len"])
    # Local addition: copy every parsed match tag onto the record. The
    # keys must come from match_annotation itself - the query may carry
    # tags (e.g. sq_offset) that the match does not.
    for key, value in match_annotation.items():
        record.annotations[key] = value

    return alignment
def next(self):
    """Parse and return the next alignment from a Stockholm format handle.

    Starts from the header line saved in ``self._header`` by the previous
    call (if any), otherwise from the next line of ``self.handle``, and
    reads until the following "# STOCKHOLM 1.0" header or end of file.

    Side effects: stores the parsed data on the instance as ``self.ids``
    (identifiers in order of first appearance), ``self.sequences``
    (identifier -> sequence string with "." replaced by "-"),
    ``self.seq_annotation`` (#=GS data) and ``self.seq_col_annotation``
    (#=GR data).

    Returns an Alignment, or None when the handle is exhausted or the
    entry held no sequences.

    Raises ValueError if the header is missing, a sequence line cannot be
    split into identifier and sequence, the record count disagrees with
    ``self.records_per_alignment``, or the sequences differ in length.
    """
    header = '# STOCKHOLM 1.0'

    try:
        line = self._header
        del self._header
    except AttributeError:
        line = self.handle.readline()
    if not line:
        # Empty file - just give up.
        return
    if line.strip() != header:
        raise ValueError("Did not find STOCKHOLM header")

    # Note: If this file follows the PFAM conventions, there should be
    # a line containing the number of sequences, e.g. "#=GF SQ 67"
    # We do not check for this - perhaps we should, and verify that
    # if present it agrees with our parsing.
    seqs = {}
    ids = []
    gs = {}
    gr = {}
    gf = {}  # collected, but not attached to the alignment in this method
    passed_end_alignment = False
    while True:
        line = self.handle.readline()
        if not line:
            break  # end of file
        line = line.strip()  # remove trailing \n
        if line == header:
            # Start of the next entry; keep it for the following call.
            self._header = line
            break
        if line == "//":
            # The "//" line indicates the end of the alignment.
            # There may still be more meta-data
            passed_end_alignment = True
            continue
        if line == "":
            # blank line, ignore
            continue
        if line[0] != "#":
            # Sequence
            # Format: "<seqname> <sequence>"
            assert not passed_end_alignment
            parts = [x.strip() for x in line.split(" ", 1)]
            if len(parts) != 2:
                # This might be someone attempting to store a zero length
                # sequence?
                raise ValueError("Could not split line into identifier "
                                 + "and sequence:\n" + line)
            seq_id, seq = parts
            # ids and seqs always gain a key together, so the dict lookup
            # answers "seen before?" without scanning the list.
            if seq_id not in seqs:
                ids.append(seq_id)
                seqs[seq_id] = ''
            seqs[seq_id] += seq.replace(".", "-")
            continue
        if len(line) < 5:
            continue

        # Comment line or meta-data
        tag, payload = line[:5], line[5:].strip()
        if tag == "#=GF ":
            # Generic per-File annotation, free text
            # Format: #=GF <feature> <free text>
            feature, text = payload.split(None, 1)
            # Each feature key could be used more than once,
            # so store the entries as a list of strings.
            gf.setdefault(feature, []).append(text)
        elif tag == '#=GC ':
            # Generic per-Column annotation, exactly 1 char per column
            # Format: "#=GC <feature> <exactly 1 char per column>"
            pass
        elif tag == '#=GS ':
            # Generic per-Sequence annotation, free text
            # Format: "#=GS <seqname> <feature> <free text>"
            seq_id, feature, text = payload.split(None, 2)
            gs.setdefault(seq_id, {}).setdefault(feature, []).append(text)
        elif tag == "#=GR ":
            # Generic per-Sequence AND per-Column markup
            # Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
            seq_id, feature, text = payload.split(None, 2)
            markup = gr.setdefault(seq_id, {})
            # For interlaced sequences the GR data can be split over
            # multiple lines, so append to any previous entry.
            # TODO - Should we check the length matches the alignment
            # length?
            markup[feature] = markup.get(feature, "") + text.strip()
        # Next line...

    assert len(seqs) <= len(ids)

    self.ids = ids
    self.sequences = seqs
    self.seq_annotation = gs
    self.seq_col_annotation = gr

    if ids and seqs:
        expected = self.records_per_alignment
        if expected is not None and expected != len(ids):
            raise ValueError("Found %i records in this alignment, told to expect %i"
                             % (len(ids), expected))

        alignment = Alignment(self.alphabet)

        # TODO - Introduce an annotated alignment class?
        # For now, store the annotation a new private property:
        alignment._annotations = gr

        # Use the first identifier in file order as the reference length;
        # every sequence is compared against it below.
        alignment_length = len(seqs[ids[0]])
        for seq_id in ids:
            seq = seqs[seq_id]
            if alignment_length != len(seq):
                raise ValueError(
                    "Sequences have different lengths, or repeated identifier"
                )
            name, start, end = self._identifier_split(seq_id)
            alignment.add_sequence(seq_id, seq, start=start, end=end)

            record = alignment.get_all_seqs()[-1]

            assert record.id == seq_id or record.description == seq_id

            record.id = seq_id
            record.name = name
            record.description = seq_id

            # will be overridden by _populate_meta_data if an explicit
            # accession is provided:
            record.annotations["accession"] = name

            self._populate_meta_data(seq_id, record)
        return alignment
    else:
        return None
def next(self):
    """Reads from the handle to construct and return the next alignment.

    This returns the pairwise alignment of query and match/library
    sequences as an Alignment object containing two rows.

    Parsing resumes from the line saved in ``self._header`` by the
    previous call, if there is one; otherwise the next line is read from
    ``self.handle``. The line that terminates this alignment is saved in
    ``self._header`` for the following call.

    Returns None when the handle is exhausted or the ">>><<<" marker is
    reached.

    Raises ValueError if the target or identifier lines do not have the
    expected form, if the extracted query and match alignments differ in
    length, or if a "sw_overlap" tag disagrees with that length.
    """
    handle = self.handle

    try:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        return None

    if line.startswith("#"):
        # Skip the file header before the alignments.
        line = self._skip_file_header(line)
    while ">>>" in line and not line.startswith(">>>"):
        # Moved onto the next query sequence!
        self._query_descr = ""
        self._query_header_annotation = {}
        # Read in the query header
        line = self._parse_query_header(line)
        # Now should be some alignments, but if not we move onto the next
        # query
    if not line:
        # End of file
        return None
    if ">>><<<" in line:
        # Reached the end of the alignments, no need to read the footer...
        return None

    # Should start >>... and not >>>...
    assert line[0:2] == ">>" and not line[2] == ">", line

    query_seq_parts, match_seq_parts = [], []
    query_annotation, match_annotation = {}, {}
    match_descr = ""
    alignment_annotation = {}

    # This should be followed by the target match ID line, then more tags.
    # e.g.
    #   >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
    #   ; fa_frame: f
    #   ; fa_initn: 52
    #   ; fa_init1: 52
    #   ; fa_opt: 70
    #   ; fa_z-score: 105.5
    #   ; fa_bits: 27.5
    #   ; fa_expect: 0.082
    #   ; sw_score: 70
    #   ; sw_ident: 0.279
    #   ; sw_sim: 0.651
    #   ; sw_overlap: 43
    if (not line[0:2] == ">>") or line[0:3] == ">>>":
        raise ValueError("Expected target line starting '>>'")
    match_descr = line[2:].strip()
    # Handle the following "alignment hit" tagged data
    line = handle.readline()
    line = self._parse_tag_section(line, alignment_annotation)
    assert not line[0:2] == "; "

    # Then we have the alignment numbers and sequence for the query, e.g.
    #   >gi|10955265| ..
    #   ; sq_len: 346
    #   ; sq_offset: 1
    #   ; sq_type: p
    #   ; al_start: 197
    #   ; al_stop: 238
    #   ; al_display_start: 167
    #   DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
    #   QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
    #   GEYFTENKPKYIIREIHQET
    if not (line[0] == ">" and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..'")
    assert self._query_descr.startswith(line[1:].split(None, 1)[0])

    # Handle the following "query alignment" tagged data
    line = handle.readline()
    line = self._parse_tag_section(line, query_annotation)
    assert not line[0:2] == "; "

    # Now should have the aligned query sequence (with leading flanking
    # region)
    while not line[0] == ">":
        query_seq_parts.append(line.strip())
        line = handle.readline()

    # Handle the following "match alignment" data, e.g.
    #   >gi|152973545|ref|YP_001338596.1| ..
    #   ; sq_len: 242
    #   ; sq_type: p
    #   ; al_start: 52
    #   ; al_stop: 94
    #   ; al_display_start: 22
    #   IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
    #   RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
    #   QDFAFTRKMRREARQVEQSW
    # Match identifier
    if not (line[0] == ">" and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..', got '%s'" % repr(line))
    assert match_descr.startswith(line[1:].split(None, 1)[0])

    # Tagged data,
    line = handle.readline()
    line = self._parse_tag_section(line, match_annotation)
    assert not line[0:2] == "; "

    # Now should have the aligned match sequence with flanking region...
    # but after that, since FASTA 35.4.1 there can be a consensus, e.g.
    #   ; al_cons:
    #   .::. : :. ---. :: :. . : ..-:::-: :.: ..:...:
    #   etc
    while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
        match_seq_parts.append(line.strip())
        line = handle.readline()
    if line[0:2] == "; ":
        assert line.strip() == "; al_cons:"
        align_consensus_parts = []
        line = handle.readline()
        while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
            align_consensus_parts.append(line.strip())
            line = handle.readline()
        # If we do anything with this in future, must remove any flanking
        # region.
        align_consensus = "".join(align_consensus_parts)
        del align_consensus_parts
        assert not line[0:2] == "; "
    else:
        align_consensus = None
    assert (line[0] == ">" or ">>>" in line)
    self._header = line

    # We built a list of strings and then joined them because
    # its faster than appending to a string.
    query_seq = "".join(query_seq_parts)
    match_seq = "".join(match_seq_parts)
    del query_seq_parts, match_seq_parts
    # Note, query_seq and match_seq will usually be of different lengths,
    # apparently because in the m10 format leading gaps are added but not
    # trailing gaps!

    # Remove the flanking regions,
    query_align_seq = self._extract_alignment_region(query_seq, query_annotation)
    match_align_seq = self._extract_alignment_region(match_seq, match_annotation)
    # How can we do this for the (optional) consensus?

    # The "sq_offset" values can be specified with the -X command line
    # option. They appear to just shift the origin used in the calculation
    # of the coordinates.

    if len(query_align_seq) != len(match_align_seq):
        raise ValueError("Problem parsing the alignment sequence coordinates, "
                         "following should be the same length but are not:\n"
                         "%s - len %i\n%s - len %i"
                         % (query_align_seq, len(query_align_seq),
                            match_align_seq, len(match_align_seq)))
    if "sw_overlap" in alignment_annotation:
        if int(alignment_annotation["sw_overlap"]) != len(query_align_seq):
            raise ValueError("Specified sw_overlap = %s does not match expected value %i"
                             % (alignment_annotation["sw_overlap"],
                                len(query_align_seq)))

    # TODO - Look at the "sq_type" to assign a sensible alphabet?
    alphabet = self.alphabet
    alignment = Alignment(alphabet)

    # TODO - Introduce an annotated alignment class?
    # For now, store the annotation a new private property:
    alignment._annotations = {}

    # Want to record both the query header tags, and the alignment tags.
    alignment._annotations.update(self._query_header_annotation)
    alignment._annotations.update(alignment_annotation)

    # TODO - Once the alignment object gets an append method, use it.
    # (i.e. an add SeqRecord method)
    alignment.add_sequence(self._query_descr, query_align_seq)
    record = alignment.get_all_seqs()[-1]
    assert record.id == self._query_descr or record.description == self._query_descr
    record.id = self._query_descr.split(None, 1)[0].strip(",")
    record.name = "query"
    record.annotations["original_length"] = int(query_annotation["sq_len"])
    # TODO - handle start/end coordinates properly. Short term hack for now:
    record._al_start = int(query_annotation["al_start"])
    record._al_stop = int(query_annotation["al_stop"])

    # TODO - What if a specific alphabet has been requested?
    # TODO - Use an IUPAC alphabet?
    # TODO - Can FASTA output RNA?
    if alphabet == single_letter_alphabet and "sq_type" in query_annotation:
        if query_annotation["sq_type"] == "D":
            record.seq.alphabet = generic_dna
        elif query_annotation["sq_type"] == "p":
            record.seq.alphabet = generic_protein
    if "-" in query_align_seq:
        if not hasattr(record.seq.alphabet, "gap_char"):
            record.seq.alphabet = Gapped(record.seq.alphabet, "-")

    alignment.add_sequence(match_descr, match_align_seq)
    record = alignment.get_all_seqs()[-1]
    assert record.id == match_descr or record.description == match_descr
    record.id = match_descr.split(None, 1)[0].strip(",")
    record.name = "match"
    record.annotations["original_length"] = int(match_annotation["sq_len"])
    # TODO - handle start/end coordinates properly. Short term hack for now.
    # These are the match's own coordinates, so they must be read from
    # match_annotation rather than from the query's tags.
    record._al_start = int(match_annotation["al_start"])
    record._al_stop = int(match_annotation["al_stop"])

    # This is still a very crude way of dealing with the alphabet:
    if alphabet == single_letter_alphabet and "sq_type" in match_annotation:
        if match_annotation["sq_type"] == "D":
            record.seq.alphabet = generic_dna
        elif match_annotation["sq_type"] == "p":
            record.seq.alphabet = generic_protein
    if "-" in match_align_seq:
        if not hasattr(record.seq.alphabet, "gap_char"):
            record.seq.alphabet = Gapped(record.seq.alphabet, "-")

    return alignment
def next(self):
    """Read the next alignment from ``self.handle`` and return it.

    The input is expected to start with a header line holding two integers
    (number of sequences, then sequence length).  The first block follows,
    one line per sequence, where the first ten characters of each line are
    the name and the remainder is sequence data.  Any further blocks carry
    only sequence data, for the same rows in the same order.  A later line
    accepted by ``self._is_header`` is taken as the start of another,
    concatenated alignment: it is stored in ``self._header`` and consumed
    by the following call.
    NOTE(review): this looks like the interleaved PHYLIP layout - confirm
    against the enclosing class.

    Returns:
        An ``Alignment`` created with ``self.alphabet``, or ``None`` when
        the handle has nothing left to read.

    Raises:
        ValueError: if the header is not two integers, if the number of
            records disagrees with ``self.records_per_alignment``, if the
            input ends part-way through a block, or if a joined sequence
            does not have the length announced in the header.
    """
    handle = self.handle

    try:
        # Header we saved from when we were parsing the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()

    if not line:
        return None

    line = line.strip()
    # str.split() without a separator never yields empty strings, so no
    # extra filtering is needed and the result is a real list (with a
    # usable len()) on every Python version.
    parts = line.split()
    if len(parts) != 2:
        raise ValueError("First line should have two integers")
    try:
        number_of_seqs = int(parts[0])
        length_of_seqs = int(parts[1])
    except ValueError:
        raise ValueError("First line should have two integers")

    # Sanity check: the header detector must agree with the parse above.
    assert self._is_header(line)

    if self.records_per_alignment is not None \
            and self.records_per_alignment != number_of_seqs:
        raise ValueError("Found %i records in this alignment, told to expect %i"
                         % (number_of_seqs, self.records_per_alignment))

    ids = []
    seqs = []

    # Expects STRICT truncation/padding to 10 characters.
    # Does not require any white space between name and seq.
    for _ in range(number_of_seqs):
        line = handle.readline().rstrip()
        ids.append(line[:10].strip())  # first ten characters
        # One list of chunks per row; joined once all blocks are read.
        seqs.append([line[10:].strip().replace(" ", "")])

    # Look for further blocks.
    line = ""
    while True:
        # Skip any blank lines between blocks...
        while "" == line.strip():
            line = handle.readline()
            if not line:
                break  # end of file
        if not line:
            break  # end of file

        if self._is_header(line):
            # Looks like the start of a concatenated alignment.
            self._header = line
            break

        for i in range(number_of_seqs):
            seqs[i].append(line.strip().replace(" ", ""))
            line = handle.readline()
            if (not line) and i + 1 < number_of_seqs:
                raise ValueError("End of file mid-block")
        if not line:
            break  # end of file

    alignment = Alignment(self.alphabet)
    for i, (name, chunks) in enumerate(zip(ids, seqs)):
        seq = "".join(chunks)
        if len(seq) != length_of_seqs:
            raise ValueError("Sequence %i length %i, expected length %i"
                             % (i + 1, len(seq), length_of_seqs))
        alignment.add_sequence(name, seq)

        # add_sequence() builds the record itself; fetch it back so that all
        # three naming fields carry the name exactly as read from the file.
        record = alignment.get_all_seqs()[-1]
        assert name == record.id or name == record.description
        record.id = name
        record.name = name
        record.description = name
    return alignment
# biopython
from Bio import Alphabet
from Bio import Seq
from Bio.Alphabet import IUPAC
from Bio import Clustalw
from Bio.Align.FormatConvert import FormatConverter
from Bio.Align import AlignInfo
from Bio.Fasta import FastaAlign
from Bio.SubsMat import FreqTable
from Bio.Align.Generic import Alignment

# An alignment that has been given no rows must report a length of zero
# and hand back an empty list of records.
alignment = Alignment(Alphabet.generic_alphabet)
assert 0 == alignment.get_alignment_length()
assert [] == alignment.get_all_seqs()
del alignment

# Three rows built from the same 26 letters: mixed, lower and upper case.
letters = "AbcDefGhiJklMnoPqrStuVwxYz"
alignment = Alignment(Alphabet.generic_alphabet)
expected_rows = [
    ("mixed", letters),
    ("lower", letters.lower()),
    ("upper", letters.upper()),
]
for row_label, row_text in expected_rows:
    alignment.add_sequence(row_label, row_text)

assert alignment.get_alignment_length() == 26
assert len(alignment.get_all_seqs()) == 3

# Every row must come back, by index, with exactly the text it was given.
for row_index, (row_label, row_text) in enumerate(expected_rows):
    assert alignment.get_seq_by_num(row_index).tostring() == row_text

# The label passed to add_sequence() is exposed as the record description.
for row_index, row_label in enumerate(["mixed", "lower"]):
    assert alignment.get_all_seqs()[row_index].description == row_label
def next(self):
    """Reads from the handle to construct and return the next alignment.

    This returns the pairwise alignment of query and match/library
    sequences as an Alignment object containing two rows.

    The first row is named "query" and the second "match".  Each record's
    ``annotations["original_length"]`` holds the ``sq_len`` tag of its
    sequence, and the query header tags plus the alignment tags are stored
    together in the private ``alignment._annotations`` dictionary.

    Returns:
        The next ``Alignment``, or ``None`` at end of file or once the
        ``>>><<<`` end-of-alignments marker is reached.

    Raises:
        ValueError: if a target or identifier line is malformed, if the
            file ends in the middle of a sequence, if the two aligned
            regions differ in length, or if the length disagrees with the
            ``sw_overlap`` tag.
        AssertionError: from the remaining internal consistency checks.
    """
    handle = self.handle

    try:
        # Header we saved from when we were parsing the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        return None

    if line.startswith("#"):
        # Skip the file header before the alignments.
        line = self._skip_file_header(line)
    while ">>>" in line and not line.startswith(">>>"):
        # Moved onto the next query sequence!
        self._query_descr = ""
        self._query_header_annotation = {}
        # Read in the query header.
        line = self._parse_query_header(line)
        # Now should be some alignments, but if not we move onto the next
        # query.
    if not line:
        # End of file
        return None
    if ">>><<<" in line:
        # Reached the end of the alignments, no need to read the footer...
        return None

    # Should start >>... and not >>>...
    # This is checked with a real exception (rather than an assert) so the
    # error survives ``python -O`` and a two-character line cannot trigger
    # an IndexError.
    if (not line[0:2] == ">>") or line[0:3] == ">>>":
        raise ValueError("Expected target line starting '>>'")

    query_seq_parts, match_seq_parts = [], []
    query_annotation, match_annotation = {}, {}
    alignment_annotation = {}

    # The target match ID line is followed by more tags, e.g.
    #
    # >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
    # ; fa_frame: f
    # ; fa_initn: 52
    # ; fa_init1: 52
    # ; fa_opt: 70
    # ; fa_z-score: 105.5
    # ; fa_bits: 27.5
    # ; fa_expect: 0.082
    # ; sw_score: 70
    # ; sw_ident: 0.279
    # ; sw_sim: 0.651
    # ; sw_overlap: 43
    match_descr = line[2:].strip()
    # Handle the following "alignment hit" tagged data.
    line = handle.readline()
    line = self._parse_tag_section(line, alignment_annotation)
    assert not line[0:2] == "; "

    # Then we have the alignment numbers and sequence for the query, e.g.
    #
    # >gi|10955265| ..
    # ; sq_len: 346
    # ; sq_offset: 1
    # ; sq_type: p
    # ; al_start: 197
    # ; al_stop: 238
    # ; al_display_start: 167
    # DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
    # QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
    # GEYFTENKPKYIIREIHQET
    #
    # startswith() is used instead of line[0] so that hitting end of file
    # here gives the ValueError below rather than an IndexError.
    if not (line.startswith(">") and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..'")
    assert self._query_descr.startswith(line[1:].split(None, 1)[0])

    # Handle the following "query alignment" tagged data.
    line = handle.readline()
    line = self._parse_tag_section(line, query_annotation)
    assert not line[0:2] == "; "

    # Now should have the aligned query sequence (with leading flanking
    # region).
    while not line.startswith(">"):
        if not line:
            raise ValueError("Premature end of file in query sequence")
        query_seq_parts.append(line.strip())
        line = handle.readline()

    # Handle the following "match alignment" data, e.g.
    #
    # >gi|152973545|ref|YP_001338596.1| ..
    # ; sq_len: 242
    # ; sq_type: p
    # ; al_start: 52
    # ; al_stop: 94
    # ; al_display_start: 22
    # IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
    # RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
    # QDFAFTRKMRREARQVEQSW
    #
    # Match identifier
    if not (line.startswith(">") and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..', got '%s'"
                         % repr(line))
    assert match_descr.startswith(line[1:].split(None, 1)[0])

    # Tagged data,
    line = handle.readline()
    line = self._parse_tag_section(line, match_annotation)
    assert not line[0:2] == "; "

    # Now should have the aligned match sequence with flanking region...
    # but after that, since FASTA 35.4.1 there can be a consensus, e.g.
    #
    # ; al_cons:
    # .::. : :. ---. :: :. . : ..-:::-: :.: ..:...:
    # etc
    while not (line[0:2] == "; " or line.startswith(">") or ">>>" in line):
        if not line:
            raise ValueError("Premature end of file in match sequence")
        match_seq_parts.append(line.strip())
        line = handle.readline()
    if line[0:2] == "; ":
        assert line.strip() == "; al_cons:"
        align_consensus_parts = []
        line = handle.readline()
        while not (line[0:2] == "; " or line.startswith(">")
                   or ">>>" in line):
            if not line:
                raise ValueError("Premature end of file in consensus")
            align_consensus_parts.append(line.strip())
            line = handle.readline()
        # If we do anything with this in future, must remove any flanking
        # region.  (Currently parsed but not used.)
        align_consensus = "".join(align_consensus_parts)
        del align_consensus_parts
        assert not line[0:2] == "; "
    else:
        align_consensus = None
    assert (line[0] == ">" or ">>>" in line)
    # Keep the line that ended this alignment for the next call.
    self._header = line

    # We built a list of strings and then joined them because
    # it is faster than appending to a string.
    query_seq = "".join(query_seq_parts)
    match_seq = "".join(match_seq_parts)
    del query_seq_parts, match_seq_parts
    # Note, query_seq and match_seq will usually be of different lengths,
    # apparently because in the m10 format leading gaps are added but not
    # trailing gaps!

    # Remove the flanking regions,
    query_align_seq = self._extract_alignment_region(query_seq,
                                                     query_annotation)
    match_align_seq = self._extract_alignment_region(match_seq,
                                                     match_annotation)
    # How can we do this for the (optional) consensus?

    # The "sq_offset" values can be specified with the -X command line
    # option.  They appear to just shift the origin used in the calculation
    # of the coordinates.

    if len(query_align_seq) != len(match_align_seq):
        raise ValueError("Problem parsing the alignment sequence coordinates, "
                         "following should be the same length but are not:\n"
                         "%s - len %i\n%s - len %i"
                         % (query_align_seq, len(query_align_seq),
                            match_align_seq, len(match_align_seq)))
    if "sw_overlap" in alignment_annotation:
        if int(alignment_annotation["sw_overlap"]) != len(query_align_seq):
            raise ValueError("Specified sw_overlap = %s does not match expected value %i"
                             % (alignment_annotation["sw_overlap"],
                                len(query_align_seq)))

    # TODO - Look at the "sq_type" to assign a sensible alphabet?
    alphabet = self.alphabet
    alignment = Alignment(alphabet)

    # TODO - Introduce an annotated alignment class?
    # For now, store the annotation in a new private property.  We want to
    # record both the query header tags and the alignment tags; on a key
    # clash the alignment tag wins because it is applied last.
    alignment._annotations = {}
    alignment._annotations.update(self._query_header_annotation)
    alignment._annotations.update(alignment_annotation)

    def add_row(descr, aligned_seq, name, annotation):
        # Append one row, then fix up the record the alignment made for it.
        # TODO - Once the alignment object gets an append method, use it.
        # (i.e. an add SeqRecord method)
        alignment.add_sequence(descr, aligned_seq)
        record = alignment.get_all_seqs()[-1]
        assert record.id == descr or record.description == descr
        # The identifier is the first word of the description, without any
        # surrounding commas.
        record.id = descr.split(None, 1)[0].strip(",")
        record.name = name
        record.annotations["original_length"] = int(annotation["sq_len"])

        # This is still a very crude way of dealing with the alphabet:
        # TODO - What if a specific alphabet has been requested?
        # TODO - Use an IUPAC alphabet?
        # TODO - Can FASTA output RNA?
        if alphabet == single_letter_alphabet and "sq_type" in annotation:
            if annotation["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif annotation["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in aligned_seq:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

    add_row(self._query_descr, query_align_seq, "query", query_annotation)
    add_row(match_descr, match_align_seq, "match", match_annotation)
    return alignment
# annotations: dictionary with further info, can't be set on initialization
seqrec = SeqRecord(Seq('mdstnvrsgmksrkkkpkttvidddddcmtcsacqsklvkisditkvsldyintmrgntlacaacgsslkllndfas', Bio.Alphabet.generic_protein),
                   id='P20994.1', name='P20994', description='Protein A19',
                   dbxrefs=['Pfam:PF05077', 'InterPro:IPR007769', 'DIP:2186N'])
seqrec.annotations['note'] = 'A simple note'
# Every print below uses the parenthesised single-argument form, which
# prints the same text on Python 2 and is also valid on Python 3.
print(seqrec)

# Sequence alignment data type: it stores an alignment, it does not compute one
from Bio.Align.Generic import Alignment
seq1 = 'MHQAIFIYQIGYPLKSGYIQSIRSPEYDNW'
seq2 = 'MH--IFIYQIGYALKSGYIQSIRSPEY-NW'
align = Alignment(Bio.Alphabet.Gapped(IUPAC.protein))  # instance of Alignment class
align.add_sequence('asp', seq1)
align.add_sequence('unk', seq2)
print(align)

# Alignment methods
# get_all_seqs: return all sequences in the alignment as a list of SeqRecord
for s in align.get_all_seqs():  # in align: (the same)
    # Formatted into one string so the output matches "print '->', s.seq".
    print('-> %s' % s.seq)
# get_seq_by_num(n): return only the selected sequence by index
print(str(align.get_seq_by_num(1)))  # Seq object
print(align[0])  # SeqRecord object
print(str(align[0].seq))
# get_alignment_length(): get length of alignment
print(align.get_alignment_length())
# get_column(n): return a string with all the letters in the n column
print(align.get_column(0))
print(align.get_column(2))

# AlignInfo module: to extract info from alignment objects
from Bio.Align import AlignInfo
# print_info_content function
# SummaryInfo, PSSM classes