def _alignment_to_phylip(obj, fh):
    """Write the alignment ``obj`` to ``fh`` in PHYLIP format.

    The output starts with a header line holding the sequence count and the
    sequence length separated by a space. Each following line holds one
    sequence: its ID (taken from ``seq.metadata['id']``) padded to ten
    characters, then the sequence text passed through ``_chunk_str`` with a
    chunk size of ten and a single-space separator.

    Parameters
    ----------
    obj
        Alignment-like object providing ``is_empty()``,
        ``sequence_length()``, ``ids()``, ``sequence_count()`` and iteration
        over its sequences.
    fh
        Object with a ``write`` method that receives the formatted text.

    Raises
    ------
    PhylipFormatError
        If the alignment has no sequences, has no positions, or contains a
        sequence ID longer than ten characters.
    """
    reason_prefix = "Alignment can only be written in PHYLIP format if "

    if obj.is_empty():
        raise PhylipFormatError(
            reason_prefix + "there is at least one sequence in the alignment.")

    n_positions = obj.sequence_length()
    if n_positions == 0:
        raise PhylipFormatError(
            reason_prefix + "there is at least one position in the alignment.")

    # The same width is used both as the ID column width and as the size of
    # the chunks the sequence text is split into.
    id_width = 10

    # Stop at the first ID that does not fit in the fixed-width ID column.
    too_long = next(
        (name for name in obj.ids() if len(name) > id_width), None)
    if too_long is not None:
        raise PhylipFormatError(
            reason_prefix + (
                "all sequence IDs have %d or fewer characters. Found "
                "sequence with ID '%s' that exceeds this limit. Use "
                "Alignment.update_ids to assign shorter IDs."
                % (id_width, too_long)))

    n_sequences = obj.sequence_count()
    fh.write('{count:d} {length:d}\n'.format(count=n_sequences,
                                             length=n_positions))

    row_template = '{0:%d}{1}\n' % id_width
    for record in obj:
        spaced_residues = _chunk_str(str(record), id_width, ' ')
        fh.write(row_template.format(record.metadata['id'], spaced_residues))
def _alignment_to_phylip(obj, fh):
    """Serialize the alignment ``obj`` into ``fh`` using PHYLIP layout.

    A header line with the number of sequences and the number of positions
    is written first. Every sequence then gets one line made of its ``id``
    attribute padded to ten characters, followed by the sequence text as
    returned by ``_chunk_str`` for a chunk size of ten and a ``' '``
    separator.

    Parameters
    ----------
    obj
        Alignment-like object exposing ``is_empty()``,
        ``sequence_length()``, ``ids()``, ``sequence_count()`` and iteration
        over sequences that have an ``id`` attribute.
    fh
        Writable object; only its ``write`` method is used.

    Raises
    ------
    PhylipFormatError
        When there are no sequences, when the sequences have zero positions,
        or when any sequence ID exceeds ten characters.
    """
    # Validate everything up front so nothing is written for bad input.
    if obj.is_empty():
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if there is "
            "at least one sequence in the alignment.")

    position_count = obj.sequence_length()
    if position_count == 0:
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if there is "
            "at least one position in the alignment.")

    # Shared by the ID column width and the sequence chunk length.
    width = 10
    for label in obj.ids():
        if len(label) <= width:
            continue
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if all sequence "
            "IDs have %d or fewer characters. Found sequence with ID '%s' "
            "that exceeds this limit. Use Alignment.update_ids to assign "
            "shorter IDs." % (width, label))

    header = '{0:d} {1:d}\n'.format(obj.sequence_count(), position_count)
    fh.write(header)

    line_format = '{0:%d}{1}\n' % width
    for entry in obj:
        body = _chunk_str(str(entry), width, ' ')
        fh.write(line_format.format(entry.id, body))
def _generator_to_fasta(obj, fh, qual=FileSentinel,
                        id_whitespace_replacement='_',
                        description_newline_replacement=' ', max_width=None):
    """Write FASTA records to ``fh`` and, optionally, quality scores to ``qual``.

    Records are obtained from ``_format_fasta_like_records`` as
    ``(header, seq_str, qual_scores)`` tuples. Each record is written to
    ``fh`` as ``>header`` followed by the sequence text. When ``qual`` is not
    ``None``, the quality scores are written to ``qual`` under the same
    header as a space-separated list.

    Parameters
    ----------
    obj
        Source of sequences, forwarded to ``_format_fasta_like_records``.
    fh
        Writable object receiving the FASTA text.
    qual
        Writable object receiving the QUAL text, or ``None`` to skip quality
        output. NOTE(review): the default is ``FileSentinel``, which is not
        ``None``; presumably the caller substitutes a real file handle or
        ``None`` before this function runs -- confirm against the I/O
        registry.
    id_whitespace_replacement, description_newline_replacement
        Forwarded unchanged to ``_format_fasta_like_records``.
    max_width : int, optional
        Maximum line width. When given, the sequence text is passed through
        ``_chunk_str`` with a newline separator and the quality string is
        wrapped with ``textwrap`` without ever splitting a single score.

    Raises
    ------
    ValueError
        If ``max_width`` is given and is smaller than one.
    """
    if max_width is not None:
        if max_width < 1:
            raise ValueError(
                "Maximum line width must be greater than zero (max_width=%d)."
                % max_width)
        if qual is not None:
            # Build the wrapper once: the textwrap docs recommend reusing a
            # TextWrapper instance when it is applied many times. It is
            # configured to never break "words" (i.e. integer quality
            # scores) across lines.
            qual_wrapper = textwrap.TextWrapper(
                width=max_width, break_long_words=False,
                break_on_hyphens=False)

    formatted_records = _format_fasta_like_records(
        obj, id_whitespace_replacement, description_newline_replacement,
        qual is not None)
    for header, seq_str, qual_scores in formatted_records:
        if max_width is not None:
            seq_str = _chunk_str(seq_str, max_width, '\n')

        fh.write('>%s\n%s\n' % (header, seq_str))

        if qual is not None:
            # ``np.str`` was only an alias of the builtin ``str``; the alias
            # was deprecated in NumPy 1.20 and removed in 1.24, so use the
            # builtin directly.
            qual_str = ' '.join(np.asarray(qual_scores, dtype=str))
            if max_width is not None:
                qual_str = qual_wrapper.fill(qual_str)
            qual.write('>%s\n%s\n' % (header, qual_str))
def test_even_split(self):
    # Inputs whose length is an exact multiple of the chunk size, plus an
    # empty separator that must leave the text unchanged.
    cases = [
        (('abcdef', 6, ' '), 'abcdef'),
        (('abcdef', 3, ' '), 'abc def'),
        (('abcdef', 2, ' '), 'ab cd ef'),
        (('abcdef', 1, ' '), 'a b c d e f'),
        (('a', 1, ' '), 'a'),
        (('abcdef', 2, ''), 'abcdef'),
    ]
    for args, expected in cases:
        self.assertEqual(_chunk_str(*args), expected)
def _generator_to_fasta(obj, fh, id_whitespace_replacement='_', description_newline_replacement=' ', max_width=None): if ((id_whitespace_replacement is not None and '\n' in id_whitespace_replacement) or (description_newline_replacement is not None and '\n' in description_newline_replacement)): raise FASTAFormatError( "Newline character (\\n) cannot be used to replace whitespace in " "biological sequence IDs, nor to replace newlines in biological " "sequence descriptions. Otherwise, the FASTA-formatted file will " "be invalid.") ws_pattern = re.compile(r'\s') nl_pattern = re.compile(r'\n') for idx, seq in enumerate(obj): if len(seq) < 1: raise FASTAFormatError( "Cannot write %s biological sequence in FASTA format because " "it does not contain any characters (i.e., it is an " "empty/blank sequence). Empty sequences are not supported in " "the FASTA file format." % cardinal_to_ordinal(idx + 1)) id_ = seq.id if id_whitespace_replacement is not None: id_ = re.sub(ws_pattern, id_whitespace_replacement, id_) desc = seq.description if description_newline_replacement is not None: desc = re.sub(nl_pattern, description_newline_replacement, desc) if desc: header = '%s %s' % (id_, desc) else: header = id_ seq_str = str(seq) if max_width is not None: seq_str = _chunk_str(seq_str, max_width, '\n') fh.write('>%s\n%s\n' % (header, seq_str))
def test_invalid_n(self):
    # A chunk size below one must be rejected with a ValueError whose
    # message reports the offending value.
    #
    # ``assertRaisesRegexp`` is a deprecated alias that was removed in
    # Python 3.12. Checking the message with ``assertRaises`` + ``assertIn``
    # works on every Python version and is equivalent here because the
    # expected fragments contain no regex metacharacters.
    for bad_n, fragment in ((0, 'n=0'), (-42, 'n=-42')):
        with self.assertRaises(ValueError) as caught:
            _chunk_str('abcdef', bad_n, ' ')
        self.assertIn(fragment, str(caught.exception))
def test_uneven_split(self):
    # Inputs whose length is not a multiple of the chunk size: the last
    # chunk is shorter, and multi-character separators are supported.
    expectations = (
        ('abcdef', 5, '|', 'abcde|f'),
        ('abcdef', 4, '|', 'abcd|ef'),
        ('abcdefg', 3, ' - ', 'abc - def - g'),
    )
    for text, size, separator, wanted in expectations:
        self.assertEqual(_chunk_str(text, size, separator), wanted)
def test_no_split(self):
    # Inputs no longer than the chunk size (including the empty string)
    # must come back unchanged, with no separator inserted.
    unchanged = (
        ('', 2, '\n'),
        ('a', 100, '\n'),
        ('abcdef', 42, '|'),
    )
    for text, size, separator in unchanged:
        self.assertEqual(_chunk_str(text, size, separator), text)