def _format_fasta_like_records(generator, id_whitespace_replacement, description_newline_replacement, require_qual, lowercase=None): if ((id_whitespace_replacement is not None and '\n' in id_whitespace_replacement) or (description_newline_replacement is not None and '\n' in description_newline_replacement)): raise ValueError( "Newline character (\\n) cannot be used to replace whitespace in " "sequence IDs, nor to replace newlines in sequence descriptions.") for idx, seq in enumerate(generator): if len(seq) < 1: raise ValueError( "%s sequence does not contain any characters (i.e., it is an " "empty/blank sequence). Writing empty sequences is not " "supported." % cardinal_to_ordinal(idx + 1)) if 'id' in seq.metadata: id_ = seq.metadata['id'] else: id_ = '' if id_whitespace_replacement is not None: id_ = _whitespace_regex.sub(id_whitespace_replacement, id_) if 'description' in seq.metadata: desc = seq.metadata['description'] else: desc = '' if description_newline_replacement is not None: desc = _newline_regex.sub(description_newline_replacement, desc) if desc: header = '%s %s' % (id_, desc) else: header = id_ if require_qual and 'quality' not in seq.positional_metadata: raise ValueError( "Cannot write %s sequence because it does not have quality " "scores associated with it." % cardinal_to_ordinal(idx + 1)) qual = None if 'quality' in seq.positional_metadata: qual = seq.positional_metadata['quality'].values if lowercase is not None: if hasattr(seq, 'lowercase'): seq_str = seq.lowercase(lowercase) else: raise AttributeError("lowercase specified but class %s does " "not support lowercase functionality" % seq.__class__.__name__) else: seq_str = str(seq) yield header, seq_str, qual
def test_valid_range(self):
    # taken and modified from http://stackoverflow.com/a/20007730/3776794
    # Inputs are every integer from 0 through 32 plus three larger values;
    # the expected ordinals below are listed in the same order.
    numbers = list(range(0, 33)) + [100, 101, 42042]
    exp = [
        '0th', '1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th',
        '10th', '11th', '12th', '13th', '14th', '15th', '16th', '17th',
        '18th', '19th',
        '20th', '21st', '22nd', '23rd', '24th', '25th', '26th', '27th',
        '28th', '29th',
        '30th', '31st', '32nd',
        '100th', '101st', '42042nd',
    ]
    obs = list(map(cardinal_to_ordinal, numbers))
    self.assertEqual(obs, exp)
def _format_fasta_like_records(generator, id_whitespace_replacement, description_newline_replacement, require_qual): if (id_whitespace_replacement is not None and "\n" in id_whitespace_replacement) or ( description_newline_replacement is not None and "\n" in description_newline_replacement ): raise ValueError( "Newline character (\\n) cannot be used to replace whitespace in " "sequence IDs, nor to replace newlines in sequence descriptions." ) for idx, seq in enumerate(generator): if len(seq) < 1: raise ValueError( "%s sequence does not contain any characters (i.e., it is an " "empty/blank sequence). Writing empty sequences is not " "supported." % cardinal_to_ordinal(idx + 1) ) id_ = seq.id if id_whitespace_replacement is not None: id_ = _whitespace_regex.sub(id_whitespace_replacement, id_) desc = seq.description if description_newline_replacement is not None: desc = _newline_regex.sub(description_newline_replacement, desc) if desc: header = "%s %s" % (id_, desc) else: header = id_ if require_qual and seq.quality is None: raise ValueError( "Cannot write %s sequence because it does not have quality " "scores associated with it." % cardinal_to_ordinal(idx + 1) ) yield header, str(seq), seq.quality
def _format_fasta_like_records(generator, id_whitespace_replacement, description_newline_replacement, require_qual): if ((id_whitespace_replacement is not None and '\n' in id_whitespace_replacement) or (description_newline_replacement is not None and '\n' in description_newline_replacement)): raise ValueError( "Newline character (\\n) cannot be used to replace whitespace in " "sequence IDs, nor to replace newlines in sequence descriptions.") for idx, seq in enumerate(generator): if len(seq) < 1: raise ValueError( "%s sequence does not contain any characters (i.e., it is an " "empty/blank sequence). Writing empty sequences is not " "supported." % cardinal_to_ordinal(idx + 1)) id_ = seq.id if id_whitespace_replacement is not None: id_ = _whitespace_regex.sub(id_whitespace_replacement, id_) desc = seq.description if description_newline_replacement is not None: desc = _newline_regex.sub(description_newline_replacement, desc) if desc: header = '%s %s' % (id_, desc) else: header = id_ if require_qual and not seq.has_quality(): raise ValueError( "Cannot write %s sequence because it does not have quality " "scores associated with it." % cardinal_to_ordinal(idx + 1)) yield header, seq.sequence, seq.quality
def _get_nth_sequence(generator, seq_num): # i is set to None so that an empty generator will not result in an # undefined variable when compared to seq_num. i = None if seq_num is None or seq_num < 1: raise ValueError('Invalid sequence number (`seq_num`=%s). `seq_num`' ' must be between 1 and the number of sequences in' ' the file.' % str(seq_num)) try: for i, seq in zip(range(1, seq_num + 1), generator): pass finally: generator.close() if i == seq_num: return seq raise ValueError('Reached end of file before finding the %s sequence.' % cardinal_to_ordinal(seq_num))
def test_valid_range(self):
    # taken and modified from http://stackoverflow.com/a/20007730/3776794
    # Inputs are every integer from 0 through 32 plus three larger values;
    # the expected ordinals are grouped to line up with those inputs.
    small_numbers = list(range(0, 33))
    large_numbers = [100, 101, 42042]

    exp = [
        "0th", "1st", "2nd", "3rd", "4th", "5th", "6th", "7th", "8th", "9th",
        "10th", "11th", "12th", "13th", "14th", "15th", "16th", "17th",
        "18th", "19th",
        "20th", "21st", "22nd", "23rd", "24th", "25th", "26th", "27th",
        "28th", "29th",
        "30th", "31st", "32nd",
    ] + ["100th", "101st", "42042nd"]

    obs = []
    for number in small_numbers + large_numbers:
        obs.append(cardinal_to_ordinal(number))

    self.assertEqual(obs, exp)
def _generator_to_fasta(obj, fh, id_whitespace_replacement='_', description_newline_replacement=' ', max_width=None): if ((id_whitespace_replacement is not None and '\n' in id_whitespace_replacement) or (description_newline_replacement is not None and '\n' in description_newline_replacement)): raise FASTAFormatError( "Newline character (\\n) cannot be used to replace whitespace in " "biological sequence IDs, nor to replace newlines in biological " "sequence descriptions. Otherwise, the FASTA-formatted file will " "be invalid.") ws_pattern = re.compile(r'\s') nl_pattern = re.compile(r'\n') for idx, seq in enumerate(obj): if len(seq) < 1: raise FASTAFormatError( "Cannot write %s biological sequence in FASTA format because " "it does not contain any characters (i.e., it is an " "empty/blank sequence). Empty sequences are not supported in " "the FASTA file format." % cardinal_to_ordinal(idx + 1)) id_ = seq.id if id_whitespace_replacement is not None: id_ = re.sub(ws_pattern, id_whitespace_replacement, id_) desc = seq.description if description_newline_replacement is not None: desc = re.sub(nl_pattern, description_newline_replacement, desc) if desc: header = '%s %s' % (id_, desc) else: header = id_ seq_str = str(seq) if max_width is not None: seq_str = _chunk_str(seq_str, max_width, '\n') fh.write('>%s\n%s\n' % (header, seq_str))
def _fasta_to_sequence(fh, seq_num, constructor): if seq_num < 1: raise FASTAFormatError( "Invalid sequence number (seq_num=%d). seq_num must be between 1 " "and the number of sequences in the FASTA-formatted file " "(inclusive)." % seq_num) seq_idx = seq_num - 1 seq = None try: gen = _fasta_to_generator(fh, constructor=constructor) for idx, curr_seq in enumerate(gen): if idx == seq_idx: seq = curr_seq break finally: gen.close() if seq is None: raise FASTAFormatError( "Reached end of FASTA-formatted file before finding %s biological " "sequence." % cardinal_to_ordinal(seq_num)) return seq
def test_invalid_n(self):
    # Calling cardinal_to_ordinal with -1 must raise a ValueError whose
    # message matches the pattern '-1'. The callable form of the assertion
    # is used here in place of the context-manager form.
    six.assertRaisesRegex(self, ValueError, '-1', cardinal_to_ordinal, -1)
def test_invalid_n(self):
    # Calling cardinal_to_ordinal with -1 must raise a ValueError whose
    # message matches the pattern "-1".
    # NOTE: ``assertRaisesRegexp`` is a deprecated alias that was removed in
    # Python 3.12; ``assertRaisesRegex`` is the supported spelling.
    with self.assertRaisesRegex(ValueError, "-1"):
        cardinal_to_ordinal(-1)