def _generator_to_fasta(obj, fh, qual=FileSentinel,
                        id_whitespace_replacement='_',
                        description_newline_replacement=' ', max_width=None,
                        lowercase=None):
    """Write FASTA-style records (and optionally quality scores) to files.

    Records are obtained from ``_format_fasta_like_records`` as
    ``(header, sequence string, quality scores)`` triples. Each one is written
    to ``fh`` as ``>header``, a newline, the sequence and a trailing newline.
    When ``qual`` is not ``None`` the quality scores are written to ``qual``
    in the same layout, as space-separated values.

    Parameters
    ----------
    obj
        Source of sequences; passed through unchanged to
        ``_format_fasta_like_records``.
    fh
        Object with a ``write`` method that receives the sequence records.
    qual
        Object with a ``write`` method that receives the quality records, or
        ``None`` to skip quality output.
        NOTE(review): the ``FileSentinel`` default is presumably resolved to a
        file handle or ``None`` by the caller before this runs -- confirm.
    id_whitespace_replacement, description_newline_replacement, lowercase
        Passed through unchanged to ``_format_fasta_like_records``.
    max_width : int or None
        If given, sequence data is split with ``chunk_str`` and quality data
        is wrapped so that whole scores are never broken across lines.

    Raises
    ------
    ValueError
        If ``max_width`` is given and is smaller than 1.
    """
    write_qual = qual is not None
    wrap_lines = max_width is not None

    qual_wrapper = None
    if wrap_lines:
        if max_width < 1:
            raise ValueError(
                "Maximum line width must be greater than zero (max_width=%d)."
                % max_width)
        if write_qual:
            # Build the wrapper once and reuse it for every record; the
            # textwrap docs recommend this when wrapping many strings. It is
            # configured to never split a "word" (an integer quality score)
            # across lines.
            qual_wrapper = textwrap.TextWrapper(
                width=max_width, break_long_words=False,
                break_on_hyphens=False)

    formatted_records = _format_fasta_like_records(
        obj, id_whitespace_replacement, description_newline_replacement,
        write_qual, lowercase)
    for header, seq_str, qual_scores in formatted_records:
        if wrap_lines:
            seq_str = chunk_str(seq_str, max_width, '\n')
        fh.write('>%s\n%s\n' % (header, seq_str))

        if write_qual:
            # Use the builtin ``str`` as dtype: the ``np.str`` alias was
            # deprecated in NumPy 1.20 and removed in 1.24.
            qual_str = ' '.join(np.asarray(qual_scores, dtype=str))
            if wrap_lines:
                qual_str = qual_wrapper.fill(qual_str)
            qual.write('>%s\n%s\n' % (header, qual_str))
def test_empty_sequence(self):
    """An empty record in the input must surface as a ``ValueError``.

    The second of the three generated sequences has no characters, and the
    error message is expected to match ``2nd.*empty``.
    """
    def blank_seq_gen():
        for seq in (DNA('A'), Sequence(''), RNA('GG')):
            yield seq

    with self.assertRaisesRegex(ValueError, r'2nd.*empty'):
        formatted = _format_fasta_like_records(
            blank_seq_gen(), None, None, False)
        list(formatted)
def test_empty_str_replacement(self):
    """Compare formatted records when both replacement strings are empty."""
    expected = [
        ("", "ACGT", range(4)),
        ("foobar", "GAU", None),
        (" foo bar", "TAG", None),
        ("foo bar baz", "A", [42]),
    ]
    observed = list(_format_fasta_like_records(self.gen, "", "", False))

    # Check the lengths first: zip() would silently drop unmatched records.
    self.assertEqual(len(observed), len(expected))
    for obs_record, exp_record in zip(observed, expected):
        npt.assert_equal(obs_record, exp_record)
def _generator_to_fasta(obj, fh, qual=FileSentinel,
                        id_whitespace_replacement='_',
                        description_newline_replacement=' ', max_width=None,
                        lowercase=None):
    """Write sequence records to ``fh`` and, optionally, scores to ``qual``.

    ``_format_fasta_like_records`` supplies ``(header, seq_str, qual_scores)``
    triples. For each triple, ``'>header\\nsequence\\n'`` is written to ``fh``.
    If ``qual`` is not ``None``, ``'>header\\nscores\\n'`` is written to
    ``qual``, where the scores are joined with single spaces.

    Parameters
    ----------
    obj
        Forwarded as-is to ``_format_fasta_like_records``.
    fh
        Writable object receiving the sequence records.
    qual
        Writable object receiving the quality records; ``None`` disables
        quality output.
        NOTE(review): presumably the ``FileSentinel`` default is swapped for
        a real handle or ``None`` upstream -- verify against the caller.
    id_whitespace_replacement, description_newline_replacement, lowercase
        Forwarded as-is to ``_format_fasta_like_records``.
    max_width : int or None
        Optional maximum line width. Sequences are split with ``chunk_str``;
        quality lines are wrapped without breaking individual scores.

    Raises
    ------
    ValueError
        If ``max_width`` is not ``None`` and is less than 1.
    """
    has_qual = qual is not None
    limit_width = max_width is not None

    if limit_width and max_width < 1:
        raise ValueError(
            "Maximum line width must be greater than zero (max_width=%d)."
            % max_width)

    qual_wrapper = None
    if limit_width and has_qual:
        # One TextWrapper is created up front and reused for all records, as
        # the textwrap docs recommend. Long "words" (integer quality scores)
        # and hyphens are never used as break points.
        qual_wrapper = textwrap.TextWrapper(width=max_width,
                                            break_long_words=False,
                                            break_on_hyphens=False)

    formatted_records = _format_fasta_like_records(
        obj, id_whitespace_replacement, description_newline_replacement,
        has_qual, lowercase)

    for header, seq_str, qual_scores in formatted_records:
        if limit_width:
            seq_str = chunk_str(seq_str, max_width, '\n')
        fh.write('>%s\n%s\n' % (header, seq_str))

        if not has_qual:
            continue

        # ``dtype=str`` replaces ``dtype=np.str``: the alias was only ever the
        # builtin ``str`` and no longer exists in NumPy >= 1.24.
        qual_str = ' '.join(np.asarray(qual_scores, dtype=str))
        if limit_width:
            qual_str = qual_wrapper.fill(qual_str)
        qual.write('>%s\n%s\n' % (header, qual_str))
def test_empty_sequence(self):
    """Formatting must fail with ``ValueError`` on an empty sequence.

    The generator below yields three sequences of which the second is empty;
    the raised message has to match ``2nd.*empty``.
    """
    def seqs_with_blank_second():
        records = (DNA('A'), Sequence(''), RNA('GG'))
        yield from records

    pattern = '2nd.*empty'
    with self.assertRaisesRegex(ValueError, pattern):
        list(_format_fasta_like_records(
            seqs_with_blank_second(), None, None, False))
def test_empty_str_replacement(self):
    """Empty replacement strings: observed records must equal the expected."""
    want = [('', 'ACGT', range(4)),
            ('foobar', 'GAU', None),
            (' foo bar', 'TAG', None),
            ('foo bar baz', 'A', [42])]
    got = list(_format_fasta_like_records(self.gen, '', '', False))

    self.assertEqual(len(got), len(want))
    # Records are compared pairwise, in order.
    for pair in zip(got, want):
        npt.assert_equal(*pair)
def test_multi_char_replacement(self):
    """Compare formatted records when replacements are multi-character.

    ``'-.-'`` is passed as the ID whitespace replacement and ``'_-_'`` as the
    description newline replacement.
    """
    expected = [
        ('', 'ACGT', range(4)),
        ('-.--.-foo-.--.--.--.-bar-.-', 'GAU', None),
        (' foo_-__-_ bar_-_', 'TAG', None),
        ('foo bar baz', 'A', [42]),
    ]
    observed = list(
        _format_fasta_like_records(self.gen, '-.-', '_-_', False))

    # Check the lengths first: zip() would silently drop unmatched records.
    self.assertEqual(len(observed), len(expected))
    for obs_record, exp_record in zip(observed, expected):
        npt.assert_equal(obs_record, exp_record)
def test_missing_quality_scores(self):
    """A record without quality scores must raise when scores are required.

    The second generated sequence carries no ``quality`` positional metadata,
    and the formatter is called with its fourth argument set to ``True``.
    """
    def missing_qual_gen():
        seqs = (
            RNA('A', positional_metadata={'quality': [42]}),
            Sequence('AG'),
            DNA('GG', positional_metadata={'quality': [41, 40]}),
        )
        for seq in seqs:
            yield seq

    with self.assertRaisesRegex(ValueError,
                                r'2nd sequence.*quality scores'):
        formatted = _format_fasta_like_records(
            missing_qual_gen(), '-', '-', True)
        list(formatted)
def test_missing_quality_scores(self):
    """Expect ``ValueError`` when the 2nd sequence has no quality scores."""
    def seqs_missing_second_quality():
        with_qual_first = RNA('A', positional_metadata={'quality': [42]})
        without_qual = Sequence('AG')
        with_qual_last = DNA('GG',
                             positional_metadata={'quality': [41, 40]})
        yield from (with_qual_first, without_qual, with_qual_last)

    pattern = '2nd sequence.*quality scores'
    with self.assertRaisesRegex(ValueError, pattern):
        list(_format_fasta_like_records(
            seqs_missing_second_quality(), '-', '-', True))
def test_missing_quality_scores(self):
    """Requesting quality output for a sequence lacking scores must fail.

    Only the middle sequence of the three is created without ``quality``
    positional metadata; the error message must match
    ``2nd sequence.*quality scores``.
    """
    def missing_qual_gen():
        records = [
            RNA("A", positional_metadata={"quality": [42]}),
            Sequence("AG"),
            DNA("GG", positional_metadata={"quality": [41, 40]}),
        ]
        yield from records

    expected_message = "2nd sequence.*quality scores"
    with self.assertRaisesRegex(ValueError, expected_message):
        formatted = _format_fasta_like_records(
            missing_qual_gen(), "-", "-", True
        )
        list(formatted)
def test_empty_str_replacement(self):
    """Formatted records for empty-string replacements match expectations."""
    expected_records = [
        ('', 'ACGT', range(4)),
        ('foobar', 'GAU', None),
        (' foo bar', 'TAG', None),
        ('foo bar baz', 'A', [42])
    ]
    formatted = _format_fasta_like_records(self.gen, '', '', False)
    observed_records = list(formatted)

    self.assertEqual(len(observed_records), len(expected_records))
    for observed, expected in zip(observed_records, expected_records):
        npt.assert_equal(observed, expected)
def test_multi_char_replacement(self):
    """Multi-character replacement strings yield the expected records."""
    want = [
        ("", "ACGT", range(4)),
        ("-.--.-foo-.--.--.--.-bar-.-", "GAU", None),
        (" foo_-__-_ bar_-_", "TAG", None),
        ("foo bar baz", "A", [42]),
    ]
    got = list(_format_fasta_like_records(self.gen, "-.-", "_-_", False))

    self.assertEqual(len(got), len(want))
    # Records are compared pairwise, in order.
    for pair in zip(got, want):
        npt.assert_equal(*pair)
def test_multi_char_replacement(self):
    """Check formatter output for ``'-.-'`` / ``'_-_'`` replacements."""
    id_replacement = '-.-'
    description_replacement = '_-_'
    expected_records = [
        ('', 'ACGT', range(4)),
        ('-.--.-foo-.--.--.--.-bar-.-', 'GAU', None),
        (' foo_-__-_ bar_-_', 'TAG', None),
        ('foo bar baz', 'A', [42])
    ]
    formatted = _format_fasta_like_records(
        self.gen, id_replacement, description_replacement, False)
    observed_records = list(formatted)

    self.assertEqual(len(observed_records), len(expected_records))
    for observed, expected in zip(observed_records, expected_records):
        npt.assert_equal(observed, expected)
def _generator_to_fastq(obj, fh, variant=None, phred_offset=None,
                        id_whitespace_replacement='_',
                        description_newline_replacement=' ', lowercase=None):
    """Write the records derived from ``obj`` to ``fh`` as four-line entries.

    Each record from ``_format_fasta_like_records`` (called with quality
    output enabled) is written as ``@header``, the sequence string, a ``+``
    line, and the quality string returned by ``_encode_phred_to_qual``.

    Parameters
    ----------
    obj
        Forwarded as-is to ``_format_fasta_like_records``.
    fh
        Object with a ``write`` method that receives the output.
    variant, phred_offset
        Forwarded as-is to ``_encode_phred_to_qual``.
    id_whitespace_replacement, description_newline_replacement, lowercase
        Forwarded as-is to ``_format_fasta_like_records``.
    """
    records = _format_fasta_like_records(
        obj, id_whitespace_replacement, description_newline_replacement,
        True, lowercase=lowercase)

    for header, seq_str, qual_scores in records:
        # Encode before writing anything so that an encoding failure does
        # not leave a partially written record behind.
        encoded_qual = _encode_phred_to_qual(
            qual_scores, variant=variant, phred_offset=phred_offset)

        # Pieces are written one by one instead of being formatted into a
        # single string first.
        pieces = ('@', header, '\n', seq_str, '\n+\n', encoded_qual, '\n')
        for piece in pieces:
            fh.write(piece)
def test_empty_sequence(self):
    """An empty second sequence must be rejected with ``ValueError``."""
    def blank_seq_gen():
        first, blank, last = DNA("A"), Sequence(""), RNA("GG")
        yield from (first, blank, last)

    expected_message = "2nd.*empty"
    with self.assertRaisesRegex(ValueError, expected_message):
        formatted = _format_fasta_like_records(
            blank_seq_gen(), None, None, False
        )
        list(formatted)
def test_newline_character_in_description_newline_replacement(self):
    """A description replacement containing a newline must be rejected."""
    bad_replacement = "a\nb"
    with self.assertRaisesRegex(ValueError, "Newline character"):
        formatted = _format_fasta_like_records(
            self.gen, None, bad_replacement, False
        )
        list(formatted)
def test_newline_character_in_id_whitespace_replacement(self):
    """An ID whitespace replacement containing a newline must be rejected."""
    bad_replacement = "-\n--"
    with self.assertRaisesRegex(ValueError, "Newline character"):
        formatted = _format_fasta_like_records(
            self.gen, bad_replacement, " ", False
        )
        list(formatted)
def test_newline_character_in_description_newline_replacement(self):
    """Expect ``ValueError`` for a newline in the description replacement."""
    replacement_with_newline = 'a\nb'
    expected_message = 'Newline character'
    with six.assertRaisesRegex(self, ValueError, expected_message):
        list(_format_fasta_like_records(
            self.gen, None, replacement_with_newline, False))
def test_newline_character_in_id_whitespace_replacement(self):
    """Expect ``ValueError`` for a newline in the ID whitespace replacement."""
    replacement_with_newline = '-\n--'
    expected_message = 'Newline character'
    with six.assertRaisesRegex(self, ValueError, expected_message):
        list(_format_fasta_like_records(
            self.gen, replacement_with_newline, ' ', False))