def _fastq_to_generator(fh, variant=None, phred_offset=None,
                        constructor=Sequence, **kwargs):
    """Lazily yield one ``constructor`` object per FASTQ record read from fh.

    Parameters
    ----------
    fh
        Source of lines; handed to ``_line_generator``,
        ``_parse_sequence_data`` and ``_parse_quality_scores``.
    variant, phred_offset
        Forwarded unchanged to ``_parse_quality_scores``.
    constructor : callable, optional
        Called once per record as ``constructor(seq, metadata=...,
        positional_metadata=..., **kwargs)``.
    **kwargs
        Extra keyword arguments forwarded to ``constructor``.

    Yields
    ------
    object
        Whatever ``constructor`` returns. It receives the sequence data, a
        ``metadata`` dict with keys ``'id'`` and ``'description'``, and a
        ``positional_metadata`` dict with key ``'quality'``.

    Raises
    ------
    FASTQFormatError
        If the first non-blank line does not start with ``@``, or if a
        quality header is neither a bare ``+`` nor ``+`` followed by the
        same text as the sequence header.

    """
    # Skip any blank or whitespace-only lines at beginning of file. If the
    # input is empty (or entirely blank) there is nothing to read: end the
    # generator cleanly. Without this guard the StopIteration raised by
    # next() would escape the generator body and be converted into a
    # RuntimeError (PEP 479).
    try:
        seq_header = next(_line_generator(fh, skip_blanks=True))
    except StopIteration:
        return

    if not seq_header.startswith('@'):
        raise FASTQFormatError(
            "Expected sequence (@) header line at start of file: %r"
            % str(seq_header))

    # _parse_quality_scores hands back the next record's header line; the
    # loop stops once it reports None instead of a header.
    while seq_header is not None:
        id_, desc = _parse_fasta_like_header(seq_header)
        seq, qual_header = _parse_sequence_data(fh, seq_header)

        # The quality header may be a bare '+'; otherwise everything after
        # its first character must repeat the sequence header's text.
        if qual_header != '+' and qual_header[1:] != seq_header[1:]:
            raise FASTQFormatError(
                "Sequence (@) and quality (+) header lines do not match: "
                "%r != %r" % (str(seq_header[1:]), str(qual_header[1:])))

        phred_scores, seq_header = _parse_quality_scores(
            fh, len(seq), variant, phred_offset, qual_header)
        yield constructor(seq, metadata={'id': id_, 'description': desc},
                          positional_metadata={'quality': phred_scores},
                          **kwargs)
def _fastq_to_generator(fh, variant=None, phred_offset=None,
                        constructor=Sequence, **kwargs):
    """Generate one object per FASTQ record found in ``fh``.

    Each record is passed to ``constructor`` together with its parsed ID,
    description and quality scores.

    Parameters
    ----------
    fh
        Source of lines; passed to ``_line_generator``,
        ``_parse_sequence_data`` and ``_parse_quality_scores``.
    variant, phred_offset
        Passed through untouched to ``_parse_quality_scores``.
    constructor : callable, optional
        Invoked as ``constructor(seq, metadata=..., positional_metadata=...,
        **kwargs)`` for every record.
    **kwargs
        Additional keyword arguments handed to ``constructor``.

    Yields
    ------
    object
        The return value of ``constructor`` for each record.

    Raises
    ------
    FASTQFormatError
        When the first non-blank line is not an ``@`` header, or when a
        quality header is not a bare ``+`` and its text after the first
        character differs from the sequence header's.

    """
    # Skip any blank or whitespace-only lines at beginning of file.
    # An empty or all-blank input produces no records. Catch StopIteration
    # here so the generator simply finishes; letting it propagate out of a
    # generator body triggers RuntimeError under PEP 479.
    try:
        seq_header = next(_line_generator(fh, skip_blanks=True))
    except StopIteration:
        return

    if not seq_header.startswith('@'):
        raise FASTQFormatError(
            "Expected sequence (@) header line at start of file: %r"
            % str(seq_header))

    # seq_header is refreshed by _parse_quality_scores on every pass and is
    # None once no further record header is available.
    while seq_header is not None:
        id_, desc = _parse_fasta_like_header(seq_header)
        seq, qual_header = _parse_sequence_data(fh, seq_header)

        # A quality header is valid if it is exactly '+', or if its text
        # after the leading character matches the sequence header's text.
        if qual_header != '+' and qual_header[1:] != seq_header[1:]:
            raise FASTQFormatError(
                "Sequence (@) and quality (+) header lines do not match: "
                "%r != %r" % (str(seq_header[1:]), str(qual_header[1:])))

        phred_scores, seq_header = _parse_quality_scores(
            fh, len(seq), variant, phred_offset, qual_header)
        yield constructor(
            seq,
            metadata={
                'id': id_,
                'description': desc,
            },
            positional_metadata={'quality': phred_scores},
            **kwargs)
def _parse_fasta_raw(fh, data_parser, error_type):
    """Yield raw ``(data, id, description)`` records from a FASTA/QUAL file.

    Only raw values are produced; wrapping them in a suitable in-memory
    object is left to the caller.

    ``data_parser`` is called with the list of data lines collected for a
    record, and ``error_type`` is the exception class raised on malformed
    input.
    """
    # Leading blank or whitespace-only lines are skipped. If nothing is
    # left after that, there are no records to produce.
    try:
        first_line = next(_line_generator(fh, skip_blanks=True))
    except StopIteration:
        return

    # The '>' test is done inline (here and in the loop) for performance.
    if not first_line.startswith('>'):
        raise error_type(
            "Found non-header line when attempting to read the 1st record:"
            "\n%s" % first_line)
    rec_id, rec_desc = _parse_fasta_like_header(first_line)

    chunks = []
    previous = first_line
    for current in _line_generator(fh, skip_blanks=False):
        if current.startswith('>'):
            # A new header closes the record gathered so far.
            yield data_parser(chunks), rec_id, rec_desc
            chunks = []
            rec_id, rec_desc = _parse_fasta_like_header(current)
        elif current:
            # Data directly after an empty line means the blank sat inside
            # a record, which is rejected.
            if not previous:
                raise error_type(
                    "Found blank or whitespace-only line within record.")
            chunks.append(current)
        previous = current

    # Emit the final record once the input is exhausted.
    yield data_parser(chunks), rec_id, rec_desc
def _parse_fasta_raw(fh, data_parser, error_type):
    """Low-level record generator shared by FASTA and QUAL parsing.

    Produces ``(parsed_data, id, description)`` tuples, where
    ``parsed_data`` is ``data_parser`` applied to the list of data lines of
    one record. Building a richer object from these values is the caller's
    job. Format problems are reported by raising ``error_type``.
    """
    # Blank or whitespace-only lines at the top of the file are ignored;
    # running out of lines at this point simply ends the generator.
    try:
        header_line = next(_line_generator(fh, skip_blanks=True))
    except StopIteration:
        return

    # Header detection is inlined (here and below) for performance.
    if not header_line.startswith('>'):
        raise error_type(
            "Found non-header line when attempting to read the 1st record:"
            "\n%s" % header_line)
    identifier, description = _parse_fasta_like_header(header_line)

    record_lines = []
    last_seen = header_line
    for text in _line_generator(fh, skip_blanks=False):
        if text.startswith('>'):
            # Next record begins: hand out the finished one and start over.
            yield data_parser(record_lines), identifier, description
            record_lines = []
            identifier, description = _parse_fasta_like_header(text)
        elif text:
            # Non-empty data preceded by an empty line is a blank line
            # within a record, which is not permitted.
            if not last_seen:
                raise error_type(
                    "Found blank or whitespace-only line within record.")
            record_lines.append(text)
        last_seen = text

    # The last record has no following header to trigger its emission.
    yield data_parser(record_lines), identifier, description
def test_id_and_description(self):
    """Header with an ID and a description parses to (id, description)."""
    header = '>!thus suht! \t\t \n'
    expected = ('!thus', 'suht!')
    self.assertEqual(_parse_fasta_like_header(header), expected)
def test_description_only(self):
    """Whitespace right after '>' is expected to give an empty ID."""
    header = '> suht! \t\t \n'
    expected = ('', 'suht!')
    self.assertEqual(_parse_fasta_like_header(header), expected)
def test_id_only(self):
    """A header with only an ID is expected to give an empty description."""
    header = '>suht! \t\t \n'
    expected = ('suht!', '')
    self.assertEqual(_parse_fasta_like_header(header), expected)
def test_no_id_or_description(self):
    """A header of '>' plus whitespace is expected to give two empty strings."""
    header = '> \t\t \n'
    expected = ('', '')
    self.assertEqual(_parse_fasta_like_header(header), expected)
def test_id_and_description(self):
    """Both fields present: the parser should return (id, description)."""
    self.assertEqual(
        _parse_fasta_like_header(">!thus suht! \t\t \n"),
        ("!thus", "suht!"),
    )
def test_description_only(self):
    """Header starting with '> ': the ID should come back empty."""
    self.assertEqual(
        _parse_fasta_like_header("> suht! \t\t \n"),
        ("", "suht!"),
    )
def test_id_only(self):
    """Header with a single token: the description should come back empty."""
    self.assertEqual(
        _parse_fasta_like_header(">suht! \t\t \n"),
        ("suht!", ""),
    )
def test_no_id_or_description(self):
    """Header of '>' and whitespace only: both fields should be empty."""
    self.assertEqual(
        _parse_fasta_like_header("> \t\t \n"),
        ("", ""),
    )
def test_id_and_description(self):
    """Check parsing of a header carrying an ID followed by a description."""
    raw_header = '>!thus suht! \t\t \n'
    parsed = _parse_fasta_like_header(raw_header)
    self.assertEqual(parsed, ('!thus', 'suht!'))
def test_description_only(self):
    """Check parsing of a header whose '>' is followed by whitespace."""
    raw_header = '> suht! \t\t \n'
    parsed = _parse_fasta_like_header(raw_header)
    self.assertEqual(parsed, ('', 'suht!'))
def test_id_only(self):
    """Check parsing of a header that contains an ID and nothing else."""
    raw_header = '>suht! \t\t \n'
    parsed = _parse_fasta_like_header(raw_header)
    self.assertEqual(parsed, ('suht!', ''))
def test_no_id_or_description(self):
    """Check parsing of a header made of '>' and whitespace only."""
    raw_header = '> \t\t \n'
    parsed = _parse_fasta_like_header(raw_header)
    self.assertEqual(parsed, ('', ''))