def _parse_quality_scores(fh, seq_len, variant, phred_offset, prev):
    """Read the quality lines of one FASTQ record.

    Lines are consumed from ``fh`` until the number of quality characters
    equals ``seq_len`` and either the next ``@`` header or the end of the
    input is reached.

    Parameters
    ----------
    fh : iterable of str
        Source of lines, read through ``_line_generator``.
    seq_len : int
        Number of quality characters expected for this record.
    variant, phred_offset
        Passed through unchanged to ``_decode_qual_to_phred``.
    prev : str
        The line read immediately before this call; a falsy value is
        treated as "the previous line was blank".

    Returns
    -------
    tuple
        ``(scores, next_header)`` where ``scores`` is the ``np.hstack`` of
        the decoded chunks and ``next_header`` is the following ``@`` line,
        or ``None`` when the input is exhausted.

    Raises
    ------
    FASTQFormatError
        On a blank line before or between quality lines (raised via
        ``_blank_error``), on more quality characters than ``seq_len``, or
        on a record that ends before ``seq_len`` characters were read.
    """
    decoded = []
    n_qual = 0

    for line in _line_generator(fh, skip_blanks=False):
        if not line:
            # Remember the blank so the next non-blank line can decide
            # whether it was legal (between records) or not (inside one).
            prev = line
            continue

        # '@' is also a valid quality character, so a line only counts as
        # the next record header once the quality string is complete.
        if line.startswith('@') and n_qual == seq_len:
            return np.hstack(decoded), line

        if not prev:
            _blank_error("after '+' or within quality scores")

        n_qual += len(line)
        if n_qual > seq_len:
            surplus = n_qual - seq_len
            raise FASTQFormatError(
                "Found more quality score characters than sequence "
                "characters. Extra quality score characters: %r" %
                line[-surplus:])

        decoded.append(
            _decode_qual_to_phred(line, variant=variant,
                                  phred_offset=phred_offset))
        prev = line

    if n_qual != seq_len:
        raise FASTQFormatError(
            "Found incomplete/truncated FASTQ record at end of file.")
    return np.hstack(decoded), None
def _parse_sequence_data(fh, prev):
    """Read the sequence lines of one FASTQ record up to its ``+`` line.

    Parameters
    ----------
    fh : iterable of str
        Source of lines, read through ``_line_generator``.
    prev : str
        The line read immediately before this call; a falsy value is
        treated as "the previous line was blank".

    Returns
    -------
    tuple of (str, str)
        The concatenated sequence and the ``+`` quality header line.

    Raises
    ------
    FASTQFormatError
        On a blank line inside the record (raised via ``_blank_error``),
        a ``+`` line with no sequence before it, an ``@`` line before any
        ``+`` line, whitespace inside a sequence line, or input that ends
        before a ``+`` line is seen.
    """
    pieces = []

    for line in _line_generator(fh, skip_blanks=False):
        after_blank = not prev

        if line.startswith('+'):
            if after_blank:
                _blank_error("before '+'")
            if not pieces:
                raise FASTQFormatError(
                    "Found FASTQ record without sequence data.")
            return ''.join(pieces), line

        if line.startswith('@'):
            raise FASTQFormatError(
                "Found FASTQ record that is missing a quality (+) header line "
                "after sequence data.")

        # Ordinary sequence line (possibly blank; a blank is only reported
        # once the following line shows up).
        if after_blank:
            _blank_error("after header or within sequence")
        if _whitespace_regex.search(line):
            raise FASTQFormatError(
                "Found whitespace in sequence data: %r" % str(line))
        pieces.append(line)
        prev = line

    raise FASTQFormatError(
        "Found incomplete/truncated FASTQ record at end of file.")
def _fastq_to_generator(fh, variant=None, phred_offset=None,
                        constructor=Sequence, **kwargs):
    """Lazily parse FASTQ records from ``fh`` and yield one object each.

    Parameters
    ----------
    fh : iterable of str
        Source of lines.
    variant, phred_offset : optional
        Forwarded unchanged to ``_parse_quality_scores``.
    constructor : callable, optional
        Called as ``constructor(seq, metadata={'id': ..., 'description':
        ...}, positional_metadata={'quality': ...}, **kwargs)`` for every
        record.
    **kwargs
        Extra keyword arguments forwarded to ``constructor``.

    Yields
    ------
    object
        Whatever ``constructor`` returns for each record.

    Raises
    ------
    FASTQFormatError
        If the first line read is not an ``@`` header, if a non-empty
        ``+`` header does not repeat the ``@`` header, or if one of the
        record parsers rejects the input.
    """
    # Skip any blank or whitespace-only lines at beginning of file
    try:
        seq_header = next(_line_generator(fh, skip_blanks=True))
    except StopIteration:
        # No lines at all. Letting StopIteration escape a generator body
        # is turned into RuntimeError by PEP 479, so finish cleanly and
        # yield nothing instead.
        return

    if not seq_header.startswith('@'):
        raise FASTQFormatError(
            "Expected sequence (@) header line at start of file: %r"
            % str(seq_header))

    while seq_header is not None:
        id_, desc = _parse_fasta_like_header(seq_header)
        seq, qual_header = _parse_sequence_data(fh, seq_header)

        # A bare '+' is always accepted; otherwise the text after '+' must
        # repeat the text after '@'.
        if qual_header != '+' and qual_header[1:] != seq_header[1:]:
            raise FASTQFormatError(
                "Sequence (@) and quality (+) header lines do not match: "
                "%r != %r" % (str(seq_header[1:]), str(qual_header[1:])))

        # The quality parser hands back the next record's header (or None
        # at end of input), which drives the loop condition.
        phred_scores, seq_header = _parse_quality_scores(
            fh, len(seq), variant, phred_offset, qual_header)
        yield constructor(seq, metadata={'id': id_, 'description': desc},
                          positional_metadata={'quality': phred_scores},
                          **kwargs)
def _parse_sequence_data(fh):
    """Read the sequence lines of one FASTQ record up to its ``+`` line.

    Parameters
    ----------
    fh : iterable of str
        Source of lines, read through ``_line_generator``.

    Returns
    -------
    tuple of (str, str)
        The concatenated sequence and the ``+`` quality header line.

    Raises
    ------
    FASTQFormatError
        On a ``+`` line with no sequence before it, an ``@`` line before
        any ``+`` line, whitespace inside a sequence line, or input that
        ends before a ``+`` line is seen.
    """
    pieces = []

    for line in _line_generator(fh):
        if line.startswith('+'):
            if not pieces:
                raise FASTQFormatError(
                    "Found FASTQ record without sequence data.")
            return ''.join(pieces), line

        if line.startswith('@'):
            raise FASTQFormatError(
                "Found FASTQ record that is missing a quality (+) header line "
                "after sequence data.")

        if _whitespace_regex.search(line):
            raise FASTQFormatError(
                "Found whitespace in sequence data: %r" % line)
        pieces.append(line)

    raise FASTQFormatError(
        "Found incomplete/truncated FASTQ record at end of file.")
def _fastq_to_generator(fh, variant=None, phred_offset=None,
                        constructor=BiologicalSequence):
    """Lazily parse FASTQ records from ``fh`` and yield one object each.

    Parameters
    ----------
    fh : iterable of str
        Source of lines.
    variant, phred_offset : optional
        Forwarded unchanged to ``_parse_quality_scores``.
    constructor : callable, optional
        Called as ``constructor(seq, id=..., description=...,
        quality=...)`` for every record.

    Yields
    ------
    object
        Whatever ``constructor`` returns for each record.

    Raises
    ------
    FASTQFormatError
        If the first line is not an ``@`` header, if a non-empty ``+``
        header does not repeat the ``@`` header, or if one of the record
        parsers rejects the input.
    """
    try:
        seq_header = next(_line_generator(fh))
    except StopIteration:
        # No lines at all. Letting StopIteration escape a generator body
        # is turned into RuntimeError by PEP 479, so finish cleanly and
        # yield nothing instead.
        return

    if not seq_header.startswith('@'):
        raise FASTQFormatError(
            "Expected sequence (@) header line at start of file: %r"
            % seq_header)

    while seq_header is not None:
        id_, desc = _parse_fasta_like_header(seq_header)
        seq, qual_header = _parse_sequence_data(fh)

        # A bare '+' is always accepted; otherwise the text after '+' must
        # repeat the text after '@'.
        if qual_header != '+' and qual_header[1:] != seq_header[1:]:
            raise FASTQFormatError(
                "Sequence (@) and quality (+) header lines do not match: "
                "%r != %r" % (seq_header[1:], qual_header[1:]))

        # The quality parser hands back the next record's header (or None
        # at end of input), which drives the loop condition.
        phred_scores, seq_header = _parse_quality_scores(
            fh, len(seq), variant, phred_offset)
        yield constructor(seq, id=id_, description=desc,
                          quality=phred_scores)
def _parse_quality_scores(fh, seq_len, variant, phred_offset):
    """Read the quality lines of one FASTQ record.

    Parameters
    ----------
    fh : iterable of str
        Source of lines, read through ``_line_generator``.
    seq_len : int
        Number of quality characters expected for this record.
    variant, phred_offset
        Passed through unchanged to ``_decode_qual_to_phred``.

    Returns
    -------
    tuple
        ``(scores, next_header)`` where ``scores`` is a list extended with
        the decoded values of every quality line and ``next_header`` is
        the following ``@`` line, or ``None`` when the input is exhausted.

    Raises
    ------
    FASTQFormatError
        On more quality characters than ``seq_len``, or on a record that
        ends before ``seq_len`` characters were read.
    """
    scores = []
    n_qual = 0

    for line in _line_generator(fh):
        # '@' is also a valid quality character, so a line only counts as
        # the next record header once the quality string is complete.
        if line.startswith('@') and n_qual == seq_len:
            return scores, line

        n_qual += len(line)
        if n_qual > seq_len:
            surplus = n_qual - seq_len
            raise FASTQFormatError(
                "Found more quality score characters than sequence "
                "characters. Extra quality score characters: %r" %
                line[-surplus:])

        scores.extend(
            _decode_qual_to_phred(line, variant=variant,
                                  phred_offset=phred_offset))

    if n_qual != seq_len:
        raise FASTQFormatError(
            "Found incomplete/truncated FASTQ record at end of file.")
    return scores, None
def _line_generator(fh): for line in fh: line = line.rstrip('\n') if not line: raise FASTQFormatError("Found blank line in FASTQ-formatted file.") yield line
def _blank_error(unique_text):
    """Raise ``FASTQFormatError`` for a misplaced blank line.

    Parameters
    ----------
    unique_text : str
        Phrase inserted into the message to say where the blank or
        whitespace-only line was found.

    Raises
    ------
    FASTQFormatError
        Always.
    """
    template = "Found blank or whitespace-only line {} in FASTQ file"
    raise FASTQFormatError(template.format(unique_text))