def _parse_sequence_data(fh, prev):
    """Collect the sequence lines of one FASTQ record up to the '+' line.

    Parameters
    ----------
    fh : file handle
        Positioned just after the record's sequence (@) header line.
    prev : str
        The previously read line; a falsy value means the previous line
        was blank/whitespace-only, which is illegal inside a record.

    Returns
    -------
    tuple of (str, str)
        The joined sequence string and the quality (+) header line.

    Raises
    ------
    FASTQFormatError
        If the record has no sequence data, a new (@) header appears
        before the '+' line, whitespace occurs inside the sequence, or
        EOF is reached before the '+' line.
    """
    seq_chunks = []
    for chunk in _line_generator(fh, skip_blanks=False):
        if chunk.startswith('+'):
            # reached the quality header; record must have sequence data
            if not prev:
                _blank_error("before '+'")
            if not seq_chunks:
                raise FASTQFormatError(
                    "Found FASTQ record without sequence data.")
            return ''.join(seq_chunks), chunk
        elif chunk.startswith('@'):
            # a new record started before we ever saw the '+' line
            raise FASTQFormatError(
                "Found FASTQ record that is missing a quality (+) header line "
                "after sequence data.")
        else:
            if not prev:
                _blank_error("after header or within sequence")
            if _whitespace_regex.search(chunk):
                raise FASTQFormatError(
                    "Found whitespace in sequence data: %r" % str(chunk))
            seq_chunks.append(chunk)
        # remember this line so the next iteration can detect blanks
        prev = chunk
    # EOF before the '+' line: truncated record
    raise FASTQFormatError(
        "Found incomplete/truncated FASTQ record at end of file.")
def _parse_records(fh, constructor=None, **kwargs):
    """Yield ``(seq, metadata)`` tuples from a tab-separated SAM-like file.

    Header lines (starting with ``@``) contribute tag/value pairs to a
    shared metadata dict that is merged into every subsequent record;
    ``@CO`` (comment) headers are ignored.  Each non-header line is split
    into the required fields (``_REQUIRED_FIELDS``) followed by optional
    fields parsed with ``_parse_optional``.

    Fix: the ``yield`` now sits inside the alignment branch.  With the
    yield at loop level (as in the collapsed original), header lines
    emitted ``None`` before the first record and a stale duplicate of the
    previous record afterwards.
    """
    header_md = {}
    n = len(_REQUIRED_FIELDS)
    for line in _line_generator(fh, skip_blanks=True, strip=True):
        if line.startswith('@'):
            # header line: store tag -> value, skipping comments (@CO)
            key, val = line.split('\t', 1)
            if key != '@CO':
                header_md[key] = val
        else:
            tabs = line.split('\t')
            # zip stops generating after the shorter list of the two
            md = {k: _parse_required(v)
                  for k, v in zip(_REQUIRED_FIELDS, tabs)}
            seq = md.pop('SEQ')
            # optional TAG fields follow the required columns
            md.update(_parse_optional(field) for field in tabs[n:])
            md.update(header_md)
            yield seq, md
def _parse_quality_scores(fh, seq_len, variant, phred_offset, prev):
    """Collect quality lines for one FASTQ record and decode them to Phred.

    Parameters
    ----------
    fh : file handle
        Positioned just after the record's quality (+) header line.
    seq_len : int
        Length of the record's sequence; the quality string must match it.
    variant, phred_offset
        Passed through to ``_decode_qual_to_phred``.
    prev : str
        Previously read line, used to detect illegal blank lines.

    Returns
    -------
    tuple of (np.ndarray, str or None)
        Decoded Phred scores and the next record's (@) header line, or
        ``None`` when EOF was reached.

    Raises
    ------
    FASTQFormatError
        If there are more quality characters than sequence characters, or
        the record is truncated at EOF.
    """
    phred_scores = []
    qual_len = 0
    for chunk in _line_generator(fh, skip_blanks=False):
        if chunk:
            # an '@' line only ends this record once enough quality
            # characters were seen ('@' is a valid quality character too)
            if chunk.startswith('@') and qual_len == seq_len:
                return np.hstack(phred_scores), chunk
            else:
                if not prev:
                    _blank_error("after '+' or within quality scores")
                qual_len += len(chunk)
                if qual_len > seq_len:
                    raise FASTQFormatError(
                        "Found more quality score characters than sequence "
                        "characters. Extra quality score characters: %r"
                        % chunk[-(qual_len - seq_len):])
                phred_scores.append(
                    _decode_qual_to_phred(chunk, variant=variant,
                                          phred_offset=phred_offset))
        prev = chunk
    # EOF: valid only if the quality string exactly matched the sequence
    if qual_len != seq_len:
        raise FASTQFormatError(
            "Found incomplete/truncated FASTQ record at end of file.")
    return np.hstack(phred_scores), None
def _fastq_to_generator(fh, variant=None, phred_offset=None,
                        constructor=Sequence, **kwargs):
    """Yield one ``constructor`` instance per FASTQ record in *fh*.

    Each record contributes its id/description as metadata and its decoded
    Phred scores as positional metadata under ``'quality'``.
    """
    # Skip any blank or whitespace-only lines at beginning of file
    header = next(_line_generator(fh, skip_blanks=True))
    if not header.startswith('@'):
        raise FASTQFormatError(
            "Expected sequence (@) header line at start of file: %r"
            % str(header))
    while header is not None:
        seq_id, description = _parse_fasta_like_header(header)
        sequence, qual_header = _parse_sequence_data(fh, header)
        # a bare '+' always matches; otherwise both headers must agree
        headers_disagree = (qual_header != '+' and
                            qual_header[1:] != header[1:])
        if headers_disagree:
            raise FASTQFormatError(
                "Sequence (@) and quality (+) header lines do not match: "
                "%r != %r" % (str(header[1:]), str(qual_header[1:])))
        quality, header = _parse_quality_scores(
            fh, len(sequence), variant, phred_offset, qual_header)
        yield constructor(
            sequence,
            metadata={'id': seq_id, 'description': description},
            positional_metadata={'quality': quality},
            **kwargs)
def _fastq_to_generator(fh, variant=None, phred_offset=None,
                        constructor=Sequence, **kwargs):
    """Yield one ``constructor`` object per FASTQ record in *fh*.

    The record id/description become metadata, and the decoded Phred
    scores become positional metadata under the key ``'quality'``.
    Raises FASTQFormatError when the file does not start with an '@'
    header or when sequence and quality headers disagree.
    """
    # Skip any blank or whitespace-only lines at beginning of file
    seq_header = next(_line_generator(fh, skip_blanks=True))
    if not seq_header.startswith('@'):
        raise FASTQFormatError(
            "Expected sequence (@) header line at start of file: %r"
            % str(seq_header))
    # _parse_quality_scores returns the next '@' header (or None at EOF),
    # which drives this loop
    while seq_header is not None:
        id_, desc = _parse_fasta_like_header(seq_header)
        seq, qual_header = _parse_sequence_data(fh, seq_header)
        # a bare '+' matches anything; otherwise headers must be identical
        if qual_header != '+' and qual_header[1:] != seq_header[1:]:
            raise FASTQFormatError(
                "Sequence (@) and quality (+) header lines do not match: "
                "%r != %r" % (str(seq_header[1:]), str(qual_header[1:])))
        phred_scores, seq_header = _parse_quality_scores(
            fh, len(seq), variant, phred_offset, qual_header)
        yield constructor(seq,
                          metadata={'id': id_, 'description': desc},
                          positional_metadata={'quality': phred_scores},
                          **kwargs)
def _yield_record(fh):
    '''Yield (seq_id, lines) that belong to the same sequence.'''
    pending = []
    active_id = False
    for line in _line_generator(fh, skip_blanks=True, strip=True):
        if line.startswith('##sequence-region'):
            _, seq_id, start, end = line.split()
            yield 'length', seq_id, int(end) - int(start) + 1
        if line.startswith('##FASTA'):
            # the sequence section ends the annotation part of the file
            break
        if not line.startswith('#'):
            try:
                seq_id, _ = line.split('\t', 1)
            except ValueError:
                raise GFF3FormatError('Wrong GFF3 format at line: %s' % line)
            if active_id == seq_id:
                pending.append(line)
            else:
                # new sequence id: flush the previous group first
                if active_id is not False:
                    yield 'data', active_id, pending
                pending = [line]
                active_id = seq_id
    if active_id is False:
        # empty input: behave as an empty generator
        return
        yield
    else:
        yield 'data', active_id, pending
def _yield_record(fh):
    '''Yield (seq_id, lines) that belong to the same sequence.

    Emits two kinds of tuples: ``('length', seq_id, length)`` for each
    ``##sequence-region`` pragma, and ``('data', seq_id, lines)`` for
    each run of annotation lines sharing a sequence id.  Parsing stops
    at the ``##FASTA`` section.
    '''
    lines = []
    # ``False`` marks "no sequence seen yet" (ids themselves are strings)
    current = False
    for line in _line_generator(fh, skip_blanks=True, strip=True):
        if line.startswith('##sequence-region'):
            _, seq_id, start, end = line.split()
            # GFF3 coordinates are 1-based and end-inclusive
            length = int(end) - int(start) + 1
            yield 'length', seq_id, length
        if line.startswith('##FASTA'):
            # stop once reaching to sequence section
            break
        if not line.startswith('#'):
            try:
                seq_id, _ = line.split('\t', 1)
            except ValueError:
                raise GFF3FormatError(
                    'Wrong GFF3 format at line: %s' % line)
            if current == seq_id:
                lines.append(line)
            else:
                # id changed: flush the accumulated group, start a new one
                if current is not False:
                    yield 'data', current, lines
                lines = [line]
                current = seq_id
    if current is False:
        # if the input file object is empty, it should return
        # an empty generator
        return
        yield
    else:
        yield 'data', current, lines
def _parse_records(fh, parser):
    """Yield ``parser(chunk)`` for each '//'-terminated record in *fh*.

    Lines are accumulated until a terminator line starting with '//' is
    seen, then the chunk is handed to *parser*.  Lines after the final
    terminator are discarded (preserved behavior).
    """
    pending = []
    for raw in _line_generator(fh, skip_blanks=True, strip=False):
        if not raw.startswith('//'):
            pending.append(raw)
            continue
        yield parser(pending)
        pending = []
def _parse_genbanks(fh):
    """Yield one parsed GenBank record per '//'-terminated chunk of *fh*."""
    record_lines = []
    for record_line in _line_generator(fh, skip_blanks=True, strip=False):
        if record_line.startswith('//'):
            # end-of-record marker: hand off the accumulated lines
            yield _parse_single_genbank(record_lines)
            record_lines = []
        else:
            record_lines.append(record_line)
def parser(lines):
    """Group *lines* into sections, yielding each section's lines as a list.

    A section boundary is any line for which the enclosing scope's
    ``is_another_section`` predicate is true; ``kwargs`` (also from the
    enclosing scope) are forwarded to ``_line_generator``.
    """
    section = []
    for line in _line_generator(lines, **kwargs):
        # a boundary line flushes the section collected so far
        if is_another_section(line) and section:
            yield section
            section = []
        section.append(line)
    # don't forget the trailing section at EOF
    if section:
        yield section
def _embl_sniffer(fh):
    """Sniff whether *fh* looks like EMBL: first real line starts with 'ID'."""
    # too many leading blank lines disqualifies the file
    if _too_many_blanks(fh, 5):
        return False, {}
    try:
        first = next(_line_generator(fh, skip_blanks=True, strip=False))
    except StopIteration:
        # empty file
        return False, {}
    return (True, {}) if first.startswith('ID') else (False, {})
def _parse_fasta_raw(fh, data_parser, error_type): """Raw parser for FASTA or QUAL files. Returns raw values (seq/qual, id, description). It is the responsibility of the caller to construct the correct in-memory object to hold the data. """ # Skip any blank or whitespace-only lines at beginning of file try: seq_header = next(_line_generator(fh, skip_blanks=True)) except StopIteration: return # header check inlined here and below for performance if seq_header.startswith('>'): id_, desc = _parse_fasta_like_header(seq_header) else: raise error_type( "Found non-header line when attempting to read the 1st record:" "\n%s" % seq_header) data_chunks = [] prev = seq_header for line in _line_generator(fh, skip_blanks=False): if line.startswith('>'): # new header, so yield current record and reset state yield data_parser(data_chunks), id_, desc data_chunks = [] id_, desc = _parse_fasta_like_header(line) else: if line: # ensure no blank lines within a single record if not prev: raise error_type( "Found blank or whitespace-only line within record.") data_chunks.append(line) prev = line # yield last record in file yield data_parser(data_chunks), id_, desc
def _gff3_sniffer(fh):
    """Sniff for GFF3: the first real line must declare '##gff-version 3'."""
    # too many leading blank lines disqualifies the file
    if _too_many_blanks(fh, 5):
        return False, {}
    try:
        first = next(_line_generator(fh, skip_blanks=True, strip=False))
    except StopIteration:
        # empty file
        return False, {}
    if re.match(r'##gff-version\s+3', first):
        return True, {}
    return False, {}
def _genbank_sniffer(fh):
    """Sniff for GenBank: the first real line must parse as a LOCUS line."""
    # too many leading blank lines disqualifies the file
    if _too_many_blanks(fh, 5):
        return False, {}
    try:
        first = next(_line_generator(fh, skip_blanks=True, strip=False))
    except StopIteration:
        # empty file
        return False, {}
    try:
        _parse_locus([first])
    except GenBankFormatError:
        return False, {}
    return True, {}
def _yield_section(lines, split_header, **kwargs):
    '''Yield the lines with the same header.

    Parameters
    ----------
    lines : iterable of str
        Lines to group; the first line's header seeds the grouping.
    split_header : function
        It accepts a string of line and returns the header and rest
        of the needed data as a tuple. If no header exists, return
        ``None`` and the data.

    Yields
    ------
    tuple
        ``(header, list_of_stripped_data)`` for each section.

    Notes
    -----
    The following example is a valid section::

        DR   GO   GO:000001
        DR   GO   GO:000002

    This is also a section::

        SQ   ATGCA ATGCA
             ATGCA ATGCA

    '''
    curr = []
    header, _ = split_header(lines[0])
    for line in _line_generator(lines, **kwargs):
        items = split_header(line)
        # if the header is changed, it is a new section
        if items[0] is not None and items[0] != header:
            if curr:
                yield header, curr
                curr = []
            header = items[0]
        # a None header (continuation line) keeps extending the section
        curr.append(items[1].strip())
    # don't forget to return the last section in the file
    if curr:
        yield header, curr
def _parse_records(fh, constructor=None, **kwargs):
    """Yield ``(seq, metadata)`` tuples from a tab-separated SAM-like file.

    Header lines (starting with ``@``) contribute tag/value pairs to a
    shared metadata dict that is merged into every subsequent record;
    ``@CO`` (comment) headers are ignored.  Each non-header line is split
    into the required fields (``_REQUIRED_FIELDS``) followed by optional
    fields parsed with ``_parse_optional``.

    Fix: the ``yield`` now sits inside the alignment branch.  With the
    yield at loop level (as in the collapsed original), header lines
    emitted ``None`` before the first record and a stale duplicate of the
    previous record afterwards.
    """
    header_md = {}
    n = len(_REQUIRED_FIELDS)
    for line in _line_generator(fh, skip_blanks=True, strip=True):
        if line.startswith('@'):
            # header line: store tag -> value, skipping comments (@CO)
            key, val = line.split('\t', 1)
            if key != '@CO':
                header_md[key] = val
        else:
            tabs = line.split('\t')
            # zip stops generating after the shorter list of the two
            md = {k: _parse_required(v)
                  for k, v in zip(_REQUIRED_FIELDS, tabs)}
            seq = md.pop('SEQ')
            # optional TAG fields follow the required columns
            md.update(_parse_optional(field) for field in tabs[n:])
            md.update(header_md)
            yield seq, md
def _parse_records(fh):
    """Parse tabular output of Infernal's cmsearch/cmscan into annotations.

    Parameters
    ----------
    fh : file handle
        A file handle to the tabular file that should be parsed.

    Yields
    ------
    dict
        One dict per query sequence, mapping a ``Feature`` (a frozen,
        hashable attribute dict) to a list of ``(start, end)`` interval
        tuples.

    Raises
    ------
    CmscanFormatError
        On non-integer positions, positions inconsistent with the
        reported strand, or an unknown strand character.

    Notes
    -----
    Fixes relative to the original: string comparisons use ``==`` instead
    of ``is`` (identity of string literals is implementation-dependent),
    and the end-position error message no longer passes five arguments to
    four ``%`` placeholders (which raised TypeError instead of the
    intended CmscanFormatError).
    """
    program = 'CMSCAN'
    current_sequence = False
    annotations = {}
    # hoist loop-invariant column lookups
    start_col = _COLUMNS.index('SEQUENCE_START_POSITION')
    end_col = _COLUMNS.index('SEQUENCE_END_POSITION')
    strand_col = _COLUMNS.index('STRAND')

    for line in _line_generator(fh, skip_blanks=True, strip=True):
        if line.startswith('#'):
            continue
        attributes = {}
        fields = line.split()  # split at multiple occurrences of spaces

        # check data for start and end position of the hit in the query
        if not fields[start_col].isdigit():
            raise CmscanFormatError("%s %i %s '%s'." % (
                "Column", start_col,
                "must be an integer value for the start position of the"
                " hit. Here, it is", fields[start_col]))
        hit_start = int(fields[start_col])

        if not fields[end_col].isdigit():
            raise CmscanFormatError("%s %i %s '%s'." % (
                "Column", end_col,
                "must be an integer value for the end position of the "
                "hit. Here, it is", fields[end_col]))
        hit_end = int(fields[end_col])

        orientation = fields[strand_col]
        if orientation == "+":
            if hit_start > hit_end:
                raise CmscanFormatError('%s %i %s %i %s' % (
                    "On the forward strand (+), start position of a hit "
                    "must always be smaller than its end position."
                    " This is not true for the hit between", hit_start,
                    "and", hit_end,
                    ". It might be, that this hit is in fact on the "
                    "reverse strand. Please check strand orientation and"
                    " positions."))
        elif orientation == "-":
            if hit_start < hit_end:
                raise CmscanFormatError('%s %i %s %i %s' % (
                    "On the reverse strand (-), start position of a hit "
                    "must always be larger than its end position."
                    " This is not true for the hit between", hit_start,
                    "and", hit_end,
                    ". It might be, that this hit is in fact on the "
                    "forward strand. Please check strand orientation and"
                    " positions."))
            else:
                # store intervals uniformly with start < end
                hit_start, hit_end = hit_end, hit_start
        else:
            raise CmscanFormatError("%s '%s' %s %i. %s" % (
                "Unknown strand character", orientation, "in column",
                strand_col,
                "Valid characters are '+' for the forward strand and "
                "'-' for the reverse strand."))

        # cmsearch and cmscan report model/query info in different
        # columns; store the information under the same keys regardless
        if program == 'CMSEARCH':
            attributes['SEQUENCE_NAME'] = fields[0]
            attributes['SEQUENCE_ACCESSION'] = fields[1]
            attributes['MODEL_NAME'] = fields[2]
            attributes['MODEL_ACCESSION'] = fields[3]
        elif program == 'CMSCAN':
            attributes['SEQUENCE_NAME'] = fields[2]
            attributes['SEQUENCE_ACCESSION'] = fields[3]
            attributes['MODEL_NAME'] = fields[0]
            attributes['MODEL_ACCESSION'] = fields[1]
        else:
            raise CmscanFormatError("Argument 'program' must be either "
                                    "'CMSEARCH' or 'CMSCAN'!")

        # copy every column that was not already handled above
        handled = {'SEQUENCE_START_POSITION', 'SEQUENCE_END_POSITION',
                   'SEQUENCE_NAME', 'SEQUENCE_ACCESSION',
                   'MODEL_NAME', 'MODEL_ACCESSION'}
        for key in _COLUMNS:
            if key in handled:
                continue
            attributes[key] = fields[_COLUMNS.index(key)]

        # cmscan works on multiple sequences in one FASTA file; yield a
        # separate object per sequence, i.e. whenever the ID changes
        if (current_sequence != attributes['SEQUENCE_NAME'] and
                current_sequence is not False):
            yield annotations
            annotations = {}

        # a metadata interval is a 'frozen dictionary' aka Feature key
        # mapping to a list of (start, end) interval tuples
        annotations[Feature(attributes)] = [(hit_start, hit_end)]
        # store current sequence id for the next iteration
        current_sequence = attributes['SEQUENCE_NAME']

    # NOTE(review): for empty input this yields an empty dict; behavior
    # preserved from the original — confirm callers expect it.
    yield annotations