def _fasta_sniffer(fh):
    """Report whether ``fh`` looks like a FASTA file.

    Strategy:
      * Give up if ``_too_many_blanks(fh, 5)`` reports too many leading
        blank/whitespace-only lines.
      * Pull up to 10 records through ``_parse_fasta_raw``. If at least one
        record is read (i.e. the file isn't empty) and no
        ``FASTAFormatError`` is raised while reading, assume FASTA.

    QUAL files must *not* be identified as FASTA: they can technically be
    read as FASTA because sequences may not be validated, but that is
    probably not what the user wanted, and a future dedicated QUAL sniffer
    should not compete with this one. That rejection is presumably done by
    ``_sniffer_data_parser`` raising ``FASTAFormatError`` (helper not
    visible here -- confirm).

    Returns
    -------
    tuple
        ``(True, {})`` when the file is taken to be FASTA, otherwise
        ``(False, {})``.
    """
    if _too_many_blanks(fh, 5):
        return False, {}

    max_records = 10
    records_seen = 0
    try:
        records = _parse_fasta_raw(fh, _sniffer_data_parser,
                                   FASTAFormatError)
        # zip() against a bounded range stops after ``max_records`` records
        # without asking the parser for one more.
        for _ in zip(range(max_records), records):
            records_seen += 1
    except FASTAFormatError:
        return False, {}

    # An empty file yields no records and is not considered FASTA.
    return records_seen > 0, {}
def _embl_sniffer(fh):
    """Report whether ``fh`` looks like an EMBL file.

    The first non-blank line must start with ``'ID'``. Files for which
    ``_too_many_blanks(fh, 5)`` is true, or which contain no non-blank
    line at all, are rejected.

    Returns
    -------
    tuple
        ``(True, {})`` if the first real line starts with ``'ID'``,
        otherwise ``(False, {})``.
    """
    if _too_many_blanks(fh, 5):
        return False, {}

    try:
        lines = _line_generator(fh, skip_blanks=True, strip=False)
        first_line = next(lines)
    except StopIteration:
        # Nothing but blank lines (or nothing at all) in the file.
        return False, {}

    return first_line.startswith('ID'), {}
def _genbank_sniffer(fh):
    """Report whether ``fh`` looks like a GenBank file.

    The first non-blank line is handed to ``_parse_locus``; the file is
    accepted only if that call does not raise ``GenBankFormatError``.
    Files for which ``_too_many_blanks(fh, 5)`` is true, or which contain
    no non-blank line, are rejected.

    Returns
    -------
    tuple
        ``(True, {})`` if the first real line parses as a LOCUS line,
        otherwise ``(False, {})``.
    """
    if _too_many_blanks(fh, 5):
        return False, {}

    try:
        first_line = next(_line_generator(fh, skip_blanks=True, strip=False))
    except StopIteration:
        # No non-blank line available to inspect.
        return False, {}

    try:
        # Only success or failure matters here; the parsed result is
        # discarded.
        _parse_locus([first_line])
    except GenBankFormatError:
        looks_like_genbank = False
    else:
        looks_like_genbank = True

    return looks_like_genbank, {}
def _gff3_sniffer(fh):
    """Report whether ``fh`` looks like a GFF3 file.

    The first non-blank line must begin with the ``##gff-version`` pragma
    followed by whitespace and a version starting with ``3``. Files for
    which ``_too_many_blanks(fh, 5)`` is true, or which contain no
    non-blank line, are rejected.

    Returns
    -------
    tuple
        ``(True, {})`` if the version pragma is found at the start of the
        first real line, otherwise ``(False, {})``.
    """
    if _too_many_blanks(fh, 5):
        return False, {}

    try:
        first_line = next(_line_generator(fh, skip_blanks=True, strip=False))
    except StopIteration:
        # No non-blank line available to inspect.
        return False, {}

    # re.match anchors at the start of the line only, so anything may
    # follow the leading "3" (e.g. a minor version).
    version_pragma = re.match(r'##gff-version\s+3', first_line)
    return version_pragma is not None, {}
def _fastq_sniffer(fh):
    """Report whether ``fh`` looks like a FASTQ file.

    Strategy:
      * Give up if ``_too_many_blanks(fh, 5)`` reports too many leading
        blank/whitespace-only lines.
      * Read up to 10 records with ``_fastq_to_generator`` using a Phred
        offset of 33. If at least one record is read (i.e. the file isn't
        empty) and neither ``FASTQFormatError`` nor ``ValueError`` is
        raised (e.g. quality scores outside the printable ASCII range),
        assume FASTQ.

    NOTE(review): a second ``_fastq_sniffer`` definition follows this one
    in the source; if both live in the same module the later definition
    replaces this one -- confirm which is intended.

    Returns
    -------
    tuple
        ``(True, {})`` when the file is taken to be FASTQ, otherwise
        ``(False, {})``.
    """
    if _too_many_blanks(fh, 5):
        return False, {}

    records_seen = 0
    try:
        records = _fastq_to_generator(fh, phred_offset=33)
        # zip() against range(10) caps how many records are pulled.
        for _ in zip(range(10), records):
            records_seen += 1
    except (FASTQFormatError, ValueError):
        return False, {}

    # An empty file yields no records and is not considered FASTQ.
    return records_seen > 0, {}
def _fastq_sniffer(fh):
    """Report whether ``fh`` looks like a FASTQ file, detecting its variant.

    Strategy:
      * Give up if ``_too_many_blanks(fh, 5)`` reports too many leading
        blank/whitespace-only lines.
      * Read up to 10 records with ``_fastq_to_generator`` using a Phred
        offset of 33. If at least one record is read (i.e. the file isn't
        empty) and neither ``FASTQFormatError`` nor ``ValueError`` is
        raised (e.g. quality scores outside the printable ASCII range),
        assume FASTQ.
      * If any inspected record has an Illumina 1.8-style header, stop
        early and report ``variant='illumina1.8'``. A header qualifies
        when the id and description, concatenated and split on ``':'``,
        give exactly 10 fields and the second ``':'``-separated field of
        the description is exactly ``'Y'`` or ``'N'`` (presumably the
        "is filtered" flag of that header layout -- confirm against the
        Illumina documentation).

    Returns
    -------
    tuple
        ``(True, {'variant': 'illumina1.8'})`` when an Illumina 1.8-style
        header is seen, ``(True, {})`` for any other readable FASTQ, and
        ``(False, {})`` otherwise.
    """
    if _too_many_blanks(fh, 5):
        return False, {}

    try:
        not_empty = False
        records = _fastq_to_generator(fh, phred_offset=33)
        # zip() against range(10) caps how many records are pulled.
        for _, seq in zip(range(10), records):
            seq_id = seq.metadata['id']
            seq_description = seq.metadata['description']
            field_count = len((seq_id + seq_description).split(':'))
            description_fields = seq_description.split(':')
            # The length guard prevents an IndexError when the id alone
            # supplies all ten fields and the description has no ':'.
            # Tuple membership (rather than substring ``in 'YN'``) rejects
            # an empty flag and the literal 'YN'.
            if (field_count == 10
                    and len(description_fields) > 1
                    and description_fields[1] in ('Y', 'N')):
                return True, {'variant': 'illumina1.8'}
            not_empty = True
        return not_empty, {}
    except (FASTQFormatError, ValueError):
        return False, {}