def _get_some_qual_and_lengths(fhand, force_file_as_non_seek):
    '''It samples the first fastq records looking at their quality lines.

    The records are read straight from fhand when fhand_is_seekable says so
    and force_file_as_non_seek is false.  Otherwise they are read from an
    in-memory copy of the chunk given by peek_chunk_from_file.

    It returns a tuple (lengths, is_sanger, chunk):
        - (None, True, chunk) as soon as one quality character with an
          ordinal lower than 64 is found.
        - (lengths, None, chunk) otherwise.  lengths is an array('I') with
          the length of every quality line analyzed and None means that it
          is not known whether the file is sanger.

    It raises UndecidedFastqVersionError if the fastq parser raises a
    ValueError.  fhand.seek(0) is always called before leaving.
    '''
    max_records = get_setting('SEQS_TO_GUESS_FASTQ_VERSION')
    peek_size = get_setting('CHUNK_TO_GUESS_FASTQ_VERSION')
    lengths = array('I')

    if not fhand_is_seekable(fhand) or force_file_as_non_seek:
        # Parse only the peeked chunk, the file itself is not consumed here.
        chunk = peek_chunk_from_file(fhand, peek_size)
        records_source = cStringIO.StringIO(chunk)
    else:
        chunk = fhand.read(peek_size)
        fhand.seek(0)
        records_source = fhand

    try:
        records = FastqGeneralIterator(records_source)
        for n_read, record in enumerate(records, 1):
            quality = record[2]
            if any(ord(char) < 64 for char in quality):
                fhand.seek(0)
                # no lengths, is_sanger
                return None, True, chunk
            lengths.append(len(quality))
            if n_read > max_records:
                break
    except ValueError:
        raise UndecidedFastqVersionError(
            'The file is Fastq, but the version is difficult to guess'
        )
    finally:
        fhand.seek(0)

    # don't know if it's sanger
    return lengths, None, chunk
def _guess_format(fhand, force_file_as_non_seek):
    '''It guesses the format of the sequence file.

    This function is just for testing forcing the fhand as non-seekable.
    It does ignore the solexa fastq version.

    A chunk of chunk_size (2048) is requested with peek_chunk_from_file and
    the decision is taken from how that chunk starts:
        '>'     -> 'qual' if the first item of the second line is made only
                   of digits, 'fasta' otherwise
        '@'     -> whatever _guess_fastq_version(fhand,
                   force_file_as_non_seek) returns
        'LOCUS' -> 'genbank'
        'ID'    -> 'embl'

    It raises FileIsEmptyError if the chunk is empty and UnknownFormatError
    if the chunk starts with none of the above or if a '>' chunk has no
    usable second line (missing, blank or another '>' header).
    '''
    chunk_size = 2048
    chunk = peek_chunk_from_file(fhand, chunk_size)
    if not chunk:
        raise FileIsEmptyError('The file is empty')

    if chunk.startswith('>'):
        lines = chunk.splitlines()
        # A chunk with just the header line used to fail with an IndexError
        # instead of reporting a format problem.
        if len(lines) < 2 or lines[1].startswith('>'):
            raise UnknownFormatError('Malformed fasta')
        items = lines[1].split()
        # A blank second line has no first item to look at, this was also
        # an IndexError.
        if not items:
            raise UnknownFormatError('Malformed fasta')
        if items[0].isdigit():
            return 'qual'
        return 'fasta'
    elif chunk.startswith('@'):
        return _guess_fastq_version(fhand, force_file_as_non_seek)
    elif chunk.startswith('LOCUS'):
        return 'genbank'
    elif chunk.startswith('ID'):
        return 'embl'
    raise UnknownFormatError('Sequence file of unknown format.')
def _guess_format(fhand, force_file_as_non_seek):
    """It guesses the format of the sequence file.

    This function is just for testing forcing the fhand as non-seekable.
    It does ignore the solexa fastq version.

    A chunk of chunk_size (1024) is requested with peek_chunk_from_file and
    the decision is taken from how that chunk starts:
        ">"     -> "qual" if the first item of the second line is made only
                   of digits, "fasta" otherwise
        "@"     -> whatever _guess_fastq_version(fhand,
                   force_file_as_non_seek) returns
        "LOCUS" -> "genbank"
        "ID"    -> "embl"

    It raises UnknownFormatError if the chunk is empty, if it starts with
    none of the above or if a ">" chunk has no usable second line (missing,
    blank or another ">" header).
    """
    chunk_size = 1024
    chunk = peek_chunk_from_file(fhand, chunk_size)
    if not chunk:
        raise UnknownFormatError("The file is empty")

    if chunk.startswith(">"):
        lines = chunk.splitlines()
        # A chunk with just the header line used to fail with an IndexError
        # instead of reporting a format problem.
        if len(lines) < 2 or lines[1].startswith(">"):
            raise UnknownFormatError("Malformed fasta")
        items = lines[1].split()
        # A blank second line has no first item to look at, this was also
        # an IndexError.
        if not items:
            raise UnknownFormatError("Malformed fasta")
        if items[0].isdigit():
            return "qual"
        return "fasta"
    elif chunk.startswith("@"):
        return _guess_fastq_version(fhand, force_file_as_non_seek)
    elif chunk.startswith("LOCUS"):
        return "genbank"
    elif chunk.startswith("ID"):
        return "embl"
    raise UnknownFormatError("Sequence file of unknown format.")
def _get_some_qual_and_lengths(fhand, force_file_as_non_seek):
    """It samples the first fastq records looking at their quality lines.

    The records are read straight from fhand when fhand_is_seekable says so
    and force_file_as_non_seek is false.  Otherwise they are read from an
    in-memory copy of the chunk given by peek_chunk_from_file.

    It returns a tuple (lengths, is_sanger):
        - (None, True) as soon as one quality character with an ordinal
          lower than 64 is found.
        - (lengths, None) otherwise.  lengths is an array("I") with the
          length of every quality line analyzed and None means that it is
          not known whether the file is sanger.

    It raises UnknownFormatError if the fastq parser raises a ValueError.
    fhand.seek(0) is always called before leaving.
    """
    max_records = SEQS_TO_GUESS_FASTQ_VERSION
    peek_size = CHUNK_TO_GUESS_FASTQ_VERSION
    lengths = array("I")

    if not fhand_is_seekable(fhand) or force_file_as_non_seek:
        # Parse only the peeked chunk, the file itself is not consumed here.
        peeked = peek_chunk_from_file(fhand, peek_size)
        records_source = cStringIO.StringIO(peeked)
    else:
        records_source = fhand

    try:
        records = FastqGeneralIterator(records_source)
        for n_read, record in enumerate(records, 1):
            quality = record[2]
            if any(ord(char) < 64 for char in quality):
                fhand.seek(0)
                # no lengths, is_sanger
                return None, True
            lengths.append(len(quality))
            if n_read > max_records:
                break
    except ValueError:
        raise UnknownFormatError("Malformed fastq")
    finally:
        fhand.seek(0)

    # lengths, don't know if it's sanger
    return lengths, None