def fastqIteratorComplex(fn, useMutableString=False, verbose=False):
    """
    A generator function which yields NGSRead objects read from a file or
    stream. This iterator can handle fastq files that have their sequence
    and/or their quality data split across multiple lines (i.e. there are
    newline characters in the sequence and quality strings).

    :param fn: A file-like stream or a string; if this is a string, it's
               treated as a filename and opened here (and closed again when
               the generator finishes or is closed); otherwise it is handed
               unchanged to the fastq_complex_parse_* helpers and is left
               open for the caller to manage.
    :param useMutableString: forwarded to NGSRead as its fourth argument;
                             per the original documentation, if True the
                             sequences are built from lists of chars rather
                             than python strings. Use with caution.
    :param verbose: if True, print messages on progress to stderr. Progress
                    is silently downgraded (with a warning) when the stream
                    has no ``name`` attribute.
    """
    # Only close what we opened ourselves; a caller-supplied stream stays
    # under the caller's control.
    opened_here = isinstance(fn, str)
    fh = open(fn) if opened_here else fn

    try:
        prevLine = None

        # try to get an idea of how much data we have...
        pind = None
        if verbose:
            try:
                totalLines = linesInFile(fh.name)
                pind = ProgressIndicator(
                    totalToDo=totalLines,
                    messagePrefix="completed",
                    messageSuffix="of processing " + fh.name)
            except AttributeError:
                sys.stderr.write("fastqIterator -- warning: " +
                                 "unable to show progress for stream")
                verbose = False

        while True:
            # either we have a sequence header left over from the prev
            # iteration, or we need to read a new one from the file...
            name, prevLine = fastq_complex_parse_seq_header(
                fh, prevLine, pind, verbose)

            # read lines until we hit a qual header --> this is our
            # sequence data
            seqdata, line = fastq_complex_parse_seq(fh, pind, verbose)

            # <line> is now a qual header, keep reading until we see a
            # sequence header.. or we run out of lines.. this is our
            # quality data
            qualdata, line, prevLine = fastq_complex_parse_qual(
                fh, line, prevLine, verbose, pind)

            # package it all up..
            yield NGSRead(seqdata, name, qualdata, useMutableString)
            if verbose:
                pind.showProgress()

            # remember where we stopped for the next record; an empty
            # string means the input is exhausted
            prevLine = line
            if prevLine == "":
                break
    finally:
        # runs on normal exhaustion, on error, and on generator.close()
        if opened_here:
            fh.close()
def regularWigIterator(fd, verbose=False, sortedby=None, scoreType=int):
    """
    Generator that yields one parsed element per non-blank line of a wig
    stream, optionally checking that the input is sorted.

    @param fd: passed to openFD() to obtain the iterable of lines, and to
               getFDName() / linesInFile() for progress and error messages.
    @param verbose: if True, show progress on stderr; disabled with a
                    warning if setting up the progress indicator raises
                    AttributeError.
    @param sortedby: if not None, should be ITERATOR_SORTED_START,
                     indicating an order that the input stream must be
                     sorted in.
    @param scoreType: forwarded to parseWigString as scoreType.
    @raise WigIteratorError: if sortedby is set and stream is not sorted.
    """
    if verbose:
        try:
            totalLines = linesInFile(fd)
            pind = ProgressIndicator(
                totalToDo=totalLines,
                messagePrefix="completed",
                messageSuffix="of processing " + getFDName(fd))
        except AttributeError:
            sys.stderr.write("WigIterator -- warning: " +
                             "unable to show progress for stream")
            verbose = False

    chromsSeen = set()
    prev = None

    fh = openFD(fd)
    for line in fh:
        if verbose:
            pind.done += 1
            pind.showProgress()

        line = line.strip()
        if line == "":
            continue

        e = parseWigString(line, scoreType=scoreType)

        # on same chrom as the prev item, make sure order is right
        if prev is not None and sortedby is not None and \
           e.chrom == prev.chrom:
            if sortedby == ITERATOR_SORTED_START and prev.start > e.start:
                # getFDName (already used for the progress message) is used
                # rather than fd.name, which a plain filename string lacks.
                raise WigIteratorError(
                    "wig file " + getFDName(fd) +
                    " not sorted by start index - saw item " +
                    str(prev) + " before " + str(e))

        # starting a new chrom.. make sure we haven't already seen it
        if prev is not None and prev.chrom != e.chrom:
            if (sortedby == ITERATOR_SORTED_START) and \
               (e.chrom in chromsSeen or prev.chrom > e.chrom):
                raise WigIteratorError("wig file " + getFDName(fd) +
                                       " not sorted by chrom")
            chromsSeen.add(e.chrom)

        # all good..
        yield e
        prev = e
def regularWigIterator(fd, verbose=False, sortedby=None, scoreType=int):
    """
    Generator that yields one parsed element per non-blank line of a wig
    stream, optionally checking that the input is sorted.

    @param fd: passed to openFD() to obtain the iterable of lines, and to
               getFDName() / linesInFile() for progress and error messages.
    @param verbose: if True, show progress on stderr; disabled with a
                    warning if setting up the progress indicator raises
                    AttributeError.
    @param sortedby: if not None, should be ITERATOR_SORTED_START,
                     indicating an order that the input stream must be
                     sorted in.
    @param scoreType: forwarded to parseWigString as scoreType.
    @raise WigIteratorError: if sortedby is set and stream is not sorted.
    """
    if verbose:
        try:
            totalLines = linesInFile(fd)
            pind = ProgressIndicator(
                totalToDo=totalLines,
                messagePrefix="completed",
                messageSuffix="of processing " + getFDName(fd))
        except AttributeError:
            sys.stderr.write("WigIterator -- warning: " +
                             "unable to show progress for stream")
            verbose = False

    chromsSeen = set()
    prev = None

    fh = openFD(fd)
    for line in fh:
        if verbose:
            pind.done += 1
            pind.showProgress()

        line = line.strip()
        if line == "":
            continue

        e = parseWigString(line, scoreType=scoreType)

        # on same chrom as the prev item, make sure order is right
        if prev is not None and sortedby is not None and \
           e.chrom == prev.chrom:
            if sortedby == ITERATOR_SORTED_START and prev.start > e.start:
                # getFDName (already used for the progress message) is used
                # rather than fd.name, which a plain filename string lacks.
                raise WigIteratorError(
                    "wig file " + getFDName(fd) +
                    " not sorted by start index - saw item " +
                    str(prev) + " before " + str(e))

        # starting a new chrom.. make sure we haven't already seen it
        if prev is not None and prev.chrom != e.chrom:
            if (sortedby == ITERATOR_SORTED_START) and \
               (e.chrom in chromsSeen or prev.chrom > e.chrom):
                raise WigIteratorError("wig file " + getFDName(fd) +
                                       " not sorted by chrom")
            chromsSeen.add(e.chrom)

        # all good..
        yield e
        prev = e
def intervalTrees(reffh, scoreType=int, verbose=False):
    """
    Build a dictionary of interval trees indexed by chrom from a BED stream
    or file.

    :param reffh: This can be either a string, or a stream-like object. In
                  the former case, it is treated as a filename, opened here
                  and closed again once all elements have been read. A
                  caller-supplied stream is left open. The format of the
                  file/stream must be BED.
    :param scoreType: The data type for scores (the fifth column) in the BED
                      file; forwarded to BEDIterator.
    :param verbose: output progress messages to sys.stderr if True. Loading
                    progress is skipped (with a warning) for streams that
                    have no ``name`` attribute, and for sys.stdin.
    :return: dict mapping each chrom to an IntervalTree built (with
             openEnded=True) from the elements seen for that chrom.
    """
    # Only close what we opened ourselves.
    opened_here = isinstance(reffh, str)
    fh = open(reffh) if opened_here else reffh

    # load all the regions and split them into lists for each chrom
    elements = {}
    try:
        pind = None
        if verbose and fh != sys.stdin:
            stream_name = getattr(fh, "name", None)
            if stream_name is None:
                # nameless stream: we cannot count its lines up front, so
                # degrade to no loading-progress instead of crashing
                sys.stderr.write("intervalTrees -- warning: " +
                                 "unable to show progress for stream\n")
            else:
                totalLines = linesInFile(stream_name)
                pind = ProgressIndicator(
                    totalToDo=totalLines,
                    messagePrefix="completed",
                    messageSuffix="of loading " + stream_name)

        for element in BEDIterator(fh, scoreType=scoreType, verbose=verbose):
            elements.setdefault(element.chrom, []).append(element)
            if pind is not None:
                pind.done += 1
                pind.showProgress()
    finally:
        if opened_here:
            fh.close()

    # create an interval tree for each list
    trees = {}
    if verbose:
        pind = ProgressIndicator(totalToDo=len(elements),
                                 messagePrefix="completed",
                                 messageSuffix="of making interval trees")
    for chrom in elements:
        trees[chrom] = IntervalTree(elements[chrom], openEnded=True)
        if verbose:
            pind.done += 1
            pind.showProgress()

    return trees
def fastqIteratorComplex(fn, useMutableString=False, verbose=False):
    """
    A generator function which yields NGSRead objects read from a file or
    stream. This iterator can handle fastq files that have their sequence
    and/or their quality data split across multiple lines (i.e. there are
    newline characters in the sequence and quality strings).

    :param fn: A file-like stream or a string; if this is a string, it's
               treated as a filename and opened here (and closed again when
               the generator finishes or is closed); otherwise it is handed
               unchanged to the fastq_complex_parse_* helpers and is left
               open for the caller to manage.
    :param useMutableString: forwarded to NGSRead as its fourth argument;
                             per the original documentation, if True the
                             sequences are built from lists of chars rather
                             than python strings. Use with caution.
    :param verbose: if True, print messages on progress to stderr. Progress
                    is silently downgraded (with a warning) when the stream
                    has no ``name`` attribute.
    """
    # Only close what we opened ourselves; a caller-supplied stream stays
    # under the caller's control.
    opened_here = isinstance(fn, str)
    fh = open(fn) if opened_here else fn

    try:
        prevLine = None

        # try to get an idea of how much data we have...
        pind = None
        if verbose:
            try:
                totalLines = linesInFile(fh.name)
                pind = ProgressIndicator(
                    totalToDo=totalLines,
                    messagePrefix="completed",
                    messageSuffix="of processing " + fh.name)
            except AttributeError:
                sys.stderr.write("fastqIterator -- warning: " +
                                 "unable to show progress for stream")
                verbose = False

        while True:
            # either we have a sequence header left over from the prev
            # iteration, or we need to read a new one from the file...
            name, prevLine = fastq_complex_parse_seq_header(
                fh, prevLine, pind, verbose)

            # read lines until we hit a qual header --> this is our
            # sequence data
            seqdata, line = fastq_complex_parse_seq(fh, pind, verbose)

            # <line> is now a qual header, keep reading until we see a
            # sequence header.. or we run out of lines.. this is our
            # quality data
            qualdata, line, prevLine = fastq_complex_parse_qual(
                fh, line, prevLine, verbose, pind)

            # package it all up..
            yield NGSRead(seqdata, name, qualdata, useMutableString)
            if verbose:
                pind.showProgress()

            # remember where we stopped for the next record; an empty
            # string means the input is exhausted
            prevLine = line
            if prevLine == "":
                break
    finally:
        # runs on normal exhaustion, on error, and on generator.close()
        if opened_here:
            fh.close()