def fixedWigIterator(fd, verbose=False, sortedby=None, scoreType=int):
  """
  @summary: iterate over a wig file/stream in fixedStep format, yielding
            each value as a GenomicInterval object.
  @param fd: filename or stream-like object to read from.
  @param verbose: if True, output progress messages to stderr.
  @param sortedby: if None, order is not checked; if ITERATOR_SORTED_START,
                   elements must be sorted by chrom and start index (an
                   exception is raised if they are not).
  @param scoreType: the data type for scores; int by default.
  """
  fh = openFD(fd)

  # error messages need a printable name; fd may be a filename string or a
  # stream, so guard the attribute access
  try:
    fd_name = fh.name
  except AttributeError:
    fd_name = "UNNAMED STREAM"

  if verbose:
    try:
      pind = ProgressIndicator(totalToDo=os.path.getsize(fh.name),
                               messagePrefix="completed",
                               messageSuffix="of processing " + fh.name)
    except AttributeError:
      sys.stderr.write("WigIterator -- warning: "
                       "unable to show progress for stream")
      verbose = False

  chromsSeen = set()
  prev = None
  currentChrom, at, step = None, None, None

  for line in fh:
    line = line.strip()
    if line == "":
      continue

    if line[0] == 't' or line[0] == 'f':
      # a declaration line: "track ..." lines are skipped, while
      # "fixedStep chrom=X start=Y step=Z" sets the current coordinates
      parts = line.split()
      if parts[0] == "track":
        continue
      elif parts[0] == "fixedStep":
        currentChrom = parts[1].split("=")[1]
        at = int(parts[2].split("=")[1])
        step = int(parts[3].split("=")[1])
    else:
      val = float(line)
      e = GenomicInterval(currentChrom, at, at + step, None, val,
                          scoreType=scoreType)

      # on same chrom as the prev item, make sure order is right
      if prev is not None and sortedby is not None and e.chrom == prev.chrom:
        if sortedby == ITERATOR_SORTED_START and prev.start > e.start:
          raise WigIteratorError("Wig file " + fd_name
                                 + " not sorted by start index - saw item "
                                 + str(prev) + " before " + str(e))

      # starting a new chrom.. make sure we haven't already seen it
      if prev is not None and prev.chrom != e.chrom:
        if (sortedby == ITERATOR_SORTED_START) and \
           (e.chrom in chromsSeen or prev.chrom > e.chrom):
          raise WigIteratorError("Wig file " + fd_name
                                 + " not sorted by chrom")
        chromsSeen.add(e.chrom)

      # all good..
      yield e
      prev = e
      at += step

    if verbose:
      pind.done = fh.tell()
      pind.showProgress()
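

# Illustrative usage sketch, not part of the original module: write a tiny
# fixedStep wig file and iterate it. Assumes only what the function above
# already implies (openFD accepts a filename; GenomicInterval has a sensible
# str()). The temp-file name and contents are made up for the example.
def _demo_fixedWigIterator():
  import tempfile
  with tempfile.NamedTemporaryFile(mode="w", suffix=".wig",
                                   delete=False) as t:
    t.write("fixedStep chrom=chr1 start=10 step=5\n1.0\n2.0\n")
  for e in fixedWigIterator(t.name):
    # first element covers chr1 [10, 15) with value 1.0, the next [15, 20)
    print(e)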
def __build_index(self, until=None, flush=False, verbose=False):
  """
  Build/expand the index for this file.

  :param until: expand the index until the record with this hash has been
                incorporated and then stop. If None, go until the iterator
                is exhausted. Note that if this hash is already in the
                index, no new items will be added.
  :param flush: if True, anything already in the index is discarded.
  :param verbose: if True, output progress messages to stderr.
  """
  assert self._indexed_file_handle is not None
  if flush:
    self._index = {}

  file_loc = self._indexed_file_handle.tell()
  if verbose:
    self._indexed_file_handle.seek(0, 2)  # seek to end
    total = self._indexed_file_handle.tell() - file_loc
    self._indexed_file_handle.seek(file_loc)  # back to where we were
    pind = ProgressIndicator(totalToDo=total, messagePrefix="completed",
                             messageSuffix="of building out index")

  for item in self.record_iterator(self._indexed_file_handle):
    hash_val = self.record_hash_function(item)
    self._index[hash_val] = file_loc
    file_loc = self._indexed_file_handle.tell()
    if until is not None and hash_val == until:
      break
    if verbose:
      pind.done = file_loc
      pind.showProgress()
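

# Standalone sketch, not part of the original module, of the hash -> byte
# offset scheme __build_index implements: remember where each record starts,
# then seek straight back to it later. readline (rather than "for line in
# fh") keeps tell() meaningful during the scan, the same reason the wig
# index builder below sticks to readline.
def _demo_offset_index():
  import tempfile
  with tempfile.NamedTemporaryFile(mode="w", delete=False) as t:
    t.write("alpha\nbeta\ngamma\n")
  index = {}
  with open(t.name) as fh:
    loc = fh.tell()
    for line in iter(fh.readline, ""):
      index[line.strip()] = loc  # record "hash" -> offset of record start
      loc = fh.tell()
  with open(t.name) as fh:
    fh.seek(index["beta"])  # jump straight to a record by its key
    assert fh.readline().strip() == "beta"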
def genericFileIterator(fn, verbose=False):
  """
  @summary: iterate over a file, yielding non-blank lines
  @param fn: either a string representing the name of the file or a
             file-like object
  @param verbose: if True, output status messages to stderr
  """
  if type(fn).__name__ == "str":
    fh = open(fn)
  else:
    fh = fn

  if verbose:
    try:
      pind = ProgressIndicator(totalToDo=os.path.getsize(fh.name),
                               messagePrefix="completed",
                               messageSuffix="of processing " + fh.name)
      junk = fh.tell()  # probe tell() so streams lacking it fail early
    except (AttributeError, OSError, IOError):
      sys.stderr.write("Cannot show progress for stream.. doesn't behave "
                       "like a file")
      verbose = False

  for line in fh:
    if verbose:
      pind.done = fh.tell()
      pind.showProgress()
    line = line.strip()
    if line == "":
      continue
    yield line
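

# Usage sketch, not part of the original module: blank and whitespace-only
# lines are skipped, everything else comes back stripped.
def _demo_genericFileIterator():
  import tempfile
  with tempfile.NamedTemporaryFile(mode="w", delete=False) as t:
    t.write("first\n\n   \nsecond\n")
  assert list(genericFileIterator(t.name)) == ["first", "second"]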
def repeat_masker_iterator(fh, alignment_index=None, header=True,
                           verbose=False):
  """
  Iterator for repeatmasker coordinate annotation files. These files describe
  the location of repeat occurrences. There is (optionally) a two-line header
  with the names of the fields (ignored by the iterator, if present). Each
  line is a record of an occurrence. The description of fields for each line
  is given in from_repeat_masker_string.

  :param fh: stream-like object, or string filename, to load the annotations
             from.
  :param alignment_index: an IndexedFile for full alignments; keys should be
                          repeat-masker IDs.
  :param header: if True, expect and discard the two-line header; otherwise
                 we will expect there is no header.
  :param verbose: if True, output additional status messages about progress
                  to stderr.
  """
  strm = fh
  if type(fh).__name__ == "str":
    strm = open(fh)

  # try to get an idea of how much data we have...
  if verbose:
    try:
      total = os.path.getsize(strm.name)
      pind = ProgressIndicator(totalToDo=total, messagePrefix="completed",
                               messageSuffix="of processing " + strm.name)
    except AttributeError as e:
      sys.stderr.write(str(e))
      sys.stderr.write("completed [unknown] of processing index")
      verbose = False

  if header:
    # chomp the first two lines
    next(strm)
    next(strm)

  for line in strm:
    if verbose:
      pind.done = strm.tell()
      pind.showProgress()
    line = line.strip()
    if line == "":
      continue
    rto = retrotransposon.from_repeat_masker_string(line)
    if alignment_index is not None:
      rto.pairwise_alignment = \
          JustInTimePairwiseAlignment(alignment_index, rto.uniq_id)
    yield rto
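

# Usage sketch, not part of the original module; the filename argument is
# hypothetical. With header=True (the default) the two-line header is
# discarded automatically, and each yielded object is whatever
# from_repeat_masker_string produces (pairwise_alignment is attached only
# when an alignment_index is supplied).
def _demo_repeat_masker_iterator(annotation_fn):
  for occurrence in repeat_masker_iterator(annotation_fn):
    print(occurrence)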
def fastqIteratorSimple(fn, verbose=False, allowNameMissmatch=False):
  """
  A generator function that yields FastqSequence objects read from a
  fastq-format stream or filename. This iterator requires that all sequence
  and quality data is provided on a single line -- put another way, it
  cannot parse fastq files with newline characters interspersed in the
  sequence and/or quality strings. That's probably okay though, as fastq
  files tend not to be formatted like that (famous last words..).

  :param fn: filename or stream to read data from.
  :param allowNameMissmatch: don't throw an error if the names in the
                             sequence and quality parts of a read don't
                             match. Newer versions of CASAVA seem to output
                             data like this, probably to save space.
  :param verbose: if True, output additional status messages to stderr about
                  progress.
  """
  fh = fn
  if type(fh).__name__ == "str":
    fh = open(fh)

  # try to get an idea of how much data we have...
  if verbose:
    try:
      total = os.path.getsize(fh.name)  # size in bytes, not lines
      pind = ProgressIndicator(totalToDo=total, messagePrefix="completed",
                               messageSuffix="of processing " + fh.name)
    except AttributeError:
      sys.stderr.write("fastqIterator -- warning: "
                       "unable to show progress for stream")
      verbose = False

  while True:
    # read four lines.. if we can't get four lines, something is wrong
    lines = []
    gotLines = 0
    while gotLines < 4:
      l = fh.readline()
      if verbose:
        pind.done = fh.tell()
        pind.showProgress()

      if l == "":
        # end of file found...
        if gotLines == 0:
          # ok, not in the middle of a sequence
          break
        else:
          raise FastqFileFormatError("reached end of file in the "
                                     "middle of sequence data")

      l = l.strip()
      if l == "":
        continue
      lines.append(l)
      gotLines += 1

    # couldn't get any more data.. we're done
    if gotLines == 0:
      break

    # got our 4 lines, assemble our read..
    # first check that names match
    if lines[0][1:] != lines[2][1:] and not allowNameMissmatch:
      raise FastqFileFormatError("names in sequence don't match : "
                                 + str(lines[0][1:]) + " != "
                                 + str(lines[2][1:]))

    name = lines[0][1:]
    seq = lines[1]
    qual = lines[3]
    yield NGSRead(seq, name, qual)
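

# Usage sketch, not part of the original module: two well-formed records,
# each on exactly four lines as the iterator requires. Only the record
# count is asserted, to avoid assuming anything about NGSRead's attributes.
def _demo_fastqIteratorSimple():
  import tempfile
  with tempfile.NamedTemporaryFile(mode="w", suffix=".fastq",
                                   delete=False) as t:
    t.write("@read1\nACGT\n+read1\nIIII\n"
            "@read2\nTTGG\n+read2\nIIII\n")
  reads = list(fastqIteratorSimple(t.name))
  assert len(reads) == 2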
def build(self):
  """
  Build the block index for the wig file attached to this object.
  """
  currentBlock = None
  at = self.handle.tell()
  seenChroms = set()
  lastIndexSeen = -1

  if self.verbose:
    try:
      pind = ProgressIndicator(totalToDo=os.path.getsize(self.handle.name),
                               messagePrefix="completed",
                               messageSuffix="of building index for "
                                             + self.handle.name)
    except (AttributeError, OSError):
      sys.stderr.write("IndexedWig -- warning: "
                       "unable to show progress for stream\n")
      self.verbose = False

  # note, the for loop seems to buffer the file and so tell() gives a
  # location that is not where the current line was read from, so we stick
  # to readline instead.
  rline = None
  while rline != "":
    # get the next element
    rline = self.handle.readline()
    line = rline.strip()
    if line == "":
      continue
    e = parseWigString(line)

    # keep track of what chroms we've seen for checking order
    if e.chrom not in seenChroms:
      seenChroms.add(e.chrom)
      lastIndexSeen = -1

    # check chrom order is ok
    for seenChrom in seenChroms:
      if seenChrom > e.chrom:
        msg = ("wig file is not sorted, entry for chrom " + str(seenChrom)
               + " appears after entry for " + str(e.chrom))
        raise IndexedWigError(msg)

    # check position order is ok
    if e.start < lastIndexSeen:
      msg = ("wig file is not sorted, entry for chrom " + str(e.chrom)
             + " at " + str(e.start) + " appears after "
             + str(lastIndexSeen))
      raise IndexedWigError(msg)

    # update the last index we've seen
    lastIndexSeen = e.end

    # debugging message, noting whether the current block is full
    if self.debug:
      sys.stderr.write("processing " + str(e))
      if currentBlock is not None:
        sys.stderr.write("; is current block full? "
                         + str(currentBlock.isfull()) + "\n")
      else:
        sys.stderr.write("\n")

    # we might need to make a new block for this element
    if currentBlock is None or currentBlock.isfull() or \
       currentBlock.chrom != e.chrom:
      if self.debug:
        sys.stderr.write("making new block with " + str(e) + "\n")
      if currentBlock is not None:
        if self.debug:
          sys.stderr.write("closed block: " + str(currentBlock) + "\n")
        if currentBlock.chrom not in self.blocksByChrom:
          self.blocksByChrom[currentBlock.chrom] = []
        self.blocksByChrom[currentBlock.chrom].append(currentBlock)
      currentBlock = WigBlock(at, e, self.blocksize)

    # add the element to the current block
    currentBlock.add(e)
    at = self.handle.tell()

    if self.verbose:
      pind.done = self.handle.tell()
      pind.showProgress()

  # don't forget to add the final block
  if currentBlock is not None:
    if self.debug:
      sys.stderr.write("closed block: " + str(currentBlock) + "\n")
    if currentBlock.chrom not in self.blocksByChrom:
      self.blocksByChrom[currentBlock.chrom] = []
    self.blocksByChrom[currentBlock.chrom].append(currentBlock)

  # build the interval trees
  for chrom in self.blocksByChrom:
    self.itrees[chrom] = IntervalTree(self.blocksByChrom[chrom],
                                      openEnded=True)
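

# Standalone sketch, not part of the original module, of the blocking scheme
# build() uses: consecutive elements go into the current block until it is
# full or the chromosome changes, at which point the block is closed and a
# new one started. Plain tuples stand in for parsed wig elements here.
def _demo_block_grouping():
  elements = [("chr1", 0, 5), ("chr1", 5, 10), ("chr1", 10, 15),
              ("chr2", 0, 5)]
  blocksize = 2
  blocks, current = [], []
  for e in elements:
    if current and (len(current) >= blocksize or current[0][0] != e[0]):
      blocks.append(current)
      current = []
    current.append(e)
  if current:
    blocks.append(current)
  assert [len(b) for b in blocks] == [2, 1, 1]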
def repeat_masker_alignment_iterator(fn, index_friendly=True, verbose=False):
  """
  Iterator for repeat masker alignment files; yields multiple alignment
  objects.

  Iterate over a file/stream of full repeat alignments in the repeatmasker
  format. Briefly, this format is as follows: each record (alignment) begins
  with a header line (see _rm_parse_header_line documentation for details of
  header format), followed by the alignment itself (example below) and
  finally a set of key-value meta-data pairs. The actual alignment looks
  like this::

    chr1              11 CCCTGGAGATTCTTATT--AGTGATTTGGGCT 41
                         ii        v     -- v   i i     v
    C MER5B#DNA/hAT   10 CCCCAGAGATTCTGATTTAATTGGTCTGGGGT 42

    chr1              42 GACTG 47
                          v
    C MER5B#DNA/hAT   43 CACTG 48

  The 'C' indicates that it is the reverse complement of the consensus. The
  central string gives information about matches; "-" indicates an
  insertion/deletion, "i" a transition (G<->A, C<->T) and "v" a transversion
  (all other substitutions).

  :param fn: filename or stream-like object to read from.
  :param index_friendly: if True, we will ensure the file/stream position is
                         before the start of the record when we yield it;
                         this requires the ability to seek within the stream
                         though, so if iterating over a stream without that
                         ability, you'll have to set this to False. Further,
                         this will disable buffering for the file, to ensure
                         file.tell() behaves correctly, so a performance hit
                         will be incurred.
  :param verbose: if True, output progress messages to stderr.
  """
  # step 1 -- build our iterator for the stream..
  try:
    fh = open(fn)
  except TypeError:
    fh = fn
  iterable = fh
  if index_friendly:
    iterable = iter(fh.readline, '')

  # build progress indicator, if we want one and we're able to
  if verbose:
    try:
      m_fn = ": " + fh.name
    except (AttributeError, TypeError):
      m_fn = ""
    try:
      current = fh.tell()
      fh.seek(0, 2)  # seek to end to find the total size...
      total_progress = fh.tell()
      fh.seek(current)  # ... then come back to where we were
      pind = ProgressIndicator(totalToDo=total_progress,
                               messagePrefix="completed",
                               messageSuffix="of processing repeat-masker "
                                             "alignment file" + m_fn)
    except IOError:
      pind = None

  old_fh_pos = None
  new_fh_pos = fh.tell()

  s1 = None
  s2 = None
  s1_name = None
  s2_name = None
  s1_start = None
  s1_end = None
  s2_start = None
  s2_end = None
  meta_data = None
  alignment_line_counter = 0
  alig_l_space = 0
  prev_seq_len = 0
  rev_comp_match = None
  remaining_repeat = None
  remaining_genomic = None

  for line in iterable:
    if verbose and pind is not None:
      pind.done = fh.tell()
      pind.showProgress()

    if index_friendly:
      old_fh_pos = new_fh_pos
      new_fh_pos = fh.tell()

    line = line.rstrip()
    if line.lstrip() == "" and alignment_line_counter % 3 != 1:
      continue

    s_pres_split = re.split(r'(\s+)', line)
    parts = [x for x in s_pres_split if not (x.isspace() or x == "")]

    n = len(parts)
    for i in REPEATMASKER_FIELDS_TO_TRIM:
      if n >= i + 1:
        parts[i] = parts[i].strip()

    # decide what to do with this line -- is it a header line, part of the
    # alignment or a meta-data key-value line
    if alignment_line_counter % 3 == 1:
      if (REPEATMASKER_VALIDATE_MUTATIONS
          and not _rm_is_valid_annotation_line(line)):
        raise IOError("invalid mutation line: " + line)
      l_space = _rm_compute_leading_space(s_pres_split) - alig_l_space
      pad_right = prev_seq_len - (l_space + len(line.strip()))
      meta_data[ANNOTATION_KEY] += ((' ' * l_space) + line.strip()
                                    + (' ' * pad_right))
      alignment_line_counter += 1
    elif _rm_is_header_line(parts, n):
      if not (s1 is None and s2 is None and meta_data is None):
        if ANNOTATION_KEY in meta_data:
          meta_data[ANNOTATION_KEY] = meta_data[ANNOTATION_KEY].rstrip()
        if index_friendly:
          fh.seek(old_fh_pos)
        ss1 = Sequence(s1_name, s1, s1_start, s1_end, "+", remaining_genomic)
        s2s = "-" if rev_comp_match else "+"
        ss2 = Sequence(s2_name, s2, s2_start, s2_end, s2s, remaining_repeat)
        yield PairwiseAlignment(ss1, ss2, meta_data)
        if index_friendly:
          fh.seek(new_fh_pos)
      meta_data = {}
      s1 = ""
      s2 = ""
      s1_name, s2_name = _rm_get_names_from_header(parts)
      s1_start, s1_end = _rm_get_reference_coords_from_header(parts)
      s2_start, s2_end = _rm_get_repeat_coords_from_header(parts)
      rev_comp_match = _rm_is_reverse_comp_match(parts)
      remaining_repeat = _rm_get_remaining_repeat_from_header(parts)
      remaining_genomic = _rm_get_remaining_genomic_from_header(parts)
      _rm_parse_header_line(parts, meta_data)
      alignment_line_counter = 0
    elif _rm_is_alignment_line(parts, s1_name, s2_name):
      alignment_line_counter += 1
      name, seq = _rm_extract_sequence_and_name(parts, s1_name, s2_name)
      if name == s1_name:
        s1 += seq
      elif name == s2_name:
        s2 += seq
      alig_l_space = _rm_compute_leading_space_alig(s_pres_split, seq)
      prev_seq_len = len(seq)
    else:
      k, v = _rm_parse_meta_line(parts)
      meta_data[k] = v

  # yield the final record; skip this if the input was empty and no record
  # was ever started
  if meta_data is not None:
    if index_friendly:
      fh.seek(old_fh_pos)
    ss1 = Sequence(s1_name, s1, s1_start, s1_end, "+", remaining_genomic)
    s2s = "-" if rev_comp_match else "+"
    ss2 = Sequence(s2_name, s2, s2_start, s2_end, s2s, remaining_repeat)
    yield PairwiseAlignment(ss1, ss2, meta_data)
    if index_friendly:
      fh.seek(new_fh_pos)
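

# Usage sketch, not part of the original module; the filename argument is
# hypothetical. index_friendly=False suits a plain sequential read and
# avoids the unbuffered-I/O performance hit; leave it at the default (True)
# when the positions of yielded records will be stored in an index.
def _demo_repeat_masker_alignment_iterator(alignment_fn):
  for alig in repeat_masker_alignment_iterator(alignment_fn,
                                               index_friendly=False):
    print(alig)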
def read_index(self, fh, indexed_fh, rec_iterator=None,
               rec_hash_func=None, parse_hash=str, flush=True,
               no_reindex=True, verbose=False):
  """
  Populate this index from a file. Input format is just a tab-separated
  file, one record per line. The last column is the file location for the
  record and all columns before that are collectively considered to be the
  hash key for that record (which is probably only 1 column, but this
  allows us to permit tabs in hash keys). Lines consisting only of
  whitespace are skipped.

  :param fh: filename or stream-like object to read from.
  :param indexed_fh: either the filename of the indexed file or handle to
                     it.
  :param rec_iterator: a function that will return an iterator for the
                       indexed file type (not the iterator for the file
                       itself). This function must take a single argument
                       which is the name of the file to iterate over, or a
                       stream-like object similar to a file stream.
  :param rec_hash_func: a function that accepts the record type produced by
                        the iterator and produces a unique hash for each
                        record.
  :param parse_hash: a function to convert the string representation of the
                     hash into whatever type is needed. By default, we just
                     leave these as strings.
  :param flush: remove everything currently in the index and discard any
                details about a file that is already fully/partially indexed
                by this object. This is the default behavior. If False, then
                data from <fh> is just added to the existing index data
                (potentially overwriting some of it) and the existing index
                can continue to be used as before.
  :param no_reindex: if True, after loading the index, a missing key will
                     cause an exception, rather than trigger re-scanning the
                     indexed file for the associated record. The only reason
                     to set this to False would be if your index was
                     incomplete.
  :param verbose: output status messages to STDERR about progress reading
                  the index (if possible).

  :raise IndexError: on malformed line in input file/stream
  """
  # set the record iterator and hash functions, if they were given
  if rec_iterator is not None:
    self.record_iterator = rec_iterator
  if rec_hash_func is not None:
    self.record_hash_function = rec_hash_func

  # disable re-indexing?
  self._no_reindex = no_reindex

  # figure out what kind of index identifier we got: handle or filename?
  handle = fh
  try:
    handle = open(fh)
  except TypeError:
    # okay, not a filename, we'll try treating it as a stream to read from.
    pass

  # clear this index?
  if flush:
    self._index = {}
    self._indexed_file_handle = None
    self._indexed_file_name = None

  # replace the name/handle for the indexed file
  indexed_fn = None
  try:
    # try treating this as a filename
    self.indexed_file = (indexed_fh, None)
    indexed_fn = indexed_fh
  except TypeError:
    try:
      # try treating this as a file handle
      self.indexed_file = (None, indexed_fh)
    except TypeError:
      fn = " from " + str(fh) if indexed_fn is not None else ""
      raise IndexError("failed to read index" + fn + "; "
                       "reason: expected indexed filename or stream-like "
                       "object, got " + str(type(indexed_fh)))

  # try to get an idea of how much data we have...
  if verbose:
    try:
      total = os.path.getsize(handle.name)
      pind = ProgressIndicator(totalToDo=total, messagePrefix="completed",
                               messageSuffix="of loading " + handle.name)
    except AttributeError as e:
      sys.stderr.write(str(e))
      sys.stderr.write("completed [unknown] of loading index")
      verbose = False

  # read the index file and populate this object
  for line in handle:
    line = line.rstrip()
    if verbose:
      pind.done = handle.tell()
      pind.showProgress()
    if line == "":
      # after rstrip, whitespace-only lines are empty; the old check used
      # line.isspace(), which is never True once the line has been rstrip'ed
      continue
    parts = line.split("\t")
    if len(parts) < 2:
      raise IndexError("failed to parse line: '" + line + "'")
    key = parse_hash("\t".join(parts[:-1]))
    value = parts[-1]
    self._index[key] = int(value)
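

# Usage sketch, not part of the original module: the on-disk index format is
# tab-separated, key columns first and byte offset last, exactly what
# read_index parses. `index_obj` stands for an instance of the class this
# method belongs to, and "reads.fastq" is a hypothetical indexed file whose
# record iterator / hash function are assumed set elsewhere.
def _demo_read_index(index_obj):
  import tempfile
  with tempfile.NamedTemporaryFile(mode="w", suffix=".idx",
                                   delete=False) as t:
    t.write("read1\t0\nread2\t137\n")
  index_obj.read_index(t.name, "reads.fastq")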
def BEDIterator(filehandle, sortedby=None, verbose=False, scoreType=int,
                dropAfter=None):
  """
  Get an iterator for a BED file.

  :param filehandle: this can be either a string, or a stream-like object.
                     In the former case, it is treated as a filename. The
                     format of the file/stream must be BED.
  :param sortedby: if None, order is not checked.
                   if == ITERATOR_SORTED_START, elements in the file must be
                   sorted by chrom and start index (an exception is raised
                   if they are not).
                   if == ITERATOR_SORTED_END, elements must be sorted by
                   chrom and end index.
  :param verbose: if True, output additional progress messages to stderr.
  :param scoreType: the data type for scores (the fifth column) in the BED
                    file.
  :param dropAfter: an int indicating that any fields after and including
                    this field should be ignored as they don't conform to
                    the BED format. By default, None, meaning we use all
                    fields. Indexed from zero.
  :return: iterator where subsequent calls to next() yield the next BED
           element in the stream as a GenomicInterval object.
  """
  chromsSeen = set()
  prev = None

  if type(filehandle).__name__ == "str":
    filehandle = open(filehandle)

  if verbose:
    try:
      pind = ProgressIndicator(totalToDo=os.path.getsize(filehandle.name),
                               messagePrefix="completed",
                               messageSuffix="of processing "
                                             + filehandle.name)
    except (AttributeError, OSError):
      sys.stderr.write("BEDIterator -- warning: "
                       "unable to show progress for stream")
      verbose = False

  for line in filehandle:
    if verbose:
      pind.done = filehandle.tell()
      pind.showProgress()

    if line.strip() == "":
      continue

    try:
      e = parseBEDString(line, scoreType, dropAfter=dropAfter)
    except GenomicIntervalError as ex:
      raise BEDError(str(ex) + " on line " + line)

    # sorting by name?
    if ((sortedby == ITERATOR_SORTED_NAME and prev is not None)
        and (prev.name > e.name)):
      raise BEDError("bed file " + filehandle.name
                     + " not sorted by element name;"
                     + " found " + e.name + " after " + prev.name)

    # first item
    if prev is None:
      chromsSeen.add(e.chrom)

    # on same chrom as the prev item, make sure order is right
    if prev is not None and sortedby is not None and e.chrom == prev.chrom:
      if sortedby == ITERATOR_SORTED_START and prev.start > e.start:
        raise BEDError("bed file " + filehandle.name
                       + " not sorted by start index - saw item "
                       + str(prev) + " before " + str(e))
      if sortedby == ITERATOR_SORTED_END and prev.end > e.end:
        raise BEDError("bed file " + filehandle.name
                       + " not sorted by end index - saw item "
                       + str(prev) + " before " + str(e))

    # starting a new chrom.. make sure we haven't already seen it
    if prev is not None and prev.chrom != e.chrom:
      if (sortedby == ITERATOR_SORTED_START
          or sortedby == ITERATOR_SORTED_END
          or sortedby == ITERATOR_SORTED_CHROM) and \
         (e.chrom in chromsSeen or prev.chrom > e.chrom):
        try:
          e_fn = filehandle.name
        except AttributeError:
          e_fn = "UNNAMED STREAM"
        raise BEDError("BED file " + e_fn + " not sorted by chrom")
      chromsSeen.add(e.chrom)

    # all good..
    yield e
    prev = e
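

# Usage sketch, not part of the original module: a three-line BED file that
# satisfies the ITERATOR_SORTED_START ordering (sorted by chrom, then start).
def _demo_BEDIterator():
  import tempfile
  with tempfile.NamedTemporaryFile(mode="w", suffix=".bed",
                                   delete=False) as t:
    t.write("chr1\t10\t20\nchr1\t30\t40\nchr2\t5\t15\n")
  for e in BEDIterator(t.name, sortedby=ITERATOR_SORTED_START):
    print(e)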