示例#1
0
def fixedWigIterator(fd, verbose=False, sortedby=None, scoreType=int):
  """
    @summary: iterate over a fixedStep wiggle file/stream, yielding one
              GenomicInterval per data line. Each interval spans
              [at, at + step) on the chromosome named by the most recent
              fixedStep declaration line.
    @param fd: passed unchanged to openFD to obtain the stream to read
    @param verbose: if True, show progress on stderr; silently disabled
                    (after a warning) when the stream size can't be found
    @param sortedby: if ITERATOR_SORTED_START, check that items are ordered
                     by chromosome and start; None disables the checks
    @param scoreType: forwarded to GenomicInterval as its scoreType
    @raise WigIteratorError: if an ordering check fails, or if a data line
                             appears before any fixedStep declaration
  """
  fh = openFD(fd)
  # fd may be a plain filename, in which case it has no .name attribute;
  # resolve a printable name once so the error paths below can't themselves
  # fail with AttributeError.
  src_name = str(getattr(fd, "name", getattr(fh, "name", fd)))

  pind = None
  if verbose:
    try:
      pind = ProgressIndicator(totalToDo=os.path.getsize(fh.name),
                               messagePrefix="completed",
                               messageSuffix="of processing " + fh.name)
    except (AttributeError, OSError, TypeError):
      # no usable name/size (e.g. a pipe or in-memory stream)
      sys.stderr.write("WigIterator -- warning: "
                       + "unable to show progress for stream")
      pind = None

  if pind is None:
    lines = fh
  else:
    # tell() is unreliable (Python 2) or raises (Python 3 text mode) while a
    # file is being iterated directly, so read via readline() when we need
    # positions. "or None" maps the EOF value ('' or b'') onto the sentinel.
    lines = iter(lambda: fh.readline() or None, None)

  chromsSeen = set()
  prev = None

  currentChrom, at, step = None, None, None
  for line in lines:
    line = line.strip()
    if line == "":
      continue

    if line[0] == 't' or line[0] == 'f':
      # declaration lines: "track ..." is ignored, "fixedStep ..." resets
      # the chromosome / position / step used for the following data lines
      parts = line.split()
      if parts[0] == "track":
        continue
      elif parts[0] == "fixedStep":
        currentChrom = parts[1].split("=")[1]
        at = int(parts[2].split("=")[1])
        step = int(parts[3].split("=")[1])
    else:
      if at is None or step is None:
        raise WigIteratorError("Wig file " + src_name
                               + " has data before any fixedStep declaration")
      val = float(line)
      e = GenomicInterval(currentChrom, at, at + step, None,
                          val, scoreType=scoreType)

      # on same chrom as the prev item, make sure order is right
      if prev is not None and sortedby is not None and e.chrom == prev.chrom:
        if sortedby == ITERATOR_SORTED_START and prev.start > e.start:
          raise WigIteratorError("Wig file " + src_name
                                 + " not sorted by start index - saw item "
                                 + str(prev) + " before " + str(e))

      # starting a new chrom.. make sure we haven't already seen it
      if prev is not None and prev.chrom != e.chrom:
        if (sortedby == ITERATOR_SORTED_START) and\
           (e.chrom in chromsSeen or prev.chrom > e.chrom):
          raise WigIteratorError("Wig file " + src_name
                                 + " not sorted by chrom")
        chromsSeen.add(e.chrom)

      # all good..
      yield e
      prev = e
      at += step
      if pind is not None:
        pind.done = fh.tell()
        pind.showProgress()
示例#2
0
  def __build_index(self, until=None, flush=False, verbose=False):
    """
    Build/expand the index for this file.

    Records are read from the indexed file handle starting at its current
    position; each record's hash is mapped to the file offset that was
    current just before the record was read.

    :param until:   expand the index until the record with this hash has been
                    incorporated and then stop. If None, go until the iterator
                    is exhausted. (The original note about hashes already in
                    the index was truncated; this method itself does not check
                    whether the hash is already indexed.)
    :param flush:   if True, anything already in the index is discarded.
    :param verbose: if True, show progress on stderr; requires the handle to
                    support seek().
    """
    assert(self._indexed_file_handle is not None)
    if flush:
      self._index = {}

    start_loc = self._indexed_file_handle.tell()
    file_loc = start_loc

    pind = None
    if verbose:
      self._indexed_file_handle.seek(0, 2)  # seek to end
      total = self._indexed_file_handle.tell() - start_loc
      self._indexed_file_handle.seek(start_loc)  # back to where we were
      pind = ProgressIndicator(totalToDo=total,
                               messagePrefix="completed",
                               messageSuffix="of building out index")

    for item in self.record_iterator(self._indexed_file_handle):
      hash_val = self.record_hash_function(item)
      self._index[hash_val] = file_loc
      file_loc = self._indexed_file_handle.tell()
      if until is not None and hash_val == until:
        break
      if pind is not None:
        # total is measured from start_loc, so progress must be too;
        # otherwise it overshoots when indexing resumes part-way in
        pind.done = file_loc - start_loc
        pind.showProgress()
示例#3
0
def genericFileIterator(fn, verbose=False):
  """
    @summary: iterate over a file, returning non-blank lines with
              surrounding whitespace stripped
    @param fn: either a string representing the name of the file or a file
               object. A file opened here from a name is closed when the
               iterator finishes or is closed; a handle passed in by the
               caller is left open.
    @param verbose: if True, output status messages to stderr
  """
  opened_here = isinstance(fn, str)
  fh = open(fn) if opened_here else fn

  try:
    pind = None
    if verbose:
      try:
        pind = ProgressIndicator(totalToDo=os.path.getsize(fh.name),
                                 messagePrefix="completed",
                                 messageSuffix="of processing " + fh.name)
        fh.tell()  # probe: make sure the stream can report its position
      except Exception:
        # best-effort only: any failure just turns progress reporting off
        sys.stderr.write("Cannot show progress for stream.. doesn't behave "
                         "like a file")
        pind = None

    if pind is None:
      lines = fh
    else:
      # tell() is unreliable (Python 2) or raises (Python 3 text mode) while
      # a file is being iterated directly, so read via readline() when we
      # need positions. "or None" maps the EOF value onto the sentinel.
      lines = iter(lambda: fh.readline() or None, None)

    for line in lines:
      if pind is not None:
        pind.done = fh.tell()
        pind.showProgress()
      line = line.strip()
      if not line:
        continue
      yield line
  finally:
    if opened_here:
      fh.close()
示例#4
0
def genericFileIterator(fn, verbose=False):
    """
    @summary: iterate over a file, returning non-blank lines with
              surrounding whitespace stripped
    @param fn: either a string representing the name of the file or a file
               object. A file opened here from a name is closed when the
               iterator finishes or is closed; a handle passed in by the
               caller is left open.
    @param verbose: if True, output status messages to stderr
    """
    opened_here = isinstance(fn, str)
    fh = open(fn) if opened_here else fn

    try:
        pind = None
        if verbose:
            try:
                pind = ProgressIndicator(
                    totalToDo=os.path.getsize(fh.name),
                    messagePrefix="completed",
                    messageSuffix="of processing " + fh.name)
                fh.tell()  # probe: the stream must report its position
            except Exception:
                # best-effort only: any failure just turns progress off
                sys.stderr.write(
                    "Cannot show progress for stream.. doesn't behave "
                    "like a file")
                pind = None

        if pind is None:
            lines = fh
        else:
            # tell() is unreliable (Python 2) or raises (Python 3 text mode)
            # while a file is being iterated directly, so read via
            # readline() when we need positions. "or None" maps the EOF
            # value onto the sentinel.
            lines = iter(lambda: fh.readline() or None, None)

        for line in lines:
            if pind is not None:
                pind.done = fh.tell()
                pind.showProgress()
            line = line.strip()
            if not line:
                continue
            yield line
    finally:
        if opened_here:
            fh.close()
示例#5
0
    def __build_index(self, until=None, flush=False, verbose=False):
        """
        Build/expand the index for this file.

        Records are read from the indexed file handle starting at its current
        position; each record's hash is mapped to the file offset that was
        current just before the record was read.

        :param until:   expand the index until the record with this hash has
                        been incorporated and then stop. If None, go until the
                        iterator is exhausted. (The original note about hashes
                        already in the index was truncated; this method itself
                        does not check whether the hash is already indexed.)
        :param flush:   if True, anything already in the index is discarded.
        :param verbose: if True, show progress on stderr; requires the handle
                        to support seek().
        """
        assert (self._indexed_file_handle is not None)
        if flush:
            self._index = {}

        start_loc = self._indexed_file_handle.tell()
        file_loc = start_loc

        pind = None
        if verbose:
            self._indexed_file_handle.seek(0, 2)  # seek to end
            total = self._indexed_file_handle.tell() - start_loc
            self._indexed_file_handle.seek(start_loc)  # back to where we were
            pind = ProgressIndicator(totalToDo=total,
                                     messagePrefix="completed",
                                     messageSuffix="of building out index")

        for item in self.record_iterator(self._indexed_file_handle):
            hash_val = self.record_hash_function(item)
            self._index[hash_val] = file_loc
            file_loc = self._indexed_file_handle.tell()
            if until is not None and hash_val == until:
                break
            if pind is not None:
                # total is measured from start_loc, so progress must be too;
                # otherwise it overshoots when indexing resumes part-way in
                pind.done = file_loc - start_loc
                pind.showProgress()
示例#6
0
def repeat_masker_iterator(fh, alignment_index=None,
                           header=True, verbose=False):
  """
  Iterator for repeatmasker coordinate annotation files. These files describe
  the location of repeat occurrences. There is (optionally) a two-line header
  with the names of the fields (ignored by the iterator, if present). Each line
  is a record of an occurrence. The description of fields for each line is
  given in from_repeat_masker_string.

  A file opened here from a filename is closed when the iterator finishes or
  is closed; a stream passed in by the caller is left open.

  :param fh:              stream-like object, or string filename, to load the
                          annotations from
  :param alignment_index: an IndexedFile for full alignments; keys should be
                          repeat-masker IDs
  :param header:          if True, expect and discard the two-line header;
                          otherwise we will expect there is no header. Input
                          shorter than the header simply yields nothing.
  :param verbose:         if True, output additional status messages about
                          progress to stderr.
  """
  opened_here = isinstance(fh, str)
  strm = open(fh) if opened_here else fh

  try:
    # try to get an idea of how much data we have...
    pind = None
    if verbose:
      try:
        total = os.path.getsize(strm.name)
        pind = ProgressIndicator(totalToDo=total, messagePrefix="completed",
                                 messageSuffix="of processing " + strm.name)
      except (AttributeError, OSError, TypeError) as e:
        # no usable name/size (e.g. a pipe or in-memory stream)
        sys.stderr.write(str(e))
        sys.stderr.write("completed [unknown] of processing index")
        pind = None

    if pind is None:
      lines = iter(strm)
    else:
      # tell() is unreliable (Python 2) or raises (Python 3 text mode) while
      # a file is being iterated directly, so read via readline() when we
      # need positions. "or None" maps the EOF value onto the sentinel.
      lines = iter(lambda: strm.readline() or None, None)

    if header:
      # chomp first 2 lines; the default stops a short/empty file from
      # leaking StopIteration out of this generator (RuntimeError, PEP 479)
      next(lines, None)
      next(lines, None)

    for line in lines:
      if pind is not None:
        pind.done = strm.tell()
        pind.showProgress()

      line = line.strip()
      if line == "":
        continue
      rto = retrotransposon.from_repeat_masker_string(line)
      if alignment_index is not None:
        # attach a lazily-resolved alignment keyed on the record's unique id
        rto.pairwise_alignment =\
            JustInTimePairwiseAlignment(alignment_index, rto.uniq_id)
      yield rto
  finally:
    if opened_here:
      strm.close()
示例#7
0
def fastqIteratorSimple(fn, verbose=False, allowNameMissmatch=False):
    """
    A generator function that yields NGSRead objects read from a
    fastq-format stream or filename. This iterator requires that all
    sequence and quality data is provided on a single line -- put another way,
    it cannot parse fastq files with newline characters interspersed in the
    sequence and/or quality strings. That's probably okay though, as fastq
    files tend not to be formatted like that (famous last words..).

    Blank lines are skipped. A file opened here from a filename is closed
    when the iterator finishes or is closed; a stream passed in by the
    caller is left open.

    :param fn: filename or stream to read data from.
    :param allowNameMissmatch: don't throw error if name in sequence data
                               and quality data parts of a read don't match.
                               Newer version of CASVA seem to output data like
                               this, probably to save space.
    :param verbose: if True, output additional status messages to stderr about
                    progress.
    :raise FastqFileFormatError: if the input ends part-way through a
                                 four-line record, or the names on the first
                                 and third lines differ (unless allowed).
    """
    opened_here = isinstance(fn, str)
    fh = open(fn) if opened_here else fn

    try:
        # try to get an idea of how much data we have...
        pind = None
        if verbose:
            try:
                totalBytes = os.path.getsize(fh.name)
                pind = ProgressIndicator(
                    totalToDo=totalBytes,
                    messagePrefix="completed",
                    messageSuffix="of processing " + fh.name,
                )
            except (AttributeError, OSError, TypeError):
                # no usable name/size (e.g. a pipe or in-memory stream)
                sys.stderr.write("fastqIterator -- warning: "
                                 "unable to show progress for stream")
                pind = None

        while True:
            # read four non-blank lines.. if we can't get four, something
            # is wrong
            lines = []
            while len(lines) < 4:
                l = fh.readline()
                if pind is not None:
                    pind.done = fh.tell()
                    pind.showProgress()

                if not l:
                    # end of file found...
                    if not lines:
                        # ok, not in the middle of a sequence
                        break
                    raise FastqFileFormatError(
                        "reached end of file in the middle of sequence data")

                l = l.strip()
                if not l:
                    continue
                lines.append(l)

            # couldn't get any more data.. we're done
            if not lines:
                break

            # got our 4 lines, assemble our read..
            # first check that names match (leading '@' / '+' is dropped)
            name = lines[0][1:]
            if name != lines[2][1:] and not allowNameMissmatch:
                raise FastqFileFormatError(
                    "names in sequence don't match : " + str(name)
                    + " != " + str(lines[2][1:])
                )
            seq = lines[1]
            qual = lines[3]
            yield NGSRead(seq, name, qual)
    finally:
        if opened_here:
            fh.close()
示例#8
0
def fixedWigIterator(fd, verbose=False, sortedby=None, scoreType=int):
    """
    @summary: iterate over a fixedStep wiggle file/stream, yielding one
              GenomicInterval per data line. Each interval spans
              [at, at + step) on the chromosome named by the most recent
              fixedStep declaration line.
    @param fd: passed unchanged to openFD to obtain the stream to read
    @param verbose: if True, show progress on stderr; silently disabled
                    (after a warning) when the stream size can't be found
    @param sortedby: if ITERATOR_SORTED_START, check that items are ordered
                     by chromosome and start; None disables the checks
    @param scoreType: forwarded to GenomicInterval as its scoreType
    @raise WigIteratorError: if an ordering check fails, or if a data line
                             appears before any fixedStep declaration
    """
    fh = openFD(fd)
    # fd may be a plain filename, in which case it has no .name attribute;
    # resolve a printable name once so the error paths below can't
    # themselves fail with AttributeError.
    src_name = str(getattr(fd, "name", getattr(fh, "name", fd)))

    pind = None
    if verbose:
        try:
            pind = ProgressIndicator(totalToDo=os.path.getsize(fh.name),
                                     messagePrefix="completed",
                                     messageSuffix="of processing " + fh.name)
        except (AttributeError, OSError, TypeError):
            # no usable name/size (e.g. a pipe or in-memory stream)
            sys.stderr.write("WigIterator -- warning: " +
                             "unable to show progress for stream")
            pind = None

    if pind is None:
        lines = fh
    else:
        # tell() is unreliable (Python 2) or raises (Python 3 text mode)
        # while a file is being iterated directly, so read via readline()
        # when we need positions. "or None" maps the EOF value ('' or b'')
        # onto the sentinel.
        lines = iter(lambda: fh.readline() or None, None)

    chromsSeen = set()
    prev = None

    currentChrom, at, step = None, None, None
    for line in lines:
        line = line.strip()
        if line == "":
            continue

        if line[0] == 't' or line[0] == 'f':
            # declaration lines: "track ..." is ignored, "fixedStep ..."
            # resets the chromosome / position / step for following data
            parts = line.split()
            if parts[0] == "track":
                continue
            elif parts[0] == "fixedStep":
                currentChrom = parts[1].split("=")[1]
                at = int(parts[2].split("=")[1])
                step = int(parts[3].split("=")[1])
        else:
            if at is None or step is None:
                raise WigIteratorError(
                    "Wig file " + src_name +
                    " has data before any fixedStep declaration")
            val = float(line)
            e = GenomicInterval(currentChrom,
                                at,
                                at + step,
                                None,
                                val,
                                scoreType=scoreType)

            # on same chrom as the prev item, make sure order is right
            if prev is not None and sortedby is not None and e.chrom == prev.chrom:
                if sortedby == ITERATOR_SORTED_START and prev.start > e.start:
                    raise WigIteratorError(
                        "Wig file " + src_name +
                        " not sorted by start index - saw item " + str(prev) +
                        " before " + str(e))

            # starting a new chrom.. make sure we haven't already seen it
            if prev is not None and prev.chrom != e.chrom:
                if (sortedby == ITERATOR_SORTED_START) and\
                   (e.chrom in chromsSeen or prev.chrom > e.chrom):
                    raise WigIteratorError("Wig file " + src_name +
                                           " not sorted by chrom")
                chromsSeen.add(e.chrom)

            # all good..
            yield e
            prev = e
            at += step
            if pind is not None:
                pind.done = fh.tell()
                pind.showProgress()
示例#9
0
    def build(self):
        """
        Scan self.handle from its current position and build the block index.

        Each non-blank line is parsed with parseWigString. Consecutive
        elements on the same chromosome are grouped into WigBlock objects
        (a new block starts when the current one reports isfull() or the
        chromosome changes); closed blocks are appended to
        self.blocksByChrom and finally one IntervalTree per chromosome is
        stored in self.itrees.

        :raise IndexedWigError: if chromosomes or start positions are found
                                out of sorted order.
        """
        currentBlock = None
        at = self.handle.tell()
        seenChroms = set()
        lastIndexSeen = -1

        if self.verbose:
            try:
                pind = ProgressIndicator(
                    totalToDo=os.path.getsize(self.handle.name),
                    messagePrefix="completed",
                    messageSuffix="of building index for " + self.handle.name,
                )
            except Exception:
                # best-effort only: any failure just turns progress off
                # (but don't swallow KeyboardInterrupt/SystemExit)
                sys.stderr.write("IndexedWig -- warning: " + "unable to show progress for stream\n")
                self.verbose = False

        def closeBlock(block):
            # file a finished block under its chromosome
            if self.debug:
                sys.stderr.write("closed block: " + str(block) + "\n")
            self.blocksByChrom.setdefault(block.chrom, []).append(block)

        ### note, for loop seems to buffer the file and so tell() gives a
        ### location that is not where the current line was read from, so
        ### we stick to readline instead.
        rline = None
        while rline != "":
            # get the next element
            rline = self.handle.readline()
            line = rline.strip()
            if line == "":
                continue
            e = parseWigString(line)

            # keep track of what chroms we've seen for checking order
            if e.chrom not in seenChroms:
                seenChroms.add(e.chrom)
                lastIndexSeen = -1

            # check chrom order is ok
            for seenChrom in seenChroms:
                if seenChrom > e.chrom:
                    msg = (
                        "wig file is not sorted, entry for chrom "
                        + str(seenChrom)
                        + " appears after entry for "
                        + str(e.chrom)
                    )
                    raise IndexedWigError(msg)
            # check position order is ok
            if e.start < lastIndexSeen:
                msg = (
                    "wig file is not sorted, entry for chrom "
                    + str(e.chrom)
                    + " at "
                    + str(e.start)
                    + " appears after "
                    + str(lastIndexSeen)
                )
                raise IndexedWigError(msg)

            # update the last index we've seen
            lastIndexSeen = e.end

            # debugging message if the current block is full
            if self.debug is True:
                sys.stderr.write("processing " + str(e))
                if currentBlock is not None:
                    sys.stderr.write("; is current block full?" + str(currentBlock.isfull()) + "\n")
                else:
                    sys.stderr.write("\n")

            # we might need to make a new block for this element
            if currentBlock is None or currentBlock.isfull() or currentBlock.chrom != e.chrom:
                if self.debug:
                    sys.stderr.write("making new block with " + str(e) + "\n")
                if currentBlock is not None:
                    closeBlock(currentBlock)
                # 'at' is the offset just before this element's line
                currentBlock = WigBlock(at, e, self.blocksize)

            # add the element to the current block
            currentBlock.add(e)

            at = self.handle.tell()

            if self.verbose:
                pind.done = at
                pind.showProgress()

        # don't forget to add the final block
        if currentBlock is not None:
            closeBlock(currentBlock)

        # build the interval trees
        for chrom in self.blocksByChrom:
            self.itrees[chrom] = IntervalTree(self.blocksByChrom[chrom], openEnded=True)
示例#10
0
def repeat_masker_alignment_iterator(fn, index_friendly=True, verbose=False):
  """
  Iterator for repeat masker alignment files; yields multiple alignment objects.

  Iterate over a file/stream of full repeat alignments in the repeatmasker
  format. Briefly, this format is as follows: each record (alignment) begins
  with a header line (see _rm_parse_header_line documentation for details of
  header format), followed by the alignment itself (example below) and finally
  a set of key-value meta-data pairs.

  The actual alignment looks like this::

    chr1               11 CCCTGGAGATTCTTATT--AGTGATTTGGGCT 41
                             ii        v   -- v  i i    v
    C MER5B#DNA/hAT    10 CCCCAGAGATTCTGATTTAATTGGTCTGGGGT 42

    chr1               42 GACTG 47
                           v
    C MER5B#DNA/hAT    43 CACTG 48

  The 'C' indicates that its the reverse complement of the consensus. The
  central string gives information about matches; "-" indicates an
  insertion/deletion, "i" a transition (G<->A, C<->T) and "v" a transversion
  (all other substitutions).

  Input containing no records (empty, or blank lines only) yields nothing.

  :param fn:             filename or stream-like object to read from.
  :param index_friendly: if True, the stream is repositioned around each
                         yield: a record is yielded once the next header
                         line has been read, and the stream is first moved
                         back to just before the most recently read line.
                         This requires the ability to seek within
                         the stream though, so if iterating over a
                         stream without that ability, you'll have to set this
                         to false. Further, this reads via readline() rather
                         than file iteration, to ensure file.tell() behaves
                         correctly, so a performance hit will be incurred.
  :param verbose:        if true, output progress messages to stderr.
  """
  # step 1 -- build our iterator for the stream..
  try:
    fh = open(fn)
  except (TypeError):
    # not something open() accepts as a path; treat it as a stream
    fh = fn

  # build progress indicator, if we want one and we're able to
  pind = None
  if verbose:
    try:
      m_fn = ": " + fh.name
    except (TypeError, AttributeError):
      # stream has no name (e.g. StringIO) or a non-string name
      m_fn = ""
    try:
      current = fh.tell()
      fh.seek(0, 2)
      total_progress = fh.tell()
      fh.seek(current)
      pind = ProgressIndicator(totalToDo=total_progress,
                               messagePrefix="completed",
                               messageSuffix="of processing repeat-masker "
                                             "alignment file" + m_fn)
    except IOError:
      pind = None

  # tell() is only trustworthy when lines are pulled with readline(), so use
  # that whenever positions are needed (indexing or progress reporting)
  if index_friendly or pind is not None:
    iterable = iter(fh.readline, '')
  else:
    iterable = fh

  old_fh_pos = None
  # only touch tell() when we actually need positions, so non-seekable
  # streams work with index_friendly=False as documented
  new_fh_pos = fh.tell() if index_friendly else None

  s1 = None
  s2 = None
  s1_name = None
  s2_name = None
  s1_start = None
  s1_end = None
  s2_start = None
  s2_end = None
  meta_data = None
  alignment_line_counter = 0
  alig_l_space = 0
  prev_seq_len = 0
  rev_comp_match = None
  remaining_repeat = None
  remaining_genomic = None

  for line in iterable:
    if pind is not None:
      pind.done = fh.tell()
      pind.showProgress()

    if index_friendly:
      old_fh_pos = new_fh_pos
      new_fh_pos = fh.tell()
    line = line.rstrip()
    # a blank line is only meaningful where the mutation-annotation row of
    # an alignment triple is expected (counter % 3 == 1)
    if line.lstrip() == "" and alignment_line_counter % 3 != 1:
      continue

    s_pres_split = re.split(r'(\s+)', line)
    parts = [x for x in s_pres_split if not (x.isspace() or x == "")]

    n = len(parts)
    for i in REPEATMASKER_FIELDS_TO_TRIM:
      if n >= i + 1:
        parts[i] = parts[i].strip()

    # decide what to do with this line -- is it a header line, part of the
    # alignment or a meta-data key-value line
    if alignment_line_counter % 3 == 1:
      if (REPEATMASKER_VALIDATE_MUTATIONS and
         not _rm_is_valid_annotation_line(line)):
        raise IOError("invalid mutation line: " + line)
      l_space = _rm_compute_leading_space(s_pres_split) - alig_l_space
      pad_right = prev_seq_len - (l_space + len(line.strip()))
      meta_data[ANNOTATION_KEY] += ((' ' * l_space) + line.strip() +
                                    (' ' * pad_right))
      alignment_line_counter += 1
    elif _rm_is_header_line(parts, n):
      # a new header closes the previous record (if there was one)
      if not (s1 is None and s2 is None and meta_data is None):
        if ANNOTATION_KEY in meta_data:
          meta_data[ANNOTATION_KEY] = meta_data[ANNOTATION_KEY].rstrip()
        if index_friendly:
          fh.seek(old_fh_pos)
        ss1 = Sequence(s1_name, s1, s1_start, s1_end, "+", remaining_genomic)
        s2s = "-" if rev_comp_match else "+"
        ss2 = Sequence(s2_name, s2, s2_start, s2_end, s2s, remaining_repeat)
        yield PairwiseAlignment(ss1, ss2, meta_data)
        if index_friendly:
          fh.seek(new_fh_pos)
      meta_data = {}
      s1 = ""
      s2 = ""
      s1_name, s2_name = _rm_get_names_from_header(parts)
      s1_start, s1_end = _rm_get_reference_coords_from_header(parts)
      s2_start, s2_end = _rm_get_repeat_coords_from_header(parts)
      rev_comp_match = _rm_is_reverse_comp_match(parts)
      remaining_repeat = _rm_get_remaining_repeat_from_header(parts)
      remaining_genomic = _rm_get_remaining_genomic_from_header(parts)

      _rm_parse_header_line(parts, meta_data)
      alignment_line_counter = 0
    elif _rm_is_alignment_line(parts, s1_name, s2_name):
      alignment_line_counter += 1
      name, seq = _rm_extract_sequence_and_name(parts, s1_name, s2_name)
      if name == s1_name:
        s1 += seq
      elif name == s2_name:
        s2 += seq
      alig_l_space = _rm_compute_leading_space_alig(s_pres_split, seq)
      prev_seq_len = len(seq)
    else:
      k, v = _rm_parse_meta_line(parts)
      meta_data[k] = v

  # no header was ever seen (empty input): there is no final record to emit
  if s1 is None and s2 is None and meta_data is None:
    return

  # emit the last record, normalised the same way as the earlier ones
  if ANNOTATION_KEY in meta_data:
    meta_data[ANNOTATION_KEY] = meta_data[ANNOTATION_KEY].rstrip()
  if index_friendly:
    fh.seek(old_fh_pos)
  ss1 = Sequence(s1_name, s1, s1_start, s1_end, "+", remaining_genomic)
  s2s = "-" if rev_comp_match else "+"
  ss2 = Sequence(s2_name, s2, s2_start, s2_end, s2s, remaining_repeat)
  yield PairwiseAlignment(ss1, ss2, meta_data)
  if index_friendly:
    fh.seek(new_fh_pos)
示例#11
0
def repeat_masker_alignment_iterator(fn, index_friendly=True, verbose=False):
    """
    Iterator for repeat masker alignment files; yields multiple alignment
    objects.

    Iterate over a file/stream of full repeat alignments in the repeatmasker
    format. Briefly, this format is as follows: each record (alignment) begins
    with a header line (see _rm_parse_header_line documentation for details of
    header format), followed by the alignment itself (example below) and
    finally a set of key-value meta-data pairs.

    The actual alignment looks like this::

      chr1               11 CCCTGGAGATTCTTATT--AGTGATTTGGGCT 41
                               ii        v   -- v  i i    v
      C MER5B#DNA/hAT    10 CCCCAGAGATTCTGATTTAATTGGTCTGGGGT 42

      chr1               42 GACTG 47
                             v
      C MER5B#DNA/hAT    43 CACTG 48

    The 'C' indicates that its the reverse complement of the consensus. The
    central string gives information about matches; "-" indicates an
    insertion/deletion, "i" a transition (G<->A, C<->T) and "v" a transversion
    (all other substitutions).

    Input containing no records (empty, or blank lines only) yields nothing.

    :param fn:             filename or stream-like object to read from.
    :param index_friendly: if True, the stream is repositioned around each
                           yield: a record is yielded once the next header
                           line has been read, and the stream is first moved
                           back to just before the most recently read line.
                           This requires the ability to seek within the
                           stream though, so if iterating over a stream
                           without that ability, you'll have to set this to
                           false. Further, this reads via readline() rather
                           than file iteration, to ensure file.tell() behaves
                           correctly, so a performance hit will be incurred.
    :param verbose:        if true, output progress messages to stderr.
    """
    # step 1 -- build our iterator for the stream..
    try:
        fh = open(fn)
    except (TypeError):
        # not something open() accepts as a path; treat it as a stream
        fh = fn

    # build progress indicator, if we want one and we're able to
    pind = None
    if verbose:
        try:
            m_fn = ": " + fh.name
        except (TypeError, AttributeError):
            # stream has no name (e.g. StringIO) or a non-string name
            m_fn = ""
        try:
            current = fh.tell()
            fh.seek(0, 2)
            total_progress = fh.tell()
            fh.seek(current)
            pind = ProgressIndicator(
                totalToDo=total_progress,
                messagePrefix="completed",
                messageSuffix="of processing repeat-masker "
                "alignment file" + m_fn)
        except IOError:
            pind = None

    # tell() is only trustworthy when lines are pulled with readline(), so
    # use that whenever positions are needed (indexing or progress reporting)
    if index_friendly or pind is not None:
        iterable = iter(fh.readline, '')
    else:
        iterable = fh

    old_fh_pos = None
    # only touch tell() when we actually need positions, so non-seekable
    # streams work with index_friendly=False as documented
    new_fh_pos = fh.tell() if index_friendly else None

    s1 = None
    s2 = None
    s1_name = None
    s2_name = None
    s1_start = None
    s1_end = None
    s2_start = None
    s2_end = None
    meta_data = None
    alignment_line_counter = 0
    alig_l_space = 0
    prev_seq_len = 0
    rev_comp_match = None
    remaining_repeat = None
    remaining_genomic = None

    for line in iterable:
        if pind is not None:
            pind.done = fh.tell()
            pind.showProgress()

        if index_friendly:
            old_fh_pos = new_fh_pos
            new_fh_pos = fh.tell()
        line = line.rstrip()
        # a blank line is only meaningful where the mutation-annotation row
        # of an alignment triple is expected (counter % 3 == 1)
        if line.lstrip() == "" and alignment_line_counter % 3 != 1:
            continue

        s_pres_split = re.split(r'(\s+)', line)
        parts = [x for x in s_pres_split if not (x.isspace() or x == "")]

        n = len(parts)
        for i in REPEATMASKER_FIELDS_TO_TRIM:
            if n >= i + 1:
                parts[i] = parts[i].strip()

        # decide what to do with this line -- is it a header line, part of the
        # alignment or a meta-data key-value line
        if alignment_line_counter % 3 == 1:
            if (REPEATMASKER_VALIDATE_MUTATIONS
                    and not _rm_is_valid_annotation_line(line)):
                raise IOError("invalid mutation line: " + line)
            l_space = _rm_compute_leading_space(s_pres_split) - alig_l_space
            pad_right = prev_seq_len - (l_space + len(line.strip()))
            meta_data[ANNOTATION_KEY] += ((' ' * l_space) + line.strip() +
                                          (' ' * pad_right))
            alignment_line_counter += 1
        elif _rm_is_header_line(parts, n):
            # a new header closes the previous record (if there was one)
            if not (s1 is None and s2 is None and meta_data is None):
                if ANNOTATION_KEY in meta_data:
                    meta_data[ANNOTATION_KEY] = meta_data[
                        ANNOTATION_KEY].rstrip()
                if index_friendly:
                    fh.seek(old_fh_pos)
                ss1 = Sequence(s1_name, s1, s1_start, s1_end, "+",
                               remaining_genomic)
                s2s = "-" if rev_comp_match else "+"
                ss2 = Sequence(s2_name, s2, s2_start, s2_end, s2s,
                               remaining_repeat)
                yield PairwiseAlignment(ss1, ss2, meta_data)
                if index_friendly:
                    fh.seek(new_fh_pos)
            meta_data = {}
            s1 = ""
            s2 = ""
            s1_name, s2_name = _rm_get_names_from_header(parts)
            s1_start, s1_end = _rm_get_reference_coords_from_header(parts)
            s2_start, s2_end = _rm_get_repeat_coords_from_header(parts)
            rev_comp_match = _rm_is_reverse_comp_match(parts)
            remaining_repeat = _rm_get_remaining_repeat_from_header(parts)
            remaining_genomic = _rm_get_remaining_genomic_from_header(parts)

            _rm_parse_header_line(parts, meta_data)
            alignment_line_counter = 0
        elif _rm_is_alignment_line(parts, s1_name, s2_name):
            alignment_line_counter += 1
            name, seq = _rm_extract_sequence_and_name(parts, s1_name, s2_name)
            if name == s1_name:
                s1 += seq
            elif name == s2_name:
                s2 += seq
            alig_l_space = _rm_compute_leading_space_alig(s_pres_split, seq)
            prev_seq_len = len(seq)
        else:
            k, v = _rm_parse_meta_line(parts)
            meta_data[k] = v

    # no header was ever seen (empty input): no final record to emit
    if s1 is None and s2 is None and meta_data is None:
        return

    # emit the last record, normalised the same way as the earlier ones
    if ANNOTATION_KEY in meta_data:
        meta_data[ANNOTATION_KEY] = meta_data[ANNOTATION_KEY].rstrip()
    if index_friendly:
        fh.seek(old_fh_pos)
    ss1 = Sequence(s1_name, s1, s1_start, s1_end, "+", remaining_genomic)
    s2s = "-" if rev_comp_match else "+"
    ss2 = Sequence(s2_name, s2, s2_start, s2_end, s2s, remaining_repeat)
    yield PairwiseAlignment(ss1, ss2, meta_data)
    if index_friendly:
        fh.seek(new_fh_pos)
示例#12
0
  def read_index(self, fh, indexed_fh, rec_iterator=None,
                 rec_hash_func=None, parse_hash=str, flush=True,
                 no_reindex=True, verbose=False):
    """
    Populate this index from a file. Input format is just a tab-separated file,
    one record per line. The last column is the file location for the record
    and all columns before that are collectively considered to be the hash key
    for that record (which is probably only 1 column, but this allows us to
    permit tabs in hash keys). Lines consisting only of whitespace are skipped.

    :param fh:            filename or stream-like object to read from. A file
                          opened here from a filename is closed before
                          returning; a stream passed in is left open.
    :param indexed_fh:    either the filename of the indexed file or handle to
                          it.
    :param rec_iterator:  a function that will return an iterator for the
                          indexed file type (not the iterator for the file
                          itself). This function must take a single argument
                          which is the name the file to iterate over, or a
                          stream like object similar to a filestream.
    :param rec_hash_func: a function that accepts the record type produced by
                          the iterator and produces a unique hash for each
                          record.
    :param parse_hash:    a function to convert the string representation of
                          the hash into whatever type is needed. By default,
                          we just leave these as strings.
    :param flush:         remove everything currently in the index and discard
                          any details about a file that is already
                          fully/partially indexed by this object. This is the
                          default behavior. If False, then data from <fh> is
                          just added to the existing index data (potentially
                          overwriting some of it) and the existing index can
                          continue to be used as before.
    :param no_reindex:    if True, after loading the index, a missing key will
                          cause an exception, rather than trigger re-scanning
                          the indexed file for the associated record. The only
                          reason to set this to False would be if your index
                          was incomplete.
    :param verbose:       output status message to STDERR about progress
                          reading the index (if possible).

    :raise IndexError: on malformed line in input file/stream
    """
    # set the record iterator and hash functions, if they were given
    if rec_iterator is not None:
      self.record_iterator = rec_iterator
    if rec_hash_func is not None:
      self.record_hash_function = rec_hash_func

    # disable re-indexing?
    self._no_reindex = no_reindex

    # figure out what kind of index identifier we got: handle or filename?
    handle = fh
    opened_here = False
    try:
      handle = open(fh)
      opened_here = True
    except TypeError:
      # okay, not a filename, we'll try treating it as a stream to read from.
      pass

    try:
      # clear this index?
      if flush:
        self._index = {}
        self._indexed_file_handle = None
        self._indexed_file_name = None

      # replace the name/handle for the indexed file
      try:
        # try treating this as a filename
        self.indexed_file = (indexed_fh, None)
      except TypeError:
        try:
          # try treating this as a file handle
          self.indexed_file = (None, indexed_fh)
        except TypeError:
          # name the index source in the message when we were given a name
          fn = " from " + fh if isinstance(fh, str) else ""
          raise IndexError("failed to read index" + fn + "; "
                           "reason: expected indexed filename or stream-like "
                           "object, got " + str(type(indexed_fh)))

      # try to get an idea of how much data we have...
      pind = None
      if verbose:
        try:
          total = os.path.getsize(handle.name)
          pind = ProgressIndicator(totalToDo=total, messagePrefix="completed",
                                   messageSuffix="of loading " + handle.name)
        except (AttributeError, OSError, TypeError) as e:
          # no usable name/size (e.g. a pipe or in-memory stream)
          sys.stderr.write(str(e))
          sys.stderr.write("completed [unknown] of loading index")
          pind = None

      if pind is None:
        lines = handle
      else:
        # tell() is unreliable (Python 2) or raises (Python 3 text mode)
        # while a file is being iterated directly, so read via readline()
        # when we need positions. "or None" maps EOF onto the sentinel.
        lines = iter(lambda: handle.readline() or None, None)

      # read the index file and populate this object
      for line in lines:
        line = line.rstrip()

        if pind is not None:
          pind.done = handle.tell()
          pind.showProgress()

        # rstrip() has already reduced a whitespace-only line to "", so
        # test for emptiness (isspace() is never true at this point)
        if not line:
          continue
        parts = line.split("\t")
        if len(parts) < 2:
          raise IndexError("failed to parse line: '" + line + "'")
        # everything but the last column is the (possibly tab-containing) key
        key = parse_hash("\t".join(parts[:-1]))
        value = parts[-1]
        self._index[key] = int(value)
    finally:
      if opened_here:
        handle.close()
示例#13
0
def fastqIteratorSimple(fn, verbose=False, allowNameMissmatch=False):
  """
    A generator function that yields NGSRead objects read from a
    fastq-format stream or filename. This iterator requires that all
    sequence and quality data is provided on a single line -- put another way,
    it cannot parse fastq files with newline characters interspersed in the
    sequence and/or quality strings. Whitespace-only lines are skipped.

    :param fn: filename or stream to read data from. A file opened here from
               a filename is closed when the generator finishes or is
               closed; a stream supplied by the caller is left open.
    :param verbose: if True, output additional status messages to stderr about
                    progress. Silently disabled (after a warning) when the
                    size of the input cannot be determined.
    :param allowNameMissmatch: don't throw error if name in sequence data
                               and quality data parts of a read don't match.
    :raise FastqFileFormatError: if the input ends in the middle of a
                                 four-line record, or if the names on the
                                 first and third lines of a record differ and
                                 allowNameMissmatch is False.
  """
  fh = fn
  opened_here = False
  if isinstance(fh, str):
    fh = open(fh)
    opened_here = True

  try:
    # try to get an idea of how much data we have...
    if verbose:
      try:
        totalBytes = os.path.getsize(fh.name)
        pind = ProgressIndicator(totalToDo=totalBytes,
                                 messagePrefix="completed",
                                 messageSuffix="of processing "
                                               + fh.name)
      except (AttributeError, OSError):
        # no usable .name (e.g. an in-memory stream) or it cannot be stat'ed
        sys.stderr.write("fastqIterator -- warning: " +
                         "unable to show progress for stream")
        verbose = False

    while True:
      # collect four non-blank lines; running out part-way is an error
      lines = []
      while len(lines) < 4:
        raw = fh.readline()
        if verbose:
          pind.done = fh.tell()
          pind.showProgress()

        if raw == "":
          # end of file found...
          if not lines:
            # ok, not in the middle of a record
            break
          raise FastqFileFormatError("reached end of file in the "
                                     + "middle of sequence data")

        raw = raw.strip()
        if raw != "":
          lines.append(raw)

      # couldn't get any more data.. we're done
      if not lines:
        break

      # got our 4 lines, assemble our read..
      # first check that names match (leading marker character is dropped)
      name = lines[0][1:]
      if name != lines[2][1:] and not allowNameMissmatch:
        raise FastqFileFormatError("names in sequence don't match : " +
                                   str(name) + " != " +
                                   str(lines[2][1:]))
      yield NGSRead(lines[1], name, lines[3])
  finally:
    # only release what this function acquired
    if opened_here:
      fh.close()
示例#14
0
    def read_index(self,
                   fh,
                   indexed_fh,
                   rec_iterator=None,
                   rec_hash_func=None,
                   parse_hash=str,
                   flush=True,
                   no_reindex=True,
                   verbose=False):
        """
        Populate this index from a file. Input format is just a tab-separated
        file, one record per line. The last column is the file location for
        the record and all columns before that are collectively considered to
        be the hash key for that record (which is probably only 1 column, but
        this allows us to permit tabs in hash keys). Lines consisting only of
        whitespace are skipped.

        :param fh:            filename or stream-like object to read from. A
                              file opened here from a filename is closed
                              before returning; a stream supplied by the
                              caller is left open.
        :param indexed_fh:    either the filename of the indexed file or
                              handle to it.
        :param rec_iterator:  a function that will return an iterator for the
                              indexed file type (not the iterator for the file
                              itself). This function must take a single
                              argument which is the name the file to iterate
                              over, or a stream like object similar to a
                              filestream.
        :param rec_hash_func: a function that accepts the record type produced
                              by the iterator and produces a unique hash for
                              each record.
        :param parse_hash:    a function to convert the string representation
                              of the hash into whatever type is needed. By
                              default, we just leave these as strings.
        :param flush:         remove everything currently in the index and
                              discard any details about a file that is already
                              fully/partially indexed by this object. This is
                              the default behavior. If False, then data from
                              <fh> is just added to the existing index data
                              (potentially overwriting some of it) and the
                              existing index can continue to be used as
                              before.
        :param no_reindex:    if True, after loading the index, a missing key
                              will cause an exception, rather than trigger
                              re-scanning the indexed file for the associated
                              record. The only reason to set this to False
                              would be if your index was incomplete.
        :param verbose:       output status message to STDERR about progress
                              reading the index (if possible).

        :raise IndexError: on malformed line in input file/stream, or when
                           indexed_fh is accepted neither as a filename nor
                           as a handle.
        """
        # set the record iterator and hash functions, if they were given
        if rec_iterator is not None:
            self.record_iterator = rec_iterator
        if rec_hash_func is not None:
            self.record_hash_function = rec_hash_func

        # disable re-indexing?
        self._no_reindex = no_reindex

        # figure out what kind of index identifier we got: handle or filename?
        handle = fh
        opened_here = False
        try:
            handle = open(fh)
            opened_here = True
        except TypeError:
            # okay, not a filename, we'll try treating it as a stream to read
            # from.
            pass

        def _readlines(stream):
            # Pull lines with readline() so that tell() stays usable; stops
            # at the first empty read (end of stream).
            while True:
                raw = stream.readline()
                if not raw:
                    return
                yield raw

        try:
            # clear this index?
            if flush:
                self._index = {}
                self._indexed_file_handle = None
                self._indexed_file_name = None

            # replace the name/handle for the indexed file; the assignment is
            # expected to raise TypeError when given the wrong kind of value
            # (presumably via a property setter defined elsewhere).
            try:
                # try treating this as a filename
                self.indexed_file = (indexed_fh, None)
            except TypeError:
                try:
                    # try treating this as a file handle
                    self.indexed_file = (None, indexed_fh)
                except TypeError:
                    raise IndexError(
                        "failed to read index" + "; "
                        "reason: expected indexed filename or stream-like "
                        "object, got " + str(type(indexed_fh)))

            # try to get an idea of how much data we have...
            if verbose:
                try:
                    total = os.path.getsize(handle.name)
                    pind = ProgressIndicator(totalToDo=total,
                                             messagePrefix="completed",
                                             messageSuffix="of loading " +
                                             handle.name)
                except (AttributeError, OSError) as e:
                    sys.stderr.write(str(e))
                    sys.stderr.write("completed [unknown] of loading index")
                    verbose = False

            # Iterating a file object with a for-loop reads ahead, which
            # makes tell() unusable for progress reporting (on Python 3 text
            # files it raises OSError). Only switch to readline() when
            # progress is wanted, so plain iterables still work otherwise.
            lines = _readlines(handle) if verbose else handle

            # read the index file and populate this object
            for line in lines:
                line = line.rstrip()

                if verbose:
                    pind.done = handle.tell()
                    pind.showProgress()

                # rstrip() reduces a whitespace-only line to "", and
                # "".isspace() is False, so test for emptiness instead.
                if line == "":
                    continue
                parts = line.split("\t")
                if len(parts) < 2:
                    raise IndexError("failed to parse line: '" + line + "'")
                key = parse_hash("\t".join(parts[:-1]))
                value = parts[-1]
                self._index[key] = int(value)
        finally:
            # only release what this method acquired
            if opened_here:
                handle.close()
示例#15
0
    def build(self):
        """
        Scan self.handle from its current position to the end and build the
        block index for it.

        Each non-blank line is parsed with parseWigString. Consecutive
        elements are grouped into WigBlock objects; a new block is started
        when there is no current block, the current block reports isfull(),
        or the chromosome changes. Each block is created with the handle
        offset recorded before its first element's line was read. Finished
        blocks are appended to self.blocksByChrom[chrom], and finally one
        IntervalTree per chromosome is stored in self.itrees.

        If self.verbose is set but progress cannot be set up for the handle,
        a warning is written to stderr and self.verbose is set to False.

        :raise IndexedWigError: if an already-seen chromosome compares greater
                                than the chromosome of the current element, or
                                if an element starts before the end of the
                                previous element on the same chromosome.
        """
        currentBlock = None
        at = self.handle.tell()
        seenChroms = set()
        lastIndexSeen = -1

        def _store(block):
            # file a finished block under its chromosome
            if self.debug:
                sys.stderr.write("closed block: " + str(block) + "\n")
            if block.chrom not in self.blocksByChrom:
                self.blocksByChrom[block.chrom] = []
            self.blocksByChrom[block.chrom].append(block)

        if self.verbose:
            try:
                pind = ProgressIndicator(
                    totalToDo=os.path.getsize(self.handle.name),
                    messagePrefix="completed",
                    messageSuffix="of building index for " + self.handle.name)
            except Exception:
                # best effort: any failure just disables progress output, but
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                sys.stderr.write("IndexedWig -- warning: " +
                                 "unable to show progress for stream\n")
                self.verbose = False

        ### note, for loop seems to buffer the file and so tell() gives a
        ### location that is not where the current line was read from, so
        ### we stick to readline instead.
        rline = None
        while rline != "":
            # get the next element
            rline = self.handle.readline()
            line = rline.strip()
            if line == "":
                continue
            e = parseWigString(line)

            # keep track of what chroms we've seen for checking order; the
            # position check restarts on the first element of a new chrom
            if e.chrom not in seenChroms:
                seenChroms.add(e.chrom)
                lastIndexSeen = -1

            # check chrom order is ok
            for seenChrom in seenChroms:
                if seenChrom > e.chrom:
                    msg = "wig file is not sorted, entry for chrom " + str(seenChrom) +\
                          " appears after entry for " + str(e.chrom)
                    raise IndexedWigError(msg)
            # check position order is ok
            if e.start < lastIndexSeen:
                msg = "wig file is not sorted, entry for chrom " + str(e.chrom) +\
                      " at " + str(e.start) + " appears after " + str(lastIndexSeen)
                raise IndexedWigError(msg)

            # update the last index we've seen
            lastIndexSeen = e.end

            # debugging message if the current block is full
            if self.debug is True:
                sys.stderr.write("processing " + str(e))
                if currentBlock is not None:
                    sys.stderr.write("; is current block full?" +
                                     str(currentBlock.isfull()) + "\n")
                else:
                    sys.stderr.write("\n")

            # we might need to make a new block for this element
            if currentBlock is None or currentBlock.isfull() or \
               currentBlock.chrom != e.chrom:
                if self.debug:
                    sys.stderr.write("making new block with " + str(e) + "\n")
                if currentBlock is not None:
                    _store(currentBlock)
                # 'at' is the offset recorded before this element's line
                currentBlock = WigBlock(at, e, self.blocksize)

            # add the element to the current block
            currentBlock.add(e)

            at = self.handle.tell()

            if self.verbose:
                pind.done = self.handle.tell()
                pind.showProgress()

        # don't forget to add the final block
        if currentBlock is not None:
            _store(currentBlock)

        # build the interval trees
        for chrom in self.blocksByChrom:
            self.itrees[chrom] = IntervalTree(self.blocksByChrom[chrom],
                                              openEnded=True)
示例#16
0
def BEDIterator(filehandle,
                sortedby=None,
                verbose=False,
                scoreType=int,
                dropAfter=None):
    """
    Get an iterator for a BED file

    :param filehandle: this can be either a string, or a stream-like object.
                       In the former case, it is treated as a filename; the
                       file is opened here and closed when the generator
                       finishes or is closed. A stream supplied by the caller
                       is left open. The format of the file/stream must be
                       BED. Whitespace-only lines are skipped.
    :param sortedby: if None, order is not checked.
                     if == ITERATOR_SORTED_START, elements in file must
                     be sorted by chrom and start index (an exception
                     is raised if they are not)
                     if == ITERATOR_SORTED_END, element must be sorted
                     by chrom and end index.
                     if == ITERATOR_SORTED_CHROM, only the chrom order is
                     checked.
                     if == ITERATOR_SORTED_NAME, elements must be sorted by
                     name.
    :param verbose: if True, output additional progress messages to stderr
    :param scoreType: The data type for scores (the fifth column) in the BED
                      file.
    :param dropAfter: an int indicating that any fields after and including
                      this field should be ignored as they don't conform to
                      the BED format. By default, None, meaning we use all
                      fields. Index from zero.
    :return: iterator where subsequent calls to next() yield the next BED
             element in the stream, as returned by parseBEDString.
    :raise BEDError: if a line cannot be parsed, or the requested sort order
                     is violated.
    """
    chromsSeen = set()
    prev = None
    opened_here = False
    if isinstance(filehandle, str):
        filehandle = open(filehandle)
        opened_here = True

    # Name used in error messages; a stream without .name still produces a
    # BEDError rather than an AttributeError while building the message.
    stream_name = str(getattr(filehandle, "name", "UNNAMED STREAM"))

    def _readlines(stream):
        # Pull lines with readline() so that tell() stays usable; stops at
        # the first empty read (end of stream).
        while True:
            raw = stream.readline()
            if not raw:
                return
            yield raw

    try:
        if verbose:
            try:
                pind = ProgressIndicator(
                    totalToDo=os.path.getsize(filehandle.name),
                    messagePrefix="completed",
                    messageSuffix="of processing " + filehandle.name)
            except (AttributeError, OSError):
                sys.stderr.write("BEDIterator -- warning: " +
                                 "unable to show progress for stream")
                verbose = False

        # Iterating a file object with a for-loop reads ahead, which makes
        # tell() unusable for progress reporting (on Python 3 text files it
        # raises OSError). Only switch to readline() when progress is wanted,
        # so plain iterables of lines still work otherwise.
        lines = _readlines(filehandle) if verbose else filehandle

        for line in lines:
            if verbose:
                pind.done = filehandle.tell()
                pind.showProgress()

            if line.strip() == "":
                continue
            try:
                e = parseBEDString(line, scoreType, dropAfter=dropAfter)
            except GenomicIntervalError as err:
                raise BEDError(str(err) + " on line " + line)

            # sorting by name?
            if ((sortedby == ITERATOR_SORTED_NAME and prev is not None)
                    and (prev.name > e.name)):
                raise BEDError("bed file " + stream_name +
                               " not sorted by element name" + " found " +
                               e.name + " after " + prev.name)

            # first item
            if prev is None:
                chromsSeen.add(e.chrom)

            # on same chrom as the prev item, make sure order is right
            if (prev is not None and sortedby is not None
                    and e.chrom == prev.chrom):
                if sortedby == ITERATOR_SORTED_START and prev.start > e.start:
                    raise BEDError("bed file " + stream_name +
                                   " not sorted by start index - saw item " +
                                   str(prev) + " before " + str(e))
                if sortedby == ITERATOR_SORTED_END and prev.end > e.end:
                    raise BEDError("bed file " + stream_name +
                                   " not sorted by end index - saw item " +
                                   str(prev) + " before " + str(e))

            # starting a new chrom.. make sure we haven't already seen it
            if prev is not None and prev.chrom != e.chrom:
                if (sortedby == ITERATOR_SORTED_START or
                    sortedby == ITERATOR_SORTED_END or
                    sortedby == ITERATOR_SORTED_CHROM) and\
                   (e.chrom in chromsSeen or prev.chrom > e.chrom):
                    raise BEDError("BED file " + stream_name +
                                   " not sorted by chrom")
                chromsSeen.add(e.chrom)

            # all good..
            yield e
            prev = e
    finally:
        # only release what this function acquired
        if opened_here:
            filehandle.close()
示例#17
0
def BEDIterator(filehandle, sortedby=None, verbose=False, scoreType=int,
                dropAfter=None):
  """
  Get an iterator for a BED file

  :param filehandle: this can be either a string, or a stream-like object. In
                     the former case, it is treated as a filename; the file
                     is opened here and closed when the generator finishes or
                     is closed. A stream supplied by the caller is left open.
                     The format of the file/stream must be BED.
                     Whitespace-only lines are skipped.
  :param sortedby: if None, order is not checked.
                   if == ITERATOR_SORTED_START, elements in file must
                   be sorted by chrom and start index (an exception
                   is raised if they are not)
                   if == ITERATOR_SORTED_END, element must be sorted
                   by chrom and end index.
                   if == ITERATOR_SORTED_CHROM, only the chrom order is
                   checked.
                   if == ITERATOR_SORTED_NAME, elements must be sorted by
                   name.
  :param verbose: if True, output additional progress messages to stderr
  :param scoreType: The data type for scores (the fifth column) in the BED
                    file.
  :param dropAfter: an int indicating that any fields after and including this
                    field should be ignored as they don't conform to the BED
                    format. By default, None, meaning we use all fields. Index
                    from zero.
  :return: iterator where subsequent calls to next() yield the next BED
           element in the stream, as returned by parseBEDString.
  :raise BEDError: if a line cannot be parsed, or the requested sort order is
                   violated.
  """
  chromsSeen = set()
  prev = None
  opened_here = False
  if isinstance(filehandle, str):
    filehandle = open(filehandle)
    opened_here = True

  # Name used in error messages; a stream without .name still produces a
  # BEDError rather than an AttributeError while building the message.
  stream_name = str(getattr(filehandle, "name", "UNNAMED STREAM"))

  def _readlines(stream):
    # Pull lines with readline() so that tell() stays usable; stops at the
    # first empty read (end of stream).
    while True:
      raw = stream.readline()
      if not raw:
        return
      yield raw

  try:
    if verbose:
      try:
        pind = ProgressIndicator(totalToDo=os.path.getsize(filehandle.name),
                                 messagePrefix="completed",
                                 messageSuffix="of processing " +
                                               filehandle.name)
      except (AttributeError, OSError):
        sys.stderr.write("BEDIterator -- warning: " +
                         "unable to show progress for stream")
        verbose = False

    # Iterating a file object with a for-loop reads ahead, which makes tell()
    # unusable for progress reporting (on Python 3 text files it raises
    # OSError). Only switch to readline() when progress is wanted, so plain
    # iterables of lines still work otherwise.
    lines = _readlines(filehandle) if verbose else filehandle

    for line in lines:
      if verbose:
        pind.done = filehandle.tell()
        pind.showProgress()

      if line.strip() == "":
        continue
      try:
        e = parseBEDString(line, scoreType, dropAfter=dropAfter)
      except GenomicIntervalError as err:
        raise BEDError(str(err) + " on line " + line)

      # sorting by name?
      if ((sortedby == ITERATOR_SORTED_NAME and prev is not None) and
         (prev.name > e.name)):
        raise BEDError("bed file " + stream_name +
                       " not sorted by element name" +
                       " found " + e.name + " after " +
                       prev.name)

      # first item
      if prev is None:
        chromsSeen.add(e.chrom)

      # on same chrom as the prev item, make sure order is right
      if prev is not None and sortedby is not None and e.chrom == prev.chrom:
        if sortedby == ITERATOR_SORTED_START and prev.start > e.start:
          raise BEDError("bed file " + stream_name +
                         " not sorted by start index - saw item " +
                         str(prev) + " before " + str(e))
        if sortedby == ITERATOR_SORTED_END and prev.end > e.end:
          raise BEDError("bed file " + stream_name +
                         " not sorted by end index - saw item " +
                         str(prev) + " before " + str(e))

      # starting a new chrom.. make sure we haven't already seen it
      if prev is not None and prev.chrom != e.chrom:
        if (sortedby == ITERATOR_SORTED_START or
            sortedby == ITERATOR_SORTED_END or
            sortedby == ITERATOR_SORTED_CHROM) and\
           (e.chrom in chromsSeen or prev.chrom > e.chrom):
          raise BEDError("BED file " + stream_name + " not sorted by chrom")
        chromsSeen.add(e.chrom)

      # all good..
      yield e
      prev = e
  finally:
    # only release what this function acquired
    if opened_here:
      filehandle.close()