Exemplo n.º 1
0
def fastqIteratorComplex(fn, useMutableString=False, verbose=False):
    """
    A generator function which yields NGSRead objects read from a file or
    stream. This iterator can handle fastq files that have their sequence
    and/or their quality data split across multiple lines (i.e. there are
    newline characters in the sequence and quality strings).

    If fn is a filename, the file is opened here and closed again when the
    generator is exhausted or closed; a stream supplied by the caller is left
    open.

    :param fn:                A file-like stream or a string; if this is a
                              string, it's treated as a filename specifying
                              the location of an input fastq file, else it's
                              treated as a file-like object, which must have a
                              readline() method.
    :param useMutableString:  passed through to the NGSRead constructor. If
                              True, construct sequences from lists of chars,
                              rather than python string objects, to allow
                              more efficient editing. Use with caution.
    :param verbose:           if True, print messages on progress to stderr.
                              This needs the stream to have a name attribute;
                              if it does not, a warning is written to stderr
                              and progress reporting is switched off.
    """
    # only close the handle at the end if we were the ones who opened it
    openedHere = isinstance(fn, str)
    fh = open(fn) if openedHere else fn
    prevLine = None

    try:
        # try to get an idea of how much data we have...
        pind = None
        if verbose:
            try:
                totalLines = linesInFile(fh.name)
                pind = ProgressIndicator(
                    totalToDo=totalLines,
                    messagePrefix="completed",
                    messageSuffix="of processing " + fh.name,
                )
            except AttributeError:
                # stream has no name attribute, so we can't count its lines
                sys.stderr.write("fastqIterator -- warning: " + "unable to show progress for stream")
                verbose = False

        while True:
            # either we have a sequence header left over from the
            # prev call, or we need to read a new one from the file...
            # try to do that now
            name, prevLine = fastq_complex_parse_seq_header(fh, prevLine, pind, verbose)

            # read lines until we hit a qual header --> this is our sequence
            # data
            seqdata, line = fastq_complex_parse_seq(fh, pind, verbose)

            # <line> is now a qual header, keep reading until we see a
            # sequence header.. or we run out of lines.. this is our quality
            # data
            qualdata, line, prevLine = fastq_complex_parse_qual(fh, line, prevLine, verbose, pind)

            # package it all up..
            yield NGSRead(seqdata, name, qualdata, useMutableString)
            if verbose:
                pind.showProgress()

            # remember where we stopped for next call, or finish; an empty
            # string here means there is nothing left to read
            prevLine = line
            if prevLine == "":
                break
    finally:
        # runs on normal exhaustion, on an exception, and when the consumer
        # abandons the generator (GeneratorExit is raised at the yield)
        if openedHere:
            fh.close()
Exemplo n.º 2
0
def regularWigIterator(fd, verbose=False, sortedby=None, scoreType=int):
    """
    Generator which yields one parsed element for each non-blank line of a
    wig stream. Lines are stripped of surrounding whitespace and handed to
    parseWigString; blank lines are skipped.

    @param fd:        the input; it is handed to openFD, linesInFile and
                      getFDName (presumably a filename or a file-like
                      object -- confirm against those helpers).
    @param verbose:   if True, show progress on stderr as lines are read. If
                      setting up the progress indicator raises
                      AttributeError, a warning is written and progress
                      reporting is switched off.
    @param sortedby:  if not None, should be ITERATOR_SORTED_START,
                      indicating that the input stream must be sorted by
                      chrom and then by start index.
    @param scoreType: passed through to parseWigString.
    @raise WigIteratorError: if sortedby is ITERATOR_SORTED_START and the
                             stream is not sorted.
    """
    if verbose:
        try:
            totalLines = linesInFile(fd)
            pind = ProgressIndicator(totalToDo=totalLines,
                                     messagePrefix="completed",
                                     messageSuffix="of processing " +
                                     getFDName(fd))
        except AttributeError:
            sys.stderr.write("WigIterator -- warning: " +
                             "unable to show progress for stream")
            verbose = False

    chromsSeen = set()
    prev = None

    fh = openFD(fd)
    for line in fh:
        if verbose:
            pind.done += 1
            pind.showProgress()

        line = line.strip()
        if line == "":
            continue
        e = parseWigString(line, scoreType=scoreType)

        # on same chrom as the prev item, make sure order is right
        if prev is not None and sortedby is not None and e.chrom == prev.chrom:
            if sortedby == ITERATOR_SORTED_START and prev.start > e.start:
                # use getFDName rather than fd.name so that building the
                # message does not itself fail when fd has no name attribute
                raise WigIteratorError(
                    "bed file " + getFDName(fd) +
                    " not sorted by start index - saw item " + str(prev) +
                    " before " + str(e))

        # starting a new chrom.. make sure we haven't already seen it
        if prev is not None and prev.chrom != e.chrom:
            if (sortedby == ITERATOR_SORTED_START) and\
               (e.chrom in chromsSeen or prev.chrom > e.chrom):
                raise WigIteratorError("BED file " + getFDName(fd) +
                                       " not sorted by chrom")
            chromsSeen.add(e.chrom)

        # all good..
        yield e
        prev = e
Exemplo n.º 3
0
def regularWigIterator(fd, verbose=False, sortedby=None, scoreType=int):
  """
  Generator which yields one parsed element for each non-blank line of a
  wig stream. Lines are stripped of surrounding whitespace and handed to
  parseWigString; blank lines are skipped.

  @param fd:        the input; it is handed to openFD, linesInFile and
                    getFDName (presumably a filename or a file-like
                    object -- confirm against those helpers).
  @param verbose:   if True, show progress on stderr as lines are read. If
                    setting up the progress indicator raises AttributeError,
                    a warning is written and progress reporting is switched
                    off.
  @param sortedby:  if not None, should be ITERATOR_SORTED_START, indicating
                    that the input stream must be sorted by chrom and then
                    by start index.
  @param scoreType: passed through to parseWigString.
  @raise WigIteratorError: if sortedby is ITERATOR_SORTED_START and the
                           stream is not sorted.
  """
  if verbose:
    try:
      totalLines = linesInFile(fd)
      pind = ProgressIndicator(totalToDo=totalLines,
                               messagePrefix="completed",
                               messageSuffix="of processing " + getFDName(fd))
    except AttributeError:
      sys.stderr.write("WigIterator -- warning: "
                       + "unable to show progress for stream")
      verbose = False

  chromsSeen = set()
  prev = None

  fh = openFD(fd)
  for line in fh:
    if verbose:
      pind.done += 1
      pind.showProgress()

    line = line.strip()
    if line == "":
      continue
    e = parseWigString(line, scoreType=scoreType)

    # on same chrom as the prev item, make sure order is right
    if prev is not None and sortedby is not None and e.chrom == prev.chrom:
      if sortedby == ITERATOR_SORTED_START and prev.start > e.start:
        # use getFDName rather than fd.name so that building the message
        # does not itself fail when fd has no name attribute
        raise WigIteratorError("bed file " + getFDName(fd)
                               + " not sorted by start index - saw item "
                               + str(prev) + " before " + str(e))

    # starting a new chrom.. make sure we haven't already seen it
    if prev is not None and prev.chrom != e.chrom:
      if (sortedby == ITERATOR_SORTED_START) and\
         (e.chrom in chromsSeen or prev.chrom > e.chrom):
        raise WigIteratorError("BED file " + getFDName(fd)
                               + " not sorted by chrom")
      chromsSeen.add(e.chrom)

    # all good..
    yield e
    prev = e
Exemplo n.º 4
0
def intervalTrees(reffh, scoreType=int, verbose=False):
    """
    Build a dictionary of interval trees indexed by chrom from a BED stream or
    file

    If reffh is a filename, the file is opened here and closed again once all
    of its elements have been loaded; a stream supplied by the caller is left
    open.

    :param reffh: This can be either a string, or a stream-like object. In the
                  former case, it is treated as a filename. The format of the
                  file/stream must be BED.
    :param scoreType: The data type for scores (the fifth column) in the BED
                      file.
    :param verbose: output progress messages to sys.stderr if True
    :return: a dictionary mapping each chrom seen in the input to an
             IntervalTree built (with openEnded=True) from the elements on
             that chrom.
    """
    # only close the handle afterwards if we were the ones who opened it
    openedHere = isinstance(reffh, str)
    fh = open(reffh) if openedHere else reffh

    # loading progress needs a line count, which we don't attempt for stdin
    showLoadProgress = verbose and fh is not sys.stdin

    # load all the regions and split them into lists for each chrom
    elements = {}
    try:
        if showLoadProgress:
            totalLines = linesInFile(fh.name)
            pind = ProgressIndicator(totalToDo=totalLines,
                                     messagePrefix="completed",
                                     messageSuffix="of loading " + fh.name)
        for element in BEDIterator(fh, scoreType=scoreType, verbose=verbose):
            elements.setdefault(element.chrom, []).append(element)
            if showLoadProgress:
                pind.done += 1
                pind.showProgress()
    finally:
        # the stream is fully consumed by this point, so it is safe to close
        if openedHere:
            fh.close()

    # create an interval tree for each list
    trees = {}
    if verbose:
        totalLines = len(elements)
        pind = ProgressIndicator(totalToDo=totalLines,
                                 messagePrefix="completed",
                                 messageSuffix="of making interval trees")
    for chrom in elements:
        trees[chrom] = IntervalTree(elements[chrom], openEnded=True)
        if verbose:
            pind.done += 1
            pind.showProgress()

    return trees
Exemplo n.º 5
0
def intervalTrees(reffh, scoreType=int, verbose=False):
  """
  Build a dictionary of interval trees indexed by chrom from a BED stream or
  file

  If reffh is a filename, the file is opened here and closed again once all
  of its elements have been loaded; a stream supplied by the caller is left
  open.

  :param reffh: This can be either a string, or a stream-like object. In the
                former case, it is treated as a filename. The format of the
                file/stream must be BED.
  :param scoreType: The data type for scores (the fifth column) in the BED
                    file.
  :param verbose: output progress messages to sys.stderr if True
  :return: a dictionary mapping each chrom seen in the input to an
           IntervalTree built (with openEnded=True) from the elements on
           that chrom.
  """
  # only close the handle afterwards if we were the ones who opened it
  openedHere = isinstance(reffh, str)
  fh = open(reffh) if openedHere else reffh

  # loading progress needs a line count, which we don't attempt for stdin
  showLoadProgress = verbose and fh is not sys.stdin

  # load all the regions and split them into lists for each chrom
  elements = {}
  try:
    if showLoadProgress:
      totalLines = linesInFile(fh.name)
      pind = ProgressIndicator(totalToDo=totalLines,
                               messagePrefix="completed",
                               messageSuffix="of loading " + fh.name)
    for element in BEDIterator(fh, scoreType=scoreType, verbose=verbose):
      elements.setdefault(element.chrom, []).append(element)
      if showLoadProgress:
        pind.done += 1
        pind.showProgress()
  finally:
    # the stream is fully consumed by this point, so it is safe to close
    if openedHere:
      fh.close()

  # create an interval tree for each list
  trees = {}
  if verbose:
    totalLines = len(elements)
    pind = ProgressIndicator(totalToDo=totalLines,
                             messagePrefix="completed",
                             messageSuffix="of making interval trees")
  for chrom in elements:
    trees[chrom] = IntervalTree(elements[chrom], openEnded=True)
    if verbose:
      pind.done += 1
      pind.showProgress()

  return trees
Exemplo n.º 6
0
def fastqIteratorComplex(fn, useMutableString=False, verbose=False):
  """
  A generator function which yields NGSRead objects read from a file or
  stream. This iterator can handle fastq files that have their sequence
  and/or their quality data split across multiple lines (i.e. there are
  newline characters in the sequence and quality strings).

  If fn is a filename, the file is opened here and closed again when the
  generator is exhausted or closed; a stream supplied by the caller is left
  open.

  :param fn:                A file-like stream or a string; if this is a
                            string, it's treated as a filename specifying
                            the location of an input fastq file, else it's
                            treated as a file-like object, which must have a
                            readline() method.
  :param useMutableString:  passed through to the NGSRead constructor. If
                            True, construct sequences from lists of chars,
                            rather than python string objects, to allow
                            more efficient editing. Use with caution.
  :param verbose:           if True, print messages on progress to stderr.
                            This needs the stream to have a name attribute;
                            if it does not, a warning is written to stderr
                            and progress reporting is switched off.
  """
  # only close the handle at the end if we were the ones who opened it
  openedHere = isinstance(fn, str)
  fh = open(fn) if openedHere else fn
  prevLine = None

  try:
    # try to get an idea of how much data we have...
    pind = None
    if verbose:
      try:
        totalLines = linesInFile(fh.name)
        pind = ProgressIndicator(totalToDo=totalLines,
                                 messagePrefix="completed",
                                 messageSuffix="of processing "
                                               + fh.name)
      except AttributeError:
        # stream has no name attribute, so we can't count its lines
        sys.stderr.write("fastqIterator -- warning: "
                         + "unable to show progress for stream")
        verbose = False

    while True:
      # either we have a sequence header left over from the
      # prev call, or we need to read a new one from the file...
      # try to do that now
      name, prevLine = fastq_complex_parse_seq_header(fh, prevLine,
                                                      pind, verbose)

      # read lines until we hit a qual header --> this is our sequence data
      seqdata, line = fastq_complex_parse_seq(fh, pind, verbose)

      # <line> is now a qual header, keep reading until we see
      # a sequence header.. or we run out of lines.. this is our quality data
      qualdata, line, prevLine = fastq_complex_parse_qual(fh, line, prevLine,
                                                          verbose, pind)

      # package it all up..
      yield NGSRead(seqdata, name, qualdata, useMutableString)
      if verbose:
        pind.showProgress()

      # remember where we stopped for next call, or finish; an empty string
      # here means there is nothing left to read
      prevLine = line
      if prevLine == "":
        break
  finally:
    # runs on normal exhaustion, on an exception, and when the consumer
    # abandons the generator (GeneratorExit is raised at the yield)
    if openedHere:
      fh.close()