Exemplo n.º 1
0
def main():
    """Hand the requested range(s) of one ZMW to printRange().

    getParms() supplies the options and two positional arguments: the
    bas.h5 filename and an integer ZMW (hole) number. When opt.subreads
    is set, every region of type 1 returned by holeRegions() is passed to
    printRange(); otherwise the single range opt.start..opt.end is used.

    Exits with a non-zero status if the ZMW argument is missing or is not
    an integer.
    """

    logger.debug("%s starting", sys.argv[0])

    opt, args = getParms()

    basFilename = args[0]
    logger.debug("bas file: %s", basFilename)
    bf = H5BasFile.BasFile(basFilename)

    try:
        hole = int(args[1])
    except (IndexError, ValueError):
        # A missing argument is reported the same way as a malformed one
        # rather than surfacing as a traceback.
        logger.error('ERROR: second parameter must be an integer ZMW number')
        sys.exit(1)  # non-zero so calling scripts can detect the failure

    if opt.subreads:
        for region in bf.holeRegions(hole):
            regionType, start, end = region[1:4]
            if regionType == 1:  # a subread?
                printRange(bf, hole, opt, start, end)
    else:
        printRange(bf, hole, opt, opt.start, opt.end)

    logger.debug("complete")
Exemplo n.º 2
0
def main():
    """Print the bases of one ZMW as a FASTA-style record.

    getParms() supplies the options and two positional arguments: the
    bas.h5 filename and an integer ZMW (hole) number. The sequence between
    opt.start and opt.end is fetched (reverse-complemented when
    opt.reverse is set), then written to stdout as a
    ">movie/hole/start_end" header followed by lines of opt.flen bases.

    Exits with a non-zero status if the ZMW argument is missing or is not
    an integer.
    """

    logger.debug("%s starting", sys.argv[0])

    opt, args = getParms()

    basFilename = args[0]
    logger.debug("bas file: %s", basFilename)
    basfile = H5BasFile.BasFile(basFilename)

    try:
        hole = int(args[1])
    except (IndexError, ValueError):
        # A missing argument is reported the same way as a malformed one
        # rather than surfacing as a traceback.
        logger.error('ERROR: second parameter must be an integer ZMW number')
        sys.exit(1)  # non-zero so calling scripts can detect the failure

    # end==None gets the whole read
    if opt.reverse:
        sequence = basfile.getRevCompSequence(hole, opt.start, opt.end)
    else:
        sequence = basfile.getSequence(hole, opt.start, opt.end)

    movie = basfile.movieName()
    length = len(sequence)

    # Single-argument print(...) behaves the same as the print statement
    # under Python 2 and also parses under Python 3.
    print(">%s/%d/%d_%d" % (movie, hole, opt.start, opt.start + length))

    # Wrap the sequence at opt.flen bases per output line.
    for ix in xrange(0, length, opt.flen):
        print(sequence[ix:ix + opt.flen])

    logger.debug("complete")
Exemplo n.º 3
0
def main():
    """Hand the requested range(s) of one ZMW to printRange().

    Positional arguments (from getParms()) are the bas.h5 filename and an
    integer ZMW (hole) number. With opt.subreads set, each region whose
    type is 1 in holeRegions() is passed to printRange(); without it, the
    single range opt.start..opt.end is passed instead.

    Exits with a non-zero status if the ZMW argument is missing or is not
    an integer.
    """

    logger.debug("%s starting", sys.argv[0])

    opt, args = getParms()

    basFilename = args[0]
    logger.debug("bas file: %s", basFilename)
    bf = H5BasFile.BasFile(basFilename)

    try:
        hole = int(args[1])
    except (IndexError, ValueError):
        # Treat an absent ZMW argument like an unparsable one so the user
        # gets the usage message instead of a traceback.
        logger.error('ERROR: second parameter must be an integer ZMW number')
        sys.exit(1)  # signal failure to the caller

    if not opt.subreads:
        printRange(bf, hole, opt, opt.start, opt.end)
    else:
        for regionHole, regionType, start, end, score in bf.holeRegions(hole):
            if regionType == 1:  # a subread?
                printRange(bf, hole, opt, start, end)

    logger.debug("complete")
Exemplo n.º 4
0
def submitFinalJobs(opt, chunkList):
    """Write and submit the final job that concatenates per-chunk results.

    A bash script is written to <opt.tmpdir>/trim_final.sh. It cats every
    chunk's trimmedChunkName into opt.output and, when opt.report is not
    None, every chunk's reportChunkName into opt.report. The script is
    submitted with qsub, with an afterok dependency on the jobno of every
    chunk in chunkList.

    Returns the stripped text that qsub wrote to stdout.
    Raises RuntimeError if the qsub command reports a non-zero status.
    """

    # Each file name ends in a shell line continuation so that the whole
    # list forms a single 'cat' command.
    chunkFiles = ['%s \\\n' % chk.trimmedChunkName for chk in chunkList]

    sh = [
        '#!/bin/bash\n\n',
        'set -o errexit\n',
        'set -o nounset\n\n',
        'cat \\\n',
    ]
    sh.extend(chunkFiles)
    sh.append(' > %s\n' % opt.output)

    if opt.report is not None:
        reportFiles = ['%s \\\n' % chk.reportChunkName for chk in chunkList]
        sh.append('\ncat \\\n')
        sh.extend(reportFiles)
        sh.append(' > %s\n' % opt.report)

    finalScriptName = '%s/trim_final.sh' % opt.tmpdir

    # The with-block closes the script even if writing it fails.
    with open(finalScriptName, 'w') as handle:
        handle.writelines(sh)

    deps = ':'.join([chk.jobno for chk in chunkList])

    cmd = [
        'qsub',
        '-N trim_final',                      # job name
        '-o trim_final.out',                  # output file
        '-j oe',                              # combine stdout and stderr
        '-l nodes=1:ppn=1,walltime=4:00:00',  # resources required
        '-d . ',                              # working directory (strangely, ./ is not the default)
        '-r n',                               # do NOT attempt to restart on failure
        '-V',                                 # export all environment variables to job
        '-W umask=0002',                      # make logs rw-rw-r--
        '-m n',                               # don't send any mail
        '-W depend=afterok:%s' % deps,        # run only after every chunk job succeeded
        finalScriptName,                      # script to run
    ]

    command = ' '.join(cmd)
    logger.debug('running %s', command)

    popen_file = os.popen(command)
    response = popen_file.read().strip()
    rc = popen_file.close()  # None means the command exited with status 0
    if rc is not None:
        logger.error('command failed, rc=%d' % rc)
        raise RuntimeError('command failed, rc=%d: %s' % (rc, command))

    logger.debug('jobno is %s', response)

    return response
Exemplo n.º 5
0
def submitFinalJobs(opt, chunkList):
    """Create trim_final.sh and queue it to run after all chunk jobs.

    The generated bash script concatenates the trimmedChunkName file of
    every chunk into opt.output and, if opt.report is not None, the
    reportChunkName file of every chunk into opt.report. It is written to
    opt.tmpdir and submitted through qsub with an afterok dependency on
    each chunk's jobno.

    Returns the stripped stdout of the qsub command.
    Raises RuntimeError if the qsub command reports a non-zero status.
    """

    sh = list()
    sh.append('#!/bin/bash\n\n')
    sh.append('set -o errexit\n')
    sh.append('set -o nounset\n\n')

    # Every file name carries a trailing backslash-newline so the list
    # reads as one continued 'cat' command.
    sh.append('cat \\\n')
    sh.extend('%s \\\n' % chk.trimmedChunkName for chk in chunkList)
    sh.append(' > %s\n' % opt.output)

    if opt.report is not None:
        sh.append('\ncat \\\n')
        sh.extend('%s \\\n' % chk.reportChunkName for chk in chunkList)
        sh.append(' > %s\n' % opt.report)

    finalScriptName = '%s/trim_final.sh' % opt.tmpdir

    # Use a context manager so the handle is released on write errors too.
    with open(finalScriptName, 'w') as handle:
        handle.writelines(sh)

    deps = ':'.join([chk.jobno for chk in chunkList])

    cmd = list()
    cmd.append('qsub')
    cmd.append('-N trim_final')  # job name
    cmd.append('-o trim_final.out')  # output file
    cmd.append('-j oe')  # combine stdout and stderr
    cmd.append('-l nodes=1:ppn=1,walltime=4:00:00')  # resources required
    cmd.append('-d . ')  # working directory (strangely, ./ is not the default)
    cmd.append('-r n')  # do NOT attempt to restart on failure
    cmd.append('-V')  # export all environment variables to job
    cmd.append('-W umask=0002')  # make logs rw-rw-r--
    cmd.append('-m n')  # don't send any mail
    cmd.append('-W depend=afterok:%s' % deps)  # wait for all chunk jobs
    cmd.append(finalScriptName)  # script to run

    command = ' '.join(cmd)
    logger.debug('running %s', command)

    popen_file = os.popen(command)
    response = popen_file.read().strip()
    rc = popen_file.close()  # None means the command exited with status 0
    if rc is not None:
        logger.error('command failed, rc=%d' % rc)
        raise RuntimeError('command failed, rc=%d: %s' % (rc, command))

    logger.debug('jobno is %s', response)

    return response
Exemplo n.º 6
0
def countSeqs(filename):
    '''Run grep -c ">" on a fasta file to count the sequences it contains.

    Returns the number of lines containing ">" as an int (0 when there
    are none). Raises RuntimeError if grep fails or prints something that
    is not a non-negative integer.
    '''

    command = 'grep -c ">" %s' % filename
    popen_file = os.popen(command)
    response = popen_file.read().strip()
    rc = popen_file.close()  # None means grep exited with status 0

    # grep exits with status 1 when no line matched, yet -c still prints
    # "0" in that case. Only treat a non-zero status as a failure when it
    # did not come with that zero count.
    if rc is not None and response != '0':
        logger.error("command failed, rc=%d" % rc)
        raise RuntimeError('command failed, rc=%d: %s' % (rc, command))

    if not response.isdigit():
        logger.error("grep -c returned: %s" % response)
        raise RuntimeError('unexpected grep -c output: %r' % response)

    return int(response)
Exemplo n.º 7
0
def countSeqs(filename):
    '''Run grep -c ">" on a fasta file to count the sequences it contains.

    The count is returned as an int; a file without any ">" line yields 0.
    RuntimeError is raised when grep fails or its output is not a
    non-negative integer.
    '''

    command = 'grep -c ">" %s' % filename
    popen_file = os.popen(command)
    response = popen_file.read().strip()
    rc = popen_file.close()  # None means grep exited with status 0

    # With no matching line grep prints "0" but exits with status 1, so a
    # non-zero status accompanied by a "0" count is a valid result.
    grepFailed = rc is not None and response != '0'
    if grepFailed:
        logger.error("command failed, rc=%d" % rc)
        raise RuntimeError('command failed, rc=%d: %s' % (rc, command))

    if not response.isdigit():
        logger.error("grep -c returned: %s" % response)
        raise RuntimeError('unexpected grep -c output: %r' % response)

    return int(response)
Exemplo n.º 8
0
def main():
    """Print alignments from a cmp.h5 file for one ZMW or for all ZMWs.

    getParms() must yield exactly two positional arguments: the bas.h5
    file and the cmp.h5 file. When opt.ZMW is given only that hole's
    alignments are passed to printAlignment(); otherwise every alignment
    returned by getAlignmentsByHole() is.

    Exits with a non-zero status when the two file arguments are not
    supplied.
    """

    logger.debug("%s starting", sys.argv[0])

    opt, args = getParms()

    if len(args) != 2:
        logger.error("please specify bas.h5 and cmp.h5 files as parameters. See --help")
        sys.exit(1)  # non-zero so calling scripts can detect the usage error

    # TODO: Actually, all we need from the bas file is the movie name
    # (maxHole will default to something clever). We don't need to
    # open the bas file to determine the movie name: It's part of the
    # filename. The only real reason we specify a bas file as the
    # first parameter is to match the command line interface of other
    # scripts.

    basFilename, cmpFilename = args

    logger.debug("bas file: %s", basFilename)
    bf = H5BasFile.BasFile(basFilename)
    movie = bf.movieName()

    logger.debug("cmp file: %s", cmpFilename)
    cf = H5CmpFile.CmpFile(fileName=cmpFilename)
    # Named cmpMovie rather than cmp to avoid hiding the Python 2 builtin.
    cmpMovie = H5CmpFile.CmpMovie(cmpObject=cf,
                                  movieName=movie,
                                  maxHole=bf.maxZMW())

    if opt.ZMW is not None:  # did we ask for a specific ZMW?
        alignments = cmpMovie.getAlignmentsForHole(opt.ZMW)
    else:  # else, print all ZMWs
        alignments = cmpMovie.getAlignmentsByHole()

    for align in alignments:
        printAlignment(align, cmpMovie, opt.flen)

    logger.debug("complete")
Exemplo n.º 9
0
    def submitScript(self):
        """Submit self.scriptName with qsub and record the job number.

        The job is named self.jobName, writes its log to self.scriptOutput
        and is scheduled to start STARTWAIT seconds from now.

        Returns group 1 of Chunk.JOBNO_PATTERN matched against qsub's
        output; the same value is stored in self.jobno.
        Raises RuntimeError if qsub reports a non-zero status or its output
        does not match Chunk.JOBNO_PATTERN.
        """

        # Dependent job submission will fail if parent has already
        # completed. So delay all job startups by a short amount of time.

        startAt = datetime.datetime.now() + datetime.timedelta(0, STARTWAIT)
        startAtStr = startAt.strftime('%Y%m%d%H%M.%S')

        cmd = list()
        cmd.append('qsub')
        cmd.append('-N %s' % self.jobName)  # job name
        cmd.append('-o %s' % self.scriptOutput)  # output file
        cmd.append('-j oe')  # combine stdout and stderr
        cmd.append('-l nodes=1:ppn=1,walltime=4:00:00')  # resources required
        cmd.append('-a %s' % startAtStr)  # delay start, see above
        cmd.append(
            '-d . ')  # working directory (strangely, ./ is not the default)
        cmd.append('-r n')  # do NOT attempt to restart on failure
        cmd.append('-V')  # export all environment variables to job
        cmd.append('-W umask=0002')  # make logs rw-rw-r--
        cmd.append('-m n')  # don't send any mail
        cmd.append(self.scriptName)  # script to run

        command = ' '.join(cmd)
        logger.debug('running %s', command)

        popen_file = os.popen(command)
        response = popen_file.read().strip()
        rc = popen_file.close()  # None means the command exited with status 0
        if rc is not None:
            logger.error('command failed, rc=%d' % rc)
            raise RuntimeError('command failed, rc=%d: %s' % (rc, command))

        match = re.match(Chunk.JOBNO_PATTERN, response)
        if match is None:
            # Report the text qsub actually returned.
            logger.error("invalid job sequence number: %s" % response)
            raise RuntimeError('invalid job sequence number: %s' % response)

        jobno = match.group(1)
        logger.debug('jobno is %s', jobno)
        self.jobno = jobno
        return jobno
Exemplo n.º 10
0
    def submitScript(self):
        """Queue self.scriptName through qsub and remember its job number.

        The submission uses self.jobName as the job name, self.scriptOutput
        as the log file and a start time STARTWAIT seconds in the future.

        Returns the job number, i.e. group 1 of Chunk.JOBNO_PATTERN matched
        against the text qsub printed; it is also assigned to self.jobno.
        Raises RuntimeError if qsub reports a non-zero status or prints
        text that does not match Chunk.JOBNO_PATTERN.
        """

        # Dependent job submission will fail if parent has already
        # completed. So delay all job startups by a short amount of time.

        startAt = datetime.datetime.now() + datetime.timedelta(0, STARTWAIT)
        startAtStr = startAt.strftime('%Y%m%d%H%M.%S')

        cmd = [
            'qsub',
            '-N %s' % self.jobName,               # job name
            '-o %s' % self.scriptOutput,          # output file
            '-j oe',                              # combine stdout and stderr
            '-l nodes=1:ppn=1,walltime=4:00:00',  # resources required
            '-a %s' % startAtStr,                 # delay start, see above
            '-d . ',                              # working directory (strangely, ./ is not the default)
            '-r n',                               # do NOT attempt to restart on failure
            '-V',                                 # export all environment variables to job
            '-W umask=0002',                      # make logs rw-rw-r--
            '-m n',                               # don't send any mail
            self.scriptName,                      # script to run
        ]

        command = ' '.join(cmd)
        logger.debug('running %s', command)

        popen_file = os.popen(command)
        response = popen_file.read().strip()
        rc = popen_file.close()  # None means the command exited with status 0
        if rc is not None:
            logger.error('command failed, rc=%d' % rc)
            raise RuntimeError('command failed, rc=%d: %s' % (rc, command))

        match = re.match(Chunk.JOBNO_PATTERN, response)
        if match is None:
            # Log the unparsable qsub output itself.
            logger.error("invalid job sequence number: %s" % response)
            raise RuntimeError('invalid job sequence number: %s' % response)

        response = match.group(1)
        logger.debug('jobno is %s', response)
        self.jobno = response
        return response
Exemplo n.º 11
0
def main():
    """Print the details of a cmp.h5 file followed by a table of alignments.

    getParms() must yield exactly two positional arguments: the bas.h5
    file and the cmp.h5 file. After cf.printDetails() and a column header,
    each alignment is passed to printAlign(), taken from
    getAlignmentsByHole() when opt.sort is 'hole' and from
    getAllAlignments() otherwise.

    Exits with a non-zero status when the two file arguments are not
    supplied.
    """

    logger.debug("%s starting", sys.argv[0])

    opt, args = getParms()

    if len(args) != 2:
        logger.error("please specify bas.h5 and cmp.h5 files as parameters. See --help")
        sys.exit(1)  # non-zero so calling scripts can detect the usage error

    basFilename, cmpFilename = args

    logger.debug("bas file: %s", basFilename)
    bf = H5BasFile.BasFile(basFilename)

    logger.debug("cmp file: %s", cmpFilename)
    cf = H5CmpFile.CmpFile(fileName=cmpFilename)
    # Named cmpMovie rather than cmp to avoid hiding the Python 2 builtin.
    cmpMovie = H5CmpFile.CmpMovie(cmpObject=cf,
                                  movieName=bf.movieName(),
                                  maxHole=bf.maxZMW())

    cf.printDetails()

    # Single-argument print(...) behaves the same as the print statement
    # under Python 2; print("") emits the blank line a bare print did.
    print(" AlnID  RG   Hole Set Stb  SubRd Seq Ref St      Start        End RefStrt  RefEnd    OffStrt     OffEnd")
    print("")

    if opt.sort == 'hole':
        alignments = cmpMovie.getAlignmentsByHole()
    else:  # else, must be 'none'
        alignments = cmpMovie.getAllAlignments()  # generator function, returns a dict

    for align in alignments:
        printAlign(align)

    logger.debug("complete")
Exemplo n.º 12
0
def main():
    """Plot the adapter alignment scores along one ZMW read and save them.

    getParms() supplies the options and two positional arguments: the
    bas.h5 filename and an integer ZMW (hole) number. The read between
    opt.start and opt.end (the read length when opt.end is None) is scored
    against H5BasFile.ADAPTER with SWAligner.Aligner, and the scores are
    plotted against read position. Unless opt.nocol is set, the HQ region
    and adapter regions are overlaid in their own colours. The figure is
    saved to opt.output, or to "ZMW-<hole>.png" when that is None.

    Exits with a non-zero status if the ZMW argument is missing or is not
    an integer.
    """

    logger.debug("%s starting", sys.argv[0])

    opt, args = getParms()

    basFilename = args[0]
    basfile = H5BasFile.BasFile(basFilename)

    try:
        hole = int(args[1])
    except (IndexError, ValueError):
        # A missing argument is reported the same way as a malformed one
        # rather than surfacing as a traceback.
        logger.error('ERROR: second parameter must be an integer ZMW number')
        sys.exit(1)  # non-zero so calling scripts can detect the failure

    start = opt.start
    end = basfile.readLen(hole) if opt.end is None else opt.end

    sequence = basfile.getSequence(hole, start, end)

    aln = SWAligner.Aligner()
    aln.setRef(sequence)
    aln.setRead(H5BasFile.ADAPTER)
    aln.fillMatrix()
    allScores = aln.allScores()

    def plotRegion(regionStart, regionEnd, colour, zorder, label):
        # allScores is plotted against positions start..end, so
        # allScores[i] belongs to read position start + i. Clip the region
        # to the plotted window and shift the slice by start so that x and
        # y stay aligned (and equally long) when --start/--end are used.
        lo = min(max(regionStart, start), end)
        hi = max(lo, min(regionEnd, end))
        plt.plot(xrange(lo, hi), allScores[lo - start:hi - start],
                 colour, zorder=zorder, label=label)

    positions = xrange(start, end)  # not called 'range': keep the builtin usable
    title = "ZMW %d (%d to %d)" % (hole, start, end)

    plt.suptitle(title, fontsize=14, fontweight='bold')

    plt.plot(positions, allScores, COL_NOT_HQ, zorder=1, label='non-HQ')

    if not opt.nocol:           # finding HQ region takes a long time, so optionally turn it off

        # There doesn't seem to be a way to separately specify a
        # colour for each point in a plot. So we'll plot in one
        # colour, then overlay subregions of that with another
        # colour. Plot commands are rendered in increasing zorder.

        HQStart, HQEnd = basfile.HQregion(hole)[2:4]
        plotRegion(HQStart, HQEnd, COL_HQ, 2, 'HQ')

        label = 'adapter'              # I will only say this once...

        for region in basfile.holeRegions(hole):     # loop through the regions looking for adapters

            regionType, regionStart, regionEnd = region[1:4]

            if regionType == 0:        # an adapter?
                plotRegion(regionStart, regionEnd, COL_ADAPT, 3, label)
                label = '_nolegend_'   # don't generate multiple legend entries

        plt.legend(loc='best', prop={'size': 10})    # add a legend box to figure

    plt.ylim(0, len(H5BasFile.ADAPTER))

    if opt.output is not None:
        outfile = opt.output
    else:
        outfile = "ZMW-%05d.png" % hole
    plt.savefig(outfile)

    logger.debug("complete")
Exemplo n.º 13
0
def main():
    """Save a plot of adapter alignment scores along one ZMW read.

    Positional arguments (from getParms()) are the bas.h5 filename and an
    integer ZMW (hole) number. The read from opt.start to opt.end (or to
    the read length when opt.end is None) is scored against
    H5BasFile.ADAPTER using SWAligner.Aligner and plotted by read
    position. Unless opt.nocol is set, the HQ region and each adapter
    region are drawn over the base plot in their own colours. Output goes
    to opt.output, defaulting to "ZMW-<hole>.png".

    Exits with a non-zero status if the ZMW argument is missing or is not
    an integer.
    """

    logger.debug("%s starting", sys.argv[0])

    opt, args = getParms()

    basFilename = args[0]
    basfile = H5BasFile.BasFile(basFilename)

    try:
        hole = int(args[1])
    except (IndexError, ValueError):
        # Treat an absent ZMW argument like an unparsable one so the user
        # gets the usage message instead of a traceback.
        logger.error('ERROR: second parameter must be an integer ZMW number')
        sys.exit(1)  # signal failure to the caller

    start = opt.start
    end = basfile.readLen(hole) if opt.end is None else opt.end

    sequence = basfile.getSequence(hole, start, end)

    aln = SWAligner.Aligner()
    aln.setRef(sequence)
    aln.setRead(H5BasFile.ADAPTER)
    aln.fillMatrix()
    allScores = aln.allScores()

    def overlay(regionStart, regionEnd, colour, zorder, label):
        # The base plot pairs allScores[i] with read position start + i.
        # Region bounds are read positions, so clip them to the plotted
        # window and subtract start before slicing; otherwise the overlay
        # is shifted (or x/y lengths differ) whenever start != 0 or the
        # region extends past end.
        first = min(max(regionStart, start), end)
        last = max(first, min(regionEnd, end))
        plt.plot(xrange(first, last),
                 allScores[first - start:last - start],
                 colour,
                 zorder=zorder,
                 label=label)

    positions = xrange(start, end)  # avoid shadowing the builtin 'range'
    title = "ZMW %d (%d to %d)" % (hole, start, end)

    plt.suptitle(title, fontsize=14, fontweight='bold')

    plt.plot(positions, allScores, COL_NOT_HQ, zorder=1, label='non-HQ')

    if not opt.nocol:  # finding HQ region takes a long time, so optionally turn it off

        # There doesn't seem to be a way to separately specify a
        # colour for each point in a plot. So we'll plot in one
        # colour, then overlay subregions of that with another
        # colour. Plot commands are rendered in increasing zorder.

        HQStart, HQEnd = basfile.HQregion(hole)[2:4]
        overlay(HQStart, HQEnd, COL_HQ, 2, 'HQ')

        label = 'adapter'
        # I will only say this once...

        # loop through the regions looking for adapters
        for region in basfile.holeRegions(hole):

            regionType, regionStart, regionEnd = region[1:4]

            if regionType == 0:  # an adapter?
                overlay(regionStart, regionEnd, COL_ADAPT, 3, label)
                label = '_nolegend_'  # don't generate multiple legend entries

        plt.legend(loc='best', prop={'size': 10})  # add a legend box to figure

    plt.ylim(0, len(H5BasFile.ADAPTER))

    if opt.output is not None:
        outfile = opt.output
    else:
        outfile = "ZMW-%05d.png" % hole
    plt.savefig(outfile)

    logger.debug("complete")