Пример #1
0
 def run(self):
     """Chunk the input sequence and schedule one PreprocessChunk child per chunk.

     Every input chunk gets a matching output chunk path inside the
     "preprocessChunksOut" directory. Each child receives a window of input
     chunks around (and including) its own chunk. A MergeChunks follow-on is
     set with the list of output chunk paths and self.outSequencePath.
     """
     logger.info("Preparing sequence for preprocessing")
     # chunk it up; every non-empty line printed by the chunking command is kept as one chunk
     inChunkDirectory = makeSubDir(os.path.join(self.getGlobalTempDir(), "preprocessChunksIn"))
     chunkerOutput = popenCatch(
         "cactus_blast_chunkSequences %s %i 0 %s %s"
         % (getLogLevelString(), self.prepOptions.chunkSize, inChunkDirectory, self.inSequencePath)
     )
     inChunkList = [chunk for chunk in chunkerOutput.split("\n") if chunk != ""]
     outChunkDirectory = makeSubDir(os.path.join(self.getGlobalTempDir(), "preprocessChunksOut"))
     outChunkList = []
     chunkCount = len(inChunkList)
     if chunkCount > 0:
         # The window size and sampled proportion are the same for every
         # chunk, so compute them once. The guard keeps the empty-input case
         # free of the assertion, exactly as when this lived inside the loop.
         inChunkNumber = int(max(1, math.ceil(chunkCount * self.prepOptions.proportionToSample)))
         assert inChunkNumber <= chunkCount and inChunkNumber > 0
         sampledProportion = float(inChunkNumber) / chunkCount
     # For each input chunk we create an output chunk, it is the output chunks that get concatenated together.
     for i in xrange(chunkCount):
         outChunkList.append(os.path.join(outChunkDirectory, "chunk_%i" % i))
         # Now get the list of chunks flanking and including the current chunk.
         # Floor division keeps the slice start an integer.
         j = max(0, i - inChunkNumber // 2)
         inChunks = inChunkList[j : j + inChunkNumber]
         if len(inChunks) < inChunkNumber:  # This logic is like making the list circular
             inChunks += inChunkList[: inChunkNumber - len(inChunks)]
         assert len(inChunks) == inChunkNumber
         self.addChildTarget(
             PreprocessChunk(self.prepOptions, inChunks, sampledProportion, inChunkList[i], outChunkList[i])
         )
     # follow on to merge chunks
     self.setFollowOnTarget(MergeChunks(self.prepOptions, outChunkList, self.outSequencePath))
Пример #2
0
    def run(self):
        """Resolve the input to a single sequence file, log its stats, then copy or preprocess it.

        When the input is a directory, all entries in it are concatenated into
        a temp file. If the config node has no "preprocessor" elements the
        sequence file is copied to self.outputSequenceFile; otherwise a
        BatchPreprocessor child is added.
        """
        source = self.inputSequenceFileOrDirectory
        # If the files are in a sub-dir then rip them out.
        if not os.path.isdir(source):
            inputSequenceFile = source
        else:
            inputSequenceFile = getTempFile(rootDir=self.getGlobalTempDir())
            members = [os.path.join(source, name) for name in os.listdir(source)]
            catFiles(members, inputSequenceFile)

        assert inputSequenceFile != self.outputSequenceFile

        prepXmlElems = self.configNode.findall("preprocessor")

        analysisString = runCactusAnalyseAssembly(inputSequenceFile)
        statsMessage = (
            "Before running any preprocessing on the assembly: %s got following stats (assembly may be listed as temp file if input sequences from a directory): %s"
            % (source, analysisString)
        )
        self.logToMaster(statsMessage)

        if len(prepXmlElems) > 0:
            logger.info("Adding child batch_preprocessor target")
            self.addChildTarget(BatchPreprocessor(prepXmlElems, inputSequenceFile, self.outputSequenceFile, 0))
            return

        # No preprocessors configured: just cp the file to the output file
        system("cp %s %s" % (inputSequenceFile, self.outputSequenceFile))
Пример #3
0
def runWorkflow_multipleExamples(inputGenFunction,
                                 testNumber=1,
                                 testRestrictions=(TestStatus.TEST_SHORT, TestStatus.TEST_MEDIUM, \
                                                   TestStatus.TEST_LONG, TestStatus.TEST_VERY_LONG,),
                               inverseTestRestrictions=False,
                               batchSystem="single_machine",
                               buildAvgs=False, buildReference=False,
                               buildReferenceSequence=False,
                               buildCactusPDF=False, buildAdjacencyPDF=False,
                               buildReferencePDF=False,
                               makeCactusTreeStats=False, makeMAFs=False,
                               configFile=None, buildJobTreeStats=False):
    """A wrapper to run a number of examples.

    The examples only run when the current test status is selected: it must
    be in testRestrictions, or NOT in testRestrictions when
    inverseTestRestrictions is set. For each of the testNumber rounds a temp
    directory is created under the current working directory,
    inputGenFunction produces the inputs, runWorkflow_TestScript is invoked
    and the temp directory is removed.

    buildReferenceSequence and buildReferencePDF are accepted but are not
    forwarded by this function.
    """
    statusListed = TestStatus.getTestStatus() in testRestrictions
    if inverseTestRestrictions:
        shouldRun = not statusListed
    else:
        shouldRun = statusListed
    if not shouldRun:
        return

    # Options that are identical for every round.
    workflowOptions = dict(batchSystem=batchSystem,
                           buildAvgs=buildAvgs,
                           buildReference=buildReference,
                           buildCactusPDF=buildCactusPDF,
                           buildAdjacencyPDF=buildAdjacencyPDF,
                           makeCactusTreeStats=makeCactusTreeStats,
                           makeMAFs=makeMAFs,
                           configFile=configFile,
                           buildJobTreeStats=buildJobTreeStats)
    for test in xrange(testNumber):
        tempDir = getTempDirectory(os.getcwd())
        generated = inputGenFunction(regionNumber=test, tempDir=tempDir)
        sequences, newickTreeString = generated
        runWorkflow_TestScript(sequences, newickTreeString, **workflowOptions)
        system("rm -rf %s" % tempDir)
        logger.info("Finished random test %i" % test)
Пример #4
0
def main():
    """Parse the command line and run expectation-maximisation training through jobTree.

    Raises:
        RuntimeError: if positional arguments are given, or if startJobTree
            returns a non-zero value.
    """
    #Parse the inputs args/options
    parser = OptionParser(usage="usage: workingDir [options]",
                          version="%prog 0.1")
    # This Options instance is only handed to
    # addExpectationMaximisationOptions; the name is rebound to the parsed
    # options below.
    options = Options()
    parser.add_option("--sequences", dest="sequences",
                      help="Quoted list of fasta files containing sequences")
    parser.add_option("--alignments", dest="alignments", help="Cigar file ")
    addExpectationMaximisationOptions(parser, options)

    Stack.addJobTreeOptions(parser)
    options, extraArgs = parser.parse_args()
    setLoggingFromOptions(options)

    if len(extraArgs) != 0:
        raise RuntimeError("Expected no arguments, got %s arguments: %s" %
                           (len(extraArgs), " ".join(extraArgs)))

    #Log the inputs
    inputSummary = (
        "Got '%s' sequences, '%s' alignments file, '%s' output model and '%s' iterations of training"
        % (options.sequences, options.alignments, options.outputModel,
           options.iterations))
    logger.info(inputSummary)

    #This line invokes jobTree
    rootTarget = Target.makeTargetFn(
        expectationMaximisationTrials,
        args=(options.sequences, options.alignments, options.outputModel,
              options))
    jobTreeStatus = Stack(rootTarget).startJobTree(options)

    if jobTreeStatus != 0:
        raise RuntimeError("Got failed jobs")
Пример #5
0
    def killJobs(self, jobIDs):
        """
        Kills the given job indexes and makes sure they're dead.

        For every job the slurm job id is looked up, the job is removed from
        self.currentjobs, a kill is requested and the job's entries in
        self.jobIDs / self.slurmJobTasks are dropped. Afterwards the exit
        codes are polled up to five times, five seconds apart, until every
        job reports one. Jobs that still have no exit code after the last
        attempt are logged; no exception is raised for them.
        """
        for jobID in jobIDs:
            slurmJobID = self.getSlurmJobID(jobID)
            logger.info("DEL: " + str(slurmJobID))
            self.currentjobs.remove(jobID)
            try:
                Slurm.killJob(slurmJobID)
            except Exception:
                # Killing stays best-effort (the polling loop below checks
                # the outcome), but the failure is recorded instead of
                # being hidden.
                logger.exception("Failed to kill slurm job %s" % str(slurmJobID))

            # Drop the bookkeeping for this job: the task key stored in
            # slurmJobTasks indexes self.jobIDs.
            del self.jobIDs[self.slurmJobTasks[jobID]]
            del self.slurmJobTasks[jobID]

        toKill = set(jobIDs)
        maxattempts = 5
        attempts = 0
        while len(toKill) > 0 and attempts < maxattempts:
            # Iterate over a copy because finished jobs are removed in the loop.
            for jobID in list(toKill):
                if SlurmBatchSystem.getJobExitCode(
                        self.slurmJobIDs[jobID]) is not None:
                    toKill.remove(jobID)

            if len(toKill) > 0:
                logger.critical(
                    "Tried to kill some jobs, but something happened and they are still going, so I'll try again"
                )
                time.sleep(5)
                attempts += 1

        if len(toKill) > 0:
            # Make the give-up visible rather than returning silently.
            logger.critical("Giving up on killing jobs after %i attempts, still running: %s"
                            % (maxattempts, ", ".join([str(jobID) for jobID in toKill])))
Пример #6
0
    def runComparisonOfBlastScriptVsNaiveBlast(self, blastMode):
        """Run naive blast and the cactus blast pipeline on species pairs and compare the outputs.

        For every encode region and every unordered pair of species, the naive
        blast output goes to self.tempOutputFile and the cactus_blast output
        to self.tempOutputFile2; both are checked with checkCigar and then
        compared with compareResultsFile.

        blastMode "allAgainstAll" passes both sequence files as
        sequenceFiles; any other value passes the second file as
        targetSequenceFiles.
        """
        encodeRegions = []
        for regionIndex in xrange(1, 2):  #Could go to six
            encodeRegions.append("ENm00" + str(regionIndex))
        species = ("human", "mouse", "dog")
        #Other species to try "rat", "monodelphis", "macaque", "chimp"
        for encodeRegion in encodeRegions:
            regionPath = os.path.join(self.encodePath, encodeRegion)
            for position, species1 in enumerate(species):
                seqFile1 = os.path.join(regionPath, "%s.%s.fa" % (species1, encodeRegion))
                # Pair each species only with the ones after it.
                for species2 in species[position + 1:]:
                    seqFile2 = os.path.join(regionPath, "%s.%s.fa" % (species2, encodeRegion))

                    #Run simple blast
                    runNaiveBlast(seqFile1, seqFile2, self.tempOutputFile, self.tempDir)
                    logger.info("Ran the naive blast okay")

                    #Run cactus blast pipeline
                    toilDir = os.path.join(getTempDirectory(self.tempDir), "toil")
                    blastArgs = dict(alignmentsFile=self.tempOutputFile2,
                                     toilDir=toilDir,
                                     chunkSize=500000,
                                     overlapSize=10000)
                    if blastMode == "allAgainstAll":
                        blastArgs["sequenceFiles"] = [seqFile1, seqFile2]
                    else:
                        blastArgs["sequenceFiles"] = [seqFile1]
                        blastArgs["targetSequenceFiles"] = [seqFile2]
                    runCactusBlast(**blastArgs)
                    logger.info("Ran cactus_blast okay")
                    logger.critical("Comparing cactus_blast and naive blast; using mode: %s" % blastMode)
                    for cigarFile in (self.tempOutputFile, self.tempOutputFile2):
                        checkCigar(cigarFile)
                    compareResultsFile(self.tempOutputFile, self.tempOutputFile2)
Пример #7
0
def bsub(bsubline):
    """Run the given bsub command through the shell and return the job id it prints.

    bsubline is a sequence of strings that is joined with spaces. The job id
    is read from the first line of output: its second whitespace-separated
    token, with surrounding '<' and '>' characters removed, converted to int.
    """
    command = " ".join(bsubline)
    submission = subprocess.Popen(command, shell=True,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.STDOUT)
    firstLine = submission.stdout.readline()
    logger.info("BSUB: " + firstLine)
    jobIdToken = firstLine.strip().split()[1]
    jobId = int(jobIdToken.strip('<>'))
    logger.debug("Got the job id: %s" % (str(jobId)))
    return jobId
Пример #8
0
 def run(self):
     """Copy the existing input bams locally, merge and sort them, then follow on with Snp.

     Paths in self.files that do not exist are skipped. A single copied file
     is simply renamed to merge.bam; otherwise the copies are combined with
     "samtools merge". The sorted result is copied to self.outdir.
     """
     localTempDir = self.getLocalTempDir()
     localfiles = []
     for remotePath in self.files:
         if not os.path.exists(remotePath): #HACK
             continue
         # The count of files copied so far is appended to keep local names distinct.
         stem = os.path.basename(remotePath).split('.')[0]
         localname = os.path.join(localTempDir, "%s%d.bam" % (stem, len(localfiles)))
         system("scp -C %s %s" % (remotePath, localname))
         localfiles.append(localname)

     mergeFile = os.path.join(localTempDir, "merge.bam")
     if len(localfiles) != 1:
         logger.info("Merging bams...\n")
         system("samtools merge %s %s" % (mergeFile, " ".join(localfiles)))
     else:
         system("mv %s %s" % (localfiles[0], mergeFile))

     sortPrefix = os.path.join(localTempDir, "mergeSorted")
     system("samtools sort %s %s" % (mergeFile, sortPrefix))

     system("cp %s.bam %s" % (sortPrefix, self.outdir))
     #Get Snps info:
     self.setFollowOnTarget(Snp(self.outdir, self.options))
Пример #9
0
def runWorkflow_multipleExamples(inputGenFunction,
                                 testNumber=1, 
                                 testRestrictions=(TestStatus.TEST_SHORT, TestStatus.TEST_MEDIUM, \
                                                   TestStatus.TEST_LONG, TestStatus.TEST_VERY_LONG,),
                               inverseTestRestrictions=False,
                               batchSystem="single_machine",
                               buildAvgs=False, buildReference=False,
                               buildReferenceSequence=False,
                               buildCactusPDF=False, buildAdjacencyPDF=False,
                               buildReferencePDF=False,
                               makeCactusTreeStats=False, makeMAFs=False,
                               configFile=None, buildJobTreeStats=False):
    """A wrapper to run a number of examples.

    Nothing happens unless the current test status passes the restriction
    check: the status has to be listed in testRestrictions, or has to be
    absent from it when inverseTestRestrictions is set. Each round creates a
    temp directory below the current working directory, generates inputs
    with inputGenFunction, runs runWorkflow_TestScript and deletes the temp
    directory again.

    buildReferenceSequence and buildReferencePDF are part of the signature
    but are not passed on by this function.
    """
    statusIsListed = TestStatus.getTestStatus() in testRestrictions
    # Run when "listed" and "inverse" disagree; bail out when they agree.
    if statusIsListed == bool(inverseTestRestrictions):
        return

    for test in xrange(testNumber):
        tempDir = getTempDirectory(os.getcwd())
        sequences, newickTreeString = inputGenFunction(regionNumber=test,
                                                       tempDir=tempDir)
        runWorkflow_TestScript(
            sequences,
            newickTreeString,
            batchSystem=batchSystem,
            buildAvgs=buildAvgs,
            buildReference=buildReference,
            buildCactusPDF=buildCactusPDF,
            buildAdjacencyPDF=buildAdjacencyPDF,
            makeCactusTreeStats=makeCactusTreeStats,
            makeMAFs=makeMAFs,
            configFile=configFile,
            buildJobTreeStats=buildJobTreeStats,
        )
        system("rm -rf %s" % tempDir)
        logger.info("Finished random test %i" % test)
Пример #10
0
    def run(self):
        """Schedule one RunExperiment child per experiment and one RunMapping child per sample/ref pair.

        Output directories are created with "mkdir -p" as they are needed. A
        Plots follow-on is set once all children have been added.
        """
        setLogLevel("DEBUG")
        options = self.options

        def ensureDir(path):
            # Create the directory (and any missing parents) via the shell.
            system("mkdir -p %s" % path)

        system("mkdir -p %s" %(options.outdir))

        experiments, samples = getExperiments(options.cactusdir)
        # experiments and samples are matched up by position.
        for position, experiment in enumerate(experiments):
            sampleName = samples[position]
            logger.info("Experiment %s, sample %s\n" % (experiment, sampleName))
            self.addChildTarget(RunExperiment(options, experiment, sampleName))

        #Map to other refs, the structure of the directories is going to be:
        #outdir/
        #   otherRefs/
        #       sampleNA*/
        #           hg19/
        #           apd/
        #           ...
        refdir = os.path.join(options.outdir, "otherRefs")
        ensureDir(refdir)
        for sampleName in samples:
            sampleDir = os.path.join(refdir, sampleName)
            readdir = os.path.join(self.options.readdir, sampleName)
            ensureDir(sampleDir)
            for ref in self.options.refs:
                rdir = os.path.join(sampleDir, ref)
                ensureDir(rdir)
                refPath = os.path.join(self.options.refdir, ref)
                self.addChildTarget(RunMapping(self.options, refPath, rdir, readdir))

        #Done mapping, now drawPlots
        plotsDir = os.path.join(options.outdir, "plots")
        self.setFollowOnTarget(Plots(options.outdir, plotsDir, options.cleanup))
Пример #11
0
    def run(self):
        """Copy the gene file into the local temp dir and run cactus_genemapChain on the copy."""
        localGeneFile = os.path.join(self.getLocalTempDir(), "refgene.bed")
        copyCommand = "cp %s %s" % (self.geneFile, localGeneFile)
        system(copyCommand)

        chainCommand = 'cactus_genemapChain -c %s -o "%s" -s "%s" -g "%s"' % (
            self.dbStr, self.outputFile, self.refSpecies, localGeneFile)
        system(chainCommand)
        logger.info("Done genemapChain for %s\n" % self.region)
Пример #12
0
def parseJobFile(absFileName):
    """Return the job read from absFileName, or None if reading it raises IOError.

    An IOError is logged at info level and otherwise ignored.
    """
    try:
        return readJob(absFileName)
    except IOError:
        logger.info("Encountered error while parsing job file %s, so we will ignore it" % absFileName)
        return None
Пример #13
0
    def testCactusSetup(self):
        """Creates a bunch of random inputs and then passes them to cactus setup.

        For each of self.testNo rounds a temp directory is created, random
        inputs are generated, runCactusSetup is invoked twice with identical
        arguments, and the experiment database and temp directory are
        removed.
        """
        for test in xrange(self.testNo):
            tempDir = os.path.relpath(getTempDirectory(os.getcwd()))
            sequenceNumber = random.choice(xrange(100))
            sequences, newickTreeString = getCactusInputs_random(
                tempDir=tempDir, sequenceNumber=sequenceNumber)

            #Setup the flower disk.
            experimentDir = os.path.join('/data', os.path.relpath(tempDir))
            experiment = getCactusWorkflowExperimentForTest(
                sequences, newickTreeString, experimentDir)
            setupArgs = dict(
                cactusDiskDatabaseString=experiment.getDiskDatabaseString(),
                cactusSequencesPath=os.path.join(experiment.getDbDir(),
                                                 "cactusSequences"),
                sequences=sequences,
                newickTreeString=newickTreeString)

            # Setup is deliberately run twice with the same arguments.
            # NOTE(review): presumably this checks that setup can be
            # repeated on an existing database - confirm.
            for _ in xrange(2):
                runCactusSetup(**setupArgs)

            experiment.cleanupDb()
            system("rm -rf %s" % tempDir)
            logger.info("Finished test %i of cactus_setup.py", test)
Пример #14
0
def runCactusTreeStatsToLatexTables(inputFiles, regionNames, outputFile):
    """Run cactus_treeStatsToLatexTables.py on paired input files and region names.

    inputFiles and regionNames must have the same length; they are passed on
    the command line as alternating "file name" pairs after the
    --outputFile option.
    """
    assert len(regionNames) == len(inputFiles)
    pairedArgs = []
    for statsFile, regionName in zip(inputFiles, regionNames):
        pairedArgs.append("%s %s" % (statsFile, regionName))
    command = "cactus_treeStatsToLatexTables.py --outputFile %s %s" % (
        outputFile, " ".join(pairedArgs))
    system(command)
    logger.info("Ran cactus_treeStatsToLatexTables okay")
Пример #15
0
def parasolRestart():
    """Function starts the parasol hub and node.

    Parasol is stopped first. Then the node and hub are started and
    "parasol status" is inspected; the restart counts as successful once a
    "Nodes dead" status line ends in 0. Otherwise parasol is stopped again
    and the whole procedure is retried after five seconds, without an upper
    bound on the number of retries.
    """
    parasolStop()
    # The machine list path is the same for every attempt.
    machineList = os.path.join(workflowRootPath(), "jobTree", "machineList")
    while True:
        #pathEnvVar = os.environ["PATH"]
        os.system("paraNode start -hub=localhost")
        #-umask=002 -userPath=%s -sysPath=%s" % (pathEnvVar, pathEnvVar))
        os.system("paraHub %s subnet=127.0.0 &" % (machineList,))
        tempFile = getTempFile()
        dead = True
        try:
            popen("parasol status", tempFile)
            # The with-block closes the status file even if parsing fails.
            with open(tempFile, 'r') as fileHandle:
                for line in fileHandle:
                    if "Nodes dead" in line:
                        print(line)
                        if int(line.split()[-1]) == 0:
                            dead = False
        except RuntimeError:
            # A RuntimeError here leaves dead set to True, so this attempt
            # is treated as failed and retried below.
            pass
        finally:
            # Remove the temp file whatever happened above.
            os.remove(tempFile)
        if not dead:
            break
        logger.info("Tried to restart the parasol process, but failed, will try again")
        parasolStop()
        time.sleep(5)
    logger.info("Restarted the parasol process")
Пример #16
0
    def run(self):
        """Extract, name-sort and upload the configured range of every input bam.

        For each entry of self.files the bam is symlinked into the local temp
        dir, a "<name>\\t<info>" line is written to file.lst, a bam index is
        linked (or created when none exists next to the source), the range
        self.ref2info[ref][1] is extracted and sorted by name, and the result
        is copied to self.sampledir. Finally file.lst is moved to
        self.sampledir.
        """
        localTempDir = self.getLocalTempDir()
        filelst = os.path.join(localTempDir, 'file.lst')
        f = open(filelst, 'w')
        try:
            for bam in self.files:
                #Copy necessary file to local tempdir first:
                localbam = os.path.join(localTempDir, os.path.basename(bam.path))
                #localbambai = os.path.join( localTempDir, "%s.bai" % os.path.basename(file.path) )
                system("ln -s %s %s" % (bam.path, localbam))

                region = self.ref2info[bam.ref][1]

                # Remove only a literal ".bam" suffix. str.rstrip('.bam')
                # strips any trailing '.', 'b', 'a' or 'm' characters and so
                # mangles names such as "x.bwa.bam".
                filename = os.path.basename(bam.path)
                if filename.endswith('.bam'):
                    filename = filename[:-len('.bam')]
                localout = os.path.join(localTempDir, "%s-sorted" % (filename))

                logger.info("Pre-processing sample %s\n" % (filename))
                f.write("%s\t%s\n" % (filename, self.ref2info[bam.ref][0]))
                #Extract range and sort by name:
                if os.path.exists("%s.bai" % bam.path):
                    system("ln -s %s.bai %s.bai" % (bam.path, localbam))
                else:
                    system("samtools index %s" % (localbam))
                cmd = "samtools view -b %s %s | samtools sort -n - %s" % (localbam, region, localout)
                system(cmd)
                system("scp -C %s.bam %s" % (localout, self.sampledir))

                #Clean up right away:
                system("rm -f %s.bam" % localout)
                #system("rm -f %s" %localbam)
        finally:
            # Close (and thereby flush) the list before it is moved, so the
            # moved file is complete.
            f.close()
        system("mv %s %s" % (filelst, self.sampledir))
    def run(self):
        """Run the intra-chromosome part of cycle step 2 for self.thisChr.

        A start timestamp is written, then the evolver intra step, the
        conversion to fasta, the trf wrapper and the move of the trf results
        are executed in that order, and finally an end timestamp is written.
        """
        chrNameDict, revChrNameDict = lsc.extractChrNamesDict(self.thisDir)
        chrName = chrNameDict[self.thisChr]
        logger.info('CycleStep2Chromosome object running, %s %s %s'
                    % (self.thisDir, self.thisChr, chrName))
        lsc.verifyDirExists(self.thisDir)
        startStamp = os.path.join(self.thisDir, 'xml',
                                  'cycle.step2.%s.start.xml' % chrName)
        lsc.createTimestamp(startStamp, extra = {'name': self.thisChr})

        # evolver intra on one chromosome
        intraCmds = lsc.evolverIntraStepCmd(self.thisDir, self.theChild,
                                            self.thisStepLength, self.thisChr,
                                            self.options.seed,
                                            self.options.paramsDir,
                                            self.getLocalTempDir(),
                                            self.options)
        lsc.runCommands(intraCmds, self.getLocalTempDir())

        # evolver conversion from .rev to fasta in localTempDir
        fastaCmds = lsc.evolverIntraStepToFastaCmd(self.thisDir,
                                                   self.thisStepLength,
                                                   self.thisChr,
                                                   self.options.paramsDir,
                                                   self.getLocalTempDir())
        lsc.runCommands(fastaCmds, self.getLocalTempDir())

        # trf wrapper
        lsc.callEvolverIntraStepTRFCmd(self.thisDir, self.thisChr, self.getLocalTempDir())

        # move the resulting trf files out of localTempDir
        moveCmds = lsc.evolverIntraStepMoveTRFCmd(self.thisDir, self.thisChr,
                                                  self.getLocalTempDir())
        lsc.runCommands(moveCmds, self.getLocalTempDir(), mode = 'p')

        endStamp = os.path.join(self.thisDir, 'xml',
                                'cycle.step2.%s.end.xml' % chrName)
        lsc.createTimestamp(endStamp, extra = {'name': self.thisChr})
Пример #18
0
def writeConfig(config):
    """Write the config element tree to the config file of its job tree.

    The target path comes from getConfigFileName applied to the element's
    "job_tree" attribute. The file handle is closed even when writing fails.
    """
    #Write the config file to disk
    fileHandle = open(getConfigFileName(config.attrib["job_tree"]), 'w')
    try:
        ET.ElementTree(config).write(fileHandle)
    finally:
        fileHandle.close()
    logger.info("Written the config file")
Пример #19
0
    def run(self):
        """Add the first child target: PreProcess when options.inputInfo is set, otherwise Start."""
        setLogLevel("DEBUG")
        logger.info("Adding experiments to jobTree\n")
        # Only the chosen class name is looked up, as in an if/else.
        firstStage = PreProcess if self.options.inputInfo else Start
        self.addChildTarget(firstStage(self.options))
Пример #20
0
def runCactusReference(cactusDiskDatabaseString, flowerNames, logLevel=None,
                       matchingAlgorithm=None, 
                       referenceEventString=None, 
                       permutations=None,
                       useSimulatedAnnealing=None,
                       theta=None,
                       phi=None, 
                       maxWalkForCalculatingZ=None,
                       ignoreUnalignedGaps=None,
                       wiggle=None, 
                       numberOfNs=None,
                       minNumberOfSequencesToSupportAdjacency=None,
                       makeScaffolds=None):
    """Runs cactus reference.

    Every optional argument is turned into a command-line fragment with
    nameValue, the cactus_reference command is run with flowerNames supplied
    on stdin, and the non-empty lines of its output are returned as a list.
    """
    logLevel = getLogLevelString2(logLevel)
    # Fragments are listed in the order they appear on the command line.
    optionFragments = [
        nameValue("matchingAlgorithm", matchingAlgorithm),
        nameValue("referenceEventString", referenceEventString),
        nameValue("permutations", permutations, int),
        nameValue("useSimulatedAnnealing", useSimulatedAnnealing, bool),
        nameValue("theta", theta, float),
        nameValue("phi", phi, float),
        nameValue("maxWalkForCalculatingZ", maxWalkForCalculatingZ, int),
        nameValue("ignoreUnalignedGaps", ignoreUnalignedGaps, bool),
        nameValue("wiggle", wiggle, float),
        nameValue("numberOfNs", numberOfNs, int),
        nameValue("minNumberOfSequencesToSupportAdjacency", minNumberOfSequencesToSupportAdjacency, int),
        nameValue("makeScaffolds", makeScaffolds, bool),
    ]
    command = "cactus_reference --cactusDisk '%s' --logLevel %s" % \
        (cactusDiskDatabaseString, logLevel)
    for fragment in optionFragments:
        # A fragment is appended after a single space even when it is empty.
        command += " %s" % (fragment,)
    masterMessages = popenCatch(command, stdinString=flowerNames)
    logger.info("Ran cactus_reference okay")
    messages = []
    for message in masterMessages.split("\n"):
        if message != '':
            messages.append(message)
    return messages
Пример #21
0
    def run(self):
        """Load every pickled sample, write the clone-size tables and schedule DiversityRarefaction.

        Each entry <name> of self.sampledir is expected to hold a
        gzip-compressed pickle at <sampledir>/<name>/<name>. Two tables are
        written to options.outdir: clonesize.txt and clonesize.tex.
        """
        self.logToMaster("RepSize\n")
        stime = time.time()
        name2sample = {}
        for sam in os.listdir(self.sampledir):
            filepath = os.path.join(self.sampledir, sam, sam)
            # NOTE(review): pickle.load executes whatever the file encodes;
            # this assumes the sample files come from a trusted source -
            # confirm.
            handle = gzip.open(filepath, 'rb')
            try:
                sample = pickle.load(handle)
            finally:
                # Release the file handle instead of leaving it to the GC.
                handle.close()
            name2sample[sam] = sample
        logger.info("RepSize, done loading %d samples in %.4f s." %
                    (len(name2sample), (time.time() - stime)))
        stime = time.time()

        # Get summary of samples' sizes:
        group2samples = self.options.group2samples
        group2avr = libcommon.get_group_avr(name2sample, group2samples)
        logger.info("RepSize, done computing group_avr in %.4f s." %
                    (time.time() - stime))

        txtfile = os.path.join(self.options.outdir, "clonesize.txt")
        repsize.repsize_table(name2sample, txtfile, group2avr, group2samples)
        texfile = os.path.join(self.options.outdir, "clonesize.tex")
        repsize.repsize_table(name2sample, texfile, group2avr, group2samples,
                              True)
        self.addChildTarget(diversity.DiversityRarefaction(self.sampledir,
                                                           self.options))
Пример #22
0
def main():
    """Parse the command line and run the MakeAlignments job tree.

    Raises:
        RuntimeError: if any positional arguments are given.
    """
    ##########################################
    #Construct the arguments.
    ##########################################

    parser = OptionParser()

    # Every option is a plain string option whose dest equals its name.
    optionNames = (
        "haplotypeSequences",
        "newickTree",
        "assembliesDir",
        "outputDir",
        "configFile",
        "minimumNsForScaffoldGap",
        "assemblyEventString",
        "haplotype1EventString",
        "haplotype2EventString",
        "contaminationEventString",
        "featureBedFiles",
        "geneBedFiles",
    )
    for optionName in optionNames:
        parser.add_option("--" + optionName, dest=optionName)

    Stack.addJobTreeOptions(parser)

    options, extraArgs = parser.parse_args()
    setLoggingFromOptions(options)

    if len(extraArgs) != 0:
        raise RuntimeError("Unrecognised input arguments: %s" % " ".join(extraArgs))

    # --haplotypeSequences is split on whitespace into a list.
    rootTarget = MakeAlignments(newickTree=options.newickTree,
                                haplotypeSequences=options.haplotypeSequences.split(),
                                assembliesDir=options.assembliesDir,
                                outputDir=options.outputDir,
                                configFile=options.configFile,
                                options=options)
    Stack(rootTarget).startJobTree(options)
    logger.info("Done with job tree")
Пример #23
0
    def obtainSystemConstants(self):
        """Run ``lshosts`` and record the largest ncpus and maxmem values it reports.

        Sets self.maxCPU (an int) and self.maxMEM (a MemoryString). Hosts
        reporting '-' in a column are ignored for that column.

        Raises:
            RuntimeError: if the header lacks an ncpus or maxmem column, if a
                row has fewer columns than the header, or if no host reports
                a usable ncpus or maxmem value.
        """
        p = subprocess.Popen(["lshosts"], stdout = subprocess.PIPE, stderr = subprocess.STDOUT)

        # The first line is the header; find the columns of interest in it.
        line = p.stdout.readline()
        items = line.strip().split()
        num_columns = len(items)
        cpu_index = None
        mem_index = None
        for i, item in enumerate(items):
            if item == 'ncpus':
                cpu_index = i
            elif item == 'maxmem':
                mem_index = i

        if cpu_index is None or mem_index is None:
            raise RuntimeError("lshosts command does not return ncpus or maxmem columns")

        # The line after the header is skipped, as before.
        # NOTE(review): confirm this line is not a host row.
        p.stdout.readline()

        self.maxCPU = 0
        self.maxMEM = MemoryString("0")
        memFound = False
        for line in p.stdout:
            items = line.strip().split()
            if len(items) < num_columns:
                raise RuntimeError("lshosts output has a varying number of columns")
            if items[cpu_index] != '-':
                # Compare numerically; the raw column value is a string.
                ncpus = int(items[cpu_index])
                if ncpus > self.maxCPU:
                    self.maxCPU = ncpus
            if items[mem_index] != '-' and MemoryString(items[mem_index]) > self.maxMEM:
                self.maxMEM = MemoryString(items[mem_index])
                memFound = True

        # maxMEM only changes when a host reports more than the initial "0",
        # so memFound is False exactly when no usable maxmem was seen.
        if self.maxCPU == 0 or not memFound:
            raise RuntimeError("lshosts returns null ncpus or maxmem info")
        logger.info("Got the maxCPU: %s" % (self.maxCPU))
        logger.info("Got the maxMEM: %s" % (self.maxMEM))
Пример #24
0
def runCactusBlast(sequenceFiles, outputFile, jobTreeDir,
                   chunkSize=None, overlapSize=None, 
                   logLevel=None, 
                   blastString=None, 
                   selfBlastString=None,
                   compressFiles=None,
                   lastzMemory=None,
                   targetSequenceFiles=None):
    """Build the cactus_blast.py command line and run it.

    sequenceFiles (and targetSequenceFiles, when given) are sequences of
    strings that are joined with spaces. Each optional argument is turned
    into a command-line fragment with nameValue. The final command is logged
    and then executed with system.
    """
    logLevel = getLogLevelString2(logLevel)
    chunkSize = nameValue("chunkSize", chunkSize, int)
    overlapSize = nameValue("overlapSize", overlapSize, int)
    blastString = nameValue("blastString", blastString, str)
    selfBlastString = nameValue("selfBlastString", selfBlastString, str)
    compressFiles = nameValue("compressFiles", compressFiles, bool)
    lastzMemory = nameValue("lastzMemory", lastzMemory, int)
    # Identity test: only a missing value skips the join.
    if targetSequenceFiles is not None:
        targetSequenceFiles = " ".join(targetSequenceFiles)
    targetSequenceFiles = nameValue("targetSequenceFiles", targetSequenceFiles, quotes=True)
    command = "cactus_blast.py %s  --cigars %s %s %s %s %s %s %s %s --jobTree %s --logLevel %s" % \
            (" ".join(sequenceFiles), outputFile,
             chunkSize, overlapSize, blastString, selfBlastString, compressFiles,
             lastzMemory, targetSequenceFiles, jobTreeDir, logLevel)
    logger.info("Running command : %s" % command)
    system(command)
    logger.info("Ran the cactus_blast command okay")
Пример #25
0
    def run(self):
        """Extract, name-sort and upload the configured range of every input bam.

        For each entry of self.files the bam is symlinked into the local temp
        dir, a "<name>\\t<info>" line is written to file.lst, a bam index is
        linked (or created when none exists next to the source), the range
        self.ref2info[ref][1] is extracted and sorted by name, and the result
        is copied to self.sampledir. Finally file.lst is moved to
        self.sampledir.
        """
        localTempDir = self.getLocalTempDir()
        filelst = os.path.join(localTempDir, 'file.lst')
        f = open(filelst, 'w')
        try:
            for bamEntry in self.files:
                sourcePath = bamEntry.path
                #Copy necessary file to local tempdir first:
                localbam = os.path.join(localTempDir, os.path.basename(sourcePath))
                #localbambai = os.path.join( localTempDir, "%s.bai" % os.path.basename(file.path) )
                system("ln -s %s %s" % (sourcePath, localbam))

                region = self.ref2info[bamEntry.ref][1]

                # Drop just the ".bam" extension. str.rstrip('.bam') removes
                # any run of trailing '.', 'b', 'a', 'm' characters, which
                # corrupts names like "x.bwa.bam".
                filename = os.path.basename(sourcePath)
                if filename.endswith('.bam'):
                    filename = filename[:-len('.bam')]
                localout = os.path.join(localTempDir, "%s-sorted" % (filename))

                logger.info("Pre-processing sample %s\n" % (filename))
                f.write("%s\t%s\n" % (filename, self.ref2info[bamEntry.ref][0]))
                #Extract range and sort by name:
                if os.path.exists("%s.bai" % sourcePath):
                    system("ln -s %s.bai %s.bai" % (sourcePath, localbam))
                else:
                    system("samtools index %s" % (localbam))
                cmd = "samtools view -b %s %s | samtools sort -n - %s" % (
                    localbam, region, localout)
                system(cmd)
                system("scp -C %s.bam %s" % (localout, self.sampledir))

                #Clean up right away:
                system("rm -f %s.bam" % localout)
                #system("rm -f %s" %localbam)
        finally:
            # The list must be closed (flushed) before it is moved away.
            f.close()
        system("mv %s %s" % (filelst, self.sampledir))
Пример #26
0
def main():
    """Parse (statsFile, regionName) argument pairs and draw the scatter plots."""
    #Build the parser and read the command line.
    parser = getBasicOptionParser("usage: %prog [options] treeStatsFiles",
                                  "%prog 0.1")
    options, args = parseBasicOptions(parser)
    logger.info("Parsed arguments")

    #The arguments alternate between a stats XML file and the name of its region.
    assert len(args) % 2 == 0
    stats = []
    for index in range(0, len(args), 2):
        statsFile = args[index]
        regionName = args[index + 1]
        stats.append((ET.parse(statsFile).getroot(), regionName))

    #Make the scatter plots
    chainScatterPlots(stats)
    blockScatterPlots(stats)
Пример #27
0
    def runComparisonOfBlastScriptVsNaiveBlast(self, blastMode):
        """Check that cactus_blast output is nearly equivalent to a naive blast run.

        Every pair of species in every ENCODE region is aligned with both
        methods; both cigar files are checked and then compared. With
        blastMode "allAgainstAll" the two sequences are passed as one input
        set, otherwise the second sequence is passed as the target.
        """
        regionNames = [ "ENm00" + str(number) for number in xrange(1,2) ] #Could go to six
        speciesNames = ("human", "mouse", "dog")
        #Other species to try "rat", "monodelphis", "macaque", "chimp"
        for regionName in regionNames:
            regionDir = os.path.join(self.encodePath, regionName)
            for index, firstSpecies in enumerate(speciesNames):
                firstFasta = os.path.join(regionDir, "%s.%s.fa" % (firstSpecies, regionName))
                for secondSpecies in speciesNames[index + 1:]:
                    secondFasta = os.path.join(regionDir, "%s.%s.fa" % (secondSpecies, regionName))

                    #Run simple blast
                    runNaiveBlast(firstFasta, secondFasta, self.tempOutputFile, self.tempDir)
                    logger.info("Ran the naive blast okay")

                    #Run cactus blast pipeline
                    blastArgs = {
                        "alignmentsFile": self.tempOutputFile2,
                        "toilDir": os.path.join(getTempDirectory(self.tempDir), "toil"),
                        "chunkSize": 500000,
                        "overlapSize": 10000,
                    }
                    if blastMode == "allAgainstAll":
                        blastArgs["sequenceFiles"] = [ firstFasta, secondFasta ]
                    else:
                        blastArgs["sequenceFiles"] = [ firstFasta ]
                        blastArgs["targetSequenceFiles"] = [ secondFasta ]
                    runCactusBlast(**blastArgs)
                    logger.info("Ran cactus_blast okay")
                    logger.critical("Comparing cactus_blast and naive blast; using mode: %s" % blastMode)
                    for cigarFile in (self.tempOutputFile, self.tempOutputFile2):
                        checkCigar(cigarFile)
                    compareResultsFile(self.tempOutputFile, self.tempOutputFile2)
Пример #28
0
 def run(self):
     """Build a temporary file tree, fail at random, then issue the follow-up jobs.

     A ChildTarget is added for the tree pointer file and DestructFileTree
     is set as the follow-on for the tree.
     """
     #Setup a file tree below a randomly named directory in the global temp dir.
     treeRootDir = os.path.join(self.getGlobalTempDir(), getRandomAlphaNumericString())
     fileTree = TempFileTree(treeRootDir)
     rootFile = fileTree.getTempFile()
     makeFileTree(rootFile, self.depth, fileTree)

     pointerFile = fileTree.getTempFile()
     makeTreePointer(rootFile, pointerFile)
     logger.info("We've set up the file tree")

     #Roughly every second run aborts at this point.
     if 0.5 < random.random():
         raise RuntimeError()

     #Issue the child and follow on jobs
     self.addChildTarget(ChildTarget(pointerFile))
     self.setFollowOnTarget(DestructFileTree(fileTree))
     logger.info("We've added the child target and finished SetupFileTree.run()")
Пример #29
0
def writeConfig(config):
    """Write the config element as XML to the job tree's config file.

    The destination is getConfigFileName(config.attrib["job_tree"]).
    """
    #The with-block closes the handle even if serialising the tree raises
    with open(getConfigFileName(config.attrib["job_tree"]), 'w') as fileHandle:
        ET.ElementTree(config).write(fileHandle)
    logger.info("Written the config file")
Пример #30
0
def main():
   """Parse the tuning options, load the files they name and run CactusTuningWrapper."""
   parser = OptionParser(usage="Usage: %prog [options]\n")
   #(short flag, long flag, dest, help, default) for every option, in display order
   optionSpecs = (
      ("-d", "--simList", "sim", "List of simulation directories. Default: simulations.lst", "simulations.lst"),
      ("-c", "--configStartFile", "config", "cactus_workflow_config.xml", "cactus_workflow_config.xml"),
      ("-o", "--outputDir", "outputDir", "Directory for the outputs of the runs. Default: out", "out/"),
      ("-m", "--simTrueMafDir", "simTrueMafDir", "Directory for 'true' mafs of the simulations. Default: sim/", "sim/"),
      ("-t", "--tree", "tree", "Phylogeny tree of the species of interest, in Newick format.Default: tree", "tree"),
      ("-s", "--species", "species", "List of species in the order as they appear in the  Newick tree. Default: species.lst", "species.lst"),
      ("-j", "--job", "jobFile", "Job file containing command to run.", None),
   )
   for shortFlag, longFlag, destName, helpText, defaultValue in optionSpecs:
      parser.add_option(shortFlag, longFlag, dest=destName, help=helpText, default=defaultValue)
   options, _ = parser.parse_args()

   #Process options: the tree, species and simulation options name files to read
   options.outputDir = modify_dirname(options.outputDir)
   check_dir(options.outputDir)
   options.tree = getFirstLine(options.tree)
   options.species = getFirstLine(options.species).split()
   options.sim = getList(options.sim)
   logger.info("Processed options\n")

   #Tuning
   CactusTuningWrapper(options).execute(options.jobFile)
Пример #31
0
   def run(self):
      """Issue a CactusWorkflowWrapper child per simulation, then the merge follow-on.

      Each directory in self.options.sim is run with the parameter file
      self.paraFile; CactusMergeResultsAndCleanup is set as the follow-on
      and receives the number of simulations that were issued.
      """
      #--------------------------------------------
      #Run cactus & evaluations for each simulation
      #--------------------------------------------
      logger.info("CactusTuningSimulationsWrapper: going to issue cactus runs for all simulations for parameter %s\n" %(self.paraFile))
      simNum = 0
      for sim in self.options.sim:
         sim = modify_dirname(sim)
         simName = getRootDir(sim)

         #Get path to sequence file of each species
         sequenceFiles = " ".join([ os.path.join(sim, spc) for spc in self.options.species ])
         logger.info("Got sequence files: %s\n" % (sequenceFiles))

         #add child
         self.addChildTarget(CactusWorkflowWrapper(sim, simName, self.options.simTrueMafDir, self.paraFile, self.outDir, sequenceFiles, self.options.tree))
         logger.info("Added child CactusWorkflowWrapper for sim %s and confi %s\n" % (sim, self.paraFile))
         simNum += 1

      #----------------------------------------------------------------
      #Done running cactus & evaluations steps for all the simulations.
      #Now Merge results & clean up.
      #----------------------------------------------------------------
      logger.info("Done running cactus & evaluations for parameter %s. Now merge results and clean up.\n" %(self.paraFile))
      self.setFollowOnTarget(CactusMergeResultsAndCleanup(simNum, self.outDir, self.options))
      logger.info("Added CactusMergeResultsAndCleanup as FollowOnTarget for %s\n" %(self.outDir))
Пример #32
0
    def progressiveWithSubtreeRootFunction(self, experimentFile, toilDir,
                                           batchSystem, buildAvgs,
                                           buildReference,
                                           buildHal,
                                           buildFasta,
                                           toilStats):
        """Run progressiveFunction on a randomly chosen subtree of the species tree.

        This wrapper exists to keep runWorkflow_multipleExamples general:
        specifying a subtree root doesn't make sense for runCactusWorkflow.
        """
        speciesTree = ExperimentWrapper(ET.parse(experimentFile).getroot()).getTree()

        # Candidate roots are the named internal nodes (NB: the entire
        # species tree is a valid subtree).
        candidateRoots = [speciesTree.getName(node)
                          for node in speciesTree.postOrderTraversal()
                          if speciesTree.hasName(node)
                          and not speciesTree.isLeaf(node)]
        subtreeRoot = random.choice(candidateRoots)
        logger.info("Chose subtree root %s to test from species tree "
                    "%s" % (subtreeRoot, NXNewick().writeString(speciesTree)))

        self.progressiveFunction(experimentFile, toilDir, batchSystem,
                                 buildAvgs, buildReference, buildHal,
                                 buildFasta, toilStats, subtreeRoot)
Пример #33
0
    def progressiveWithSubtreeRootFunction(self, experimentFile, toilDir,
                                           batchSystem, buildAvgs,
                                           buildReference, buildHal,
                                           buildFasta, toilStats):
        """Align an arbitrary subtree of the larger species tree.

        Needed so that runWorkflow_multipleExamples stays general, because
        a subtree root doesn't make sense for runCactusWorkflow.
        """
        experimentRoot = ET.parse(experimentFile).getroot()
        tree = ExperimentWrapper(experimentRoot).getTree()

        # Collect the names of the internal nodes that can serve as the
        # root of the subtree to align.
        rootNames = []
        for node in tree.postOrderTraversal():
            if not tree.hasName(node) or tree.isLeaf(node):
                continue
            rootNames.append(tree.getName(node))

        # Pick one at random (NB: the entire species tree is a valid
        # subtree).
        subtreeRoot = random.choice(rootNames)
        treeString = NXNewick().writeString(tree)
        logger.info("Chose subtree root %s to test from species tree "
                    "%s" % (subtreeRoot, treeString))

        self.progressiveFunction(
            experimentFile, toilDir, batchSystem, buildAvgs, buildReference,
            buildHal, buildFasta, toilStats, subtreeRoot)
Пример #34
0
 def run(self):
     """Add the first child target: PreProcess if input info was given, else Start."""
     setLogLevel("DEBUG")
     logger.info("Adding experiments to jobTree\n")
     firstTarget = PreProcess if self.options.inputInfo else Start
     self.addChildTarget(firstTarget(self.options))
Пример #35
0
    def runMarginAlign(self, readFastqFile, referenceFastaFile, args=""):
        """Run marginAlign, validate its SAM output and log summary statistics.

        The command is timed, the resulting self.outputSamFile is validated
        with self.validateSam, the per-read statistics it returns are
        averaged and logged, and finally the job tree directory is removed.

        readFastqFile: path of the reads to align.
        referenceFastaFile: path of the reference to align against.
        args: extra command line arguments passed through to marginAlign.
        """
        startTime = time.time()
        system("\t".join([
            self.marginAlign, readFastqFile, referenceFastaFile,
            self.outputSamFile,
            "--jobTree=%s" % self.jobTree, args
        ]))
        runTime = time.time() - startTime
        readAlignmentStats = self.validateSam(self.outputSamFile,
                                              readFastqFile,
                                              referenceFastaFile)

        def average(statName):
            #Mean over all reads of the named per-read statistic; a list is
            #built explicitly because numpy cannot average a lazy map object.
            return numpy.average(
                [getattr(rAS, statName)() for rAS in readAlignmentStats])

        #Get some stats to print
        readIdentity = average("readIdentity")
        alignmentIdentity = average("alignmentIdentity")
        mismatchesPerAlignedBase = average("mismatchesPerAlignedBase")
        insertionsPerReadBase = average("insertionsPerReadBase")
        deletionsPerReadBase = average("deletionsPerReadBase")

        #The reference is reported before the reads, matching the labels.
        logger.info("Ran marginAlign with args: %s, with reference: %s and reads: %s. "
                    "Got Read Identity: %s, Alignment Identity: %s, "
                    "Mismatches per aligned base: %s, Insertions per read base: %s, "
                    "Deletions per read base: %s, Took: %s seconds"
                    % (args, referenceFastaFile, readFastqFile, readIdentity,
                       alignmentIdentity, mismatchesPerAlignedBase,
                       insertionsPerReadBase, deletionsPerReadBase, runTime))
        system("rm -rf %s" % self.jobTree)
Пример #36
0
 def __init__(self, config, maxCpus, maxMemory, workerFn=worker):
     """Set up the job queues and start the pool of worker processes.

     The pool size is the smaller of config's "max_threads" attribute and
     self.maxCpus; cpus and memory are divided evenly between the workers.
     """
     #Call the parent constructor
     AbstractBatchSystem.__init__(self, config, maxCpus, maxMemory)
     self.jobIndex = 0
     self.jobs = {}
     self.maxThreads = int(config.attrib["max_threads"])
     poolSize = min(self.maxThreads, self.maxCpus)
     logger.info(
         "Setting up the thread pool with %i threads given the max threads %i and the max cpus %i"
         % (poolSize, self.maxThreads, self.maxCpus))
     self.maxThreads = poolSize
     threadCount = float(self.maxThreads)
     self.cpusPerThread = float(self.maxCpus) / threadCount
     #Add the maxThreads to avoid losing memory by rounding.
     self.memoryPerThread = self.maxThreads + float(self.maxMemory) / threadCount
     assert self.cpusPerThread >= 1
     assert self.maxThreads >= 1
     assert self.maxMemory >= 1
     assert self.memoryPerThread >= 1
     self.inputQueue = Queue()
     self.outputQueue = Queue()
     self.workerFn = workerFn
     #Setup the threads: one daemon process per slot, all sharing the two queues
     for _ in xrange(self.maxThreads):
         workerProcess = Process(target=workerFn,
                                 args=(self.inputQueue, self.outputQueue))
         workerProcess.daemon = True
         workerProcess.start()
Пример #37
0
 def issueJobs(self, jobCommands):
     """Add every job to parasol and return a dict of parasol job id -> job command.

     jobCommands yields (jobCommand, memory, cpu, logFile) tuples. Adding a
     job is retried every five seconds until the first line parasol wrote
     to self.scratchFile starts with "your job <id>".
     """
     #parasol add job will return success, even if the job was not properly issued, so the reply itself is parsed
     replyPattern = re.compile("your job ([0-9]+).*")
     issuedJobs = {}
     for jobCommand, memory, cpu, logFile in jobCommands:
         assert memory is not None
         assert cpu is not None
         assert logFile is not None
         command = "parasol -verbose -ram=%i -cpu=%i -results=%s add job '%s'" % (memory, cpu, self.parasolResultsFile, jobCommand)
         match = None
         while match is None:
             popenParasolCommand(command, self.scratchFile)
             with open(self.scratchFile, 'r') as fileHandle:
                 line = fileHandle.readline()
             match = replyPattern.match(line)
             if match is None:
                 logger.info("We failed to properly add the job, we will try again after a sleep")
                 time.sleep(5)
         jobID = int(match.group(1))
         logger.debug("Got the job id: %s from line: %s" % (jobID, line))
         assert jobID not in issuedJobs
         issuedJobs[jobID] = jobCommand
         logger.debug("Issued the job command: %s with job id: %i " % (command, jobID))
     return issuedJobs
Пример #38
0
def runCactusProgressive(inputDir,
                         jobTreeDir,
                         logLevel=None, retryCount=0,
                         batchSystem="single_machine",
                         rescueJobFrequency=None,
                         skipAlignments=False,
                         buildHal=None,
                         buildFasta=None,
                         buildAvgs=False,
                         jobTreeStats=False,
                         maxThreads=None,
                         maxCpus=None,
                         defaultMemory=None,
                         recursive=None,
                         logFile=None,
                         event=None,
                         extraJobTreeArgumentsString="",
                         profileFile=None):
    """Build the cactus_progressive.py command line and run it.

    The shared workflow options are formatted by _fn, the "recursive" and
    "event" options by nameValue. When profileFile is given the script is
    run from cactusRootPath()/bin under cProfile, writing to that file.
    """
    scriptCall = "cactus_progressive.py %s" % inputDir
    workflowOptions = _fn(jobTreeDir,
                          logLevel, retryCount, batchSystem, rescueJobFrequency, skipAlignments,
                          buildAvgs, None,
                          buildHal,
                          buildFasta,
                          jobTreeStats, maxThreads, maxCpus, defaultMemory, logFile,
                          extraJobTreeArgumentsString=extraJobTreeArgumentsString)
    progressiveOptions = " %s %s" % (nameValue("recursive", recursive, bool),
                                     nameValue("event", event))
    command = scriptCall + " " + workflowOptions + progressiveOptions
    if profileFile is not None:
        command = "python -m cProfile -o %s %s/bin/%s" % (profileFile, cactusRootPath(), command)
    system(command)
    logger.info("Ran the cactus progressive okay")
Пример #39
0
    def run(self):
        """Filter one sample's clones by size, then split them by productive status.

        The clones are unpickled from the gzipped self.samplefile and, when
        any size/frequency threshold in self.opts is active, passed through
        filter_by_size. The productive and non-productive subsets that are
        non-empty are pickled to <outdir>/productive/<name>/ and
        <outdir>/non_productive/<name>/ under the sample's file name, and a
        CleanupFile follow-on is set for the sample file.
        """
        # filter by size
        starttime = time.time()
        opts = self.opts
        # Close the gzip handle explicitly instead of relying on garbage collection.
        sampleHandle = gzip.open(self.samplefile, 'rb')
        try:
            clones = pickle.load(sampleHandle)
        finally:
            sampleHandle.close()
        if (opts.mincount > 1 or opts.maxcount > 0 or opts.minfreq > 0 or
            opts.maxfreq > 0):
            clones = filter_by_size(clones, opts.mincount, opts.maxcount,
                                    opts.minfreq, opts.maxfreq)
        msg = ("Filter_by_size for file %s done in %.4f s" %
                                 (self.samplefile, time.time() - starttime))
        logger.info(msg)
        starttime = time.time()

        # filter by status
        pclones = filter_by_status(clones, True)
        npclones = filter_by_status(clones, False)

        filename = os.path.basename(self.samplefile)
        for status, statusClones in (("productive", pclones),
                                     ("non_productive", npclones)):
            if not statusClones:
                continue
            statusDir = os.path.join(self.outdir, status, self.name)
            system("mkdir -p %s" % statusDir)
            # Closing the handle guarantees the compressed stream is flushed
            # before the follow-on runs.
            outHandle = gzip.open(os.path.join(statusDir, filename), "wb")
            try:
                pickle.dump(statusClones, outHandle)
            finally:
                outHandle.close()
        msg = ("Filter_by_status for file %s done in %.4f s" %
                                 (self.samplefile, time.time() - starttime))
        logger.info(msg)
        self.setFollowOnTarget(libcommon.CleanupFile(self.samplefile))
Пример #40
0
    def run(self):
        """Copy the gene file to the local temp dir and run cactus_genemapChain on it."""
        localGeneFile = os.path.join(self.getLocalTempDir(), "refgene.bed")
        system("cp %s %s" % (self.geneFile, localGeneFile))

        chainCommand = 'cactus_genemapChain -c %s -o "%s" -s "%s" -g "%s"' % (
            self.dbStr, self.outputFile, self.refSpecies, localGeneFile)
        system(chainCommand)
        logger.info("Done genemapChain for %s\n" % self.region)
    def run(self):
        """Run (or skip) the gene deactivation step of a cycle.

        Start and end timestamps are written under <thisDir>/xml. Unless
        logs/gene_deactivation.log already exists, either the evolver gene
        deactivation command is run and its combined stdout/stderr saved as
        that log, or (with options.noGeneDeactivation) intra/evannots.gff is
        copied to annots.gff and the log is touched.
        """
        logger.info('CycleStep4 object running, %s' % self.thisDir)
        lsc.verifyDirExists(self.thisDir)
        lsc.createTimestamp(os.path.join(self.thisDir, 'xml', 'cycle.step4.start.xml'))

        outname = os.path.join(self.thisDir, 'logs', 'gene_deactivation.log')
        if not os.path.exists(outname):
            if not self.options.noGeneDeactivation:
                # by default gene deactivation is turned on.
                cmd = lsc.evolverGeneDeactivationStep(self.thisDir, self.thisParentDir)
                p = subprocess.Popen(cmd, cwd = self.getLocalTempDir(),
                                     stdout = subprocess.PIPE, stderr = subprocess.STDOUT)
                out = p.communicate()[0]
                # Write under a temporary name and rename afterwards, so the
                # log only appears once it has been written completely.
                with open(outname + '.tmp', 'w') as f:
                    f.write(out)
                os.rename(outname + '.tmp', outname)
            else:
                # this could cause a proliferation of gene creation.
                # The paths use self.thisDir; a bare thisDir is not defined here.
                cmds = [
                    [lsc.which('cp'),
                     os.path.join(self.thisDir, 'intra', 'evannots.gff'),
                     os.path.join(self.thisDir, 'annots.gff')],
                    [lsc.which('touch'), outname],
                ]
                lsc.runCommands(cmds, self.getLocalTempDir())
        lsc.createTimestamp(os.path.join(self.thisDir, 'xml', 'cycle.step4.end.xml'))
Пример #42
0
 def issueJob(self, command, memory, cpu):
     """Issues parasol with job commands.

     Adds a single job to parasol and returns the parasol job id parsed
     from the reply. The requested cpus are added to self.usedCpus first;
     cpus of jobs whose ids arrive on self.outputQueue1 are subtracted
     again, and further ids are taken from that queue while
     self.usedCpus still exceeds self.maxCpus.
     """
     self.checkResourceRequest(memory, cpu)
     #The job id is parsed from the start of parasol's reply line
     pattern = re.compile("your job ([0-9]+).*")
     parasolCommand = "%s -verbose -ram=%i -cpu=%i -results=%s add job '%s'" % (self.parasolCommand, memory, cpu, self.parasolResultsFile, command)
     #Deal with the cpus
     self.usedCpus += cpu
     while True: #Process finished results with no wait
         try:
            jobID = self.outputQueue1.get_nowait()
            self.usedCpus -= self.jobIDsToCpu.pop(jobID)
            assert self.usedCpus >= 0
            self.outputQueue1.task_done()
         except Empty:
             #Nothing further is queued right now
             break
     while self.usedCpus > self.maxCpus: #If we are still waiting
         #NOTE(review): presumably get() blocks until another job finishes - confirm the queue type
         self.usedCpus -= self.jobIDsToCpu.pop(self.outputQueue1.get())
         assert self.usedCpus >= 0
         self.outputQueue1.task_done()
     #Now keep going
     while True:
         #time.sleep(0.1) #Sleep to let parasol catch up #Apparently unnecessary
         line = popenParasolCommand(parasolCommand)[1][0]
         match = pattern.match(line)
         if match != None: #This is because parasol add job will return success, even if the job was not properly issued!
             break
         else:
             logger.info("We failed to properly add the job, we will try again after a sleep")
             time.sleep(5)
     jobID = int(match.group(1))
     #Remember the cpus of this job so they can be subtracted once its id comes back on the queue
     self.jobIDsToCpu[jobID] = cpu
     logger.debug("Got the parasol job id: %s from line: %s" % (jobID, line))
     logger.debug("Issued the job command: %s with (parasol) job id: %i " % (parasolCommand, jobID))
     return jobID
Пример #43
0
 def run(self):
     """Align the assembly with the haplotypes using cactus, then queue MakeStats1.

     The alignment is only built when <outputDir>/cactusAlignment does not
     exist yet: the assembly is copied (and gunzipped if its name ends in
     .gz) into the local temp dir, an experiment file is written, the
     cactus workflow is run on a single machine and the contents of the
     local temp dir are moved to self.outputDir.
     """
     alignmentName = "cactusAlignment"
     alignmentPath = os.path.join(self.outputDir, alignmentName)
     if not os.path.exists(alignmentPath):
         #Prepare the assembly: take a private, uncompressed copy of it
         isGzipped = self.assemblyFile.endswith('.gz')
         assemblyCopy = getTempFile(rootDir=self.getLocalTempDir(),
                                    suffix=".gz" if isGzipped else "")
         system("cp %s %s" % (self.assemblyFile, assemblyCopy))
         if isGzipped:
             system("gunzip %s" % assemblyCopy)
             #gunzip drops the .gz extension from the file name
             assemblyCopy = assemblyCopy[:-3]
             assert os.path.exists(assemblyCopy)

         #Make the supporting temporary files
         experimentFile = getTempFile(rootDir=self.getLocalTempDir())
         jobTreeDir = os.path.join(self.getLocalTempDir(), "jobTree")

         #Make the experiment file
         experiment = ExperimentWrapper.createExperimentWrapper(
             sequences=self.haplotypeSequences + [assemblyCopy],
             newickTreeString=self.newickTree,
             outputDir=self.getLocalTempDir(),
             configFile=self.configFile)
         experiment.setDbName(alignmentName)
         #This needs to be set to ensure the thing gets put in the right directory
         databaseDir = os.path.join(self.getLocalTempDir(),
                                    experiment.getDbName())
         experiment.setDbDir(databaseDir)
         experiment.writeXML(experimentFile)

         #Now run cactus workflow
         runCactusWorkflow(experimentFile=experimentFile,
                           jobTreeDir=jobTreeDir,
                           buildAvgs=False,
                           buildReference=True,
                           batchSystem="single_machine",
                           maxThreads=1,
                           jobTreeStats=True)
         logger.info("Ran the workflow")

         #Check if the jobtree completed successfully.
         runJobTreeStatusAndFailIfNotComplete(jobTreeDir)
         logger.info("Checked the job tree dir")

         #Compute the stats
         statsFile = os.path.join(self.getLocalTempDir(), "jobTreeStats.xml")
         system("jobTreeStats --jobTree %s --outputFile %s" %
                (jobTreeDir, statsFile))

         #Now move everything from the local temp dir back to the output
         system("mv %s/* %s" % (self.getLocalTempDir(), self.outputDir))
         assert os.path.exists(alignmentPath)
     #We're done! Hand the alignment to the stats target.
     self.addChildTarget(
         MakeStats1(self.outputDir, alignmentPath, self.options))
Пример #44
0
def runCactusBar(cactusDiskDatabaseString, flowerNames, logLevel=None,
                 spanningTrees=None, maximumLength=None,
                 gapGamma=None,
                 matchGamma=None,
                 splitMatrixBiggerThanThis=None,
                 anchorMatrixBiggerThanThis=None,
                 repeatMaskMatrixBiggerThanThis=None,
                 diagonalExpansion=None,
                 constraintDiagonalTrim=None,
                 minimumBlockDegree=None,
                 minimumIngroupDegree=None,
                 minimumOutgroupDegree=None,
                 alignAmbiguityCharacters=None,
                 pruneOutStubAlignments=None,
                 useProgressiveMerging=None,
                 calculateWhichEndsToComputeSeparately=None,
                 largeEndSize=None,
                 endAlignmentsToPrecomputeOutputFile=None,
                 precomputedAlignments=None,
                 ingroupCoverageFile=None,
                 minimumSizeToRescue=None,
                 minimumCoverageToRescue=None,
                 minimumNumberOfSpecies=None):
    """Runs the cactus base aligner and returns its non-empty output lines.

    Every keyword argument is converted to a command line fragment by
    nameValue; flowerNames is handed to cactus_bar as its stdin string.
    """
    logLevel = getLogLevelString2(logLevel)
    #Option fragments, in the order in which they appear on the command line
    optionStrings = [
        nameValue("spanningTrees", spanningTrees, int),
        nameValue("maximumLength", maximumLength, int),
        nameValue("gapGamma", gapGamma, float),
        nameValue("matchGamma", matchGamma, float),
        nameValue("splitMatrixBiggerThanThis", splitMatrixBiggerThanThis, int),
        nameValue("anchorMatrixBiggerThanThis", anchorMatrixBiggerThanThis, int),
        nameValue("repeatMaskMatrixBiggerThanThis", repeatMaskMatrixBiggerThanThis, int),
        nameValue("constraintDiagonalTrim", constraintDiagonalTrim, int),
        nameValue("minimumDegree", minimumBlockDegree, int),
        nameValue("minimumIngroupDegree", minimumIngroupDegree, int),
        nameValue("minimumOutgroupDegree", minimumOutgroupDegree, int),
        nameValue("alignAmbiguityCharacters", alignAmbiguityCharacters, bool),
        nameValue("pruneOutStubAlignments", pruneOutStubAlignments, bool),
        nameValue("diagonalExpansion", diagonalExpansion, int),
        nameValue("useProgressiveMerging", useProgressiveMerging, bool),
        nameValue("calculateWhichEndsToComputeSeparately", calculateWhichEndsToComputeSeparately, bool),
        nameValue("largeEndSize", largeEndSize, int),
        nameValue("endAlignmentsToPrecomputeOutputFile", endAlignmentsToPrecomputeOutputFile, str),
        nameValue("precomputedAlignments", precomputedAlignments, str, quotes=True),
        nameValue("ingroupCoverageFile", ingroupCoverageFile, str, quotes=True),
        nameValue("minimumSizeToRescue", minimumSizeToRescue, int),
        nameValue("minimumCoverageToRescue", minimumCoverageToRescue, float),
        nameValue("minimumNumberOfSpecies", minimumNumberOfSpecies, int),
    ]
    #One %s placeholder per option, separated by single spaces
    commandFormat = " ".join(["cactus_bar --cactusDisk '%s' --logLevel %s"] +
                             ["%s"] * len(optionStrings))
    command = commandFormat % tuple([cactusDiskDatabaseString, logLevel] + optionStrings)
    masterMessages = popenCatch(command, stdinString=flowerNames)
    logger.info("Ran cactus_bar okay")
    return [ message for message in masterMessages.split("\n") if message ]
Пример #45
0
 def issueJob(self, command, memory, cpu):
     """Queue a command for submission with bsub and return its newly assigned job id."""
     newJobID = self.nextJobID
     self.nextJobID = newJobID + 1
     self.currentjobs.add(newJobID)
     #The bsub argument list from prepareBsub, followed by the command itself
     self.newJobsQueue.put((newJobID, prepareBsub(cpu, memory) + [command]))
     logger.info("Issued the job command: %s with job id: %s " % (command, str(newJobID)))
     return newJobID
Пример #46
0
def listChildDirs(jobDir):
    """Return the result of listChildDirsUnsafe(jobDir), or [] if it fails.

    Errors are deliberately ignored (only logged), but KeyboardInterrupt
    and SystemExit are no longer swallowed.
    """
    try:
        return listChildDirsUnsafe(jobDir)
    except Exception:
        logger.info(
            "Encountered error while parsing job dir %s, so we will ignore it"
            % jobDir)
    return []
Пример #47
0
	def run(self):
		"""Write the history distances of MCMC run self.i to run_<i>.dat, unless that file exists.

		The reference history id is args.refhistoryid offset by
		histseg.Global_BINWIDTH * self.i.
		"""
		args=self.options
		refhistoryid=args.refhistoryid + histseg.Global_BINWIDTH*self.i
		outfile=os.path.join(self.outdir, "run_%d.dat" % self.i)
		if not os.path.exists(outfile):
			# The with-block closes the output so the results are flushed to disk.
			with open(outfile, 'w') as outfh:
				logger.info("running %d: get_history_distances_between_mcmc_steps(events, %s, %s, %s, %s, %s) > %s" % (self.i, refhistoryid, "" , args.numsteps, args.stepsize, args.stepsize, outfile))
				mcmcdist.get_history_distances_between_mcmc_steps(self.events, refhistoryid, "", args.numsteps, args.stepsize, args.stepsize, outfh)
Пример #48
0
def runCactusTreeViewer(graphFile,
                        cactusDiskDatabaseString,
                        flowerName="0",
                        logLevel=None):
    """Run cactus_treeViewer for a flower, writing the graph to graphFile."""
    logLevel = getLogLevelString2(logLevel)
    commandFormat = " ".join(("cactus_treeViewer",
                              "--cactusDisk '%s'",
                              "--flowerName %s",
                              "--outputFile %s",
                              "--logLevel %s"))
    system(commandFormat % (cactusDiskDatabaseString, flowerName, graphFile, logLevel))
    logger.info("Created a cactus tree graph")
Пример #49
0
def loadEnvironment(config):
    """Puts the environment in the pickle file.

    Despite the name, this writes: os.environ of the current process is
    pickled into the file named by
    getEnvironmentFileName(config.attrib["job_tree"]).
    """
    environmentFile = getEnvironmentFileName(config.attrib["job_tree"])
    # The with-block closes the handle even if pickling fails.
    with open(environmentFile, 'w') as fileHandle:
        cPickle.dump(os.environ, fileHandle)
    logger.info("Written the environment for the jobs to the environment file")
    def testMatchGraph(self):
        """ Tests matchGraph.py program using randGraph.py input

        Repeated self.testNo times: a random graph is generated, matchGraph.py
        is run on it and its output file is validated. The first output line
        must hold "<vertexNum> <edgeNum>" with an even vertex count and
        edgeNum equal to half of it; every following line is an edge
        "<i> <j>". Each vertex must appear in exactly one edge.
        """

        for test in range(self.testNo):
            tempInputFile = getTempFile()
            tempOutputFile = getTempFile()

            self.tempFiles.append(tempInputFile)
            self.tempFiles.append(tempOutputFile)

            # Create sample/test input graph file
            system("blossom_randGraph.py > %s" % tempInputFile)

            # Run matchGraph.py
            system("matchGraph.py -e %s -w %s" % (tempInputFile, tempOutputFile))

            # Now check if output is valid. The with-block closes the file
            # even when an assertion fails part-way through.
            lineIdx = 0
            with open(tempOutputFile, 'r') as f:
                for line in f:
                    line = line.rstrip()
                    if lineIdx == 0:
                        (vertexNum, edgeNum) = line.split()
                        vertexNum = int(vertexNum)
                        edgeNum = int(edgeNum)
                        vertexArray = [0] * vertexNum

                        # Number of vertices must be even
                        self.assertEqual(vertexNum % 2, 0)

                        # Number of edges is half the number of vertices
                        self.assertEqual(vertexNum // 2, edgeNum)
                    else:
                        (vertexI, vertexJ,) = line.split()
                        vertexI = int(vertexI)
                        vertexJ = int(vertexJ)

                        # Vertex indices must be 0<= i,j < V. Checked before
                        # indexing so a bad index fails the assertion instead
                        # of wrapping around (negative) or raising IndexError.
                        self.assertTrue(0 <= vertexI < vertexNum)
                        self.assertTrue(0 <= vertexJ < vertexNum)

                        vertexArray[vertexI] += 1
                        vertexArray[vertexJ] += 1
                    lineIdx += 1

            # The header line must be present, otherwise the counts below
            # would be undefined.
            self.assertTrue(lineIdx > 0)

            # Must have the correct number of edges
            self.assertEqual(edgeNum, lineIdx-1)

            badCount = 0
            for i in vertexArray:
                if i != 1:
                    badCount += 1
            # Each vertex must be only in one edge
            self.assertEqual(badCount, 0)

            logger.info("Ran the test(s) of the matchGraph program okay")
Пример #51
0
 def issueJob(self, command, memory, cpu):
     """Register a new job and queue its bsub command line for submission.

     The next free id is taken from self.nextJobID (which is then advanced),
     added to self.currentjobs, and put on self.newJobsQueue together with
     prepareBsub(cpu, memory) extended by command. Returns that id.
     """
     thisJobID = self.nextJobID
     self.nextJobID += 1
     self.currentjobs.add(thisJobID)
     bsubArguments = prepareBsub(cpu, memory) + [command]
     self.newJobsQueue.put((thisJobID, bsubArguments))
     logText = "Issued the job command: %s with job id: %s " % (command, str(thisJobID))
     logger.info(logText)
     return thisJobID
Пример #52
0
 def run(self):
     """List the global temp dir, then either fail at random or destroy the temp files.

     A RuntimeError is raised whenever random.random() yields a value above
     0.5; otherwise self.tempFileTree.destroyTempFiles() is called.
     """
     logger.info("At the end, this is the contents of the global temp dir...")
     listingCommand = "ls -l %s" % self.getGlobalTempDir()
     system(listingCommand)
     logger.info("And done....")

     draw = random.random()
     if draw > 0.5:
         raise RuntimeError()

     self.tempFileTree.destroyTempFiles()
Пример #53
0
    def run(self):
        """Copy the gene file into the local temp dir and run cactus_genemapHomolog on it.

        The executable's standard output is redirected to self.output2.
        """
        localGeneFile = os.path.join(self.getLocalTempDir(), "refgene.bed")
        copyCommand = "cp %s %s" % (self.geneFile, localGeneFile)
        system(copyCommand)

        template = 'cactus_genemapHomolog -c %s -o "%s" -s "%s" -g "%s" > %s'
        command = template % (self.dbStr, self.output1, self.refSpecies,
                              localGeneFile, self.output2)
        system(command)
        summary = "Done genemapHomolog for %s, command: %s\n" % (self.region, command)
        logger.info(summary)
Пример #54
0
def parseJobFile(absFileName):
    """Return the job read from absFileName via Job.read, or None if reading fails.

    Read/parse errors are logged and swallowed so that a single bad job file
    is ignored instead of propagating to the caller.
    """
    try:
        return Job.read(absFileName)
    except Exception:
        # Still best-effort, but Exception (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        logger.info(
            "Encountered error while parsing job file %s, so we will ignore it"
            % absFileName)
    return None
Пример #55
0
    def obtainSystemConstants(self):
        """
        This should be able to set self.maxCPU and self.maxMEM

        As written it is a placeholder: both attributes are set to 0 and the
        fact that no real limits were obtained is reported in the log.
        """
        self.maxCPU = 0
        self.maxMEM = 0

        # Compare by value; "is 0" tests object identity and only works by
        # accident of small-integer caching.
        if self.maxCPU == 0 or self.maxMEM == 0:
            # Reported as a warning rather than raised: both values are
            # hard-coded to 0 above, so raising would make every call fail.
            logger.warning("Can't read ncpus or maxmem info")
        logger.info("Got the maxCPU: %s, maxMEM: %s" % (self.maxCPU, self.maxMEM))
Пример #56
0
 def run(self):
     """Add one RunSample child target per entry of this experiment's sample list.

     The list is read with getList from <dataDir>/<exp>/sample.lst, which
     is asserted to exist.
     """
     listPath = "%s/%s/%s" % (self.options.dataDir, self.exp, "sample.lst")
     logger.info("sampleListName: %s\n" % (listPath))
     assert os.path.exists(listPath)
     for sampleName in getList(listPath):
         # Each sample gets the directory "<exp>/<sample>".
         sampleDirectory = "%s/%s" % (self.exp, sampleName)
         childTarget = RunSample(self.exp, sampleName, sampleDirectory, self.options)
         self.addChildTarget(childTarget)
Пример #57
0
def bsub(bsubline):
    """Run the space-joined bsubline through the shell and return the job id it prints.

    The id is parsed as an int from the second whitespace-separated token of
    the first output line, after removing any surrounding '<' / '>'.
    stderr is merged into stdout.
    """
    commandString = " ".join(bsubline)
    # NOTE(review): the child process is never waited on and its pipe is not
    # closed here - confirm whether that is intentional.
    child = subprocess.Popen(commandString,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)
    firstLine = child.stdout.readline()
    logger.info("BSUB: " + firstLine)
    idToken = firstLine.strip().split()[1]
    jobID = int(idToken.strip('<>'))
    logger.debug("Got the job id: %s" % (str(jobID)))
    return jobID
Пример #58
0
def runCactusAdjacencyGraphViewer(graphFile,
                                  cactusDiskDatabaseString,
                                  flowerName="0",
                                  logLevel=None,
                                  includeInternalAdjacencies=False):
    """Run the cactus_adjacencyGraphViewer executable with graphFile as its output file.

    logLevel is first passed through getLogLevelString2 and the result is
    handed to the executable via --logLevel.
    """
    levelString = getLogLevelString2(logLevel)
    # NOTE(review): the nameValue result is computed but never added to the
    # command line below - confirm whether the option should be passed on.
    includeInternalAdjacencies = nameValue("includeInternalAdjacencies",
                                           includeInternalAdjacencies, bool)
    template = "cactus_adjacencyGraphViewer --cactusDisk '%s' --flowerName %s --outputFile %s --logLevel %s"
    commandLine = template % (cactusDiskDatabaseString, flowerName, graphFile, levelString)
    system(commandLine)
    logger.info("Created a break point graph of the problem")
Пример #59
0
def createFirstJob(command, config, memory=None, cpu=None, time=sys.maxint):
    """Adds the first job to the jobtree.

    memory and cpu fall back to the config attributes "default_memory" and
    "default_cpu" (converted to float) when left as None or given as
    sys.maxint. The job is created with the config's "try_count" and the job
    file directory derived from its "job_tree" attribute, then written out.
    """
    logger.info("Adding the first job")
    # Identity test for None; sys.maxint is treated as "not specified" too.
    if memory is None or memory == sys.maxint:
        memory = float(config.attrib["default_memory"])
    if cpu is None or cpu == sys.maxint:
        cpu = float(config.attrib["default_cpu"])
    job = Job(command=command, memory=memory, cpu=cpu,
              tryCount=int(config.attrib["try_count"]), jobDir=getJobFileDirName(config.attrib["job_tree"]))
    job.write()
    logger.info("Added the first job")