예제 #1
0
    def run(self):
        """Gather the input sequences into one file, report on it, then copy or preprocess it.

        If ``self.inputSequenceFileOrDirectory`` is a directory, every entry
        listed in it is concatenated into a temporary file under the global
        temp dir; otherwise the path itself is used. The string returned by
        ``runCactusAnalyseAssembly`` is logged to the master. With no
        ``preprocessor`` elements in ``self.configNode`` the file is copied to
        ``self.outputSequenceFile``; otherwise a child ``BatchPreprocessor``
        target is added to produce it.
        """
        source = self.inputSequenceFileOrDirectory
        if not os.path.isdir(source):
            inputSequenceFile = source
        else:
            # The sequences live in a sub-dir: flatten them into a single temp file.
            inputSequenceFile = getTempFile(rootDir=self.getGlobalTempDir())
            members = [os.path.join(source, entry) for entry in os.listdir(source)]
            catFiles(members, inputSequenceFile)

        # Reading and writing the same path would clobber the input.
        assert inputSequenceFile != self.outputSequenceFile

        preprocessorNodes = self.configNode.findall("preprocessor")

        stats = runCactusAnalyseAssembly(inputSequenceFile)
        report = (
            "Before running any preprocessing on the assembly: %s got following stats (assembly may be listed as temp file if input sequences from a directory): %s"
            % (source, stats)
        )
        self.logToMaster(report)

        if preprocessorNodes:
            logger.info("Adding child batch_preprocessor target")
            child = BatchPreprocessor(preprocessorNodes, inputSequenceFile, self.outputSequenceFile, 0)
            self.addChildTarget(child)
            return

        # Nothing to run: the output is just a copy of the input.
        system("cp %s %s" % (inputSequenceFile, self.outputSequenceFile))
예제 #2
0
 def run(self, fileStore):
     """Concatenate the results files into one global file and return its ID.

     Each ID in ``self.resultsFileIDs`` is read through
     ``readGlobalFileWithoutCache``, the resulting files are concatenated into
     a local temp file, and that file is written back to the file store. The
     original global files are then deleted.
     """
     logger.info("Results IDs: %s" % self.resultsFileIDs)

     localCopies = []
     for fileID in self.resultsFileIDs:
         localCopies.append(readGlobalFileWithoutCache(fileStore, fileID))

     mergedPath = fileStore.getLocalTempFile()
     catFiles(localCopies, mergedPath)
     logger.info("Collated the alignments to the file: %s",  mergedPath)

     mergedID = fileStore.writeGlobalFile(mergedPath)

     # The inputs are only removed after the merged copy has been written.
     for staleID in self.resultsFileIDs:
         fileStore.deleteGlobalFile(staleID)
     return mergedID
예제 #3
0
 def testCompression(self):
     """Time fasta compression and decompression against the bzip2 command-line tools.

     Concatenates every file under the "ENm001" sub-directory of
     ``self.encodePath`` into one fasta file, then logs how long
     ``compressFastaFile`` / ``decompressFastaFile`` and the equivalent
     ``bzip2`` / ``bunzip2`` shell commands take, followed by the three file
     sizes. Nothing is asserted; the results are only logged.
     """
     fastaPath = os.path.join(self.tempDir, "tempSeq.fa")
     roundTripPath = os.path.join(self.tempDir, "tempSeq2.fa")
     for path in (fastaPath, roundTripPath):
         self.tempFiles.append(path)
     # presumably compressFastaFile writes here - the file is removed right after that call
     bz2Path = fastaPath + ".bz2"

     self.encodePath = os.path.join(self.encodePath, "ENm001")
     regionFiles = []
     for fileName in os.listdir(self.encodePath):
         regionFiles.append(os.path.join(self.encodePath, fileName))
     catFiles(regionFiles, fastaPath)

     # Library compression.
     startTime = time.time()
     compressFastaFile(fastaPath)
     elapsed = time.time() - startTime
     logger.critical("It took %s seconds to compress the fasta file" % elapsed)

     # Command-line compression (the timing includes removing the previous archive).
     startTime = time.time()
     system("rm %s" % bz2Path)
     system("bzip2 --keep --fast %s" % fastaPath)
     elapsed = time.time() - startTime
     logger.critical("It took %s seconds to compress the fasta file by system functions" % elapsed)

     # Library decompression.
     startTime = time.time()
     decompressFastaFile(bz2Path, roundTripPath)
     elapsed = time.time() - startTime
     logger.critical("It took %s seconds to decompress the fasta file" % elapsed)

     # Command-line decompression.
     system("rm %s" % roundTripPath)
     startTime = time.time()
     system("bunzip2 --stdout %s > %s" % (bz2Path, roundTripPath))
     elapsed = time.time() - startTime
     logger.critical("It took %s seconds to decompress the fasta file using system function" % elapsed)

     originalSize = os.stat(fastaPath).st_size
     compressedSize = os.stat(bz2Path).st_size
     restoredSize = os.stat(roundTripPath).st_size
     logger.critical("File sizes, before: %s, compressed: %s, decompressed: %s" % (originalSize, compressedSize, restoredSize))
예제 #4
0
 def testCompression(self):
     """Log timings for library vs. command-line bzip2 handling of a fasta file.

     The files under ``self.encodePath``/"ENm001" are concatenated into a
     single fasta file, which is compressed and decompressed first with
     ``compressFastaFile`` / ``decompressFastaFile`` and then with the
     ``bzip2`` / ``bunzip2`` commands. Elapsed times and final file sizes are
     logged at critical level; no assertions are made.
     """
     def logElapsed(template, since):
         # Fill the message template with the seconds elapsed since `since`.
         logger.critical(template % (time.time() - since))

     rawFasta = os.path.join(self.tempDir, "tempSeq.fa")
     restoredFasta = os.path.join(self.tempDir, "tempSeq2.fa")
     self.tempFiles.append(rawFasta)
     self.tempFiles.append(restoredFasta)
     archive = rawFasta + ".bz2"

     self.encodePath = os.path.join(self.encodePath, "ENm001")
     sources = [
         os.path.join(self.encodePath, entry)
         for entry in os.listdir(self.encodePath)
     ]
     catFiles(sources, rawFasta)

     began = time.time()
     compressFastaFile(rawFasta)
     logElapsed("It took %s seconds to compress the fasta file", began)

     # The archive from the previous step is removed inside the timed section.
     began = time.time()
     system("rm %s" % archive)
     system("bzip2 --keep --fast %s" % rawFasta)
     logElapsed(
         "It took %s seconds to compress the fasta file by system functions",
         began)

     began = time.time()
     decompressFastaFile(archive, restoredFasta)
     logElapsed("It took %s seconds to decompress the fasta file", began)

     system("rm %s" % restoredFasta)
     began = time.time()
     system("bunzip2 --stdout %s > %s" % (archive, restoredFasta))
     logElapsed(
         "It took %s seconds to decompress the fasta file using system function",
         began)

     sizes = tuple(
         os.stat(path).st_size for path in (rawFasta, archive, restoredFasta))
     logger.critical(
         "File sizes, before: %s, compressed: %s, decompressed: %s" % sizes)
예제 #5
0
def cat(target, genome, output_gtf, unsorted_tmp_file, out_file_tree):
    """
    Concatenates all of the results into one big GTF, and sorts it by chromosome/pos.

    The files returned by out_file_tree.listFiles() are concatenated into
    unsorted_tmp_file, which is then sorted into output_gtf. target and genome
    are not used in the body (presumably kept for the target-function calling
    convention - verify against callers).
    """
    catFiles(out_file_tree.listFiles(), unsorted_tmp_file)
    # Column 4 is the start coordinate (assuming GTF layout), so compare it
    # numerically (-n); a plain -k4,4 compares it as text and would place
    # 1000 before 200.
    system("sort -k1,1 -k4,4n {} > {}".format(unsorted_tmp_file, output_gtf))
예제 #6
0
 def run(self):
     """Concatenate ``self.resultsFiles`` into ``self.finalResultsFile`` and log the destination."""
     destination = self.finalResultsFile
     catFiles(self.resultsFiles, destination)
     logger.info("Collated the alignments to the file: %s",  destination)
def cat(target, genome, file_tree, out_db):
    """
    Concatenates the files listed by file_tree into "tmp.txt" in the target's
    global temp dir, then registers load_db as the follow-on with that file.
    """
    merged_path = os.path.join(target.getGlobalTempDir(), "tmp.txt")
    inputs = file_tree.listFiles()
    catFiles(inputs, merged_path)
    follow_on_args = [genome, merged_path, out_db]
    target.setFollowOnTargetFn(load_db, args=follow_on_args)
예제 #8
0
def main():
    """Soft-mask repeats in a fasta file using lastz.

    Takes two positional arguments, the query fasta and the output path, and
    reads a whitespace-separated list of target fasta files from the first
    line of stdin. The query is chopped into half-overlapping fragments, the
    fragments are aligned against the concatenated targets with lastz, the
    alignments are reduced to covered intervals, and those intervals are
    applied to the query as soft-masking to produce the output file.

    Returns 1 (after printing help) when the positional argument count is
    wrong, otherwise None. The temporary working directory is removed on both
    success and failure; any exception from the pipeline propagates unchanged.
    """
    ##########################################
    #Construct the arguments.
    ##########################################

    # NOTE(review): the usage text below advertises three positional arguments,
    # but the code accepts exactly two (<query> <output>) and reads the target
    # files from stdin - confirm and update the help text.
    usage = "usage: %prog [options] <query> <target> <output>\n\n" + \
            "    <query>:  fasta sequence to search for repeats\n" + \
            "    <target>: fasta sequence to mask\n" + \
            "    <output>: softmasked version of <target>\n\n" + \
            "Example: %prog genome.fa chunk.fa chunk.masked.fa\n\n"
    description = "softrepeat mask a fasta file using lastz.\n" + \
                    "NOTE: NEW VERSION OF LASTZ REQUIRED (MINIMUM 1.02.40)\n" + \
                    "lastz tools/ dir MUST BE IN SYSTEM PATH"
    parser = OptionParser(usage=usage, description=description)

    #output stuff
    parser.add_option("--fragment", dest="fragment", type="int",
                     help="The size of chunks passed to lastz (must be at least twice as big as overlap)",
                     default=200)

    parser.add_option("--minPeriod", dest="period", type="int",
                     help="minimum number of occurrences of a sequence for it to be masked",
                     default=10)

    parser.add_option("--lastzOpts", dest="lastzOptions", type="string",
                      help="lastz options for repeat identification",
                      default="")

    parser.add_option("--lastzCmd", dest="lastzCmd", type="string",
                      help="lastz executable",
                      default="cPecanLastz")

    parser.add_option("--unmaskInput", dest="unmaskInput", action="store_true",
                      help="Makes any previous masking of the input sequence invisible to the repeat masking process",
                      default=False)

    parser.add_option("--unmaskOutput", dest="unmaskOutput", action="store_true",
                      help="Discards any previous masking from the output sequence, uses just the masking discovered by lastz",
                      default=False)

    parser.add_option("--proportionSampled", dest="proportionSampled", type="float",
                     help="The amount of the genome that is being sampled for masking, used to adjust the minPeriod parameter according to sampling",
                     default="1.0")

    parser.add_option("--tempDir", dest="tempDir",
                     help="Location in which to place to temporary files",
                     default=os.getcwd())

    options, args = parser.parse_args()

    if len(args) != 2:
        parser.print_help()
        return 1

    queryFile = args[0]
    outputFile = args[1]
    targetFiles = sys.stdin.readline().split() #Read them from stdin

    assert os.path.isfile(queryFile)
    assert len(targetFiles) >= 1
    for targetFile in targetFiles:
        assert os.path.isfile(targetFile)
    assert options.fragment > 1

    #Adjust the period parameter using the amount of genome sampled
    options.period = max(1, round(options.proportionSampled * options.period))

    # make sure fragment size is even so they can overlap by exactly one half.
    if options.fragment % 2:
        options.fragment += 1

    # make temporary working directory under the --tempDir location
    tempDir = tempfile.mkdtemp(dir=options.tempDir)
    maskInfoFile = os.path.join(tempDir, "maskFile.dat")
    targetFile = os.path.join(tempDir, "target.fa")

    try:
        #Make temporary target file, if more than one file
        catFiles(targetFiles, targetFile)

        # chop up input fasta file into into fragments of specified size.  fragments overlap by
        # half their length.  Floor division keeps --step an integer string on every Python
        # version (the fragment size was made even above, so the division is exact).
        fragCmdLine = 'cat ' + queryFile + ' | cactus_fasta_fragments.py ' + '--fragment=' + \
                        str(options.fragment) + ' --step=' + str(options.fragment // 2) + " --origin=zero "

        # lastz each fragment against the entire input sequence.  Each time a fragment aligns to a base
        # in the sequence, that base's match count is incremented.
        # the plus three for the period parameter is a fudge to ensure sufficient alignments are found
        lastZSequenceHandling  = '[multiple][nameparse=darkspace] /dev/stdin[nameparse=darkspace] '
        if options.unmaskInput:
            lastZSequenceHandling  = '[multiple,unmask][nameparse=darkspace] /dev/stdin[unmask][nameparse=darkspace] '
        # The sequence-handling actions are appended directly to the target file name (no space).
        lastzCmdLine = (options.lastzCmd + ' ' + targetFile +
                        lastZSequenceHandling + options.lastzOptions +
                        (' --querydepth=keep,nowarn:%i --format=general:name1,zstart1,end1,name2,zstart2+,end2+ --markend ' %
                         (options.period+3)))

        #This runs Bob's covered intervals program, which combines the lastz alignment info into intervals of the query.
        coveredIntervalsCmdLine = "cactus_covered_intervals --queryoffsets --origin=one M=%s > %s" % (int(options.period*2), maskInfoFile)

        system(fragCmdLine + ' | ' + lastzCmdLine + ' | ' + coveredIntervalsCmdLine)

        # the previous lastz command outputs a file of intervals (denoted with indices) to softmask.
        # we finish by applying these intervals to the input file, to produce the final, softmasked output.
        unmaskString = ""
        if options.unmaskOutput:
            unmaskString = "--unmask"
        softMaskCmdLine = 'cat ' + queryFile + (' | cactus_fasta_softmask_intervals.py --origin=one %s ' % unmaskString) + \
            maskInfoFile +  ' > ' + outputFile

        system(softMaskCmdLine)

    finally:
        # delete the temporary files whether or not the pipeline succeeded; the
        # output file lives outside tempDir, so nothing the caller needs is lost.
        shutil.rmtree(tempDir)
def cat(target, genome, output_gtf, unsorted_tmp_file, out_file_tree):
    """
    Concatenates all of the results into one big GTF, and sorts it by chromosome/pos.

    The files returned by out_file_tree.listFiles() are concatenated into
    unsorted_tmp_file, which is then sorted into output_gtf. target and genome
    are not used in the body (presumably kept for the target-function calling
    convention - verify against callers).
    """
    catFiles(out_file_tree.listFiles(), unsorted_tmp_file)
    # Column 4 is the start coordinate (assuming GTF layout), so compare it
    # numerically (-n); a plain -k4,4 compares it as text and would place
    # 1000 before 200.
    system("sort -k1,1 -k4,4n {} > {}".format(unsorted_tmp_file, output_gtf))