def run(self):
    """Prepare the input sequences for preprocessing.

    If the input is a directory, its files are concatenated into one temp
    file first.  Assembly stats are logged, then the sequences are either
    copied straight to the output (no preprocessors configured) or handed
    to a child BatchPreprocessor target.
    """
    source = self.inputSequenceFileOrDirectory
    # If the files are in a sub-dir then rip them out.
    if os.path.isdir(source):
        flattened = getTempFile(rootDir=self.getGlobalTempDir())
        memberFiles = [os.path.join(source, name) for name in os.listdir(source)]
        catFiles(memberFiles, flattened)
        sequenceFile = flattened
    else:
        sequenceFile = source
    assert sequenceFile != self.outputSequenceFile
    preprocessorElems = self.configNode.findall("preprocessor")
    stats = runCactusAnalyseAssembly(sequenceFile)
    self.logToMaster(
        "Before running any preprocessing on the assembly: %s got following stats (assembly may be listed as temp file if input sequences from a directory): %s"
        % (source, stats))
    if not preprocessorElems:
        # Just cp the file to the output file
        system("cp %s %s" % (sequenceFile, self.outputSequenceFile))
    else:
        logger.info("Adding child batch_preprocessor target")
        self.addChildTarget(
            BatchPreprocessor(preprocessorElems, sequenceFile,
                              self.outputSequenceFile, 0))
def run(self, fileStore):
    """Collate the per-chunk alignment results into one global file.

    Reads every results file out of the file store, concatenates them
    locally, registers the combined file globally, deletes the now-stale
    per-chunk global files, and returns the combined file's ID.
    """
    logger.info("Results IDs: %s" % self.resultsFileIDs)
    localCopies = [readGlobalFileWithoutCache(fileStore, anID)
                   for anID in self.resultsFileIDs]
    combined = fileStore.getLocalTempFile()
    catFiles(localCopies, combined)
    logger.info("Collated the alignments to the file: %s", combined)
    combinedID = fileStore.writeGlobalFile(combined)
    # The individual chunk results are no longer needed once collated.
    for staleID in self.resultsFileIDs:
        fileStore.deleteGlobalFile(staleID)
    return combinedID
def testCompression(self):
    """Benchmark the fasta (de)compression helpers against the bzip2 and
    bunzip2 command-line tools, logging wall-clock times and the resulting
    file sizes."""
    fastaPath = os.path.join(self.tempDir, "tempSeq.fa")
    roundTripPath = os.path.join(self.tempDir, "tempSeq2.fa")
    self.tempFiles.append(fastaPath)
    self.tempFiles.append(roundTripPath)
    self.encodePath = os.path.join(self.encodePath, "ENm001")
    sourcePaths = [os.path.join(self.encodePath, entry)
                   for entry in os.listdir(self.encodePath)]
    catFiles(sourcePaths, fastaPath)

    clock = time.time()
    compressFastaFile(fastaPath)
    logger.critical("It took %s seconds to compress the fasta file"
                    % (time.time() - clock))

    clock = time.time()
    system("rm %s.bz2" % fastaPath)
    system("bzip2 --keep --fast %s" % fastaPath)
    logger.critical("It took %s seconds to compress the fasta file by system functions"
                    % (time.time() - clock))

    clock = time.time()
    decompressFastaFile(fastaPath + ".bz2", roundTripPath)
    logger.critical("It took %s seconds to decompress the fasta file"
                    % (time.time() - clock))
    system("rm %s" % roundTripPath)

    clock = time.time()
    system("bunzip2 --stdout %s > %s" % (fastaPath + ".bz2", roundTripPath))
    logger.critical("It took %s seconds to decompress the fasta file using system function"
                    % (time.time() - clock))

    logger.critical("File sizes, before: %s, compressed: %s, decompressed: %s"
                    % (os.stat(fastaPath).st_size,
                       os.stat(fastaPath + ".bz2").st_size,
                       os.stat(roundTripPath).st_size))
def testCompression(self):
    """Time compressFastaFile/decompressFastaFile next to shell bzip2 and
    bunzip2 on the concatenated ENm001 sequences, then report sizes."""
    seqPath = os.path.join(self.tempDir, "tempSeq.fa")
    seqPath2 = os.path.join(self.tempDir, "tempSeq2.fa")
    self.tempFiles.extend([seqPath, seqPath2])
    self.encodePath = os.path.join(self.encodePath, "ENm001")
    catFiles([os.path.join(self.encodePath, f)
              for f in os.listdir(self.encodePath)], seqPath)

    def timed(action, template):
        # Run the action and log how long it took via the given message.
        begin = time.time()
        action()
        logger.critical(template % (time.time() - begin))

    timed(lambda: compressFastaFile(seqPath),
          "It took %s seconds to compress the fasta file")
    timed(lambda: (system("rm %s" % seqPath + ".bz2"),
                   system("bzip2 --keep --fast %s" % seqPath)),
          "It took %s seconds to compress the fasta file by system functions")
    timed(lambda: decompressFastaFile(seqPath + ".bz2", seqPath2),
          "It took %s seconds to decompress the fasta file")
    system("rm %s" % seqPath2)
    timed(lambda: system("bunzip2 --stdout %s > %s" % (seqPath + ".bz2", seqPath2)),
          "It took %s seconds to decompress the fasta file using system function")

    logger.critical("File sizes, before: %s, compressed: %s, decompressed: %s"
                    % (os.stat(seqPath).st_size,
                       os.stat(seqPath + ".bz2").st_size,
                       os.stat(seqPath2).st_size))
def cat(target, genome, output_gtf, unsorted_tmp_file, out_file_tree):
    """
    Concatenates all of the results into one big genePred, and sorts it by
    chromosome/pos.
    """
    pieces = out_file_tree.listFiles()
    catFiles(pieces, unsorted_tmp_file)
    sort_cmd = "sort -k1,1 -k4,4 {} > {}".format(unsorted_tmp_file, output_gtf)
    system(sort_cmd)
def run(self):
    """Merge every partial results file into the single final output file."""
    destination = self.finalResultsFile
    catFiles(self.resultsFiles, destination)
    logger.info("Collated the alignments to the file: %s", destination)
def cat(target, genome, file_tree, out_db):
    """Concatenate the per-chunk files, then schedule the database load as a
    follow-on target."""
    merged = os.path.join(target.getGlobalTempDir(), "tmp.txt")
    catFiles(file_tree.listFiles(), merged)
    target.setFollowOnTargetFn(load_db, args=[genome, merged, out_db])
def main():
    """Softmask repeats in target fasta sequences via lastz self-alignment counts.

    Command line takes exactly two positional arguments, <query> and <output>;
    the whitespace-separated list of target fasta files is read from the first
    line of stdin.  Returns 1 (after printing help) on bad usage.

    Fixes over the previous revision:
      * the temporary working directory was only removed when an exception was
        raised, leaking it on every successful run -- cleanup now happens in a
        ``finally`` block;
      * ``options.fragment / 2`` is now ``// 2`` (identical on Python 2 ints,
        and avoids emitting a float ``--step`` argument under Python 3);
      * the usage text advertised three positional arguments although the code
        requires two and reads targets from stdin.
    """
    ##########################################
    # Construct the arguments.
    ##########################################
    usage = "usage: %prog [options] <query> <output>\n\n" + \
            "    <query>:  fasta sequence to search for repeats\n" + \
            "    <output>: softmasked version of the target sequences\n" + \
            "    (target fasta file paths are read, whitespace-separated, from stdin)\n\n" + \
            "Example: %prog genome.fa chunk.masked.fa\n\n"
    description = "softrepeat mask a fasta file using lastz.\n" + \
                  "NOTE: NEW VERSION OF LASTZ REQUIRED (MINIMUM 1.02.40)\n" + \
                  "lastz tools/ dir MUST BE IN SYSTEM PATH"
    parser = OptionParser(usage=usage, description=description)

    # output stuff
    parser.add_option("--fragment", dest="fragment", type="int",
                      help="The size of chunks passed to lastz (must be at least twice as big as overlap)",
                      default=200)
    parser.add_option("--minPeriod", dest="period", type="int",
                      help="minimum number of occurrences of a sequence for it to be masked",
                      default=10)
    parser.add_option("--lastzOpts", dest="lastzOptions", type="string",
                      help="lastz options for repeat identification",
                      default="")
    parser.add_option("--lastzCmd", dest="lastzCmd", type="string",
                      help="lastz executable",
                      default="cPecanLastz")
    parser.add_option("--unmaskInput", dest="unmaskInput", action="store_true",
                      help="Makes any previous masking of the input sequence invisible to the repeat masking process",
                      default=False)
    parser.add_option("--unmaskOutput", dest="unmaskOutput", action="store_true",
                      help="Discards any previous masking from the output sequence, uses just the masking discovered by lastz",
                      default=False)
    parser.add_option("--proportionSampled", dest="proportionSampled", type="float",
                      help="The amount of the genome that is being sampled for masking, used to adjust the minPeriod parameter according to sampling",
                      default=1.0)
    parser.add_option("--tempDir", dest="tempDir",
                      help="Location in which to place to temporary files",
                      default=os.getcwd())

    options, args = parser.parse_args()

    if len(args) != 2:
        parser.print_help()
        return 1

    queryFile = args[0]
    outputFile = args[1]
    targetFiles = sys.stdin.readline().split()  # Read them from stdin

    assert os.path.isfile(queryFile)
    assert len(targetFiles) >= 1
    for targetFile in targetFiles:
        assert os.path.isfile(targetFile)
    assert options.fragment > 1

    # Adjust the period parameter using the amount of genome sampled.
    options.period = max(1, round(options.proportionSampled * options.period))

    # Make sure fragment size is even so fragments can overlap by exactly one half.
    if options.fragment % 2:
        options.fragment += 1

    # Make temporary working directory in same path as output.
    tempDir = tempfile.mkdtemp(dir=options.tempDir)
    maskInfoFile = os.path.join(tempDir, "maskFile.dat")
    targetFile = os.path.join(tempDir, "target.fa")
    try:
        # Make temporary target file, if more than one file.
        catFiles(targetFiles, targetFile)

        # Chop up the input fasta file into fragments of the specified size;
        # fragments overlap by half their length.
        fragCmdLine = 'cat ' + queryFile + ' | cactus_fasta_fragments.py ' + '--fragment=' + \
                      str(options.fragment) + ' --step=' + str(options.fragment // 2) + " --origin=zero "

        # lastz each fragment against the entire input sequence.  Each time a
        # fragment aligns to a base in the sequence, that base's match count is
        # incremented.  The plus three for the period parameter is a fudge to
        # ensure sufficient alignments are found.
        lastZSequenceHandling = '[multiple][nameparse=darkspace] /dev/stdin[nameparse=darkspace] '
        if options.unmaskInput:
            lastZSequenceHandling = '[multiple,unmask][nameparse=darkspace] /dev/stdin[unmask][nameparse=darkspace] '
        lastzCmdLine = options.lastzCmd + ' ' + targetFile + \
                       lastZSequenceHandling + options.lastzOptions + \
                       (' --querydepth=keep,nowarn:%i --format=general:name1,zstart1,end1,name2,zstart2+,end2+ --markend ' %
                        (options.period + 3))

        # This runs Bob's covered-intervals program, which combines the lastz
        # alignment info into intervals of the query.
        coveredIntervalsCmdLine = "cactus_covered_intervals --queryoffsets --origin=one M=%s > %s" % \
                                  (int(options.period * 2), maskInfoFile)

        system(fragCmdLine + ' | ' + lastzCmdLine + ' | ' + coveredIntervalsCmdLine)

        # The pipeline above wrote a file of intervals (denoted with indices) to
        # softmask; finish by applying those intervals to the input file to
        # produce the final, softmasked output.
        unmaskString = ""
        if options.unmaskOutput:
            unmaskString = "--unmask"
        softMaskCmdLine = 'cat ' + queryFile + \
                          (' | cactus_fasta_softmask_intervals.py --origin=one %s ' % unmaskString) + \
                          maskInfoFile + ' > ' + outputFile
        system(softMaskCmdLine)
    finally:
        # Always delete the temporary files -- the previous version cleaned up
        # only on failure, leaking the temp directory on every successful run.
        shutil.rmtree(tempDir)